-rw-r--r--  Documentation/RCU/torture.txt | 10
-rw-r--r--  Documentation/kernel-parameters.txt | 6
-rw-r--r--  Documentation/scheduler/sched-design-CFS.txt | 54
-rw-r--r--  Documentation/scheduler/sched-rt-group.txt | 20
-rw-r--r--  Documentation/trace/events.txt | 3
-rw-r--r--  Documentation/trace/ftrace.txt | 50
-rw-r--r--  arch/s390/kernel/time.c | 1
-rw-r--r--  drivers/char/sysrq.c | 2
-rw-r--r--  drivers/cpufreq/cpufreq_ondemand.c | 75
-rw-r--r--  drivers/oprofile/cpu_buffer.c | 4
-rw-r--r--  drivers/xen/manage.c | 14
-rw-r--r--  fs/eventpoll.c | 3
-rw-r--r--  include/linux/cpuset.h | 16
-rw-r--r--  include/linux/ftrace.h | 49
-rw-r--r--  include/linux/ftrace_event.h | 81
-rw-r--r--  include/linux/kernel.h | 11
-rw-r--r--  include/linux/module.h | 6
-rw-r--r--  include/linux/rcutiny.h | 2
-rw-r--r--  include/linux/rcutree.h | 1
-rw-r--r--  include/linux/ring_buffer.h | 10
-rw-r--r--  include/linux/sched.h | 70
-rw-r--r--  include/linux/stop_machine.h | 122
-rw-r--r--  include/linux/syscalls.h | 57
-rw-r--r--  include/linux/tick.h | 5
-rw-r--r--  include/linux/tracepoint.h | 196
-rw-r--r--  include/linux/wait.h | 35
-rw-r--r--  include/trace/define_trace.h | 5
-rw-r--r--  include/trace/events/module.h | 18
-rw-r--r--  include/trace/events/napi.h | 10
-rw-r--r--  include/trace/events/sched.h | 32
-rw-r--r--  include/trace/events/signal.h | 52
-rw-r--r--  include/trace/ftrace.h | 274
-rw-r--r--  include/trace/syscall.h | 10
-rw-r--r--  init/Kconfig | 3
-rw-r--r--  kernel/Makefile | 2
-rw-r--r--  kernel/capability.c | 1
-rw-r--r--  kernel/cgroup.c | 2
-rw-r--r--  kernel/cpu.c | 26
-rw-r--r--  kernel/cpuset.c | 67
-rw-r--r--  kernel/cred-internals.h | 21
-rw-r--r--  kernel/cred.c | 3
-rw-r--r--  kernel/exit.c | 1
-rw-r--r--  kernel/module.c | 22
-rw-r--r--  kernel/rcutorture.c | 2
-rw-r--r--  kernel/sched.c | 726
-rw-r--r--  kernel/sched_debug.c | 108
-rw-r--r--  kernel/sched_fair.c | 350
-rw-r--r--  kernel/sched_features.h | 55
-rw-r--r--  kernel/sched_idletask.c | 8
-rw-r--r--  kernel/sched_rt.c | 15
-rw-r--r--  kernel/stop_machine.c | 534
-rw-r--r--  kernel/time/tick-sched.c | 84
-rw-r--r--  kernel/time/timer_list.c | 1
-rw-r--r--  kernel/trace/blktrace.c | 138
-rw-r--r--  kernel/trace/ftrace.c | 36
-rw-r--r--  kernel/trace/kmemtrace.c | 70
-rw-r--r--  kernel/trace/ring_buffer.c | 179
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 5
-rw-r--r--  kernel/trace/trace.c | 136
-rw-r--r--  kernel/trace/trace.h | 36
-rw-r--r--  kernel/trace/trace_branch.c | 8
-rw-r--r--  kernel/trace/trace_event_perf.c | 17
-rw-r--r--  kernel/trace/trace_events.c | 139
-rw-r--r--  kernel/trace/trace_events_filter.c | 28
-rw-r--r--  kernel/trace/trace_export.c | 16
-rw-r--r--  kernel/trace/trace_functions_graph.c | 176
-rw-r--r--  kernel/trace/trace_irqsoff.c | 271
-rw-r--r--  kernel/trace/trace_kprobe.c | 104
-rw-r--r--  kernel/trace/trace_output.c | 139
-rw-r--r--  kernel/trace/trace_output.h | 2
-rw-r--r--  kernel/trace/trace_sched_switch.c | 21
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 29
-rw-r--r--  kernel/trace/trace_selftest.c | 7
-rw-r--r--  kernel/trace/trace_syscalls.c | 137
-rw-r--r--  kernel/trace/trace_workqueue.c | 26
-rw-r--r--  kernel/tracepoint.c | 91
-rw-r--r--  kernel/user.c | 11
-rw-r--r--  net/core/drop_monitor.c | 12
-rw-r--r--  samples/tracepoints/tp-samples-trace.h | 4
-rw-r--r--  samples/tracepoints/tracepoint-probe-sample.c | 13
-rw-r--r--  samples/tracepoints/tracepoint-probe-sample2.c | 7
81 files changed, 3048 insertions, 2145 deletions
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index 0e50bc2aa1e2..5d9016795fd8 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -182,16 +182,6 @@ Similarly, sched_expedited RCU provides the following:
182 sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0 182 sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0
183 sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0 183 sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0
184 sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0 184 sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0
185 state: -1 / 0:0 3:0 4:0
186
187As before, the first four lines are similar to those for RCU.
188The last line shows the task-migration state. The first number is
189-1 if synchronize_sched_expedited() is idle, -2 if in the process of
190posting wakeups to the migration kthreads, and N when waiting on CPU N.
191Each of the colon-separated fields following the "/" is a CPU:state pair.
192Valid states are "0" for idle, "1" for waiting for quiescent state,
193"2" for passed through quiescent state, and "3" when a race with a
194CPU-hotplug event forces use of the synchronize_sched() primitive.
195 185
196 186
197USAGE 187USAGE
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 839b21b0699a..907010cea9ad 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -784,8 +784,12 @@ and is between 256 and 4096 characters. It is defined in the file
784 as early as possible in order to facilitate early 784 as early as possible in order to facilitate early
785 boot debugging. 785 boot debugging.
786 786
787 ftrace_dump_on_oops 787 ftrace_dump_on_oops[=orig_cpu]
788 [FTRACE] will dump the trace buffers on oops. 788 [FTRACE] will dump the trace buffers on oops.
789 If no parameter is passed, ftrace will dump
790 buffers of all CPUs, but if you pass orig_cpu, it will
791 dump only the buffer of the CPU that triggered the
792 oops.
789 793
790 ftrace_filter=[function-list] 794 ftrace_filter=[function-list]
791 [FTRACE] Limit the functions traced by the function 795 [FTRACE] Limit the functions traced by the function
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index 6f33593e59e2..8239ebbcddce 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -211,7 +211,7 @@ provide fair CPU time to each such task group. For example, it may be
211desirable to first provide fair CPU time to each user on the system and then to 211desirable to first provide fair CPU time to each user on the system and then to
212each task belonging to a user. 212each task belonging to a user.
213 213
214CONFIG_GROUP_SCHED strives to achieve exactly that. It lets tasks to be 214CONFIG_CGROUP_SCHED strives to achieve exactly that. It lets tasks to be
215grouped and divides CPU time fairly among such groups. 215grouped and divides CPU time fairly among such groups.
216 216
217CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and 217CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and
@@ -220,38 +220,11 @@ SCHED_RR) tasks.
220CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and 220CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and
221SCHED_BATCH) tasks. 221SCHED_BATCH) tasks.
222 222
223At present, there are two (mutually exclusive) mechanisms to group tasks for 223 These options need CONFIG_CGROUPS to be defined, and let the administrator
224CPU bandwidth control purposes:
225
226 - Based on user id (CONFIG_USER_SCHED)
227
228 With this option, tasks are grouped according to their user id.
229
230 - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED)
231
232 This options needs CONFIG_CGROUPS to be defined, and lets the administrator
233 create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See 224 create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See
234 Documentation/cgroups/cgroups.txt for more information about this filesystem. 225 Documentation/cgroups/cgroups.txt for more information about this filesystem.
235 226
236Only one of these options to group tasks can be chosen and not both. 227When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each
237
238When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new
239user and a "cpu_share" file is added in that directory.
240
241 # cd /sys/kernel/uids
242 # cat 512/cpu_share # Display user 512's CPU share
243 1024
244 # echo 2048 > 512/cpu_share # Modify user 512's CPU share
245 # cat 512/cpu_share # Display user 512's CPU share
246 2048
247 #
248
249CPU bandwidth between two users is divided in the ratio of their CPU shares.
250For example: if you would like user "root" to get twice the bandwidth of user
251"guest," then set the cpu_share for both the users such that "root"'s cpu_share
252is twice "guest"'s cpu_share.
253
254When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each
255group created using the pseudo filesystem. See example steps below to create 228group created using the pseudo filesystem. See example steps below to create
256task groups and modify their CPU share using the "cgroups" pseudo filesystem. 229task groups and modify their CPU share using the "cgroups" pseudo filesystem.
257 230
@@ -273,24 +246,3 @@ task groups and modify their CPU share using the "cgroups" pseudo filesystem.
273 246
274 # #Launch gmplayer (or your favourite movie player) 247 # #Launch gmplayer (or your favourite movie player)
275 # echo <movie_player_pid> > multimedia/tasks 248 # echo <movie_player_pid> > multimedia/tasks
276
2778. Implementation note: user namespaces
278
279User namespaces are intended to be hierarchical. But they are currently
280only partially implemented. Each of those has ramifications for CFS.
281
282First, since user namespaces are hierarchical, the /sys/kernel/uids
283presentation is inadequate. Eventually we will likely want to use sysfs
284tagging to provide private views of /sys/kernel/uids within each user
285namespace.
286
287Second, the hierarchical nature is intended to support completely
288unprivileged use of user namespaces. So if using user groups, then
289we want the users in a user namespace to be children of the user
290who created it.
291
292That is currently unimplemented. So instead, every user in a new
293user namespace will receive 1024 shares just like any user in the
294initial user namespace. Note that at the moment creation of a new
295user namespace requires each of CAP_SYS_ADMIN, CAP_SETUID, and
296CAP_SETGID.
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 86eabe6c3419..605b0d40329d 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -126,23 +126,12 @@ priority!
1262.3 Basis for grouping tasks 1262.3 Basis for grouping tasks
127---------------------------- 127----------------------------
128 128
129There are two compile-time settings for allocating CPU bandwidth. These are 129Enabling CONFIG_RT_GROUP_SCHED lets you explicitly allocate real
130configured using the "Basis for grouping tasks" multiple choice menu under 130CPU bandwidth to task groups.
131General setup > Group CPU Scheduler:
132
133a. CONFIG_USER_SCHED (aka "Basis for grouping tasks" = "user id")
134
135This lets you use the virtual files under
136"/sys/kernel/uids/<uid>/cpu_rt_runtime_us" to control he CPU time reserved for
137each user .
138
139The other option is:
140
141.o CONFIG_CGROUP_SCHED (aka "Basis for grouping tasks" = "Control groups")
142 131
143This uses the /cgroup virtual file system and 132This uses the /cgroup virtual file system and
144"/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each 133"/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each
145control group instead. 134control group.
146 135
147For more information on working with control groups, you should read 136For more information on working with control groups, you should read
148Documentation/cgroups/cgroups.txt as well. 137Documentation/cgroups/cgroups.txt as well.
@@ -161,8 +150,7 @@ For now, this can be simplified to just the following (but see Future plans):
161=============== 150===============
162 151
163There is work in progress to make the scheduling period for each group 152There is work in progress to make the scheduling period for each group
164("/sys/kernel/uids/<uid>/cpu_rt_period_us" or 153("/cgroup/<cgroup>/cpu.rt_period_us") configurable as well.
165"/cgroup/<cgroup>/cpu.rt_period_us" respectively) configurable as well.
166 154
167The constraint on the period is that a subgroup must have a smaller or 155The constraint on the period is that a subgroup must have a smaller or
168equal period to its parent. But realistically its not very useful _yet_ 156equal period to its parent. But realistically its not very useful _yet_
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index 02ac6ed38b2d..778ddf38b82c 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -90,7 +90,8 @@ In order to facilitate early boot debugging, use boot option:
90 90
91 trace_event=[event-list] 91 trace_event=[event-list]
92 92
93The format of this boot option is the same as described in section 2.1. 93event-list is a comma separated list of events. See section 2.1 for event
94format.
94 95
953. Defining an event-enabled tracepoint 963. Defining an event-enabled tracepoint
96======================================= 97=======================================
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 03485bfbd797..557c1edeccaf 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -155,6 +155,9 @@ of ftrace. Here is a list of some of the key files:
155 to be traced. Echoing names of functions into this file 155 to be traced. Echoing names of functions into this file
156 will limit the trace to only those functions. 156 will limit the trace to only those functions.
157 157
158 This interface also allows for commands to be used. See the
159 "Filter commands" section for more details.
160
158 set_ftrace_notrace: 161 set_ftrace_notrace:
159 162
160 This has an effect opposite to that of 163 This has an effect opposite to that of
@@ -1337,12 +1340,14 @@ ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one
1337can either use the sysctl function or set it via the proc system 1340can either use the sysctl function or set it via the proc system
1338interface. 1341interface.
1339 1342
1340 sysctl kernel.ftrace_dump_on_oops=1 1343 sysctl kernel.ftrace_dump_on_oops=n
1341 1344
1342or 1345or
1343 1346
1344 echo 1 > /proc/sys/kernel/ftrace_dump_on_oops 1347 echo n > /proc/sys/kernel/ftrace_dump_on_oops
1345 1348
1349If n = 1, ftrace will dump buffers of all CPUs, if n = 2 ftrace will
1350only dump the buffer of the CPU that triggered the oops.
1346 1351
1347Here's an example of such a dump after a null pointer 1352Here's an example of such a dump after a null pointer
1348dereference in a kernel module: 1353dereference in a kernel module:
@@ -1822,6 +1827,47 @@ this special filter via:
1822 echo > set_graph_function 1827 echo > set_graph_function
1823 1828
1824 1829
1830Filter commands
1831---------------
1832
1833A few commands are supported by the set_ftrace_filter interface.
1834Trace commands have the following format:
1835
1836<function>:<command>:<parameter>
1837
1838The following commands are supported:
1839
1840- mod
1841 This command enables function filtering per module. The
1842 parameter defines the module. For example, if only the write*
1843 functions in the ext3 module are desired, run:
1844
1845 echo 'write*:mod:ext3' > set_ftrace_filter
1846
1847 This command interacts with the filter in the same way as
1848 filtering based on function names. Thus, adding more functions
1849 in a different module is accomplished by appending (>>) to the
1850 filter file. Remove specific module functions by prepending
1851 '!':
1852
1853 echo '!writeback*:mod:ext3' >> set_ftrace_filter
1854
1855- traceon/traceoff
1856 These commands turn tracing on and off when the specified
1857 functions are hit. The parameter determines how many times the
1858 tracing system is turned on and off. If unspecified, there is
1859 no limit. For example, to disable tracing when a schedule bug
1860 is hit the first 5 times, run:
1861
1862 echo '__schedule_bug:traceoff:5' > set_ftrace_filter
1863
1864 These commands are cumulative whether or not they are appended
1865 to set_ftrace_filter. To remove a command, prepend it by '!'
1866 and drop the parameter:
1867
1868 echo '!__schedule_bug:traceoff' > set_ftrace_filter
1869
1870
1825trace_pipe 1871trace_pipe
1826---------- 1872----------
1827 1873
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index d906bf19c14a..a2163c95eb98 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -391,7 +391,6 @@ static void __init time_init_wq(void)
391 if (time_sync_wq) 391 if (time_sync_wq)
392 return; 392 return;
393 time_sync_wq = create_singlethread_workqueue("timesync"); 393 time_sync_wq = create_singlethread_workqueue("timesync");
394 stop_machine_create();
395} 394}
396 395
397/* 396/*
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 59de2525d303..d4e8b213a462 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -289,7 +289,7 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = {
289 289
290static void sysrq_ftrace_dump(int key, struct tty_struct *tty) 290static void sysrq_ftrace_dump(int key, struct tty_struct *tty)
291{ 291{
292 ftrace_dump(); 292 ftrace_dump(DUMP_ALL);
293} 293}
294static struct sysrq_key_op sysrq_ftrace_dump_op = { 294static struct sysrq_key_op sysrq_ftrace_dump_op = {
295 .handler = sysrq_ftrace_dump, 295 .handler = sysrq_ftrace_dump,
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index bd444dc93cf2..8e9dbdc6c700 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -73,6 +73,7 @@ enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE};
73 73
74struct cpu_dbs_info_s { 74struct cpu_dbs_info_s {
75 cputime64_t prev_cpu_idle; 75 cputime64_t prev_cpu_idle;
76 cputime64_t prev_cpu_iowait;
76 cputime64_t prev_cpu_wall; 77 cputime64_t prev_cpu_wall;
77 cputime64_t prev_cpu_nice; 78 cputime64_t prev_cpu_nice;
78 struct cpufreq_policy *cur_policy; 79 struct cpufreq_policy *cur_policy;
@@ -108,6 +109,7 @@ static struct dbs_tuners {
108 unsigned int down_differential; 109 unsigned int down_differential;
109 unsigned int ignore_nice; 110 unsigned int ignore_nice;
110 unsigned int powersave_bias; 111 unsigned int powersave_bias;
112 unsigned int io_is_busy;
111} dbs_tuners_ins = { 113} dbs_tuners_ins = {
112 .up_threshold = DEF_FREQUENCY_UP_THRESHOLD, 114 .up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
113 .down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL, 115 .down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL,
@@ -148,6 +150,16 @@ static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
148 return idle_time; 150 return idle_time;
149} 151}
150 152
153static inline cputime64_t get_cpu_iowait_time(unsigned int cpu, cputime64_t *wall)
154{
155 u64 iowait_time = get_cpu_iowait_time_us(cpu, wall);
156
157 if (iowait_time == -1ULL)
158 return 0;
159
160 return iowait_time;
161}
162
151/* 163/*
152 * Find right freq to be set now with powersave_bias on. 164 * Find right freq to be set now with powersave_bias on.
153 * Returns the freq_hi to be used right now and will set freq_hi_jiffies, 165 * Returns the freq_hi to be used right now and will set freq_hi_jiffies,
@@ -249,6 +261,7 @@ static ssize_t show_##file_name \
249 return sprintf(buf, "%u\n", dbs_tuners_ins.object); \ 261 return sprintf(buf, "%u\n", dbs_tuners_ins.object); \
250} 262}
251show_one(sampling_rate, sampling_rate); 263show_one(sampling_rate, sampling_rate);
264show_one(io_is_busy, io_is_busy);
252show_one(up_threshold, up_threshold); 265show_one(up_threshold, up_threshold);
253show_one(ignore_nice_load, ignore_nice); 266show_one(ignore_nice_load, ignore_nice);
254show_one(powersave_bias, powersave_bias); 267show_one(powersave_bias, powersave_bias);
@@ -299,6 +312,23 @@ static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b,
299 return count; 312 return count;
300} 313}
301 314
315static ssize_t store_io_is_busy(struct kobject *a, struct attribute *b,
316 const char *buf, size_t count)
317{
318 unsigned int input;
319 int ret;
320
321 ret = sscanf(buf, "%u", &input);
322 if (ret != 1)
323 return -EINVAL;
324
325 mutex_lock(&dbs_mutex);
326 dbs_tuners_ins.io_is_busy = !!input;
327 mutex_unlock(&dbs_mutex);
328
329 return count;
330}
331
302static ssize_t store_up_threshold(struct kobject *a, struct attribute *b, 332static ssize_t store_up_threshold(struct kobject *a, struct attribute *b,
303 const char *buf, size_t count) 333 const char *buf, size_t count)
304{ 334{
@@ -381,6 +411,7 @@ static struct global_attr _name = \
381__ATTR(_name, 0644, show_##_name, store_##_name) 411__ATTR(_name, 0644, show_##_name, store_##_name)
382 412
383define_one_rw(sampling_rate); 413define_one_rw(sampling_rate);
414define_one_rw(io_is_busy);
384define_one_rw(up_threshold); 415define_one_rw(up_threshold);
385define_one_rw(ignore_nice_load); 416define_one_rw(ignore_nice_load);
386define_one_rw(powersave_bias); 417define_one_rw(powersave_bias);
@@ -392,6 +423,7 @@ static struct attribute *dbs_attributes[] = {
392 &up_threshold.attr, 423 &up_threshold.attr,
393 &ignore_nice_load.attr, 424 &ignore_nice_load.attr,
394 &powersave_bias.attr, 425 &powersave_bias.attr,
426 &io_is_busy.attr,
395 NULL 427 NULL
396}; 428};
397 429
@@ -470,14 +502,15 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
470 502
471 for_each_cpu(j, policy->cpus) { 503 for_each_cpu(j, policy->cpus) {
472 struct cpu_dbs_info_s *j_dbs_info; 504 struct cpu_dbs_info_s *j_dbs_info;
473 cputime64_t cur_wall_time, cur_idle_time; 505 cputime64_t cur_wall_time, cur_idle_time, cur_iowait_time;
474 unsigned int idle_time, wall_time; 506 unsigned int idle_time, wall_time, iowait_time;
475 unsigned int load, load_freq; 507 unsigned int load, load_freq;
476 int freq_avg; 508 int freq_avg;
477 509
478 j_dbs_info = &per_cpu(od_cpu_dbs_info, j); 510 j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
479 511
480 cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); 512 cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
513 cur_iowait_time = get_cpu_iowait_time(j, &cur_wall_time);
481 514
482 wall_time = (unsigned int) cputime64_sub(cur_wall_time, 515 wall_time = (unsigned int) cputime64_sub(cur_wall_time,
483 j_dbs_info->prev_cpu_wall); 516 j_dbs_info->prev_cpu_wall);
@@ -487,6 +520,10 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
487 j_dbs_info->prev_cpu_idle); 520 j_dbs_info->prev_cpu_idle);
488 j_dbs_info->prev_cpu_idle = cur_idle_time; 521 j_dbs_info->prev_cpu_idle = cur_idle_time;
489 522
523 iowait_time = (unsigned int) cputime64_sub(cur_iowait_time,
524 j_dbs_info->prev_cpu_iowait);
525 j_dbs_info->prev_cpu_iowait = cur_iowait_time;
526
490 if (dbs_tuners_ins.ignore_nice) { 527 if (dbs_tuners_ins.ignore_nice) {
491 cputime64_t cur_nice; 528 cputime64_t cur_nice;
492 unsigned long cur_nice_jiffies; 529 unsigned long cur_nice_jiffies;
@@ -504,6 +541,16 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
504 idle_time += jiffies_to_usecs(cur_nice_jiffies); 541 idle_time += jiffies_to_usecs(cur_nice_jiffies);
505 } 542 }
506 543
544 /*
545 * For the purpose of ondemand, waiting for disk IO is an
546 * indication that you're performance critical, and not that
547 * the system is actually idle. So subtract the iowait time
548 * from the cpu idle time.
549 */
550
551 if (dbs_tuners_ins.io_is_busy && idle_time >= iowait_time)
552 idle_time -= iowait_time;
553
507 if (unlikely(!wall_time || wall_time < idle_time)) 554 if (unlikely(!wall_time || wall_time < idle_time))
508 continue; 555 continue;
509 556
@@ -617,6 +664,29 @@ static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
617 cancel_delayed_work_sync(&dbs_info->work); 664 cancel_delayed_work_sync(&dbs_info->work);
618} 665}
619 666
667/*
668 * Not all CPUs want IO time to be accounted as busy; this depends on how
669 * efficient idling at a higher frequency/voltage is.
670 * Pavel Machek says this is not so for various generations of AMD and old
671 * Intel systems.
672 * Mike Chan (androidlcom) claims this is also not true for ARM.
673 * Because of this, whitelist specific known (series) of CPUs by default, and
674 * leave all others up to the user.
675 */
676static int should_io_be_busy(void)
677{
678#if defined(CONFIG_X86)
679 /*
680 * For Intel, Core 2 (model 15) and later have an efficient idle.
681 */
682 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
683 boot_cpu_data.x86 == 6 &&
684 boot_cpu_data.x86_model >= 15)
685 return 1;
686#endif
687 return 0;
688}
689
620static int cpufreq_governor_dbs(struct cpufreq_policy *policy, 690static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
621 unsigned int event) 691 unsigned int event)
622{ 692{
@@ -679,6 +749,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
679 dbs_tuners_ins.sampling_rate = 749 dbs_tuners_ins.sampling_rate =
680 max(min_sampling_rate, 750 max(min_sampling_rate,
681 latency * LATENCY_MULTIPLIER); 751 latency * LATENCY_MULTIPLIER);
752 dbs_tuners_ins.io_is_busy = should_io_be_busy();
682 } 753 }
683 mutex_unlock(&dbs_mutex); 754 mutex_unlock(&dbs_mutex);
684 755
diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c
index 166b67ea622f..7581dbe456da 100644
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -186,14 +186,14 @@ int op_cpu_buffer_write_commit(struct op_entry *entry)
186struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu) 186struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
187{ 187{
188 struct ring_buffer_event *e; 188 struct ring_buffer_event *e;
189 e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); 189 e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL, NULL);
190 if (e) 190 if (e)
191 goto event; 191 goto event;
192 if (ring_buffer_swap_cpu(op_ring_buffer_read, 192 if (ring_buffer_swap_cpu(op_ring_buffer_read,
193 op_ring_buffer_write, 193 op_ring_buffer_write,
194 cpu)) 194 cpu))
195 return NULL; 195 return NULL;
196 e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); 196 e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL, NULL);
197 if (e) 197 if (e)
198 goto event; 198 goto event;
199 return NULL; 199 return NULL;
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 2ac4440e7b08..8943b8ccee1a 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -80,12 +80,6 @@ static void do_suspend(void)
80 80
81 shutting_down = SHUTDOWN_SUSPEND; 81 shutting_down = SHUTDOWN_SUSPEND;
82 82
83 err = stop_machine_create();
84 if (err) {
85 printk(KERN_ERR "xen suspend: failed to setup stop_machine %d\n", err);
86 goto out;
87 }
88
89#ifdef CONFIG_PREEMPT 83#ifdef CONFIG_PREEMPT
90 /* If the kernel is preemptible, we need to freeze all the processes 84 /* If the kernel is preemptible, we need to freeze all the processes
91 to prevent them from being in the middle of a pagetable update 85 to prevent them from being in the middle of a pagetable update
@@ -93,7 +87,7 @@ static void do_suspend(void)
93 err = freeze_processes(); 87 err = freeze_processes();
94 if (err) { 88 if (err) {
95 printk(KERN_ERR "xen suspend: freeze failed %d\n", err); 89 printk(KERN_ERR "xen suspend: freeze failed %d\n", err);
96 goto out_destroy_sm; 90 goto out;
97 } 91 }
98#endif 92#endif
99 93
@@ -136,12 +130,8 @@ out_resume:
136out_thaw: 130out_thaw:
137#ifdef CONFIG_PREEMPT 131#ifdef CONFIG_PREEMPT
138 thaw_processes(); 132 thaw_processes();
139
140out_destroy_sm:
141#endif
142 stop_machine_destroy();
143
144out: 133out:
134#endif
145 shutting_down = SHUTDOWN_INVALID; 135 shutting_down = SHUTDOWN_INVALID;
146} 136}
147#endif /* CONFIG_PM_SLEEP */ 137#endif /* CONFIG_PM_SLEEP */
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index bd056a5b4efc..3817149919cb 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1140,8 +1140,7 @@ retry:
1140 * ep_poll_callback() when events will become available. 1140 * ep_poll_callback() when events will become available.
1141 */ 1141 */
1142 init_waitqueue_entry(&wait, current); 1142 init_waitqueue_entry(&wait, current);
1143 wait.flags |= WQ_FLAG_EXCLUSIVE; 1143 __add_wait_queue_exclusive(&ep->wq, &wait);
1144 __add_wait_queue(&ep->wq, &wait);
1145 1144
1146 for (;;) { 1145 for (;;) {
1147 /* 1146 /*
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a5740fc4d04b..a73454aec333 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -21,8 +21,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */
21extern int cpuset_init(void); 21extern int cpuset_init(void);
22extern void cpuset_init_smp(void); 22extern void cpuset_init_smp(void);
23extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); 23extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
24extern void cpuset_cpus_allowed_locked(struct task_struct *p, 24extern int cpuset_cpus_allowed_fallback(struct task_struct *p);
25 struct cpumask *mask);
26extern nodemask_t cpuset_mems_allowed(struct task_struct *p); 25extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
27#define cpuset_current_mems_allowed (current->mems_allowed) 26#define cpuset_current_mems_allowed (current->mems_allowed)
28void cpuset_init_current_mems_allowed(void); 27void cpuset_init_current_mems_allowed(void);
@@ -69,9 +68,6 @@ struct seq_file;
69extern void cpuset_task_status_allowed(struct seq_file *m, 68extern void cpuset_task_status_allowed(struct seq_file *m,
70 struct task_struct *task); 69 struct task_struct *task);
71 70
72extern void cpuset_lock(void);
73extern void cpuset_unlock(void);
74
75extern int cpuset_mem_spread_node(void); 71extern int cpuset_mem_spread_node(void);
76 72
77static inline int cpuset_do_page_mem_spread(void) 73static inline int cpuset_do_page_mem_spread(void)
@@ -105,10 +101,11 @@ static inline void cpuset_cpus_allowed(struct task_struct *p,
105{ 101{
106 cpumask_copy(mask, cpu_possible_mask); 102 cpumask_copy(mask, cpu_possible_mask);
107} 103}
108static inline void cpuset_cpus_allowed_locked(struct task_struct *p, 104
109 struct cpumask *mask) 105static inline int cpuset_cpus_allowed_fallback(struct task_struct *p)
110{ 106{
111 cpumask_copy(mask, cpu_possible_mask); 107 cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
108 return cpumask_any(cpu_active_mask);
112} 109}
113 110
114static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) 111static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
@@ -157,9 +154,6 @@ static inline void cpuset_task_status_allowed(struct seq_file *m,
157{ 154{
158} 155}
159 156
160static inline void cpuset_lock(void) {}
161static inline void cpuset_unlock(void) {}
162
163static inline int cpuset_mem_spread_node(void) 157static inline int cpuset_mem_spread_node(void)
164{ 158{
165 return 0; 159 return 0;
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index cc12b3c556b3..41e46330d9be 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -82,9 +82,13 @@ void clear_ftrace_function(void);
82extern void ftrace_stub(unsigned long a0, unsigned long a1); 82extern void ftrace_stub(unsigned long a0, unsigned long a1);
83 83
84#else /* !CONFIG_FUNCTION_TRACER */ 84#else /* !CONFIG_FUNCTION_TRACER */
85# define register_ftrace_function(ops) do { } while (0) 85/*
86# define unregister_ftrace_function(ops) do { } while (0) 86 * (un)register_ftrace_function must be a macro since the ops parameter
87# define clear_ftrace_function(ops) do { } while (0) 87 * must not be evaluated.
88 */
89#define register_ftrace_function(ops) ({ 0; })
90#define unregister_ftrace_function(ops) ({ 0; })
91static inline void clear_ftrace_function(void) { }
88static inline void ftrace_kill(void) { } 92static inline void ftrace_kill(void) { }
89static inline void ftrace_stop(void) { } 93static inline void ftrace_stop(void) { }
90static inline void ftrace_start(void) { } 94static inline void ftrace_start(void) { }
@@ -237,11 +241,13 @@ extern int skip_trace(unsigned long ip);
237extern void ftrace_disable_daemon(void); 241extern void ftrace_disable_daemon(void);
238extern void ftrace_enable_daemon(void); 242extern void ftrace_enable_daemon(void);
239#else 243#else
240# define skip_trace(ip) ({ 0; }) 244static inline int skip_trace(unsigned long ip) { return 0; }
241# define ftrace_force_update() ({ 0; }) 245static inline int ftrace_force_update(void) { return 0; }
242# define ftrace_set_filter(buf, len, reset) do { } while (0) 246static inline void ftrace_set_filter(unsigned char *buf, int len, int reset)
243# define ftrace_disable_daemon() do { } while (0) 247{
244# define ftrace_enable_daemon() do { } while (0) 248}
249static inline void ftrace_disable_daemon(void) { }
250static inline void ftrace_enable_daemon(void) { }
245static inline void ftrace_release_mod(struct module *mod) {} 251static inline void ftrace_release_mod(struct module *mod) {}
246static inline int register_ftrace_command(struct ftrace_func_command *cmd) 252static inline int register_ftrace_command(struct ftrace_func_command *cmd)
247{ 253{
@@ -314,16 +320,16 @@ static inline void __ftrace_enabled_restore(int enabled)
314 extern void time_hardirqs_on(unsigned long a0, unsigned long a1); 320 extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
315 extern void time_hardirqs_off(unsigned long a0, unsigned long a1); 321 extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
316#else 322#else
317# define time_hardirqs_on(a0, a1) do { } while (0) 323 static inline void time_hardirqs_on(unsigned long a0, unsigned long a1) { }
318# define time_hardirqs_off(a0, a1) do { } while (0) 324 static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { }
319#endif 325#endif
320 326
321#ifdef CONFIG_PREEMPT_TRACER 327#ifdef CONFIG_PREEMPT_TRACER
322 extern void trace_preempt_on(unsigned long a0, unsigned long a1); 328 extern void trace_preempt_on(unsigned long a0, unsigned long a1);
323 extern void trace_preempt_off(unsigned long a0, unsigned long a1); 329 extern void trace_preempt_off(unsigned long a0, unsigned long a1);
324#else 330#else
325# define trace_preempt_on(a0, a1) do { } while (0) 331 static inline void trace_preempt_on(unsigned long a0, unsigned long a1) { }
326# define trace_preempt_off(a0, a1) do { } while (0) 332 static inline void trace_preempt_off(unsigned long a0, unsigned long a1) { }
327#endif 333#endif
328 334
329#ifdef CONFIG_FTRACE_MCOUNT_RECORD 335#ifdef CONFIG_FTRACE_MCOUNT_RECORD
@@ -352,6 +358,10 @@ struct ftrace_graph_ret {
352 int depth; 358 int depth;
353}; 359};
354 360
361/* Type of the callback handlers for tracing function graph*/
362typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
363typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
364
355#ifdef CONFIG_FUNCTION_GRAPH_TRACER 365#ifdef CONFIG_FUNCTION_GRAPH_TRACER
356 366
357/* for init task */ 367/* for init task */
@@ -400,10 +410,6 @@ extern char __irqentry_text_end[];
400 410
401#define FTRACE_RETFUNC_DEPTH 50 411#define FTRACE_RETFUNC_DEPTH 50
402#define FTRACE_RETSTACK_ALLOC_SIZE 32 412#define FTRACE_RETSTACK_ALLOC_SIZE 32
403/* Type of the callback handlers for tracing function graph*/
404typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
405typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
406
407extern int register_ftrace_graph(trace_func_graph_ret_t retfunc, 413extern int register_ftrace_graph(trace_func_graph_ret_t retfunc,
408 trace_func_graph_ent_t entryfunc); 414 trace_func_graph_ent_t entryfunc);
409 415
@@ -441,6 +447,13 @@ static inline void unpause_graph_tracing(void)
441static inline void ftrace_graph_init_task(struct task_struct *t) { } 447static inline void ftrace_graph_init_task(struct task_struct *t) { }
442static inline void ftrace_graph_exit_task(struct task_struct *t) { } 448static inline void ftrace_graph_exit_task(struct task_struct *t) { }
443 449
450static inline int register_ftrace_graph(trace_func_graph_ret_t retfunc,
451 trace_func_graph_ent_t entryfunc)
452{
453 return -1;
454}
455static inline void unregister_ftrace_graph(void) { }
456
444static inline int task_curr_ret_stack(struct task_struct *tsk) 457static inline int task_curr_ret_stack(struct task_struct *tsk)
445{ 458{
446 return -1; 459 return -1;
@@ -492,7 +505,9 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk)
492 return tsk->trace & TSK_TRACE_FL_GRAPH; 505 return tsk->trace & TSK_TRACE_FL_GRAPH;
493} 506}
494 507
495extern int ftrace_dump_on_oops; 508enum ftrace_dump_mode;
509
510extern enum ftrace_dump_mode ftrace_dump_on_oops;
496 511
497#ifdef CONFIG_PREEMPT 512#ifdef CONFIG_PREEMPT
498#define INIT_TRACE_RECURSION .trace_recursion = 0, 513#define INIT_TRACE_RECURSION .trace_recursion = 0,
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index c0f4b364c711..dc7fc646fa2e 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -58,6 +58,7 @@ struct trace_iterator {
58 /* The below is zeroed out in pipe_read */ 58 /* The below is zeroed out in pipe_read */
59 struct trace_seq seq; 59 struct trace_seq seq;
60 struct trace_entry *ent; 60 struct trace_entry *ent;
61 unsigned long lost_events;
61 int leftover; 62 int leftover;
62 int cpu; 63 int cpu;
63 u64 ts; 64 u64 ts;
@@ -69,18 +70,25 @@ struct trace_iterator {
69}; 70};
70 71
71 72
73struct trace_event;
74
72typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, 75typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
73 int flags); 76 int flags, struct trace_event *event);
74struct trace_event { 77
75 struct hlist_node node; 78struct trace_event_functions {
76 struct list_head list;
77 int type;
78 trace_print_func trace; 79 trace_print_func trace;
79 trace_print_func raw; 80 trace_print_func raw;
80 trace_print_func hex; 81 trace_print_func hex;
81 trace_print_func binary; 82 trace_print_func binary;
82}; 83};
83 84
85struct trace_event {
86 struct hlist_node node;
87 struct list_head list;
88 int type;
89 struct trace_event_functions *funcs;
90};
91
84extern int register_ftrace_event(struct trace_event *event); 92extern int register_ftrace_event(struct trace_event *event);
85extern int unregister_ftrace_event(struct trace_event *event); 93extern int unregister_ftrace_event(struct trace_event *event);
86 94
@@ -112,28 +120,67 @@ void tracing_record_cmdline(struct task_struct *tsk);
112 120
113struct event_filter; 121struct event_filter;
114 122
123enum trace_reg {
124 TRACE_REG_REGISTER,
125 TRACE_REG_UNREGISTER,
126 TRACE_REG_PERF_REGISTER,
127 TRACE_REG_PERF_UNREGISTER,
128};
129
130struct ftrace_event_call;
131
132struct ftrace_event_class {
133 char *system;
134 void *probe;
135#ifdef CONFIG_PERF_EVENTS
136 void *perf_probe;
137#endif
138 int (*reg)(struct ftrace_event_call *event,
139 enum trace_reg type);
140 int (*define_fields)(struct ftrace_event_call *);
141 struct list_head *(*get_fields)(struct ftrace_event_call *);
142 struct list_head fields;
143 int (*raw_init)(struct ftrace_event_call *);
144};
145
146enum {
147 TRACE_EVENT_FL_ENABLED_BIT,
148 TRACE_EVENT_FL_FILTERED_BIT,
149};
150
151enum {
152 TRACE_EVENT_FL_ENABLED = (1 << TRACE_EVENT_FL_ENABLED_BIT),
153 TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT),
154};
155
115struct ftrace_event_call { 156struct ftrace_event_call {
116 struct list_head list; 157 struct list_head list;
158 struct ftrace_event_class *class;
117 char *name; 159 char *name;
118 char *system;
119 struct dentry *dir; 160 struct dentry *dir;
120 struct trace_event *event; 161 struct trace_event event;
121 int enabled;
122 int (*regfunc)(struct ftrace_event_call *);
123 void (*unregfunc)(struct ftrace_event_call *);
124 int id;
125 const char *print_fmt; 162 const char *print_fmt;
126 int (*raw_init)(struct ftrace_event_call *);
127 int (*define_fields)(struct ftrace_event_call *);
128 struct list_head fields;
129 int filter_active;
130 struct event_filter *filter; 163 struct event_filter *filter;
131 void *mod; 164 void *mod;
132 void *data; 165 void *data;
133 166
167 /*
168 * 32 bit flags:
169 * bit 1: enabled
170 * bit 2: filter_active
171 *
172 * Changes to flags must hold the event_mutex.
173 *
174 * Note: Reads of flags do not hold the event_mutex since
175 * they occur in critical sections. But the way flags
176 * is currently used, these changes do not affect the code
177 * except that when a change is made, it may have a slight
178 * delay in propagating the changes to other CPUs due to
179 * caching and such.
180 */
181 unsigned int flags;
182
134 int perf_refcount; 183 int perf_refcount;
135 int (*perf_event_enable)(struct ftrace_event_call *);
136 void (*perf_event_disable)(struct ftrace_event_call *);
137}; 184};
138 185
139#define PERF_MAX_TRACE_SIZE 2048 186#define PERF_MAX_TRACE_SIZE 2048
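
The reworked layout splits what used to live in one large ftrace_event_call: shared callbacks move into an ftrace_event_class, the print helpers into trace_event_functions, and the per-event struct keeps only a class pointer plus an embedded trace_event (the syscalls.h hunk further below is wired exactly this way). A minimal sketch of a hand-rolled event under the new layout follows; every name prefixed foo_ is illustrative, not something this patch defines:

	static struct trace_event_functions foo_print_funcs = {
		.trace		= foo_trace_output,	/* assumed output callback */
	};

	static struct ftrace_event_class foo_class = {
		.system		= "foo",
		.reg		= foo_reg,		/* services enum trace_reg requests */
		.define_fields	= foo_define_fields,
	};

	static struct ftrace_event_call foo_call = {
		.name		= "foo_bar",
		.class		= &foo_class,
		.event.funcs	= &foo_print_funcs,
	};
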
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 9365227dbaf6..9fb1c1299032 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -490,6 +490,13 @@ static inline void tracing_off(void) { }
490static inline void tracing_off_permanent(void) { } 490static inline void tracing_off_permanent(void) { }
491static inline int tracing_is_on(void) { return 0; } 491static inline int tracing_is_on(void) { return 0; }
492#endif 492#endif
493
494enum ftrace_dump_mode {
495 DUMP_NONE,
496 DUMP_ALL,
497 DUMP_ORIG,
498};
499
493#ifdef CONFIG_TRACING 500#ifdef CONFIG_TRACING
494extern void tracing_start(void); 501extern void tracing_start(void);
495extern void tracing_stop(void); 502extern void tracing_stop(void);
@@ -571,7 +578,7 @@ __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap);
571extern int 578extern int
572__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); 579__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);
573 580
574extern void ftrace_dump(void); 581extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode);
575#else 582#else
576static inline void 583static inline void
577ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } 584ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
@@ -592,7 +599,7 @@ ftrace_vprintk(const char *fmt, va_list ap)
592{ 599{
593 return 0; 600 return 0;
594} 601}
595static inline void ftrace_dump(void) { } 602static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
596#endif /* CONFIG_TRACING */ 603#endif /* CONFIG_TRACING */
597 604
598/* 605/*
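
With ftrace_dump() now taking an ftrace_dump_mode, a caller decides whether to dump every CPU's buffer or only the buffer of the CPU that hit the problem (the sysrq handler above passes DUMP_ALL). A minimal sketch, assuming CONFIG_TRACING and a debug path of your own; the helper name is hypothetical:

	#include <linux/kernel.h>

	/* Hypothetical helper, not part of this patch. */
	static void my_oops_dump(bool all_cpus)
	{
		if (all_cpus)
			ftrace_dump(DUMP_ALL);	/* dump the ring buffer of every CPU */
		else
			ftrace_dump(DUMP_ORIG);	/* only the CPU that triggered the dump */
	}
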
diff --git a/include/linux/module.h b/include/linux/module.h
index 515d53ae6a79..6914fcad4673 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -465,8 +465,7 @@ static inline void __module_get(struct module *module)
465 if (module) { 465 if (module) {
466 preempt_disable(); 466 preempt_disable();
467 __this_cpu_inc(module->refptr->incs); 467 __this_cpu_inc(module->refptr->incs);
468 trace_module_get(module, _THIS_IP_, 468 trace_module_get(module, _THIS_IP_);
469 __this_cpu_read(module->refptr->incs));
470 preempt_enable(); 469 preempt_enable();
471 } 470 }
472} 471}
@@ -480,8 +479,7 @@ static inline int try_module_get(struct module *module)
480 479
481 if (likely(module_is_live(module))) { 480 if (likely(module_is_live(module))) {
482 __this_cpu_inc(module->refptr->incs); 481 __this_cpu_inc(module->refptr->incs);
483 trace_module_get(module, _THIS_IP_, 482 trace_module_get(module, _THIS_IP_);
484 __this_cpu_read(module->refptr->incs));
485 } else 483 } else
486 ret = 0; 484 ret = 0;
487 485
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index a5195875480a..0006b2df00e1 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -60,8 +60,6 @@ static inline long rcu_batches_completed_bh(void)
60 return 0; 60 return 0;
61} 61}
62 62
63extern int rcu_expedited_torture_stats(char *page);
64
65static inline void rcu_force_quiescent_state(void) 63static inline void rcu_force_quiescent_state(void)
66{ 64{
67} 65}
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 42cc3a04779e..24e467e526b8 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -35,7 +35,6 @@ struct notifier_block;
35extern void rcu_sched_qs(int cpu); 35extern void rcu_sched_qs(int cpu);
36extern void rcu_bh_qs(int cpu); 36extern void rcu_bh_qs(int cpu);
37extern int rcu_needs_cpu(int cpu); 37extern int rcu_needs_cpu(int cpu);
38extern int rcu_expedited_torture_stats(char *page);
39 38
40#ifdef CONFIG_TREE_PREEMPT_RCU 39#ifdef CONFIG_TREE_PREEMPT_RCU
41 40
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 5fcc31ed5771..25b4f686d918 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -120,12 +120,16 @@ int ring_buffer_write(struct ring_buffer *buffer,
120 unsigned long length, void *data); 120 unsigned long length, void *data);
121 121
122struct ring_buffer_event * 122struct ring_buffer_event *
123ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts); 123ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
124 unsigned long *lost_events);
124struct ring_buffer_event * 125struct ring_buffer_event *
125ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts); 126ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
127 unsigned long *lost_events);
126 128
127struct ring_buffer_iter * 129struct ring_buffer_iter *
128ring_buffer_read_start(struct ring_buffer *buffer, int cpu); 130ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu);
131void ring_buffer_read_prepare_sync(void);
132void ring_buffer_read_start(struct ring_buffer_iter *iter);
129void ring_buffer_read_finish(struct ring_buffer_iter *iter); 133void ring_buffer_read_finish(struct ring_buffer_iter *iter);
130 134
131struct ring_buffer_event * 135struct ring_buffer_event *
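
The reader-side API picks up two changes here: consuming reads can report how many events were lost to overwrites since the previous read, and iterator setup is split into prepare / sync / start so that several per-CPU iterators can share one synchronization. A rough consumer sketch, assuming a buffer and cpu obtained elsewhere:

	struct ring_buffer_event *event;
	struct ring_buffer_iter *iter;
	unsigned long lost = 0;
	u64 ts;

	/* Consuming read; pass NULL as the last argument if lost-event
	 * accounting is not needed. */
	event = ring_buffer_consume(buffer, cpu, &ts, &lost);
	if (event && lost)
		pr_info("cpu %d lost %lu events\n", cpu, lost);

	/* Non-consuming iteration with the new three-step setup. */
	iter = ring_buffer_read_prepare(buffer, cpu);
	ring_buffer_read_prepare_sync();	/* one sync covers all prepared iterators */
	ring_buffer_read_start(iter);
	/* ... walk the buffer, e.g. with ring_buffer_read(iter, &ts) ... */
	ring_buffer_read_finish(iter);
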
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e0447c64af6a..2a5b146fbaf9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -274,11 +274,17 @@ extern cpumask_var_t nohz_cpu_mask;
274#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) 274#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
275extern int select_nohz_load_balancer(int cpu); 275extern int select_nohz_load_balancer(int cpu);
276extern int get_nohz_load_balancer(void); 276extern int get_nohz_load_balancer(void);
277extern int nohz_ratelimit(int cpu);
277#else 278#else
278static inline int select_nohz_load_balancer(int cpu) 279static inline int select_nohz_load_balancer(int cpu)
279{ 280{
280 return 0; 281 return 0;
281} 282}
283
284static inline int nohz_ratelimit(int cpu)
285{
286 return 0;
287}
282#endif 288#endif
283 289
284/* 290/*
@@ -953,6 +959,7 @@ struct sched_domain {
953 char *name; 959 char *name;
954#endif 960#endif
955 961
962 unsigned int span_weight;
956 /* 963 /*
957 * Span of all CPUs in this domain. 964 * Span of all CPUs in this domain.
958 * 965 *
@@ -1025,12 +1032,17 @@ struct sched_domain;
1025#define WF_SYNC 0x01 /* waker goes to sleep after wakup */ 1032#define WF_SYNC 0x01 /* waker goes to sleep after wakup */
1026#define WF_FORK 0x02 /* child wakeup after fork */ 1033#define WF_FORK 0x02 /* child wakeup after fork */
1027 1034
1035#define ENQUEUE_WAKEUP 1
1036#define ENQUEUE_WAKING 2
1037#define ENQUEUE_HEAD 4
1038
1039#define DEQUEUE_SLEEP 1
1040
1028struct sched_class { 1041struct sched_class {
1029 const struct sched_class *next; 1042 const struct sched_class *next;
1030 1043
1031 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup, 1044 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1032 bool head); 1045 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1033 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
1034 void (*yield_task) (struct rq *rq); 1046 void (*yield_task) (struct rq *rq);
1035 1047
1036 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1048 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
@@ -1039,7 +1051,8 @@ struct sched_class {
1039 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1051 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1040 1052
1041#ifdef CONFIG_SMP 1053#ifdef CONFIG_SMP
1042 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 1054 int (*select_task_rq)(struct rq *rq, struct task_struct *p,
1055 int sd_flag, int flags);
1043 1056
1044 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1057 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1045 void (*post_schedule) (struct rq *this_rq); 1058 void (*post_schedule) (struct rq *this_rq);
@@ -1076,36 +1089,8 @@ struct load_weight {
1076 unsigned long weight, inv_weight; 1089 unsigned long weight, inv_weight;
1077}; 1090};
1078 1091
1079/*
1080 * CFS stats for a schedulable entity (task, task-group etc)
1081 *
1082 * Current field usage histogram:
1083 *
1084 * 4 se->block_start
1085 * 4 se->run_node
1086 * 4 se->sleep_start
1087 * 6 se->load.weight
1088 */
1089struct sched_entity {
1090 struct load_weight load; /* for load-balancing */
1091 struct rb_node run_node;
1092 struct list_head group_node;
1093 unsigned int on_rq;
1094
1095 u64 exec_start;
1096 u64 sum_exec_runtime;
1097 u64 vruntime;
1098 u64 prev_sum_exec_runtime;
1099
1100 u64 last_wakeup;
1101 u64 avg_overlap;
1102
1103 u64 nr_migrations;
1104
1105 u64 start_runtime;
1106 u64 avg_wakeup;
1107
1108#ifdef CONFIG_SCHEDSTATS 1092#ifdef CONFIG_SCHEDSTATS
1093struct sched_statistics {
1109 u64 wait_start; 1094 u64 wait_start;
1110 u64 wait_max; 1095 u64 wait_max;
1111 u64 wait_count; 1096 u64 wait_count;
@@ -1137,6 +1122,24 @@ struct sched_entity {
1137 u64 nr_wakeups_affine_attempts; 1122 u64 nr_wakeups_affine_attempts;
1138 u64 nr_wakeups_passive; 1123 u64 nr_wakeups_passive;
1139 u64 nr_wakeups_idle; 1124 u64 nr_wakeups_idle;
1125};
1126#endif
1127
1128struct sched_entity {
1129 struct load_weight load; /* for load-balancing */
1130 struct rb_node run_node;
1131 struct list_head group_node;
1132 unsigned int on_rq;
1133
1134 u64 exec_start;
1135 u64 sum_exec_runtime;
1136 u64 vruntime;
1137 u64 prev_sum_exec_runtime;
1138
1139 u64 nr_migrations;
1140
1141#ifdef CONFIG_SCHEDSTATS
1142 struct sched_statistics statistics;
1140#endif 1143#endif
1141 1144
1142#ifdef CONFIG_FAIR_GROUP_SCHED 1145#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1840,6 +1843,7 @@ extern void sched_clock_idle_sleep_event(void);
1840extern void sched_clock_idle_wakeup_event(u64 delta_ns); 1843extern void sched_clock_idle_wakeup_event(u64 delta_ns);
1841 1844
1842#ifdef CONFIG_HOTPLUG_CPU 1845#ifdef CONFIG_HOTPLUG_CPU
1846extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
1843extern void idle_task_exit(void); 1847extern void idle_task_exit(void);
1844#else 1848#else
1845static inline void idle_task_exit(void) {} 1849static inline void idle_task_exit(void) {}
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index baba3a23a814..6b524a0d02e4 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -1,13 +1,101 @@
1#ifndef _LINUX_STOP_MACHINE 1#ifndef _LINUX_STOP_MACHINE
2#define _LINUX_STOP_MACHINE 2#define _LINUX_STOP_MACHINE
3/* "Bogolock": stop the entire machine, disable interrupts. This is a 3
4 very heavy lock, which is equivalent to grabbing every spinlock
5 (and more). So the "read" side to such a lock is anything which
6 disables preeempt. */
7#include <linux/cpu.h> 4#include <linux/cpu.h>
8#include <linux/cpumask.h> 5#include <linux/cpumask.h>
6#include <linux/list.h>
9#include <asm/system.h> 7#include <asm/system.h>
10 8
9/*
10 * stop_cpu[s]() is simplistic per-cpu maximum priority cpu
11 * monopolization mechanism. The caller can specify a non-sleeping
12 * function to be executed on a single or multiple cpus preempting all
13 * other processes and monopolizing those cpus until it finishes.
14 *
15 * Resources for this mechanism are preallocated when a cpu is brought
16 * up and requests are guaranteed to be served as long as the target
17 * cpus are online.
18 */
19typedef int (*cpu_stop_fn_t)(void *arg);
20
21#ifdef CONFIG_SMP
22
23struct cpu_stop_work {
24 struct list_head list; /* cpu_stopper->works */
25 cpu_stop_fn_t fn;
26 void *arg;
27 struct cpu_stop_done *done;
28};
29
30int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
31void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
32 struct cpu_stop_work *work_buf);
33int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
34int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
35
36#else /* CONFIG_SMP */
37
38#include <linux/workqueue.h>
39
40struct cpu_stop_work {
41 struct work_struct work;
42 cpu_stop_fn_t fn;
43 void *arg;
44};
45
46static inline int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
47{
48 int ret = -ENOENT;
49 preempt_disable();
50 if (cpu == smp_processor_id())
51 ret = fn(arg);
52 preempt_enable();
53 return ret;
54}
55
56static void stop_one_cpu_nowait_workfn(struct work_struct *work)
57{
58 struct cpu_stop_work *stwork =
59 container_of(work, struct cpu_stop_work, work);
60 preempt_disable();
61 stwork->fn(stwork->arg);
62 preempt_enable();
63}
64
65static inline void stop_one_cpu_nowait(unsigned int cpu,
66 cpu_stop_fn_t fn, void *arg,
67 struct cpu_stop_work *work_buf)
68{
69 if (cpu == smp_processor_id()) {
70 INIT_WORK(&work_buf->work, stop_one_cpu_nowait_workfn);
71 work_buf->fn = fn;
72 work_buf->arg = arg;
73 schedule_work(&work_buf->work);
74 }
75}
76
77static inline int stop_cpus(const struct cpumask *cpumask,
78 cpu_stop_fn_t fn, void *arg)
79{
80 if (cpumask_test_cpu(raw_smp_processor_id(), cpumask))
81 return stop_one_cpu(raw_smp_processor_id(), fn, arg);
82 return -ENOENT;
83}
84
85static inline int try_stop_cpus(const struct cpumask *cpumask,
86 cpu_stop_fn_t fn, void *arg)
87{
88 return stop_cpus(cpumask, fn, arg);
89}
90
91#endif /* CONFIG_SMP */
92
93/*
94 * stop_machine "Bogolock": stop the entire machine, disable
95 * interrupts. This is a very heavy lock, which is equivalent to
96 * grabbing every spinlock (and more). So the "read" side to such a
97 * lock is anything which disables preempt.
98 */
11#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) 99#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP)
12 100
13/** 101/**
@@ -36,24 +124,7 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
36 */ 124 */
37int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); 125int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
38 126
39/** 127#else /* CONFIG_STOP_MACHINE && CONFIG_SMP */
40 * stop_machine_create: create all stop_machine threads
41 *
42 * Description: This causes all stop_machine threads to be created before
43 * stop_machine actually gets called. This can be used by subsystems that
44 * need a non failing stop_machine infrastructure.
45 */
46int stop_machine_create(void);
47
48/**
49 * stop_machine_destroy: destroy all stop_machine threads
50 *
51 * Description: This causes all stop_machine threads which were created with
52 * stop_machine_create to be destroyed again.
53 */
54void stop_machine_destroy(void);
55
56#else
57 128
58static inline int stop_machine(int (*fn)(void *), void *data, 129static inline int stop_machine(int (*fn)(void *), void *data,
59 const struct cpumask *cpus) 130 const struct cpumask *cpus)
@@ -65,8 +136,5 @@ static inline int stop_machine(int (*fn)(void *), void *data,
65 return ret; 136 return ret;
66} 137}
67 138
68static inline int stop_machine_create(void) { return 0; } 139#endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */
69static inline void stop_machine_destroy(void) { } 140#endif /* _LINUX_STOP_MACHINE */
70
71#endif /* CONFIG_SMP */
72#endif /* _LINUX_STOP_MACHINE */
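
The header comment above describes the new cpu_stop facility; in practice a caller hands a non-sleeping callback to stop_one_cpu() and gets the callback's return value back once it has run with exclusive use of the target CPU. A minimal sketch with a made-up callback and data:

	/* Hypothetical cpu_stop_fn_t: must not sleep, runs while the
	 * target CPU is monopolized. */
	static int bump_counter(void *arg)
	{
		int *counter = arg;

		(*counter)++;
		return 0;	/* propagated as stop_one_cpu()'s return value */
	}

	static int run_on_cpu1(void)
	{
		int count = 0;

		/* Preempts everything else on CPU 1 until bump_counter() returns;
		 * fails if CPU 1 is not online. */
		return stop_one_cpu(1, bump_counter, &count);
	}
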
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 057929b0a651..a1a86a53bc73 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -103,22 +103,6 @@ struct perf_event_attr;
103#define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) 103#define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__)
104#define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) 104#define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
105 105
106#ifdef CONFIG_PERF_EVENTS
107
108#define TRACE_SYS_ENTER_PERF_INIT(sname) \
109 .perf_event_enable = perf_sysenter_enable, \
110 .perf_event_disable = perf_sysenter_disable,
111
112#define TRACE_SYS_EXIT_PERF_INIT(sname) \
113 .perf_event_enable = perf_sysexit_enable, \
114 .perf_event_disable = perf_sysexit_disable,
115#else
116#define TRACE_SYS_ENTER_PERF(sname)
117#define TRACE_SYS_ENTER_PERF_INIT(sname)
118#define TRACE_SYS_EXIT_PERF(sname)
119#define TRACE_SYS_EXIT_PERF_INIT(sname)
120#endif /* CONFIG_PERF_EVENTS */
121
122#ifdef CONFIG_FTRACE_SYSCALLS 106#ifdef CONFIG_FTRACE_SYSCALLS
123#define __SC_STR_ADECL1(t, a) #a 107#define __SC_STR_ADECL1(t, a) #a
124#define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__) 108#define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__)
@@ -134,54 +118,43 @@ struct perf_event_attr;
134#define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__) 118#define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__)
135#define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__) 119#define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__)
136 120
121extern struct ftrace_event_class event_class_syscall_enter;
122extern struct ftrace_event_class event_class_syscall_exit;
123extern struct trace_event_functions enter_syscall_print_funcs;
124extern struct trace_event_functions exit_syscall_print_funcs;
125
137#define SYSCALL_TRACE_ENTER_EVENT(sname) \ 126#define SYSCALL_TRACE_ENTER_EVENT(sname) \
138 static const struct syscall_metadata __syscall_meta_##sname; \ 127 static struct syscall_metadata __syscall_meta_##sname; \
139 static struct ftrace_event_call \ 128 static struct ftrace_event_call \
140 __attribute__((__aligned__(4))) event_enter_##sname; \ 129 __attribute__((__aligned__(4))) event_enter_##sname; \
141 static struct trace_event enter_syscall_print_##sname = { \
142 .trace = print_syscall_enter, \
143 }; \
144 static struct ftrace_event_call __used \ 130 static struct ftrace_event_call __used \
145 __attribute__((__aligned__(4))) \ 131 __attribute__((__aligned__(4))) \
146 __attribute__((section("_ftrace_events"))) \ 132 __attribute__((section("_ftrace_events"))) \
147 event_enter_##sname = { \ 133 event_enter_##sname = { \
148 .name = "sys_enter"#sname, \ 134 .name = "sys_enter"#sname, \
149 .system = "syscalls", \ 135 .class = &event_class_syscall_enter, \
150 .event = &enter_syscall_print_##sname, \ 136 .event.funcs = &enter_syscall_print_funcs, \
151 .raw_init = init_syscall_trace, \
152 .define_fields = syscall_enter_define_fields, \
153 .regfunc = reg_event_syscall_enter, \
154 .unregfunc = unreg_event_syscall_enter, \
155 .data = (void *)&__syscall_meta_##sname,\ 137 .data = (void *)&__syscall_meta_##sname,\
156 TRACE_SYS_ENTER_PERF_INIT(sname) \
157 } 138 }
158 139
159#define SYSCALL_TRACE_EXIT_EVENT(sname) \ 140#define SYSCALL_TRACE_EXIT_EVENT(sname) \
160 static const struct syscall_metadata __syscall_meta_##sname; \ 141 static struct syscall_metadata __syscall_meta_##sname; \
161 static struct ftrace_event_call \ 142 static struct ftrace_event_call \
162 __attribute__((__aligned__(4))) event_exit_##sname; \ 143 __attribute__((__aligned__(4))) event_exit_##sname; \
163 static struct trace_event exit_syscall_print_##sname = { \
164 .trace = print_syscall_exit, \
165 }; \
166 static struct ftrace_event_call __used \ 144 static struct ftrace_event_call __used \
167 __attribute__((__aligned__(4))) \ 145 __attribute__((__aligned__(4))) \
168 __attribute__((section("_ftrace_events"))) \ 146 __attribute__((section("_ftrace_events"))) \
169 event_exit_##sname = { \ 147 event_exit_##sname = { \
170 .name = "sys_exit"#sname, \ 148 .name = "sys_exit"#sname, \
171 .system = "syscalls", \ 149 .class = &event_class_syscall_exit, \
172 .event = &exit_syscall_print_##sname, \ 150 .event.funcs = &exit_syscall_print_funcs, \
173 .raw_init = init_syscall_trace, \
174 .define_fields = syscall_exit_define_fields, \
175 .regfunc = reg_event_syscall_exit, \
176 .unregfunc = unreg_event_syscall_exit, \
177 .data = (void *)&__syscall_meta_##sname,\ 151 .data = (void *)&__syscall_meta_##sname,\
178 TRACE_SYS_EXIT_PERF_INIT(sname) \
179 } 152 }
180 153
181#define SYSCALL_METADATA(sname, nb) \ 154#define SYSCALL_METADATA(sname, nb) \
182 SYSCALL_TRACE_ENTER_EVENT(sname); \ 155 SYSCALL_TRACE_ENTER_EVENT(sname); \
183 SYSCALL_TRACE_EXIT_EVENT(sname); \ 156 SYSCALL_TRACE_EXIT_EVENT(sname); \
184 static const struct syscall_metadata __used \ 157 static struct syscall_metadata __used \
185 __attribute__((__aligned__(4))) \ 158 __attribute__((__aligned__(4))) \
186 __attribute__((section("__syscalls_metadata"))) \ 159 __attribute__((section("__syscalls_metadata"))) \
187 __syscall_meta_##sname = { \ 160 __syscall_meta_##sname = { \
@@ -191,12 +164,14 @@ struct perf_event_attr;
191 .args = args_##sname, \ 164 .args = args_##sname, \
192 .enter_event = &event_enter_##sname, \ 165 .enter_event = &event_enter_##sname, \
193 .exit_event = &event_exit_##sname, \ 166 .exit_event = &event_exit_##sname, \
167 .enter_fields = LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \
168 .exit_fields = LIST_HEAD_INIT(__syscall_meta_##sname.exit_fields), \
194 }; 169 };
195 170
196#define SYSCALL_DEFINE0(sname) \ 171#define SYSCALL_DEFINE0(sname) \
197 SYSCALL_TRACE_ENTER_EVENT(_##sname); \ 172 SYSCALL_TRACE_ENTER_EVENT(_##sname); \
198 SYSCALL_TRACE_EXIT_EVENT(_##sname); \ 173 SYSCALL_TRACE_EXIT_EVENT(_##sname); \
199 static const struct syscall_metadata __used \ 174 static struct syscall_metadata __used \
200 __attribute__((__aligned__(4))) \ 175 __attribute__((__aligned__(4))) \
201 __attribute__((section("__syscalls_metadata"))) \ 176 __attribute__((section("__syscalls_metadata"))) \
202 __syscall_meta__##sname = { \ 177 __syscall_meta__##sname = { \
@@ -204,6 +179,8 @@ struct perf_event_attr;
204 .nb_args = 0, \ 179 .nb_args = 0, \
205 .enter_event = &event_enter__##sname, \ 180 .enter_event = &event_enter__##sname, \
206 .exit_event = &event_exit__##sname, \ 181 .exit_event = &event_exit__##sname, \
182 .enter_fields = LIST_HEAD_INIT(__syscall_meta__##sname.enter_fields), \
183 .exit_fields = LIST_HEAD_INIT(__syscall_meta__##sname.exit_fields), \
207 }; \ 184 }; \
208 asmlinkage long sys_##sname(void) 185 asmlinkage long sys_##sname(void)
209#else 186#else
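For reference, a hypothetical syscall definition showing what the slimmed-down macros now attach (assuming CONFIG_FTRACE_SYSCALLS; the syscall itself is made up for the example):

#include <linux/syscalls.h>

/* Hypothetical syscall, used only to show the macro expansion. */
SYSCALL_DEFINE2(example_op, unsigned int, fd, unsigned long, flags)
{
        /* Besides sys_example_op(), the macro now emits the per-syscall
         * metadata record (its enter_fields/exit_fields list heads
         * initialized via LIST_HEAD_INIT) plus enter/exit ftrace_event_call
         * entries bound to the shared event_class_syscall_enter/exit
         * classes, instead of per-event register/unregister callbacks. */
        return 0;
}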
diff --git a/include/linux/tick.h b/include/linux/tick.h
index d2ae79e21be3..b232ccc0ee29 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -42,6 +42,7 @@ enum tick_nohz_mode {
42 * @idle_waketime: Time when the idle was interrupted 42 * @idle_waketime: Time when the idle was interrupted
43 * @idle_exittime: Time when the idle state was left 43 * @idle_exittime: Time when the idle state was left
44 * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped 44 * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
45 * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
45 * @sleep_length: Duration of the current idle sleep 46 * @sleep_length: Duration of the current idle sleep
46 * @do_timer_lst: CPU was the last one doing do_timer before going idle 47 * @do_timer_lst: CPU was the last one doing do_timer before going idle
47 */ 48 */
@@ -60,7 +61,7 @@ struct tick_sched {
60 ktime_t idle_waketime; 61 ktime_t idle_waketime;
61 ktime_t idle_exittime; 62 ktime_t idle_exittime;
62 ktime_t idle_sleeptime; 63 ktime_t idle_sleeptime;
63 ktime_t idle_lastupdate; 64 ktime_t iowait_sleeptime;
64 ktime_t sleep_length; 65 ktime_t sleep_length;
65 unsigned long last_jiffies; 66 unsigned long last_jiffies;
66 unsigned long next_jiffies; 67 unsigned long next_jiffies;
@@ -124,6 +125,7 @@ extern void tick_nohz_stop_sched_tick(int inidle);
124extern void tick_nohz_restart_sched_tick(void); 125extern void tick_nohz_restart_sched_tick(void);
125extern ktime_t tick_nohz_get_sleep_length(void); 126extern ktime_t tick_nohz_get_sleep_length(void);
126extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); 127extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
128extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
127# else 129# else
128static inline void tick_nohz_stop_sched_tick(int inidle) { } 130static inline void tick_nohz_stop_sched_tick(int inidle) { }
129static inline void tick_nohz_restart_sched_tick(void) { } 131static inline void tick_nohz_restart_sched_tick(void) { }
@@ -134,6 +136,7 @@ static inline ktime_t tick_nohz_get_sleep_length(void)
134 return len; 136 return len;
135} 137}
136static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } 138static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
139static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
137# endif /* !NO_HZ */ 140# endif /* !NO_HZ */
138 141
139#endif 142#endif
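A rough usage sketch for the new helper; the reporting function is invented, and the -1 check mirrors the !NO_HZ stub above.

#include <linux/kernel.h>
#include <linux/tick.h>

static void example_report_iowait(int cpu)
{
        u64 wall_us;
        u64 iowait_us = get_cpu_iowait_time_us(cpu, &wall_us);

        if (iowait_us == (u64)-1)
                return;         /* NO_HZ not configured: stub returns -1 */

        pr_info("cpu%d: %llu us idle with IO outstanding, last update %llu us\n",
                cpu, (unsigned long long)iowait_us,
                (unsigned long long)wall_us);
}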
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 78b4bd3be496..9a59d1f98cd4 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -20,12 +20,17 @@
20struct module; 20struct module;
21struct tracepoint; 21struct tracepoint;
22 22
23struct tracepoint_func {
24 void *func;
25 void *data;
26};
27
23struct tracepoint { 28struct tracepoint {
24 const char *name; /* Tracepoint name */ 29 const char *name; /* Tracepoint name */
25 int state; /* State. */ 30 int state; /* State. */
26 void (*regfunc)(void); 31 void (*regfunc)(void);
27 void (*unregfunc)(void); 32 void (*unregfunc)(void);
28 void **funcs; 33 struct tracepoint_func *funcs;
29} __attribute__((aligned(32))); /* 34} __attribute__((aligned(32))); /*
30 * Aligned on 32 bytes because it is 35 * Aligned on 32 bytes because it is
31 * globally visible and gcc happily 36 * globally visible and gcc happily
@@ -33,6 +38,68 @@ struct tracepoint {
33 * Keep in sync with vmlinux.lds.h. 38 * Keep in sync with vmlinux.lds.h.
34 */ 39 */
35 40
41/*
42 * Connect a probe to a tracepoint.
43 * Internal API, should not be used directly.
44 */
45extern int tracepoint_probe_register(const char *name, void *probe, void *data);
46
47/*
48 * Disconnect a probe from a tracepoint.
49 * Internal API, should not be used directly.
50 */
51extern int
52tracepoint_probe_unregister(const char *name, void *probe, void *data);
53
54extern int tracepoint_probe_register_noupdate(const char *name, void *probe,
55 void *data);
56extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
57 void *data);
58extern void tracepoint_probe_update_all(void);
59
60struct tracepoint_iter {
61 struct module *module;
62 struct tracepoint *tracepoint;
63};
64
65extern void tracepoint_iter_start(struct tracepoint_iter *iter);
66extern void tracepoint_iter_next(struct tracepoint_iter *iter);
67extern void tracepoint_iter_stop(struct tracepoint_iter *iter);
68extern void tracepoint_iter_reset(struct tracepoint_iter *iter);
69extern int tracepoint_get_iter_range(struct tracepoint **tracepoint,
70 struct tracepoint *begin, struct tracepoint *end);
71
72/*
73 * tracepoint_synchronize_unregister must be called between the last tracepoint
74 * probe unregistration and the end of module exit to make sure there is no
75 * caller executing a probe when it is freed.
76 */
77static inline void tracepoint_synchronize_unregister(void)
78{
79 synchronize_sched();
80}
81
82#define PARAMS(args...) args
83
84#ifdef CONFIG_TRACEPOINTS
85extern void tracepoint_update_probe_range(struct tracepoint *begin,
86 struct tracepoint *end);
87#else
88static inline void tracepoint_update_probe_range(struct tracepoint *begin,
89 struct tracepoint *end)
90{ }
91#endif /* CONFIG_TRACEPOINTS */
92
93#endif /* _LINUX_TRACEPOINT_H */
94
95/*
96 * Note: we keep the TRACE_EVENT and DECLARE_TRACE outside the include
97 * file ifdef protection.
98 * This is due to the way trace events work. If a file includes two
99 * trace event headers under one "CREATE_TRACE_POINTS" the first include
100 * will override the TRACE_EVENT and break the second include.
101 */
102
36#ifndef DECLARE_TRACE 103#ifndef DECLARE_TRACE
37 104
38#define TP_PROTO(args...) args 105#define TP_PROTO(args...) args
@@ -43,17 +110,27 @@ struct tracepoint {
43/* 110/*
44 * it_func[0] is never NULL because there is at least one element in the array 111 * it_func[0] is never NULL because there is at least one element in the array
45 * when the array itself is non NULL. 112 * when the array itself is non NULL.
113 *
114 * Note, the proto and args passed in include "__data" as the first parameter.
115 * The reason for this is to handle the "void" prototype. If a tracepoint
116 * has a "void" prototype, then it is invalid to declare a function
117 * as "(void *, void)". The DECLARE_TRACE_NOARGS() will pass in just
118 * "void *data", whereas the DECLARE_TRACE() will pass in "void *data, proto".
46 */ 119 */
47#define __DO_TRACE(tp, proto, args) \ 120#define __DO_TRACE(tp, proto, args) \
48 do { \ 121 do { \
49 void **it_func; \ 122 struct tracepoint_func *it_func_ptr; \
123 void *it_func; \
124 void *__data; \
50 \ 125 \
51 rcu_read_lock_sched_notrace(); \ 126 rcu_read_lock_sched_notrace(); \
52 it_func = rcu_dereference_sched((tp)->funcs); \ 127 it_func_ptr = rcu_dereference_sched((tp)->funcs); \
53 if (it_func) { \ 128 if (it_func_ptr) { \
54 do { \ 129 do { \
55 ((void(*)(proto))(*it_func))(args); \ 130 it_func = (it_func_ptr)->func; \
56 } while (*(++it_func)); \ 131 __data = (it_func_ptr)->data; \
132 ((void(*)(proto))(it_func))(args); \
133 } while ((++it_func_ptr)->func); \
57 } \ 134 } \
58 rcu_read_unlock_sched_notrace(); \ 135 rcu_read_unlock_sched_notrace(); \
59 } while (0) 136 } while (0)
@@ -63,24 +140,32 @@ struct tracepoint {
63 * not add unwanted padding between the beginning of the section and the 140 * not add unwanted padding between the beginning of the section and the
64 * structure. Force alignment to the same alignment as the section start. 141 * structure. Force alignment to the same alignment as the section start.
65 */ 142 */
66#define DECLARE_TRACE(name, proto, args) \ 143#define __DECLARE_TRACE(name, proto, args, data_proto, data_args) \
67 extern struct tracepoint __tracepoint_##name; \ 144 extern struct tracepoint __tracepoint_##name; \
68 static inline void trace_##name(proto) \ 145 static inline void trace_##name(proto) \
69 { \ 146 { \
70 if (unlikely(__tracepoint_##name.state)) \ 147 if (unlikely(__tracepoint_##name.state)) \
71 __DO_TRACE(&__tracepoint_##name, \ 148 __DO_TRACE(&__tracepoint_##name, \
72 TP_PROTO(proto), TP_ARGS(args)); \ 149 TP_PROTO(data_proto), \
150 TP_ARGS(data_args)); \
73 } \ 151 } \
74 static inline int register_trace_##name(void (*probe)(proto)) \ 152 static inline int \
153 register_trace_##name(void (*probe)(data_proto), void *data) \
75 { \ 154 { \
76 return tracepoint_probe_register(#name, (void *)probe); \ 155 return tracepoint_probe_register(#name, (void *)probe, \
156 data); \
77 } \ 157 } \
78 static inline int unregister_trace_##name(void (*probe)(proto)) \ 158 static inline int \
159 unregister_trace_##name(void (*probe)(data_proto), void *data) \
160 { \
161 return tracepoint_probe_unregister(#name, (void *)probe, \
162 data); \
163 } \
164 static inline void \
165 check_trace_callback_type_##name(void (*cb)(data_proto)) \
79 { \ 166 { \
80 return tracepoint_probe_unregister(#name, (void *)probe);\
81 } 167 }
82 168
83
84#define DEFINE_TRACE_FN(name, reg, unreg) \ 169#define DEFINE_TRACE_FN(name, reg, unreg) \
85 static const char __tpstrtab_##name[] \ 170 static const char __tpstrtab_##name[] \
86 __attribute__((section("__tracepoints_strings"))) = #name; \ 171 __attribute__((section("__tracepoints_strings"))) = #name; \
@@ -96,22 +181,24 @@ struct tracepoint {
96#define EXPORT_TRACEPOINT_SYMBOL(name) \ 181#define EXPORT_TRACEPOINT_SYMBOL(name) \
97 EXPORT_SYMBOL(__tracepoint_##name) 182 EXPORT_SYMBOL(__tracepoint_##name)
98 183
99extern void tracepoint_update_probe_range(struct tracepoint *begin,
100 struct tracepoint *end);
101
102#else /* !CONFIG_TRACEPOINTS */ 184#else /* !CONFIG_TRACEPOINTS */
103#define DECLARE_TRACE(name, proto, args) \ 185#define __DECLARE_TRACE(name, proto, args, data_proto, data_args) \
104 static inline void _do_trace_##name(struct tracepoint *tp, proto) \
105 { } \
106 static inline void trace_##name(proto) \ 186 static inline void trace_##name(proto) \
107 { } \ 187 { } \
108 static inline int register_trace_##name(void (*probe)(proto)) \ 188 static inline int \
189 register_trace_##name(void (*probe)(data_proto), \
190 void *data) \
109 { \ 191 { \
110 return -ENOSYS; \ 192 return -ENOSYS; \
111 } \ 193 } \
112 static inline int unregister_trace_##name(void (*probe)(proto)) \ 194 static inline int \
195 unregister_trace_##name(void (*probe)(data_proto), \
196 void *data) \
113 { \ 197 { \
114 return -ENOSYS; \ 198 return -ENOSYS; \
199 } \
200 static inline void check_trace_callback_type_##name(void (*cb)(data_proto)) \
201 { \
115 } 202 }
116 203
117#define DEFINE_TRACE_FN(name, reg, unreg) 204#define DEFINE_TRACE_FN(name, reg, unreg)
@@ -119,60 +206,31 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin,
119#define EXPORT_TRACEPOINT_SYMBOL_GPL(name) 206#define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
120#define EXPORT_TRACEPOINT_SYMBOL(name) 207#define EXPORT_TRACEPOINT_SYMBOL(name)
121 208
122static inline void tracepoint_update_probe_range(struct tracepoint *begin,
123 struct tracepoint *end)
124{ }
125#endif /* CONFIG_TRACEPOINTS */ 209#endif /* CONFIG_TRACEPOINTS */
126#endif /* DECLARE_TRACE */
127
128/*
129 * Connect a probe to a tracepoint.
130 * Internal API, should not be used directly.
131 */
132extern int tracepoint_probe_register(const char *name, void *probe);
133
134/*
135 * Disconnect a probe from a tracepoint.
136 * Internal API, should not be used directly.
137 */
138extern int tracepoint_probe_unregister(const char *name, void *probe);
139
140extern int tracepoint_probe_register_noupdate(const char *name, void *probe);
141extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe);
142extern void tracepoint_probe_update_all(void);
143
144struct tracepoint_iter {
145 struct module *module;
146 struct tracepoint *tracepoint;
147};
148
149extern void tracepoint_iter_start(struct tracepoint_iter *iter);
150extern void tracepoint_iter_next(struct tracepoint_iter *iter);
151extern void tracepoint_iter_stop(struct tracepoint_iter *iter);
152extern void tracepoint_iter_reset(struct tracepoint_iter *iter);
153extern int tracepoint_get_iter_range(struct tracepoint **tracepoint,
154 struct tracepoint *begin, struct tracepoint *end);
155 210
156/* 211/*
157 * tracepoint_synchronize_unregister must be called between the last tracepoint 212 * The need for the DECLARE_TRACE_NOARGS() is to handle the prototype
158 * probe unregistration and the end of module exit to make sure there is no 213 * (void). "void" is a special value in a function prototype and can
159 * caller executing a probe when it is freed. 214 * not be combined with other arguments. Since the DECLARE_TRACE()
215 * macro adds a data element at the beginning of the prototype,
216 * we need a way to differentiate "(void *data, proto)" from
217 * "(void *data, void)". The second prototype is invalid.
218 *
219 * DECLARE_TRACE_NOARGS() passes "void" as the tracepoint prototype
220 * and "void *__data" as the callback prototype.
221 *
222 * DECLARE_TRACE() passes "proto" as the tracepoint prototype and
223 * "void *__data, proto" as the callback prototype.
160 */ 224 */
161static inline void tracepoint_synchronize_unregister(void) 225#define DECLARE_TRACE_NOARGS(name) \
162{ 226 __DECLARE_TRACE(name, void, , void *__data, __data)
163 synchronize_sched();
164}
165 227
166#define PARAMS(args...) args 228#define DECLARE_TRACE(name, proto, args) \
167 229 __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \
168#endif /* _LINUX_TRACEPOINT_H */ 230 PARAMS(void *__data, proto), \
231 PARAMS(__data, args))
169 232
170/* 233#endif /* DECLARE_TRACE */
171 * Note: we keep the TRACE_EVENT outside the include file ifdef protection.
172 * This is due to the way trace events work. If a file includes two
173 * trace event headers under one "CREATE_TRACE_POINTS" the first include
174 * will override the TRACE_EVENT and break the second include.
175 */
176 234
177#ifndef TRACE_EVENT 235#ifndef TRACE_EVENT
178/* 236/*
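A sketch of the new probe calling convention against the reworked sched_switch tracepoint from this series: the probe now receives the registration cookie as its leading argument, and register/unregister take the extra void *data. The counter and module wrapper are invented for illustration.

#include <linux/init.h>
#include <linux/module.h>
#include <asm/atomic.h>
#include <trace/events/sched.h>

static atomic_t example_switch_count = ATOMIC_INIT(0);

/* The leading void * is the data pointer supplied at registration time. */
static void example_probe(void *data, struct task_struct *prev,
                          struct task_struct *next)
{
        atomic_t *count = data;

        atomic_inc(count);
}

static int __init example_init(void)
{
        /* Second argument is the new per-probe cookie. */
        return register_trace_sched_switch(example_probe,
                                           &example_switch_count);
}
module_init(example_init);

static void __exit example_exit(void)
{
        unregister_trace_sched_switch(example_probe, &example_switch_count);
        /* Make sure no probe is still running before the module goes away. */
        tracepoint_synchronize_unregister();
}
module_exit(example_exit);

MODULE_LICENSE("GPL");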
diff --git a/include/linux/wait.h b/include/linux/wait.h
index a48e16b77d5e..76d96d035ea0 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -127,12 +127,26 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
127/* 127/*
128 * Used for wake-one threads: 128 * Used for wake-one threads:
129 */ 129 */
130static inline void __add_wait_queue_exclusive(wait_queue_head_t *q,
131 wait_queue_t *wait)
132{
133 wait->flags |= WQ_FLAG_EXCLUSIVE;
134 __add_wait_queue(q, wait);
135}
136
130static inline void __add_wait_queue_tail(wait_queue_head_t *head, 137static inline void __add_wait_queue_tail(wait_queue_head_t *head,
131 wait_queue_t *new) 138 wait_queue_t *new)
132{ 139{
133 list_add_tail(&new->task_list, &head->task_list); 140 list_add_tail(&new->task_list, &head->task_list);
134} 141}
135 142
143static inline void __add_wait_queue_tail_exclusive(wait_queue_head_t *q,
144 wait_queue_t *wait)
145{
146 wait->flags |= WQ_FLAG_EXCLUSIVE;
147 __add_wait_queue_tail(q, wait);
148}
149
136static inline void __remove_wait_queue(wait_queue_head_t *head, 150static inline void __remove_wait_queue(wait_queue_head_t *head,
137 wait_queue_t *old) 151 wait_queue_t *old)
138{ 152{
@@ -404,25 +418,6 @@ do { \
404}) 418})
405 419
406/* 420/*
407 * Must be called with the spinlock in the wait_queue_head_t held.
408 */
409static inline void add_wait_queue_exclusive_locked(wait_queue_head_t *q,
410 wait_queue_t * wait)
411{
412 wait->flags |= WQ_FLAG_EXCLUSIVE;
413 __add_wait_queue_tail(q, wait);
414}
415
416/*
417 * Must be called with the spinlock in the wait_queue_head_t held.
418 */
419static inline void remove_wait_queue_locked(wait_queue_head_t *q,
420 wait_queue_t * wait)
421{
422 __remove_wait_queue(q, wait);
423}
424
425/*
426 * These are the old interfaces to sleep waiting for an event. 421 * These are the old interfaces to sleep waiting for an event.
427 * They are racy. DO NOT use them, use the wait_event* interfaces above. 422 * They are racy. DO NOT use them, use the wait_event* interfaces above.
428 * We plan to remove these interfaces. 423 * We plan to remove these interfaces.
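For context, a rough sketch of the wake-one pattern the new helpers support, still called with the waitqueue spinlock held; the waitqueue, condition and function names are invented, and this covers the job the removed add_wait_queue_exclusive_locked() used to do.

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static int example_cond;

static void example_wait_exclusive(void)
{
        DECLARE_WAITQUEUE(wait, current);

        spin_lock_irq(&example_wq.lock);
        /* Queue at the tail as a wake-one waiter. */
        __add_wait_queue_tail_exclusive(&example_wq, &wait);
        while (!example_cond) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                spin_unlock_irq(&example_wq.lock);
                schedule();
                spin_lock_irq(&example_wq.lock);
        }
        set_current_state(TASK_RUNNING);
        __remove_wait_queue(&example_wq, &wait);
        spin_unlock_irq(&example_wq.lock);
}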
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 5acfb1eb4df9..1dfab5401511 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -65,6 +65,10 @@
65 65
66#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 66#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
67 67
68/* Make all open coded DECLARE_TRACE nops */
69#undef DECLARE_TRACE
70#define DECLARE_TRACE(name, proto, args)
71
68#ifdef CONFIG_EVENT_TRACING 72#ifdef CONFIG_EVENT_TRACING
69#include <trace/ftrace.h> 73#include <trace/ftrace.h>
70#endif 74#endif
@@ -75,6 +79,7 @@
75#undef DEFINE_EVENT 79#undef DEFINE_EVENT
76#undef DEFINE_EVENT_PRINT 80#undef DEFINE_EVENT_PRINT
77#undef TRACE_HEADER_MULTI_READ 81#undef TRACE_HEADER_MULTI_READ
82#undef DECLARE_TRACE
78 83
79/* Only undef what we defined in this file */ 84/* Only undef what we defined in this file */
80#ifdef UNDEF_TRACE_INCLUDE_FILE 85#ifdef UNDEF_TRACE_INCLUDE_FILE
diff --git a/include/trace/events/module.h b/include/trace/events/module.h
index 4b0f48ba16a6..c7bb2f0482fe 100644
--- a/include/trace/events/module.h
+++ b/include/trace/events/module.h
@@ -51,11 +51,14 @@ TRACE_EVENT(module_free,
51 TP_printk("%s", __get_str(name)) 51 TP_printk("%s", __get_str(name))
52); 52);
53 53
54#ifdef CONFIG_MODULE_UNLOAD
55/* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */
56
54DECLARE_EVENT_CLASS(module_refcnt, 57DECLARE_EVENT_CLASS(module_refcnt,
55 58
56 TP_PROTO(struct module *mod, unsigned long ip, int refcnt), 59 TP_PROTO(struct module *mod, unsigned long ip),
57 60
58 TP_ARGS(mod, ip, refcnt), 61 TP_ARGS(mod, ip),
59 62
60 TP_STRUCT__entry( 63 TP_STRUCT__entry(
61 __field( unsigned long, ip ) 64 __field( unsigned long, ip )
@@ -65,7 +68,7 @@ DECLARE_EVENT_CLASS(module_refcnt,
65 68
66 TP_fast_assign( 69 TP_fast_assign(
67 __entry->ip = ip; 70 __entry->ip = ip;
68 __entry->refcnt = refcnt; 71 __entry->refcnt = __this_cpu_read(mod->refptr->incs) + __this_cpu_read(mod->refptr->decs);
69 __assign_str(name, mod->name); 72 __assign_str(name, mod->name);
70 ), 73 ),
71 74
@@ -75,17 +78,18 @@ DECLARE_EVENT_CLASS(module_refcnt,
75 78
76DEFINE_EVENT(module_refcnt, module_get, 79DEFINE_EVENT(module_refcnt, module_get,
77 80
78 TP_PROTO(struct module *mod, unsigned long ip, int refcnt), 81 TP_PROTO(struct module *mod, unsigned long ip),
79 82
80 TP_ARGS(mod, ip, refcnt) 83 TP_ARGS(mod, ip)
81); 84);
82 85
83DEFINE_EVENT(module_refcnt, module_put, 86DEFINE_EVENT(module_refcnt, module_put,
84 87
85 TP_PROTO(struct module *mod, unsigned long ip, int refcnt), 88 TP_PROTO(struct module *mod, unsigned long ip),
86 89
87 TP_ARGS(mod, ip, refcnt) 90 TP_ARGS(mod, ip)
88); 91);
92#endif /* CONFIG_MODULE_UNLOAD */
89 93
90TRACE_EVENT(module_request, 94TRACE_EVENT(module_request,
91 95
diff --git a/include/trace/events/napi.h b/include/trace/events/napi.h
index a8989c4547e7..188deca2f3c7 100644
--- a/include/trace/events/napi.h
+++ b/include/trace/events/napi.h
@@ -1,4 +1,7 @@
1#ifndef _TRACE_NAPI_H_ 1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM napi
3
4#if !defined(_TRACE_NAPI_H) || defined(TRACE_HEADER_MULTI_READ)
2#define _TRACE_NAPI_H_ 5#define _TRACE_NAPI_H_
3 6
4#include <linux/netdevice.h> 7#include <linux/netdevice.h>
@@ -8,4 +11,7 @@ DECLARE_TRACE(napi_poll,
8 TP_PROTO(struct napi_struct *napi), 11 TP_PROTO(struct napi_struct *napi),
9 TP_ARGS(napi)); 12 TP_ARGS(napi));
10 13
11#endif 14#endif /* _TRACE_NAPI_H_ */
15
16/* This part must be outside protection */
17#include <trace/define_trace.h>
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index cfceb0b73e20..4f733ecea46e 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -51,15 +51,12 @@ TRACE_EVENT(sched_kthread_stop_ret,
51 51
52/* 52/*
53 * Tracepoint for waiting on task to unschedule: 53 * Tracepoint for waiting on task to unschedule:
54 *
55 * (NOTE: the 'rq' argument is not used by generic trace events,
56 * but used by the latency tracer plugin. )
57 */ 54 */
58TRACE_EVENT(sched_wait_task, 55TRACE_EVENT(sched_wait_task,
59 56
60 TP_PROTO(struct rq *rq, struct task_struct *p), 57 TP_PROTO(struct task_struct *p),
61 58
62 TP_ARGS(rq, p), 59 TP_ARGS(p),
63 60
64 TP_STRUCT__entry( 61 TP_STRUCT__entry(
65 __array( char, comm, TASK_COMM_LEN ) 62 __array( char, comm, TASK_COMM_LEN )
@@ -79,15 +76,12 @@ TRACE_EVENT(sched_wait_task,
79 76
80/* 77/*
81 * Tracepoint for waking up a task: 78 * Tracepoint for waking up a task:
82 *
83 * (NOTE: the 'rq' argument is not used by generic trace events,
84 * but used by the latency tracer plugin. )
85 */ 79 */
86DECLARE_EVENT_CLASS(sched_wakeup_template, 80DECLARE_EVENT_CLASS(sched_wakeup_template,
87 81
88 TP_PROTO(struct rq *rq, struct task_struct *p, int success), 82 TP_PROTO(struct task_struct *p, int success),
89 83
90 TP_ARGS(rq, p, success), 84 TP_ARGS(p, success),
91 85
92 TP_STRUCT__entry( 86 TP_STRUCT__entry(
93 __array( char, comm, TASK_COMM_LEN ) 87 __array( char, comm, TASK_COMM_LEN )
@@ -111,31 +105,25 @@ DECLARE_EVENT_CLASS(sched_wakeup_template,
111); 105);
112 106
113DEFINE_EVENT(sched_wakeup_template, sched_wakeup, 107DEFINE_EVENT(sched_wakeup_template, sched_wakeup,
114 TP_PROTO(struct rq *rq, struct task_struct *p, int success), 108 TP_PROTO(struct task_struct *p, int success),
115 TP_ARGS(rq, p, success)); 109 TP_ARGS(p, success));
116 110
117/* 111/*
118 * Tracepoint for waking up a new task: 112 * Tracepoint for waking up a new task:
119 *
120 * (NOTE: the 'rq' argument is not used by generic trace events,
121 * but used by the latency tracer plugin. )
122 */ 113 */
123DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, 114DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
124 TP_PROTO(struct rq *rq, struct task_struct *p, int success), 115 TP_PROTO(struct task_struct *p, int success),
125 TP_ARGS(rq, p, success)); 116 TP_ARGS(p, success));
126 117
127/* 118/*
128 * Tracepoint for task switches, performed by the scheduler: 119 * Tracepoint for task switches, performed by the scheduler:
129 *
130 * (NOTE: the 'rq' argument is not used by generic trace events,
131 * but used by the latency tracer plugin. )
132 */ 120 */
133TRACE_EVENT(sched_switch, 121TRACE_EVENT(sched_switch,
134 122
135 TP_PROTO(struct rq *rq, struct task_struct *prev, 123 TP_PROTO(struct task_struct *prev,
136 struct task_struct *next), 124 struct task_struct *next),
137 125
138 TP_ARGS(rq, prev, next), 126 TP_ARGS(prev, next),
139 127
140 TP_STRUCT__entry( 128 TP_STRUCT__entry(
141 __array( char, prev_comm, TASK_COMM_LEN ) 129 __array( char, prev_comm, TASK_COMM_LEN )
diff --git a/include/trace/events/signal.h b/include/trace/events/signal.h
index a510b75ac304..814566c99d29 100644
--- a/include/trace/events/signal.h
+++ b/include/trace/events/signal.h
@@ -100,18 +100,7 @@ TRACE_EVENT(signal_deliver,
100 __entry->sa_handler, __entry->sa_flags) 100 __entry->sa_handler, __entry->sa_flags)
101); 101);
102 102
103/** 103DECLARE_EVENT_CLASS(signal_queue_overflow,
104 * signal_overflow_fail - called when signal queue is overflow
105 * @sig: signal number
106 * @group: signal to process group or not (bool)
107 * @info: pointer to struct siginfo
108 *
109 * Kernel fails to generate 'sig' signal with 'info' siginfo, because
110 * siginfo queue is overflow, and the signal is dropped.
111 * 'group' is not 0 if the signal will be sent to a process group.
112 * 'sig' is always one of RT signals.
113 */
114TRACE_EVENT(signal_overflow_fail,
115 104
116 TP_PROTO(int sig, int group, struct siginfo *info), 105 TP_PROTO(int sig, int group, struct siginfo *info),
117 106
@@ -135,6 +124,24 @@ TRACE_EVENT(signal_overflow_fail,
135); 124);
136 125
137/** 126/**
127 * signal_overflow_fail - called when the signal queue overflows
128 * @sig: signal number
129 * @group: signal to process group or not (bool)
130 * @info: pointer to struct siginfo
131 *
132 * The kernel fails to generate the 'sig' signal with 'info' siginfo because
133 * the siginfo queue has overflowed, and the signal is dropped.
134 * 'group' is not 0 if the signal will be sent to a process group.
135 * 'sig' is always one of RT signals.
136 */
137DEFINE_EVENT(signal_queue_overflow, signal_overflow_fail,
138
139 TP_PROTO(int sig, int group, struct siginfo *info),
140
141 TP_ARGS(sig, group, info)
142);
143
144/**
138 * signal_lose_info - called when siginfo is lost 145 * signal_lose_info - called when siginfo is lost
139 * @sig: signal number 146 * @sig: signal number
140 * @group: signal to process group or not (bool) 147 * @group: signal to process group or not (bool)
@@ -145,28 +152,13 @@ TRACE_EVENT(signal_overflow_fail,
145 * 'group' is not 0 if the signal will be sent to a process group. 152 * 'group' is not 0 if the signal will be sent to a process group.
146 * 'sig' is always one of non-RT signals. 153 * 'sig' is always one of non-RT signals.
147 */ 154 */
148TRACE_EVENT(signal_lose_info, 155DEFINE_EVENT(signal_queue_overflow, signal_lose_info,
149 156
150 TP_PROTO(int sig, int group, struct siginfo *info), 157 TP_PROTO(int sig, int group, struct siginfo *info),
151 158
152 TP_ARGS(sig, group, info), 159 TP_ARGS(sig, group, info)
153
154 TP_STRUCT__entry(
155 __field( int, sig )
156 __field( int, group )
157 __field( int, errno )
158 __field( int, code )
159 ),
160
161 TP_fast_assign(
162 __entry->sig = sig;
163 __entry->group = group;
164 TP_STORE_SIGINFO(__entry, info);
165 ),
166
167 TP_printk("sig=%d group=%d errno=%d code=%d",
168 __entry->sig, __entry->group, __entry->errno, __entry->code)
169); 160);
161
170#endif /* _TRACE_SIGNAL_H */ 162#endif /* _TRACE_SIGNAL_H */
171 163
172/* This part must be outside protection */ 164/* This part must be outside protection */
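The signal.h conversion above is the pattern used throughout this series: two near-identical TRACE_EVENTs collapse into one DECLARE_EVENT_CLASS plus cheap DEFINE_EVENTs. A generic sketch with invented event names and fields, following the same header layout shown for napi.h above:

#undef TRACE_SYSTEM
#define TRACE_SYSTEM example

#if !defined(_TRACE_EXAMPLE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_EXAMPLE_H

#include <linux/tracepoint.h>

DECLARE_EVENT_CLASS(example_template,

        TP_PROTO(int id, int state),

        TP_ARGS(id, state),

        TP_STRUCT__entry(
                __field(        int,    id      )
                __field(        int,    state   )
        ),

        TP_fast_assign(
                __entry->id     = id;
                __entry->state  = state;
        ),

        TP_printk("id=%d state=%d", __entry->id, __entry->state)
);

/* Each DEFINE_EVENT only adds an ftrace_event_call that points back at the
 * shared class instead of duplicating the assign/output machinery. */
DEFINE_EVENT(example_template, example_start,
        TP_PROTO(int id, int state),
        TP_ARGS(id, state));

DEFINE_EVENT(example_template, example_stop,
        TP_PROTO(int id, int state),
        TP_ARGS(id, state));

#endif /* _TRACE_EXAMPLE_H */

/* This part must be outside protection */
#include <trace/define_trace.h>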
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 882c64832ffe..e0e8daa6767e 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -62,7 +62,10 @@
62 struct trace_entry ent; \ 62 struct trace_entry ent; \
63 tstruct \ 63 tstruct \
64 char __data[0]; \ 64 char __data[0]; \
65 }; 65 }; \
66 \
67 static struct ftrace_event_class event_class_##name;
68
66#undef DEFINE_EVENT 69#undef DEFINE_EVENT
67#define DEFINE_EVENT(template, name, proto, args) \ 70#define DEFINE_EVENT(template, name, proto, args) \
68 static struct ftrace_event_call \ 71 static struct ftrace_event_call \
@@ -147,16 +150,18 @@
147 * 150 *
148 * entry = iter->ent; 151 * entry = iter->ent;
149 * 152 *
150 * if (entry->type != event_<call>.id) { 153 * if (entry->type != event_<call>->event.type) {
151 * WARN_ON_ONCE(1); 154 * WARN_ON_ONCE(1);
152 * return TRACE_TYPE_UNHANDLED; 155 * return TRACE_TYPE_UNHANDLED;
153 * } 156 * }
154 * 157 *
155 * field = (typeof(field))entry; 158 * field = (typeof(field))entry;
156 * 159 *
157 * p = get_cpu_var(ftrace_event_seq); 160 * p = &get_cpu_var(ftrace_event_seq);
158 * trace_seq_init(p); 161 * trace_seq_init(p);
159 * ret = trace_seq_printf(s, <TP_printk> "\n"); 162 * ret = trace_seq_printf(s, "%s: ", <call>);
163 * if (ret)
164 * ret = trace_seq_printf(s, <TP_printk> "\n");
160 * put_cpu(); 165 * put_cpu();
161 * if (!ret) 166 * if (!ret)
162 * return TRACE_TYPE_PARTIAL_LINE; 167 * return TRACE_TYPE_PARTIAL_LINE;
@@ -201,18 +206,22 @@
201#undef DECLARE_EVENT_CLASS 206#undef DECLARE_EVENT_CLASS
202#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ 207#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
203static notrace enum print_line_t \ 208static notrace enum print_line_t \
204ftrace_raw_output_id_##call(int event_id, const char *name, \ 209ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \
205 struct trace_iterator *iter, int flags) \ 210 struct trace_event *trace_event) \
206{ \ 211{ \
212 struct ftrace_event_call *event; \
207 struct trace_seq *s = &iter->seq; \ 213 struct trace_seq *s = &iter->seq; \
208 struct ftrace_raw_##call *field; \ 214 struct ftrace_raw_##call *field; \
209 struct trace_entry *entry; \ 215 struct trace_entry *entry; \
210 struct trace_seq *p; \ 216 struct trace_seq *p; \
211 int ret; \ 217 int ret; \
212 \ 218 \
219 event = container_of(trace_event, struct ftrace_event_call, \
220 event); \
221 \
213 entry = iter->ent; \ 222 entry = iter->ent; \
214 \ 223 \
215 if (entry->type != event_id) { \ 224 if (entry->type != event->event.type) { \
216 WARN_ON_ONCE(1); \ 225 WARN_ON_ONCE(1); \
217 return TRACE_TYPE_UNHANDLED; \ 226 return TRACE_TYPE_UNHANDLED; \
218 } \ 227 } \
@@ -221,7 +230,7 @@ ftrace_raw_output_id_##call(int event_id, const char *name, \
221 \ 230 \
222 p = &get_cpu_var(ftrace_event_seq); \ 231 p = &get_cpu_var(ftrace_event_seq); \
223 trace_seq_init(p); \ 232 trace_seq_init(p); \
224 ret = trace_seq_printf(s, "%s: ", name); \ 233 ret = trace_seq_printf(s, "%s: ", event->name); \
225 if (ret) \ 234 if (ret) \
226 ret = trace_seq_printf(s, print); \ 235 ret = trace_seq_printf(s, print); \
227 put_cpu(); \ 236 put_cpu(); \
@@ -229,21 +238,16 @@ ftrace_raw_output_id_##call(int event_id, const char *name, \
229 return TRACE_TYPE_PARTIAL_LINE; \ 238 return TRACE_TYPE_PARTIAL_LINE; \
230 \ 239 \
231 return TRACE_TYPE_HANDLED; \ 240 return TRACE_TYPE_HANDLED; \
232} 241} \
233 242static struct trace_event_functions ftrace_event_type_funcs_##call = { \
234#undef DEFINE_EVENT 243 .trace = ftrace_raw_output_##call, \
235#define DEFINE_EVENT(template, name, proto, args) \ 244};
236static notrace enum print_line_t \
237ftrace_raw_output_##name(struct trace_iterator *iter, int flags) \
238{ \
239 return ftrace_raw_output_id_##template(event_##name.id, \
240 #name, iter, flags); \
241}
242 245
243#undef DEFINE_EVENT_PRINT 246#undef DEFINE_EVENT_PRINT
244#define DEFINE_EVENT_PRINT(template, call, proto, args, print) \ 247#define DEFINE_EVENT_PRINT(template, call, proto, args, print) \
245static notrace enum print_line_t \ 248static notrace enum print_line_t \
246ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ 249ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \
250 struct trace_event *event) \
247{ \ 251{ \
248 struct trace_seq *s = &iter->seq; \ 252 struct trace_seq *s = &iter->seq; \
249 struct ftrace_raw_##template *field; \ 253 struct ftrace_raw_##template *field; \
@@ -253,7 +257,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
253 \ 257 \
254 entry = iter->ent; \ 258 entry = iter->ent; \
255 \ 259 \
256 if (entry->type != event_##call.id) { \ 260 if (entry->type != event_##call.event.type) { \
257 WARN_ON_ONCE(1); \ 261 WARN_ON_ONCE(1); \
258 return TRACE_TYPE_UNHANDLED; \ 262 return TRACE_TYPE_UNHANDLED; \
259 } \ 263 } \
@@ -270,7 +274,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
270 return TRACE_TYPE_PARTIAL_LINE; \ 274 return TRACE_TYPE_PARTIAL_LINE; \
271 \ 275 \
272 return TRACE_TYPE_HANDLED; \ 276 return TRACE_TYPE_HANDLED; \
273} 277} \
278static struct trace_event_functions ftrace_event_type_funcs_##call = { \
279 .trace = ftrace_raw_output_##call, \
280};
274 281
275#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 282#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
276 283
@@ -376,142 +383,83 @@ static inline notrace int ftrace_get_offsets_##call( \
376 383
377#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 384#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
378 385
379#ifdef CONFIG_PERF_EVENTS
380
381/*
382 * Generate the functions needed for tracepoint perf_event support.
383 *
384 * NOTE: The insertion profile callback (ftrace_profile_<call>) is defined later
385 *
386 * static int ftrace_profile_enable_<call>(void)
387 * {
388 * return register_trace_<call>(ftrace_profile_<call>);
389 * }
390 *
391 * static void ftrace_profile_disable_<call>(void)
392 * {
393 * unregister_trace_<call>(ftrace_profile_<call>);
394 * }
395 *
396 */
397
398#undef DECLARE_EVENT_CLASS
399#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)
400
401#undef DEFINE_EVENT
402#define DEFINE_EVENT(template, name, proto, args) \
403 \
404static void perf_trace_##name(proto); \
405 \
406static notrace int \
407perf_trace_enable_##name(struct ftrace_event_call *unused) \
408{ \
409 return register_trace_##name(perf_trace_##name); \
410} \
411 \
412static notrace void \
413perf_trace_disable_##name(struct ftrace_event_call *unused) \
414{ \
415 unregister_trace_##name(perf_trace_##name); \
416}
417
418#undef DEFINE_EVENT_PRINT
419#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
420 DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
421
422#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
423
424#endif /* CONFIG_PERF_EVENTS */
425
426/* 386/*
427 * Stage 4 of the trace events. 387 * Stage 4 of the trace events.
428 * 388 *
429 * Override the macros in <trace/trace_events.h> to include the following: 389 * Override the macros in <trace/trace_events.h> to include the following:
430 * 390 *
431 * static void ftrace_event_<call>(proto)
432 * {
433 * event_trace_printk(_RET_IP_, "<call>: " <fmt>);
434 * }
435 *
436 * static int ftrace_reg_event_<call>(struct ftrace_event_call *unused)
437 * {
438 * return register_trace_<call>(ftrace_event_<call>);
439 * }
440 *
441 * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
442 * {
443 * unregister_trace_<call>(ftrace_event_<call>);
444 * }
445 *
446 *
447 * For those macros defined with TRACE_EVENT: 391 * For those macros defined with TRACE_EVENT:
448 * 392 *
449 * static struct ftrace_event_call event_<call>; 393 * static struct ftrace_event_call event_<call>;
450 * 394 *
451 * static void ftrace_raw_event_<call>(proto) 395 * static void ftrace_raw_event_<call>(void *__data, proto)
452 * { 396 * {
397 * struct ftrace_event_call *event_call = __data;
398 * struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
453 * struct ring_buffer_event *event; 399 * struct ring_buffer_event *event;
454 * struct ftrace_raw_<call> *entry; <-- defined in stage 1 400 * struct ftrace_raw_<call> *entry; <-- defined in stage 1
455 * struct ring_buffer *buffer; 401 * struct ring_buffer *buffer;
456 * unsigned long irq_flags; 402 * unsigned long irq_flags;
403 * int __data_size;
457 * int pc; 404 * int pc;
458 * 405 *
459 * local_save_flags(irq_flags); 406 * local_save_flags(irq_flags);
460 * pc = preempt_count(); 407 * pc = preempt_count();
461 * 408 *
409 * __data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
410 *
462 * event = trace_current_buffer_lock_reserve(&buffer, 411 * event = trace_current_buffer_lock_reserve(&buffer,
463 * event_<call>.id, 412 * event_<call>->event.type,
464 * sizeof(struct ftrace_raw_<call>), 413 * sizeof(*entry) + __data_size,
465 * irq_flags, pc); 414 * irq_flags, pc);
466 * if (!event) 415 * if (!event)
467 * return; 416 * return;
468 * entry = ring_buffer_event_data(event); 417 * entry = ring_buffer_event_data(event);
469 * 418 *
470 * <assign>; <-- Here we assign the entries by the __field and 419 * { <assign>; } <-- Here we assign the entries by the __field and
471 * __array macros. 420 * __array macros.
472 *
473 * trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc);
474 * }
475 *
476 * static int ftrace_raw_reg_event_<call>(struct ftrace_event_call *unused)
477 * {
478 * int ret;
479 *
480 * ret = register_trace_<call>(ftrace_raw_event_<call>);
481 * if (!ret)
482 * pr_info("event trace: Could not activate trace point "
483 * "probe to <call>");
484 * return ret;
485 * }
486 * 421 *
487 * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused) 422 * if (!filter_current_check_discard(buffer, event_call, entry, event))
488 * { 423 * trace_current_buffer_unlock_commit(buffer,
489 * unregister_trace_<call>(ftrace_raw_event_<call>); 424 * event, irq_flags, pc);
490 * } 425 * }
491 * 426 *
492 * static struct trace_event ftrace_event_type_<call> = { 427 * static struct trace_event ftrace_event_type_<call> = {
493 * .trace = ftrace_raw_output_<call>, <-- stage 2 428 * .trace = ftrace_raw_output_<call>, <-- stage 2
494 * }; 429 * };
495 * 430 *
431 * static const char print_fmt_<call>[] = <TP_printk>;
432 *
433 * static struct ftrace_event_class __used event_class_<template> = {
434 * .system = "<system>",
435 * .define_fields = ftrace_define_fields_<call>,
436 * .fields = LIST_HEAD_INIT(event_class_##call.fields),
437 * .raw_init = trace_event_raw_init,
438 * .probe = ftrace_raw_event_##call,
439 * };
440 *
496 * static struct ftrace_event_call __used 441 * static struct ftrace_event_call __used
497 * __attribute__((__aligned__(4))) 442 * __attribute__((__aligned__(4)))
498 * __attribute__((section("_ftrace_events"))) event_<call> = { 443 * __attribute__((section("_ftrace_events"))) event_<call> = {
499 * .name = "<call>", 444 * .name = "<call>",
500 * .system = "<system>", 445 * .class = event_class_<template>,
501 * .raw_init = trace_event_raw_init, 446 * .event = &ftrace_event_type_<call>,
502 * .regfunc = ftrace_reg_event_<call>, 447 * .print_fmt = print_fmt_<call>,
503 * .unregfunc = ftrace_unreg_event_<call>, 448 * };
504 * }
505 * 449 *
506 */ 450 */
507 451
508#ifdef CONFIG_PERF_EVENTS 452#ifdef CONFIG_PERF_EVENTS
509 453
454#define _TRACE_PERF_PROTO(call, proto) \
455 static notrace void \
456 perf_trace_##call(void *__data, proto);
457
510#define _TRACE_PERF_INIT(call) \ 458#define _TRACE_PERF_INIT(call) \
511 .perf_event_enable = perf_trace_enable_##call, \ 459 .perf_probe = perf_trace_##call,
512 .perf_event_disable = perf_trace_disable_##call,
513 460
514#else 461#else
462#define _TRACE_PERF_PROTO(call, proto)
515#define _TRACE_PERF_INIT(call) 463#define _TRACE_PERF_INIT(call)
516#endif /* CONFIG_PERF_EVENTS */ 464#endif /* CONFIG_PERF_EVENTS */
517 465
@@ -545,9 +493,9 @@ perf_trace_disable_##name(struct ftrace_event_call *unused) \
545#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ 493#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
546 \ 494 \
547static notrace void \ 495static notrace void \
548ftrace_raw_event_id_##call(struct ftrace_event_call *event_call, \ 496ftrace_raw_event_##call(void *__data, proto) \
549 proto) \
550{ \ 497{ \
498 struct ftrace_event_call *event_call = __data; \
551 struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ 499 struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
552 struct ring_buffer_event *event; \ 500 struct ring_buffer_event *event; \
553 struct ftrace_raw_##call *entry; \ 501 struct ftrace_raw_##call *entry; \
@@ -562,14 +510,13 @@ ftrace_raw_event_id_##call(struct ftrace_event_call *event_call, \
562 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ 510 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
563 \ 511 \
564 event = trace_current_buffer_lock_reserve(&buffer, \ 512 event = trace_current_buffer_lock_reserve(&buffer, \
565 event_call->id, \ 513 event_call->event.type, \
566 sizeof(*entry) + __data_size, \ 514 sizeof(*entry) + __data_size, \
567 irq_flags, pc); \ 515 irq_flags, pc); \
568 if (!event) \ 516 if (!event) \
569 return; \ 517 return; \
570 entry = ring_buffer_event_data(event); \ 518 entry = ring_buffer_event_data(event); \
571 \ 519 \
572 \
573 tstruct \ 520 tstruct \
574 \ 521 \
575 { assign; } \ 522 { assign; } \
@@ -578,34 +525,21 @@ ftrace_raw_event_id_##call(struct ftrace_event_call *event_call, \
578 trace_nowake_buffer_unlock_commit(buffer, \ 525 trace_nowake_buffer_unlock_commit(buffer, \
579 event, irq_flags, pc); \ 526 event, irq_flags, pc); \
580} 527}
528/*
529 * The ftrace_test_probe is compiled out; it is only here as a build-time check
530 * to make sure that if the tracepoint handling changes, the ftrace probe will
531 * fail to compile unless it too is updated.
532 */
581 533
582#undef DEFINE_EVENT 534#undef DEFINE_EVENT
583#define DEFINE_EVENT(template, call, proto, args) \ 535#define DEFINE_EVENT(template, call, proto, args) \
584 \ 536static inline void ftrace_test_probe_##call(void) \
585static notrace void ftrace_raw_event_##call(proto) \
586{ \
587 ftrace_raw_event_id_##template(&event_##call, args); \
588} \
589 \
590static notrace int \
591ftrace_raw_reg_event_##call(struct ftrace_event_call *unused) \
592{ \
593 return register_trace_##call(ftrace_raw_event_##call); \
594} \
595 \
596static notrace void \
597ftrace_raw_unreg_event_##call(struct ftrace_event_call *unused) \
598{ \ 537{ \
599 unregister_trace_##call(ftrace_raw_event_##call); \ 538 check_trace_callback_type_##call(ftrace_raw_event_##template); \
600} \ 539}
601 \
602static struct trace_event ftrace_event_type_##call = { \
603 .trace = ftrace_raw_output_##call, \
604};
605 540
606#undef DEFINE_EVENT_PRINT 541#undef DEFINE_EVENT_PRINT
607#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ 542#define DEFINE_EVENT_PRINT(template, name, proto, args, print)
608 DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
609 543
610#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 544#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
611 545
@@ -622,7 +556,16 @@ static struct trace_event ftrace_event_type_##call = { \
622 556
623#undef DECLARE_EVENT_CLASS 557#undef DECLARE_EVENT_CLASS
624#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ 558#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
625static const char print_fmt_##call[] = print; 559_TRACE_PERF_PROTO(call, PARAMS(proto)); \
560static const char print_fmt_##call[] = print; \
561static struct ftrace_event_class __used event_class_##call = { \
562 .system = __stringify(TRACE_SYSTEM), \
563 .define_fields = ftrace_define_fields_##call, \
564 .fields = LIST_HEAD_INIT(event_class_##call.fields),\
565 .raw_init = trace_event_raw_init, \
566 .probe = ftrace_raw_event_##call, \
567 _TRACE_PERF_INIT(call) \
568};
626 569
627#undef DEFINE_EVENT 570#undef DEFINE_EVENT
628#define DEFINE_EVENT(template, call, proto, args) \ 571#define DEFINE_EVENT(template, call, proto, args) \
@@ -631,15 +574,10 @@ static struct ftrace_event_call __used \
631__attribute__((__aligned__(4))) \ 574__attribute__((__aligned__(4))) \
632__attribute__((section("_ftrace_events"))) event_##call = { \ 575__attribute__((section("_ftrace_events"))) event_##call = { \
633 .name = #call, \ 576 .name = #call, \
634 .system = __stringify(TRACE_SYSTEM), \ 577 .class = &event_class_##template, \
635 .event = &ftrace_event_type_##call, \ 578 .event.funcs = &ftrace_event_type_funcs_##template, \
636 .raw_init = trace_event_raw_init, \
637 .regfunc = ftrace_raw_reg_event_##call, \
638 .unregfunc = ftrace_raw_unreg_event_##call, \
639 .print_fmt = print_fmt_##template, \ 579 .print_fmt = print_fmt_##template, \
640 .define_fields = ftrace_define_fields_##template, \ 580};
641 _TRACE_PERF_INIT(call) \
642}
643 581
644#undef DEFINE_EVENT_PRINT 582#undef DEFINE_EVENT_PRINT
645#define DEFINE_EVENT_PRINT(template, call, proto, args, print) \ 583#define DEFINE_EVENT_PRINT(template, call, proto, args, print) \
@@ -650,14 +588,9 @@ static struct ftrace_event_call __used \
650__attribute__((__aligned__(4))) \ 588__attribute__((__aligned__(4))) \
651__attribute__((section("_ftrace_events"))) event_##call = { \ 589__attribute__((section("_ftrace_events"))) event_##call = { \
652 .name = #call, \ 590 .name = #call, \
653 .system = __stringify(TRACE_SYSTEM), \ 591 .class = &event_class_##template, \
654 .event = &ftrace_event_type_##call, \ 592 .event.funcs = &ftrace_event_type_funcs_##call, \
655 .raw_init = trace_event_raw_init, \
656 .regfunc = ftrace_raw_reg_event_##call, \
657 .unregfunc = ftrace_raw_unreg_event_##call, \
658 .print_fmt = print_fmt_##call, \ 593 .print_fmt = print_fmt_##call, \
659 .define_fields = ftrace_define_fields_##template, \
660 _TRACE_PERF_INIT(call) \
661} 594}
662 595
663#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 596#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
@@ -757,17 +690,20 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
757#undef DECLARE_EVENT_CLASS 690#undef DECLARE_EVENT_CLASS
758#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ 691#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
759static notrace void \ 692static notrace void \
760perf_trace_templ_##call(struct ftrace_event_call *event_call, \ 693perf_trace_##call(void *__data, proto) \
761 struct pt_regs *__regs, proto) \
762{ \ 694{ \
695 struct ftrace_event_call *event_call = __data; \
763 struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ 696 struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
764 struct ftrace_raw_##call *entry; \ 697 struct ftrace_raw_##call *entry; \
698 struct pt_regs *__regs = &get_cpu_var(perf_trace_regs); \
765 u64 __addr = 0, __count = 1; \ 699 u64 __addr = 0, __count = 1; \
766 unsigned long irq_flags; \ 700 unsigned long irq_flags; \
767 int __entry_size; \ 701 int __entry_size; \
768 int __data_size; \ 702 int __data_size; \
769 int rctx; \ 703 int rctx; \
770 \ 704 \
705 perf_fetch_caller_regs(__regs, 1); \
706 \
771 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ 707 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
772 __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ 708 __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
773 sizeof(u64)); \ 709 sizeof(u64)); \
@@ -775,33 +711,35 @@ perf_trace_templ_##call(struct ftrace_event_call *event_call, \
775 \ 711 \
776 if (WARN_ONCE(__entry_size > PERF_MAX_TRACE_SIZE, \ 712 if (WARN_ONCE(__entry_size > PERF_MAX_TRACE_SIZE, \
777 "profile buffer not large enough")) \ 713 "profile buffer not large enough")) \
778 return; \ 714 goto out; \
779 entry = (struct ftrace_raw_##call *)perf_trace_buf_prepare( \ 715 entry = (struct ftrace_raw_##call *)perf_trace_buf_prepare( \
780 __entry_size, event_call->id, &rctx, &irq_flags); \ 716 __entry_size, event_call->event.type, &rctx, &irq_flags); \
781 if (!entry) \ 717 if (!entry) \
782 return; \ 718 goto out; \
783 tstruct \ 719 tstruct \
784 \ 720 \
785 { assign; } \ 721 { assign; } \
786 \ 722 \
787 perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \ 723 perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \
788 __count, irq_flags, __regs); \ 724 __count, irq_flags, __regs); \
725 out: \
726 put_cpu_var(perf_trace_regs); \
789} 727}
790 728
729/*
730 * This part is compiled out; it is only here as a build-time check
731 * to make sure that if the tracepoint handling changes, the
732 * perf probe will fail to compile unless it too is updated.
733 */
791#undef DEFINE_EVENT 734#undef DEFINE_EVENT
792#define DEFINE_EVENT(template, call, proto, args) \ 735#define DEFINE_EVENT(template, call, proto, args) \
793static notrace void perf_trace_##call(proto) \ 736static inline void perf_test_probe_##call(void) \
794{ \ 737{ \
795 struct ftrace_event_call *event_call = &event_##call; \ 738 check_trace_callback_type_##call(perf_trace_##template); \
796 struct pt_regs *__regs = &get_cpu_var(perf_trace_regs); \
797 \
798 perf_fetch_caller_regs(__regs, 1); \
799 \ 739 \
800 perf_trace_templ_##template(event_call, __regs, args); \
801 \
802 put_cpu_var(perf_trace_regs); \
803} 740}
804 741
742
805#undef DEFINE_EVENT_PRINT 743#undef DEFINE_EVENT_PRINT
806#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ 744#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
807 DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) 745 DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index e5e5f48dbfb3..257e08960d7b 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -25,6 +25,8 @@ struct syscall_metadata {
25 int nb_args; 25 int nb_args;
26 const char **types; 26 const char **types;
27 const char **args; 27 const char **args;
28 struct list_head enter_fields;
29 struct list_head exit_fields;
28 30
29 struct ftrace_event_call *enter_event; 31 struct ftrace_event_call *enter_event;
30 struct ftrace_event_call *exit_event; 32 struct ftrace_event_call *exit_event;
@@ -34,16 +36,16 @@ struct syscall_metadata {
34extern unsigned long arch_syscall_addr(int nr); 36extern unsigned long arch_syscall_addr(int nr);
35extern int init_syscall_trace(struct ftrace_event_call *call); 37extern int init_syscall_trace(struct ftrace_event_call *call);
36 38
37extern int syscall_enter_define_fields(struct ftrace_event_call *call);
38extern int syscall_exit_define_fields(struct ftrace_event_call *call);
39extern int reg_event_syscall_enter(struct ftrace_event_call *call); 39extern int reg_event_syscall_enter(struct ftrace_event_call *call);
40extern void unreg_event_syscall_enter(struct ftrace_event_call *call); 40extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
41extern int reg_event_syscall_exit(struct ftrace_event_call *call); 41extern int reg_event_syscall_exit(struct ftrace_event_call *call);
42extern void unreg_event_syscall_exit(struct ftrace_event_call *call); 42extern void unreg_event_syscall_exit(struct ftrace_event_call *call);
43extern int 43extern int
44ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s); 44ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s);
45enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags); 45enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
46enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags); 46 struct trace_event *event);
47enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
48 struct trace_event *event);
47#endif 49#endif
48 50
49#ifdef CONFIG_PERF_EVENTS 51#ifdef CONFIG_PERF_EVENTS
diff --git a/init/Kconfig b/init/Kconfig
index eb77e8ccde1c..5fe94b82e4c0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -604,8 +604,7 @@ config RT_GROUP_SCHED
604 default n 604 default n
605 help 605 help
606 This feature lets you explicitly allocate real CPU bandwidth 606 This feature lets you explicitly allocate real CPU bandwidth
607 to users or control groups (depending on the "Basis for grouping tasks" 607 to task groups. If enabled, it will also make it impossible to
608 setting below. If enabled, it will also make it impossible to
609 schedule realtime tasks for non-root users until you allocate 608 schedule realtime tasks for non-root users until you allocate
610 realtime bandwidth for them. 609 realtime bandwidth for them.
611 See Documentation/scheduler/sched-rt-group.txt for more information. 610 See Documentation/scheduler/sched-rt-group.txt for more information.
diff --git a/kernel/Makefile b/kernel/Makefile
index a987aa1676b5..149e18ef1ab1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -68,7 +68,7 @@ obj-$(CONFIG_USER_NS) += user_namespace.o
68obj-$(CONFIG_PID_NS) += pid_namespace.o 68obj-$(CONFIG_PID_NS) += pid_namespace.o
69obj-$(CONFIG_IKCONFIG) += configs.o 69obj-$(CONFIG_IKCONFIG) += configs.o
70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
71obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 71obj-$(CONFIG_SMP) += stop_machine.o
72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o 73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e4697e9b276..2f05303715a5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -15,7 +15,6 @@
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include "cred-internals.h"
19 18
20/* 19/*
21 * Leveraged for setting/resetting capabilities 20 * Leveraged for setting/resetting capabilities
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e2769e13980c..4a07d057a265 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3010,7 +3010,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3010 unsigned long flags = (unsigned long)key; 3010 unsigned long flags = (unsigned long)key;
3011 3011
3012 if (flags & POLLHUP) { 3012 if (flags & POLLHUP) {
3013 remove_wait_queue_locked(event->wqh, &event->wait); 3013 __remove_wait_queue(event->wqh, &event->wait);
3014 spin_lock(&cgrp->event_list_lock); 3014 spin_lock(&cgrp->event_list_lock);
3015 list_del(&event->list); 3015 list_del(&event->list);
3016 spin_unlock(&cgrp->event_list_lock); 3016 spin_unlock(&cgrp->event_list_lock);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 25bba73b1be3..545777574779 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -164,6 +164,7 @@ static inline void check_for_tasks(int cpu)
164} 164}
165 165
166struct take_cpu_down_param { 166struct take_cpu_down_param {
167 struct task_struct *caller;
167 unsigned long mod; 168 unsigned long mod;
168 void *hcpu; 169 void *hcpu;
169}; 170};
@@ -172,6 +173,7 @@ struct take_cpu_down_param {
172static int __ref take_cpu_down(void *_param) 173static int __ref take_cpu_down(void *_param)
173{ 174{
174 struct take_cpu_down_param *param = _param; 175 struct take_cpu_down_param *param = _param;
176 unsigned int cpu = (unsigned long)param->hcpu;
175 int err; 177 int err;
176 178
177 /* Ensure this CPU doesn't handle any more interrupts. */ 179 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -182,6 +184,8 @@ static int __ref take_cpu_down(void *_param)
182 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, 184 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
183 param->hcpu); 185 param->hcpu);
184 186
187 if (task_cpu(param->caller) == cpu)
188 move_task_off_dead_cpu(cpu, param->caller);
185 /* Force idle task to run as soon as we yield: it should 189 /* Force idle task to run as soon as we yield: it should
186 immediately notice cpu is offline and die quickly. */ 190 immediately notice cpu is offline and die quickly. */
187 sched_idle_next(); 191 sched_idle_next();
@@ -192,10 +196,10 @@ static int __ref take_cpu_down(void *_param)
192static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 196static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
193{ 197{
194 int err, nr_calls = 0; 198 int err, nr_calls = 0;
195 cpumask_var_t old_allowed;
196 void *hcpu = (void *)(long)cpu; 199 void *hcpu = (void *)(long)cpu;
197 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 200 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
198 struct take_cpu_down_param tcd_param = { 201 struct take_cpu_down_param tcd_param = {
202 .caller = current,
199 .mod = mod, 203 .mod = mod,
200 .hcpu = hcpu, 204 .hcpu = hcpu,
201 }; 205 };
@@ -206,9 +210,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
206 if (!cpu_online(cpu)) 210 if (!cpu_online(cpu))
207 return -EINVAL; 211 return -EINVAL;
208 212
209 if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
210 return -ENOMEM;
211
212 cpu_hotplug_begin(); 213 cpu_hotplug_begin();
213 set_cpu_active(cpu, false); 214 set_cpu_active(cpu, false);
214 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 215 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
@@ -225,10 +226,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
225 goto out_release; 226 goto out_release;
226 } 227 }
227 228
228 /* Ensure that we are not runnable on dying cpu */
229 cpumask_copy(old_allowed, &current->cpus_allowed);
230 set_cpus_allowed_ptr(current, cpu_active_mask);
231
232 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 229 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
233 if (err) { 230 if (err) {
234 set_cpu_active(cpu, true); 231 set_cpu_active(cpu, true);
@@ -237,7 +234,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
237 hcpu) == NOTIFY_BAD) 234 hcpu) == NOTIFY_BAD)
238 BUG(); 235 BUG();
239 236
240 goto out_allowed; 237 goto out_release;
241 } 238 }
242 BUG_ON(cpu_online(cpu)); 239 BUG_ON(cpu_online(cpu));
243 240
@@ -255,8 +252,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
255 252
256 check_for_tasks(cpu); 253 check_for_tasks(cpu);
257 254
258out_allowed:
259 set_cpus_allowed_ptr(current, old_allowed);
260out_release: 255out_release:
261 cpu_hotplug_done(); 256 cpu_hotplug_done();
262 if (!err) { 257 if (!err) {
@@ -264,7 +259,6 @@ out_release:
264 hcpu) == NOTIFY_BAD) 259 hcpu) == NOTIFY_BAD)
265 BUG(); 260 BUG();
266 } 261 }
267 free_cpumask_var(old_allowed);
268 return err; 262 return err;
269} 263}
270 264
@@ -272,9 +266,6 @@ int __ref cpu_down(unsigned int cpu)
272{ 266{
273 int err; 267 int err;
274 268
275 err = stop_machine_create();
276 if (err)
277 return err;
278 cpu_maps_update_begin(); 269 cpu_maps_update_begin();
279 270
280 if (cpu_hotplug_disabled) { 271 if (cpu_hotplug_disabled) {
@@ -286,7 +277,6 @@ int __ref cpu_down(unsigned int cpu)
286 277
287out: 278out:
288 cpu_maps_update_done(); 279 cpu_maps_update_done();
289 stop_machine_destroy();
290 return err; 280 return err;
291} 281}
292EXPORT_SYMBOL(cpu_down); 282EXPORT_SYMBOL(cpu_down);
@@ -367,9 +357,6 @@ int disable_nonboot_cpus(void)
367{ 357{
368 int cpu, first_cpu, error; 358 int cpu, first_cpu, error;
369 359
370 error = stop_machine_create();
371 if (error)
372 return error;
373 cpu_maps_update_begin(); 360 cpu_maps_update_begin();
374 first_cpu = cpumask_first(cpu_online_mask); 361 first_cpu = cpumask_first(cpu_online_mask);
375 /* 362 /*
@@ -400,7 +387,6 @@ int disable_nonboot_cpus(void)
400 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 387 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
401 } 388 }
402 cpu_maps_update_done(); 389 cpu_maps_update_done();
403 stop_machine_destroy();
404 return error; 390 return error;
405} 391}
406 392
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d10946748ec2..9a50c5f6e727 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2182,19 +2182,52 @@ void __init cpuset_init_smp(void)
2182void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2182void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2183{ 2183{
2184 mutex_lock(&callback_mutex); 2184 mutex_lock(&callback_mutex);
2185 cpuset_cpus_allowed_locked(tsk, pmask); 2185 task_lock(tsk);
2186 guarantee_online_cpus(task_cs(tsk), pmask);
2187 task_unlock(tsk);
2186 mutex_unlock(&callback_mutex); 2188 mutex_unlock(&callback_mutex);
2187} 2189}
2188 2190
2189/** 2191int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2190 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
2191 * Must be called with callback_mutex held.
2192 **/
2193void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
2194{ 2192{
2195 task_lock(tsk); 2193 const struct cpuset *cs;
2196 guarantee_online_cpus(task_cs(tsk), pmask); 2194 int cpu;
2197 task_unlock(tsk); 2195
2196 rcu_read_lock();
2197 cs = task_cs(tsk);
2198 if (cs)
2199 cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
2200 rcu_read_unlock();
2201
2202 /*
2203 * We own tsk->cpus_allowed, nobody can change it under us.
2204 *
2205 * But we used cs && cs->cpus_allowed lockless and thus can
2206 * race with cgroup_attach_task() or update_cpumask() and get
2207 * the wrong tsk->cpus_allowed. However, both cases imply the
2208 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
2209 * which takes task_rq_lock().
2210 *
2211 * If we are called after it dropped the lock we must see all
2212 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
2213 * set any mask even if it is not right from task_cs() pov,
2214 * the pending set_cpus_allowed_ptr() will fix things.
2215 */
2216
2217 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
2218 if (cpu >= nr_cpu_ids) {
2219 /*
2220 * Either tsk->cpus_allowed is wrong (see above) or it
2221 * is actually empty. The latter case is only possible
2222 * if we are racing with remove_tasks_in_empty_cpuset().
2223 * Like above we can temporary set any mask and rely on
2224 * set_cpus_allowed_ptr() as synchronization point.
2225 */
2226 cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
2227 cpu = cpumask_any(cpu_active_mask);
2228 }
2229
2230 return cpu;
2198} 2231}
2199 2232
2200void cpuset_init_current_mems_allowed(void) 2233void cpuset_init_current_mems_allowed(void)
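[Editor's note] The new cpuset_cpus_allowed_fallback() above boils down to: take the task's (possibly stale) allowed mask, pick any CPU from it that is still active, and if that intersection is empty widen the mask to cpu_possible_mask and pick any active CPU, relying on the pending set_cpus_allowed_ptr() to fix things up later. A minimal userspace sketch of that selection, using 64-bit words in place of struct cpumask; mask_any_and() and fallback_cpu() are illustrative helpers, not kernel APIs, and __builtin_ctzll() is a GCC/Clang builtin:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS	64	/* toy limit so a "cpumask" fits in one word */

/* Stand-in for cpumask_any_and(): index of any set bit in (a & b), or NR_CPUS. */
static int mask_any_and(uint64_t a, uint64_t b)
{
	uint64_t hit = a & b;

	return hit ? __builtin_ctzll(hit) : NR_CPUS;
}

/* Mirrors the fallback logic above: prefer the task's own mask, widen if empty. */
static int fallback_cpu(uint64_t *allowed, uint64_t active, uint64_t possible)
{
	int cpu = mask_any_and(*allowed, active);

	if (cpu >= NR_CPUS) {
		*allowed = possible;	/* like cpumask_copy(..., cpu_possible_mask) */
		cpu = mask_any_and(*allowed, active);
	}
	return cpu;
}

int main(void)
{
	uint64_t allowed = 1ULL << 3;	/* task believed it could run on CPU 3 */
	uint64_t active  = 0x07;	/* but only CPUs 0-2 are still active  */

	printf("fallback cpu = %d\n", fallback_cpu(&allowed, active, 0xff));
	return 0;
}

As the comment in the patch notes, a temporarily wrong mask is tolerated because the subsequent set_cpus_allowed_ptr() acts as the synchronization point.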
@@ -2383,22 +2416,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2383} 2416}
2384 2417
2385/** 2418/**
2386 * cpuset_lock - lock out any changes to cpuset structures
2387 *
2388 * The out of memory (oom) code needs to mutex_lock cpusets
2389 * from being changed while it scans the tasklist looking for a
2390 * task in an overlapping cpuset. Expose callback_mutex via this
2391 * cpuset_lock() routine, so the oom code can lock it, before
2392 * locking the task list. The tasklist_lock is a spinlock, so
2393 * must be taken inside callback_mutex.
2394 */
2395
2396void cpuset_lock(void)
2397{
2398 mutex_lock(&callback_mutex);
2399}
2400
2401/**
2402 * cpuset_unlock - release lock on cpuset changes 2419 * cpuset_unlock - release lock on cpuset changes
2403 * 2420 *
2404 * Undo the lock taken in a previous cpuset_lock() call. 2421 * Undo the lock taken in a previous cpuset_lock() call.
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
deleted file mode 100644
index 2dc4fc2d0bf1..000000000000
--- a/kernel/cred-internals.h
+++ /dev/null
@@ -1,21 +0,0 @@
1/* Internal credentials stuff
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12/*
13 * user.c
14 */
15static inline void sched_switch_user(struct task_struct *p)
16{
17#ifdef CONFIG_USER_SCHED
18 sched_move_task(p);
19#endif /* CONFIG_USER_SCHED */
20}
21
diff --git a/kernel/cred.c b/kernel/cred.c
index 62af1816c235..8f3672a58a1e 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -17,7 +17,6 @@
17#include <linux/init_task.h> 17#include <linux/init_task.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/cn_proc.h> 19#include <linux/cn_proc.h>
20#include "cred-internals.h"
21 20
22#if 0 21#if 0
23#define kdebug(FMT, ...) \ 22#define kdebug(FMT, ...) \
@@ -560,8 +559,6 @@ int commit_creds(struct cred *new)
560 atomic_dec(&old->user->processes); 559 atomic_dec(&old->user->processes);
561 alter_cred_subscribers(old, -2); 560 alter_cred_subscribers(old, -2);
562 561
563 sched_switch_user(task);
564
565 /* send notifications */ 562 /* send notifications */
566 if (new->uid != old->uid || 563 if (new->uid != old->uid ||
567 new->euid != old->euid || 564 new->euid != old->euid ||
diff --git a/kernel/exit.c b/kernel/exit.c
index 7f2683a10ac4..eabca5a73a85 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,7 +55,6 @@
55#include <asm/unistd.h> 55#include <asm/unistd.h>
56#include <asm/pgtable.h> 56#include <asm/pgtable.h>
57#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
58#include "cred-internals.h"
59 58
60static void exit_mm(struct task_struct * tsk); 59static void exit_mm(struct task_struct * tsk);
61 60
diff --git a/kernel/module.c b/kernel/module.c
index 1016b75b026a..e2564580f3f1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -59,8 +59,6 @@
59#define CREATE_TRACE_POINTS 59#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 60#include <trace/events/module.h>
61 61
62EXPORT_TRACEPOINT_SYMBOL(module_get);
63
64#if 0 62#if 0
65#define DEBUGP printk 63#define DEBUGP printk
66#else 64#else
@@ -515,6 +513,9 @@ MODINFO_ATTR(srcversion);
515static char last_unloaded_module[MODULE_NAME_LEN+1]; 513static char last_unloaded_module[MODULE_NAME_LEN+1];
516 514
517#ifdef CONFIG_MODULE_UNLOAD 515#ifdef CONFIG_MODULE_UNLOAD
516
517EXPORT_TRACEPOINT_SYMBOL(module_get);
518
518/* Init the unload section of the module. */ 519/* Init the unload section of the module. */
519static void module_unload_init(struct module *mod) 520static void module_unload_init(struct module *mod)
520{ 521{
@@ -723,16 +724,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
723 return -EFAULT; 724 return -EFAULT;
724 name[MODULE_NAME_LEN-1] = '\0'; 725 name[MODULE_NAME_LEN-1] = '\0';
725 726
726 /* Create stop_machine threads since free_module relies on 727 if (mutex_lock_interruptible(&module_mutex) != 0)
727 * a non-failing stop_machine call. */ 728 return -EINTR;
728 ret = stop_machine_create();
729 if (ret)
730 return ret;
731
732 if (mutex_lock_interruptible(&module_mutex) != 0) {
733 ret = -EINTR;
734 goto out_stop;
735 }
736 729
737 mod = find_module(name); 730 mod = find_module(name);
738 if (!mod) { 731 if (!mod) {
@@ -792,8 +785,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
792 785
793 out: 786 out:
794 mutex_unlock(&module_mutex); 787 mutex_unlock(&module_mutex);
795out_stop:
796 stop_machine_destroy();
797 return ret; 788 return ret;
798} 789}
799 790
@@ -867,8 +858,7 @@ void module_put(struct module *module)
867 smp_wmb(); /* see comment in module_refcount */ 858 smp_wmb(); /* see comment in module_refcount */
868 __this_cpu_inc(module->refptr->decs); 859 __this_cpu_inc(module->refptr->decs);
869 860
870 trace_module_put(module, _RET_IP_, 861 trace_module_put(module, _RET_IP_);
871 __this_cpu_read(module->refptr->decs));
872 /* Maybe they're waiting for us to drop reference? */ 862 /* Maybe they're waiting for us to drop reference? */
873 if (unlikely(!module_is_live(module))) 863 if (unlikely(!module_is_live(module)))
874 wake_up_process(module->waiter); 864 wake_up_process(module->waiter);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 58df55bf83ed..2b676f3a0f26 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -669,7 +669,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
669 .sync = synchronize_sched_expedited, 669 .sync = synchronize_sched_expedited,
670 .cb_barrier = NULL, 670 .cb_barrier = NULL,
671 .fqs = rcu_sched_force_quiescent_state, 671 .fqs = rcu_sched_force_quiescent_state,
672 .stats = rcu_expedited_torture_stats, 672 .stats = NULL,
673 .irq_capable = 1, 673 .irq_capable = 1,
674 .name = "sched_expedited" 674 .name = "sched_expedited"
675}; 675};
diff --git a/kernel/sched.c b/kernel/sched.c
index b11b80a3eed3..78554dd0d1a4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,9 +55,9 @@
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/cpuset.h> 56#include <linux/cpuset.h>
57#include <linux/percpu.h> 57#include <linux/percpu.h>
58#include <linux/kthread.h>
59#include <linux/proc_fs.h> 58#include <linux/proc_fs.h>
60#include <linux/seq_file.h> 59#include <linux/seq_file.h>
60#include <linux/stop_machine.h>
61#include <linux/sysctl.h> 61#include <linux/sysctl.h>
62#include <linux/syscalls.h> 62#include <linux/syscalls.h>
63#include <linux/times.h> 63#include <linux/times.h>
@@ -503,8 +503,11 @@ struct rq {
503 #define CPU_LOAD_IDX_MAX 5 503 #define CPU_LOAD_IDX_MAX 5
504 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 504 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
505#ifdef CONFIG_NO_HZ 505#ifdef CONFIG_NO_HZ
506 u64 nohz_stamp;
506 unsigned char in_nohz_recently; 507 unsigned char in_nohz_recently;
507#endif 508#endif
509 unsigned int skip_clock_update;
510
508 /* capture load from *all* tasks on this cpu: */ 511 /* capture load from *all* tasks on this cpu: */
509 struct load_weight load; 512 struct load_weight load;
510 unsigned long nr_load_updates; 513 unsigned long nr_load_updates;
@@ -546,15 +549,13 @@ struct rq {
546 int post_schedule; 549 int post_schedule;
547 int active_balance; 550 int active_balance;
548 int push_cpu; 551 int push_cpu;
552 struct cpu_stop_work active_balance_work;
549 /* cpu of this runqueue: */ 553 /* cpu of this runqueue: */
550 int cpu; 554 int cpu;
551 int online; 555 int online;
552 556
553 unsigned long avg_load_per_task; 557 unsigned long avg_load_per_task;
554 558
555 struct task_struct *migration_thread;
556 struct list_head migration_queue;
557
558 u64 rt_avg; 559 u64 rt_avg;
559 u64 age_stamp; 560 u64 age_stamp;
560 u64 idle_stamp; 561 u64 idle_stamp;
@@ -602,6 +603,13 @@ static inline
602void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 603void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
603{ 604{
604 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 605 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
606
607 /*
608 * A queue event has occurred, and we're going to schedule. In
609 * this case, we can save a useless back to back clock update.
610 */
611 if (test_tsk_need_resched(p))
612 rq->skip_clock_update = 1;
605} 613}
606 614
607static inline int cpu_of(struct rq *rq) 615static inline int cpu_of(struct rq *rq)
@@ -636,7 +644,8 @@ static inline int cpu_of(struct rq *rq)
636 644
637inline void update_rq_clock(struct rq *rq) 645inline void update_rq_clock(struct rq *rq)
638{ 646{
639 rq->clock = sched_clock_cpu(cpu_of(rq)); 647 if (!rq->skip_clock_update)
648 rq->clock = sched_clock_cpu(cpu_of(rq));
640} 649}
641 650
642/* 651/*
@@ -914,16 +923,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
914#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 923#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
915 924
916/* 925/*
917 * Check whether the task is waking, we use this to synchronize against 926 * Check whether the task is waking, we use this to synchronize ->cpus_allowed
918 * ttwu() so that task_cpu() reports a stable number. 927 * against ttwu().
919 *
920 * We need to make an exception for PF_STARTING tasks because the fork
921 * path might require task_rq_lock() to work, eg. it can call
922 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
923 */ 928 */
924static inline int task_is_waking(struct task_struct *p) 929static inline int task_is_waking(struct task_struct *p)
925{ 930{
926 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); 931 return unlikely(p->state == TASK_WAKING);
927} 932}
928 933
929/* 934/*
@@ -936,11 +941,9 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
936 struct rq *rq; 941 struct rq *rq;
937 942
938 for (;;) { 943 for (;;) {
939 while (task_is_waking(p))
940 cpu_relax();
941 rq = task_rq(p); 944 rq = task_rq(p);
942 raw_spin_lock(&rq->lock); 945 raw_spin_lock(&rq->lock);
943 if (likely(rq == task_rq(p) && !task_is_waking(p))) 946 if (likely(rq == task_rq(p)))
944 return rq; 947 return rq;
945 raw_spin_unlock(&rq->lock); 948 raw_spin_unlock(&rq->lock);
946 } 949 }
@@ -957,12 +960,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
957 struct rq *rq; 960 struct rq *rq;
958 961
959 for (;;) { 962 for (;;) {
960 while (task_is_waking(p))
961 cpu_relax();
962 local_irq_save(*flags); 963 local_irq_save(*flags);
963 rq = task_rq(p); 964 rq = task_rq(p);
964 raw_spin_lock(&rq->lock); 965 raw_spin_lock(&rq->lock);
965 if (likely(rq == task_rq(p) && !task_is_waking(p))) 966 if (likely(rq == task_rq(p)))
966 return rq; 967 return rq;
967 raw_spin_unlock_irqrestore(&rq->lock, *flags); 968 raw_spin_unlock_irqrestore(&rq->lock, *flags);
968 } 969 }
@@ -1239,6 +1240,17 @@ void wake_up_idle_cpu(int cpu)
1239 if (!tsk_is_polling(rq->idle)) 1240 if (!tsk_is_polling(rq->idle))
1240 smp_send_reschedule(cpu); 1241 smp_send_reschedule(cpu);
1241} 1242}
1243
1244int nohz_ratelimit(int cpu)
1245{
1246 struct rq *rq = cpu_rq(cpu);
1247 u64 diff = rq->clock - rq->nohz_stamp;
1248
1249 rq->nohz_stamp = rq->clock;
1250
1251 return diff < (NSEC_PER_SEC / HZ) >> 1;
1252}
1253
1242#endif /* CONFIG_NO_HZ */ 1254#endif /* CONFIG_NO_HZ */
1243 1255
1244static u64 sched_avg_period(void) 1256static u64 sched_avg_period(void)
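[Editor's note] nohz_ratelimit() above answers "was this runqueue's nohz check less than half a tick ago?", stamping the current clock on every call. The diffstat shows kernel/time/tick-sched.c changing as well, so presumably the tick code uses this to avoid stopping and restarting the tick back to back, but that hunk is not shown here. A self-contained model of the arithmetic; the HZ value and the struct-pointer parameter are simplifications (the kernel function takes a CPU number and looks up its runqueue):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL
#define HZ		1000		/* illustrative config: tick = 1 ms */

struct rq {
	uint64_t clock;
	uint64_t nohz_stamp;
};

/* "Recently checked" means less than half a tick ago: 500,000 ns with HZ=1000. */
static int nohz_ratelimit(struct rq *rq)
{
	uint64_t diff = rq->clock - rq->nohz_stamp;

	rq->nohz_stamp = rq->clock;
	return diff < (NSEC_PER_SEC / HZ) >> 1;
}

int main(void)
{
	struct rq rq = { .clock = 0, .nohz_stamp = 0 };

	rq.clock = 100000;			/* 0.1 ms later: rate-limited */
	printf("%d\n", nohz_ratelimit(&rq));	/* prints 1 */
	rq.clock += 2000000;			/* 2 ms later: allowed        */
	printf("%d\n", nohz_ratelimit(&rq));	/* prints 0 */
	return 0;
}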
@@ -1781,8 +1793,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1781 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 1793 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1782 } 1794 }
1783 } 1795 }
1784 update_rq_clock(rq1);
1785 update_rq_clock(rq2);
1786} 1796}
1787 1797
1788/* 1798/*
@@ -1813,7 +1823,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1813} 1823}
1814#endif 1824#endif
1815 1825
1816static void calc_load_account_active(struct rq *this_rq); 1826static void calc_load_account_idle(struct rq *this_rq);
1817static void update_sysctl(void); 1827static void update_sysctl(void);
1818static int get_update_sysctl_factor(void); 1828static int get_update_sysctl_factor(void);
1819 1829
@@ -1870,62 +1880,43 @@ static void set_load_weight(struct task_struct *p)
1870 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1880 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1871} 1881}
1872 1882
1873static void update_avg(u64 *avg, u64 sample) 1883static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1874{ 1884{
1875 s64 diff = sample - *avg; 1885 update_rq_clock(rq);
1876 *avg += diff >> 3;
1877}
1878
1879static void
1880enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1881{
1882 if (wakeup)
1883 p->se.start_runtime = p->se.sum_exec_runtime;
1884
1885 sched_info_queued(p); 1886 sched_info_queued(p);
1886 p->sched_class->enqueue_task(rq, p, wakeup, head); 1887 p->sched_class->enqueue_task(rq, p, flags);
1887 p->se.on_rq = 1; 1888 p->se.on_rq = 1;
1888} 1889}
1889 1890
1890static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1891static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1891{ 1892{
1892 if (sleep) { 1893 update_rq_clock(rq);
1893 if (p->se.last_wakeup) {
1894 update_avg(&p->se.avg_overlap,
1895 p->se.sum_exec_runtime - p->se.last_wakeup);
1896 p->se.last_wakeup = 0;
1897 } else {
1898 update_avg(&p->se.avg_wakeup,
1899 sysctl_sched_wakeup_granularity);
1900 }
1901 }
1902
1903 sched_info_dequeued(p); 1894 sched_info_dequeued(p);
1904 p->sched_class->dequeue_task(rq, p, sleep); 1895 p->sched_class->dequeue_task(rq, p, flags);
1905 p->se.on_rq = 0; 1896 p->se.on_rq = 0;
1906} 1897}
1907 1898
1908/* 1899/*
1909 * activate_task - move a task to the runqueue. 1900 * activate_task - move a task to the runqueue.
1910 */ 1901 */
1911static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 1902static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1912{ 1903{
1913 if (task_contributes_to_load(p)) 1904 if (task_contributes_to_load(p))
1914 rq->nr_uninterruptible--; 1905 rq->nr_uninterruptible--;
1915 1906
1916 enqueue_task(rq, p, wakeup, false); 1907 enqueue_task(rq, p, flags);
1917 inc_nr_running(rq); 1908 inc_nr_running(rq);
1918} 1909}
1919 1910
1920/* 1911/*
1921 * deactivate_task - remove a task from the runqueue. 1912 * deactivate_task - remove a task from the runqueue.
1922 */ 1913 */
1923static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 1914static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1924{ 1915{
1925 if (task_contributes_to_load(p)) 1916 if (task_contributes_to_load(p))
1926 rq->nr_uninterruptible++; 1917 rq->nr_uninterruptible++;
1927 1918
1928 dequeue_task(rq, p, sleep); 1919 dequeue_task(rq, p, flags);
1929 dec_nr_running(rq); 1920 dec_nr_running(rq);
1930} 1921}
1931 1922
@@ -2054,21 +2045,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2054 __set_task_cpu(p, new_cpu); 2045 __set_task_cpu(p, new_cpu);
2055} 2046}
2056 2047
2057struct migration_req { 2048struct migration_arg {
2058 struct list_head list;
2059
2060 struct task_struct *task; 2049 struct task_struct *task;
2061 int dest_cpu; 2050 int dest_cpu;
2062
2063 struct completion done;
2064}; 2051};
2065 2052
2053static int migration_cpu_stop(void *data);
2054
2066/* 2055/*
2067 * The task's runqueue lock must be held. 2056 * The task's runqueue lock must be held.
2068 * Returns true if you have to wait for migration thread. 2057 * Returns true if you have to wait for migration thread.
2069 */ 2058 */
2070static int 2059static bool migrate_task(struct task_struct *p, int dest_cpu)
2071migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2072{ 2060{
2073 struct rq *rq = task_rq(p); 2061 struct rq *rq = task_rq(p);
2074 2062
@@ -2076,15 +2064,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2076 * If the task is not on a runqueue (and not running), then 2064 * If the task is not on a runqueue (and not running), then
2077 * the next wake-up will properly place the task. 2065 * the next wake-up will properly place the task.
2078 */ 2066 */
2079 if (!p->se.on_rq && !task_running(rq, p)) 2067 return p->se.on_rq || task_running(rq, p);
2080 return 0;
2081
2082 init_completion(&req->done);
2083 req->task = p;
2084 req->dest_cpu = dest_cpu;
2085 list_add(&req->list, &rq->migration_queue);
2086
2087 return 1;
2088} 2068}
2089 2069
2090/* 2070/*
@@ -2142,7 +2122,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2142 * just go back and repeat. 2122 * just go back and repeat.
2143 */ 2123 */
2144 rq = task_rq_lock(p, &flags); 2124 rq = task_rq_lock(p, &flags);
2145 trace_sched_wait_task(rq, p); 2125 trace_sched_wait_task(p);
2146 running = task_running(rq, p); 2126 running = task_running(rq, p);
2147 on_rq = p->se.on_rq; 2127 on_rq = p->se.on_rq;
2148 ncsw = 0; 2128 ncsw = 0;
@@ -2240,6 +2220,9 @@ void task_oncpu_function_call(struct task_struct *p,
2240} 2220}
2241 2221
2242#ifdef CONFIG_SMP 2222#ifdef CONFIG_SMP
2223/*
2224 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
2225 */
2243static int select_fallback_rq(int cpu, struct task_struct *p) 2226static int select_fallback_rq(int cpu, struct task_struct *p)
2244{ 2227{
2245 int dest_cpu; 2228 int dest_cpu;
@@ -2256,12 +2239,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2256 return dest_cpu; 2239 return dest_cpu;
2257 2240
2258 /* No more Mr. Nice Guy. */ 2241 /* No more Mr. Nice Guy. */
2259 if (dest_cpu >= nr_cpu_ids) { 2242 if (unlikely(dest_cpu >= nr_cpu_ids)) {
2260 rcu_read_lock(); 2243 dest_cpu = cpuset_cpus_allowed_fallback(p);
2261 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2262 rcu_read_unlock();
2263 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2264
2265 /* 2244 /*
2266 * Don't tell them about moving exiting tasks or 2245 * Don't tell them about moving exiting tasks or
2267 * kernel threads (both mm NULL), since they never 2246 * kernel threads (both mm NULL), since they never
@@ -2278,17 +2257,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2278} 2257}
2279 2258
2280/* 2259/*
2281 * Gets called from 3 sites (exec, fork, wakeup), since it is called without 2260 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
2282 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2283 * by:
2284 *
2285 * exec: is unstable, retry loop
2286 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2287 */ 2261 */
2288static inline 2262static inline
2289int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2263int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
2290{ 2264{
2291 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2265 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
2292 2266
2293 /* 2267 /*
2294 * In order not to call set_task_cpu() on a blocking task we need 2268 * In order not to call set_task_cpu() on a blocking task we need
@@ -2306,6 +2280,12 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2306 2280
2307 return cpu; 2281 return cpu;
2308} 2282}
2283
2284static void update_avg(u64 *avg, u64 sample)
2285{
2286 s64 diff = sample - *avg;
2287 *avg += diff >> 3;
2288}
2309#endif 2289#endif
2310 2290
2311/*** 2291/***
@@ -2327,16 +2307,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2327{ 2307{
2328 int cpu, orig_cpu, this_cpu, success = 0; 2308 int cpu, orig_cpu, this_cpu, success = 0;
2329 unsigned long flags; 2309 unsigned long flags;
2310 unsigned long en_flags = ENQUEUE_WAKEUP;
2330 struct rq *rq; 2311 struct rq *rq;
2331 2312
2332 if (!sched_feat(SYNC_WAKEUPS))
2333 wake_flags &= ~WF_SYNC;
2334
2335 this_cpu = get_cpu(); 2313 this_cpu = get_cpu();
2336 2314
2337 smp_wmb(); 2315 smp_wmb();
2338 rq = task_rq_lock(p, &flags); 2316 rq = task_rq_lock(p, &flags);
2339 update_rq_clock(rq);
2340 if (!(p->state & state)) 2317 if (!(p->state & state))
2341 goto out; 2318 goto out;
2342 2319
@@ -2356,28 +2333,26 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2356 * 2333 *
2357 * First fix up the nr_uninterruptible count: 2334 * First fix up the nr_uninterruptible count:
2358 */ 2335 */
2359 if (task_contributes_to_load(p)) 2336 if (task_contributes_to_load(p)) {
2360 rq->nr_uninterruptible--; 2337 if (likely(cpu_online(orig_cpu)))
2338 rq->nr_uninterruptible--;
2339 else
2340 this_rq()->nr_uninterruptible--;
2341 }
2361 p->state = TASK_WAKING; 2342 p->state = TASK_WAKING;
2362 2343
2363 if (p->sched_class->task_waking) 2344 if (p->sched_class->task_waking) {
2364 p->sched_class->task_waking(rq, p); 2345 p->sched_class->task_waking(rq, p);
2346 en_flags |= ENQUEUE_WAKING;
2347 }
2365 2348
2366 __task_rq_unlock(rq); 2349 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2367 2350 if (cpu != orig_cpu)
2368 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2369 if (cpu != orig_cpu) {
2370 /*
2371 * Since we migrate the task without holding any rq->lock,
2372 * we need to be careful with task_rq_lock(), since that
2373 * might end up locking an invalid rq.
2374 */
2375 set_task_cpu(p, cpu); 2351 set_task_cpu(p, cpu);
2376 } 2352 __task_rq_unlock(rq);
2377 2353
2378 rq = cpu_rq(cpu); 2354 rq = cpu_rq(cpu);
2379 raw_spin_lock(&rq->lock); 2355 raw_spin_lock(&rq->lock);
2380 update_rq_clock(rq);
2381 2356
2382 /* 2357 /*
2383 * We migrated the task without holding either rq->lock, however 2358 * We migrated the task without holding either rq->lock, however
@@ -2405,36 +2380,20 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2405 2380
2406out_activate: 2381out_activate:
2407#endif /* CONFIG_SMP */ 2382#endif /* CONFIG_SMP */
2408 schedstat_inc(p, se.nr_wakeups); 2383 schedstat_inc(p, se.statistics.nr_wakeups);
2409 if (wake_flags & WF_SYNC) 2384 if (wake_flags & WF_SYNC)
2410 schedstat_inc(p, se.nr_wakeups_sync); 2385 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2411 if (orig_cpu != cpu) 2386 if (orig_cpu != cpu)
2412 schedstat_inc(p, se.nr_wakeups_migrate); 2387 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2413 if (cpu == this_cpu) 2388 if (cpu == this_cpu)
2414 schedstat_inc(p, se.nr_wakeups_local); 2389 schedstat_inc(p, se.statistics.nr_wakeups_local);
2415 else 2390 else
2416 schedstat_inc(p, se.nr_wakeups_remote); 2391 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2417 activate_task(rq, p, 1); 2392 activate_task(rq, p, en_flags);
2418 success = 1; 2393 success = 1;
2419 2394
2420 /*
2421 * Only attribute actual wakeups done by this task.
2422 */
2423 if (!in_interrupt()) {
2424 struct sched_entity *se = &current->se;
2425 u64 sample = se->sum_exec_runtime;
2426
2427 if (se->last_wakeup)
2428 sample -= se->last_wakeup;
2429 else
2430 sample -= se->start_runtime;
2431 update_avg(&se->avg_wakeup, sample);
2432
2433 se->last_wakeup = se->sum_exec_runtime;
2434 }
2435
2436out_running: 2395out_running:
2437 trace_sched_wakeup(rq, p, success); 2396 trace_sched_wakeup(p, success);
2438 check_preempt_curr(rq, p, wake_flags); 2397 check_preempt_curr(rq, p, wake_flags);
2439 2398
2440 p->state = TASK_RUNNING; 2399 p->state = TASK_RUNNING;
@@ -2494,42 +2453,9 @@ static void __sched_fork(struct task_struct *p)
2494 p->se.sum_exec_runtime = 0; 2453 p->se.sum_exec_runtime = 0;
2495 p->se.prev_sum_exec_runtime = 0; 2454 p->se.prev_sum_exec_runtime = 0;
2496 p->se.nr_migrations = 0; 2455 p->se.nr_migrations = 0;
2497 p->se.last_wakeup = 0;
2498 p->se.avg_overlap = 0;
2499 p->se.start_runtime = 0;
2500 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2501 2456
2502#ifdef CONFIG_SCHEDSTATS 2457#ifdef CONFIG_SCHEDSTATS
2503 p->se.wait_start = 0; 2458 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2504 p->se.wait_max = 0;
2505 p->se.wait_count = 0;
2506 p->se.wait_sum = 0;
2507
2508 p->se.sleep_start = 0;
2509 p->se.sleep_max = 0;
2510 p->se.sum_sleep_runtime = 0;
2511
2512 p->se.block_start = 0;
2513 p->se.block_max = 0;
2514 p->se.exec_max = 0;
2515 p->se.slice_max = 0;
2516
2517 p->se.nr_migrations_cold = 0;
2518 p->se.nr_failed_migrations_affine = 0;
2519 p->se.nr_failed_migrations_running = 0;
2520 p->se.nr_failed_migrations_hot = 0;
2521 p->se.nr_forced_migrations = 0;
2522
2523 p->se.nr_wakeups = 0;
2524 p->se.nr_wakeups_sync = 0;
2525 p->se.nr_wakeups_migrate = 0;
2526 p->se.nr_wakeups_local = 0;
2527 p->se.nr_wakeups_remote = 0;
2528 p->se.nr_wakeups_affine = 0;
2529 p->se.nr_wakeups_affine_attempts = 0;
2530 p->se.nr_wakeups_passive = 0;
2531 p->se.nr_wakeups_idle = 0;
2532
2533#endif 2459#endif
2534 2460
2535 INIT_LIST_HEAD(&p->rt.run_list); 2461 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2550,11 +2476,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
2550 2476
2551 __sched_fork(p); 2477 __sched_fork(p);
2552 /* 2478 /*
2553 * We mark the process as waking here. This guarantees that 2479 * We mark the process as running here. This guarantees that
2554 * nobody will actually run it, and a signal or other external 2480 * nobody will actually run it, and a signal or other external
2555 * event cannot wake it up and insert it on the runqueue either. 2481 * event cannot wake it up and insert it on the runqueue either.
2556 */ 2482 */
2557 p->state = TASK_WAKING; 2483 p->state = TASK_RUNNING;
2558 2484
2559 /* 2485 /*
2560 * Revert to default priority/policy on fork if requested. 2486 * Revert to default priority/policy on fork if requested.
@@ -2621,31 +2547,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2621 int cpu __maybe_unused = get_cpu(); 2547 int cpu __maybe_unused = get_cpu();
2622 2548
2623#ifdef CONFIG_SMP 2549#ifdef CONFIG_SMP
2550 rq = task_rq_lock(p, &flags);
2551 p->state = TASK_WAKING;
2552
2624 /* 2553 /*
2625 * Fork balancing, do it here and not earlier because: 2554 * Fork balancing, do it here and not earlier because:
2626 * - cpus_allowed can change in the fork path 2555 * - cpus_allowed can change in the fork path
2627 * - any previously selected cpu might disappear through hotplug 2556 * - any previously selected cpu might disappear through hotplug
2628 * 2557 *
2629 * We still have TASK_WAKING but PF_STARTING is gone now, meaning 2558 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2630 * ->cpus_allowed is stable, we have preemption disabled, meaning 2559 * without people poking at ->cpus_allowed.
2631 * cpu_online_mask is stable.
2632 */ 2560 */
2633 cpu = select_task_rq(p, SD_BALANCE_FORK, 0); 2561 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
2634 set_task_cpu(p, cpu); 2562 set_task_cpu(p, cpu);
2635#endif
2636 2563
2637 /*
2638 * Since the task is not on the rq and we still have TASK_WAKING set
2639 * nobody else will migrate this task.
2640 */
2641 rq = cpu_rq(cpu);
2642 raw_spin_lock_irqsave(&rq->lock, flags);
2643
2644 BUG_ON(p->state != TASK_WAKING);
2645 p->state = TASK_RUNNING; 2564 p->state = TASK_RUNNING;
2646 update_rq_clock(rq); 2565 task_rq_unlock(rq, &flags);
2566#endif
2567
2568 rq = task_rq_lock(p, &flags);
2647 activate_task(rq, p, 0); 2569 activate_task(rq, p, 0);
2648 trace_sched_wakeup_new(rq, p, 1); 2570 trace_sched_wakeup_new(p, 1);
2649 check_preempt_curr(rq, p, WF_FORK); 2571 check_preempt_curr(rq, p, WF_FORK);
2650#ifdef CONFIG_SMP 2572#ifdef CONFIG_SMP
2651 if (p->sched_class->task_woken) 2573 if (p->sched_class->task_woken)
@@ -2865,7 +2787,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2865 struct mm_struct *mm, *oldmm; 2787 struct mm_struct *mm, *oldmm;
2866 2788
2867 prepare_task_switch(rq, prev, next); 2789 prepare_task_switch(rq, prev, next);
2868 trace_sched_switch(rq, prev, next); 2790 trace_sched_switch(prev, next);
2869 mm = next->mm; 2791 mm = next->mm;
2870 oldmm = prev->active_mm; 2792 oldmm = prev->active_mm;
2871 /* 2793 /*
@@ -2982,6 +2904,61 @@ static unsigned long calc_load_update;
2982unsigned long avenrun[3]; 2904unsigned long avenrun[3];
2983EXPORT_SYMBOL(avenrun); 2905EXPORT_SYMBOL(avenrun);
2984 2906
2907static long calc_load_fold_active(struct rq *this_rq)
2908{
2909 long nr_active, delta = 0;
2910
2911 nr_active = this_rq->nr_running;
2912 nr_active += (long) this_rq->nr_uninterruptible;
2913
2914 if (nr_active != this_rq->calc_load_active) {
2915 delta = nr_active - this_rq->calc_load_active;
2916 this_rq->calc_load_active = nr_active;
2917 }
2918
2919 return delta;
2920}
2921
2922#ifdef CONFIG_NO_HZ
2923/*
2924 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
2925 *
2926 * When making the ILB scale, we should try to pull this in as well.
2927 */
2928static atomic_long_t calc_load_tasks_idle;
2929
2930static void calc_load_account_idle(struct rq *this_rq)
2931{
2932 long delta;
2933
2934 delta = calc_load_fold_active(this_rq);
2935 if (delta)
2936 atomic_long_add(delta, &calc_load_tasks_idle);
2937}
2938
2939static long calc_load_fold_idle(void)
2940{
2941 long delta = 0;
2942
2943 /*
2944 * Its got a race, we don't care...
2945 */
2946 if (atomic_long_read(&calc_load_tasks_idle))
2947 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
2948
2949 return delta;
2950}
2951#else
2952static void calc_load_account_idle(struct rq *this_rq)
2953{
2954}
2955
2956static inline long calc_load_fold_idle(void)
2957{
2958 return 0;
2959}
2960#endif
2961
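[Editor's note] The helpers above split load accounting in two: calc_load_fold_active() computes how much this runqueue's active count changed since it was last accounted, and under NO_HZ an idling CPU parks that delta in calc_load_tasks_idle instead of touching the global counter, to be folded in at the next LOAD_FREQ update by calc_load_account_active() below. A compilable model of the arithmetic, with plain longs standing in for the kernel's atomic_long_t:

#include <stdio.h>

static long calc_load_tasks;		/* global count, atomic in the kernel */
static long calc_load_tasks_idle;	/* NO_HZ parking area                 */

struct rq {
	long nr_running;
	long nr_uninterruptible;
	long calc_load_active;		/* value accounted so far             */
};

static long calc_load_fold_active(struct rq *rq)
{
	long nr_active = rq->nr_running + rq->nr_uninterruptible;
	long delta = 0;

	if (nr_active != rq->calc_load_active) {
		delta = nr_active - rq->calc_load_active;
		rq->calc_load_active = nr_active;
	}
	return delta;
}

int main(void)
{
	struct rq rq = { .nr_running = 3, .nr_uninterruptible = 1 };

	/* An idling CPU parks its not-yet-accounted delta (+4 here). */
	calc_load_tasks_idle += calc_load_fold_active(&rq);

	/* The next LOAD_FREQ update folds both contributions into the global. */
	calc_load_tasks += calc_load_fold_active(&rq);	/* 0: nothing changed since */
	calc_load_tasks += calc_load_tasks_idle;	/* atomic_long_xchg(..., 0) in the kernel */
	calc_load_tasks_idle = 0;

	printf("calc_load_tasks = %ld\n", calc_load_tasks);	/* prints 4 */
	return 0;
}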
2985/** 2962/**
2986 * get_avenrun - get the load average array 2963 * get_avenrun - get the load average array
2987 * @loads: pointer to dest load array 2964 * @loads: pointer to dest load array
@@ -3028,20 +3005,22 @@ void calc_global_load(void)
3028} 3005}
3029 3006
3030/* 3007/*
3031 * Either called from update_cpu_load() or from a cpu going idle 3008 * Called from update_cpu_load() to periodically update this CPU's
3009 * active count.
3032 */ 3010 */
3033static void calc_load_account_active(struct rq *this_rq) 3011static void calc_load_account_active(struct rq *this_rq)
3034{ 3012{
3035 long nr_active, delta; 3013 long delta;
3036 3014
3037 nr_active = this_rq->nr_running; 3015 if (time_before(jiffies, this_rq->calc_load_update))
3038 nr_active += (long) this_rq->nr_uninterruptible; 3016 return;
3039 3017
3040 if (nr_active != this_rq->calc_load_active) { 3018 delta = calc_load_fold_active(this_rq);
3041 delta = nr_active - this_rq->calc_load_active; 3019 delta += calc_load_fold_idle();
3042 this_rq->calc_load_active = nr_active; 3020 if (delta)
3043 atomic_long_add(delta, &calc_load_tasks); 3021 atomic_long_add(delta, &calc_load_tasks);
3044 } 3022
3023 this_rq->calc_load_update += LOAD_FREQ;
3045} 3024}
3046 3025
3047/* 3026/*
@@ -3073,10 +3052,7 @@ static void update_cpu_load(struct rq *this_rq)
3073 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3052 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3074 } 3053 }
3075 3054
3076 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3055 calc_load_account_active(this_rq);
3077 this_rq->calc_load_update += LOAD_FREQ;
3078 calc_load_account_active(this_rq);
3079 }
3080} 3056}
3081 3057
3082#ifdef CONFIG_SMP 3058#ifdef CONFIG_SMP
@@ -3088,44 +3064,27 @@ static void update_cpu_load(struct rq *this_rq)
3088void sched_exec(void) 3064void sched_exec(void)
3089{ 3065{
3090 struct task_struct *p = current; 3066 struct task_struct *p = current;
3091 struct migration_req req;
3092 int dest_cpu, this_cpu;
3093 unsigned long flags; 3067 unsigned long flags;
3094 struct rq *rq; 3068 struct rq *rq;
3095 3069 int dest_cpu;
3096again:
3097 this_cpu = get_cpu();
3098 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3099 if (dest_cpu == this_cpu) {
3100 put_cpu();
3101 return;
3102 }
3103 3070
3104 rq = task_rq_lock(p, &flags); 3071 rq = task_rq_lock(p, &flags);
3105 put_cpu(); 3072 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3073 if (dest_cpu == smp_processor_id())
3074 goto unlock;
3106 3075
3107 /* 3076 /*
3108 * select_task_rq() can race against ->cpus_allowed 3077 * select_task_rq() can race against ->cpus_allowed
3109 */ 3078 */
3110 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3079 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3111 || unlikely(!cpu_active(dest_cpu))) { 3080 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3112 task_rq_unlock(rq, &flags); 3081 struct migration_arg arg = { p, dest_cpu };
3113 goto again;
3114 }
3115 3082
3116 /* force the process onto the specified CPU */
3117 if (migrate_task(p, dest_cpu, &req)) {
3118 /* Need to wait for migration thread (might exit: take ref). */
3119 struct task_struct *mt = rq->migration_thread;
3120
3121 get_task_struct(mt);
3122 task_rq_unlock(rq, &flags); 3083 task_rq_unlock(rq, &flags);
3123 wake_up_process(mt); 3084 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
3124 put_task_struct(mt);
3125 wait_for_completion(&req.done);
3126
3127 return; 3085 return;
3128 } 3086 }
3087unlock:
3129 task_rq_unlock(rq, &flags); 3088 task_rq_unlock(rq, &flags);
3130} 3089}
3131 3090
@@ -3597,23 +3556,9 @@ static inline void schedule_debug(struct task_struct *prev)
3597 3556
3598static void put_prev_task(struct rq *rq, struct task_struct *prev) 3557static void put_prev_task(struct rq *rq, struct task_struct *prev)
3599{ 3558{
3600 if (prev->state == TASK_RUNNING) { 3559 if (prev->se.on_rq)
3601 u64 runtime = prev->se.sum_exec_runtime; 3560 update_rq_clock(rq);
3602 3561 rq->skip_clock_update = 0;
3603 runtime -= prev->se.prev_sum_exec_runtime;
3604 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
3605
3606 /*
3607 * In order to avoid avg_overlap growing stale when we are
3608 * indeed overlapping and hence not getting put to sleep, grow
3609 * the avg_overlap on preemption.
3610 *
3611 * We use the average preemption runtime because that
3612 * correlates to the amount of cache footprint a task can
3613 * build up.
3614 */
3615 update_avg(&prev->se.avg_overlap, runtime);
3616 }
3617 prev->sched_class->put_prev_task(rq, prev); 3562 prev->sched_class->put_prev_task(rq, prev);
3618} 3563}
3619 3564
@@ -3676,14 +3621,13 @@ need_resched_nonpreemptible:
3676 hrtick_clear(rq); 3621 hrtick_clear(rq);
3677 3622
3678 raw_spin_lock_irq(&rq->lock); 3623 raw_spin_lock_irq(&rq->lock);
3679 update_rq_clock(rq);
3680 clear_tsk_need_resched(prev); 3624 clear_tsk_need_resched(prev);
3681 3625
3682 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3626 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3683 if (unlikely(signal_pending_state(prev->state, prev))) 3627 if (unlikely(signal_pending_state(prev->state, prev)))
3684 prev->state = TASK_RUNNING; 3628 prev->state = TASK_RUNNING;
3685 else 3629 else
3686 deactivate_task(rq, prev, 1); 3630 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3687 switch_count = &prev->nvcsw; 3631 switch_count = &prev->nvcsw;
3688 } 3632 }
3689 3633
@@ -4006,8 +3950,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4006 if (!x->done) { 3950 if (!x->done) {
4007 DECLARE_WAITQUEUE(wait, current); 3951 DECLARE_WAITQUEUE(wait, current);
4008 3952
4009 wait.flags |= WQ_FLAG_EXCLUSIVE; 3953 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4010 __add_wait_queue_tail(&x->wait, &wait);
4011 do { 3954 do {
4012 if (signal_pending_state(state, current)) { 3955 if (signal_pending_state(state, current)) {
4013 timeout = -ERESTARTSYS; 3956 timeout = -ERESTARTSYS;
@@ -4233,7 +4176,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4233 BUG_ON(prio < 0 || prio > MAX_PRIO); 4176 BUG_ON(prio < 0 || prio > MAX_PRIO);
4234 4177
4235 rq = task_rq_lock(p, &flags); 4178 rq = task_rq_lock(p, &flags);
4236 update_rq_clock(rq);
4237 4179
4238 oldprio = p->prio; 4180 oldprio = p->prio;
4239 prev_class = p->sched_class; 4181 prev_class = p->sched_class;
@@ -4254,7 +4196,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4254 if (running) 4196 if (running)
4255 p->sched_class->set_curr_task(rq); 4197 p->sched_class->set_curr_task(rq);
4256 if (on_rq) { 4198 if (on_rq) {
4257 enqueue_task(rq, p, 0, oldprio < prio); 4199 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4258 4200
4259 check_class_changed(rq, p, prev_class, oldprio, running); 4201 check_class_changed(rq, p, prev_class, oldprio, running);
4260 } 4202 }
@@ -4276,7 +4218,6 @@ void set_user_nice(struct task_struct *p, long nice)
4276 * the task might be in the middle of scheduling on another CPU. 4218 * the task might be in the middle of scheduling on another CPU.
4277 */ 4219 */
4278 rq = task_rq_lock(p, &flags); 4220 rq = task_rq_lock(p, &flags);
4279 update_rq_clock(rq);
4280 /* 4221 /*
4281 * The RT priorities are set via sched_setscheduler(), but we still 4222 * The RT priorities are set via sched_setscheduler(), but we still
4282 * allow the 'normal' nice value to be set - but as expected 4223 * allow the 'normal' nice value to be set - but as expected
@@ -4298,7 +4239,7 @@ void set_user_nice(struct task_struct *p, long nice)
4298 delta = p->prio - old_prio; 4239 delta = p->prio - old_prio;
4299 4240
4300 if (on_rq) { 4241 if (on_rq) {
4301 enqueue_task(rq, p, 0, false); 4242 enqueue_task(rq, p, 0);
4302 /* 4243 /*
4303 * If the task increased its priority or is running and 4244 * If the task increased its priority or is running and
4304 * lowered its priority, then reschedule its CPU: 4245 * lowered its priority, then reschedule its CPU:
@@ -4559,7 +4500,6 @@ recheck:
4559 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4500 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4560 goto recheck; 4501 goto recheck;
4561 } 4502 }
4562 update_rq_clock(rq);
4563 on_rq = p->se.on_rq; 4503 on_rq = p->se.on_rq;
4564 running = task_current(rq, p); 4504 running = task_current(rq, p);
4565 if (on_rq) 4505 if (on_rq)
@@ -5296,17 +5236,15 @@ static inline void sched_init_granularity(void)
5296/* 5236/*
5297 * This is how migration works: 5237 * This is how migration works:
5298 * 5238 *
5299 * 1) we queue a struct migration_req structure in the source CPU's 5239 * 1) we invoke migration_cpu_stop() on the target CPU using
5300 * runqueue and wake up that CPU's migration thread. 5240 * stop_one_cpu().
5301 * 2) we down() the locked semaphore => thread blocks. 5241 * 2) stopper starts to run (implicitly forcing the migrated thread
5302 * 3) migration thread wakes up (implicitly it forces the migrated 5242 * off the CPU)
5303 * thread off the CPU) 5243 * 3) it checks whether the migrated task is still in the wrong runqueue.
5304 * 4) it gets the migration request and checks whether the migrated 5244 * 4) if it's in the wrong runqueue then the migration thread removes
5305 * task is still in the wrong runqueue.
5306 * 5) if it's in the wrong runqueue then the migration thread removes
5307 * it and puts it into the right queue. 5245 * it and puts it into the right queue.
5308 * 6) migration thread up()s the semaphore. 5246 * 5) stopper completes and stop_one_cpu() returns and the migration
5309 * 7) we wake up and the migration is done. 5247 * is done.
5310 */ 5248 */
5311 5249
5312/* 5250/*
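[Editor's note] The rewritten steps above describe a synchronous contract: the caller hands a callback to stop_one_cpu(), the target CPU's stopper thread runs it (displacing whatever was running there), and stop_one_cpu() only returns once the callback has completed. A compilable toy model of that flow, reusing the patch's names purely as illustration; the real stop_one_cpu() lives in kernel/stop_machine.c and runs the function on the named CPU, not inline:

#include <stdio.h>

struct migration_arg {
	int task_cpu;	/* where the "task" currently is     */
	int dest_cpu;	/* where the caller wants it to be   */
};

static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;

	if (arg->task_cpu != arg->dest_cpu) {	/* step 3: still on the wrong rq? */
		printf("moving task %d -> %d\n", arg->task_cpu, arg->dest_cpu);
		arg->task_cpu = arg->dest_cpu;	/* step 4: put it on the right one */
	}
	return 0;
}

/* Toy stand-in: run the callback to completion and return (steps 2 and 5). */
static int stop_one_cpu(int cpu, int (*fn)(void *), void *arg)
{
	(void)cpu;
	return fn(arg);
}

int main(void)
{
	struct migration_arg arg = { .task_cpu = 0, .dest_cpu = 2 };

	stop_one_cpu(arg.dest_cpu, migration_cpu_stop, &arg);	/* step 1 */
	printf("task now on cpu %d\n", arg.task_cpu);
	return 0;
}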
@@ -5320,12 +5258,23 @@ static inline void sched_init_granularity(void)
5320 */ 5258 */
5321int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5259int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5322{ 5260{
5323 struct migration_req req;
5324 unsigned long flags; 5261 unsigned long flags;
5325 struct rq *rq; 5262 struct rq *rq;
5263 unsigned int dest_cpu;
5326 int ret = 0; 5264 int ret = 0;
5327 5265
5266 /*
5267 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5268 * drop the rq->lock and still rely on ->cpus_allowed.
5269 */
5270again:
5271 while (task_is_waking(p))
5272 cpu_relax();
5328 rq = task_rq_lock(p, &flags); 5273 rq = task_rq_lock(p, &flags);
5274 if (task_is_waking(p)) {
5275 task_rq_unlock(rq, &flags);
5276 goto again;
5277 }
5329 5278
5330 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5279 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5331 ret = -EINVAL; 5280 ret = -EINVAL;
@@ -5349,15 +5298,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5349 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5298 if (cpumask_test_cpu(task_cpu(p), new_mask))
5350 goto out; 5299 goto out;
5351 5300
5352 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { 5301 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5302 if (migrate_task(p, dest_cpu)) {
5303 struct migration_arg arg = { p, dest_cpu };
5353 /* Need help from migration thread: drop lock and wait. */ 5304 /* Need help from migration thread: drop lock and wait. */
5354 struct task_struct *mt = rq->migration_thread;
5355
5356 get_task_struct(mt);
5357 task_rq_unlock(rq, &flags); 5305 task_rq_unlock(rq, &flags);
5358 wake_up_process(mt); 5306 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5359 put_task_struct(mt);
5360 wait_for_completion(&req.done);
5361 tlb_migrate_finish(p->mm); 5307 tlb_migrate_finish(p->mm);
5362 return 0; 5308 return 0;
5363 } 5309 }
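[Editor's note] The "again:" loop above is the price of relying on ->cpus_allowed without the old busy-wait inside task_rq_lock(): spin while the task is marked TASK_WAKING, take the runqueue lock, then re-check and retry if a wakeup raced in between. A small userspace model of that wait-lock-recheck pattern (compile with -pthread; the atomic flag stands in for the TASK_WAKING state):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int waking;			/* stands in for p->state == TASK_WAKING */
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

/* Only return with the lock held while no wakeup is in flight. */
static void lock_rq_stable(void)
{
again:
	while (atomic_load(&waking))
		;				/* cpu_relax() stand-in */
	pthread_mutex_lock(&rq_lock);
	if (atomic_load(&waking)) {		/* a wakeup slipped in: retry */
		pthread_mutex_unlock(&rq_lock);
		goto again;
	}
}

int main(void)
{
	lock_rq_stable();
	printf("locked with no wakeup in flight; cpus_allowed is stable\n");
	pthread_mutex_unlock(&rq_lock);
	return 0;
}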
@@ -5415,98 +5361,49 @@ fail:
5415 return ret; 5361 return ret;
5416} 5362}
5417 5363
5418#define RCU_MIGRATION_IDLE 0
5419#define RCU_MIGRATION_NEED_QS 1
5420#define RCU_MIGRATION_GOT_QS 2
5421#define RCU_MIGRATION_MUST_SYNC 3
5422
5423/* 5364/*
5424 * migration_thread - this is a highprio system thread that performs 5365 * migration_cpu_stop - this will be executed by a highprio stopper thread
5425 * thread migration by bumping thread off CPU then 'pushing' onto 5366 * and performs thread migration by bumping thread off CPU then
5426 * another runqueue. 5367 * 'pushing' onto another runqueue.
5427 */ 5368 */
5428static int migration_thread(void *data) 5369static int migration_cpu_stop(void *data)
5429{ 5370{
5430 int badcpu; 5371 struct migration_arg *arg = data;
5431 int cpu = (long)data;
5432 struct rq *rq;
5433
5434 rq = cpu_rq(cpu);
5435 BUG_ON(rq->migration_thread != current);
5436
5437 set_current_state(TASK_INTERRUPTIBLE);
5438 while (!kthread_should_stop()) {
5439 struct migration_req *req;
5440 struct list_head *head;
5441
5442 raw_spin_lock_irq(&rq->lock);
5443
5444 if (cpu_is_offline(cpu)) {
5445 raw_spin_unlock_irq(&rq->lock);
5446 break;
5447 }
5448
5449 if (rq->active_balance) {
5450 active_load_balance(rq, cpu);
5451 rq->active_balance = 0;
5452 }
5453
5454 head = &rq->migration_queue;
5455
5456 if (list_empty(head)) {
5457 raw_spin_unlock_irq(&rq->lock);
5458 schedule();
5459 set_current_state(TASK_INTERRUPTIBLE);
5460 continue;
5461 }
5462 req = list_entry(head->next, struct migration_req, list);
5463 list_del_init(head->next);
5464
5465 if (req->task != NULL) {
5466 raw_spin_unlock(&rq->lock);
5467 __migrate_task(req->task, cpu, req->dest_cpu);
5468 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
5469 req->dest_cpu = RCU_MIGRATION_GOT_QS;
5470 raw_spin_unlock(&rq->lock);
5471 } else {
5472 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
5473 raw_spin_unlock(&rq->lock);
5474 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
5475 }
5476 local_irq_enable();
5477
5478 complete(&req->done);
5479 }
5480 __set_current_state(TASK_RUNNING);
5481
5482 return 0;
5483}
5484
5485#ifdef CONFIG_HOTPLUG_CPU
5486
5487static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
5488{
5489 int ret;
5490 5372
5373 /*
5374 * The original target cpu might have gone down and we might
5375 * be on another cpu but it doesn't matter.
5376 */
5491 local_irq_disable(); 5377 local_irq_disable();
5492 ret = __migrate_task(p, src_cpu, dest_cpu); 5378 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5493 local_irq_enable(); 5379 local_irq_enable();
5494 return ret; 5380 return 0;
5495} 5381}
5496 5382
5383#ifdef CONFIG_HOTPLUG_CPU
5497/* 5384/*
5498 * Figure out where task on dead CPU should go, use force if necessary. 5385 * Figure out where task on dead CPU should go, use force if necessary.
5499 */ 5386 */
5500static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5387void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5501{ 5388{
5502 int dest_cpu; 5389 struct rq *rq = cpu_rq(dead_cpu);
5390 int needs_cpu, uninitialized_var(dest_cpu);
5391 unsigned long flags;
5503 5392
5504again: 5393 local_irq_save(flags);
5505 dest_cpu = select_fallback_rq(dead_cpu, p);
5506 5394
5507 /* It can have affinity changed while we were choosing. */ 5395 raw_spin_lock(&rq->lock);
5508 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 5396 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
5509 goto again; 5397 if (needs_cpu)
5398 dest_cpu = select_fallback_rq(dead_cpu, p);
5399 raw_spin_unlock(&rq->lock);
5400 /*
5401 * It can only fail if we race with set_cpus_allowed(),
5402 * in the racer should migrate the task anyway.
5403 */
5404 if (needs_cpu)
5405 __migrate_task(p, dead_cpu, dest_cpu);
5406 local_irq_restore(flags);
5510} 5407}
5511 5408
5512/* 5409/*
@@ -5570,7 +5467,6 @@ void sched_idle_next(void)
5570 5467
5571 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5468 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5572 5469
5573 update_rq_clock(rq);
5574 activate_task(rq, p, 0); 5470 activate_task(rq, p, 0);
5575 5471
5576 raw_spin_unlock_irqrestore(&rq->lock, flags); 5472 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5625,7 +5521,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
5625 for ( ; ; ) { 5521 for ( ; ; ) {
5626 if (!rq->nr_running) 5522 if (!rq->nr_running)
5627 break; 5523 break;
5628 update_rq_clock(rq);
5629 next = pick_next_task(rq); 5524 next = pick_next_task(rq);
5630 if (!next) 5525 if (!next)
5631 break; 5526 break;
@@ -5848,35 +5743,20 @@ static void set_rq_offline(struct rq *rq)
5848static int __cpuinit 5743static int __cpuinit
5849migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5744migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5850{ 5745{
5851 struct task_struct *p;
5852 int cpu = (long)hcpu; 5746 int cpu = (long)hcpu;
5853 unsigned long flags; 5747 unsigned long flags;
5854 struct rq *rq; 5748 struct rq *rq = cpu_rq(cpu);
5855 5749
5856 switch (action) { 5750 switch (action) {
5857 5751
5858 case CPU_UP_PREPARE: 5752 case CPU_UP_PREPARE:
5859 case CPU_UP_PREPARE_FROZEN: 5753 case CPU_UP_PREPARE_FROZEN:
5860 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5861 if (IS_ERR(p))
5862 return NOTIFY_BAD;
5863 kthread_bind(p, cpu);
5864 /* Must be high prio: stop_machine expects to yield to it. */
5865 rq = task_rq_lock(p, &flags);
5866 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5867 task_rq_unlock(rq, &flags);
5868 get_task_struct(p);
5869 cpu_rq(cpu)->migration_thread = p;
5870 rq->calc_load_update = calc_load_update; 5754 rq->calc_load_update = calc_load_update;
5871 break; 5755 break;
5872 5756
5873 case CPU_ONLINE: 5757 case CPU_ONLINE:
5874 case CPU_ONLINE_FROZEN: 5758 case CPU_ONLINE_FROZEN:
5875 /* Strictly unnecessary, as first user will wake it. */
5876 wake_up_process(cpu_rq(cpu)->migration_thread);
5877
5878 /* Update our root-domain */ 5759 /* Update our root-domain */
5879 rq = cpu_rq(cpu);
5880 raw_spin_lock_irqsave(&rq->lock, flags); 5760 raw_spin_lock_irqsave(&rq->lock, flags);
5881 if (rq->rd) { 5761 if (rq->rd) {
5882 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5762 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -5887,61 +5767,24 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5887 break; 5767 break;
5888 5768
5889#ifdef CONFIG_HOTPLUG_CPU 5769#ifdef CONFIG_HOTPLUG_CPU
5890 case CPU_UP_CANCELED:
5891 case CPU_UP_CANCELED_FROZEN:
5892 if (!cpu_rq(cpu)->migration_thread)
5893 break;
5894 /* Unbind it from offline cpu so it can run. Fall thru. */
5895 kthread_bind(cpu_rq(cpu)->migration_thread,
5896 cpumask_any(cpu_online_mask));
5897 kthread_stop(cpu_rq(cpu)->migration_thread);
5898 put_task_struct(cpu_rq(cpu)->migration_thread);
5899 cpu_rq(cpu)->migration_thread = NULL;
5900 break;
5901
5902 case CPU_DEAD: 5770 case CPU_DEAD:
5903 case CPU_DEAD_FROZEN: 5771 case CPU_DEAD_FROZEN:
5904 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
5905 migrate_live_tasks(cpu); 5772 migrate_live_tasks(cpu);
5906 rq = cpu_rq(cpu);
5907 kthread_stop(rq->migration_thread);
5908 put_task_struct(rq->migration_thread);
5909 rq->migration_thread = NULL;
5910 /* Idle task back to normal (off runqueue, low prio) */ 5773 /* Idle task back to normal (off runqueue, low prio) */
5911 raw_spin_lock_irq(&rq->lock); 5774 raw_spin_lock_irq(&rq->lock);
5912 update_rq_clock(rq);
5913 deactivate_task(rq, rq->idle, 0); 5775 deactivate_task(rq, rq->idle, 0);
5914 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5776 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5915 rq->idle->sched_class = &idle_sched_class; 5777 rq->idle->sched_class = &idle_sched_class;
5916 migrate_dead_tasks(cpu); 5778 migrate_dead_tasks(cpu);
5917 raw_spin_unlock_irq(&rq->lock); 5779 raw_spin_unlock_irq(&rq->lock);
5918 cpuset_unlock();
5919 migrate_nr_uninterruptible(rq); 5780 migrate_nr_uninterruptible(rq);
5920 BUG_ON(rq->nr_running != 0); 5781 BUG_ON(rq->nr_running != 0);
5921 calc_global_load_remove(rq); 5782 calc_global_load_remove(rq);
5922 /*
5923 * No need to migrate the tasks: it was best-effort if
5924 * they didn't take sched_hotcpu_mutex. Just wake up
5925 * the requestors.
5926 */
5927 raw_spin_lock_irq(&rq->lock);
5928 while (!list_empty(&rq->migration_queue)) {
5929 struct migration_req *req;
5930
5931 req = list_entry(rq->migration_queue.next,
5932 struct migration_req, list);
5933 list_del_init(&req->list);
5934 raw_spin_unlock_irq(&rq->lock);
5935 complete(&req->done);
5936 raw_spin_lock_irq(&rq->lock);
5937 }
5938 raw_spin_unlock_irq(&rq->lock);
5939 break; 5783 break;
5940 5784
5941 case CPU_DYING: 5785 case CPU_DYING:
5942 case CPU_DYING_FROZEN: 5786 case CPU_DYING_FROZEN:
5943 /* Update our root-domain */ 5787 /* Update our root-domain */
5944 rq = cpu_rq(cpu);
5945 raw_spin_lock_irqsave(&rq->lock, flags); 5788 raw_spin_lock_irqsave(&rq->lock, flags);
5946 if (rq->rd) { 5789 if (rq->rd) {
5947 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5790 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -6272,6 +6115,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6272 struct rq *rq = cpu_rq(cpu); 6115 struct rq *rq = cpu_rq(cpu);
6273 struct sched_domain *tmp; 6116 struct sched_domain *tmp;
6274 6117
6118 for (tmp = sd; tmp; tmp = tmp->parent)
6119 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6120
6275 /* Remove the sched domains which do not contribute to scheduling. */ 6121 /* Remove the sched domains which do not contribute to scheduling. */
6276 for (tmp = sd; tmp; ) { 6122 for (tmp = sd; tmp; ) {
6277 struct sched_domain *parent = tmp->parent; 6123 struct sched_domain *parent = tmp->parent;
@@ -7755,10 +7601,8 @@ void __init sched_init(void)
7755 rq->push_cpu = 0; 7601 rq->push_cpu = 0;
7756 rq->cpu = i; 7602 rq->cpu = i;
7757 rq->online = 0; 7603 rq->online = 0;
7758 rq->migration_thread = NULL;
7759 rq->idle_stamp = 0; 7604 rq->idle_stamp = 0;
7760 rq->avg_idle = 2*sysctl_sched_migration_cost; 7605 rq->avg_idle = 2*sysctl_sched_migration_cost;
7761 INIT_LIST_HEAD(&rq->migration_queue);
7762 rq_attach_root(rq, &def_root_domain); 7606 rq_attach_root(rq, &def_root_domain);
7763#endif 7607#endif
7764 init_rq_hrtick(rq); 7608 init_rq_hrtick(rq);
@@ -7859,7 +7703,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7859{ 7703{
7860 int on_rq; 7704 int on_rq;
7861 7705
7862 update_rq_clock(rq);
7863 on_rq = p->se.on_rq; 7706 on_rq = p->se.on_rq;
7864 if (on_rq) 7707 if (on_rq)
7865 deactivate_task(rq, p, 0); 7708 deactivate_task(rq, p, 0);
@@ -7886,9 +7729,9 @@ void normalize_rt_tasks(void)
7886 7729
7887 p->se.exec_start = 0; 7730 p->se.exec_start = 0;
7888#ifdef CONFIG_SCHEDSTATS 7731#ifdef CONFIG_SCHEDSTATS
7889 p->se.wait_start = 0; 7732 p->se.statistics.wait_start = 0;
7890 p->se.sleep_start = 0; 7733 p->se.statistics.sleep_start = 0;
7891 p->se.block_start = 0; 7734 p->se.statistics.block_start = 0;
7892#endif 7735#endif
7893 7736
7894 if (!rt_task(p)) { 7737 if (!rt_task(p)) {
@@ -8221,8 +8064,6 @@ void sched_move_task(struct task_struct *tsk)
8221 8064
8222 rq = task_rq_lock(tsk, &flags); 8065 rq = task_rq_lock(tsk, &flags);
8223 8066
8224 update_rq_clock(rq);
8225
8226 running = task_current(rq, tsk); 8067 running = task_current(rq, tsk);
8227 on_rq = tsk->se.on_rq; 8068 on_rq = tsk->se.on_rq;
8228 8069
@@ -8241,7 +8082,7 @@ void sched_move_task(struct task_struct *tsk)
8241 if (unlikely(running)) 8082 if (unlikely(running))
8242 tsk->sched_class->set_curr_task(rq); 8083 tsk->sched_class->set_curr_task(rq);
8243 if (on_rq) 8084 if (on_rq)
8244 enqueue_task(rq, tsk, 0, false); 8085 enqueue_task(rq, tsk, 0);
8245 8086
8246 task_rq_unlock(rq, &flags); 8087 task_rq_unlock(rq, &flags);
8247} 8088}
@@ -9055,43 +8896,32 @@ struct cgroup_subsys cpuacct_subsys = {
9055 8896
9056#ifndef CONFIG_SMP 8897#ifndef CONFIG_SMP
9057 8898
9058int rcu_expedited_torture_stats(char *page)
9059{
9060 return 0;
9061}
9062EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9063
9064void synchronize_sched_expedited(void) 8899void synchronize_sched_expedited(void)
9065{ 8900{
8901 barrier();
9066} 8902}
9067EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8903EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9068 8904
9069#else /* #ifndef CONFIG_SMP */ 8905#else /* #ifndef CONFIG_SMP */
9070 8906
9071static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 8907static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9072static DEFINE_MUTEX(rcu_sched_expedited_mutex);
9073
9074#define RCU_EXPEDITED_STATE_POST -2
9075#define RCU_EXPEDITED_STATE_IDLE -1
9076
9077static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9078 8908
9079int rcu_expedited_torture_stats(char *page) 8909static int synchronize_sched_expedited_cpu_stop(void *data)
9080{ 8910{
9081 int cnt = 0; 8911 /*
9082 int cpu; 8912 * There must be a full memory barrier on each affected CPU
9083 8913 * between the time that try_stop_cpus() is called and the
9084 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 8914 * time that it returns.
9085 for_each_online_cpu(cpu) { 8915 *
9086 cnt += sprintf(&page[cnt], " %d:%d", 8916 * In the current initial implementation of cpu_stop, the
9087 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 8917 * above condition is already met when the control reaches
9088 } 8918 * this point and the following smp_mb() is not strictly
9089 cnt += sprintf(&page[cnt], "\n"); 8919 * necessary. Do smp_mb() anyway for documentation and
9090 return cnt; 8920 * robustness against future implementation changes.
8921 */
8922 smp_mb(); /* See above comment block. */
8923 return 0;
9091} 8924}
9092EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9093
9094static long synchronize_sched_expedited_count;
9095 8925
9096/* 8926/*
9097 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 8927 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9105,18 +8935,14 @@ static long synchronize_sched_expedited_count;
9105 */ 8935 */
9106void synchronize_sched_expedited(void) 8936void synchronize_sched_expedited(void)
9107{ 8937{
9108 int cpu; 8938 int snap, trycount = 0;
9109 unsigned long flags;
9110 bool need_full_sync = 0;
9111 struct rq *rq;
9112 struct migration_req *req;
9113 long snap;
9114 int trycount = 0;
9115 8939
9116 smp_mb(); /* ensure prior mod happens before capturing snap. */ 8940 smp_mb(); /* ensure prior mod happens before capturing snap. */
9117 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 8941 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9118 get_online_cpus(); 8942 get_online_cpus();
9119 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 8943 while (try_stop_cpus(cpu_online_mask,
8944 synchronize_sched_expedited_cpu_stop,
8945 NULL) == -EAGAIN) {
9120 put_online_cpus(); 8946 put_online_cpus();
9121 if (trycount++ < 10) 8947 if (trycount++ < 10)
9122 udelay(trycount * num_online_cpus()); 8948 udelay(trycount * num_online_cpus());
@@ -9124,41 +8950,15 @@ void synchronize_sched_expedited(void)
9124 synchronize_sched(); 8950 synchronize_sched();
9125 return; 8951 return;
9126 } 8952 }
9127 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 8953 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9128 smp_mb(); /* ensure test happens before caller kfree */ 8954 smp_mb(); /* ensure test happens before caller kfree */
9129 return; 8955 return;
9130 } 8956 }
9131 get_online_cpus(); 8957 get_online_cpus();
9132 } 8958 }
9133 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 8959 atomic_inc(&synchronize_sched_expedited_count);
9134 for_each_online_cpu(cpu) { 8960 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9135 rq = cpu_rq(cpu);
9136 req = &per_cpu(rcu_migration_req, cpu);
9137 init_completion(&req->done);
9138 req->task = NULL;
9139 req->dest_cpu = RCU_MIGRATION_NEED_QS;
9140 raw_spin_lock_irqsave(&rq->lock, flags);
9141 list_add(&req->list, &rq->migration_queue);
9142 raw_spin_unlock_irqrestore(&rq->lock, flags);
9143 wake_up_process(rq->migration_thread);
9144 }
9145 for_each_online_cpu(cpu) {
9146 rcu_expedited_state = cpu;
9147 req = &per_cpu(rcu_migration_req, cpu);
9148 rq = cpu_rq(cpu);
9149 wait_for_completion(&req->done);
9150 raw_spin_lock_irqsave(&rq->lock, flags);
9151 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
9152 need_full_sync = 1;
9153 req->dest_cpu = RCU_MIGRATION_IDLE;
9154 raw_spin_unlock_irqrestore(&rq->lock, flags);
9155 }
9156 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9157 synchronize_sched_expedited_count++;
9158 mutex_unlock(&rcu_sched_expedited_mutex);
9159 put_online_cpus(); 8961 put_online_cpus();
9160 if (need_full_sync)
9161 synchronize_sched();
9162} 8962}
9163EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8963EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9164 8964
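
Read in one piece, the reworked expedited grace period is a retry loop around try_stop_cpus() guarded by an atomic generation counter. A consolidated sketch of the new-side code from the hunks above (the else branch around the synchronize_sched() fallback sits in elided context between the hunks, so its exact placement here is inferred):

void synchronize_sched_expedited(void)
{
	int snap, trycount = 0;

	smp_mb();	/* ensure prior mod happens before capturing snap. */
	snap = atomic_read(&synchronize_sched_expedited_count) + 1;
	get_online_cpus();
	while (try_stop_cpus(cpu_online_mask,
			     synchronize_sched_expedited_cpu_stop,
			     NULL) == -EAGAIN) {
		put_online_cpus();
		if (trycount++ < 10)
			udelay(trycount * num_online_cpus());
		else {
			synchronize_sched();	/* give up, use a normal GP */
			return;
		}
		if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
			smp_mb();	/* ensure test happens before caller kfree */
			return;		/* someone else already did our work */
		}
		get_online_cpus();
	}
	atomic_inc(&synchronize_sched_expedited_count);
	smp_mb__after_atomic_inc();	/* ensure post-GP actions seen after GP. */
	put_online_cpus();
}
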
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 9b49db144037..9cf1baf6616a 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
70 PN(se->vruntime); 70 PN(se->vruntime);
71 PN(se->sum_exec_runtime); 71 PN(se->sum_exec_runtime);
72#ifdef CONFIG_SCHEDSTATS 72#ifdef CONFIG_SCHEDSTATS
73 PN(se->wait_start); 73 PN(se->statistics.wait_start);
74 PN(se->sleep_start); 74 PN(se->statistics.sleep_start);
75 PN(se->block_start); 75 PN(se->statistics.block_start);
76 PN(se->sleep_max); 76 PN(se->statistics.sleep_max);
77 PN(se->block_max); 77 PN(se->statistics.block_max);
78 PN(se->exec_max); 78 PN(se->statistics.exec_max);
79 PN(se->slice_max); 79 PN(se->statistics.slice_max);
80 PN(se->wait_max); 80 PN(se->statistics.wait_max);
81 PN(se->wait_sum); 81 PN(se->statistics.wait_sum);
82 P(se->wait_count); 82 P(se->statistics.wait_count);
83#endif 83#endif
84 P(se->load.weight); 84 P(se->load.weight);
85#undef PN 85#undef PN
@@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
105 SPLIT_NS(p->se.vruntime), 105 SPLIT_NS(p->se.vruntime),
106 SPLIT_NS(p->se.sum_exec_runtime), 106 SPLIT_NS(p->se.sum_exec_runtime),
107 SPLIT_NS(p->se.sum_sleep_runtime)); 107 SPLIT_NS(p->se.statistics.sum_sleep_runtime));
108#else 108#else
109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
@@ -173,11 +173,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
173 task_group_path(tg, path, sizeof(path)); 173 task_group_path(tg, path, sizeof(path));
174 174
175 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 175 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
176#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
177 {
178 uid_t uid = cfs_rq->tg->uid;
179 SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
180 }
181#else 176#else
182 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 177 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
183#endif 178#endif
@@ -407,40 +402,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
407 PN(se.exec_start); 402 PN(se.exec_start);
408 PN(se.vruntime); 403 PN(se.vruntime);
409 PN(se.sum_exec_runtime); 404 PN(se.sum_exec_runtime);
410 PN(se.avg_overlap);
411 PN(se.avg_wakeup);
412 405
413 nr_switches = p->nvcsw + p->nivcsw; 406 nr_switches = p->nvcsw + p->nivcsw;
414 407
415#ifdef CONFIG_SCHEDSTATS 408#ifdef CONFIG_SCHEDSTATS
416 PN(se.wait_start); 409 PN(se.statistics.wait_start);
417 PN(se.sleep_start); 410 PN(se.statistics.sleep_start);
418 PN(se.block_start); 411 PN(se.statistics.block_start);
419 PN(se.sleep_max); 412 PN(se.statistics.sleep_max);
420 PN(se.block_max); 413 PN(se.statistics.block_max);
421 PN(se.exec_max); 414 PN(se.statistics.exec_max);
422 PN(se.slice_max); 415 PN(se.statistics.slice_max);
423 PN(se.wait_max); 416 PN(se.statistics.wait_max);
424 PN(se.wait_sum); 417 PN(se.statistics.wait_sum);
425 P(se.wait_count); 418 P(se.statistics.wait_count);
426 PN(se.iowait_sum); 419 PN(se.statistics.iowait_sum);
427 P(se.iowait_count); 420 P(se.statistics.iowait_count);
428 P(sched_info.bkl_count); 421 P(sched_info.bkl_count);
429 P(se.nr_migrations); 422 P(se.nr_migrations);
430 P(se.nr_migrations_cold); 423 P(se.statistics.nr_migrations_cold);
431 P(se.nr_failed_migrations_affine); 424 P(se.statistics.nr_failed_migrations_affine);
432 P(se.nr_failed_migrations_running); 425 P(se.statistics.nr_failed_migrations_running);
433 P(se.nr_failed_migrations_hot); 426 P(se.statistics.nr_failed_migrations_hot);
434 P(se.nr_forced_migrations); 427 P(se.statistics.nr_forced_migrations);
435 P(se.nr_wakeups); 428 P(se.statistics.nr_wakeups);
436 P(se.nr_wakeups_sync); 429 P(se.statistics.nr_wakeups_sync);
437 P(se.nr_wakeups_migrate); 430 P(se.statistics.nr_wakeups_migrate);
438 P(se.nr_wakeups_local); 431 P(se.statistics.nr_wakeups_local);
439 P(se.nr_wakeups_remote); 432 P(se.statistics.nr_wakeups_remote);
440 P(se.nr_wakeups_affine); 433 P(se.statistics.nr_wakeups_affine);
441 P(se.nr_wakeups_affine_attempts); 434 P(se.statistics.nr_wakeups_affine_attempts);
442 P(se.nr_wakeups_passive); 435 P(se.statistics.nr_wakeups_passive);
443 P(se.nr_wakeups_idle); 436 P(se.statistics.nr_wakeups_idle);
444 437
445 { 438 {
446 u64 avg_atom, avg_per_cpu; 439 u64 avg_atom, avg_per_cpu;
@@ -491,31 +484,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
491void proc_sched_set_task(struct task_struct *p) 484void proc_sched_set_task(struct task_struct *p)
492{ 485{
493#ifdef CONFIG_SCHEDSTATS 486#ifdef CONFIG_SCHEDSTATS
494 p->se.wait_max = 0; 487 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
495 p->se.wait_sum = 0;
496 p->se.wait_count = 0;
497 p->se.iowait_sum = 0;
498 p->se.iowait_count = 0;
499 p->se.sleep_max = 0;
500 p->se.sum_sleep_runtime = 0;
501 p->se.block_max = 0;
502 p->se.exec_max = 0;
503 p->se.slice_max = 0;
504 p->se.nr_migrations = 0;
505 p->se.nr_migrations_cold = 0;
506 p->se.nr_failed_migrations_affine = 0;
507 p->se.nr_failed_migrations_running = 0;
508 p->se.nr_failed_migrations_hot = 0;
509 p->se.nr_forced_migrations = 0;
510 p->se.nr_wakeups = 0;
511 p->se.nr_wakeups_sync = 0;
512 p->se.nr_wakeups_migrate = 0;
513 p->se.nr_wakeups_local = 0;
514 p->se.nr_wakeups_remote = 0;
515 p->se.nr_wakeups_affine = 0;
516 p->se.nr_wakeups_affine_attempts = 0;
517 p->se.nr_wakeups_passive = 0;
518 p->se.nr_wakeups_idle = 0;
519 p->sched_info.bkl_count = 0;
520#endif 488#endif
521} 489}
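
The per-field resets above collapse into a single memset() because the schedstat counters now live in one embedded structure. A sketch of the layout this implies (the field list is abridged and the actual definition lives in include/linux/sched.h, outside these hunks; the reset helper is an illustrative wrapper):

struct sched_statistics {
	u64		wait_start, wait_max, wait_sum, wait_count;
	u64		sleep_start, sleep_max, sum_sleep_runtime;
	u64		block_start, block_max;
	u64		exec_max, slice_max;
	u64		iowait_sum, iowait_count;
	/* ... nr_wakeups_*, nr_failed_migrations_*, nr_forced_migrations ... */
};

struct sched_entity {
	/* ... */
#ifdef CONFIG_SCHEDSTATS
	struct sched_statistics	statistics;
#endif
	/* ... */
};

static inline void reset_schedstats(struct task_struct *p)
{
	/* One memset now clears every counter at once. */
	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
}
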
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5a5ea2cd924f..217e4a9393e4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -35,8 +35,8 @@
35 * (to see the precise effective timeslice length of your workload, 35 * (to see the precise effective timeslice length of your workload,
36 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
37 */ 37 */
38unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 6000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL; 39unsigned int normalized_sysctl_sched_latency = 6000000ULL;
40 40
41/* 41/*
42 * The initial- and re-scaling of tunables is configurable 42 * The initial- and re-scaling of tunables is configurable
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 52
53/* 53/*
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 2000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
59 59
60/* 60/*
61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
62 */ 62 */
63static unsigned int sched_nr_latency = 5; 63static unsigned int sched_nr_latency = 3;
64 64
65/* 65/*
66 * After fork, child runs first. If set to 0 (default) then 66 * After fork, child runs first. If set to 0 (default) then
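
As the comment above notes, sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity, which is why the defaults move together: 6 ms / 2 ms gives the new initializer of 3 (the old defaults gave 5 ms / 1 ms = 5). A small sanity check of that relationship (the macro names below are illustrative, not the kernel's):

/* New defaults, in nanoseconds. */
#define SCHED_LATENCY_NS	6000000U	/* was 5000000U */
#define SCHED_MIN_GRAN_NS	2000000U	/* was 1000000U */

/* sched_nr_latency tracks latency / min_granularity: 6 ms / 2 ms == 3. */
static const unsigned int sched_nr_latency_check =
	SCHED_LATENCY_NS / SCHED_MIN_GRAN_NS;
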
@@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
505{ 505{
506 unsigned long delta_exec_weighted; 506 unsigned long delta_exec_weighted;
507 507
508 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); 508 schedstat_set(curr->statistics.exec_max,
509 max((u64)delta_exec, curr->statistics.exec_max));
509 510
510 curr->sum_exec_runtime += delta_exec; 511 curr->sum_exec_runtime += delta_exec;
511 schedstat_add(cfs_rq, exec_clock, delta_exec); 512 schedstat_add(cfs_rq, exec_clock, delta_exec);
@@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
548static inline void 549static inline void
549update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 550update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
550{ 551{
551 schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); 552 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
552} 553}
553 554
554/* 555/*
@@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
567static void 568static void
568update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 569update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
569{ 570{
570 schedstat_set(se->wait_max, max(se->wait_max, 571 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
571 rq_of(cfs_rq)->clock - se->wait_start)); 572 rq_of(cfs_rq)->clock - se->statistics.wait_start));
572 schedstat_set(se->wait_count, se->wait_count + 1); 573 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
573 schedstat_set(se->wait_sum, se->wait_sum + 574 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
574 rq_of(cfs_rq)->clock - se->wait_start); 575 rq_of(cfs_rq)->clock - se->statistics.wait_start);
575#ifdef CONFIG_SCHEDSTATS 576#ifdef CONFIG_SCHEDSTATS
576 if (entity_is_task(se)) { 577 if (entity_is_task(se)) {
577 trace_sched_stat_wait(task_of(se), 578 trace_sched_stat_wait(task_of(se),
578 rq_of(cfs_rq)->clock - se->wait_start); 579 rq_of(cfs_rq)->clock - se->statistics.wait_start);
579 } 580 }
580#endif 581#endif
581 schedstat_set(se->wait_start, 0); 582 schedstat_set(se->statistics.wait_start, 0);
582} 583}
583 584
584static inline void 585static inline void
@@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
657 if (entity_is_task(se)) 658 if (entity_is_task(se))
658 tsk = task_of(se); 659 tsk = task_of(se);
659 660
660 if (se->sleep_start) { 661 if (se->statistics.sleep_start) {
661 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 662 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
662 663
663 if ((s64)delta < 0) 664 if ((s64)delta < 0)
664 delta = 0; 665 delta = 0;
665 666
666 if (unlikely(delta > se->sleep_max)) 667 if (unlikely(delta > se->statistics.sleep_max))
667 se->sleep_max = delta; 668 se->statistics.sleep_max = delta;
668 669
669 se->sleep_start = 0; 670 se->statistics.sleep_start = 0;
670 se->sum_sleep_runtime += delta; 671 se->statistics.sum_sleep_runtime += delta;
671 672
672 if (tsk) { 673 if (tsk) {
673 account_scheduler_latency(tsk, delta >> 10, 1); 674 account_scheduler_latency(tsk, delta >> 10, 1);
674 trace_sched_stat_sleep(tsk, delta); 675 trace_sched_stat_sleep(tsk, delta);
675 } 676 }
676 } 677 }
677 if (se->block_start) { 678 if (se->statistics.block_start) {
678 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 679 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
679 680
680 if ((s64)delta < 0) 681 if ((s64)delta < 0)
681 delta = 0; 682 delta = 0;
682 683
683 if (unlikely(delta > se->block_max)) 684 if (unlikely(delta > se->statistics.block_max))
684 se->block_max = delta; 685 se->statistics.block_max = delta;
685 686
686 se->block_start = 0; 687 se->statistics.block_start = 0;
687 se->sum_sleep_runtime += delta; 688 se->statistics.sum_sleep_runtime += delta;
688 689
689 if (tsk) { 690 if (tsk) {
690 if (tsk->in_iowait) { 691 if (tsk->in_iowait) {
691 se->iowait_sum += delta; 692 se->statistics.iowait_sum += delta;
692 se->iowait_count++; 693 se->statistics.iowait_count++;
693 trace_sched_stat_iowait(tsk, delta); 694 trace_sched_stat_iowait(tsk, delta);
694 } 695 }
695 696
@@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
737 vruntime += sched_vslice(cfs_rq, se); 738 vruntime += sched_vslice(cfs_rq, se);
738 739
739 /* sleeps up to a single latency don't count. */ 740 /* sleeps up to a single latency don't count. */
740 if (!initial && sched_feat(FAIR_SLEEPERS)) { 741 if (!initial) {
741 unsigned long thresh = sysctl_sched_latency; 742 unsigned long thresh = sysctl_sched_latency;
742 743
743 /* 744 /*
744 * Convert the sleeper threshold into virtual time.
745 * SCHED_IDLE is a special sub-class. We care about
746 * fairness only relative to other SCHED_IDLE tasks,
747 * all of which have the same weight.
748 */
749 if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
750 task_of(se)->policy != SCHED_IDLE))
751 thresh = calc_delta_fair(thresh, se);
752
753 /*
754 * Halve their sleep time's effect, to allow 745 * Halve their sleep time's effect, to allow
755 * for a gentler effect of sleepers: 746 * for a gentler effect of sleepers:
756 */ 747 */
@@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
766 se->vruntime = vruntime; 757 se->vruntime = vruntime;
767} 758}
768 759
769#define ENQUEUE_WAKEUP 1
770#define ENQUEUE_MIGRATE 2
771
772static void 760static void
773enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 761enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
774{ 762{
@@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
776 * Update the normalized vruntime before updating min_vruntime 764 * Update the normalized vruntime before updating min_vruntime
 777	 * through calling update_curr().			 765	 * through calling update_curr().
778 */ 766 */
779 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) 767 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
780 se->vruntime += cfs_rq->min_vruntime; 768 se->vruntime += cfs_rq->min_vruntime;
781 769
782 /* 770 /*
@@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
812} 800}
813 801
814static void 802static void
815dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 803dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
816{ 804{
817 /* 805 /*
818 * Update run-time statistics of the 'current'. 806 * Update run-time statistics of the 'current'.
@@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
820 update_curr(cfs_rq); 808 update_curr(cfs_rq);
821 809
822 update_stats_dequeue(cfs_rq, se); 810 update_stats_dequeue(cfs_rq, se);
823 if (sleep) { 811 if (flags & DEQUEUE_SLEEP) {
824#ifdef CONFIG_SCHEDSTATS 812#ifdef CONFIG_SCHEDSTATS
825 if (entity_is_task(se)) { 813 if (entity_is_task(se)) {
826 struct task_struct *tsk = task_of(se); 814 struct task_struct *tsk = task_of(se);
827 815
828 if (tsk->state & TASK_INTERRUPTIBLE) 816 if (tsk->state & TASK_INTERRUPTIBLE)
829 se->sleep_start = rq_of(cfs_rq)->clock; 817 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
830 if (tsk->state & TASK_UNINTERRUPTIBLE) 818 if (tsk->state & TASK_UNINTERRUPTIBLE)
831 se->block_start = rq_of(cfs_rq)->clock; 819 se->statistics.block_start = rq_of(cfs_rq)->clock;
832 } 820 }
833#endif 821#endif
834 } 822 }
@@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
845 * update can refer to the ->curr item and we need to reflect this 833 * update can refer to the ->curr item and we need to reflect this
846 * movement in our normalized position. 834 * movement in our normalized position.
847 */ 835 */
848 if (!sleep) 836 if (!(flags & DEQUEUE_SLEEP))
849 se->vruntime -= cfs_rq->min_vruntime; 837 se->vruntime -= cfs_rq->min_vruntime;
850} 838}
851 839
@@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
912 * when there are only lesser-weight tasks around): 900 * when there are only lesser-weight tasks around):
913 */ 901 */
914 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 902 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
915 se->slice_max = max(se->slice_max, 903 se->statistics.slice_max = max(se->statistics.slice_max,
916 se->sum_exec_runtime - se->prev_sum_exec_runtime); 904 se->sum_exec_runtime - se->prev_sum_exec_runtime);
917 } 905 }
918#endif 906#endif
@@ -1054,16 +1042,10 @@ static inline void hrtick_update(struct rq *rq)
1054 * then put the task into the rbtree: 1042 * then put the task into the rbtree:
1055 */ 1043 */
1056static void 1044static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) 1045enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1058{ 1046{
1059 struct cfs_rq *cfs_rq; 1047 struct cfs_rq *cfs_rq;
1060 struct sched_entity *se = &p->se; 1048 struct sched_entity *se = &p->se;
1061 int flags = 0;
1062
1063 if (wakeup)
1064 flags |= ENQUEUE_WAKEUP;
1065 if (p->state == TASK_WAKING)
1066 flags |= ENQUEUE_MIGRATE;
1067 1049
1068 for_each_sched_entity(se) { 1050 for_each_sched_entity(se) {
1069 if (se->on_rq) 1051 if (se->on_rq)
@@ -1081,18 +1063,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1081 * decreased. We remove the task from the rbtree and 1063 * decreased. We remove the task from the rbtree and
1082 * update the fair scheduling stats: 1064 * update the fair scheduling stats:
1083 */ 1065 */
1084static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) 1066static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1085{ 1067{
1086 struct cfs_rq *cfs_rq; 1068 struct cfs_rq *cfs_rq;
1087 struct sched_entity *se = &p->se; 1069 struct sched_entity *se = &p->se;
1088 1070
1089 for_each_sched_entity(se) { 1071 for_each_sched_entity(se) {
1090 cfs_rq = cfs_rq_of(se); 1072 cfs_rq = cfs_rq_of(se);
1091 dequeue_entity(cfs_rq, se, sleep); 1073 dequeue_entity(cfs_rq, se, flags);
1092 /* Don't dequeue parent if it has other entities besides us */ 1074 /* Don't dequeue parent if it has other entities besides us */
1093 if (cfs_rq->load.weight) 1075 if (cfs_rq->load.weight)
1094 break; 1076 break;
1095 sleep = 1; 1077 flags |= DEQUEUE_SLEEP;
1096 } 1078 }
1097 1079
1098 hrtick_update(rq); 1080 hrtick_update(rq);
@@ -1240,7 +1222,6 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1240 1222
1241static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 1223static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1242{ 1224{
1243 struct task_struct *curr = current;
1244 unsigned long this_load, load; 1225 unsigned long this_load, load;
1245 int idx, this_cpu, prev_cpu; 1226 int idx, this_cpu, prev_cpu;
1246 unsigned long tl_per_task; 1227 unsigned long tl_per_task;
@@ -1255,18 +1236,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1255 load = source_load(prev_cpu, idx); 1236 load = source_load(prev_cpu, idx);
1256 this_load = target_load(this_cpu, idx); 1237 this_load = target_load(this_cpu, idx);
1257 1238
1258 if (sync) {
1259 if (sched_feat(SYNC_LESS) &&
1260 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1261 p->se.avg_overlap > sysctl_sched_migration_cost))
1262 sync = 0;
1263 } else {
1264 if (sched_feat(SYNC_MORE) &&
1265 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1266 p->se.avg_overlap < sysctl_sched_migration_cost))
1267 sync = 1;
1268 }
1269
1270 /* 1239 /*
1271 * If sync wakeup then subtract the (maximum possible) 1240 * If sync wakeup then subtract the (maximum possible)
1272 * effect of the currently running task from the load 1241 * effect of the currently running task from the load
@@ -1306,7 +1275,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1306 if (sync && balanced) 1275 if (sync && balanced)
1307 return 1; 1276 return 1;
1308 1277
1309 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1278 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
1310 tl_per_task = cpu_avg_load_per_task(this_cpu); 1279 tl_per_task = cpu_avg_load_per_task(this_cpu);
1311 1280
1312 if (balanced || 1281 if (balanced ||
@@ -1318,7 +1287,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1318 * there is no bad imbalance. 1287 * there is no bad imbalance.
1319 */ 1288 */
1320 schedstat_inc(sd, ttwu_move_affine); 1289 schedstat_inc(sd, ttwu_move_affine);
1321 schedstat_inc(p, se.nr_wakeups_affine); 1290 schedstat_inc(p, se.statistics.nr_wakeups_affine);
1322 1291
1323 return 1; 1292 return 1;
1324 } 1293 }
@@ -1406,29 +1375,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1406/* 1375/*
1407 * Try and locate an idle CPU in the sched_domain. 1376 * Try and locate an idle CPU in the sched_domain.
1408 */ 1377 */
1409static int 1378static int select_idle_sibling(struct task_struct *p, int target)
1410select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1411{ 1379{
1412 int cpu = smp_processor_id(); 1380 int cpu = smp_processor_id();
1413 int prev_cpu = task_cpu(p); 1381 int prev_cpu = task_cpu(p);
1382 struct sched_domain *sd;
1414 int i; 1383 int i;
1415 1384
1416 /* 1385 /*
1417 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE 1386 * If the task is going to be woken-up on this cpu and if it is
1418 * test in select_task_rq_fair) and the prev_cpu is idle then that's 1387 * already idle, then it is the right target.
1419 * always a better target than the current cpu.
1420 */ 1388 */
1421 if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) 1389 if (target == cpu && idle_cpu(cpu))
1390 return cpu;
1391
1392 /*
1393 * If the task is going to be woken-up on the cpu where it previously
1394 * ran and if it is currently idle, then it the right target.
1395 */
1396 if (target == prev_cpu && idle_cpu(prev_cpu))
1422 return prev_cpu; 1397 return prev_cpu;
1423 1398
1424 /* 1399 /*
1425	 * Otherwise, iterate the domain and find an eligible idle cpu.	1400	 * Otherwise, iterate the domains and find an eligible idle cpu.
1426 */ 1401 */
1427 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 1402 for_each_domain(target, sd) {
1428 if (!cpu_rq(i)->cfs.nr_running) { 1403 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1429 target = i;
1430 break; 1404 break;
1405
1406 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1407 if (idle_cpu(i)) {
1408 target = i;
1409 break;
1410 }
1431 } 1411 }
1412
1413 /*
1414 * Lets stop looking for an idle sibling when we reached
1415 * the domain that spans the current cpu and prev_cpu.
1416 */
1417 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
1418 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1419 break;
1432 } 1420 }
1433 1421
1434 return target; 1422 return target;
@@ -1445,7 +1433,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1445 * 1433 *
1446 * preempt must be disabled. 1434 * preempt must be disabled.
1447 */ 1435 */
1448static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 1436static int
1437select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
1449{ 1438{
1450 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1439 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1451 int cpu = smp_processor_id(); 1440 int cpu = smp_processor_id();
@@ -1456,8 +1445,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1456 int sync = wake_flags & WF_SYNC; 1445 int sync = wake_flags & WF_SYNC;
1457 1446
1458 if (sd_flag & SD_BALANCE_WAKE) { 1447 if (sd_flag & SD_BALANCE_WAKE) {
1459 if (sched_feat(AFFINE_WAKEUPS) && 1448 if (cpumask_test_cpu(cpu, &p->cpus_allowed))
1460 cpumask_test_cpu(cpu, &p->cpus_allowed))
1461 want_affine = 1; 1449 want_affine = 1;
1462 new_cpu = prev_cpu; 1450 new_cpu = prev_cpu;
1463 } 1451 }
@@ -1491,34 +1479,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1491 } 1479 }
1492 1480
1493 /* 1481 /*
1494 * While iterating the domains looking for a spanning 1482 * If both cpu and prev_cpu are part of this domain,
1495 * WAKE_AFFINE domain, adjust the affine target to any idle cpu 1483 * cpu is a valid SD_WAKE_AFFINE target.
1496 * in cache sharing domains along the way.
1497 */ 1484 */
1498 if (want_affine) { 1485 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1499 int target = -1; 1486 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1500 1487 affine_sd = tmp;
1501 /* 1488 want_affine = 0;
1502 * If both cpu and prev_cpu are part of this domain,
1503 * cpu is a valid SD_WAKE_AFFINE target.
1504 */
1505 if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
1506 target = cpu;
1507
1508 /*
1509 * If there's an idle sibling in this domain, make that
1510 * the wake_affine target instead of the current cpu.
1511 */
1512 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1513 target = select_idle_sibling(p, tmp, target);
1514
1515 if (target >= 0) {
1516 if (tmp->flags & SD_WAKE_AFFINE) {
1517 affine_sd = tmp;
1518 want_affine = 0;
1519 }
1520 cpu = target;
1521 }
1522 } 1489 }
1523 1490
1524 if (!want_sd && !want_affine) 1491 if (!want_sd && !want_affine)
@@ -1531,22 +1498,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1531 sd = tmp; 1498 sd = tmp;
1532 } 1499 }
1533 1500
1501#ifdef CONFIG_FAIR_GROUP_SCHED
1534 if (sched_feat(LB_SHARES_UPDATE)) { 1502 if (sched_feat(LB_SHARES_UPDATE)) {
1535 /* 1503 /*
1536 * Pick the largest domain to update shares over 1504 * Pick the largest domain to update shares over
1537 */ 1505 */
1538 tmp = sd; 1506 tmp = sd;
1539 if (affine_sd && (!tmp || 1507 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1540 cpumask_weight(sched_domain_span(affine_sd)) >
1541 cpumask_weight(sched_domain_span(sd))))
1542 tmp = affine_sd; 1508 tmp = affine_sd;
1543 1509
1544 if (tmp) 1510 if (tmp) {
1511 raw_spin_unlock(&rq->lock);
1545 update_shares(tmp); 1512 update_shares(tmp);
1513 raw_spin_lock(&rq->lock);
1514 }
1546 } 1515 }
1516#endif
1547 1517
1548 if (affine_sd && wake_affine(affine_sd, p, sync)) 1518 if (affine_sd) {
1549 return cpu; 1519 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1520 return select_idle_sibling(p, cpu);
1521 else
1522 return select_idle_sibling(p, prev_cpu);
1523 }
1550 1524
1551 while (sd) { 1525 while (sd) {
1552 int load_idx = sd->forkexec_idx; 1526 int load_idx = sd->forkexec_idx;
@@ -1576,10 +1550,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1576 1550
1577 /* Now try balancing at a lower domain level of new_cpu */ 1551 /* Now try balancing at a lower domain level of new_cpu */
1578 cpu = new_cpu; 1552 cpu = new_cpu;
1579 weight = cpumask_weight(sched_domain_span(sd)); 1553 weight = sd->span_weight;
1580 sd = NULL; 1554 sd = NULL;
1581 for_each_domain(cpu, tmp) { 1555 for_each_domain(cpu, tmp) {
1582 if (weight <= cpumask_weight(sched_domain_span(tmp))) 1556 if (weight <= tmp->span_weight)
1583 break; 1557 break;
1584 if (tmp->flags & sd_flag) 1558 if (tmp->flags & sd_flag)
1585 sd = tmp; 1559 sd = tmp;
@@ -1591,63 +1565,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1591} 1565}
1592#endif /* CONFIG_SMP */ 1566#endif /* CONFIG_SMP */
1593 1567
1594/*
1595 * Adaptive granularity
1596 *
1597 * se->avg_wakeup gives the average time a task runs until it does a wakeup,
1598 * with the limit of wakeup_gran -- when it never does a wakeup.
1599 *
1600 * So the smaller avg_wakeup is the faster we want this task to preempt,
1601 * but we don't want to treat the preemptee unfairly and therefore allow it
1602 * to run for at least the amount of time we'd like to run.
1603 *
1604 * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
1605 *
1606 * NOTE: we use *nr_running to scale with load, this nicely matches the
1607 * degrading latency on load.
1608 */
1609static unsigned long
1610adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
1611{
1612 u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1613 u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
1614 u64 gran = 0;
1615
1616 if (this_run < expected_wakeup)
1617 gran = expected_wakeup - this_run;
1618
1619 return min_t(s64, gran, sysctl_sched_wakeup_granularity);
1620}
1621
1622static unsigned long 1568static unsigned long
1623wakeup_gran(struct sched_entity *curr, struct sched_entity *se) 1569wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1624{ 1570{
1625 unsigned long gran = sysctl_sched_wakeup_granularity; 1571 unsigned long gran = sysctl_sched_wakeup_granularity;
1626 1572
1627 if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
1628 gran = adaptive_gran(curr, se);
1629
1630 /* 1573 /*
1631	 * Since it's curr running now, convert the gran from real-time	1574	 * Since it's curr running now, convert the gran from real-time
1632	 * to virtual-time in its units.				1575	 * to virtual-time in its units.
1576 *
1577 * By using 'se' instead of 'curr' we penalize light tasks, so
1578 * they get preempted easier. That is, if 'se' < 'curr' then
1579 * the resulting gran will be larger, therefore penalizing the
1580 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1581 * be smaller, again penalizing the lighter task.
1582 *
1583 * This is especially important for buddies when the leftmost
1584 * task is higher priority than the buddy.
1633 */ 1585 */
1634 if (sched_feat(ASYM_GRAN)) { 1586 if (unlikely(se->load.weight != NICE_0_LOAD))
1635 /* 1587 gran = calc_delta_fair(gran, se);
1636 * By using 'se' instead of 'curr' we penalize light tasks, so
1637 * they get preempted easier. That is, if 'se' < 'curr' then
1638 * the resulting gran will be larger, therefore penalizing the
1639 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1640 * be smaller, again penalizing the lighter task.
1641 *
1642 * This is especially important for buddies when the leftmost
1643 * task is higher priority than the buddy.
1644 */
1645 if (unlikely(se->load.weight != NICE_0_LOAD))
1646 gran = calc_delta_fair(gran, se);
1647 } else {
1648 if (unlikely(curr->load.weight != NICE_0_LOAD))
1649 gran = calc_delta_fair(gran, curr);
1650 }
1651 1588
1652 return gran; 1589 return gran;
1653} 1590}
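
With ADAPTIVE_GRAN and ASYM_GRAN gone, the only adjustment left in wakeup_gran() is the weight scaling through calc_delta_fair(), which in effect multiplies the granularity by NICE_0_LOAD / se->load.weight. A rough standalone sketch of that effect (a NICE_0_LOAD of 1024 is assumed; the real calc_delta_fair() also handles overflow and inverse-weight rounding):

/*
 * Effective behaviour of the simplified wakeup_gran(): a lighter wakee
 * (weight < 1024) sees a larger granularity and so preempts less eagerly,
 * while a heavier wakee sees a smaller one.
 */
static unsigned long wakeup_gran_sketch(unsigned long gran_ns,
					unsigned long weight)
{
	const unsigned long nice_0_load = 1024;	/* assumed NICE_0_LOAD */

	if (weight != nice_0_load)
		gran_ns = gran_ns * nice_0_load / weight;

	return gran_ns;	/* weight 512 -> 2x gran, weight 2048 -> gran/2 */
}
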
@@ -1705,7 +1642,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1705 struct task_struct *curr = rq->curr; 1642 struct task_struct *curr = rq->curr;
1706 struct sched_entity *se = &curr->se, *pse = &p->se; 1643 struct sched_entity *se = &curr->se, *pse = &p->se;
1707 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1644 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1708 int sync = wake_flags & WF_SYNC;
1709 int scale = cfs_rq->nr_running >= sched_nr_latency; 1645 int scale = cfs_rq->nr_running >= sched_nr_latency;
1710 1646
1711 if (unlikely(rt_prio(p->prio))) 1647 if (unlikely(rt_prio(p->prio)))
@@ -1738,14 +1674,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1738 if (unlikely(curr->policy == SCHED_IDLE)) 1674 if (unlikely(curr->policy == SCHED_IDLE))
1739 goto preempt; 1675 goto preempt;
1740 1676
1741 if (sched_feat(WAKEUP_SYNC) && sync)
1742 goto preempt;
1743
1744 if (sched_feat(WAKEUP_OVERLAP) &&
1745 se->avg_overlap < sysctl_sched_migration_cost &&
1746 pse->avg_overlap < sysctl_sched_migration_cost)
1747 goto preempt;
1748
1749 if (!sched_feat(WAKEUP_PREEMPT)) 1677 if (!sched_feat(WAKEUP_PREEMPT))
1750 return; 1678 return;
1751 1679
@@ -1844,13 +1772,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1844 * 3) are cache-hot on their current CPU. 1772 * 3) are cache-hot on their current CPU.
1845 */ 1773 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 1774 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine); 1775 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
1848 return 0; 1776 return 0;
1849 } 1777 }
1850 *all_pinned = 0; 1778 *all_pinned = 0;
1851 1779
1852 if (task_running(rq, p)) { 1780 if (task_running(rq, p)) {
1853 schedstat_inc(p, se.nr_failed_migrations_running); 1781 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
1854 return 0; 1782 return 0;
1855 } 1783 }
1856 1784
@@ -1866,14 +1794,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1866#ifdef CONFIG_SCHEDSTATS 1794#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) { 1795 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]); 1796 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations); 1797 schedstat_inc(p, se.statistics.nr_forced_migrations);
1870 } 1798 }
1871#endif 1799#endif
1872 return 1; 1800 return 1;
1873 } 1801 }
1874 1802
1875 if (tsk_cache_hot) { 1803 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot); 1804 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
1877 return 0; 1805 return 0;
1878 } 1806 }
1879 return 1; 1807 return 1;
@@ -2311,7 +2239,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2311 2239
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 2240unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{ 2241{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 2242 unsigned long weight = sd->span_weight;
2315 unsigned long smt_gain = sd->smt_gain; 2243 unsigned long smt_gain = sd->smt_gain;
2316 2244
2317 smt_gain /= weight; 2245 smt_gain /= weight;
@@ -2344,7 +2272,7 @@ unsigned long scale_rt_power(int cpu)
2344 2272
2345static void update_cpu_power(struct sched_domain *sd, int cpu) 2273static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{ 2274{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 2275 unsigned long weight = sd->span_weight;
2348 unsigned long power = SCHED_LOAD_SCALE; 2276 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups; 2277 struct sched_group *sdg = sd->groups;
2350 2278
@@ -2870,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 2798 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871} 2799}
2872 2800
2801static int active_load_balance_cpu_stop(void *data);
2802
2873/* 2803/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2804 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance. 2805 * tasks if there is an imbalance.
@@ -2959,8 +2889,9 @@ redo:
2959 if (need_active_balance(sd, sd_idle, idle)) { 2889 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags); 2890 raw_spin_lock_irqsave(&busiest->lock, flags);
2961 2891
2962 /* don't kick the migration_thread, if the curr 2892 /* don't kick the active_load_balance_cpu_stop,
2963 * task on busiest cpu can't be moved to this_cpu 2893 * if the curr task on busiest cpu can't be
2894 * moved to this_cpu
2964 */ 2895 */
2965 if (!cpumask_test_cpu(this_cpu, 2896 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) { 2897 &busiest->curr->cpus_allowed)) {
@@ -2970,14 +2901,22 @@ redo:
2970 goto out_one_pinned; 2901 goto out_one_pinned;
2971 } 2902 }
2972 2903
2904 /*
2905 * ->active_balance synchronizes accesses to
2906 * ->active_balance_work. Once set, it's cleared
2907 * only after active load balance is finished.
2908 */
2973 if (!busiest->active_balance) { 2909 if (!busiest->active_balance) {
2974 busiest->active_balance = 1; 2910 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu; 2911 busiest->push_cpu = this_cpu;
2976 active_balance = 1; 2912 active_balance = 1;
2977 } 2913 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags); 2914 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2915
2979 if (active_balance) 2916 if (active_balance)
2980 wake_up_process(busiest->migration_thread); 2917 stop_one_cpu_nowait(cpu_of(busiest),
2918 active_load_balance_cpu_stop, busiest,
2919 &busiest->active_balance_work);
2981 2920
2982 /* 2921 /*
2983 * We've kicked active balancing, reset the failure 2922 * We've kicked active balancing, reset the failure
@@ -3084,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3084} 3023}
3085 3024
3086/* 3025/*
3087 * active_load_balance is run by migration threads. It pushes running tasks 3026 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 3027 * running tasks off the busiest CPU onto idle CPUs. It requires at
3089 * running on each physical CPU where possible, and avoids physical / 3028 * least 1 task to be running on each physical CPU where possible, and
3090 * logical imbalances. 3029 * avoids physical / logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */ 3030 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 3031static int active_load_balance_cpu_stop(void *data)
3095{ 3032{
3033 struct rq *busiest_rq = data;
3034 int busiest_cpu = cpu_of(busiest_rq);
3096 int target_cpu = busiest_rq->push_cpu; 3035 int target_cpu = busiest_rq->push_cpu;
3036 struct rq *target_rq = cpu_rq(target_cpu);
3097 struct sched_domain *sd; 3037 struct sched_domain *sd;
3098 struct rq *target_rq; 3038
3039 raw_spin_lock_irq(&busiest_rq->lock);
3040
3041 /* make sure the requested cpu hasn't gone down in the meantime */
3042 if (unlikely(busiest_cpu != smp_processor_id() ||
3043 !busiest_rq->active_balance))
3044 goto out_unlock;
3099 3045
3100 /* Is there any task to move? */ 3046 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1) 3047 if (busiest_rq->nr_running <= 1)
3102 return; 3048 goto out_unlock;
3103
3104 target_rq = cpu_rq(target_cpu);
3105 3049
3106 /* 3050 /*
3107 * This condition is "impossible", if it occurs 3051 * This condition is "impossible", if it occurs
@@ -3112,8 +3056,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3112 3056
3113 /* move a task from busiest_rq to target_rq */ 3057 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq); 3058 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117 3059
3118 /* Search for an sd spanning us and the target CPU. */ 3060 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) { 3061 for_each_domain(target_cpu, sd) {
@@ -3132,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3132 schedstat_inc(sd, alb_failed); 3074 schedstat_inc(sd, alb_failed);
3133 } 3075 }
3134 double_unlock_balance(busiest_rq, target_rq); 3076 double_unlock_balance(busiest_rq, target_rq);
3077out_unlock:
3078 busiest_rq->active_balance = 0;
3079 raw_spin_unlock_irq(&busiest_rq->lock);
3080 return 0;
3135} 3081}
3136 3082
3137#ifdef CONFIG_NO_HZ 3083#ifdef CONFIG_NO_HZ
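
Active load balancing now runs as a cpu-stop callback on the busiest CPU instead of waking a per-CPU migration thread. The general fire-and-forget pattern, using only the cpu_stop interface introduced in the stop_machine.c hunks below (the helper and callback names here are illustrative):

static struct cpu_stop_work balance_work;	/* storage owned by the caller */

static int push_task_cpu_stop(void *data)
{
	struct rq *busiest = data;

	/* Runs in stopper context on cpu_of(busiest); must not sleep. */
	/* ... move one task towards busiest->push_cpu, clear active_balance ... */
	return 0;
}

static void kick_active_balance(struct rq *busiest)
{
	/* Returns immediately; the callback runs later on the target CPU. */
	stop_one_cpu_nowait(cpu_of(busiest), push_task_cpu_stop,
			    busiest, &balance_work);
}
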
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d5059fd761d9..83c66e8ad3ee 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,11 +1,4 @@
1/* 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows 2 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart. 4 * rip the spread apart.
@@ -13,13 +6,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1)
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14 7
15/* 8/*
16 * By not normalizing the sleep time, heavy tasks get an effective
17 * longer period, and lighter task an effective shorter period they
18 * are considered running.
19 */
20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
21
22/*
23 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
24 * tasks 10 * tasks
25 */ 11 */
@@ -31,37 +17,6 @@ SCHED_FEAT(START_DEBIT, 1)
31SCHED_FEAT(WAKEUP_PREEMPT, 1) 17SCHED_FEAT(WAKEUP_PREEMPT, 1)
32 18
33/* 19/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
58 * the remote end is likely to consume the data we just wrote, and
59 * therefore has cache benefit from being placed on the same cpu, see
60 * also AFFINE_WAKEUPS.
61 */
62SCHED_FEAT(SYNC_WAKEUPS, 1)
63
64/*
65 * Based on load and program behaviour, see if it makes sense to place 20 * Based on load and program behaviour, see if it makes sense to place
66 * a newly woken task on the same cpu as the task that woke it -- 21 * a newly woken task on the same cpu as the task that woke it --
67 * improve cache locality. Typically used with SYNC wakeups as 22 * improve cache locality. Typically used with SYNC wakeups as
@@ -70,16 +25,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
70SCHED_FEAT(AFFINE_WAKEUPS, 1) 25SCHED_FEAT(AFFINE_WAKEUPS, 1)
71 26
72/* 27/*
73 * Weaken SYNC hint based on overlap
74 */
75SCHED_FEAT(SYNC_LESS, 1)
76
77/*
78 * Add SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_MORE, 0)
81
82/*
83 * Prefer to schedule the task we woke last (assuming it failed 28 * Prefer to schedule the task we woke last (assuming it failed
 84 * wakeup-preemption), since it's likely going to consume data we	 29 * wakeup-preemption), since it's likely going to consume data we
85 * touched, increases cache locality. 30 * touched, increases cache locality.
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a8a6d8a50947..9fa0f402c87c 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,8 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
10{ 11{
 11	return task_cpu(p); /* IDLE tasks are never migrated */	 12	return task_cpu(p); /* IDLE tasks are never migrated */
12} 13}
@@ -22,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
22static struct task_struct *pick_next_task_idle(struct rq *rq) 23static struct task_struct *pick_next_task_idle(struct rq *rq)
23{ 24{
24 schedstat_inc(rq, sched_goidle); 25 schedstat_inc(rq, sched_goidle);
25 /* adjust the active tasks as we might go into a long sleep */ 26 calc_load_account_idle(rq);
26 calc_load_account_active(rq);
27 return rq->idle; 27 return rq->idle;
28} 28}
29 29
@@ -32,7 +32,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
32 * message if some code attempts to do it: 32 * message if some code attempts to do it:
33 */ 33 */
34static void 34static void
35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) 35dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
36{ 36{
37 raw_spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
38 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b5b920ae2ea7..8afb953e31c6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -613,7 +613,7 @@ static void update_curr_rt(struct rq *rq)
613 if (unlikely((s64)delta_exec < 0)) 613 if (unlikely((s64)delta_exec < 0))
614 delta_exec = 0; 614 delta_exec = 0;
615 615
616 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); 616 schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
617 617
618 curr->se.sum_exec_runtime += delta_exec; 618 curr->se.sum_exec_runtime += delta_exec;
619 account_group_exec_runtime(curr, delta_exec); 619 account_group_exec_runtime(curr, delta_exec);
@@ -888,20 +888,20 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
888 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
889 */ 889 */
890static void 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) 891enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
892{ 892{
893 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
894 894
895 if (wakeup) 895 if (flags & ENQUEUE_WAKEUP)
896 rt_se->timeout = 0; 896 rt_se->timeout = 0;
897 897
898 enqueue_rt_entity(rt_se, head); 898 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
899 899
900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
901 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
902} 902}
903 903
904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
905{ 905{
906 struct sched_rt_entity *rt_se = &p->rt; 906 struct sched_rt_entity *rt_se = &p->rt;
907 907
@@ -948,10 +948,9 @@ static void yield_task_rt(struct rq *rq)
948#ifdef CONFIG_SMP 948#ifdef CONFIG_SMP
949static int find_lowest_rq(struct task_struct *task); 949static int find_lowest_rq(struct task_struct *task);
950 950
951static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 951static int
952select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
952{ 953{
953 struct rq *rq = task_rq(p);
954
955 if (sd_flag != SD_BALANCE_WAKE) 954 if (sd_flag != SD_BALANCE_WAKE)
956 return smp_processor_id(); 955 return smp_processor_id();
957 956
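
The RT enqueue path above folds the old wakeup/head parameters into a single flags word, and select_task_rq_rt() now receives the runqueue directly instead of re-deriving it from the task. As a rough illustration of the new calling convention (the caller below is hypothetical; ENQUEUE_WAKEUP and ENQUEUE_HEAD are the flag bits this series introduces in the scheduler core):

/* Hypothetical caller: queue a freshly woken RT task at the head of its list. */
static void requeue_woken_rt(struct rq *rq, struct task_struct *p)
{
	enqueue_task_rt(rq, p, ENQUEUE_WAKEUP | ENQUEUE_HEAD);
}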
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 9bb9fb1bd79c..ef51d1fcf5e6 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,17 +1,381 @@
1/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. 1/*
2 * GPL v2 and any later version. 2 * kernel/stop_machine.c
3 *
4 * Copyright (C) 2008, 2005 IBM Corporation.
5 * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
6 * Copyright (C) 2010 SUSE Linux Products GmbH
7 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
8 *
9 * This file is released under the GPLv2 and any later version.
3 */ 10 */
11#include <linux/completion.h>
4#include <linux/cpu.h> 12#include <linux/cpu.h>
5#include <linux/err.h> 13#include <linux/init.h>
6#include <linux/kthread.h> 14#include <linux/kthread.h>
7#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/percpu.h>
8#include <linux/sched.h> 17#include <linux/sched.h>
9#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
10#include <linux/syscalls.h>
11#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h>
12 21
13#include <asm/atomic.h> 22#include <asm/atomic.h>
14#include <asm/uaccess.h> 23
24/*
25 * Structure to determine completion condition and record errors. May
26 * be shared by works on different cpus.
27 */
28struct cpu_stop_done {
29 atomic_t nr_todo; /* nr left to execute */
30 bool executed; /* actually executed? */
31 int ret; /* collected return value */
32 struct completion completion; /* fired if nr_todo reaches 0 */
33};
34
35/* the actual stopper, one per every possible cpu, enabled on online cpus */
36struct cpu_stopper {
37 spinlock_t lock;
38 struct list_head works; /* list of pending works */
39 struct task_struct *thread; /* stopper thread */
40 bool enabled; /* is this stopper enabled? */
41};
42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44
45static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
46{
47 memset(done, 0, sizeof(*done));
48 atomic_set(&done->nr_todo, nr_todo);
49 init_completion(&done->completion);
50}
51
52/* signal completion unless @done is NULL */
53static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
54{
55 if (done) {
56 if (executed)
57 done->executed = true;
58 if (atomic_dec_and_test(&done->nr_todo))
59 complete(&done->completion);
60 }
61}
62
63/* queue @work to @stopper. if offline, @work is completed immediately */
64static void cpu_stop_queue_work(struct cpu_stopper *stopper,
65 struct cpu_stop_work *work)
66{
67 unsigned long flags;
68
69 spin_lock_irqsave(&stopper->lock, flags);
70
71 if (stopper->enabled) {
72 list_add_tail(&work->list, &stopper->works);
73 wake_up_process(stopper->thread);
74 } else
75 cpu_stop_signal_done(work->done, false);
76
77 spin_unlock_irqrestore(&stopper->lock, flags);
78}
79
80/**
81 * stop_one_cpu - stop a cpu
82 * @cpu: cpu to stop
83 * @fn: function to execute
84 * @arg: argument to @fn
85 *
86 * Execute @fn(@arg) on @cpu. @fn is run in a process context with
87 * the highest priority preempting any task on the cpu and
88 * monopolizing it. This function returns after the execution is
89 * complete.
90 *
91 * This function doesn't guarantee @cpu stays online till @fn
92 * completes. If @cpu goes down in the middle, execution may happen
93 * partially or fully on different cpus. @fn should either be ready
94 * for that or the caller should ensure that @cpu stays online until
95 * this function completes.
96 *
97 * CONTEXT:
98 * Might sleep.
99 *
100 * RETURNS:
101 * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
102 * otherwise, the return value of @fn.
103 */
104int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
105{
106 struct cpu_stop_done done;
107 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
108
109 cpu_stop_init_done(&done, 1);
110 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
111 wait_for_completion(&done.completion);
112 return done.executed ? done.ret : -ENOENT;
113}
114
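
A minimal sketch of how the stop_one_cpu() interface documented above might be used; the callback and wrapper are hypothetical, only the stop_one_cpu() signature and return convention come from this file:

#include <linux/smp.h>
#include <linux/stop_machine.h>

/* Hypothetical callback: runs on the target CPU, preempting everything else.
 * cpu_stop callbacks must not sleep. */
static int record_cpu(void *arg)
{
	*(int *)arg = smp_processor_id();
	return 0;
}

static int probe_cpu(unsigned int cpu)
{
	int where = -1;

	/* -ENOENT if @cpu was offline, otherwise record_cpu()'s return value. */
	return stop_one_cpu(cpu, record_cpu, &where);
}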
115/**
116 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
117 * @cpu: cpu to stop
118 * @fn: function to execute
119 * @arg: argument to @fn
120 *
121 * Similar to stop_one_cpu() but doesn't wait for completion. The
122 * caller is responsible for ensuring @work_buf is currently unused
123 * and will remain untouched until stopper starts executing @fn.
124 *
125 * CONTEXT:
126 * Don't care.
127 */
128void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
129 struct cpu_stop_work *work_buf)
130{
131 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
132 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
133}
134
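
The _nowait variant hands @work_buf to the stopper until @fn begins executing, so the buffer must outlive the call; a static or otherwise long-lived buffer is the simplest way to satisfy that. A sketch under that assumption, reusing the hypothetical record_cpu() callback from the previous example:

static struct cpu_stop_work probe_work;	/* must stay valid until record_cpu() runs */
static int probe_result = -1;

static void probe_cpu_async(unsigned int cpu)
{
	stop_one_cpu_nowait(cpu, record_cpu, &probe_result, &probe_work);
}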
135/* static data for stop_cpus */
136static DEFINE_MUTEX(stop_cpus_mutex);
137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
138
139int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
140{
141 struct cpu_stop_work *work;
142 struct cpu_stop_done done;
143 unsigned int cpu;
144
145 /* initialize works and done */
146 for_each_cpu(cpu, cpumask) {
147 work = &per_cpu(stop_cpus_work, cpu);
148 work->fn = fn;
149 work->arg = arg;
150 work->done = &done;
151 }
152 cpu_stop_init_done(&done, cpumask_weight(cpumask));
153
154 /*
155 * Disable preemption while queueing to avoid getting
156 * preempted by a stopper which might wait for other stoppers
157 * to enter @fn which can lead to deadlock.
158 */
159 preempt_disable();
160 for_each_cpu(cpu, cpumask)
161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
162 &per_cpu(stop_cpus_work, cpu));
163 preempt_enable();
164
165 wait_for_completion(&done.completion);
166 return done.executed ? done.ret : -ENOENT;
167}
168
169/**
170 * stop_cpus - stop multiple cpus
171 * @cpumask: cpus to stop
172 * @fn: function to execute
173 * @arg: argument to @fn
174 *
175 * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu,
176 * @fn is run in a process context with the highest priority
177 * preempting any task on the cpu and monopolizing it. This function
178 * returns after all executions are complete.
179 *
180 * This function doesn't guarantee the cpus in @cpumask stay online
181 * till @fn completes. If some cpus go down in the middle, execution
182 * on the cpu may happen partially or fully on different cpus. @fn
183 * should either be ready for that or the caller should ensure that
184 * the cpus stay online until this function completes.
185 *
186 * All stop_cpus() calls are serialized making it safe for @fn to wait
187 * for all cpus to start executing it.
188 *
189 * CONTEXT:
190 * Might sleep.
191 *
192 * RETURNS:
193 * -ENOENT if @fn(@arg) was not executed at all because all cpus in
194 * @cpumask were offline; otherwise, 0 if all executions of @fn
195 * returned 0, any non zero return value if any returned non zero.
196 */
197int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
198{
199 int ret;
200
201 /* static works are used, process one request at a time */
202 mutex_lock(&stop_cpus_mutex);
203 ret = __stop_cpus(cpumask, fn, arg);
204 mutex_unlock(&stop_cpus_mutex);
205 return ret;
206}
207
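
Because all stop_cpus() calls are serialized on stop_cpus_mutex, a callback may safely spin waiting for its peers on the other CPUs to start. A minimal sketch of a caller (the callback is hypothetical):

#include <linux/atomic.h>
#include <linux/stop_machine.h>

/* Hypothetical callback: runs concurrently on every online CPU in the mask. */
static int count_stoppers(void *arg)
{
	atomic_inc((atomic_t *)arg);
	return 0;
}

static int stop_everyone(void)
{
	atomic_t seen = ATOMIC_INIT(0);

	/* 0 if every invocation returned 0, -ENOENT if no CPU was online. */
	return stop_cpus(cpu_online_mask, count_stoppers, &seen);
}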
208/**
209 * try_stop_cpus - try to stop multiple cpus
210 * @cpumask: cpus to stop
211 * @fn: function to execute
212 * @arg: argument to @fn
213 *
214 * Identical to stop_cpus() except that it fails with -EAGAIN if
215 * someone else is already using the facility.
216 *
217 * CONTEXT:
218 * Might sleep.
219 *
220 * RETURNS:
221 * -EAGAIN if someone else is already stopping cpus, -ENOENT if
222 * @fn(@arg) was not executed at all because all cpus in @cpumask were
223 * offline; otherwise, 0 if all executions of @fn returned 0, any non
224 * zero return value if any returned non zero.
225 */
226int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
227{
228 int ret;
229
230 /* static works are used, process one request at a time */
231 if (!mutex_trylock(&stop_cpus_mutex))
232 return -EAGAIN;
233 ret = __stop_cpus(cpumask, fn, arg);
234 mutex_unlock(&stop_cpus_mutex);
235 return ret;
236}
237
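
try_stop_cpus() only adds a trylock on the same mutex, so -EAGAIN simply means another stop_cpus() user won the race; a caller that can afford to wait might retry, as sketched below (count_stoppers() is the hypothetical callback from the previous example):

static int stop_everyone_polite(atomic_t *seen)
{
	int ret;

	do {
		ret = try_stop_cpus(cpu_online_mask, count_stoppers, seen);
		if (ret == -EAGAIN)
			cpu_relax();
	} while (ret == -EAGAIN);

	return ret;
}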
238static int cpu_stopper_thread(void *data)
239{
240 struct cpu_stopper *stopper = data;
241 struct cpu_stop_work *work;
242 int ret;
243
244repeat:
245 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
246
247 if (kthread_should_stop()) {
248 __set_current_state(TASK_RUNNING);
249 return 0;
250 }
251
252 work = NULL;
253 spin_lock_irq(&stopper->lock);
254 if (!list_empty(&stopper->works)) {
255 work = list_first_entry(&stopper->works,
256 struct cpu_stop_work, list);
257 list_del_init(&work->list);
258 }
259 spin_unlock_irq(&stopper->lock);
260
261 if (work) {
262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN];
266
267 __set_current_state(TASK_RUNNING);
268
269 /* cpu stop callbacks are not allowed to sleep */
270 preempt_disable();
271
272 ret = fn(arg);
273 if (ret)
274 done->ret = ret;
275
276 /* restore preemption and check it's still balanced */
277 preempt_enable();
278 WARN_ONCE(preempt_count(),
279 "cpu_stop: %s(%p) leaked preempt count\n",
280 kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
281 ksym_buf), arg);
282
283 cpu_stop_signal_done(done, true);
284 } else
285 schedule();
286
287 goto repeat;
288}
289
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu)
293{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct cpu_stop_work *work;
298 struct task_struct *p;
299
300 switch (action & ~CPU_TASKS_FROZEN) {
301 case CPU_UP_PREPARE:
302 BUG_ON(stopper->thread || stopper->enabled ||
303 !list_empty(&stopper->works));
304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
305 cpu);
306 if (IS_ERR(p))
307 return NOTIFY_BAD;
308 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
309 get_task_struct(p);
310 stopper->thread = p;
311 break;
312
313 case CPU_ONLINE:
314 kthread_bind(stopper->thread, cpu);
315 /* strictly unnecessary, as first user will wake it */
316 wake_up_process(stopper->thread);
317 /* mark enabled */
318 spin_lock_irq(&stopper->lock);
319 stopper->enabled = true;
320 spin_unlock_irq(&stopper->lock);
321 break;
322
323#ifdef CONFIG_HOTPLUG_CPU
324 case CPU_UP_CANCELED:
325 case CPU_DEAD:
326 /* kill the stopper */
327 kthread_stop(stopper->thread);
328 /* drain remaining works */
329 spin_lock_irq(&stopper->lock);
330 list_for_each_entry(work, &stopper->works, list)
331 cpu_stop_signal_done(work->done, false);
332 stopper->enabled = false;
333 spin_unlock_irq(&stopper->lock);
334 /* release the stopper */
335 put_task_struct(stopper->thread);
336 stopper->thread = NULL;
337 break;
338#endif
339 }
340
341 return NOTIFY_OK;
342}
343
344/*
345 * Give it a higher priority so that cpu stopper is available to other
346 * cpu notifiers. It currently shares the same priority as sched
347 * migration_notifier.
348 */
349static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
350 .notifier_call = cpu_stop_cpu_callback,
351 .priority = 10,
352};
353
354static int __init cpu_stop_init(void)
355{
356 void *bcpu = (void *)(long)smp_processor_id();
357 unsigned int cpu;
358 int err;
359
360 for_each_possible_cpu(cpu) {
361 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
362
363 spin_lock_init(&stopper->lock);
364 INIT_LIST_HEAD(&stopper->works);
365 }
366
367 /* start one for the boot cpu */
368 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
369 bcpu);
370 BUG_ON(err == NOTIFY_BAD);
371 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
372 register_cpu_notifier(&cpu_stop_cpu_notifier);
373
374 return 0;
375}
376early_initcall(cpu_stop_init);
377
378#ifdef CONFIG_STOP_MACHINE
15 379
16/* This controls the threads on each CPU. */ 380/* This controls the threads on each CPU. */
17enum stopmachine_state { 381enum stopmachine_state {
@@ -26,174 +390,94 @@ enum stopmachine_state {
26 /* Exit */ 390 /* Exit */
27 STOPMACHINE_EXIT, 391 STOPMACHINE_EXIT,
28}; 392};
29static enum stopmachine_state state;
30 393
31struct stop_machine_data { 394struct stop_machine_data {
32 int (*fn)(void *); 395 int (*fn)(void *);
33 void *data; 396 void *data;
34 int fnret; 397 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
398 unsigned int num_threads;
399 const struct cpumask *active_cpus;
400
401 enum stopmachine_state state;
402 atomic_t thread_ack;
35}; 403};
36 404
37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ 405static void set_state(struct stop_machine_data *smdata,
38static unsigned int num_threads; 406 enum stopmachine_state newstate)
39static atomic_t thread_ack;
40static DEFINE_MUTEX(lock);
41/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
42static DEFINE_MUTEX(setup_lock);
43/* Users of stop_machine. */
44static int refcount;
45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus;
48static void __percpu *stop_machine_work;
49
50static void set_state(enum stopmachine_state newstate)
51{ 407{
52 /* Reset ack counter. */ 408 /* Reset ack counter. */
53 atomic_set(&thread_ack, num_threads); 409 atomic_set(&smdata->thread_ack, smdata->num_threads);
54 smp_wmb(); 410 smp_wmb();
55 state = newstate; 411 smdata->state = newstate;
56} 412}
57 413
58/* Last one to ack a state moves to the next state. */ 414/* Last one to ack a state moves to the next state. */
59static void ack_state(void) 415static void ack_state(struct stop_machine_data *smdata)
60{ 416{
61 if (atomic_dec_and_test(&thread_ack)) 417 if (atomic_dec_and_test(&smdata->thread_ack))
62 set_state(state + 1); 418 set_state(smdata, smdata->state + 1);
63} 419}
64 420
65/* This is the actual function which stops the CPU. It runs 421/* This is the cpu_stop function which stops the CPU. */
66 * in the context of a dedicated stopmachine workqueue. */ 422static int stop_machine_cpu_stop(void *data)
67static void stop_cpu(struct work_struct *unused)
68{ 423{
424 struct stop_machine_data *smdata = data;
69 enum stopmachine_state curstate = STOPMACHINE_NONE; 425 enum stopmachine_state curstate = STOPMACHINE_NONE;
70 struct stop_machine_data *smdata = &idle; 426 int cpu = smp_processor_id(), err = 0;
71 int cpu = smp_processor_id(); 427 bool is_active;
72 int err; 428
429 if (!smdata->active_cpus)
430 is_active = cpu == cpumask_first(cpu_online_mask);
431 else
432 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
73 433
74 if (!active_cpus) {
75 if (cpu == cpumask_first(cpu_online_mask))
76 smdata = &active;
77 } else {
78 if (cpumask_test_cpu(cpu, active_cpus))
79 smdata = &active;
80 }
81 /* Simple state machine */ 434 /* Simple state machine */
82 do { 435 do {
83 /* Chill out and ensure we re-read stopmachine_state. */ 436 /* Chill out and ensure we re-read stopmachine_state. */
84 cpu_relax(); 437 cpu_relax();
85 if (state != curstate) { 438 if (smdata->state != curstate) {
86 curstate = state; 439 curstate = smdata->state;
87 switch (curstate) { 440 switch (curstate) {
88 case STOPMACHINE_DISABLE_IRQ: 441 case STOPMACHINE_DISABLE_IRQ:
89 local_irq_disable(); 442 local_irq_disable();
90 hard_irq_disable(); 443 hard_irq_disable();
91 break; 444 break;
92 case STOPMACHINE_RUN: 445 case STOPMACHINE_RUN:
93 /* On multiple CPUs only a single error code 446 if (is_active)
94 * is needed to tell that something failed. */ 447 err = smdata->fn(smdata->data);
95 err = smdata->fn(smdata->data);
96 if (err)
97 smdata->fnret = err;
98 break; 448 break;
99 default: 449 default:
100 break; 450 break;
101 } 451 }
102 ack_state(); 452 ack_state(smdata);
103 } 453 }
104 } while (curstate != STOPMACHINE_EXIT); 454 } while (curstate != STOPMACHINE_EXIT);
105 455
106 local_irq_enable(); 456 local_irq_enable();
457 return err;
107} 458}
108 459
109/* Callback for CPUs which aren't supposed to do anything. */
110static int chill(void *unused)
111{
112 return 0;
113}
114
115int stop_machine_create(void)
116{
117 mutex_lock(&setup_lock);
118 if (refcount)
119 goto done;
120 stop_machine_wq = create_rt_workqueue("kstop");
121 if (!stop_machine_wq)
122 goto err_out;
123 stop_machine_work = alloc_percpu(struct work_struct);
124 if (!stop_machine_work)
125 goto err_out;
126done:
127 refcount++;
128 mutex_unlock(&setup_lock);
129 return 0;
130
131err_out:
132 if (stop_machine_wq)
133 destroy_workqueue(stop_machine_wq);
134 mutex_unlock(&setup_lock);
135 return -ENOMEM;
136}
137EXPORT_SYMBOL_GPL(stop_machine_create);
138
139void stop_machine_destroy(void)
140{
141 mutex_lock(&setup_lock);
142 refcount--;
143 if (refcount)
144 goto done;
145 destroy_workqueue(stop_machine_wq);
146 free_percpu(stop_machine_work);
147done:
148 mutex_unlock(&setup_lock);
149}
150EXPORT_SYMBOL_GPL(stop_machine_destroy);
151
152int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 460int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
153{ 461{
154 struct work_struct *sm_work; 462 struct stop_machine_data smdata = { .fn = fn, .data = data,
155 int i, ret; 463 .num_threads = num_online_cpus(),
156 464 .active_cpus = cpus };
157 /* Set up initial state. */ 465
158 mutex_lock(&lock); 466 /* Set the initial state and stop all online cpus. */
159 num_threads = num_online_cpus(); 467 set_state(&smdata, STOPMACHINE_PREPARE);
160 active_cpus = cpus; 468 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
161 active.fn = fn;
162 active.data = data;
163 active.fnret = 0;
164 idle.fn = chill;
165 idle.data = NULL;
166
167 set_state(STOPMACHINE_PREPARE);
168
169 /* Schedule the stop_cpu work on all cpus: hold this CPU so one
170 * doesn't hit this CPU until we're ready. */
171 get_cpu();
172 for_each_online_cpu(i) {
173 sm_work = per_cpu_ptr(stop_machine_work, i);
174 INIT_WORK(sm_work, stop_cpu);
175 queue_work_on(i, stop_machine_wq, sm_work);
176 }
177 /* This will release the thread on our CPU. */
178 put_cpu();
179 flush_workqueue(stop_machine_wq);
180 ret = active.fnret;
181 mutex_unlock(&lock);
182 return ret;
183} 469}
184 470
185int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 471int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
186{ 472{
187 int ret; 473 int ret;
188 474
189 ret = stop_machine_create();
190 if (ret)
191 return ret;
192 /* No CPUs can come up or down during this. */ 475 /* No CPUs can come up or down during this. */
193 get_online_cpus(); 476 get_online_cpus();
194 ret = __stop_machine(fn, data, cpus); 477 ret = __stop_machine(fn, data, cpus);
195 put_online_cpus(); 478 put_online_cpus();
196 stop_machine_destroy();
197 return ret; 479 return ret;
198} 480}
199EXPORT_SYMBOL_GPL(stop_machine); 481EXPORT_SYMBOL_GPL(stop_machine);
482
483#endif /* CONFIG_STOP_MACHINE */
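
After this rewrite stop_machine() is a thin wrapper: it pins CPU hotplug and pushes the state machine above through stop_cpus(), so @fn runs with every online CPU captured and interrupts disabled. A minimal sketch of the classic use, updating data that must never be seen half-written (the patching helpers are hypothetical; passing NULL for @cpus makes the first online CPU run @fn, as the is_active logic above shows):

#include <linux/stop_machine.h>

struct word_patch {
	unsigned long *addr;
	unsigned long val;
};

/* Runs with all online CPUs spinning in stop_machine_cpu_stop(), IRQs off.
 * Must not sleep or take sleeping locks. */
static int apply_word_patch(void *arg)
{
	struct word_patch *wp = arg;

	*wp->addr = wp->val;
	return 0;
}

static int patch_word(unsigned long *addr, unsigned long val)
{
	struct word_patch wp = { .addr = addr, .val = val };

	return stop_machine(apply_word_patch, &wp, NULL);
}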
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f992762d7f51..1d7b9bc1c034 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,14 +150,32 @@ static void tick_nohz_update_jiffies(ktime_t now)
150 touch_softlockup_watchdog(); 150 touch_softlockup_watchdog();
151} 151}
152 152
153/*
154 * Updates the per cpu time idle statistics counters
155 */
156static void
157update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
158{
159 ktime_t delta;
160
161 if (ts->idle_active) {
162 delta = ktime_sub(now, ts->idle_entrytime);
163 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
164 if (nr_iowait_cpu() > 0)
165 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
166 ts->idle_entrytime = now;
167 }
168
169 if (last_update_time)
170 *last_update_time = ktime_to_us(now);
171
172}
173
153static void tick_nohz_stop_idle(int cpu, ktime_t now) 174static void tick_nohz_stop_idle(int cpu, ktime_t now)
154{ 175{
155 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 176 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
156 ktime_t delta;
157 177
158 delta = ktime_sub(now, ts->idle_entrytime); 178 update_ts_time_stats(ts, now, NULL);
159 ts->idle_lastupdate = now;
160 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
161 ts->idle_active = 0; 179 ts->idle_active = 0;
162 180
163 sched_clock_idle_wakeup_event(0); 181 sched_clock_idle_wakeup_event(0);
@@ -165,20 +183,32 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
165 183
166static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 184static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
167{ 185{
168 ktime_t now, delta; 186 ktime_t now;
169 187
170 now = ktime_get(); 188 now = ktime_get();
171 if (ts->idle_active) { 189
172 delta = ktime_sub(now, ts->idle_entrytime); 190 update_ts_time_stats(ts, now, NULL);
173 ts->idle_lastupdate = now; 191
174 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
175 }
176 ts->idle_entrytime = now; 192 ts->idle_entrytime = now;
177 ts->idle_active = 1; 193 ts->idle_active = 1;
178 sched_clock_idle_sleep_event(); 194 sched_clock_idle_sleep_event();
179 return now; 195 return now;
180} 196}
181 197
198/**
199 * get_cpu_idle_time_us - get the total idle time of a cpu
200 * @cpu: CPU number to query
201 * @last_update_time: variable to store update time in
202 *
203 * Return the cumulative idle time (since boot) for a given
204 * CPU, in microseconds. The idle time returned includes
205 * the iowait time (unlike what "top" and co report).
206 *
207 * This time is measured via accounting rather than sampling,
208 * and is as accurate as ktime_get() is.
209 *
210 * This function returns -1 if NOHZ is not enabled.
211 */
182u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 212u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
183{ 213{
184 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 214 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -186,15 +216,38 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
186 if (!tick_nohz_enabled) 216 if (!tick_nohz_enabled)
187 return -1; 217 return -1;
188 218
189 if (ts->idle_active) 219 update_ts_time_stats(ts, ktime_get(), last_update_time);
190 *last_update_time = ktime_to_us(ts->idle_lastupdate);
191 else
192 *last_update_time = ktime_to_us(ktime_get());
193 220
194 return ktime_to_us(ts->idle_sleeptime); 221 return ktime_to_us(ts->idle_sleeptime);
195} 222}
196EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 223EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
197 224
225/*
226 * get_cpu_iowait_time_us - get the total iowait time of a cpu
227 * @cpu: CPU number to query
228 * @last_update_time: variable to store update time in
229 *
230 * Return the cumulative iowait time (since boot) for a given
231 * CPU, in microseconds.
232 *
233 * This time is measured via accounting rather than sampling,
234 * and is as accurate as ktime_get() is.
235 *
236 * This function returns -1 if NOHZ is not enabled.
237 */
238u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
239{
240 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
241
242 if (!tick_nohz_enabled)
243 return -1;
244
245 update_ts_time_stats(ts, ktime_get(), last_update_time);
246
247 return ktime_to_us(ts->iowait_sleeptime);
248}
249EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
250
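
Both helpers return running totals in microseconds (or -1 when NOHZ is off), so a consumer such as a cpufreq governor would normally keep the previous sample and work with deltas; note that the idle total already includes iowait. A sketch under those assumptions (the sampling structure and helper are hypothetical):

struct cpu_idle_sample {
	u64 wall_us;
	u64 idle_us;
};

/* Busy microseconds on @cpu since @prev; updates @prev. 0 if NOHZ is off. */
static u64 busy_time_since(int cpu, struct cpu_idle_sample *prev)
{
	u64 wall, idle, busy;

	idle = get_cpu_idle_time_us(cpu, &wall);
	if (idle == (u64)-1)
		return 0;

	busy = (wall - prev->wall_us) - (idle - prev->idle_us);
	prev->wall_us = wall;
	prev->idle_us = idle;
	return busy;
}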
198/** 251/**
199 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 252 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
200 * 253 *
@@ -262,6 +315,9 @@ void tick_nohz_stop_sched_tick(int inidle)
262 goto end; 315 goto end;
263 } 316 }
264 317
318 if (nohz_ratelimit(cpu))
319 goto end;
320
265 ts->idle_calls++; 321 ts->idle_calls++;
266 /* Read jiffies and the time when jiffies were updated last */ 322 /* Read jiffies and the time when jiffies were updated last */
267 do { 323 do {
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1a4a7dd78777..ab8f5e33fa92 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
176 P_ns(idle_waketime); 176 P_ns(idle_waketime);
177 P_ns(idle_exittime); 177 P_ns(idle_exittime);
178 P_ns(idle_sleeptime); 178 P_ns(idle_sleeptime);
179 P_ns(iowait_sleeptime);
179 P(last_jiffies); 180 P(last_jiffies);
180 P(next_jiffies); 181 P(next_jiffies);
181 P_ns(idle_expires); 182 P_ns(idle_expires);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b3bc91a3f510..36ea2b65dcdc 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -675,28 +675,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
675 } 675 }
676} 676}
677 677
678static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) 678static void blk_add_trace_rq_abort(void *ignore,
679 struct request_queue *q, struct request *rq)
679{ 680{
680 blk_add_trace_rq(q, rq, BLK_TA_ABORT); 681 blk_add_trace_rq(q, rq, BLK_TA_ABORT);
681} 682}
682 683
683static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) 684static void blk_add_trace_rq_insert(void *ignore,
685 struct request_queue *q, struct request *rq)
684{ 686{
685 blk_add_trace_rq(q, rq, BLK_TA_INSERT); 687 blk_add_trace_rq(q, rq, BLK_TA_INSERT);
686} 688}
687 689
688static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) 690static void blk_add_trace_rq_issue(void *ignore,
691 struct request_queue *q, struct request *rq)
689{ 692{
690 blk_add_trace_rq(q, rq, BLK_TA_ISSUE); 693 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
691} 694}
692 695
693static void blk_add_trace_rq_requeue(struct request_queue *q, 696static void blk_add_trace_rq_requeue(void *ignore,
697 struct request_queue *q,
694 struct request *rq) 698 struct request *rq)
695{ 699{
696 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 700 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
697} 701}
698 702
699static void blk_add_trace_rq_complete(struct request_queue *q, 703static void blk_add_trace_rq_complete(void *ignore,
704 struct request_queue *q,
700 struct request *rq) 705 struct request *rq)
701{ 706{
702 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 707 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
@@ -724,34 +729,40 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
724 !bio_flagged(bio, BIO_UPTODATE), 0, NULL); 729 !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
725} 730}
726 731
727static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) 732static void blk_add_trace_bio_bounce(void *ignore,
733 struct request_queue *q, struct bio *bio)
728{ 734{
729 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); 735 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
730} 736}
731 737
732static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) 738static void blk_add_trace_bio_complete(void *ignore,
739 struct request_queue *q, struct bio *bio)
733{ 740{
734 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); 741 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
735} 742}
736 743
737static void blk_add_trace_bio_backmerge(struct request_queue *q, 744static void blk_add_trace_bio_backmerge(void *ignore,
745 struct request_queue *q,
738 struct bio *bio) 746 struct bio *bio)
739{ 747{
740 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 748 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
741} 749}
742 750
743static void blk_add_trace_bio_frontmerge(struct request_queue *q, 751static void blk_add_trace_bio_frontmerge(void *ignore,
752 struct request_queue *q,
744 struct bio *bio) 753 struct bio *bio)
745{ 754{
746 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 755 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
747} 756}
748 757
749static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) 758static void blk_add_trace_bio_queue(void *ignore,
759 struct request_queue *q, struct bio *bio)
750{ 760{
751 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 761 blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
752} 762}
753 763
754static void blk_add_trace_getrq(struct request_queue *q, 764static void blk_add_trace_getrq(void *ignore,
765 struct request_queue *q,
755 struct bio *bio, int rw) 766 struct bio *bio, int rw)
756{ 767{
757 if (bio) 768 if (bio)
@@ -765,7 +776,8 @@ static void blk_add_trace_getrq(struct request_queue *q,
765} 776}
766 777
767 778
768static void blk_add_trace_sleeprq(struct request_queue *q, 779static void blk_add_trace_sleeprq(void *ignore,
780 struct request_queue *q,
769 struct bio *bio, int rw) 781 struct bio *bio, int rw)
770{ 782{
771 if (bio) 783 if (bio)
@@ -779,7 +791,7 @@ static void blk_add_trace_sleeprq(struct request_queue *q,
779 } 791 }
780} 792}
781 793
782static void blk_add_trace_plug(struct request_queue *q) 794static void blk_add_trace_plug(void *ignore, struct request_queue *q)
783{ 795{
784 struct blk_trace *bt = q->blk_trace; 796 struct blk_trace *bt = q->blk_trace;
785 797
@@ -787,7 +799,7 @@ static void blk_add_trace_plug(struct request_queue *q)
787 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 799 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
788} 800}
789 801
790static void blk_add_trace_unplug_io(struct request_queue *q) 802static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q)
791{ 803{
792 struct blk_trace *bt = q->blk_trace; 804 struct blk_trace *bt = q->blk_trace;
793 805
@@ -800,7 +812,7 @@ static void blk_add_trace_unplug_io(struct request_queue *q)
800 } 812 }
801} 813}
802 814
803static void blk_add_trace_unplug_timer(struct request_queue *q) 815static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
804{ 816{
805 struct blk_trace *bt = q->blk_trace; 817 struct blk_trace *bt = q->blk_trace;
806 818
@@ -813,7 +825,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q)
813 } 825 }
814} 826}
815 827
816static void blk_add_trace_split(struct request_queue *q, struct bio *bio, 828static void blk_add_trace_split(void *ignore,
829 struct request_queue *q, struct bio *bio,
817 unsigned int pdu) 830 unsigned int pdu)
818{ 831{
819 struct blk_trace *bt = q->blk_trace; 832 struct blk_trace *bt = q->blk_trace;
@@ -839,8 +852,9 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
839 * it spans a stripe (or similar). Add a trace for that action. 852 * it spans a stripe (or similar). Add a trace for that action.
840 * 853 *
841 **/ 854 **/
842static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, 855static void blk_add_trace_remap(void *ignore,
843 dev_t dev, sector_t from) 856 struct request_queue *q, struct bio *bio,
857 dev_t dev, sector_t from)
844{ 858{
845 struct blk_trace *bt = q->blk_trace; 859 struct blk_trace *bt = q->blk_trace;
846 struct blk_io_trace_remap r; 860 struct blk_io_trace_remap r;
@@ -869,7 +883,8 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
869 * Add a trace for that action. 883 * Add a trace for that action.
870 * 884 *
871 **/ 885 **/
872static void blk_add_trace_rq_remap(struct request_queue *q, 886static void blk_add_trace_rq_remap(void *ignore,
887 struct request_queue *q,
873 struct request *rq, dev_t dev, 888 struct request *rq, dev_t dev,
874 sector_t from) 889 sector_t from)
875{ 890{
@@ -921,64 +936,64 @@ static void blk_register_tracepoints(void)
921{ 936{
922 int ret; 937 int ret;
923 938
924 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); 939 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
925 WARN_ON(ret); 940 WARN_ON(ret);
926 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); 941 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
927 WARN_ON(ret); 942 WARN_ON(ret);
928 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); 943 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
929 WARN_ON(ret); 944 WARN_ON(ret);
930 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); 945 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
931 WARN_ON(ret); 946 WARN_ON(ret);
932 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); 947 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
933 WARN_ON(ret); 948 WARN_ON(ret);
934 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); 949 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
935 WARN_ON(ret); 950 WARN_ON(ret);
936 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); 951 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
937 WARN_ON(ret); 952 WARN_ON(ret);
938 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); 953 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
939 WARN_ON(ret); 954 WARN_ON(ret);
940 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); 955 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
941 WARN_ON(ret); 956 WARN_ON(ret);
942 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); 957 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
943 WARN_ON(ret); 958 WARN_ON(ret);
944 ret = register_trace_block_getrq(blk_add_trace_getrq); 959 ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
945 WARN_ON(ret); 960 WARN_ON(ret);
946 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); 961 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
947 WARN_ON(ret); 962 WARN_ON(ret);
948 ret = register_trace_block_plug(blk_add_trace_plug); 963 ret = register_trace_block_plug(blk_add_trace_plug, NULL);
949 WARN_ON(ret); 964 WARN_ON(ret);
950 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); 965 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
951 WARN_ON(ret); 966 WARN_ON(ret);
952 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); 967 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
953 WARN_ON(ret); 968 WARN_ON(ret);
954 ret = register_trace_block_split(blk_add_trace_split); 969 ret = register_trace_block_split(blk_add_trace_split, NULL);
955 WARN_ON(ret); 970 WARN_ON(ret);
956 ret = register_trace_block_remap(blk_add_trace_remap); 971 ret = register_trace_block_remap(blk_add_trace_remap, NULL);
957 WARN_ON(ret); 972 WARN_ON(ret);
958 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); 973 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
959 WARN_ON(ret); 974 WARN_ON(ret);
960} 975}
961 976
962static void blk_unregister_tracepoints(void) 977static void blk_unregister_tracepoints(void)
963{ 978{
964 unregister_trace_block_rq_remap(blk_add_trace_rq_remap); 979 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
965 unregister_trace_block_remap(blk_add_trace_remap); 980 unregister_trace_block_remap(blk_add_trace_remap, NULL);
966 unregister_trace_block_split(blk_add_trace_split); 981 unregister_trace_block_split(blk_add_trace_split, NULL);
967 unregister_trace_block_unplug_io(blk_add_trace_unplug_io); 982 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
968 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); 983 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
969 unregister_trace_block_plug(blk_add_trace_plug); 984 unregister_trace_block_plug(blk_add_trace_plug, NULL);
970 unregister_trace_block_sleeprq(blk_add_trace_sleeprq); 985 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
971 unregister_trace_block_getrq(blk_add_trace_getrq); 986 unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
972 unregister_trace_block_bio_queue(blk_add_trace_bio_queue); 987 unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
973 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); 988 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
974 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); 989 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
975 unregister_trace_block_bio_complete(blk_add_trace_bio_complete); 990 unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
976 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); 991 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
977 unregister_trace_block_rq_complete(blk_add_trace_rq_complete); 992 unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
978 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); 993 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
979 unregister_trace_block_rq_issue(blk_add_trace_rq_issue); 994 unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
980 unregister_trace_block_rq_insert(blk_add_trace_rq_insert); 995 unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
981 unregister_trace_block_rq_abort(blk_add_trace_rq_abort); 996 unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
982 997
983 tracepoint_synchronize_unregister(); 998 tracepoint_synchronize_unregister();
984} 999}
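
The churn above follows the tracepoint API change in this series: probes now take a private void * as their first argument, and register/unregister take the (probe, data) pair. A hedged sketch of what an out-of-tree probe might look like under the new convention (the probe body and the stats structure are hypothetical; block_rq_insert is one of the tracepoints touched above):

#include <trace/events/block.h>	/* block tracepoint declarations */

struct insert_stats {
	atomic_t inserts;
};

/* New-style probe: the pointer passed at registration time arrives first. */
static void count_rq_insert(void *data, struct request_queue *q,
			    struct request *rq)
{
	struct insert_stats *stats = data;

	atomic_inc(&stats->inserts);
}

static struct insert_stats insert_stats;

static int insert_stats_init(void)
{
	return register_trace_block_rq_insert(count_rq_insert, &insert_stats);
}

static void insert_stats_exit(void)
{
	unregister_trace_block_rq_insert(count_rq_insert, &insert_stats);
	tracepoint_synchronize_unregister();
}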
@@ -1321,7 +1336,7 @@ out:
1321} 1336}
1322 1337
1323static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1338static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1324 int flags) 1339 int flags, struct trace_event *event)
1325{ 1340{
1326 return print_one_line(iter, false); 1341 return print_one_line(iter, false);
1327} 1342}
@@ -1343,7 +1358,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1343} 1358}
1344 1359
1345static enum print_line_t 1360static enum print_line_t
1346blk_trace_event_print_binary(struct trace_iterator *iter, int flags) 1361blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
1362 struct trace_event *event)
1347{ 1363{
1348 return blk_trace_synthesize_old_trace(iter) ? 1364 return blk_trace_synthesize_old_trace(iter) ?
1349 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1365 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
@@ -1381,12 +1397,16 @@ static struct tracer blk_tracer __read_mostly = {
1381 .set_flag = blk_tracer_set_flag, 1397 .set_flag = blk_tracer_set_flag,
1382}; 1398};
1383 1399
1384static struct trace_event trace_blk_event = { 1400static struct trace_event_functions trace_blk_event_funcs = {
1385 .type = TRACE_BLK,
1386 .trace = blk_trace_event_print, 1401 .trace = blk_trace_event_print,
1387 .binary = blk_trace_event_print_binary, 1402 .binary = blk_trace_event_print_binary,
1388}; 1403};
1389 1404
1405static struct trace_event trace_blk_event = {
1406 .type = TRACE_BLK,
1407 .funcs = &trace_blk_event_funcs,
1408};
1409
1390static int __init init_blk_tracer(void) 1410static int __init init_blk_tracer(void)
1391{ 1411{
1392 if (!register_ftrace_event(&trace_blk_event)) { 1412 if (!register_ftrace_event(&trace_blk_event)) {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2404b59b3097..6d2cb14f9449 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -264,6 +264,7 @@ struct ftrace_profile {
264 unsigned long counter; 264 unsigned long counter;
265#ifdef CONFIG_FUNCTION_GRAPH_TRACER 265#ifdef CONFIG_FUNCTION_GRAPH_TRACER
266 unsigned long long time; 266 unsigned long long time;
267 unsigned long long time_squared;
267#endif 268#endif
268}; 269};
269 270
@@ -366,9 +367,9 @@ static int function_stat_headers(struct seq_file *m)
366{ 367{
367#ifdef CONFIG_FUNCTION_GRAPH_TRACER 368#ifdef CONFIG_FUNCTION_GRAPH_TRACER
368 seq_printf(m, " Function " 369 seq_printf(m, " Function "
369 "Hit Time Avg\n" 370 "Hit Time Avg s^2\n"
370 " -------- " 371 " -------- "
371 "--- ---- ---\n"); 372 "--- ---- --- ---\n");
372#else 373#else
373 seq_printf(m, " Function Hit\n" 374 seq_printf(m, " Function Hit\n"
374 " -------- ---\n"); 375 " -------- ---\n");
@@ -384,6 +385,7 @@ static int function_stat_show(struct seq_file *m, void *v)
384 static DEFINE_MUTEX(mutex); 385 static DEFINE_MUTEX(mutex);
385 static struct trace_seq s; 386 static struct trace_seq s;
386 unsigned long long avg; 387 unsigned long long avg;
388 unsigned long long stddev;
387#endif 389#endif
388 390
389 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 391 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -394,11 +396,25 @@ static int function_stat_show(struct seq_file *m, void *v)
394 avg = rec->time; 396 avg = rec->time;
395 do_div(avg, rec->counter); 397 do_div(avg, rec->counter);
396 398
399 /* Sample standard deviation (s^2) */
400 if (rec->counter <= 1)
401 stddev = 0;
402 else {
403 stddev = rec->time_squared - rec->counter * avg * avg;
404 /*
 405 * Divide only by 1000 for ns^2 -> us^2 conversion.
 406 * trace_print_graph_duration will divide by 1000 again.
407 */
408 do_div(stddev, (rec->counter - 1) * 1000);
409 }
410
397 mutex_lock(&mutex); 411 mutex_lock(&mutex);
398 trace_seq_init(&s); 412 trace_seq_init(&s);
399 trace_print_graph_duration(rec->time, &s); 413 trace_print_graph_duration(rec->time, &s);
400 trace_seq_puts(&s, " "); 414 trace_seq_puts(&s, " ");
401 trace_print_graph_duration(avg, &s); 415 trace_print_graph_duration(avg, &s);
416 trace_seq_puts(&s, " ");
417 trace_print_graph_duration(stddev, &s);
402 trace_print_seq(m, &s); 418 trace_print_seq(m, &s);
403 mutex_unlock(&mutex); 419 mutex_unlock(&mutex);
404#endif 420#endif
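
The new time_squared field feeds the extra column added above: with n = rec->counter, \bar{t} the mean duration (avg) and rec->time_squared = \sum_i t_i^2, the hunk computes the sample variance

    s^2 = \frac{\sum_i t_i^2 - n\,\bar{t}^2}{n - 1}

and divides it by only 1000 because trace_print_graph_duration() divides by 1000 once more, so the ns^2 sums come out as the us^2 value shown in the s^2 column.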
@@ -650,6 +666,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
650 if (!stat->hash || !ftrace_profile_enabled) 666 if (!stat->hash || !ftrace_profile_enabled)
651 goto out; 667 goto out;
652 668
669 /* If the calltime was zero'd ignore it */
670 if (!trace->calltime)
671 goto out;
672
653 calltime = trace->rettime - trace->calltime; 673 calltime = trace->rettime - trace->calltime;
654 674
655 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { 675 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
@@ -668,8 +688,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
668 } 688 }
669 689
670 rec = ftrace_find_profiled_func(stat, trace->func); 690 rec = ftrace_find_profiled_func(stat, trace->func);
671 if (rec) 691 if (rec) {
672 rec->time += calltime; 692 rec->time += calltime;
693 rec->time_squared += calltime * calltime;
694 }
673 695
674 out: 696 out:
675 local_irq_restore(flags); 697 local_irq_restore(flags);
@@ -3212,8 +3234,8 @@ free:
3212} 3234}
3213 3235
3214static void 3236static void
3215ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, 3237ftrace_graph_probe_sched_switch(void *ignore,
3216 struct task_struct *next) 3238 struct task_struct *prev, struct task_struct *next)
3217{ 3239{
3218 unsigned long long timestamp; 3240 unsigned long long timestamp;
3219 int index; 3241 int index;
@@ -3267,7 +3289,7 @@ static int start_graph_tracing(void)
3267 } while (ret == -EAGAIN); 3289 } while (ret == -EAGAIN);
3268 3290
3269 if (!ret) { 3291 if (!ret) {
3270 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); 3292 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3271 if (ret) 3293 if (ret)
3272 pr_info("ftrace_graph: Couldn't activate tracepoint" 3294 pr_info("ftrace_graph: Couldn't activate tracepoint"
3273 " probe to kernel_sched_switch\n"); 3295 " probe to kernel_sched_switch\n");
@@ -3339,11 +3361,11 @@ void unregister_ftrace_graph(void)
3339 goto out; 3361 goto out;
3340 3362
3341 ftrace_graph_active--; 3363 ftrace_graph_active--;
3342 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
3343 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 3364 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3344 ftrace_graph_entry = ftrace_graph_entry_stub; 3365 ftrace_graph_entry = ftrace_graph_entry_stub;
3345 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 3366 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
3346 unregister_pm_notifier(&ftrace_suspend_notifier); 3367 unregister_pm_notifier(&ftrace_suspend_notifier);
3368 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3347 3369
3348 out: 3370 out:
3349 mutex_unlock(&ftrace_lock); 3371 mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index a91da69f153a..bbfc1bb1660b 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -95,7 +95,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
95 trace_wake_up(); 95 trace_wake_up();
96} 96}
97 97
98static void kmemtrace_kmalloc(unsigned long call_site, 98static void kmemtrace_kmalloc(void *ignore,
99 unsigned long call_site,
99 const void *ptr, 100 const void *ptr,
100 size_t bytes_req, 101 size_t bytes_req,
101 size_t bytes_alloc, 102 size_t bytes_alloc,
@@ -105,7 +106,8 @@ static void kmemtrace_kmalloc(unsigned long call_site,
105 bytes_req, bytes_alloc, gfp_flags, -1); 106 bytes_req, bytes_alloc, gfp_flags, -1);
106} 107}
107 108
108static void kmemtrace_kmem_cache_alloc(unsigned long call_site, 109static void kmemtrace_kmem_cache_alloc(void *ignore,
110 unsigned long call_site,
109 const void *ptr, 111 const void *ptr,
110 size_t bytes_req, 112 size_t bytes_req,
111 size_t bytes_alloc, 113 size_t bytes_alloc,
@@ -115,7 +117,8 @@ static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
115 bytes_req, bytes_alloc, gfp_flags, -1); 117 bytes_req, bytes_alloc, gfp_flags, -1);
116} 118}
117 119
118static void kmemtrace_kmalloc_node(unsigned long call_site, 120static void kmemtrace_kmalloc_node(void *ignore,
121 unsigned long call_site,
119 const void *ptr, 122 const void *ptr,
120 size_t bytes_req, 123 size_t bytes_req,
121 size_t bytes_alloc, 124 size_t bytes_alloc,
@@ -126,7 +129,8 @@ static void kmemtrace_kmalloc_node(unsigned long call_site,
126 bytes_req, bytes_alloc, gfp_flags, node); 129 bytes_req, bytes_alloc, gfp_flags, node);
127} 130}
128 131
129static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, 132static void kmemtrace_kmem_cache_alloc_node(void *ignore,
133 unsigned long call_site,
130 const void *ptr, 134 const void *ptr,
131 size_t bytes_req, 135 size_t bytes_req,
132 size_t bytes_alloc, 136 size_t bytes_alloc,
@@ -137,12 +141,14 @@ static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
137 bytes_req, bytes_alloc, gfp_flags, node); 141 bytes_req, bytes_alloc, gfp_flags, node);
138} 142}
139 143
140static void kmemtrace_kfree(unsigned long call_site, const void *ptr) 144static void
145kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
141{ 146{
142 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); 147 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
143} 148}
144 149
145static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr) 150static void kmemtrace_kmem_cache_free(void *ignore,
151 unsigned long call_site, const void *ptr)
146{ 152{
147 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); 153 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
148} 154}
@@ -151,34 +157,34 @@ static int kmemtrace_start_probes(void)
151{ 157{
152 int err; 158 int err;
153 159
154 err = register_trace_kmalloc(kmemtrace_kmalloc); 160 err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
155 if (err) 161 if (err)
156 return err; 162 return err;
157 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); 163 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
158 if (err) 164 if (err)
159 return err; 165 return err;
160 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node); 166 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
161 if (err) 167 if (err)
162 return err; 168 return err;
163 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); 169 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
164 if (err) 170 if (err)
165 return err; 171 return err;
166 err = register_trace_kfree(kmemtrace_kfree); 172 err = register_trace_kfree(kmemtrace_kfree, NULL);
167 if (err) 173 if (err)
168 return err; 174 return err;
169 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free); 175 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
170 176
171 return err; 177 return err;
172} 178}
173 179
174static void kmemtrace_stop_probes(void) 180static void kmemtrace_stop_probes(void)
175{ 181{
176 unregister_trace_kmalloc(kmemtrace_kmalloc); 182 unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
177 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); 183 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
178 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node); 184 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
179 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); 185 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
180 unregister_trace_kfree(kmemtrace_kfree); 186 unregister_trace_kfree(kmemtrace_kfree, NULL);
181 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free); 187 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
182} 188}
183 189
184static int kmem_trace_init(struct trace_array *tr) 190static int kmem_trace_init(struct trace_array *tr)
@@ -237,7 +243,8 @@ struct kmemtrace_user_event_alloc {
237}; 243};
238 244
239static enum print_line_t 245static enum print_line_t
240kmemtrace_print_alloc(struct trace_iterator *iter, int flags) 246kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
247 struct trace_event *event)
241{ 248{
242 struct trace_seq *s = &iter->seq; 249 struct trace_seq *s = &iter->seq;
243 struct kmemtrace_alloc_entry *entry; 250 struct kmemtrace_alloc_entry *entry;
@@ -257,7 +264,8 @@ kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
257} 264}
258 265
259static enum print_line_t 266static enum print_line_t
260kmemtrace_print_free(struct trace_iterator *iter, int flags) 267kmemtrace_print_free(struct trace_iterator *iter, int flags,
268 struct trace_event *event)
261{ 269{
262 struct trace_seq *s = &iter->seq; 270 struct trace_seq *s = &iter->seq;
263 struct kmemtrace_free_entry *entry; 271 struct kmemtrace_free_entry *entry;
@@ -275,7 +283,8 @@ kmemtrace_print_free(struct trace_iterator *iter, int flags)
275} 283}
276 284
277static enum print_line_t 285static enum print_line_t
278kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) 286kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
287 struct trace_event *event)
279{ 288{
280 struct trace_seq *s = &iter->seq; 289 struct trace_seq *s = &iter->seq;
281 struct kmemtrace_alloc_entry *entry; 290 struct kmemtrace_alloc_entry *entry;
@@ -309,7 +318,8 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
309} 318}
310 319
311static enum print_line_t 320static enum print_line_t
312kmemtrace_print_free_user(struct trace_iterator *iter, int flags) 321kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
322 struct trace_event *event)
313{ 323{
314 struct trace_seq *s = &iter->seq; 324 struct trace_seq *s = &iter->seq;
315 struct kmemtrace_free_entry *entry; 325 struct kmemtrace_free_entry *entry;
@@ -463,18 +473,26 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
463 } 473 }
464} 474}
465 475
466static struct trace_event kmem_trace_alloc = { 476static struct trace_event_functions kmem_trace_alloc_funcs = {
467 .type = TRACE_KMEM_ALLOC,
468 .trace = kmemtrace_print_alloc, 477 .trace = kmemtrace_print_alloc,
469 .binary = kmemtrace_print_alloc_user, 478 .binary = kmemtrace_print_alloc_user,
470}; 479};
471 480
472static struct trace_event kmem_trace_free = { 481static struct trace_event kmem_trace_alloc = {
473 .type = TRACE_KMEM_FREE, 482 .type = TRACE_KMEM_ALLOC,
483 .funcs = &kmem_trace_alloc_funcs,
484};
485
486static struct trace_event_functions kmem_trace_free_funcs = {
474 .trace = kmemtrace_print_free, 487 .trace = kmemtrace_print_free,
475 .binary = kmemtrace_print_free_user, 488 .binary = kmemtrace_print_free_user,
476}; 489};
477 490
491static struct trace_event kmem_trace_free = {
492 .type = TRACE_KMEM_FREE,
493 .funcs = &kmem_trace_free_funcs,
494};
495
478static struct tracer kmem_tracer __read_mostly = { 496static struct tracer kmem_tracer __read_mostly = {
479 .name = "kmemtrace", 497 .name = "kmemtrace",
480 .init = kmem_trace_init, 498 .init = kmem_trace_init,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 41ca394feb22..7f6059c5aa94 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -319,6 +319,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
319#define TS_MASK ((1ULL << TS_SHIFT) - 1) 319#define TS_MASK ((1ULL << TS_SHIFT) - 1)
320#define TS_DELTA_TEST (~TS_MASK) 320#define TS_DELTA_TEST (~TS_MASK)
321 321
322/* Flag when events were overwritten */
323#define RB_MISSED_EVENTS (1 << 31)
324/* Missed count stored at end */
325#define RB_MISSED_STORED (1 << 30)
326
322struct buffer_data_page { 327struct buffer_data_page {
323 u64 time_stamp; /* page time stamp */ 328 u64 time_stamp; /* page time stamp */
324 local_t commit; /* write committed index */ 329 local_t commit; /* write committed index */
@@ -338,6 +343,7 @@ struct buffer_page {
338 local_t write; /* index for next write */ 343 local_t write; /* index for next write */
339 unsigned read; /* index for next read */ 344 unsigned read; /* index for next read */
340 local_t entries; /* entries on this page */ 345 local_t entries; /* entries on this page */
346 unsigned long real_end; /* real end of data */
341 struct buffer_data_page *page; /* Actual data page */ 347 struct buffer_data_page *page; /* Actual data page */
342}; 348};
343 349
@@ -417,6 +423,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
417 (unsigned int)sizeof(field.commit), 423 (unsigned int)sizeof(field.commit),
418 (unsigned int)is_signed_type(long)); 424 (unsigned int)is_signed_type(long));
419 425
426 ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
427 "offset:%u;\tsize:%u;\tsigned:%u;\n",
428 (unsigned int)offsetof(typeof(field), commit),
429 1,
430 (unsigned int)is_signed_type(long));
431
420 ret = trace_seq_printf(s, "\tfield: char data;\t" 432 ret = trace_seq_printf(s, "\tfield: char data;\t"
421 "offset:%u;\tsize:%u;\tsigned:%u;\n", 433 "offset:%u;\tsize:%u;\tsigned:%u;\n",
422 (unsigned int)offsetof(typeof(field), data), 434 (unsigned int)offsetof(typeof(field), data),
@@ -440,6 +452,8 @@ struct ring_buffer_per_cpu {
440 struct buffer_page *tail_page; /* write to tail */ 452 struct buffer_page *tail_page; /* write to tail */
441 struct buffer_page *commit_page; /* committed pages */ 453 struct buffer_page *commit_page; /* committed pages */
442 struct buffer_page *reader_page; 454 struct buffer_page *reader_page;
455 unsigned long lost_events;
456 unsigned long last_overrun;
443 local_t commit_overrun; 457 local_t commit_overrun;
444 local_t overrun; 458 local_t overrun;
445 local_t entries; 459 local_t entries;
@@ -1762,6 +1776,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1762 kmemcheck_annotate_bitfield(event, bitfield); 1776 kmemcheck_annotate_bitfield(event, bitfield);
1763 1777
1764 /* 1778 /*
1779 * Save the original length to the meta data.
 1780 * This will be used by the reader to add the lost event
 1781 * counter.
1782 */
1783 tail_page->real_end = tail;
1784
1785 /*
1765 * If this event is bigger than the minimum size, then 1786 * If this event is bigger than the minimum size, then
1766 * we need to be careful that we don't subtract the 1787 * we need to be careful that we don't subtract the
1767 * write counter enough to allow another writer to slip 1788 * write counter enough to allow another writer to slip
@@ -1979,17 +2000,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1979 u64 *ts, u64 *delta) 2000 u64 *ts, u64 *delta)
1980{ 2001{
1981 struct ring_buffer_event *event; 2002 struct ring_buffer_event *event;
1982 static int once;
1983 int ret; 2003 int ret;
1984 2004
1985 if (unlikely(*delta > (1ULL << 59) && !once++)) { 2005 WARN_ONCE(*delta > (1ULL << 59),
1986 printk(KERN_WARNING "Delta way too big! %llu" 2006 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
1987 " ts=%llu write stamp = %llu\n", 2007 (unsigned long long)*delta,
1988 (unsigned long long)*delta, 2008 (unsigned long long)*ts,
1989 (unsigned long long)*ts, 2009 (unsigned long long)cpu_buffer->write_stamp);
1990 (unsigned long long)cpu_buffer->write_stamp);
1991 WARN_ON(1);
1992 }
1993 2010
1994 /* 2011 /*
 1995 * The delta is too big, we need to add a 2012
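The hunk above collapses the open-coded "warn only once" pattern (a static counter, a printk() and a separate WARN_ON(1)) into a single WARN_ONCE(), which prints the message, emits the backtrace, and latches itself after the first hit. A reduced before/after sketch, with the threshold check written out as an illustrative condition:

/* Illustrative only: the same one-shot warning written both ways. */
static void warn_big_delta_old(u64 delta)
{
	static int once;

	if (unlikely(delta > (1ULL << 59) && !once++)) {
		printk(KERN_WARNING "Delta way too big! %llu\n",
		       (unsigned long long)delta);
		WARN_ON(1);
	}
}

static void warn_big_delta_new(u64 delta)
{
	WARN_ONCE(delta > (1ULL << 59),
		  "Delta way too big! %llu\n", (unsigned long long)delta);
}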
@@ -2838,6 +2855,7 @@ static struct buffer_page *
2838rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 2855rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2839{ 2856{
2840 struct buffer_page *reader = NULL; 2857 struct buffer_page *reader = NULL;
2858 unsigned long overwrite;
2841 unsigned long flags; 2859 unsigned long flags;
2842 int nr_loops = 0; 2860 int nr_loops = 0;
2843 int ret; 2861 int ret;
@@ -2879,6 +2897,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2879 local_set(&cpu_buffer->reader_page->write, 0); 2897 local_set(&cpu_buffer->reader_page->write, 0);
2880 local_set(&cpu_buffer->reader_page->entries, 0); 2898 local_set(&cpu_buffer->reader_page->entries, 0);
2881 local_set(&cpu_buffer->reader_page->page->commit, 0); 2899 local_set(&cpu_buffer->reader_page->page->commit, 0);
2900 cpu_buffer->reader_page->real_end = 0;
2882 2901
2883 spin: 2902 spin:
2884 /* 2903 /*
@@ -2899,6 +2918,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2899 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); 2918 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2900 2919
2901 /* 2920 /*
2921 * We want to make sure we read the overruns after we set up our
2922 * pointers to the next object. The writer side does a
2923 * cmpxchg to cross pages which acts as the mb on the writer
2924 * side. Note, the reader will constantly fail the swap
2925 * while the writer is updating the pointers, so this
2926 * guarantees that the overwrite recorded here is the one we
2927 * want to compare with the last_overrun.
2928 */
2929 smp_mb();
2930 overwrite = local_read(&(cpu_buffer->overrun));
2931
2932 /*
2902 * Here's the tricky part. 2933 * Here's the tricky part.
2903 * 2934 *
2904 * We need to move the pointer past the header page. 2935 * We need to move the pointer past the header page.
@@ -2929,6 +2960,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2929 cpu_buffer->reader_page = reader; 2960 cpu_buffer->reader_page = reader;
2930 rb_reset_reader_page(cpu_buffer); 2961 rb_reset_reader_page(cpu_buffer);
2931 2962
2963 if (overwrite != cpu_buffer->last_overrun) {
2964 cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
2965 cpu_buffer->last_overrun = overwrite;
2966 }
2967
2932 goto again; 2968 goto again;
2933 2969
2934 out: 2970 out:
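The overrun counter only ever grows on the writer side, so once the reader has safely swapped in a new reader page (and the smp_mb() above has ordered the reads), the difference from the value saved at the previous swap is exactly the number of events overwritten in the meantime. A worked example with made-up counter values:

/* Made-up numbers, purely to illustrate the bookkeeping above. */
static unsigned long example_lost_events(void)
{
	unsigned long last_overrun = 120;	/* saved at the previous page swap */
	unsigned long overwrite = 157;		/* overrun counter read just now   */

	/* 157 - 120 = 37 events were dropped while this page filled up */
	return overwrite - last_overrun;
}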
@@ -3005,8 +3041,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
3005 rb_advance_iter(iter); 3041 rb_advance_iter(iter);
3006} 3042}
3007 3043
3044static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
3045{
3046 return cpu_buffer->lost_events;
3047}
3048
3008static struct ring_buffer_event * 3049static struct ring_buffer_event *
3009rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) 3050rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3051 unsigned long *lost_events)
3010{ 3052{
3011 struct ring_buffer_event *event; 3053 struct ring_buffer_event *event;
3012 struct buffer_page *reader; 3054 struct buffer_page *reader;
@@ -3058,6 +3100,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
3058 ring_buffer_normalize_time_stamp(cpu_buffer->buffer, 3100 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3059 cpu_buffer->cpu, ts); 3101 cpu_buffer->cpu, ts);
3060 } 3102 }
3103 if (lost_events)
3104 *lost_events = rb_lost_events(cpu_buffer);
3061 return event; 3105 return event;
3062 3106
3063 default: 3107 default:
@@ -3168,12 +3212,14 @@ static inline int rb_ok_to_lock(void)
3168 * @buffer: The ring buffer to read 3212 * @buffer: The ring buffer to read
 3169 * @cpu: The cpu to peek at 3213 * @cpu: The cpu to peek at
3170 * @ts: The timestamp counter of this event. 3214 * @ts: The timestamp counter of this event.
 3215 * @lost_events: a variable to store the number of lost events (may be NULL)
3171 * 3216 *
3172 * This will return the event that will be read next, but does 3217 * This will return the event that will be read next, but does
3173 * not consume the data. 3218 * not consume the data.
3174 */ 3219 */
3175struct ring_buffer_event * 3220struct ring_buffer_event *
3176ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 3221ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3222 unsigned long *lost_events)
3177{ 3223{
3178 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 3224 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
3179 struct ring_buffer_event *event; 3225 struct ring_buffer_event *event;
@@ -3188,7 +3234,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3188 local_irq_save(flags); 3234 local_irq_save(flags);
3189 if (dolock) 3235 if (dolock)
3190 spin_lock(&cpu_buffer->reader_lock); 3236 spin_lock(&cpu_buffer->reader_lock);
3191 event = rb_buffer_peek(cpu_buffer, ts); 3237 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3192 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3238 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3193 rb_advance_reader(cpu_buffer); 3239 rb_advance_reader(cpu_buffer);
3194 if (dolock) 3240 if (dolock)
@@ -3230,13 +3276,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3230/** 3276/**
3231 * ring_buffer_consume - return an event and consume it 3277 * ring_buffer_consume - return an event and consume it
3232 * @buffer: The ring buffer to get the next event from 3278 * @buffer: The ring buffer to get the next event from
3279 * @cpu: the cpu to read the buffer from
3280 * @ts: a variable to store the timestamp (may be NULL)
 3281 * @lost_events: a variable to store the number of lost events (may be NULL)
3233 * 3282 *
3234 * Returns the next event in the ring buffer, and that event is consumed. 3283 * Returns the next event in the ring buffer, and that event is consumed.
3235 * Meaning, that sequential reads will keep returning a different event, 3284 * Meaning, that sequential reads will keep returning a different event,
3236 * and eventually empty the ring buffer if the producer is slower. 3285 * and eventually empty the ring buffer if the producer is slower.
3237 */ 3286 */
3238struct ring_buffer_event * 3287struct ring_buffer_event *
3239ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 3288ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3289 unsigned long *lost_events)
3240{ 3290{
3241 struct ring_buffer_per_cpu *cpu_buffer; 3291 struct ring_buffer_per_cpu *cpu_buffer;
3242 struct ring_buffer_event *event = NULL; 3292 struct ring_buffer_event *event = NULL;
@@ -3257,9 +3307,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3257 if (dolock) 3307 if (dolock)
3258 spin_lock(&cpu_buffer->reader_lock); 3308 spin_lock(&cpu_buffer->reader_lock);
3259 3309
3260 event = rb_buffer_peek(cpu_buffer, ts); 3310 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3261 if (event) 3311 if (event) {
3312 cpu_buffer->lost_events = 0;
3262 rb_advance_reader(cpu_buffer); 3313 rb_advance_reader(cpu_buffer);
3314 }
3263 3315
3264 if (dolock) 3316 if (dolock)
3265 spin_unlock(&cpu_buffer->reader_lock); 3317 spin_unlock(&cpu_buffer->reader_lock);
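Both read paths now report drops through the new out-parameter, and the consuming path clears the per-cpu count once an event has actually been handed out. A hedged sketch of how a consumer might use it, loosely modelled on the trace_consume()/print_trace_line() changes later in this patch; the loop and the pr_info() reporting are illustrative, not part of the patch:

/* Illustrative consumer: drain one CPU and report any drops. */
static void drain_cpu(struct ring_buffer *buffer, int cpu)
{
	struct ring_buffer_event *event;
	unsigned long lost = 0;
	u64 ts;

	while ((event = ring_buffer_consume(buffer, cpu, &ts, &lost))) {
		if (lost)
			pr_info("CPU:%d lost %lu events\n", cpu, lost);
		/* process the returned event here */
	}
}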
@@ -3276,23 +3328,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3276EXPORT_SYMBOL_GPL(ring_buffer_consume); 3328EXPORT_SYMBOL_GPL(ring_buffer_consume);
3277 3329
3278/** 3330/**
3279 * ring_buffer_read_start - start a non consuming read of the buffer 3331 * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
3280 * @buffer: The ring buffer to read from 3332 * @buffer: The ring buffer to read from
3281 * @cpu: The cpu buffer to iterate over 3333 * @cpu: The cpu buffer to iterate over
3282 * 3334 *
3283 * This starts up an iteration through the buffer. It also disables 3335 * This performs the initial preparations necessary to iterate
3284 * the recording to the buffer until the reading is finished. 3336 * through the buffer. Memory is allocated, buffer recording
3285 * This prevents the reading from being corrupted. This is not 3337 * is disabled, and the iterator pointer is returned to the caller.
3286 * a consuming read, so a producer is not expected.
3287 * 3338 *
 3288 * Must be paired with ring_buffer_finish. 3339 * Disabling buffer recording prevents the reading from being
3340 * corrupted. This is not a consuming read, so a producer is not
3341 * expected.
3342 *
3343 * After a sequence of ring_buffer_read_prepare calls, the user is
3344 * expected to make at least one call to ring_buffer_prepare_sync.
3345 * Afterwards, ring_buffer_read_start is invoked to get things going
3346 * for real.
3347 *
3348 * This overall must be paired with ring_buffer_finish.
3289 */ 3349 */
3290struct ring_buffer_iter * 3350struct ring_buffer_iter *
3291ring_buffer_read_start(struct ring_buffer *buffer, int cpu) 3351ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3292{ 3352{
3293 struct ring_buffer_per_cpu *cpu_buffer; 3353 struct ring_buffer_per_cpu *cpu_buffer;
3294 struct ring_buffer_iter *iter; 3354 struct ring_buffer_iter *iter;
3295 unsigned long flags;
3296 3355
3297 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3356 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3298 return NULL; 3357 return NULL;
@@ -3306,15 +3365,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3306 iter->cpu_buffer = cpu_buffer; 3365 iter->cpu_buffer = cpu_buffer;
3307 3366
3308 atomic_inc(&cpu_buffer->record_disabled); 3367 atomic_inc(&cpu_buffer->record_disabled);
3368
3369 return iter;
3370}
3371EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
3372
3373/**
3374 * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
3375 *
3376 * All previously invoked ring_buffer_read_prepare calls to prepare
 3377 * iterators will be synchronized. Afterwards, ring_buffer_read_start
3378 * calls on those iterators are allowed.
3379 */
3380void
3381ring_buffer_read_prepare_sync(void)
3382{
3309 synchronize_sched(); 3383 synchronize_sched();
3384}
3385EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
3386
3387/**
3388 * ring_buffer_read_start - start a non consuming read of the buffer
3389 * @iter: The iterator returned by ring_buffer_read_prepare
3390 *
3391 * This finalizes the startup of an iteration through the buffer.
3392 * The iterator comes from a call to ring_buffer_read_prepare and
3393 * an intervening ring_buffer_read_prepare_sync must have been
3394 * performed.
3395 *
3396 * Must be paired with ring_buffer_finish.
3397 */
3398void
3399ring_buffer_read_start(struct ring_buffer_iter *iter)
3400{
3401 struct ring_buffer_per_cpu *cpu_buffer;
3402 unsigned long flags;
3403
3404 if (!iter)
3405 return;
3406
3407 cpu_buffer = iter->cpu_buffer;
3310 3408
3311 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3409 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3312 arch_spin_lock(&cpu_buffer->lock); 3410 arch_spin_lock(&cpu_buffer->lock);
3313 rb_iter_reset(iter); 3411 rb_iter_reset(iter);
3314 arch_spin_unlock(&cpu_buffer->lock); 3412 arch_spin_unlock(&cpu_buffer->lock);
3315 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3413 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3316
3317 return iter;
3318} 3414}
3319EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3415EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3320 3416
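Splitting the old ring_buffer_read_start() into prepare, prepare_sync and start means a caller that iterates over many CPUs pays for synchronize_sched() once instead of once per CPU. A sketch of the intended calling sequence, modelled on the __tracing_open() change further down in this patch; the wrapper function and the iters array are illustrative scaffolding, not part of the patch:

/* Sketch only: start non-consuming iterators on every online CPU. */
static void start_all_iters(struct ring_buffer *buffer,
			    struct ring_buffer_iter **iters)
{
	int cpu;

	/* 1) Allocate each iterator and disable recording on its buffer. */
	for_each_online_cpu(cpu)
		iters[cpu] = ring_buffer_read_prepare(buffer, cpu);

	/* 2) One synchronize_sched() covers all of the prepare calls. */
	ring_buffer_read_prepare_sync();

	/* 3) Finish the startup; the iterators may now be read from. */
	for_each_online_cpu(cpu)
		ring_buffer_read_start(iters[cpu]);
}

Teardown is unchanged: each iterator is still paired with the finish call named in the comment above.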
@@ -3408,6 +3504,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3408 cpu_buffer->write_stamp = 0; 3504 cpu_buffer->write_stamp = 0;
3409 cpu_buffer->read_stamp = 0; 3505 cpu_buffer->read_stamp = 0;
3410 3506
3507 cpu_buffer->lost_events = 0;
3508 cpu_buffer->last_overrun = 0;
3509
3411 rb_head_page_activate(cpu_buffer); 3510 rb_head_page_activate(cpu_buffer);
3412} 3511}
3413 3512
@@ -3683,6 +3782,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3683 struct ring_buffer_event *event; 3782 struct ring_buffer_event *event;
3684 struct buffer_data_page *bpage; 3783 struct buffer_data_page *bpage;
3685 struct buffer_page *reader; 3784 struct buffer_page *reader;
3785 unsigned long missed_events;
3686 unsigned long flags; 3786 unsigned long flags;
3687 unsigned int commit; 3787 unsigned int commit;
3688 unsigned int read; 3788 unsigned int read;
@@ -3719,6 +3819,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3719 read = reader->read; 3819 read = reader->read;
3720 commit = rb_page_commit(reader); 3820 commit = rb_page_commit(reader);
3721 3821
3822 /* Check if any events were dropped */
3823 missed_events = cpu_buffer->lost_events;
3824
3722 /* 3825 /*
3723 * If this page has been partially read or 3826 * If this page has been partially read or
3724 * if len is not big enough to read the rest of the page or 3827 * if len is not big enough to read the rest of the page or
@@ -3779,9 +3882,35 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3779 local_set(&reader->entries, 0); 3882 local_set(&reader->entries, 0);
3780 reader->read = 0; 3883 reader->read = 0;
3781 *data_page = bpage; 3884 *data_page = bpage;
3885
3886 /*
3887 * Use the real_end for the data size,
3888 * This gives us a chance to store the lost events
3889 * on the page.
3890 */
3891 if (reader->real_end)
3892 local_set(&bpage->commit, reader->real_end);
3782 } 3893 }
3783 ret = read; 3894 ret = read;
3784 3895
3896 cpu_buffer->lost_events = 0;
3897 /*
3898 * Set a flag in the commit field if we lost events
3899 */
3900 if (missed_events) {
3901 commit = local_read(&bpage->commit);
3902
3903 /* If there is room at the end of the page to save the
3904 * missed events, then record it there.
3905 */
3906 if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
3907 memcpy(&bpage->data[commit], &missed_events,
3908 sizeof(missed_events));
3909 local_add(RB_MISSED_STORED, &bpage->commit);
3910 }
3911 local_add(RB_MISSED_EVENTS, &bpage->commit);
3912 }
3913
3785 out_unlock: 3914 out_unlock:
3786 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3915 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3787 3916
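When ring_buffer_read_page() hands a page out, the dropped-event count is appended directly after the committed data if there is room, and the flag bits record whether that happened. A hedged sketch of how a reader of such a page could recover the count; the helper is hypothetical and assumes the RB_MISSED_* layout defined earlier in this file:

/* Hypothetical reader-side helper; layout taken from the hunk above. */
static unsigned long page_missed_events(struct buffer_data_page *bpage)
{
	unsigned int commit = local_read(&bpage->commit);
	unsigned long missed = 0;

	if (!(commit & RB_MISSED_EVENTS))
		return 0;			/* nothing was dropped */

	if (commit & RB_MISSED_STORED) {
		/* the count sits right after the committed data */
		commit &= ~(RB_MISSED_EVENTS | RB_MISSED_STORED);
		memcpy(&missed, &bpage->data[commit], sizeof(missed));
	}

	return missed;	/* 0 here means "dropped, but no room for the count" */
}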
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index df74c7982255..302f8a614635 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -81,7 +81,7 @@ static enum event_status read_event(int cpu)
81 int *entry; 81 int *entry;
82 u64 ts; 82 u64 ts;
83 83
84 event = ring_buffer_consume(buffer, cpu, &ts); 84 event = ring_buffer_consume(buffer, cpu, &ts, NULL);
85 if (!event) 85 if (!event)
86 return EVENT_DROPPED; 86 return EVENT_DROPPED;
87 87
@@ -113,7 +113,8 @@ static enum event_status read_page(int cpu)
113 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); 113 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
114 if (ret >= 0) { 114 if (ret >= 0) {
115 rpage = bpage; 115 rpage = bpage;
116 commit = local_read(&rpage->commit); 116 /* The commit may have missed event flags set, clear them */
117 commit = local_read(&rpage->commit) & 0xfffff;
117 for (i = 0; i < commit && !kill_test; i += inc) { 118 for (i = 0; i < commit && !kill_test; i += inc) {
118 119
119 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { 120 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
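Because the RB_MISSED_* macros are private to ring_buffer.c, the benchmark masks the raw commit value with a literal instead: 0xfffff keeps 20 bits, comfortably above BUF_PAGE_SIZE and safely below bits 30/31 where the flags live. Inside ring_buffer.c the same masking could be spelled out explicitly; for illustration only:

	/* equivalent for any page-sized buffer: keep the byte count,
	 * drop the RB_MISSED_EVENTS / RB_MISSED_STORED flag bits */
	commit = local_read(&rpage->commit) &
		 ~(RB_MISSED_EVENTS | RB_MISSED_STORED);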
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 44f916a04065..ba0ec81158b2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -117,9 +117,12 @@ static cpumask_var_t __read_mostly tracing_buffer_mask;
117 * 117 *
118 * It is default off, but you can enable it with either specifying 118 * It is default off, but you can enable it with either specifying
119 * "ftrace_dump_on_oops" in the kernel command line, or setting 119 * "ftrace_dump_on_oops" in the kernel command line, or setting
120 * /proc/sys/kernel/ftrace_dump_on_oops to true. 120 * /proc/sys/kernel/ftrace_dump_on_oops
121 * Set 1 if you want to dump buffers of all CPUs
122 * Set 2 if you want to dump the buffer of the CPU that triggered oops
121 */ 123 */
122int ftrace_dump_on_oops; 124
125enum ftrace_dump_mode ftrace_dump_on_oops;
123 126
124static int tracing_set_tracer(const char *buf); 127static int tracing_set_tracer(const char *buf);
125 128
@@ -139,8 +142,17 @@ __setup("ftrace=", set_cmdline_ftrace);
139 142
140static int __init set_ftrace_dump_on_oops(char *str) 143static int __init set_ftrace_dump_on_oops(char *str)
141{ 144{
142 ftrace_dump_on_oops = 1; 145 if (*str++ != '=' || !*str) {
143 return 1; 146 ftrace_dump_on_oops = DUMP_ALL;
147 return 1;
148 }
149
150 if (!strcmp("orig_cpu", str)) {
151 ftrace_dump_on_oops = DUMP_ORIG;
152 return 1;
153 }
154
155 return 0;
144} 156}
145__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 157__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
146 158
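With the parser above, the boot parameter now selects a dump mode instead of acting as a plain boolean. Illustrative command-line forms and how set_ftrace_dump_on_oops() maps them (these are examples of the two accepted spellings, not additional options):

	ftrace_dump_on_oops             -> DUMP_ALL  (dump the buffers of every CPU)
	ftrace_dump_on_oops=orig_cpu    -> DUMP_ORIG (dump only the CPU that triggered the oops)
	ftrace_dump_on_oops=<anything else> is not accepted; the handler returns 0.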
@@ -1545,7 +1557,8 @@ static void trace_iterator_increment(struct trace_iterator *iter)
1545} 1557}
1546 1558
1547static struct trace_entry * 1559static struct trace_entry *
1548peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) 1560peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1561 unsigned long *lost_events)
1549{ 1562{
1550 struct ring_buffer_event *event; 1563 struct ring_buffer_event *event;
1551 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; 1564 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
@@ -1556,7 +1569,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1556 if (buf_iter) 1569 if (buf_iter)
1557 event = ring_buffer_iter_peek(buf_iter, ts); 1570 event = ring_buffer_iter_peek(buf_iter, ts);
1558 else 1571 else
1559 event = ring_buffer_peek(iter->tr->buffer, cpu, ts); 1572 event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
1573 lost_events);
1560 1574
1561 ftrace_enable_cpu(); 1575 ftrace_enable_cpu();
1562 1576
@@ -1564,10 +1578,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1564} 1578}
1565 1579
1566static struct trace_entry * 1580static struct trace_entry *
1567__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) 1581__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1582 unsigned long *missing_events, u64 *ent_ts)
1568{ 1583{
1569 struct ring_buffer *buffer = iter->tr->buffer; 1584 struct ring_buffer *buffer = iter->tr->buffer;
1570 struct trace_entry *ent, *next = NULL; 1585 struct trace_entry *ent, *next = NULL;
1586 unsigned long lost_events = 0, next_lost = 0;
1571 int cpu_file = iter->cpu_file; 1587 int cpu_file = iter->cpu_file;
1572 u64 next_ts = 0, ts; 1588 u64 next_ts = 0, ts;
1573 int next_cpu = -1; 1589 int next_cpu = -1;
@@ -1580,7 +1596,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1580 if (cpu_file > TRACE_PIPE_ALL_CPU) { 1596 if (cpu_file > TRACE_PIPE_ALL_CPU) {
1581 if (ring_buffer_empty_cpu(buffer, cpu_file)) 1597 if (ring_buffer_empty_cpu(buffer, cpu_file))
1582 return NULL; 1598 return NULL;
1583 ent = peek_next_entry(iter, cpu_file, ent_ts); 1599 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
1584 if (ent_cpu) 1600 if (ent_cpu)
1585 *ent_cpu = cpu_file; 1601 *ent_cpu = cpu_file;
1586 1602
@@ -1592,7 +1608,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1592 if (ring_buffer_empty_cpu(buffer, cpu)) 1608 if (ring_buffer_empty_cpu(buffer, cpu))
1593 continue; 1609 continue;
1594 1610
1595 ent = peek_next_entry(iter, cpu, &ts); 1611 ent = peek_next_entry(iter, cpu, &ts, &lost_events);
1596 1612
1597 /* 1613 /*
1598 * Pick the entry with the smallest timestamp: 1614 * Pick the entry with the smallest timestamp:
@@ -1601,6 +1617,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1601 next = ent; 1617 next = ent;
1602 next_cpu = cpu; 1618 next_cpu = cpu;
1603 next_ts = ts; 1619 next_ts = ts;
1620 next_lost = lost_events;
1604 } 1621 }
1605 } 1622 }
1606 1623
@@ -1610,6 +1627,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1610 if (ent_ts) 1627 if (ent_ts)
1611 *ent_ts = next_ts; 1628 *ent_ts = next_ts;
1612 1629
1630 if (missing_events)
1631 *missing_events = next_lost;
1632
1613 return next; 1633 return next;
1614} 1634}
1615 1635
@@ -1617,13 +1637,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1617struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 1637struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1618 int *ent_cpu, u64 *ent_ts) 1638 int *ent_cpu, u64 *ent_ts)
1619{ 1639{
1620 return __find_next_entry(iter, ent_cpu, ent_ts); 1640 return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
1621} 1641}
1622 1642
1623/* Find the next real entry, and increment the iterator to the next entry */ 1643/* Find the next real entry, and increment the iterator to the next entry */
1624static void *find_next_entry_inc(struct trace_iterator *iter) 1644static void *find_next_entry_inc(struct trace_iterator *iter)
1625{ 1645{
1626 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); 1646 iter->ent = __find_next_entry(iter, &iter->cpu,
1647 &iter->lost_events, &iter->ts);
1627 1648
1628 if (iter->ent) 1649 if (iter->ent)
1629 trace_iterator_increment(iter); 1650 trace_iterator_increment(iter);
@@ -1635,7 +1656,8 @@ static void trace_consume(struct trace_iterator *iter)
1635{ 1656{
1636 /* Don't allow ftrace to trace into the ring buffers */ 1657 /* Don't allow ftrace to trace into the ring buffers */
1637 ftrace_disable_cpu(); 1658 ftrace_disable_cpu();
1638 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); 1659 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
1660 &iter->lost_events);
1639 ftrace_enable_cpu(); 1661 ftrace_enable_cpu();
1640} 1662}
1641 1663
@@ -1786,7 +1808,7 @@ static void print_func_help_header(struct seq_file *m)
1786} 1808}
1787 1809
1788 1810
1789static void 1811void
1790print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1812print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1791{ 1813{
1792 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1814 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1914,7 +1936,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1914 } 1936 }
1915 1937
1916 if (event) 1938 if (event)
1917 return event->trace(iter, sym_flags); 1939 return event->funcs->trace(iter, sym_flags, event);
1918 1940
1919 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) 1941 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
1920 goto partial; 1942 goto partial;
@@ -1940,7 +1962,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1940 1962
1941 event = ftrace_find_event(entry->type); 1963 event = ftrace_find_event(entry->type);
1942 if (event) 1964 if (event)
1943 return event->raw(iter, 0); 1965 return event->funcs->raw(iter, 0, event);
1944 1966
1945 if (!trace_seq_printf(s, "%d ?\n", entry->type)) 1967 if (!trace_seq_printf(s, "%d ?\n", entry->type))
1946 goto partial; 1968 goto partial;
@@ -1967,7 +1989,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1967 1989
1968 event = ftrace_find_event(entry->type); 1990 event = ftrace_find_event(entry->type);
1969 if (event) { 1991 if (event) {
1970 enum print_line_t ret = event->hex(iter, 0); 1992 enum print_line_t ret = event->funcs->hex(iter, 0, event);
1971 if (ret != TRACE_TYPE_HANDLED) 1993 if (ret != TRACE_TYPE_HANDLED)
1972 return ret; 1994 return ret;
1973 } 1995 }
@@ -1992,10 +2014,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1992 } 2014 }
1993 2015
1994 event = ftrace_find_event(entry->type); 2016 event = ftrace_find_event(entry->type);
1995 return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED; 2017 return event ? event->funcs->binary(iter, 0, event) :
2018 TRACE_TYPE_HANDLED;
1996} 2019}
1997 2020
1998static int trace_empty(struct trace_iterator *iter) 2021int trace_empty(struct trace_iterator *iter)
1999{ 2022{
2000 int cpu; 2023 int cpu;
2001 2024
@@ -2030,6 +2053,10 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2030{ 2053{
2031 enum print_line_t ret; 2054 enum print_line_t ret;
2032 2055
2056 if (iter->lost_events)
2057 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2058 iter->cpu, iter->lost_events);
2059
2033 if (iter->trace && iter->trace->print_line) { 2060 if (iter->trace && iter->trace->print_line) {
2034 ret = iter->trace->print_line(iter); 2061 ret = iter->trace->print_line(iter);
2035 if (ret != TRACE_TYPE_UNHANDLED) 2062 if (ret != TRACE_TYPE_UNHANDLED)
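Before the normal per-entry formatting, the iterator now emits a marker whenever the ring buffer reported drops for the entry being printed. With hypothetical values, the extra output line looks like:

	CPU:1 [LOST 37 EVENTS]

followed by the first entry read after the gap.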
@@ -2058,6 +2085,23 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2058 return print_trace_fmt(iter); 2085 return print_trace_fmt(iter);
2059} 2086}
2060 2087
2088void trace_default_header(struct seq_file *m)
2089{
2090 struct trace_iterator *iter = m->private;
2091
2092 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2093 /* print nothing if the buffers are empty */
2094 if (trace_empty(iter))
2095 return;
2096 print_trace_header(m, iter);
2097 if (!(trace_flags & TRACE_ITER_VERBOSE))
2098 print_lat_help_header(m);
2099 } else {
2100 if (!(trace_flags & TRACE_ITER_VERBOSE))
2101 print_func_help_header(m);
2102 }
2103}
2104
2061static int s_show(struct seq_file *m, void *v) 2105static int s_show(struct seq_file *m, void *v)
2062{ 2106{
2063 struct trace_iterator *iter = v; 2107 struct trace_iterator *iter = v;
@@ -2070,17 +2114,9 @@ static int s_show(struct seq_file *m, void *v)
2070 } 2114 }
2071 if (iter->trace && iter->trace->print_header) 2115 if (iter->trace && iter->trace->print_header)
2072 iter->trace->print_header(m); 2116 iter->trace->print_header(m);
2073 else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2117 else
2074 /* print nothing if the buffers are empty */ 2118 trace_default_header(m);
2075 if (trace_empty(iter)) 2119
2076 return 0;
2077 print_trace_header(m, iter);
2078 if (!(trace_flags & TRACE_ITER_VERBOSE))
2079 print_lat_help_header(m);
2080 } else {
2081 if (!(trace_flags & TRACE_ITER_VERBOSE))
2082 print_func_help_header(m);
2083 }
2084 } else if (iter->leftover) { 2120 } else if (iter->leftover) {
2085 /* 2121 /*
2086 * If we filled the seq_file buffer earlier, we 2122 * If we filled the seq_file buffer earlier, we
@@ -2166,15 +2202,20 @@ __tracing_open(struct inode *inode, struct file *file)
2166 2202
2167 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2203 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
2168 for_each_tracing_cpu(cpu) { 2204 for_each_tracing_cpu(cpu) {
2169
2170 iter->buffer_iter[cpu] = 2205 iter->buffer_iter[cpu] =
2171 ring_buffer_read_start(iter->tr->buffer, cpu); 2206 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2207 }
2208 ring_buffer_read_prepare_sync();
2209 for_each_tracing_cpu(cpu) {
2210 ring_buffer_read_start(iter->buffer_iter[cpu]);
2172 tracing_iter_reset(iter, cpu); 2211 tracing_iter_reset(iter, cpu);
2173 } 2212 }
2174 } else { 2213 } else {
2175 cpu = iter->cpu_file; 2214 cpu = iter->cpu_file;
2176 iter->buffer_iter[cpu] = 2215 iter->buffer_iter[cpu] =
2177 ring_buffer_read_start(iter->tr->buffer, cpu); 2216 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2217 ring_buffer_read_prepare_sync();
2218 ring_buffer_read_start(iter->buffer_iter[cpu]);
2178 tracing_iter_reset(iter, cpu); 2219 tracing_iter_reset(iter, cpu);
2179 } 2220 }
2180 2221
@@ -4324,7 +4365,7 @@ static int trace_panic_handler(struct notifier_block *this,
4324 unsigned long event, void *unused) 4365 unsigned long event, void *unused)
4325{ 4366{
4326 if (ftrace_dump_on_oops) 4367 if (ftrace_dump_on_oops)
4327 ftrace_dump(); 4368 ftrace_dump(ftrace_dump_on_oops);
4328 return NOTIFY_OK; 4369 return NOTIFY_OK;
4329} 4370}
4330 4371
@@ -4341,7 +4382,7 @@ static int trace_die_handler(struct notifier_block *self,
4341 switch (val) { 4382 switch (val) {
4342 case DIE_OOPS: 4383 case DIE_OOPS:
4343 if (ftrace_dump_on_oops) 4384 if (ftrace_dump_on_oops)
4344 ftrace_dump(); 4385 ftrace_dump(ftrace_dump_on_oops);
4345 break; 4386 break;
4346 default: 4387 default:
4347 break; 4388 break;
@@ -4382,7 +4423,8 @@ trace_printk_seq(struct trace_seq *s)
4382 trace_seq_init(s); 4423 trace_seq_init(s);
4383} 4424}
4384 4425
4385static void __ftrace_dump(bool disable_tracing) 4426static void
4427__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4386{ 4428{
4387 static arch_spinlock_t ftrace_dump_lock = 4429 static arch_spinlock_t ftrace_dump_lock =
4388 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 4430 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -4415,12 +4457,25 @@ static void __ftrace_dump(bool disable_tracing)
4415 /* don't look at user memory in panic mode */ 4457 /* don't look at user memory in panic mode */
4416 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 4458 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
4417 4459
4418 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4419
4420 /* Simulate the iterator */ 4460 /* Simulate the iterator */
4421 iter.tr = &global_trace; 4461 iter.tr = &global_trace;
4422 iter.trace = current_trace; 4462 iter.trace = current_trace;
4423 iter.cpu_file = TRACE_PIPE_ALL_CPU; 4463
4464 switch (oops_dump_mode) {
4465 case DUMP_ALL:
4466 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4467 break;
4468 case DUMP_ORIG:
4469 iter.cpu_file = raw_smp_processor_id();
4470 break;
4471 case DUMP_NONE:
4472 goto out_enable;
4473 default:
4474 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
4475 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4476 }
4477
4478 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4424 4479
4425 /* 4480 /*
4426 * We need to stop all tracing on all CPUS to read the 4481 * We need to stop all tracing on all CPUS to read the
@@ -4459,6 +4514,7 @@ static void __ftrace_dump(bool disable_tracing)
4459 else 4514 else
4460 printk(KERN_TRACE "---------------------------------\n"); 4515 printk(KERN_TRACE "---------------------------------\n");
4461 4516
4517 out_enable:
4462 /* Re-enable tracing if requested */ 4518 /* Re-enable tracing if requested */
4463 if (!disable_tracing) { 4519 if (!disable_tracing) {
4464 trace_flags |= old_userobj; 4520 trace_flags |= old_userobj;
@@ -4475,9 +4531,9 @@ static void __ftrace_dump(bool disable_tracing)
4475} 4531}
4476 4532
4477/* By default: disable tracing after the dump */ 4533/* By default: disable tracing after the dump */
4478void ftrace_dump(void) 4534void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4479{ 4535{
4480 __ftrace_dump(true); 4536 __ftrace_dump(true, oops_dump_mode);
4481} 4537}
4482 4538
4483__init static int tracer_alloc_buffers(void) 4539__init static int tracer_alloc_buffers(void)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3ebdb6bd2362..2cd96399463f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -364,6 +364,9 @@ void trace_function(struct trace_array *tr,
364 unsigned long ip, 364 unsigned long ip,
365 unsigned long parent_ip, 365 unsigned long parent_ip,
366 unsigned long flags, int pc); 366 unsigned long flags, int pc);
367void trace_default_header(struct seq_file *m);
368void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
369int trace_empty(struct trace_iterator *iter);
367 370
368void trace_graph_return(struct ftrace_graph_ret *trace); 371void trace_graph_return(struct ftrace_graph_ret *trace);
369int trace_graph_entry(struct ftrace_graph_ent *trace); 372int trace_graph_entry(struct ftrace_graph_ent *trace);
@@ -402,12 +405,12 @@ void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
402void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 405void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
403 int pc); 406 int pc);
404#else 407#else
405static inline void ftrace_trace_stack(struct trace_array *tr, 408static inline void ftrace_trace_stack(struct ring_buffer *buffer,
406 unsigned long flags, int skip, int pc) 409 unsigned long flags, int skip, int pc)
407{ 410{
408} 411}
409 412
410static inline void ftrace_trace_userstack(struct trace_array *tr, 413static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
411 unsigned long flags, int pc) 414 unsigned long flags, int pc)
412{ 415{
413} 416}
@@ -475,9 +478,29 @@ extern int trace_clock_id;
475 478
476/* Standard output formatting function used for function return traces */ 479/* Standard output formatting function used for function return traces */
477#ifdef CONFIG_FUNCTION_GRAPH_TRACER 480#ifdef CONFIG_FUNCTION_GRAPH_TRACER
478extern enum print_line_t print_graph_function(struct trace_iterator *iter); 481
482/* Flag options */
483#define TRACE_GRAPH_PRINT_OVERRUN 0x1
484#define TRACE_GRAPH_PRINT_CPU 0x2
485#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
486#define TRACE_GRAPH_PRINT_PROC 0x8
487#define TRACE_GRAPH_PRINT_DURATION 0x10
488#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
489
490extern enum print_line_t
491print_graph_function_flags(struct trace_iterator *iter, u32 flags);
492extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
479extern enum print_line_t 493extern enum print_line_t
480trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); 494trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
495extern void graph_trace_open(struct trace_iterator *iter);
496extern void graph_trace_close(struct trace_iterator *iter);
497extern int __trace_graph_entry(struct trace_array *tr,
498 struct ftrace_graph_ent *trace,
499 unsigned long flags, int pc);
500extern void __trace_graph_return(struct trace_array *tr,
501 struct ftrace_graph_ret *trace,
502 unsigned long flags, int pc);
503
481 504
482#ifdef CONFIG_DYNAMIC_FTRACE 505#ifdef CONFIG_DYNAMIC_FTRACE
483/* TODO: make this variable */ 506/* TODO: make this variable */
@@ -508,7 +531,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
508#endif /* CONFIG_DYNAMIC_FTRACE */ 531#endif /* CONFIG_DYNAMIC_FTRACE */
509#else /* CONFIG_FUNCTION_GRAPH_TRACER */ 532#else /* CONFIG_FUNCTION_GRAPH_TRACER */
510static inline enum print_line_t 533static inline enum print_line_t
511print_graph_function(struct trace_iterator *iter) 534print_graph_function_flags(struct trace_iterator *iter, u32 flags)
512{ 535{
513 return TRACE_TYPE_UNHANDLED; 536 return TRACE_TYPE_UNHANDLED;
514} 537}
@@ -755,12 +778,15 @@ extern void print_subsystem_event_filter(struct event_subsystem *system,
755 struct trace_seq *s); 778 struct trace_seq *s);
756extern int filter_assign_type(const char *type); 779extern int filter_assign_type(const char *type);
757 780
781struct list_head *
782trace_get_fields(struct ftrace_event_call *event_call);
783
758static inline int 784static inline int
759filter_check_discard(struct ftrace_event_call *call, void *rec, 785filter_check_discard(struct ftrace_event_call *call, void *rec,
760 struct ring_buffer *buffer, 786 struct ring_buffer *buffer,
761 struct ring_buffer_event *event) 787 struct ring_buffer_event *event)
762{ 788{
763 if (unlikely(call->filter_active) && 789 if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
764 !filter_match_preds(call->filter, rec)) { 790 !filter_match_preds(call->filter, rec)) {
765 ring_buffer_discard_commit(buffer, event); 791 ring_buffer_discard_commit(buffer, event);
766 return 1; 792 return 1;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index b9bc4d470177..8d3538b4ea5f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -143,7 +143,7 @@ static void branch_trace_reset(struct trace_array *tr)
143} 143}
144 144
145static enum print_line_t trace_branch_print(struct trace_iterator *iter, 145static enum print_line_t trace_branch_print(struct trace_iterator *iter,
146 int flags) 146 int flags, struct trace_event *event)
147{ 147{
148 struct trace_branch *field; 148 struct trace_branch *field;
149 149
@@ -167,9 +167,13 @@ static void branch_print_header(struct seq_file *s)
167 " |\n"); 167 " |\n");
168} 168}
169 169
170static struct trace_event_functions trace_branch_funcs = {
171 .trace = trace_branch_print,
172};
173
170static struct trace_event trace_branch_event = { 174static struct trace_event trace_branch_event = {
171 .type = TRACE_BRANCH, 175 .type = TRACE_BRANCH,
172 .trace = trace_branch_print, 176 .funcs = &trace_branch_funcs,
173}; 177};
174 178
175static struct tracer branch_trace __read_mostly = 179static struct tracer branch_trace __read_mostly =
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0565bb42566f..0a47e8d6b491 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -49,7 +49,12 @@ static int perf_trace_event_enable(struct ftrace_event_call *event)
49 rcu_assign_pointer(perf_trace_buf_nmi, buf); 49 rcu_assign_pointer(perf_trace_buf_nmi, buf);
50 } 50 }
51 51
52 ret = event->perf_event_enable(event); 52 if (event->class->reg)
53 ret = event->class->reg(event, TRACE_REG_PERF_REGISTER);
54 else
55 ret = tracepoint_probe_register(event->name,
56 event->class->perf_probe,
57 event);
53 if (!ret) { 58 if (!ret) {
54 total_ref_count++; 59 total_ref_count++;
55 return 0; 60 return 0;
@@ -75,7 +80,8 @@ int perf_trace_enable(int event_id)
75 80
76 mutex_lock(&event_mutex); 81 mutex_lock(&event_mutex);
77 list_for_each_entry(event, &ftrace_events, list) { 82 list_for_each_entry(event, &ftrace_events, list) {
78 if (event->id == event_id && event->perf_event_enable && 83 if (event->event.type == event_id &&
84 event->class && event->class->perf_probe &&
79 try_module_get(event->mod)) { 85 try_module_get(event->mod)) {
80 ret = perf_trace_event_enable(event); 86 ret = perf_trace_event_enable(event);
81 break; 87 break;
@@ -93,7 +99,10 @@ static void perf_trace_event_disable(struct ftrace_event_call *event)
93 if (--event->perf_refcount > 0) 99 if (--event->perf_refcount > 0)
94 return; 100 return;
95 101
96 event->perf_event_disable(event); 102 if (event->class->reg)
103 event->class->reg(event, TRACE_REG_PERF_UNREGISTER);
104 else
105 tracepoint_probe_unregister(event->name, event->class->perf_probe, event);
97 106
98 if (!--total_ref_count) { 107 if (!--total_ref_count) {
99 buf = perf_trace_buf; 108 buf = perf_trace_buf;
@@ -119,7 +128,7 @@ void perf_trace_disable(int event_id)
119 128
120 mutex_lock(&event_mutex); 129 mutex_lock(&event_mutex);
121 list_for_each_entry(event, &ftrace_events, list) { 130 list_for_each_entry(event, &ftrace_events, list) {
122 if (event->id == event_id) { 131 if (event->event.type == event_id) {
123 perf_trace_event_disable(event); 132 perf_trace_event_disable(event);
124 module_put(event->mod); 133 module_put(event->mod);
125 break; 134 break;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c697c7043349..53cffc0b0801 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -29,11 +29,23 @@ DEFINE_MUTEX(event_mutex);
29 29
30LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
31 31
32struct list_head *
33trace_get_fields(struct ftrace_event_call *event_call)
34{
35 if (!event_call->class->get_fields)
36 return &event_call->class->fields;
37 return event_call->class->get_fields(event_call);
38}
39
32int trace_define_field(struct ftrace_event_call *call, const char *type, 40int trace_define_field(struct ftrace_event_call *call, const char *type,
33 const char *name, int offset, int size, int is_signed, 41 const char *name, int offset, int size, int is_signed,
34 int filter_type) 42 int filter_type)
35{ 43{
36 struct ftrace_event_field *field; 44 struct ftrace_event_field *field;
45 struct list_head *head;
46
47 if (WARN_ON(!call->class))
48 return 0;
37 49
38 field = kzalloc(sizeof(*field), GFP_KERNEL); 50 field = kzalloc(sizeof(*field), GFP_KERNEL);
39 if (!field) 51 if (!field)
@@ -56,7 +68,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
56 field->size = size; 68 field->size = size;
57 field->is_signed = is_signed; 69 field->is_signed = is_signed;
58 70
59 list_add(&field->link, &call->fields); 71 head = trace_get_fields(call);
72 list_add(&field->link, head);
60 73
61 return 0; 74 return 0;
62 75
@@ -94,8 +107,10 @@ static int trace_define_common_fields(struct ftrace_event_call *call)
94void trace_destroy_fields(struct ftrace_event_call *call) 107void trace_destroy_fields(struct ftrace_event_call *call)
95{ 108{
96 struct ftrace_event_field *field, *next; 109 struct ftrace_event_field *field, *next;
110 struct list_head *head;
97 111
98 list_for_each_entry_safe(field, next, &call->fields, link) { 112 head = trace_get_fields(call);
113 list_for_each_entry_safe(field, next, head, link) {
99 list_del(&field->link); 114 list_del(&field->link);
100 kfree(field->type); 115 kfree(field->type);
101 kfree(field->name); 116 kfree(field->name);
@@ -107,11 +122,9 @@ int trace_event_raw_init(struct ftrace_event_call *call)
107{ 122{
108 int id; 123 int id;
109 124
110 id = register_ftrace_event(call->event); 125 id = register_ftrace_event(&call->event);
111 if (!id) 126 if (!id)
112 return -ENODEV; 127 return -ENODEV;
113 call->id = id;
114 INIT_LIST_HEAD(&call->fields);
115 128
116 return 0; 129 return 0;
117} 130}
@@ -124,23 +137,33 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
124 137
125 switch (enable) { 138 switch (enable) {
126 case 0: 139 case 0:
127 if (call->enabled) { 140 if (call->flags & TRACE_EVENT_FL_ENABLED) {
128 call->enabled = 0; 141 call->flags &= ~TRACE_EVENT_FL_ENABLED;
129 tracing_stop_cmdline_record(); 142 tracing_stop_cmdline_record();
130 call->unregfunc(call); 143 if (call->class->reg)
144 call->class->reg(call, TRACE_REG_UNREGISTER);
145 else
146 tracepoint_probe_unregister(call->name,
147 call->class->probe,
148 call);
131 } 149 }
132 break; 150 break;
133 case 1: 151 case 1:
134 if (!call->enabled) { 152 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
135 tracing_start_cmdline_record(); 153 tracing_start_cmdline_record();
136 ret = call->regfunc(call); 154 if (call->class->reg)
155 ret = call->class->reg(call, TRACE_REG_REGISTER);
156 else
157 ret = tracepoint_probe_register(call->name,
158 call->class->probe,
159 call);
137 if (ret) { 160 if (ret) {
138 tracing_stop_cmdline_record(); 161 tracing_stop_cmdline_record();
139 pr_info("event trace: Could not enable event " 162 pr_info("event trace: Could not enable event "
140 "%s\n", call->name); 163 "%s\n", call->name);
141 break; 164 break;
142 } 165 }
143 call->enabled = 1; 166 call->flags |= TRACE_EVENT_FL_ENABLED;
144 } 167 }
145 break; 168 break;
146 } 169 }
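Enabling an event now prefers the class-level reg() callback and only falls back to registering the class probe against the tracepoint directly; the same dispatch appears in the perf path earlier in this patch. A condensed sketch of the register side (the wrapper name is made up and error handling is trimmed):

/* Hypothetical wrapper illustrating the dispatch introduced above. */
static int event_register(struct ftrace_event_call *call)
{
	if (call->class->reg)
		return call->class->reg(call, TRACE_REG_REGISTER);

	return tracepoint_probe_register(call->name,
					 call->class->probe, call);
}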
@@ -171,15 +194,16 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
171 mutex_lock(&event_mutex); 194 mutex_lock(&event_mutex);
172 list_for_each_entry(call, &ftrace_events, list) { 195 list_for_each_entry(call, &ftrace_events, list) {
173 196
174 if (!call->name || !call->regfunc) 197 if (!call->name || !call->class ||
198 (!call->class->probe && !call->class->reg))
175 continue; 199 continue;
176 200
177 if (match && 201 if (match &&
178 strcmp(match, call->name) != 0 && 202 strcmp(match, call->name) != 0 &&
179 strcmp(match, call->system) != 0) 203 strcmp(match, call->class->system) != 0)
180 continue; 204 continue;
181 205
182 if (sub && strcmp(sub, call->system) != 0) 206 if (sub && strcmp(sub, call->class->system) != 0)
183 continue; 207 continue;
184 208
185 if (event && strcmp(event, call->name) != 0) 209 if (event && strcmp(event, call->name) != 0)
@@ -297,7 +321,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
297 * The ftrace subsystem is for showing formats only. 321 * The ftrace subsystem is for showing formats only.
298 * They can not be enabled or disabled via the event files. 322 * They can not be enabled or disabled via the event files.
299 */ 323 */
300 if (call->regfunc) 324 if (call->class && (call->class->probe || call->class->reg))
301 return call; 325 return call;
302 } 326 }
303 327
@@ -328,7 +352,7 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
328 (*pos)++; 352 (*pos)++;
329 353
330 list_for_each_entry_continue(call, &ftrace_events, list) { 354 list_for_each_entry_continue(call, &ftrace_events, list) {
331 if (call->enabled) 355 if (call->flags & TRACE_EVENT_FL_ENABLED)
332 return call; 356 return call;
333 } 357 }
334 358
@@ -355,8 +379,8 @@ static int t_show(struct seq_file *m, void *v)
355{ 379{
356 struct ftrace_event_call *call = v; 380 struct ftrace_event_call *call = v;
357 381
358 if (strcmp(call->system, TRACE_SYSTEM) != 0) 382 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
359 seq_printf(m, "%s:", call->system); 383 seq_printf(m, "%s:", call->class->system);
360 seq_printf(m, "%s\n", call->name); 384 seq_printf(m, "%s\n", call->name);
361 385
362 return 0; 386 return 0;
@@ -387,7 +411,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
387 struct ftrace_event_call *call = filp->private_data; 411 struct ftrace_event_call *call = filp->private_data;
388 char *buf; 412 char *buf;
389 413
390 if (call->enabled) 414 if (call->flags & TRACE_EVENT_FL_ENABLED)
391 buf = "1\n"; 415 buf = "1\n";
392 else 416 else
393 buf = "0\n"; 417 buf = "0\n";
@@ -450,10 +474,11 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
450 474
451 mutex_lock(&event_mutex); 475 mutex_lock(&event_mutex);
452 list_for_each_entry(call, &ftrace_events, list) { 476 list_for_each_entry(call, &ftrace_events, list) {
453 if (!call->name || !call->regfunc) 477 if (!call->name || !call->class ||
478 (!call->class->probe && !call->class->reg))
454 continue; 479 continue;
455 480
456 if (system && strcmp(call->system, system) != 0) 481 if (system && strcmp(call->class->system, system) != 0)
457 continue; 482 continue;
458 483
459 /* 484 /*
@@ -461,7 +486,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 461 * or if all events are cleared, or if we have 486
462 * a mixture. 487 * a mixture.
463 */ 488 */
464 set |= (1 << !!call->enabled); 489 set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED));
465 490
466 /* 491 /*
467 * If we have a mixture, no need to look further. 492 * If we have a mixture, no need to look further.
@@ -525,6 +550,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
525{ 550{
526 struct ftrace_event_call *call = filp->private_data; 551 struct ftrace_event_call *call = filp->private_data;
527 struct ftrace_event_field *field; 552 struct ftrace_event_field *field;
553 struct list_head *head;
528 struct trace_seq *s; 554 struct trace_seq *s;
529 int common_field_count = 5; 555 int common_field_count = 5;
530 char *buf; 556 char *buf;
@@ -540,10 +566,11 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
540 trace_seq_init(s); 566 trace_seq_init(s);
541 567
542 trace_seq_printf(s, "name: %s\n", call->name); 568 trace_seq_printf(s, "name: %s\n", call->name);
543 trace_seq_printf(s, "ID: %d\n", call->id); 569 trace_seq_printf(s, "ID: %d\n", call->event.type);
544 trace_seq_printf(s, "format:\n"); 570 trace_seq_printf(s, "format:\n");
545 571
546 list_for_each_entry_reverse(field, &call->fields, link) { 572 head = trace_get_fields(call);
573 list_for_each_entry_reverse(field, head, link) {
547 /* 574 /*
548 * Smartly shows the array type(except dynamic array). 575 * Smartly shows the array type(except dynamic array).
549 * Normal: 576 * Normal:
@@ -613,7 +640,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
613 return -ENOMEM; 640 return -ENOMEM;
614 641
615 trace_seq_init(s); 642 trace_seq_init(s);
616 trace_seq_printf(s, "%d\n", call->id); 643 trace_seq_printf(s, "%d\n", call->event.type);
617 644
618 r = simple_read_from_buffer(ubuf, cnt, ppos, 645 r = simple_read_from_buffer(ubuf, cnt, ppos,
619 s->buffer, s->len); 646 s->buffer, s->len);
@@ -919,14 +946,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
919 const struct file_operations *filter, 946 const struct file_operations *filter,
920 const struct file_operations *format) 947 const struct file_operations *format)
921{ 948{
949 struct list_head *head;
922 int ret; 950 int ret;
923 951
924 /* 952 /*
925 * If the trace point header did not define TRACE_SYSTEM 953 * If the trace point header did not define TRACE_SYSTEM
926 * then the system would be called "TRACE_SYSTEM". 954 * then the system would be called "TRACE_SYSTEM".
927 */ 955 */
928 if (strcmp(call->system, TRACE_SYSTEM) != 0) 956 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
929 d_events = event_subsystem_dir(call->system, d_events); 957 d_events = event_subsystem_dir(call->class->system, d_events);
930 958
931 call->dir = debugfs_create_dir(call->name, d_events); 959 call->dir = debugfs_create_dir(call->name, d_events);
932 if (!call->dir) { 960 if (!call->dir) {
@@ -935,22 +963,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
935 return -1; 963 return -1;
936 } 964 }
937 965
938 if (call->regfunc) 966 if (call->class->probe || call->class->reg)
939 trace_create_file("enable", 0644, call->dir, call, 967 trace_create_file("enable", 0644, call->dir, call,
940 enable); 968 enable);
941 969
942 if (call->id && call->perf_event_enable) 970#ifdef CONFIG_PERF_EVENTS
971 if (call->event.type && (call->class->perf_probe || call->class->reg))
943 trace_create_file("id", 0444, call->dir, call, 972 trace_create_file("id", 0444, call->dir, call,
944 id); 973 id);
974#endif
945 975
946 if (call->define_fields) { 976 if (call->class->define_fields) {
947 ret = trace_define_common_fields(call); 977 /*
948 if (!ret) 978 * Other events may have the same class. Only update
949 ret = call->define_fields(call); 979 * the fields if they are not already defined.
950 if (ret < 0) { 980 */
951 pr_warning("Could not initialize trace point" 981 head = trace_get_fields(call);
952 " events/%s\n", call->name); 982 if (list_empty(head)) {
953 return ret; 983 ret = trace_define_common_fields(call);
984 if (!ret)
985 ret = call->class->define_fields(call);
986 if (ret < 0) {
987 pr_warning("Could not initialize trace point"
988 " events/%s\n", call->name);
989 return ret;
990 }
954 } 991 }
955 trace_create_file("filter", 0644, call->dir, call, 992 trace_create_file("filter", 0644, call->dir, call,
956 filter); 993 filter);
@@ -970,8 +1007,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
970 if (!call->name) 1007 if (!call->name)
971 return -EINVAL; 1008 return -EINVAL;
972 1009
973 if (call->raw_init) { 1010 if (call->class->raw_init) {
974 ret = call->raw_init(call); 1011 ret = call->class->raw_init(call);
975 if (ret < 0) { 1012 if (ret < 0) {
976 if (ret != -ENOSYS) 1013 if (ret != -ENOSYS)
977 pr_warning("Could not initialize trace " 1014 pr_warning("Could not initialize trace "
@@ -1035,13 +1072,13 @@ static void remove_subsystem_dir(const char *name)
1035static void __trace_remove_event_call(struct ftrace_event_call *call) 1072static void __trace_remove_event_call(struct ftrace_event_call *call)
1036{ 1073{
1037 ftrace_event_enable_disable(call, 0); 1074 ftrace_event_enable_disable(call, 0);
1038 if (call->event) 1075 if (call->event.funcs)
1039 __unregister_ftrace_event(call->event); 1076 __unregister_ftrace_event(&call->event);
1040 debugfs_remove_recursive(call->dir); 1077 debugfs_remove_recursive(call->dir);
1041 list_del(&call->list); 1078 list_del(&call->list);
1042 trace_destroy_fields(call); 1079 trace_destroy_fields(call);
1043 destroy_preds(call); 1080 destroy_preds(call);
1044 remove_subsystem_dir(call->system); 1081 remove_subsystem_dir(call->class->system);
1045} 1082}
1046 1083
1047/* Remove an event_call */ 1084/* Remove an event_call */
@@ -1132,8 +1169,8 @@ static void trace_module_add_events(struct module *mod)
1132 /* The linker may leave blanks */ 1169 /* The linker may leave blanks */
1133 if (!call->name) 1170 if (!call->name)
1134 continue; 1171 continue;
1135 if (call->raw_init) { 1172 if (call->class->raw_init) {
1136 ret = call->raw_init(call); 1173 ret = call->class->raw_init(call);
1137 if (ret < 0) { 1174 if (ret < 0) {
1138 if (ret != -ENOSYS) 1175 if (ret != -ENOSYS)
1139 pr_warning("Could not initialize trace " 1176 pr_warning("Could not initialize trace "
@@ -1286,8 +1323,8 @@ static __init int event_trace_init(void)
1286 /* The linker may leave blanks */ 1323 /* The linker may leave blanks */
1287 if (!call->name) 1324 if (!call->name)
1288 continue; 1325 continue;
1289 if (call->raw_init) { 1326 if (call->class->raw_init) {
1290 ret = call->raw_init(call); 1327 ret = call->class->raw_init(call);
1291 if (ret < 0) { 1328 if (ret < 0) {
1292 if (ret != -ENOSYS) 1329 if (ret != -ENOSYS)
1293 pr_warning("Could not initialize trace " 1330 pr_warning("Could not initialize trace "
@@ -1388,8 +1425,8 @@ static __init void event_trace_self_tests(void)
1388 1425
1389 list_for_each_entry(call, &ftrace_events, list) { 1426 list_for_each_entry(call, &ftrace_events, list) {
1390 1427
1391 /* Only test those that have a regfunc */ 1428 /* Only test those that have a probe */
1392 if (!call->regfunc) 1429 if (!call->class || !call->class->probe)
1393 continue; 1430 continue;
1394 1431
1395/* 1432/*
@@ -1399,8 +1436,8 @@ static __init void event_trace_self_tests(void)
1399 * syscalls as we test. 1436 * syscalls as we test.
1400 */ 1437 */
1401#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS 1438#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
1402 if (call->system && 1439 if (call->class->system &&
1403 strcmp(call->system, "syscalls") == 0) 1440 strcmp(call->class->system, "syscalls") == 0)
1404 continue; 1441 continue;
1405#endif 1442#endif
1406 1443
@@ -1410,7 +1447,7 @@ static __init void event_trace_self_tests(void)
1410 * If an event is already enabled, someone is using 1447 * If an event is already enabled, someone is using
1411 * it and the self test should not be on. 1448 * it and the self test should not be on.
1412 */ 1449 */
1413 if (call->enabled) { 1450 if (call->flags & TRACE_EVENT_FL_ENABLED) {
1414 pr_warning("Enabled event during self test!\n"); 1451 pr_warning("Enabled event during self test!\n");
1415 WARN_ON_ONCE(1); 1452 WARN_ON_ONCE(1);
1416 continue; 1453 continue;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 58092d844a1f..57bb1bb32999 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -500,8 +500,10 @@ static struct ftrace_event_field *
500find_event_field(struct ftrace_event_call *call, char *name) 500find_event_field(struct ftrace_event_call *call, char *name)
501{ 501{
502 struct ftrace_event_field *field; 502 struct ftrace_event_field *field;
503 struct list_head *head;
503 504
504 list_for_each_entry(field, &call->fields, link) { 505 head = trace_get_fields(call);
506 list_for_each_entry(field, head, link) {
505 if (!strcmp(field->name, name)) 507 if (!strcmp(field->name, name))
506 return field; 508 return field;
507 } 509 }
@@ -545,7 +547,7 @@ static void filter_disable_preds(struct ftrace_event_call *call)
545 struct event_filter *filter = call->filter; 547 struct event_filter *filter = call->filter;
546 int i; 548 int i;
547 549
548 call->filter_active = 0; 550 call->flags &= ~TRACE_EVENT_FL_FILTERED;
549 filter->n_preds = 0; 551 filter->n_preds = 0;
550 552
551 for (i = 0; i < MAX_FILTER_PRED; i++) 553 for (i = 0; i < MAX_FILTER_PRED; i++)
@@ -572,7 +574,7 @@ void destroy_preds(struct ftrace_event_call *call)
572{ 574{
573 __free_preds(call->filter); 575 __free_preds(call->filter);
574 call->filter = NULL; 576 call->filter = NULL;
575 call->filter_active = 0; 577 call->flags &= ~TRACE_EVENT_FL_FILTERED;
576} 578}
577 579
578static struct event_filter *__alloc_preds(void) 580static struct event_filter *__alloc_preds(void)
@@ -611,7 +613,7 @@ static int init_preds(struct ftrace_event_call *call)
611 if (call->filter) 613 if (call->filter)
612 return 0; 614 return 0;
613 615
614 call->filter_active = 0; 616 call->flags &= ~TRACE_EVENT_FL_FILTERED;
615 call->filter = __alloc_preds(); 617 call->filter = __alloc_preds();
616 if (IS_ERR(call->filter)) 618 if (IS_ERR(call->filter))
617 return PTR_ERR(call->filter); 619 return PTR_ERR(call->filter);
@@ -625,10 +627,10 @@ static int init_subsystem_preds(struct event_subsystem *system)
625 int err; 627 int err;
626 628
627 list_for_each_entry(call, &ftrace_events, list) { 629 list_for_each_entry(call, &ftrace_events, list) {
628 if (!call->define_fields) 630 if (!call->class || !call->class->define_fields)
629 continue; 631 continue;
630 632
631 if (strcmp(call->system, system->name) != 0) 633 if (strcmp(call->class->system, system->name) != 0)
632 continue; 634 continue;
633 635
634 err = init_preds(call); 636 err = init_preds(call);
@@ -644,10 +646,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
644 struct ftrace_event_call *call; 646 struct ftrace_event_call *call;
645 647
646 list_for_each_entry(call, &ftrace_events, list) { 648 list_for_each_entry(call, &ftrace_events, list) {
647 if (!call->define_fields) 649 if (!call->class || !call->class->define_fields)
648 continue; 650 continue;
649 651
650 if (strcmp(call->system, system->name) != 0) 652 if (strcmp(call->class->system, system->name) != 0)
651 continue; 653 continue;
652 654
653 filter_disable_preds(call); 655 filter_disable_preds(call);
@@ -1249,10 +1251,10 @@ static int replace_system_preds(struct event_subsystem *system,
1249 list_for_each_entry(call, &ftrace_events, list) { 1251 list_for_each_entry(call, &ftrace_events, list) {
1250 struct event_filter *filter = call->filter; 1252 struct event_filter *filter = call->filter;
1251 1253
1252 if (!call->define_fields) 1254 if (!call->class || !call->class->define_fields)
1253 continue; 1255 continue;
1254 1256
1255 if (strcmp(call->system, system->name) != 0) 1257 if (strcmp(call->class->system, system->name) != 0)
1256 continue; 1258 continue;
1257 1259
1258 /* try to see if the filter can be applied */ 1260 /* try to see if the filter can be applied */
@@ -1266,7 +1268,7 @@ static int replace_system_preds(struct event_subsystem *system,
1266 if (err) 1268 if (err)
1267 filter_disable_preds(call); 1269 filter_disable_preds(call);
1268 else { 1270 else {
1269 call->filter_active = 1; 1271 call->flags |= TRACE_EVENT_FL_FILTERED;
1270 replace_filter_string(filter, filter_string); 1272 replace_filter_string(filter, filter_string);
1271 } 1273 }
1272 fail = false; 1274 fail = false;
@@ -1315,7 +1317,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1315 if (err) 1317 if (err)
1316 append_filter_err(ps, call->filter); 1318 append_filter_err(ps, call->filter);
1317 else 1319 else
1318 call->filter_active = 1; 1320 call->flags |= TRACE_EVENT_FL_FILTERED;
1319out: 1321out:
1320 filter_opstack_clear(ps); 1322 filter_opstack_clear(ps);
1321 postfix_clear(ps); 1323 postfix_clear(ps);
@@ -1393,7 +1395,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1393 mutex_lock(&event_mutex); 1395 mutex_lock(&event_mutex);
1394 1396
1395 list_for_each_entry(call, &ftrace_events, list) { 1397 list_for_each_entry(call, &ftrace_events, list) {
1396 if (call->id == event_id) 1398 if (call->event.type == event_id)
1397 break; 1399 break;
1398 } 1400 }
1399 1401
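The filter code above stops walking call->fields directly: lookups go through trace_get_fields(), the fields list hangs off the event's class, and the old filter_active field becomes the TRACE_EVENT_FL_FILTERED bit. Below is a minimal sketch of that lookup-through-a-helper shape, using a plain singly linked list and stand-in struct names rather than the kernel's list_head machinery.

/*
 * Minimal sketch, not kernel code: field lookup once the fields list
 * is reached via the event's class.  "trace_get_fields" is a stand-in
 * for the helper the diff introduces.
 */
#include <stdio.h>
#include <string.h>

struct field {
	const char	*name;
	struct field	*next;
};

struct event_class {			/* stand-in for ftrace_event_class */
	struct field	*fields;
};

struct event_call {			/* stand-in for ftrace_event_call */
	struct event_class	*class;
};

/* lookups go through a helper instead of touching call->fields directly */
static struct field *trace_get_fields(struct event_call *call)
{
	return call->class->fields;
}

static struct field *find_event_field(struct event_call *call, const char *name)
{
	struct field *f;

	for (f = trace_get_fields(call); f; f = f->next)
		if (!strcmp(f->name, name))
			return f;
	return NULL;
}

int main(void)
{
	struct field pid = { "pid", NULL };
	struct field comm = { "comm", &pid };
	struct event_class class = { &comm };
	struct event_call call = { &class };

	printf("found field: %s\n", find_event_field(&call, "pid")->name);
	return 0;
}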
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e091f64ba6ce..8536e2a65969 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -127,7 +127,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
127 127
128static int ftrace_raw_init_event(struct ftrace_event_call *call) 128static int ftrace_raw_init_event(struct ftrace_event_call *call)
129{ 129{
130 INIT_LIST_HEAD(&call->fields); 130 INIT_LIST_HEAD(&call->class->fields);
131 return 0; 131 return 0;
132} 132}
133 133
@@ -153,17 +153,21 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
153#define F_printk(fmt, args...) #fmt ", " __stringify(args) 153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154 154
155#undef FTRACE_ENTRY 155#undef FTRACE_ENTRY
156#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 156#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
157 \
158struct ftrace_event_class event_class_ftrace_##call = { \
159 .system = __stringify(TRACE_SYSTEM), \
160 .define_fields = ftrace_define_fields_##call, \
161 .raw_init = ftrace_raw_init_event, \
162}; \
157 \ 163 \
158struct ftrace_event_call __used \ 164struct ftrace_event_call __used \
159__attribute__((__aligned__(4))) \ 165__attribute__((__aligned__(4))) \
160__attribute__((section("_ftrace_events"))) event_##call = { \ 166__attribute__((section("_ftrace_events"))) event_##call = { \
161 .name = #call, \ 167 .name = #call, \
162 .id = type, \ 168 .event.type = etype, \
163 .system = __stringify(TRACE_SYSTEM), \ 169 .class = &event_class_ftrace_##call, \
164 .raw_init = ftrace_raw_init_event, \
165 .print_fmt = print, \ 170 .print_fmt = print, \
166 .define_fields = ftrace_define_fields_##call, \
167}; \ 171}; \
168 172
169#include "trace_entries.h" 173#include "trace_entries.h"
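The updated FTRACE_ENTRY macro now emits two objects per entry: a shared ftrace_event_class carrying system, raw_init and define_fields, and the ftrace_event_call that points at it, with the numeric id moved to .event.type. A toy version of that two-struct macro pattern is sketched below; the type and member names are simplified stand-ins, not the kernel definitions.

/*
 * Illustrative only: the "one class object plus one call that points
 * at it" expansion pattern, with trimmed stand-in types.
 */
#include <stdio.h>

struct toy_event_class {
	const char	*system;
};

struct toy_event {
	int		type;
};

struct toy_event_call {
	const char		*name;
	struct toy_event	event;		/* numeric id now lives in .event.type */
	struct toy_event_class	*class;		/* shared per-class data */
	const char		*print_fmt;
};

#define TOY_ENTRY(call, etype, fmt)				\
	struct toy_event_class event_class_##call = {		\
		.system	= "toy",				\
	};							\
	struct toy_event_call event_##call = {			\
		.name		= #call,			\
		.event.type	= etype,			\
		.class		= &event_class_##call,		\
		.print_fmt	= fmt,				\
	}

TOY_ENTRY(function, 1, "ip=%lx");

int main(void)
{
	printf("%s: type=%d system=%s\n", event_function.name,
	       event_function.event.type, event_function.class->system);
	return 0;
}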
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 9aed1a5cf553..79f4bac99a94 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -40,7 +40,7 @@ struct fgraph_data {
40#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 40#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
41#define TRACE_GRAPH_PRINT_PROC 0x8 41#define TRACE_GRAPH_PRINT_PROC 0x8
42#define TRACE_GRAPH_PRINT_DURATION 0x10 42#define TRACE_GRAPH_PRINT_DURATION 0x10
43#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
44 44
45static struct tracer_opt trace_opts[] = { 45static struct tracer_opt trace_opts[] = {
46 /* Display overruns? (for self-debug purpose) */ 46 /* Display overruns? (for self-debug purpose) */
@@ -179,7 +179,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
179 return ret; 179 return ret;
180} 180}
181 181
182static int __trace_graph_entry(struct trace_array *tr, 182int __trace_graph_entry(struct trace_array *tr,
183 struct ftrace_graph_ent *trace, 183 struct ftrace_graph_ent *trace,
184 unsigned long flags, 184 unsigned long flags,
185 int pc) 185 int pc)
@@ -246,7 +246,7 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
246 return trace_graph_entry(trace); 246 return trace_graph_entry(trace);
247} 247}
248 248
249static void __trace_graph_return(struct trace_array *tr, 249void __trace_graph_return(struct trace_array *tr,
250 struct ftrace_graph_ret *trace, 250 struct ftrace_graph_ret *trace,
251 unsigned long flags, 251 unsigned long flags,
252 int pc) 252 int pc)
@@ -490,9 +490,10 @@ get_return_for_leaf(struct trace_iterator *iter,
490 * We need to consume the current entry to see 490 * We need to consume the current entry to see
491 * the next one. 491 * the next one.
492 */ 492 */
493 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 493 ring_buffer_consume(iter->tr->buffer, iter->cpu,
494 NULL, NULL);
494 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 495 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
495 NULL); 496 NULL, NULL);
496 } 497 }
497 498
498 if (!event) 499 if (!event)
@@ -526,17 +527,18 @@ get_return_for_leaf(struct trace_iterator *iter,
526 527
527/* Signal a overhead of time execution to the output */ 528/* Signal a overhead of time execution to the output */
528static int 529static int
529print_graph_overhead(unsigned long long duration, struct trace_seq *s) 530print_graph_overhead(unsigned long long duration, struct trace_seq *s,
531 u32 flags)
530{ 532{
531 /* If duration disappear, we don't need anything */ 533 /* If duration disappear, we don't need anything */
532 if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) 534 if (!(flags & TRACE_GRAPH_PRINT_DURATION))
533 return 1; 535 return 1;
534 536
535 /* Non nested entry or return */ 537 /* Non nested entry or return */
536 if (duration == -1) 538 if (duration == -1)
537 return trace_seq_printf(s, " "); 539 return trace_seq_printf(s, " ");
538 540
539 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 541 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
540 /* Duration exceeded 100 msecs */ 542 /* Duration exceeded 100 msecs */
541 if (duration > 100000ULL) 543 if (duration > 100000ULL)
542 return trace_seq_printf(s, "! "); 544 return trace_seq_printf(s, "! ");
@@ -562,7 +564,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s)
562 564
563static enum print_line_t 565static enum print_line_t
564print_graph_irq(struct trace_iterator *iter, unsigned long addr, 566print_graph_irq(struct trace_iterator *iter, unsigned long addr,
565 enum trace_type type, int cpu, pid_t pid) 567 enum trace_type type, int cpu, pid_t pid, u32 flags)
566{ 568{
567 int ret; 569 int ret;
568 struct trace_seq *s = &iter->seq; 570 struct trace_seq *s = &iter->seq;
@@ -572,21 +574,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
572 return TRACE_TYPE_UNHANDLED; 574 return TRACE_TYPE_UNHANDLED;
573 575
574 /* Absolute time */ 576 /* Absolute time */
575 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 577 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
576 ret = print_graph_abs_time(iter->ts, s); 578 ret = print_graph_abs_time(iter->ts, s);
577 if (!ret) 579 if (!ret)
578 return TRACE_TYPE_PARTIAL_LINE; 580 return TRACE_TYPE_PARTIAL_LINE;
579 } 581 }
580 582
581 /* Cpu */ 583 /* Cpu */
582 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 584 if (flags & TRACE_GRAPH_PRINT_CPU) {
583 ret = print_graph_cpu(s, cpu); 585 ret = print_graph_cpu(s, cpu);
584 if (ret == TRACE_TYPE_PARTIAL_LINE) 586 if (ret == TRACE_TYPE_PARTIAL_LINE)
585 return TRACE_TYPE_PARTIAL_LINE; 587 return TRACE_TYPE_PARTIAL_LINE;
586 } 588 }
587 589
588 /* Proc */ 590 /* Proc */
589 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 591 if (flags & TRACE_GRAPH_PRINT_PROC) {
590 ret = print_graph_proc(s, pid); 592 ret = print_graph_proc(s, pid);
591 if (ret == TRACE_TYPE_PARTIAL_LINE) 593 if (ret == TRACE_TYPE_PARTIAL_LINE)
592 return TRACE_TYPE_PARTIAL_LINE; 594 return TRACE_TYPE_PARTIAL_LINE;
@@ -596,7 +598,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
596 } 598 }
597 599
598 /* No overhead */ 600 /* No overhead */
599 ret = print_graph_overhead(-1, s); 601 ret = print_graph_overhead(-1, s, flags);
600 if (!ret) 602 if (!ret)
601 return TRACE_TYPE_PARTIAL_LINE; 603 return TRACE_TYPE_PARTIAL_LINE;
602 604
@@ -609,7 +611,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
609 return TRACE_TYPE_PARTIAL_LINE; 611 return TRACE_TYPE_PARTIAL_LINE;
610 612
611 /* Don't close the duration column if haven't one */ 613 /* Don't close the duration column if haven't one */
612 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 614 if (flags & TRACE_GRAPH_PRINT_DURATION)
613 trace_seq_printf(s, " |"); 615 trace_seq_printf(s, " |");
614 ret = trace_seq_printf(s, "\n"); 616 ret = trace_seq_printf(s, "\n");
615 617
@@ -679,7 +681,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
679static enum print_line_t 681static enum print_line_t
680print_graph_entry_leaf(struct trace_iterator *iter, 682print_graph_entry_leaf(struct trace_iterator *iter,
681 struct ftrace_graph_ent_entry *entry, 683 struct ftrace_graph_ent_entry *entry,
682 struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) 684 struct ftrace_graph_ret_entry *ret_entry,
685 struct trace_seq *s, u32 flags)
683{ 686{
684 struct fgraph_data *data = iter->private; 687 struct fgraph_data *data = iter->private;
685 struct ftrace_graph_ret *graph_ret; 688 struct ftrace_graph_ret *graph_ret;
@@ -711,12 +714,12 @@ print_graph_entry_leaf(struct trace_iterator *iter,
711 } 714 }
712 715
713 /* Overhead */ 716 /* Overhead */
714 ret = print_graph_overhead(duration, s); 717 ret = print_graph_overhead(duration, s, flags);
715 if (!ret) 718 if (!ret)
716 return TRACE_TYPE_PARTIAL_LINE; 719 return TRACE_TYPE_PARTIAL_LINE;
717 720
718 /* Duration */ 721 /* Duration */
719 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 722 if (flags & TRACE_GRAPH_PRINT_DURATION) {
720 ret = print_graph_duration(duration, s); 723 ret = print_graph_duration(duration, s);
721 if (ret == TRACE_TYPE_PARTIAL_LINE) 724 if (ret == TRACE_TYPE_PARTIAL_LINE)
722 return TRACE_TYPE_PARTIAL_LINE; 725 return TRACE_TYPE_PARTIAL_LINE;
@@ -739,7 +742,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
739static enum print_line_t 742static enum print_line_t
740print_graph_entry_nested(struct trace_iterator *iter, 743print_graph_entry_nested(struct trace_iterator *iter,
741 struct ftrace_graph_ent_entry *entry, 744 struct ftrace_graph_ent_entry *entry,
742 struct trace_seq *s, int cpu) 745 struct trace_seq *s, int cpu, u32 flags)
743{ 746{
744 struct ftrace_graph_ent *call = &entry->graph_ent; 747 struct ftrace_graph_ent *call = &entry->graph_ent;
745 struct fgraph_data *data = iter->private; 748 struct fgraph_data *data = iter->private;
@@ -759,12 +762,12 @@ print_graph_entry_nested(struct trace_iterator *iter,
759 } 762 }
760 763
761 /* No overhead */ 764 /* No overhead */
762 ret = print_graph_overhead(-1, s); 765 ret = print_graph_overhead(-1, s, flags);
763 if (!ret) 766 if (!ret)
764 return TRACE_TYPE_PARTIAL_LINE; 767 return TRACE_TYPE_PARTIAL_LINE;
765 768
766 /* No time */ 769 /* No time */
767 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 770 if (flags & TRACE_GRAPH_PRINT_DURATION) {
768 ret = trace_seq_printf(s, " | "); 771 ret = trace_seq_printf(s, " | ");
769 if (!ret) 772 if (!ret)
770 return TRACE_TYPE_PARTIAL_LINE; 773 return TRACE_TYPE_PARTIAL_LINE;
@@ -790,7 +793,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
790 793
791static enum print_line_t 794static enum print_line_t
792print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, 795print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
793 int type, unsigned long addr) 796 int type, unsigned long addr, u32 flags)
794{ 797{
795 struct fgraph_data *data = iter->private; 798 struct fgraph_data *data = iter->private;
796 struct trace_entry *ent = iter->ent; 799 struct trace_entry *ent = iter->ent;
@@ -803,27 +806,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
803 806
804 if (type) { 807 if (type) {
805 /* Interrupt */ 808 /* Interrupt */
806 ret = print_graph_irq(iter, addr, type, cpu, ent->pid); 809 ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
807 if (ret == TRACE_TYPE_PARTIAL_LINE) 810 if (ret == TRACE_TYPE_PARTIAL_LINE)
808 return TRACE_TYPE_PARTIAL_LINE; 811 return TRACE_TYPE_PARTIAL_LINE;
809 } 812 }
810 813
811 /* Absolute time */ 814 /* Absolute time */
812 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 815 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
813 ret = print_graph_abs_time(iter->ts, s); 816 ret = print_graph_abs_time(iter->ts, s);
814 if (!ret) 817 if (!ret)
815 return TRACE_TYPE_PARTIAL_LINE; 818 return TRACE_TYPE_PARTIAL_LINE;
816 } 819 }
817 820
818 /* Cpu */ 821 /* Cpu */
819 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 822 if (flags & TRACE_GRAPH_PRINT_CPU) {
820 ret = print_graph_cpu(s, cpu); 823 ret = print_graph_cpu(s, cpu);
821 if (ret == TRACE_TYPE_PARTIAL_LINE) 824 if (ret == TRACE_TYPE_PARTIAL_LINE)
822 return TRACE_TYPE_PARTIAL_LINE; 825 return TRACE_TYPE_PARTIAL_LINE;
823 } 826 }
824 827
825 /* Proc */ 828 /* Proc */
826 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 829 if (flags & TRACE_GRAPH_PRINT_PROC) {
827 ret = print_graph_proc(s, ent->pid); 830 ret = print_graph_proc(s, ent->pid);
828 if (ret == TRACE_TYPE_PARTIAL_LINE) 831 if (ret == TRACE_TYPE_PARTIAL_LINE)
829 return TRACE_TYPE_PARTIAL_LINE; 832 return TRACE_TYPE_PARTIAL_LINE;
@@ -845,7 +848,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
845 848
846static enum print_line_t 849static enum print_line_t
847print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 850print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
848 struct trace_iterator *iter) 851 struct trace_iterator *iter, u32 flags)
849{ 852{
850 struct fgraph_data *data = iter->private; 853 struct fgraph_data *data = iter->private;
851 struct ftrace_graph_ent *call = &field->graph_ent; 854 struct ftrace_graph_ent *call = &field->graph_ent;
@@ -853,14 +856,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
853 static enum print_line_t ret; 856 static enum print_line_t ret;
854 int cpu = iter->cpu; 857 int cpu = iter->cpu;
855 858
856 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 859 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
857 return TRACE_TYPE_PARTIAL_LINE; 860 return TRACE_TYPE_PARTIAL_LINE;
858 861
859 leaf_ret = get_return_for_leaf(iter, field); 862 leaf_ret = get_return_for_leaf(iter, field);
860 if (leaf_ret) 863 if (leaf_ret)
861 ret = print_graph_entry_leaf(iter, field, leaf_ret, s); 864 ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
862 else 865 else
863 ret = print_graph_entry_nested(iter, field, s, cpu); 866 ret = print_graph_entry_nested(iter, field, s, cpu, flags);
864 867
865 if (data) { 868 if (data) {
866 /* 869 /*
@@ -879,7 +882,8 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
879 882
880static enum print_line_t 883static enum print_line_t
881print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, 884print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
882 struct trace_entry *ent, struct trace_iterator *iter) 885 struct trace_entry *ent, struct trace_iterator *iter,
886 u32 flags)
883{ 887{
884 unsigned long long duration = trace->rettime - trace->calltime; 888 unsigned long long duration = trace->rettime - trace->calltime;
885 struct fgraph_data *data = iter->private; 889 struct fgraph_data *data = iter->private;
@@ -909,16 +913,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
909 } 913 }
910 } 914 }
911 915
912 if (print_graph_prologue(iter, s, 0, 0)) 916 if (print_graph_prologue(iter, s, 0, 0, flags))
913 return TRACE_TYPE_PARTIAL_LINE; 917 return TRACE_TYPE_PARTIAL_LINE;
914 918
915 /* Overhead */ 919 /* Overhead */
916 ret = print_graph_overhead(duration, s); 920 ret = print_graph_overhead(duration, s, flags);
917 if (!ret) 921 if (!ret)
918 return TRACE_TYPE_PARTIAL_LINE; 922 return TRACE_TYPE_PARTIAL_LINE;
919 923
920 /* Duration */ 924 /* Duration */
921 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 925 if (flags & TRACE_GRAPH_PRINT_DURATION) {
922 ret = print_graph_duration(duration, s); 926 ret = print_graph_duration(duration, s);
923 if (ret == TRACE_TYPE_PARTIAL_LINE) 927 if (ret == TRACE_TYPE_PARTIAL_LINE)
924 return TRACE_TYPE_PARTIAL_LINE; 928 return TRACE_TYPE_PARTIAL_LINE;
@@ -948,14 +952,15 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
948 } 952 }
949 953
950 /* Overrun */ 954 /* Overrun */
951 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 955 if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
952 ret = trace_seq_printf(s, " (Overruns: %lu)\n", 956 ret = trace_seq_printf(s, " (Overruns: %lu)\n",
953 trace->overrun); 957 trace->overrun);
954 if (!ret) 958 if (!ret)
955 return TRACE_TYPE_PARTIAL_LINE; 959 return TRACE_TYPE_PARTIAL_LINE;
956 } 960 }
957 961
958 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid); 962 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
963 cpu, pid, flags);
959 if (ret == TRACE_TYPE_PARTIAL_LINE) 964 if (ret == TRACE_TYPE_PARTIAL_LINE)
960 return TRACE_TYPE_PARTIAL_LINE; 965 return TRACE_TYPE_PARTIAL_LINE;
961 966
@@ -963,8 +968,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
963} 968}
964 969
965static enum print_line_t 970static enum print_line_t
966print_graph_comment(struct trace_seq *s, struct trace_entry *ent, 971print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
967 struct trace_iterator *iter) 972 struct trace_iterator *iter, u32 flags)
968{ 973{
969 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 974 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
970 struct fgraph_data *data = iter->private; 975 struct fgraph_data *data = iter->private;
@@ -976,16 +981,16 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
976 if (data) 981 if (data)
977 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; 982 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
978 983
979 if (print_graph_prologue(iter, s, 0, 0)) 984 if (print_graph_prologue(iter, s, 0, 0, flags))
980 return TRACE_TYPE_PARTIAL_LINE; 985 return TRACE_TYPE_PARTIAL_LINE;
981 986
982 /* No overhead */ 987 /* No overhead */
983 ret = print_graph_overhead(-1, s); 988 ret = print_graph_overhead(-1, s, flags);
984 if (!ret) 989 if (!ret)
985 return TRACE_TYPE_PARTIAL_LINE; 990 return TRACE_TYPE_PARTIAL_LINE;
986 991
987 /* No time */ 992 /* No time */
988 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 993 if (flags & TRACE_GRAPH_PRINT_DURATION) {
989 ret = trace_seq_printf(s, " | "); 994 ret = trace_seq_printf(s, " | ");
990 if (!ret) 995 if (!ret)
991 return TRACE_TYPE_PARTIAL_LINE; 996 return TRACE_TYPE_PARTIAL_LINE;
@@ -1020,7 +1025,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1020 if (!event) 1025 if (!event)
1021 return TRACE_TYPE_UNHANDLED; 1026 return TRACE_TYPE_UNHANDLED;
1022 1027
1023 ret = event->trace(iter, sym_flags); 1028 ret = event->funcs->trace(iter, sym_flags, event);
1024 if (ret != TRACE_TYPE_HANDLED) 1029 if (ret != TRACE_TYPE_HANDLED)
1025 return ret; 1030 return ret;
1026 } 1031 }
@@ -1040,7 +1045,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1040 1045
1041 1046
1042enum print_line_t 1047enum print_line_t
1043print_graph_function(struct trace_iterator *iter) 1048print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1044{ 1049{
1045 struct ftrace_graph_ent_entry *field; 1050 struct ftrace_graph_ent_entry *field;
1046 struct fgraph_data *data = iter->private; 1051 struct fgraph_data *data = iter->private;
@@ -1061,7 +1066,7 @@ print_graph_function(struct trace_iterator *iter)
1061 if (data && data->failed) { 1066 if (data && data->failed) {
1062 field = &data->ent; 1067 field = &data->ent;
1063 iter->cpu = data->cpu; 1068 iter->cpu = data->cpu;
1064 ret = print_graph_entry(field, s, iter); 1069 ret = print_graph_entry(field, s, iter, flags);
1065 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { 1070 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1066 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; 1071 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1067 ret = TRACE_TYPE_NO_CONSUME; 1072 ret = TRACE_TYPE_NO_CONSUME;
@@ -1081,32 +1086,50 @@ print_graph_function(struct trace_iterator *iter)
1081 struct ftrace_graph_ent_entry saved; 1086 struct ftrace_graph_ent_entry saved;
1082 trace_assign_type(field, entry); 1087 trace_assign_type(field, entry);
1083 saved = *field; 1088 saved = *field;
1084 return print_graph_entry(&saved, s, iter); 1089 return print_graph_entry(&saved, s, iter, flags);
1085 } 1090 }
1086 case TRACE_GRAPH_RET: { 1091 case TRACE_GRAPH_RET: {
1087 struct ftrace_graph_ret_entry *field; 1092 struct ftrace_graph_ret_entry *field;
1088 trace_assign_type(field, entry); 1093 trace_assign_type(field, entry);
1089 return print_graph_return(&field->ret, s, entry, iter); 1094 return print_graph_return(&field->ret, s, entry, iter, flags);
1090 } 1095 }
1096 case TRACE_STACK:
1097 case TRACE_FN:
1098 /* dont trace stack and functions as comments */
1099 return TRACE_TYPE_UNHANDLED;
1100
1091 default: 1101 default:
1092 return print_graph_comment(s, entry, iter); 1102 return print_graph_comment(s, entry, iter, flags);
1093 } 1103 }
1094 1104
1095 return TRACE_TYPE_HANDLED; 1105 return TRACE_TYPE_HANDLED;
1096} 1106}
1097 1107
1098static void print_lat_header(struct seq_file *s) 1108static enum print_line_t
1109print_graph_function(struct trace_iterator *iter)
1110{
1111 return print_graph_function_flags(iter, tracer_flags.val);
1112}
1113
1114static enum print_line_t
1115print_graph_function_event(struct trace_iterator *iter, int flags,
1116 struct trace_event *event)
1117{
1118 return print_graph_function(iter);
1119}
1120
1121static void print_lat_header(struct seq_file *s, u32 flags)
1099{ 1122{
1100 static const char spaces[] = " " /* 16 spaces */ 1123 static const char spaces[] = " " /* 16 spaces */
1101 " " /* 4 spaces */ 1124 " " /* 4 spaces */
1102 " "; /* 17 spaces */ 1125 " "; /* 17 spaces */
1103 int size = 0; 1126 int size = 0;
1104 1127
1105 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1128 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1106 size += 16; 1129 size += 16;
1107 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1130 if (flags & TRACE_GRAPH_PRINT_CPU)
1108 size += 4; 1131 size += 4;
1109 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1132 if (flags & TRACE_GRAPH_PRINT_PROC)
1110 size += 17; 1133 size += 17;
1111 1134
1112 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); 1135 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
@@ -1117,43 +1140,48 @@ static void print_lat_header(struct seq_file *s)
1117 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1140 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1118} 1141}
1119 1142
1120static void print_graph_headers(struct seq_file *s) 1143void print_graph_headers_flags(struct seq_file *s, u32 flags)
1121{ 1144{
1122 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1145 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1123 1146
1124 if (lat) 1147 if (lat)
1125 print_lat_header(s); 1148 print_lat_header(s, flags);
1126 1149
1127 /* 1st line */ 1150 /* 1st line */
1128 seq_printf(s, "#"); 1151 seq_printf(s, "#");
1129 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1152 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1130 seq_printf(s, " TIME "); 1153 seq_printf(s, " TIME ");
1131 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1154 if (flags & TRACE_GRAPH_PRINT_CPU)
1132 seq_printf(s, " CPU"); 1155 seq_printf(s, " CPU");
1133 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1156 if (flags & TRACE_GRAPH_PRINT_PROC)
1134 seq_printf(s, " TASK/PID "); 1157 seq_printf(s, " TASK/PID ");
1135 if (lat) 1158 if (lat)
1136 seq_printf(s, "|||||"); 1159 seq_printf(s, "|||||");
1137 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1160 if (flags & TRACE_GRAPH_PRINT_DURATION)
1138 seq_printf(s, " DURATION "); 1161 seq_printf(s, " DURATION ");
1139 seq_printf(s, " FUNCTION CALLS\n"); 1162 seq_printf(s, " FUNCTION CALLS\n");
1140 1163
1141 /* 2nd line */ 1164 /* 2nd line */
1142 seq_printf(s, "#"); 1165 seq_printf(s, "#");
1143 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1166 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1144 seq_printf(s, " | "); 1167 seq_printf(s, " | ");
1145 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1168 if (flags & TRACE_GRAPH_PRINT_CPU)
1146 seq_printf(s, " | "); 1169 seq_printf(s, " | ");
1147 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1170 if (flags & TRACE_GRAPH_PRINT_PROC)
1148 seq_printf(s, " | | "); 1171 seq_printf(s, " | | ");
1149 if (lat) 1172 if (lat)
1150 seq_printf(s, "|||||"); 1173 seq_printf(s, "|||||");
1151 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1174 if (flags & TRACE_GRAPH_PRINT_DURATION)
1152 seq_printf(s, " | | "); 1175 seq_printf(s, " | | ");
1153 seq_printf(s, " | | | |\n"); 1176 seq_printf(s, " | | | |\n");
1154} 1177}
1155 1178
1156static void graph_trace_open(struct trace_iterator *iter) 1179void print_graph_headers(struct seq_file *s)
1180{
1181 print_graph_headers_flags(s, tracer_flags.val);
1182}
1183
1184void graph_trace_open(struct trace_iterator *iter)
1157{ 1185{
1158 /* pid and depth on the last trace processed */ 1186 /* pid and depth on the last trace processed */
1159 struct fgraph_data *data; 1187 struct fgraph_data *data;
@@ -1188,7 +1216,7 @@ static void graph_trace_open(struct trace_iterator *iter)
1188 pr_warning("function graph tracer: not enough memory\n"); 1216 pr_warning("function graph tracer: not enough memory\n");
1189} 1217}
1190 1218
1191static void graph_trace_close(struct trace_iterator *iter) 1219void graph_trace_close(struct trace_iterator *iter)
1192{ 1220{
1193 struct fgraph_data *data = iter->private; 1221 struct fgraph_data *data = iter->private;
1194 1222
@@ -1198,6 +1226,20 @@ static void graph_trace_close(struct trace_iterator *iter)
1198 } 1226 }
1199} 1227}
1200 1228
1229static struct trace_event_functions graph_functions = {
1230 .trace = print_graph_function_event,
1231};
1232
1233static struct trace_event graph_trace_entry_event = {
1234 .type = TRACE_GRAPH_ENT,
1235 .funcs = &graph_functions,
1236};
1237
1238static struct trace_event graph_trace_ret_event = {
1239 .type = TRACE_GRAPH_RET,
1240 .funcs = &graph_functions
1241};
1242
1201static struct tracer graph_trace __read_mostly = { 1243static struct tracer graph_trace __read_mostly = {
1202 .name = "function_graph", 1244 .name = "function_graph",
1203 .open = graph_trace_open, 1245 .open = graph_trace_open,
@@ -1219,6 +1261,16 @@ static __init int init_graph_trace(void)
1219{ 1261{
1220 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); 1262 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1221 1263
1264 if (!register_ftrace_event(&graph_trace_entry_event)) {
1265 pr_warning("Warning: could not register graph trace events\n");
1266 return 1;
1267 }
1268
1269 if (!register_ftrace_event(&graph_trace_ret_event)) {
1270 pr_warning("Warning: could not register graph trace events\n");
1271 return 1;
1272 }
1273
1222 return register_tracer(&graph_trace); 1274 return register_tracer(&graph_trace);
1223} 1275}
1224 1276
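Two changes dominate the file above: the print routines take an explicit u32 flags argument instead of reading tracer_flags.val, so other tracers can reuse them, and the graph entry/return records are exported as trace_event objects that share one trace_event_functions table and are registered with register_ftrace_event(). A sketch of that shared-funcs registration pattern follows; the structures are trimmed stand-ins, not the kernel's, and the type numbers are arbitrary.

/*
 * Sketch: several trace_event instances pointing at a single
 * trace_event_functions table, as the diff does for the graph
 * entry and return events.
 */
#include <stdio.h>

struct toy_iterator {
	const char	*seq;
};

struct toy_event;

struct toy_event_functions {
	int (*trace)(struct toy_iterator *iter, int flags,
		     struct toy_event *event);
};

struct toy_event {
	int				type;
	struct toy_event_functions	*funcs;
};

static int graph_trace_line(struct toy_iterator *iter, int flags,
			    struct toy_event *event)
{
	printf("event %d: %s (flags=%#x)\n",
	       event->type, iter->seq, (unsigned int)flags);
	return 0;
}

/* one table shared by the entry and return events, as in the diff */
static struct toy_event_functions graph_funcs = {
	.trace	= graph_trace_line,
};

static struct toy_event graph_ent = { .type = 11, .funcs = &graph_funcs };
static struct toy_event graph_ret = { .type = 12, .funcs = &graph_funcs };

int main(void)
{
	struct toy_iterator iter = { "funcgraph sample" };

	graph_ent.funcs->trace(&iter, 0x20, &graph_ent);
	graph_ret.funcs->trace(&iter, 0x20, &graph_ret);
	return 0;
}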
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2974bc7538c7..6fd486e0cef4 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -34,6 +34,9 @@ static int trace_type __read_mostly;
34 34
35static int save_lat_flag; 35static int save_lat_flag;
36 36
37static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
38static int start_irqsoff_tracer(struct trace_array *tr, int graph);
39
37#ifdef CONFIG_PREEMPT_TRACER 40#ifdef CONFIG_PREEMPT_TRACER
38static inline int 41static inline int
39preempt_trace(void) 42preempt_trace(void)
@@ -55,6 +58,23 @@ irq_trace(void)
55# define irq_trace() (0) 58# define irq_trace() (0)
56#endif 59#endif
57 60
61#define TRACE_DISPLAY_GRAPH 1
62
63static struct tracer_opt trace_opts[] = {
64#ifdef CONFIG_FUNCTION_GRAPH_TRACER
65 /* display latency trace as call graph */
66 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
67#endif
68 { } /* Empty entry */
69};
70
71static struct tracer_flags tracer_flags = {
72 .val = 0,
73 .opts = trace_opts,
74};
75
76#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
77
58/* 78/*
59 * Sequence count - we record it when starting a measurement and 79 * Sequence count - we record it when starting a measurement and
60 * skip the latency if the sequence has changed - some other section 80 * skip the latency if the sequence has changed - some other section
@@ -108,6 +128,202 @@ static struct ftrace_ops trace_ops __read_mostly =
108}; 128};
109#endif /* CONFIG_FUNCTION_TRACER */ 129#endif /* CONFIG_FUNCTION_TRACER */
110 130
131#ifdef CONFIG_FUNCTION_GRAPH_TRACER
132static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
133{
134 int cpu;
135
136 if (!(bit & TRACE_DISPLAY_GRAPH))
137 return -EINVAL;
138
139 if (!(is_graph() ^ set))
140 return 0;
141
142 stop_irqsoff_tracer(irqsoff_trace, !set);
143
144 for_each_possible_cpu(cpu)
145 per_cpu(tracing_cpu, cpu) = 0;
146
147 tracing_max_latency = 0;
148 tracing_reset_online_cpus(irqsoff_trace);
149
150 return start_irqsoff_tracer(irqsoff_trace, set);
151}
152
153static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
154{
155 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data;
157 unsigned long flags;
158 long disabled;
159 int ret;
160 int cpu;
161 int pc;
162
163 cpu = raw_smp_processor_id();
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0;
166
167 local_save_flags(flags);
168 /* slight chance to get a false positive on tracing_cpu */
169 if (!irqs_disabled_flags(flags))
170 return 0;
171
172 data = tr->data[cpu];
173 disabled = atomic_inc_return(&data->disabled);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled);
182 return ret;
183}
184
185static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
186{
187 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data;
189 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc;
193
194 cpu = raw_smp_processor_id();
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return;
197
198 local_save_flags(flags);
199 /* slight chance to get a false positive on tracing_cpu */
200 if (!irqs_disabled_flags(flags))
201 return;
202
203 data = tr->data[cpu];
204 disabled = atomic_inc_return(&data->disabled);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled);
212}
213
214static void irqsoff_trace_open(struct trace_iterator *iter)
215{
216 if (is_graph())
217 graph_trace_open(iter);
218
219}
220
221static void irqsoff_trace_close(struct trace_iterator *iter)
222{
223 if (iter->private)
224 graph_trace_close(iter);
225}
226
227#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
228 TRACE_GRAPH_PRINT_PROC)
229
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /*
240 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler
242 */
243 if (is_graph())
244 return print_graph_function_flags(iter, flags);
245
246 return TRACE_TYPE_UNHANDLED;
247}
248
249static void irqsoff_print_header(struct seq_file *s)
250{
251 if (is_graph()) {
252 struct trace_iterator *iter = s->private;
253 u32 flags = GRAPH_TRACER_FLAGS;
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s);
268}
269
270static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc)
294{
295 if (!is_graph())
296 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301}
302
303#else
304#define __trace_function trace_function
305
306static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
307{
308 return -EINVAL;
309}
310
311static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
312{
313 return -1;
314}
315
316static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
317{
318 return TRACE_TYPE_UNHANDLED;
319}
320
321static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
322static void irqsoff_print_header(struct seq_file *s) { }
323static void irqsoff_trace_open(struct trace_iterator *iter) { }
324static void irqsoff_trace_close(struct trace_iterator *iter) { }
325#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
326
111/* 327/*
112 * Should this new latency be reported/recorded? 328 * Should this new latency be reported/recorded?
113 */ 329 */
@@ -150,7 +366,7 @@ check_critical_timing(struct trace_array *tr,
150 if (!report_latency(delta)) 366 if (!report_latency(delta))
151 goto out_unlock; 367 goto out_unlock;
152 368
153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 369 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
154 /* Skip 5 functions to get to the irq/preempt enable function */ 370 /* Skip 5 functions to get to the irq/preempt enable function */
155 __trace_stack(tr, flags, 5, pc); 371 __trace_stack(tr, flags, 5, pc);
156 372
@@ -172,7 +388,7 @@ out_unlock:
172out: 388out:
173 data->critical_sequence = max_sequence; 389 data->critical_sequence = max_sequence;
174 data->preempt_timestamp = ftrace_now(cpu); 390 data->preempt_timestamp = ftrace_now(cpu);
175 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 391 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
176} 392}
177 393
178static inline void 394static inline void
@@ -204,7 +420,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
204 420
205 local_save_flags(flags); 421 local_save_flags(flags);
206 422
207 trace_function(tr, ip, parent_ip, flags, preempt_count()); 423 __trace_function(tr, ip, parent_ip, flags, preempt_count());
208 424
209 per_cpu(tracing_cpu, cpu) = 1; 425 per_cpu(tracing_cpu, cpu) = 1;
210 426
@@ -238,7 +454,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
238 atomic_inc(&data->disabled); 454 atomic_inc(&data->disabled);
239 455
240 local_save_flags(flags); 456 local_save_flags(flags);
241 trace_function(tr, ip, parent_ip, flags, preempt_count()); 457 __trace_function(tr, ip, parent_ip, flags, preempt_count());
242 check_critical_timing(tr, data, parent_ip ? : ip, cpu); 458 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
243 data->critical_start = 0; 459 data->critical_start = 0;
244 atomic_dec(&data->disabled); 460 atomic_dec(&data->disabled);
@@ -347,19 +563,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
347} 563}
348#endif /* CONFIG_PREEMPT_TRACER */ 564#endif /* CONFIG_PREEMPT_TRACER */
349 565
350static void start_irqsoff_tracer(struct trace_array *tr) 566static int start_irqsoff_tracer(struct trace_array *tr, int graph)
351{ 567{
352 register_ftrace_function(&trace_ops); 568 int ret = 0;
353 if (tracing_is_enabled()) 569
570 if (!graph)
571 ret = register_ftrace_function(&trace_ops);
572 else
573 ret = register_ftrace_graph(&irqsoff_graph_return,
574 &irqsoff_graph_entry);
575
576 if (!ret && tracing_is_enabled())
354 tracer_enabled = 1; 577 tracer_enabled = 1;
355 else 578 else
356 tracer_enabled = 0; 579 tracer_enabled = 0;
580
581 return ret;
357} 582}
358 583
359static void stop_irqsoff_tracer(struct trace_array *tr) 584static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
360{ 585{
361 tracer_enabled = 0; 586 tracer_enabled = 0;
362 unregister_ftrace_function(&trace_ops); 587
588 if (!graph)
589 unregister_ftrace_function(&trace_ops);
590 else
591 unregister_ftrace_graph();
363} 592}
364 593
365static void __irqsoff_tracer_init(struct trace_array *tr) 594static void __irqsoff_tracer_init(struct trace_array *tr)
@@ -372,12 +601,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
372 /* make sure that the tracer is visible */ 601 /* make sure that the tracer is visible */
373 smp_wmb(); 602 smp_wmb();
374 tracing_reset_online_cpus(tr); 603 tracing_reset_online_cpus(tr);
375 start_irqsoff_tracer(tr); 604
605 if (start_irqsoff_tracer(tr, is_graph()))
606 printk(KERN_ERR "failed to start irqsoff tracer\n");
376} 607}
377 608
378static void irqsoff_tracer_reset(struct trace_array *tr) 609static void irqsoff_tracer_reset(struct trace_array *tr)
379{ 610{
380 stop_irqsoff_tracer(tr); 611 stop_irqsoff_tracer(tr, is_graph());
381 612
382 if (!save_lat_flag) 613 if (!save_lat_flag)
383 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 614 trace_flags &= ~TRACE_ITER_LATENCY_FMT;
@@ -409,9 +640,15 @@ static struct tracer irqsoff_tracer __read_mostly =
409 .start = irqsoff_tracer_start, 640 .start = irqsoff_tracer_start,
410 .stop = irqsoff_tracer_stop, 641 .stop = irqsoff_tracer_stop,
411 .print_max = 1, 642 .print_max = 1,
643 .print_header = irqsoff_print_header,
644 .print_line = irqsoff_print_line,
645 .flags = &tracer_flags,
646 .set_flag = irqsoff_set_flag,
412#ifdef CONFIG_FTRACE_SELFTEST 647#ifdef CONFIG_FTRACE_SELFTEST
413 .selftest = trace_selftest_startup_irqsoff, 648 .selftest = trace_selftest_startup_irqsoff,
414#endif 649#endif
650 .open = irqsoff_trace_open,
651 .close = irqsoff_trace_close,
415}; 652};
416# define register_irqsoff(trace) register_tracer(&trace) 653# define register_irqsoff(trace) register_tracer(&trace)
417#else 654#else
@@ -435,9 +672,15 @@ static struct tracer preemptoff_tracer __read_mostly =
435 .start = irqsoff_tracer_start, 672 .start = irqsoff_tracer_start,
436 .stop = irqsoff_tracer_stop, 673 .stop = irqsoff_tracer_stop,
437 .print_max = 1, 674 .print_max = 1,
675 .print_header = irqsoff_print_header,
676 .print_line = irqsoff_print_line,
677 .flags = &tracer_flags,
678 .set_flag = irqsoff_set_flag,
438#ifdef CONFIG_FTRACE_SELFTEST 679#ifdef CONFIG_FTRACE_SELFTEST
439 .selftest = trace_selftest_startup_preemptoff, 680 .selftest = trace_selftest_startup_preemptoff,
440#endif 681#endif
682 .open = irqsoff_trace_open,
683 .close = irqsoff_trace_close,
441}; 684};
442# define register_preemptoff(trace) register_tracer(&trace) 685# define register_preemptoff(trace) register_tracer(&trace)
443#else 686#else
@@ -463,9 +706,15 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
463 .start = irqsoff_tracer_start, 706 .start = irqsoff_tracer_start,
464 .stop = irqsoff_tracer_stop, 707 .stop = irqsoff_tracer_stop,
465 .print_max = 1, 708 .print_max = 1,
709 .print_header = irqsoff_print_header,
710 .print_line = irqsoff_print_line,
711 .flags = &tracer_flags,
712 .set_flag = irqsoff_set_flag,
466#ifdef CONFIG_FTRACE_SELFTEST 713#ifdef CONFIG_FTRACE_SELFTEST
467 .selftest = trace_selftest_startup_preemptirqsoff, 714 .selftest = trace_selftest_startup_preemptirqsoff,
468#endif 715#endif
716 .open = irqsoff_trace_open,
717 .close = irqsoff_trace_close,
469}; 718};
470 719
471# define register_preemptirqsoff(trace) register_tracer(&trace) 720# define register_preemptirqsoff(trace) register_tracer(&trace)
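The irqsoff tracer above gains a display-graph option; __trace_function() dispatches on it, recording a plain function event in flat mode and a synthetic entry/return pair with calltime == rettime in graph mode. A minimal sketch of that dispatch follows, with every kernel type and call replaced by a stand-in.

/*
 * Minimal sketch of the __trace_function() dispatch: with the
 * display-graph option on, a function hit becomes a zero-duration
 * entry/return pair; otherwise the ordinary function event is used.
 */
#include <stdio.h>

static int graph_mode;				/* stand-in for is_graph() */

static void trace_function(unsigned long ip, unsigned long parent_ip)
{
	printf("fn  ip=%#lx parent=%#lx\n", ip, parent_ip);
}

static void trace_graph_function(unsigned long ip)
{
	/* calltime == rettime, so the pair renders as a zero-duration leaf */
	printf("ent ip=%#lx\n", ip);
	printf("ret ip=%#lx duration=0\n", ip);
}

static void __trace_function(unsigned long ip, unsigned long parent_ip)
{
	if (!graph_mode) {
		trace_function(ip, parent_ip);
	} else {
		trace_graph_function(parent_ip);
		trace_graph_function(ip);
	}
}

int main(void)
{
	__trace_function(0x1000, 0x2000);	/* flat function output */
	graph_mode = 1;
	__trace_function(0x1000, 0x2000);	/* graph-style output */
	return 0;
}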
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a7514326052b..9a082bba9537 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -324,8 +324,8 @@ struct trace_probe {
324 unsigned long nhit; 324 unsigned long nhit;
325 unsigned int flags; /* For TP_FLAG_* */ 325 unsigned int flags; /* For TP_FLAG_* */
326 const char *symbol; /* symbol name */ 326 const char *symbol; /* symbol name */
327 struct ftrace_event_class class;
327 struct ftrace_event_call call; 328 struct ftrace_event_call call;
328 struct trace_event event;
329 ssize_t size; /* trace entry size */ 329 ssize_t size; /* trace entry size */
330 unsigned int nr_args; 330 unsigned int nr_args;
331 struct probe_arg args[]; 331 struct probe_arg args[];
@@ -404,6 +404,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
404 goto error; 404 goto error;
405 } 405 }
406 406
407 tp->call.class = &tp->class;
407 tp->call.name = kstrdup(event, GFP_KERNEL); 408 tp->call.name = kstrdup(event, GFP_KERNEL);
408 if (!tp->call.name) 409 if (!tp->call.name)
409 goto error; 410 goto error;
@@ -413,8 +414,8 @@ static struct trace_probe *alloc_trace_probe(const char *group,
413 goto error; 414 goto error;
414 } 415 }
415 416
416 tp->call.system = kstrdup(group, GFP_KERNEL); 417 tp->class.system = kstrdup(group, GFP_KERNEL);
417 if (!tp->call.system) 418 if (!tp->class.system)
418 goto error; 419 goto error;
419 420
420 INIT_LIST_HEAD(&tp->list); 421 INIT_LIST_HEAD(&tp->list);
@@ -443,7 +444,7 @@ static void free_trace_probe(struct trace_probe *tp)
443 for (i = 0; i < tp->nr_args; i++) 444 for (i = 0; i < tp->nr_args; i++)
444 free_probe_arg(&tp->args[i]); 445 free_probe_arg(&tp->args[i]);
445 446
446 kfree(tp->call.system); 447 kfree(tp->call.class->system);
447 kfree(tp->call.name); 448 kfree(tp->call.name);
448 kfree(tp->symbol); 449 kfree(tp->symbol);
449 kfree(tp); 450 kfree(tp);
@@ -456,7 +457,7 @@ static struct trace_probe *find_probe_event(const char *event,
456 457
457 list_for_each_entry(tp, &probe_list, list) 458 list_for_each_entry(tp, &probe_list, list)
458 if (strcmp(tp->call.name, event) == 0 && 459 if (strcmp(tp->call.name, event) == 0 &&
459 strcmp(tp->call.system, group) == 0) 460 strcmp(tp->call.class->system, group) == 0)
460 return tp; 461 return tp;
461 return NULL; 462 return NULL;
462} 463}
@@ -481,7 +482,7 @@ static int register_trace_probe(struct trace_probe *tp)
481 mutex_lock(&probe_lock); 482 mutex_lock(&probe_lock);
482 483
483 /* register as an event */ 484 /* register as an event */
484 old_tp = find_probe_event(tp->call.name, tp->call.system); 485 old_tp = find_probe_event(tp->call.name, tp->call.class->system);
485 if (old_tp) { 486 if (old_tp) {
486 /* delete old event */ 487 /* delete old event */
487 unregister_trace_probe(old_tp); 488 unregister_trace_probe(old_tp);
@@ -904,7 +905,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
904 int i; 905 int i;
905 906
906 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 907 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
907 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); 908 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
908 909
909 if (!tp->symbol) 910 if (!tp->symbol)
910 seq_printf(m, " 0x%p", tp->rp.kp.addr); 911 seq_printf(m, " 0x%p", tp->rp.kp.addr);
@@ -1061,8 +1062,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1061 1062
1062 size = sizeof(*entry) + tp->size; 1063 size = sizeof(*entry) + tp->size;
1063 1064
1064 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1065 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1065 irq_flags, pc); 1066 size, irq_flags, pc);
1066 if (!event) 1067 if (!event)
1067 return; 1068 return;
1068 1069
@@ -1094,8 +1095,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1094 1095
1095 size = sizeof(*entry) + tp->size; 1096 size = sizeof(*entry) + tp->size;
1096 1097
1097 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1098 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1098 irq_flags, pc); 1099 size, irq_flags, pc);
1099 if (!event) 1100 if (!event)
1100 return; 1101 return;
1101 1102
@@ -1112,18 +1113,17 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1112 1113
1113/* Event entry printers */ 1114/* Event entry printers */
1114enum print_line_t 1115enum print_line_t
1115print_kprobe_event(struct trace_iterator *iter, int flags) 1116print_kprobe_event(struct trace_iterator *iter, int flags,
1117 struct trace_event *event)
1116{ 1118{
1117 struct kprobe_trace_entry_head *field; 1119 struct kprobe_trace_entry_head *field;
1118 struct trace_seq *s = &iter->seq; 1120 struct trace_seq *s = &iter->seq;
1119 struct trace_event *event;
1120 struct trace_probe *tp; 1121 struct trace_probe *tp;
1121 u8 *data; 1122 u8 *data;
1122 int i; 1123 int i;
1123 1124
1124 field = (struct kprobe_trace_entry_head *)iter->ent; 1125 field = (struct kprobe_trace_entry_head *)iter->ent;
1125 event = ftrace_find_event(field->ent.type); 1126 tp = container_of(event, struct trace_probe, call.event);
1126 tp = container_of(event, struct trace_probe, event);
1127 1127
1128 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1128 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1129 goto partial; 1129 goto partial;
@@ -1149,18 +1149,17 @@ partial:
1149} 1149}
1150 1150
1151enum print_line_t 1151enum print_line_t
1152print_kretprobe_event(struct trace_iterator *iter, int flags) 1152print_kretprobe_event(struct trace_iterator *iter, int flags,
1153 struct trace_event *event)
1153{ 1154{
1154 struct kretprobe_trace_entry_head *field; 1155 struct kretprobe_trace_entry_head *field;
1155 struct trace_seq *s = &iter->seq; 1156 struct trace_seq *s = &iter->seq;
1156 struct trace_event *event;
1157 struct trace_probe *tp; 1157 struct trace_probe *tp;
1158 u8 *data; 1158 u8 *data;
1159 int i; 1159 int i;
1160 1160
1161 field = (struct kretprobe_trace_entry_head *)iter->ent; 1161 field = (struct kretprobe_trace_entry_head *)iter->ent;
1162 event = ftrace_find_event(field->ent.type); 1162 tp = container_of(event, struct trace_probe, call.event);
1163 tp = container_of(event, struct trace_probe, event);
1164 1163
1165 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1164 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1166 goto partial; 1165 goto partial;
@@ -1217,8 +1216,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
1217 1216
1218static int probe_event_raw_init(struct ftrace_event_call *event_call) 1217static int probe_event_raw_init(struct ftrace_event_call *event_call)
1219{ 1218{
1220 INIT_LIST_HEAD(&event_call->fields);
1221
1222 return 0; 1219 return 0;
1223} 1220}
1224 1221
@@ -1353,7 +1350,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1353 "profile buffer not large enough")) 1350 "profile buffer not large enough"))
1354 return; 1351 return;
1355 1352
1356 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); 1353 entry = perf_trace_buf_prepare(size, call->event.type,
1354 &rctx, &irq_flags);
1357 if (!entry) 1355 if (!entry)
1358 return; 1356 return;
1359 1357
@@ -1384,7 +1382,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1384 "profile buffer not large enough")) 1382 "profile buffer not large enough"))
1385 return; 1383 return;
1386 1384
1387 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); 1385 entry = perf_trace_buf_prepare(size, call->event.type,
1386 &rctx, &irq_flags);
1388 if (!entry) 1387 if (!entry)
1389 return; 1388 return;
1390 1389
@@ -1425,6 +1424,26 @@ static void probe_perf_disable(struct ftrace_event_call *call)
1425} 1424}
1426#endif /* CONFIG_PERF_EVENTS */ 1425#endif /* CONFIG_PERF_EVENTS */
1427 1426
1427static __kprobes
1428int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1429{
1430 switch (type) {
1431 case TRACE_REG_REGISTER:
1432 return probe_event_enable(event);
1433 case TRACE_REG_UNREGISTER:
1434 probe_event_disable(event);
1435 return 0;
1436
1437#ifdef CONFIG_PERF_EVENTS
1438 case TRACE_REG_PERF_REGISTER:
1439 return probe_perf_enable(event);
1440 case TRACE_REG_PERF_UNREGISTER:
1441 probe_perf_disable(event);
1442 return 0;
1443#endif
1444 }
1445 return 0;
1446}
1428 1447
1429static __kprobes 1448static __kprobes
1430int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) 1449int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
@@ -1454,6 +1473,14 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1454 return 0; /* We don't tweek kernel, so just return 0 */ 1473 return 0; /* We don't tweek kernel, so just return 0 */
1455} 1474}
1456 1475
1476static struct trace_event_functions kretprobe_funcs = {
1477 .trace = print_kretprobe_event
1478};
1479
1480static struct trace_event_functions kprobe_funcs = {
1481 .trace = print_kprobe_event
1482};
1483
1457static int register_probe_event(struct trace_probe *tp) 1484static int register_probe_event(struct trace_probe *tp)
1458{ 1485{
1459 struct ftrace_event_call *call = &tp->call; 1486 struct ftrace_event_call *call = &tp->call;
@@ -1461,36 +1488,31 @@ static int register_probe_event(struct trace_probe *tp)
1461 1488
1462 /* Initialize ftrace_event_call */ 1489 /* Initialize ftrace_event_call */
1463 if (probe_is_return(tp)) { 1490 if (probe_is_return(tp)) {
1464 tp->event.trace = print_kretprobe_event; 1491 INIT_LIST_HEAD(&call->class->fields);
1465 call->raw_init = probe_event_raw_init; 1492 call->event.funcs = &kretprobe_funcs;
1466 call->define_fields = kretprobe_event_define_fields; 1493 call->class->raw_init = probe_event_raw_init;
1494 call->class->define_fields = kretprobe_event_define_fields;
1467 } else { 1495 } else {
1468 tp->event.trace = print_kprobe_event; 1496 INIT_LIST_HEAD(&call->class->fields);
1469 call->raw_init = probe_event_raw_init; 1497 call->event.funcs = &kprobe_funcs;
1470 call->define_fields = kprobe_event_define_fields; 1498 call->class->raw_init = probe_event_raw_init;
1499 call->class->define_fields = kprobe_event_define_fields;
1471 } 1500 }
1472 if (set_print_fmt(tp) < 0) 1501 if (set_print_fmt(tp) < 0)
1473 return -ENOMEM; 1502 return -ENOMEM;
1474 call->event = &tp->event; 1503 ret = register_ftrace_event(&call->event);
1475 call->id = register_ftrace_event(&tp->event); 1504 if (!ret) {
1476 if (!call->id) {
1477 kfree(call->print_fmt); 1505 kfree(call->print_fmt);
1478 return -ENODEV; 1506 return -ENODEV;
1479 } 1507 }
1480 call->enabled = 0; 1508 call->flags = 0;
1481 call->regfunc = probe_event_enable; 1509 call->class->reg = kprobe_register;
1482 call->unregfunc = probe_event_disable;
1483
1484#ifdef CONFIG_PERF_EVENTS
1485 call->perf_event_enable = probe_perf_enable;
1486 call->perf_event_disable = probe_perf_disable;
1487#endif
1488 call->data = tp; 1510 call->data = tp;
1489 ret = trace_add_event_call(call); 1511 ret = trace_add_event_call(call);
1490 if (ret) { 1512 if (ret) {
1491 pr_info("Failed to register kprobe event: %s\n", call->name); 1513 pr_info("Failed to register kprobe event: %s\n", call->name);
1492 kfree(call->print_fmt); 1514 kfree(call->print_fmt);
1493 unregister_ftrace_event(&tp->event); 1515 unregister_ftrace_event(&call->event);
1494 } 1516 }
1495 return ret; 1517 return ret;
1496} 1518}
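With struct trace_probe now embedding an ftrace_event_class and the trace_event living inside the call, the printers above receive the trace_event pointer directly and recover the probe with container_of(event, struct trace_probe, call.event), while registration funnels through the single kprobe_register() callback. Below is a sketch of that container_of step; the struct layouts are illustrative stand-ins, not the kernel's definitions.

/*
 * Sketch: the print callback is handed the embedded trace_event and
 * walks back to the enclosing probe via container_of.
 */
#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct toy_event {
	int	type;
};

struct toy_event_call {
	const char		*name;
	struct toy_event	event;		/* embedded, as in ftrace_event_call */
};

struct toy_probe {
	const char		*symbol;
	struct toy_event_call	call;		/* the probe embeds its event call */
};

static void print_probe_event(struct toy_event *event)
{
	/* naming the nested member folds the two hops into one */
	struct toy_probe *tp = container_of(event, struct toy_probe, call.event);

	printf("%s: (%s)\n", tp->call.name, tp->symbol);
}

int main(void)
{
	struct toy_probe tp = {
		.symbol	= "do_sys_open",
		.call	= { .name = "myprobe", .event = { .type = 42 } },
	};

	print_probe_event(&tp.call.event);
	return 0;
}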
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8e46b3323cdc..fc9d4dbb089e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -253,7 +253,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
253 void *ret; 253 void *ret;
254 254
255 if (s->full) 255 if (s->full)
256 return 0; 256 return NULL;
257 257
258 if (len > ((PAGE_SIZE - 1) - s->len)) { 258 if (len > ((PAGE_SIZE - 1) - s->len)) {
259 s->full = 1; 259 s->full = 1;
@@ -726,6 +726,9 @@ int register_ftrace_event(struct trace_event *event)
726 if (WARN_ON(!event)) 726 if (WARN_ON(!event))
727 goto out; 727 goto out;
728 728
729 if (WARN_ON(!event->funcs))
730 goto out;
731
729 INIT_LIST_HEAD(&event->list); 732 INIT_LIST_HEAD(&event->list);
730 733
731 if (!event->type) { 734 if (!event->type) {
@@ -758,14 +761,14 @@ int register_ftrace_event(struct trace_event *event)
758 goto out; 761 goto out;
759 } 762 }
760 763
761 if (event->trace == NULL) 764 if (event->funcs->trace == NULL)
762 event->trace = trace_nop_print; 765 event->funcs->trace = trace_nop_print;
763 if (event->raw == NULL) 766 if (event->funcs->raw == NULL)
764 event->raw = trace_nop_print; 767 event->funcs->raw = trace_nop_print;
765 if (event->hex == NULL) 768 if (event->funcs->hex == NULL)
766 event->hex = trace_nop_print; 769 event->funcs->hex = trace_nop_print;
767 if (event->binary == NULL) 770 if (event->funcs->binary == NULL)
768 event->binary = trace_nop_print; 771 event->funcs->binary = trace_nop_print;
769 772
770 key = event->type & (EVENT_HASHSIZE - 1); 773 key = event->type & (EVENT_HASHSIZE - 1);
771 774
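In the hunk just above, register_ftrace_event() now refuses an event without a funcs table and fills any callback left NULL with trace_nop_print. A compact sketch of that defaulting step follows; the types are trimmed stand-ins for the kernel's.

/*
 * Sketch: refuse registration without a funcs table, then default any
 * NULL callback to a no-op printer.
 */
#include <stdio.h>

struct toy_event;

typedef int (*print_fn)(struct toy_event *event);

struct toy_event_functions {
	print_fn	trace, raw, hex, binary;
};

struct toy_event {
	int				type;
	struct toy_event_functions	*funcs;
};

static int toy_nop_print(struct toy_event *event)
{
	(void)event;
	return 0;				/* handled, prints nothing */
}

static int toy_register_event(struct toy_event *event)
{
	if (!event->funcs)
		return 0;			/* refuse, as the WARN_ON path does */

	if (!event->funcs->trace)
		event->funcs->trace = toy_nop_print;
	if (!event->funcs->raw)
		event->funcs->raw = toy_nop_print;
	if (!event->funcs->hex)
		event->funcs->hex = toy_nop_print;
	if (!event->funcs->binary)
		event->funcs->binary = toy_nop_print;

	return event->type;
}

int main(void)
{
	static struct toy_event_functions funcs;	/* all callbacks NULL */
	struct toy_event ev = { .type = 42, .funcs = &funcs };

	printf("registered type %d\n", toy_register_event(&ev));
	printf("trace() returned %d\n", ev.funcs->trace(&ev));
	return 0;
}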
@@ -807,13 +810,15 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
807 * Standard events 810 * Standard events
808 */ 811 */
809 812
810enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) 813enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
814 struct trace_event *event)
811{ 815{
812 return TRACE_TYPE_HANDLED; 816 return TRACE_TYPE_HANDLED;
813} 817}
814 818
815/* TRACE_FN */ 819/* TRACE_FN */
816static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) 820static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
821 struct trace_event *event)
817{ 822{
818 struct ftrace_entry *field; 823 struct ftrace_entry *field;
819 struct trace_seq *s = &iter->seq; 824 struct trace_seq *s = &iter->seq;
@@ -840,7 +845,8 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
840 return TRACE_TYPE_PARTIAL_LINE; 845 return TRACE_TYPE_PARTIAL_LINE;
841} 846}
842 847
843static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) 848static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
849 struct trace_event *event)
844{ 850{
845 struct ftrace_entry *field; 851 struct ftrace_entry *field;
846 852
@@ -854,7 +860,8 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
854 return TRACE_TYPE_HANDLED; 860 return TRACE_TYPE_HANDLED;
855} 861}
856 862
857static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) 863static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
864 struct trace_event *event)
858{ 865{
859 struct ftrace_entry *field; 866 struct ftrace_entry *field;
860 struct trace_seq *s = &iter->seq; 867 struct trace_seq *s = &iter->seq;
@@ -867,7 +874,8 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
867 return TRACE_TYPE_HANDLED; 874 return TRACE_TYPE_HANDLED;
868} 875}
869 876
870static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) 877static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
878 struct trace_event *event)
871{ 879{
872 struct ftrace_entry *field; 880 struct ftrace_entry *field;
873 struct trace_seq *s = &iter->seq; 881 struct trace_seq *s = &iter->seq;
@@ -880,14 +888,18 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
880 return TRACE_TYPE_HANDLED; 888 return TRACE_TYPE_HANDLED;
881} 889}
882 890
883static struct trace_event trace_fn_event = { 891static struct trace_event_functions trace_fn_funcs = {
884 .type = TRACE_FN,
885 .trace = trace_fn_trace, 892 .trace = trace_fn_trace,
886 .raw = trace_fn_raw, 893 .raw = trace_fn_raw,
887 .hex = trace_fn_hex, 894 .hex = trace_fn_hex,
888 .binary = trace_fn_bin, 895 .binary = trace_fn_bin,
889}; 896};
890 897
898static struct trace_event trace_fn_event = {
899 .type = TRACE_FN,
900 .funcs = &trace_fn_funcs,
901};
902
891/* TRACE_CTX an TRACE_WAKE */ 903/* TRACE_CTX an TRACE_WAKE */
892static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, 904static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
893 char *delim) 905 char *delim)
@@ -916,13 +928,14 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
916 return TRACE_TYPE_HANDLED; 928 return TRACE_TYPE_HANDLED;
917} 929}
918 930
919static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags) 931static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
932 struct trace_event *event)
920{ 933{
921 return trace_ctxwake_print(iter, "==>"); 934 return trace_ctxwake_print(iter, "==>");
922} 935}
923 936
924static enum print_line_t trace_wake_print(struct trace_iterator *iter, 937static enum print_line_t trace_wake_print(struct trace_iterator *iter,
925 int flags) 938 int flags, struct trace_event *event)
926{ 939{
927 return trace_ctxwake_print(iter, " +"); 940 return trace_ctxwake_print(iter, " +");
928} 941}
@@ -950,12 +963,14 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
950 return TRACE_TYPE_HANDLED; 963 return TRACE_TYPE_HANDLED;
951} 964}
952 965
953static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags) 966static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
967 struct trace_event *event)
954{ 968{
955 return trace_ctxwake_raw(iter, 0); 969 return trace_ctxwake_raw(iter, 0);
956} 970}
957 971
958static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags) 972static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags,
973 struct trace_event *event)
959{ 974{
960 return trace_ctxwake_raw(iter, '+'); 975 return trace_ctxwake_raw(iter, '+');
961} 976}
@@ -984,18 +999,20 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
984 return TRACE_TYPE_HANDLED; 999 return TRACE_TYPE_HANDLED;
985} 1000}
986 1001
987static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags) 1002static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
1003 struct trace_event *event)
988{ 1004{
989 return trace_ctxwake_hex(iter, 0); 1005 return trace_ctxwake_hex(iter, 0);
990} 1006}
991 1007
992static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags) 1008static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags,
1009 struct trace_event *event)
993{ 1010{
994 return trace_ctxwake_hex(iter, '+'); 1011 return trace_ctxwake_hex(iter, '+');
995} 1012}
996 1013
997static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, 1014static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
998 int flags) 1015 int flags, struct trace_event *event)
999{ 1016{
1000 struct ctx_switch_entry *field; 1017 struct ctx_switch_entry *field;
1001 struct trace_seq *s = &iter->seq; 1018 struct trace_seq *s = &iter->seq;
@@ -1012,25 +1029,33 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
1012 return TRACE_TYPE_HANDLED; 1029 return TRACE_TYPE_HANDLED;
1013} 1030}
1014 1031
1015static struct trace_event trace_ctx_event = { 1032static struct trace_event_functions trace_ctx_funcs = {
1016 .type = TRACE_CTX,
1017 .trace = trace_ctx_print, 1033 .trace = trace_ctx_print,
1018 .raw = trace_ctx_raw, 1034 .raw = trace_ctx_raw,
1019 .hex = trace_ctx_hex, 1035 .hex = trace_ctx_hex,
1020 .binary = trace_ctxwake_bin, 1036 .binary = trace_ctxwake_bin,
1021}; 1037};
1022 1038
1023static struct trace_event trace_wake_event = { 1039static struct trace_event trace_ctx_event = {
1024 .type = TRACE_WAKE, 1040 .type = TRACE_CTX,
1041 .funcs = &trace_ctx_funcs,
1042};
1043
1044static struct trace_event_functions trace_wake_funcs = {
1025 .trace = trace_wake_print, 1045 .trace = trace_wake_print,
1026 .raw = trace_wake_raw, 1046 .raw = trace_wake_raw,
1027 .hex = trace_wake_hex, 1047 .hex = trace_wake_hex,
1028 .binary = trace_ctxwake_bin, 1048 .binary = trace_ctxwake_bin,
1029}; 1049};
1030 1050
1051static struct trace_event trace_wake_event = {
1052 .type = TRACE_WAKE,
1053 .funcs = &trace_wake_funcs,
1054};
1055
1031/* TRACE_SPECIAL */ 1056/* TRACE_SPECIAL */
1032static enum print_line_t trace_special_print(struct trace_iterator *iter, 1057static enum print_line_t trace_special_print(struct trace_iterator *iter,
1033 int flags) 1058 int flags, struct trace_event *event)
1034{ 1059{
1035 struct special_entry *field; 1060 struct special_entry *field;
1036 1061
@@ -1046,7 +1071,7 @@ static enum print_line_t trace_special_print(struct trace_iterator *iter,
1046} 1071}
1047 1072
1048static enum print_line_t trace_special_hex(struct trace_iterator *iter, 1073static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1049 int flags) 1074 int flags, struct trace_event *event)
1050{ 1075{
1051 struct special_entry *field; 1076 struct special_entry *field;
1052 struct trace_seq *s = &iter->seq; 1077 struct trace_seq *s = &iter->seq;
@@ -1061,7 +1086,7 @@ static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1061} 1086}
1062 1087
1063static enum print_line_t trace_special_bin(struct trace_iterator *iter, 1088static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1064 int flags) 1089 int flags, struct trace_event *event)
1065{ 1090{
1066 struct special_entry *field; 1091 struct special_entry *field;
1067 struct trace_seq *s = &iter->seq; 1092 struct trace_seq *s = &iter->seq;
@@ -1075,18 +1100,22 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1075 return TRACE_TYPE_HANDLED; 1100 return TRACE_TYPE_HANDLED;
1076} 1101}
1077 1102
1078static struct trace_event trace_special_event = { 1103static struct trace_event_functions trace_special_funcs = {
1079 .type = TRACE_SPECIAL,
1080 .trace = trace_special_print, 1104 .trace = trace_special_print,
1081 .raw = trace_special_print, 1105 .raw = trace_special_print,
1082 .hex = trace_special_hex, 1106 .hex = trace_special_hex,
1083 .binary = trace_special_bin, 1107 .binary = trace_special_bin,
1084}; 1108};
1085 1109
1110static struct trace_event trace_special_event = {
1111 .type = TRACE_SPECIAL,
1112 .funcs = &trace_special_funcs,
1113};
1114
1086/* TRACE_STACK */ 1115/* TRACE_STACK */
1087 1116
1088static enum print_line_t trace_stack_print(struct trace_iterator *iter, 1117static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1089 int flags) 1118 int flags, struct trace_event *event)
1090{ 1119{
1091 struct stack_entry *field; 1120 struct stack_entry *field;
1092 struct trace_seq *s = &iter->seq; 1121 struct trace_seq *s = &iter->seq;
@@ -1114,17 +1143,21 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1114 return TRACE_TYPE_PARTIAL_LINE; 1143 return TRACE_TYPE_PARTIAL_LINE;
1115} 1144}
1116 1145
1117static struct trace_event trace_stack_event = { 1146static struct trace_event_functions trace_stack_funcs = {
1118 .type = TRACE_STACK,
1119 .trace = trace_stack_print, 1147 .trace = trace_stack_print,
1120 .raw = trace_special_print, 1148 .raw = trace_special_print,
1121 .hex = trace_special_hex, 1149 .hex = trace_special_hex,
1122 .binary = trace_special_bin, 1150 .binary = trace_special_bin,
1123}; 1151};
1124 1152
1153static struct trace_event trace_stack_event = {
1154 .type = TRACE_STACK,
1155 .funcs = &trace_stack_funcs,
1156};
1157
1125/* TRACE_USER_STACK */ 1158/* TRACE_USER_STACK */
1126static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, 1159static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1127 int flags) 1160 int flags, struct trace_event *event)
1128{ 1161{
1129 struct userstack_entry *field; 1162 struct userstack_entry *field;
1130 struct trace_seq *s = &iter->seq; 1163 struct trace_seq *s = &iter->seq;
@@ -1143,17 +1176,22 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1143 return TRACE_TYPE_PARTIAL_LINE; 1176 return TRACE_TYPE_PARTIAL_LINE;
1144} 1177}
1145 1178
1146static struct trace_event trace_user_stack_event = { 1179static struct trace_event_functions trace_user_stack_funcs = {
1147 .type = TRACE_USER_STACK,
1148 .trace = trace_user_stack_print, 1180 .trace = trace_user_stack_print,
1149 .raw = trace_special_print, 1181 .raw = trace_special_print,
1150 .hex = trace_special_hex, 1182 .hex = trace_special_hex,
1151 .binary = trace_special_bin, 1183 .binary = trace_special_bin,
1152}; 1184};
1153 1185
1186static struct trace_event trace_user_stack_event = {
1187 .type = TRACE_USER_STACK,
1188 .funcs = &trace_user_stack_funcs,
1189};
1190
1154/* TRACE_BPRINT */ 1191/* TRACE_BPRINT */
1155static enum print_line_t 1192static enum print_line_t
1156trace_bprint_print(struct trace_iterator *iter, int flags) 1193trace_bprint_print(struct trace_iterator *iter, int flags,
1194 struct trace_event *event)
1157{ 1195{
1158 struct trace_entry *entry = iter->ent; 1196 struct trace_entry *entry = iter->ent;
1159 struct trace_seq *s = &iter->seq; 1197 struct trace_seq *s = &iter->seq;
@@ -1178,7 +1216,8 @@ trace_bprint_print(struct trace_iterator *iter, int flags)
1178 1216
1179 1217
1180static enum print_line_t 1218static enum print_line_t
1181trace_bprint_raw(struct trace_iterator *iter, int flags) 1219trace_bprint_raw(struct trace_iterator *iter, int flags,
1220 struct trace_event *event)
1182{ 1221{
1183 struct bprint_entry *field; 1222 struct bprint_entry *field;
1184 struct trace_seq *s = &iter->seq; 1223 struct trace_seq *s = &iter->seq;
@@ -1197,16 +1236,19 @@ trace_bprint_raw(struct trace_iterator *iter, int flags)
1197 return TRACE_TYPE_PARTIAL_LINE; 1236 return TRACE_TYPE_PARTIAL_LINE;
1198} 1237}
1199 1238
1239static struct trace_event_functions trace_bprint_funcs = {
1240 .trace = trace_bprint_print,
1241 .raw = trace_bprint_raw,
1242};
1200 1243
1201static struct trace_event trace_bprint_event = { 1244static struct trace_event trace_bprint_event = {
1202 .type = TRACE_BPRINT, 1245 .type = TRACE_BPRINT,
1203 .trace = trace_bprint_print, 1246 .funcs = &trace_bprint_funcs,
1204 .raw = trace_bprint_raw,
1205}; 1247};
1206 1248
1207/* TRACE_PRINT */ 1249/* TRACE_PRINT */
1208static enum print_line_t trace_print_print(struct trace_iterator *iter, 1250static enum print_line_t trace_print_print(struct trace_iterator *iter,
1209 int flags) 1251 int flags, struct trace_event *event)
1210{ 1252{
1211 struct print_entry *field; 1253 struct print_entry *field;
1212 struct trace_seq *s = &iter->seq; 1254 struct trace_seq *s = &iter->seq;
@@ -1225,7 +1267,8 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
1225 return TRACE_TYPE_PARTIAL_LINE; 1267 return TRACE_TYPE_PARTIAL_LINE;
1226} 1268}
1227 1269
1228static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) 1270static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
1271 struct trace_event *event)
1229{ 1272{
1230 struct print_entry *field; 1273 struct print_entry *field;
1231 1274
@@ -1240,12 +1283,16 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
1240 return TRACE_TYPE_PARTIAL_LINE; 1283 return TRACE_TYPE_PARTIAL_LINE;
1241} 1284}
1242 1285
1243static struct trace_event trace_print_event = { 1286static struct trace_event_functions trace_print_funcs = {
1244 .type = TRACE_PRINT,
1245 .trace = trace_print_print, 1287 .trace = trace_print_print,
1246 .raw = trace_print_raw, 1288 .raw = trace_print_raw,
1247}; 1289};
1248 1290
1291static struct trace_event trace_print_event = {
1292 .type = TRACE_PRINT,
1293 .funcs = &trace_print_funcs,
1294};
1295
1249 1296
1250static struct trace_event *events[] __initdata = { 1297static struct trace_event *events[] __initdata = {
1251 &trace_fn_event, 1298 &trace_fn_event,
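[Note] The trace_output.c hunks above split each event in two: the per-format callbacks move into a shared struct trace_event_functions, and struct trace_event keeps only its type plus a pointer to that table, so every callback now also receives the registered trace_event that fired. A minimal sketch of the resulting pattern; the my_event* names are illustrative, and the header is an assumption about where these structures are declared:

#include <linux/ftrace_event.h>	/* assumed home of trace_event / trace_event_functions */

/* Illustrative event, mirroring the trace_fn/trace_ctx conversions above. */
static enum print_line_t my_event_output(struct trace_iterator *iter, int flags,
					 struct trace_event *event)
{
	/* callbacks can now inspect which registered trace_event fired */
	return TRACE_TYPE_HANDLED;
}

static struct trace_event_functions my_event_funcs = {
	.trace	= my_event_output,
	.raw	= my_event_output,
	.hex	= my_event_output,
	.binary	= my_event_output,
};

static struct trace_event my_event = {
	.type	= 0,			/* the !event->type path above assigns a type */
	.funcs	= &my_event_funcs,	/* must be non-NULL or the new WARN_ON fires */
};

/* ... later: register_ftrace_event(&my_event); */

Because the function table is shared, several events (as trace_ctx_event and trace_wake_event do above with trace_ctxwake_bin) can reuse the same callbacks without duplicating them per event.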
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 9d91c72ba38b..c038eba0492b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -25,7 +25,7 @@ extern void trace_event_read_unlock(void);
25extern struct trace_event *ftrace_find_event(int type); 25extern struct trace_event *ftrace_find_event(int type);
26 26
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags, struct trace_event *event);
29extern int 29extern int
30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); 30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
31 31
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 5fca0f51fde4..8f758d070c43 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -50,8 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
50} 50}
51 51
52static void 52static void
53probe_sched_switch(struct rq *__rq, struct task_struct *prev, 53probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
54 struct task_struct *next)
55{ 54{
56 struct trace_array_cpu *data; 55 struct trace_array_cpu *data;
57 unsigned long flags; 56 unsigned long flags;
@@ -109,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
109} 108}
110 109
111static void 110static void
112probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 111probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
113{ 112{
114 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
115 unsigned long flags; 114 unsigned long flags;
@@ -139,21 +138,21 @@ static int tracing_sched_register(void)
139{ 138{
140 int ret; 139 int ret;
141 140
142 ret = register_trace_sched_wakeup(probe_sched_wakeup); 141 ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
143 if (ret) { 142 if (ret) {
144 pr_info("wakeup trace: Couldn't activate tracepoint" 143 pr_info("wakeup trace: Couldn't activate tracepoint"
145 " probe to kernel_sched_wakeup\n"); 144 " probe to kernel_sched_wakeup\n");
146 return ret; 145 return ret;
147 } 146 }
148 147
149 ret = register_trace_sched_wakeup_new(probe_sched_wakeup); 148 ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
150 if (ret) { 149 if (ret) {
151 pr_info("wakeup trace: Couldn't activate tracepoint" 150 pr_info("wakeup trace: Couldn't activate tracepoint"
152 " probe to kernel_sched_wakeup_new\n"); 151 " probe to kernel_sched_wakeup_new\n");
153 goto fail_deprobe; 152 goto fail_deprobe;
154 } 153 }
155 154
156 ret = register_trace_sched_switch(probe_sched_switch); 155 ret = register_trace_sched_switch(probe_sched_switch, NULL);
157 if (ret) { 156 if (ret) {
158 pr_info("sched trace: Couldn't activate tracepoint" 157 pr_info("sched trace: Couldn't activate tracepoint"
159 " probe to kernel_sched_switch\n"); 158 " probe to kernel_sched_switch\n");
@@ -162,17 +161,17 @@ static int tracing_sched_register(void)
162 161
163 return ret; 162 return ret;
164fail_deprobe_wake_new: 163fail_deprobe_wake_new:
165 unregister_trace_sched_wakeup_new(probe_sched_wakeup); 164 unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
166fail_deprobe: 165fail_deprobe:
167 unregister_trace_sched_wakeup(probe_sched_wakeup); 166 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
168 return ret; 167 return ret;
169} 168}
170 169
171static void tracing_sched_unregister(void) 170static void tracing_sched_unregister(void)
172{ 171{
173 unregister_trace_sched_switch(probe_sched_switch); 172 unregister_trace_sched_switch(probe_sched_switch, NULL);
174 unregister_trace_sched_wakeup_new(probe_sched_wakeup); 173 unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
175 unregister_trace_sched_wakeup(probe_sched_wakeup); 174 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
176} 175}
177 176
178static void tracing_start_sched_switch(void) 177static void tracing_start_sched_switch(void)
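[Note] As the sched_switch conversions above show, tracepoint probes now take an opaque void * as their first argument, and the generated register_trace_*()/unregister_trace_*() helpers take a matching data pointer, passed as NULL throughout this file. A hedged sketch of a probe that actually uses the slot; the counter and its registration are illustrative, not part of this patch:

#include <linux/sched.h>		/* struct task_struct */
#include <trace/events/sched.h>		/* register_trace_sched_switch() and friends */

static unsigned long my_switch_count;	/* unsynchronized, fine for a sketch */

/* The first argument is whatever pointer was supplied at registration time. */
static void my_probe_sched_switch(void *data,
				  struct task_struct *prev,
				  struct task_struct *next)
{
	(*(unsigned long *)data)++;
}

static int my_probe_register(void)
{
	/* this file passes NULL; any per-probe cookie works instead */
	return register_trace_sched_switch(my_probe_sched_switch, &my_switch_count);
}

static void my_probe_unregister(void)
{
	/* removal matches on the same (probe, data) pair, per kernel/tracepoint.c below */
	unregister_trace_sched_switch(my_probe_sched_switch, &my_switch_count);
}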
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0271742abb8d..0e73bc2ef8c5 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -98,7 +98,8 @@ static int report_latency(cycle_t delta)
98 return 1; 98 return 1;
99} 99}
100 100
101static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) 101static void
102probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
102{ 103{
103 if (task != wakeup_task) 104 if (task != wakeup_task)
104 return; 105 return;
@@ -107,8 +108,8 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
107} 108}
108 109
109static void notrace 110static void notrace
110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 111probe_wakeup_sched_switch(void *ignore,
111 struct task_struct *next) 112 struct task_struct *prev, struct task_struct *next)
112{ 113{
113 struct trace_array_cpu *data; 114 struct trace_array_cpu *data;
114 cycle_t T0, T1, delta; 115 cycle_t T0, T1, delta;
@@ -200,7 +201,7 @@ static void wakeup_reset(struct trace_array *tr)
200} 201}
201 202
202static void 203static void
203probe_wakeup(struct rq *rq, struct task_struct *p, int success) 204probe_wakeup(void *ignore, struct task_struct *p, int success)
204{ 205{
205 struct trace_array_cpu *data; 206 struct trace_array_cpu *data;
206 int cpu = smp_processor_id(); 207 int cpu = smp_processor_id();
@@ -264,28 +265,28 @@ static void start_wakeup_tracer(struct trace_array *tr)
264{ 265{
265 int ret; 266 int ret;
266 267
267 ret = register_trace_sched_wakeup(probe_wakeup); 268 ret = register_trace_sched_wakeup(probe_wakeup, NULL);
268 if (ret) { 269 if (ret) {
269 pr_info("wakeup trace: Couldn't activate tracepoint" 270 pr_info("wakeup trace: Couldn't activate tracepoint"
270 " probe to kernel_sched_wakeup\n"); 271 " probe to kernel_sched_wakeup\n");
271 return; 272 return;
272 } 273 }
273 274
274 ret = register_trace_sched_wakeup_new(probe_wakeup); 275 ret = register_trace_sched_wakeup_new(probe_wakeup, NULL);
275 if (ret) { 276 if (ret) {
276 pr_info("wakeup trace: Couldn't activate tracepoint" 277 pr_info("wakeup trace: Couldn't activate tracepoint"
277 " probe to kernel_sched_wakeup_new\n"); 278 " probe to kernel_sched_wakeup_new\n");
278 goto fail_deprobe; 279 goto fail_deprobe;
279 } 280 }
280 281
281 ret = register_trace_sched_switch(probe_wakeup_sched_switch); 282 ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL);
282 if (ret) { 283 if (ret) {
283 pr_info("sched trace: Couldn't activate tracepoint" 284 pr_info("sched trace: Couldn't activate tracepoint"
284 " probe to kernel_sched_switch\n"); 285 " probe to kernel_sched_switch\n");
285 goto fail_deprobe_wake_new; 286 goto fail_deprobe_wake_new;
286 } 287 }
287 288
288 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task); 289 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
289 if (ret) { 290 if (ret) {
290 pr_info("wakeup trace: Couldn't activate tracepoint" 291 pr_info("wakeup trace: Couldn't activate tracepoint"
291 " probe to kernel_sched_migrate_task\n"); 292 " probe to kernel_sched_migrate_task\n");
@@ -312,19 +313,19 @@ static void start_wakeup_tracer(struct trace_array *tr)
312 313
313 return; 314 return;
314fail_deprobe_wake_new: 315fail_deprobe_wake_new:
315 unregister_trace_sched_wakeup_new(probe_wakeup); 316 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
316fail_deprobe: 317fail_deprobe:
317 unregister_trace_sched_wakeup(probe_wakeup); 318 unregister_trace_sched_wakeup(probe_wakeup, NULL);
318} 319}
319 320
320static void stop_wakeup_tracer(struct trace_array *tr) 321static void stop_wakeup_tracer(struct trace_array *tr)
321{ 322{
322 tracer_enabled = 0; 323 tracer_enabled = 0;
323 unregister_ftrace_function(&trace_ops); 324 unregister_ftrace_function(&trace_ops);
324 unregister_trace_sched_switch(probe_wakeup_sched_switch); 325 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
325 unregister_trace_sched_wakeup_new(probe_wakeup); 326 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
326 unregister_trace_sched_wakeup(probe_wakeup); 327 unregister_trace_sched_wakeup(probe_wakeup, NULL);
327 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task); 328 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
328} 329}
329 330
330static int __wakeup_tracer_init(struct trace_array *tr) 331static int __wakeup_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 1cc9858258b3..250e7f9bd2f0 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -29,7 +29,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
29 struct trace_entry *entry; 29 struct trace_entry *entry;
30 unsigned int loops = 0; 30 unsigned int loops = 0;
31 31
32 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { 32 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
33 entry = ring_buffer_event_data(event); 33 entry = ring_buffer_event_data(event);
34 34
35 /* 35 /*
@@ -255,7 +255,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
255/* Maximum number of functions to trace before diagnosing a hang */ 255/* Maximum number of functions to trace before diagnosing a hang */
256#define GRAPH_MAX_FUNC_TEST 100000000 256#define GRAPH_MAX_FUNC_TEST 100000000
257 257
258static void __ftrace_dump(bool disable_tracing); 258static void
259__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
259static unsigned int graph_hang_thresh; 260static unsigned int graph_hang_thresh;
260 261
261/* Wrap the real function entry probe to avoid possible hanging */ 262/* Wrap the real function entry probe to avoid possible hanging */
@@ -266,7 +267,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
266 ftrace_graph_stop(); 267 ftrace_graph_stop();
267 printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); 268 printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
268 if (ftrace_dump_on_oops) 269 if (ftrace_dump_on_oops)
269 __ftrace_dump(false); 270 __ftrace_dump(false, DUMP_ALL);
270 return 0; 271 return 0;
271 } 272 }
272 273
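[Note] Two interface changes surface in this selftest hunk: ring_buffer_consume() grows a fourth argument, which the test leaves NULL alongside the existing timestamp pointer, and __ftrace_dump() now takes an enum ftrace_dump_mode (DUMP_ALL here). A small consumer loop in the same style; the reading of the third and fourth parameters as a timestamp and a lost-events count is inferred from this series, not quoted from ring_buffer.h:

/* Sketch: drain one CPU's entries the way trace_test_buffer_cpu() does above.
 * Assumes the kernel/trace/trace.h context used by trace_selftest.c. */
static int my_count_entries(struct trace_array *tr, int cpu)
{
	struct ring_buffer_event *event;
	u64 ts;			/* per-event timestamp */
	unsigned long lost;	/* events dropped before this one (new argument) */
	int count = 0;

	while ((event = ring_buffer_consume(tr->buffer, cpu, &ts, &lost)))
		count++;

	return count;
}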
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 4d6d711717f2..9d358301ae3e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -15,6 +15,54 @@ static int sys_refcount_exit;
15static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 15static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17 17
18static int syscall_enter_register(struct ftrace_event_call *event,
19 enum trace_reg type);
20static int syscall_exit_register(struct ftrace_event_call *event,
21 enum trace_reg type);
22
23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25
26static struct list_head *
27syscall_get_enter_fields(struct ftrace_event_call *call)
28{
29 struct syscall_metadata *entry = call->data;
30
31 return &entry->enter_fields;
32}
33
34static struct list_head *
35syscall_get_exit_fields(struct ftrace_event_call *call)
36{
37 struct syscall_metadata *entry = call->data;
38
39 return &entry->exit_fields;
40}
41
42struct trace_event_functions enter_syscall_print_funcs = {
43 .trace = print_syscall_enter,
44};
45
46struct trace_event_functions exit_syscall_print_funcs = {
47 .trace = print_syscall_exit,
48};
49
50struct ftrace_event_class event_class_syscall_enter = {
51 .system = "syscalls",
52 .reg = syscall_enter_register,
53 .define_fields = syscall_enter_define_fields,
54 .get_fields = syscall_get_enter_fields,
55 .raw_init = init_syscall_trace,
56};
57
58struct ftrace_event_class event_class_syscall_exit = {
59 .system = "syscalls",
60 .reg = syscall_exit_register,
61 .define_fields = syscall_exit_define_fields,
62 .get_fields = syscall_get_exit_fields,
63 .raw_init = init_syscall_trace,
64};
65
18extern unsigned long __start_syscalls_metadata[]; 66extern unsigned long __start_syscalls_metadata[];
19extern unsigned long __stop_syscalls_metadata[]; 67extern unsigned long __stop_syscalls_metadata[];
20 68
@@ -53,7 +101,8 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
53} 101}
54 102
55enum print_line_t 103enum print_line_t
56print_syscall_enter(struct trace_iterator *iter, int flags) 104print_syscall_enter(struct trace_iterator *iter, int flags,
105 struct trace_event *event)
57{ 106{
58 struct trace_seq *s = &iter->seq; 107 struct trace_seq *s = &iter->seq;
59 struct trace_entry *ent = iter->ent; 108 struct trace_entry *ent = iter->ent;
@@ -68,7 +117,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
68 if (!entry) 117 if (!entry)
69 goto end; 118 goto end;
70 119
71 if (entry->enter_event->id != ent->type) { 120 if (entry->enter_event->event.type != ent->type) {
72 WARN_ON_ONCE(1); 121 WARN_ON_ONCE(1);
73 goto end; 122 goto end;
74 } 123 }
@@ -105,7 +154,8 @@ end:
105} 154}
106 155
107enum print_line_t 156enum print_line_t
108print_syscall_exit(struct trace_iterator *iter, int flags) 157print_syscall_exit(struct trace_iterator *iter, int flags,
158 struct trace_event *event)
109{ 159{
110 struct trace_seq *s = &iter->seq; 160 struct trace_seq *s = &iter->seq;
111 struct trace_entry *ent = iter->ent; 161 struct trace_entry *ent = iter->ent;
@@ -123,7 +173,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
123 return TRACE_TYPE_HANDLED; 173 return TRACE_TYPE_HANDLED;
124 } 174 }
125 175
126 if (entry->exit_event->id != ent->type) { 176 if (entry->exit_event->event.type != ent->type) {
127 WARN_ON_ONCE(1); 177 WARN_ON_ONCE(1);
128 return TRACE_TYPE_UNHANDLED; 178 return TRACE_TYPE_UNHANDLED;
129 } 179 }
@@ -205,7 +255,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
205 kfree(call->print_fmt); 255 kfree(call->print_fmt);
206} 256}
207 257
208int syscall_enter_define_fields(struct ftrace_event_call *call) 258static int syscall_enter_define_fields(struct ftrace_event_call *call)
209{ 259{
210 struct syscall_trace_enter trace; 260 struct syscall_trace_enter trace;
211 struct syscall_metadata *meta = call->data; 261 struct syscall_metadata *meta = call->data;
@@ -228,7 +278,7 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
228 return ret; 278 return ret;
229} 279}
230 280
231int syscall_exit_define_fields(struct ftrace_event_call *call) 281static int syscall_exit_define_fields(struct ftrace_event_call *call)
232{ 282{
233 struct syscall_trace_exit trace; 283 struct syscall_trace_exit trace;
234 int ret; 284 int ret;
@@ -243,7 +293,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
243 return ret; 293 return ret;
244} 294}
245 295
246void ftrace_syscall_enter(struct pt_regs *regs, long id) 296void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
247{ 297{
248 struct syscall_trace_enter *entry; 298 struct syscall_trace_enter *entry;
249 struct syscall_metadata *sys_data; 299 struct syscall_metadata *sys_data;
@@ -265,7 +315,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
265 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 315 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
266 316
267 event = trace_current_buffer_lock_reserve(&buffer, 317 event = trace_current_buffer_lock_reserve(&buffer,
268 sys_data->enter_event->id, size, 0, 0); 318 sys_data->enter_event->event.type, size, 0, 0);
269 if (!event) 319 if (!event)
270 return; 320 return;
271 321
@@ -278,7 +328,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
278 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 328 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
279} 329}
280 330
281void ftrace_syscall_exit(struct pt_regs *regs, long ret) 331void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
282{ 332{
283 struct syscall_trace_exit *entry; 333 struct syscall_trace_exit *entry;
284 struct syscall_metadata *sys_data; 334 struct syscall_metadata *sys_data;
@@ -297,7 +347,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
297 return; 347 return;
298 348
299 event = trace_current_buffer_lock_reserve(&buffer, 349 event = trace_current_buffer_lock_reserve(&buffer,
300 sys_data->exit_event->id, sizeof(*entry), 0, 0); 350 sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
301 if (!event) 351 if (!event)
302 return; 352 return;
303 353
@@ -320,7 +370,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
320 return -ENOSYS; 370 return -ENOSYS;
321 mutex_lock(&syscall_trace_lock); 371 mutex_lock(&syscall_trace_lock);
322 if (!sys_refcount_enter) 372 if (!sys_refcount_enter)
323 ret = register_trace_sys_enter(ftrace_syscall_enter); 373 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
324 if (!ret) { 374 if (!ret) {
325 set_bit(num, enabled_enter_syscalls); 375 set_bit(num, enabled_enter_syscalls);
326 sys_refcount_enter++; 376 sys_refcount_enter++;
@@ -340,7 +390,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
340 sys_refcount_enter--; 390 sys_refcount_enter--;
341 clear_bit(num, enabled_enter_syscalls); 391 clear_bit(num, enabled_enter_syscalls);
342 if (!sys_refcount_enter) 392 if (!sys_refcount_enter)
343 unregister_trace_sys_enter(ftrace_syscall_enter); 393 unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
344 mutex_unlock(&syscall_trace_lock); 394 mutex_unlock(&syscall_trace_lock);
345} 395}
346 396
@@ -354,7 +404,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
354 return -ENOSYS; 404 return -ENOSYS;
355 mutex_lock(&syscall_trace_lock); 405 mutex_lock(&syscall_trace_lock);
356 if (!sys_refcount_exit) 406 if (!sys_refcount_exit)
357 ret = register_trace_sys_exit(ftrace_syscall_exit); 407 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
358 if (!ret) { 408 if (!ret) {
359 set_bit(num, enabled_exit_syscalls); 409 set_bit(num, enabled_exit_syscalls);
360 sys_refcount_exit++; 410 sys_refcount_exit++;
@@ -374,7 +424,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
374 sys_refcount_exit--; 424 sys_refcount_exit--;
375 clear_bit(num, enabled_exit_syscalls); 425 clear_bit(num, enabled_exit_syscalls);
376 if (!sys_refcount_exit) 426 if (!sys_refcount_exit)
377 unregister_trace_sys_exit(ftrace_syscall_exit); 427 unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
378 mutex_unlock(&syscall_trace_lock); 428 mutex_unlock(&syscall_trace_lock);
379} 429}
380 430
@@ -434,7 +484,7 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
434static int sys_perf_refcount_enter; 484static int sys_perf_refcount_enter;
435static int sys_perf_refcount_exit; 485static int sys_perf_refcount_exit;
436 486
437static void perf_syscall_enter(struct pt_regs *regs, long id) 487static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
438{ 488{
439 struct syscall_metadata *sys_data; 489 struct syscall_metadata *sys_data;
440 struct syscall_trace_enter *rec; 490 struct syscall_trace_enter *rec;
@@ -461,7 +511,8 @@ static void perf_syscall_enter(struct pt_regs *regs, long id)
461 return; 511 return;
462 512
463 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 513 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
464 sys_data->enter_event->id, &rctx, &flags); 514 sys_data->enter_event->event.type,
515 &rctx, &flags);
465 if (!rec) 516 if (!rec)
466 return; 517 return;
467 518
@@ -480,7 +531,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call)
480 531
481 mutex_lock(&syscall_trace_lock); 532 mutex_lock(&syscall_trace_lock);
482 if (!sys_perf_refcount_enter) 533 if (!sys_perf_refcount_enter)
483 ret = register_trace_sys_enter(perf_syscall_enter); 534 ret = register_trace_sys_enter(perf_syscall_enter, NULL);
484 if (ret) { 535 if (ret) {
485 pr_info("event trace: Could not activate" 536 pr_info("event trace: Could not activate"
486 "syscall entry trace point"); 537 "syscall entry trace point");
@@ -502,11 +553,11 @@ void perf_sysenter_disable(struct ftrace_event_call *call)
502 sys_perf_refcount_enter--; 553 sys_perf_refcount_enter--;
503 clear_bit(num, enabled_perf_enter_syscalls); 554 clear_bit(num, enabled_perf_enter_syscalls);
504 if (!sys_perf_refcount_enter) 555 if (!sys_perf_refcount_enter)
505 unregister_trace_sys_enter(perf_syscall_enter); 556 unregister_trace_sys_enter(perf_syscall_enter, NULL);
506 mutex_unlock(&syscall_trace_lock); 557 mutex_unlock(&syscall_trace_lock);
507} 558}
508 559
509static void perf_syscall_exit(struct pt_regs *regs, long ret) 560static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
510{ 561{
511 struct syscall_metadata *sys_data; 562 struct syscall_metadata *sys_data;
512 struct syscall_trace_exit *rec; 563 struct syscall_trace_exit *rec;
@@ -536,7 +587,8 @@ static void perf_syscall_exit(struct pt_regs *regs, long ret)
536 return; 587 return;
537 588
538 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 589 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
539 sys_data->exit_event->id, &rctx, &flags); 590 sys_data->exit_event->event.type,
591 &rctx, &flags);
540 if (!rec) 592 if (!rec)
541 return; 593 return;
542 594
@@ -555,7 +607,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call)
555 607
556 mutex_lock(&syscall_trace_lock); 608 mutex_lock(&syscall_trace_lock);
557 if (!sys_perf_refcount_exit) 609 if (!sys_perf_refcount_exit)
558 ret = register_trace_sys_exit(perf_syscall_exit); 610 ret = register_trace_sys_exit(perf_syscall_exit, NULL);
559 if (ret) { 611 if (ret) {
560 pr_info("event trace: Could not activate" 612 pr_info("event trace: Could not activate"
561 "syscall exit trace point"); 613 "syscall exit trace point");
@@ -577,9 +629,50 @@ void perf_sysexit_disable(struct ftrace_event_call *call)
577 sys_perf_refcount_exit--; 629 sys_perf_refcount_exit--;
578 clear_bit(num, enabled_perf_exit_syscalls); 630 clear_bit(num, enabled_perf_exit_syscalls);
579 if (!sys_perf_refcount_exit) 631 if (!sys_perf_refcount_exit)
580 unregister_trace_sys_exit(perf_syscall_exit); 632 unregister_trace_sys_exit(perf_syscall_exit, NULL);
581 mutex_unlock(&syscall_trace_lock); 633 mutex_unlock(&syscall_trace_lock);
582} 634}
583 635
584#endif /* CONFIG_PERF_EVENTS */ 636#endif /* CONFIG_PERF_EVENTS */
585 637
638static int syscall_enter_register(struct ftrace_event_call *event,
639 enum trace_reg type)
640{
641 switch (type) {
642 case TRACE_REG_REGISTER:
643 return reg_event_syscall_enter(event);
644 case TRACE_REG_UNREGISTER:
645 unreg_event_syscall_enter(event);
646 return 0;
647
648#ifdef CONFIG_PERF_EVENTS
649 case TRACE_REG_PERF_REGISTER:
650 return perf_sysenter_enable(event);
651 case TRACE_REG_PERF_UNREGISTER:
652 perf_sysenter_disable(event);
653 return 0;
654#endif
655 }
656 return 0;
657}
658
659static int syscall_exit_register(struct ftrace_event_call *event,
660 enum trace_reg type)
661{
662 switch (type) {
663 case TRACE_REG_REGISTER:
664 return reg_event_syscall_exit(event);
665 case TRACE_REG_UNREGISTER:
666 unreg_event_syscall_exit(event);
667 return 0;
668
669#ifdef CONFIG_PERF_EVENTS
670 case TRACE_REG_PERF_REGISTER:
671 return perf_sysexit_enable(event);
672 case TRACE_REG_PERF_UNREGISTER:
673 perf_sysexit_disable(event);
674 return 0;
675#endif
676 }
677 return 0;
678}
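[Note] The syscall events stop exporting their reg/unreg helpers and instead describe themselves through struct ftrace_event_class: .reg is a single dispatcher that multiplexes ftrace and perf registration on enum trace_reg, and .get_fields points the core at the per-direction field lists. How the core drives that hook is not shown in this hunk; the sketch below assumes the event's class is reachable as call->class, so treat it as illustrative only:

/* Sketch, under the assumption that an ftrace_event_call carries a 'class'
 * pointer to structures like event_class_syscall_enter above. */
static int my_enable_event(struct ftrace_event_call *call)
{
	return call->class->reg(call, TRACE_REG_REGISTER);
}

static void my_disable_event(struct ftrace_event_call *call)
{
	call->class->reg(call, TRACE_REG_UNREGISTER);
}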
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index cc2d2faa7d9e..a7cc3793baf6 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -49,7 +49,8 @@ static void cpu_workqueue_stat_free(struct kref *kref)
49 49
50/* Insertion of a work */ 50/* Insertion of a work */
51static void 51static void
52probe_workqueue_insertion(struct task_struct *wq_thread, 52probe_workqueue_insertion(void *ignore,
53 struct task_struct *wq_thread,
53 struct work_struct *work) 54 struct work_struct *work)
54{ 55{
55 int cpu = cpumask_first(&wq_thread->cpus_allowed); 56 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -70,7 +71,8 @@ found:
70 71
71/* Execution of a work */ 72/* Execution of a work */
72static void 73static void
73probe_workqueue_execution(struct task_struct *wq_thread, 74probe_workqueue_execution(void *ignore,
75 struct task_struct *wq_thread,
74 struct work_struct *work) 76 struct work_struct *work)
75{ 77{
76 int cpu = cpumask_first(&wq_thread->cpus_allowed); 78 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -90,7 +92,8 @@ found:
90} 92}
91 93
92/* Creation of a cpu workqueue thread */ 94/* Creation of a cpu workqueue thread */
93static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) 95static void probe_workqueue_creation(void *ignore,
96 struct task_struct *wq_thread, int cpu)
94{ 97{
95 struct cpu_workqueue_stats *cws; 98 struct cpu_workqueue_stats *cws;
96 unsigned long flags; 99 unsigned long flags;
@@ -114,7 +117,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
114} 117}
115 118
116/* Destruction of a cpu workqueue thread */ 119/* Destruction of a cpu workqueue thread */
117static void probe_workqueue_destruction(struct task_struct *wq_thread) 120static void
121probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
118{ 122{
119 /* Workqueue only execute on one cpu */ 123 /* Workqueue only execute on one cpu */
120 int cpu = cpumask_first(&wq_thread->cpus_allowed); 124 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -259,19 +263,19 @@ int __init trace_workqueue_early_init(void)
259{ 263{
260 int ret, cpu; 264 int ret, cpu;
261 265
262 ret = register_trace_workqueue_insertion(probe_workqueue_insertion); 266 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
263 if (ret) 267 if (ret)
264 goto out; 268 goto out;
265 269
266 ret = register_trace_workqueue_execution(probe_workqueue_execution); 270 ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
267 if (ret) 271 if (ret)
268 goto no_insertion; 272 goto no_insertion;
269 273
270 ret = register_trace_workqueue_creation(probe_workqueue_creation); 274 ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
271 if (ret) 275 if (ret)
272 goto no_execution; 276 goto no_execution;
273 277
274 ret = register_trace_workqueue_destruction(probe_workqueue_destruction); 278 ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
275 if (ret) 279 if (ret)
276 goto no_creation; 280 goto no_creation;
277 281
@@ -283,11 +287,11 @@ int __init trace_workqueue_early_init(void)
283 return 0; 287 return 0;
284 288
285no_creation: 289no_creation:
286 unregister_trace_workqueue_creation(probe_workqueue_creation); 290 unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
287no_execution: 291no_execution:
288 unregister_trace_workqueue_execution(probe_workqueue_execution); 292 unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
289no_insertion: 293no_insertion:
290 unregister_trace_workqueue_insertion(probe_workqueue_insertion); 294 unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
291out: 295out:
292 pr_warning("trace_workqueue: unable to trace workqueues\n"); 296 pr_warning("trace_workqueue: unable to trace workqueues\n");
293 297
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index cc89be5bc0f8..c77f3eceea25 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -54,7 +54,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
54 */ 54 */
55struct tracepoint_entry { 55struct tracepoint_entry {
56 struct hlist_node hlist; 56 struct hlist_node hlist;
57 void **funcs; 57 struct tracepoint_func *funcs;
58 int refcount; /* Number of times armed. 0 if disarmed. */ 58 int refcount; /* Number of times armed. 0 if disarmed. */
59 char name[0]; 59 char name[0];
60}; 60};
@@ -64,12 +64,12 @@ struct tp_probes {
64 struct rcu_head rcu; 64 struct rcu_head rcu;
65 struct list_head list; 65 struct list_head list;
66 } u; 66 } u;
67 void *probes[0]; 67 struct tracepoint_func probes[0];
68}; 68};
69 69
70static inline void *allocate_probes(int count) 70static inline void *allocate_probes(int count)
71{ 71{
72 struct tp_probes *p = kmalloc(count * sizeof(void *) 72 struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func)
73 + sizeof(struct tp_probes), GFP_KERNEL); 73 + sizeof(struct tp_probes), GFP_KERNEL);
74 return p == NULL ? NULL : p->probes; 74 return p == NULL ? NULL : p->probes;
75} 75}
@@ -79,7 +79,7 @@ static void rcu_free_old_probes(struct rcu_head *head)
79 kfree(container_of(head, struct tp_probes, u.rcu)); 79 kfree(container_of(head, struct tp_probes, u.rcu));
80} 80}
81 81
82static inline void release_probes(void *old) 82static inline void release_probes(struct tracepoint_func *old)
83{ 83{
84 if (old) { 84 if (old) {
85 struct tp_probes *tp_probes = container_of(old, 85 struct tp_probes *tp_probes = container_of(old,
@@ -95,15 +95,16 @@ static void debug_print_probes(struct tracepoint_entry *entry)
95 if (!tracepoint_debug || !entry->funcs) 95 if (!tracepoint_debug || !entry->funcs)
96 return; 96 return;
97 97
98 for (i = 0; entry->funcs[i]; i++) 98 for (i = 0; entry->funcs[i].func; i++)
99 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]); 99 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func);
100} 100}
101 101
102static void * 102static struct tracepoint_func *
103tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) 103tracepoint_entry_add_probe(struct tracepoint_entry *entry,
104 void *probe, void *data)
104{ 105{
105 int nr_probes = 0; 106 int nr_probes = 0;
106 void **old, **new; 107 struct tracepoint_func *old, *new;
107 108
108 WARN_ON(!probe); 109 WARN_ON(!probe);
109 110
@@ -111,8 +112,9 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
111 old = entry->funcs; 112 old = entry->funcs;
112 if (old) { 113 if (old) {
113 /* (N -> N+1), (N != 0, 1) probes */ 114 /* (N -> N+1), (N != 0, 1) probes */
114 for (nr_probes = 0; old[nr_probes]; nr_probes++) 115 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
115 if (old[nr_probes] == probe) 116 if (old[nr_probes].func == probe &&
117 old[nr_probes].data == data)
116 return ERR_PTR(-EEXIST); 118 return ERR_PTR(-EEXIST);
117 } 119 }
118 /* + 2 : one for new probe, one for NULL func */ 120 /* + 2 : one for new probe, one for NULL func */
@@ -120,9 +122,10 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
120 if (new == NULL) 122 if (new == NULL)
121 return ERR_PTR(-ENOMEM); 123 return ERR_PTR(-ENOMEM);
122 if (old) 124 if (old)
123 memcpy(new, old, nr_probes * sizeof(void *)); 125 memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
124 new[nr_probes] = probe; 126 new[nr_probes].func = probe;
125 new[nr_probes + 1] = NULL; 127 new[nr_probes].data = data;
128 new[nr_probes + 1].func = NULL;
126 entry->refcount = nr_probes + 1; 129 entry->refcount = nr_probes + 1;
127 entry->funcs = new; 130 entry->funcs = new;
128 debug_print_probes(entry); 131 debug_print_probes(entry);
@@ -130,10 +133,11 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
130} 133}
131 134
132static void * 135static void *
133tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) 136tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
137 void *probe, void *data)
134{ 138{
135 int nr_probes = 0, nr_del = 0, i; 139 int nr_probes = 0, nr_del = 0, i;
136 void **old, **new; 140 struct tracepoint_func *old, *new;
137 141
138 old = entry->funcs; 142 old = entry->funcs;
139 143
@@ -142,8 +146,10 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
142 146
143 debug_print_probes(entry); 147 debug_print_probes(entry);
144 /* (N -> M), (N > 1, M >= 0) probes */ 148 /* (N -> M), (N > 1, M >= 0) probes */
145 for (nr_probes = 0; old[nr_probes]; nr_probes++) { 149 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
146 if ((!probe || old[nr_probes] == probe)) 150 if (!probe ||
151 (old[nr_probes].func == probe &&
152 old[nr_probes].data == data))
147 nr_del++; 153 nr_del++;
148 } 154 }
149 155
@@ -160,10 +166,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
160 new = allocate_probes(nr_probes - nr_del + 1); 166 new = allocate_probes(nr_probes - nr_del + 1);
161 if (new == NULL) 167 if (new == NULL)
162 return ERR_PTR(-ENOMEM); 168 return ERR_PTR(-ENOMEM);
163 for (i = 0; old[i]; i++) 169 for (i = 0; old[i].func; i++)
164 if ((probe && old[i] != probe)) 170 if (probe &&
171 (old[i].func != probe || old[i].data != data))
165 new[j++] = old[i]; 172 new[j++] = old[i];
166 new[nr_probes - nr_del] = NULL; 173 new[nr_probes - nr_del].func = NULL;
167 entry->refcount = nr_probes - nr_del; 174 entry->refcount = nr_probes - nr_del;
168 entry->funcs = new; 175 entry->funcs = new;
169 } 176 }
@@ -315,18 +322,19 @@ static void tracepoint_update_probes(void)
315 module_update_tracepoints(); 322 module_update_tracepoints();
316} 323}
317 324
318static void *tracepoint_add_probe(const char *name, void *probe) 325static struct tracepoint_func *
326tracepoint_add_probe(const char *name, void *probe, void *data)
319{ 327{
320 struct tracepoint_entry *entry; 328 struct tracepoint_entry *entry;
321 void *old; 329 struct tracepoint_func *old;
322 330
323 entry = get_tracepoint(name); 331 entry = get_tracepoint(name);
324 if (!entry) { 332 if (!entry) {
325 entry = add_tracepoint(name); 333 entry = add_tracepoint(name);
326 if (IS_ERR(entry)) 334 if (IS_ERR(entry))
327 return entry; 335 return (struct tracepoint_func *)entry;
328 } 336 }
329 old = tracepoint_entry_add_probe(entry, probe); 337 old = tracepoint_entry_add_probe(entry, probe, data);
330 if (IS_ERR(old) && !entry->refcount) 338 if (IS_ERR(old) && !entry->refcount)
331 remove_tracepoint(entry); 339 remove_tracepoint(entry);
332 return old; 340 return old;
@@ -340,12 +348,12 @@ static void *tracepoint_add_probe(const char *name, void *probe)
340 * Returns 0 if ok, error value on error. 348 * Returns 0 if ok, error value on error.
341 * The probe address must at least be aligned on the architecture pointer size. 349 * The probe address must at least be aligned on the architecture pointer size.
342 */ 350 */
343int tracepoint_probe_register(const char *name, void *probe) 351int tracepoint_probe_register(const char *name, void *probe, void *data)
344{ 352{
345 void *old; 353 struct tracepoint_func *old;
346 354
347 mutex_lock(&tracepoints_mutex); 355 mutex_lock(&tracepoints_mutex);
348 old = tracepoint_add_probe(name, probe); 356 old = tracepoint_add_probe(name, probe, data);
349 mutex_unlock(&tracepoints_mutex); 357 mutex_unlock(&tracepoints_mutex);
350 if (IS_ERR(old)) 358 if (IS_ERR(old))
351 return PTR_ERR(old); 359 return PTR_ERR(old);
@@ -356,15 +364,16 @@ int tracepoint_probe_register(const char *name, void *probe)
356} 364}
357EXPORT_SYMBOL_GPL(tracepoint_probe_register); 365EXPORT_SYMBOL_GPL(tracepoint_probe_register);
358 366
359static void *tracepoint_remove_probe(const char *name, void *probe) 367static struct tracepoint_func *
368tracepoint_remove_probe(const char *name, void *probe, void *data)
360{ 369{
361 struct tracepoint_entry *entry; 370 struct tracepoint_entry *entry;
362 void *old; 371 struct tracepoint_func *old;
363 372
364 entry = get_tracepoint(name); 373 entry = get_tracepoint(name);
365 if (!entry) 374 if (!entry)
366 return ERR_PTR(-ENOENT); 375 return ERR_PTR(-ENOENT);
367 old = tracepoint_entry_remove_probe(entry, probe); 376 old = tracepoint_entry_remove_probe(entry, probe, data);
368 if (IS_ERR(old)) 377 if (IS_ERR(old))
369 return old; 378 return old;
370 if (!entry->refcount) 379 if (!entry->refcount)
@@ -382,12 +391,12 @@ static void *tracepoint_remove_probe(const char *name, void *probe)
382 * itself uses stop_machine(), which insures that every preempt disabled section 391 * itself uses stop_machine(), which insures that every preempt disabled section
383 * have finished. 392 * have finished.
384 */ 393 */
385int tracepoint_probe_unregister(const char *name, void *probe) 394int tracepoint_probe_unregister(const char *name, void *probe, void *data)
386{ 395{
387 void *old; 396 struct tracepoint_func *old;
388 397
389 mutex_lock(&tracepoints_mutex); 398 mutex_lock(&tracepoints_mutex);
390 old = tracepoint_remove_probe(name, probe); 399 old = tracepoint_remove_probe(name, probe, data);
391 mutex_unlock(&tracepoints_mutex); 400 mutex_unlock(&tracepoints_mutex);
392 if (IS_ERR(old)) 401 if (IS_ERR(old))
393 return PTR_ERR(old); 402 return PTR_ERR(old);
@@ -418,12 +427,13 @@ static void tracepoint_add_old_probes(void *old)
418 * 427 *
419 * caller must call tracepoint_probe_update_all() 428 * caller must call tracepoint_probe_update_all()
420 */ 429 */
421int tracepoint_probe_register_noupdate(const char *name, void *probe) 430int tracepoint_probe_register_noupdate(const char *name, void *probe,
431 void *data)
422{ 432{
423 void *old; 433 struct tracepoint_func *old;
424 434
425 mutex_lock(&tracepoints_mutex); 435 mutex_lock(&tracepoints_mutex);
426 old = tracepoint_add_probe(name, probe); 436 old = tracepoint_add_probe(name, probe, data);
427 if (IS_ERR(old)) { 437 if (IS_ERR(old)) {
428 mutex_unlock(&tracepoints_mutex); 438 mutex_unlock(&tracepoints_mutex);
429 return PTR_ERR(old); 439 return PTR_ERR(old);
@@ -441,12 +451,13 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
441 * 451 *
442 * caller must call tracepoint_probe_update_all() 452 * caller must call tracepoint_probe_update_all()
443 */ 453 */
444int tracepoint_probe_unregister_noupdate(const char *name, void *probe) 454int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
455 void *data)
445{ 456{
446 void *old; 457 struct tracepoint_func *old;
447 458
448 mutex_lock(&tracepoints_mutex); 459 mutex_lock(&tracepoints_mutex);
449 old = tracepoint_remove_probe(name, probe); 460 old = tracepoint_remove_probe(name, probe, data);
450 if (IS_ERR(old)) { 461 if (IS_ERR(old)) {
451 mutex_unlock(&tracepoints_mutex); 462 mutex_unlock(&tracepoints_mutex);
452 return PTR_ERR(old); 463 return PTR_ERR(old);
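[Note] kernel/tracepoint.c now stores struct tracepoint_func pairs instead of bare function pointers: the array is still terminated by a NULL .func, and both the duplicate check and removal compare the (func, data) pair, so one function can be attached several times with different data. The call side lives in the DECLARE_TRACE machinery rather than in this file, so the dispatcher below is only a sketch of how such an array would be walked, with the registered data handed to each probe first:

/* Illustrative dispatcher over a NULL-terminated tracepoint_func array. */
typedef void (*my_probe_t)(void *data, int arg);	/* made-up probe signature */

static void my_call_probes(struct tracepoint_func *funcs, int arg)
{
	struct tracepoint_func *it;

	for (it = funcs; it && it->func; it++)
		((my_probe_t)it->func)(it->data, arg);
}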
diff --git a/kernel/user.c b/kernel/user.c
index 766467b3bcb7..7e72614b736d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,7 +16,6 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include "cred-internals.h"
20 19
21struct user_namespace init_user_ns = { 20struct user_namespace init_user_ns = {
22 .kref = { 21 .kref = {
@@ -137,9 +136,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
137 struct hlist_head *hashent = uidhashentry(ns, uid); 136 struct hlist_head *hashent = uidhashentry(ns, uid);
138 struct user_struct *up, *new; 137 struct user_struct *up, *new;
139 138
140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
141 * atomic.
142 */
143 spin_lock_irq(&uidhash_lock); 139 spin_lock_irq(&uidhash_lock);
144 up = uid_hash_find(uid, hashent); 140 up = uid_hash_find(uid, hashent);
145 spin_unlock_irq(&uidhash_lock); 141 spin_unlock_irq(&uidhash_lock);
@@ -161,11 +157,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
161 spin_lock_irq(&uidhash_lock); 157 spin_lock_irq(&uidhash_lock);
162 up = uid_hash_find(uid, hashent); 158 up = uid_hash_find(uid, hashent);
163 if (up) { 159 if (up) {
164 /* This case is not possible when CONFIG_USER_SCHED
165 * is defined, since we serialize alloc_uid() using
166 * uids_mutex. Hence no need to call
167 * sched_destroy_user() or remove_user_sysfs_dir().
168 */
169 key_put(new->uid_keyring); 160 key_put(new->uid_keyring);
170 key_put(new->session_keyring); 161 key_put(new->session_keyring);
171 kmem_cache_free(uid_cachep, new); 162 kmem_cache_free(uid_cachep, new);
@@ -178,8 +169,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
178 169
179 return up; 170 return up;
180 171
181 put_user_ns(new->user_ns);
182 kmem_cache_free(uid_cachep, new);
183out_unlock: 172out_unlock:
184 return NULL; 173 return NULL;
185} 174}
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index cf208d8042b1..ad41529fb60f 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -172,12 +172,12 @@ out:
172 return; 172 return;
173} 173}
174 174
175static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) 175static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
176{ 176{
177 trace_drop_common(skb, location); 177 trace_drop_common(skb, location);
178} 178}
179 179
180static void trace_napi_poll_hit(struct napi_struct *napi) 180static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)
181{ 181{
182 struct dm_hw_stat_delta *new_stat; 182 struct dm_hw_stat_delta *new_stat;
183 183
@@ -225,12 +225,12 @@ static int set_all_monitor_traces(int state)
225 225
226 switch (state) { 226 switch (state) {
227 case TRACE_ON: 227 case TRACE_ON:
228 rc |= register_trace_kfree_skb(trace_kfree_skb_hit); 228 rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
229 rc |= register_trace_napi_poll(trace_napi_poll_hit); 229 rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
230 break; 230 break;
231 case TRACE_OFF: 231 case TRACE_OFF:
232 rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); 232 rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
233 rc |= unregister_trace_napi_poll(trace_napi_poll_hit); 233 rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);
234 234
235 tracepoint_synchronize_unregister(); 235 tracepoint_synchronize_unregister();
236 236
diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h
index dffdc49878af..4d46be965961 100644
--- a/samples/tracepoints/tp-samples-trace.h
+++ b/samples/tracepoints/tp-samples-trace.h
@@ -7,7 +7,5 @@
7DECLARE_TRACE(subsys_event, 7DECLARE_TRACE(subsys_event,
8 TP_PROTO(struct inode *inode, struct file *file), 8 TP_PROTO(struct inode *inode, struct file *file),
9 TP_ARGS(inode, file)); 9 TP_ARGS(inode, file));
10DECLARE_TRACE(subsys_eventb, 10DECLARE_TRACE_NOARGS(subsys_eventb);
11 TP_PROTO(void),
12 TP_ARGS());
13#endif 11#endif
diff --git a/samples/tracepoints/tracepoint-probe-sample.c b/samples/tracepoints/tracepoint-probe-sample.c
index 9e60eb6ca2d8..744c0b9652a7 100644
--- a/samples/tracepoints/tracepoint-probe-sample.c
+++ b/samples/tracepoints/tracepoint-probe-sample.c
@@ -13,7 +13,8 @@
13 * Here the caller only guarantees locking for struct file and struct inode. 13 * Here the caller only guarantees locking for struct file and struct inode.
14 * Locking must therefore be done in the probe to use the dentry. 14 * Locking must therefore be done in the probe to use the dentry.
15 */ 15 */
16static void probe_subsys_event(struct inode *inode, struct file *file) 16static void probe_subsys_event(void *ignore,
17 struct inode *inode, struct file *file)
17{ 18{
18 path_get(&file->f_path); 19 path_get(&file->f_path);
19 dget(file->f_path.dentry); 20 dget(file->f_path.dentry);
@@ -23,7 +24,7 @@ static void probe_subsys_event(struct inode *inode, struct file *file)
23 path_put(&file->f_path); 24 path_put(&file->f_path);
24} 25}
25 26
26static void probe_subsys_eventb(void) 27static void probe_subsys_eventb(void *ignore)
27{ 28{
28 printk(KERN_INFO "Event B is encountered\n"); 29 printk(KERN_INFO "Event B is encountered\n");
29} 30}
@@ -32,9 +33,9 @@ static int __init tp_sample_trace_init(void)
32{ 33{
33 int ret; 34 int ret;
34 35
35 ret = register_trace_subsys_event(probe_subsys_event); 36 ret = register_trace_subsys_event(probe_subsys_event, NULL);
36 WARN_ON(ret); 37 WARN_ON(ret);
37 ret = register_trace_subsys_eventb(probe_subsys_eventb); 38 ret = register_trace_subsys_eventb(probe_subsys_eventb, NULL);
38 WARN_ON(ret); 39 WARN_ON(ret);
39 40
40 return 0; 41 return 0;
@@ -44,8 +45,8 @@ module_init(tp_sample_trace_init);
44 45
45static void __exit tp_sample_trace_exit(void) 46static void __exit tp_sample_trace_exit(void)
46{ 47{
47 unregister_trace_subsys_eventb(probe_subsys_eventb); 48 unregister_trace_subsys_eventb(probe_subsys_eventb, NULL);
48 unregister_trace_subsys_event(probe_subsys_event); 49 unregister_trace_subsys_event(probe_subsys_event, NULL);
49 tracepoint_synchronize_unregister(); 50 tracepoint_synchronize_unregister();
50} 51}
51 52
diff --git a/samples/tracepoints/tracepoint-probe-sample2.c b/samples/tracepoints/tracepoint-probe-sample2.c
index be2a960573f1..9fcf990e5d4b 100644
--- a/samples/tracepoints/tracepoint-probe-sample2.c
+++ b/samples/tracepoints/tracepoint-probe-sample2.c
@@ -12,7 +12,8 @@
12 * Here the caller only guarantees locking for struct file and struct inode. 12 * Here the caller only guarantees locking for struct file and struct inode.
13 * Locking must therefore be done in the probe to use the dentry. 13 * Locking must therefore be done in the probe to use the dentry.
14 */ 14 */
15static void probe_subsys_event(struct inode *inode, struct file *file) 15static void probe_subsys_event(void *ignore,
16 struct inode *inode, struct file *file)
16{ 17{
17 printk(KERN_INFO "Event is encountered with inode number %lu\n", 18 printk(KERN_INFO "Event is encountered with inode number %lu\n",
18 inode->i_ino); 19 inode->i_ino);
@@ -22,7 +23,7 @@ static int __init tp_sample_trace_init(void)
22{ 23{
23 int ret; 24 int ret;
24 25
25 ret = register_trace_subsys_event(probe_subsys_event); 26 ret = register_trace_subsys_event(probe_subsys_event, NULL);
26 WARN_ON(ret); 27 WARN_ON(ret);
27 28
28 return 0; 29 return 0;
@@ -32,7 +33,7 @@ module_init(tp_sample_trace_init);
32 33
33static void __exit tp_sample_trace_exit(void) 34static void __exit tp_sample_trace_exit(void)
34{ 35{
35 unregister_trace_subsys_event(probe_subsys_event); 36 unregister_trace_subsys_event(probe_subsys_event, NULL);
36 tracepoint_synchronize_unregister(); 37 tracepoint_synchronize_unregister();
37} 38}
38 39
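[Note] Both samples pass NULL as the new data argument, so their probes ignore the first parameter. Since kernel/tracepoint.c now keys registrations on the (probe, data) pair, one probe body can be attached more than once and told apart by its cookie; a closing sketch along those lines, with the tags and names being illustrative:

#include <linux/module.h>
#include <linux/fs.h>			/* struct inode, struct file */
#include "tp-samples-trace.h"

static char my_tag_a[] = "first";
static char my_tag_b[] = "second";

/* One probe body, registered twice with different cookies. */
static void probe_subsys_event_tagged(void *data,
				      struct inode *inode, struct file *file)
{
	printk(KERN_INFO "%s: event on inode %lu\n",
	       (char *)data, inode->i_ino);
}

static int __init my_tp_init(void)
{
	int ret;

	ret = register_trace_subsys_event(probe_subsys_event_tagged, my_tag_a);
	WARN_ON(ret);
	ret = register_trace_subsys_event(probe_subsys_event_tagged, my_tag_b);
	WARN_ON(ret);
	return 0;
}

Teardown would pass the same data pointer for each instance to unregister_trace_subsys_event() and then call tracepoint_synchronize_unregister(), as the samples above do.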