aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile8
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/async.c106
-rw-r--r--kernel/audit.c9
-rw-r--r--kernel/audit_tree.c2
-rw-r--r--kernel/auditfilter.c16
-rw-r--r--kernel/auditsc.c34
-rw-r--r--kernel/capability.c4
-rw-r--r--kernel/cgroup.c468
-rw-r--r--kernel/cgroup_debug.c2
-rw-r--r--kernel/cpu.c6
-rw-r--r--kernel/cpuset.c269
-rw-r--r--kernel/dma-coherent.c47
-rw-r--r--kernel/exec_domain.c26
-rw-r--r--kernel/exit.c275
-rw-r--r--kernel/extable.c48
-rw-r--r--kernel/fork.c122
-rw-r--r--kernel/futex.c269
-rw-r--r--kernel/hrtimer.c104
-rw-r--r--kernel/hung_task.c217
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/chip.c14
-rw-r--r--kernel/irq/devres.c16
-rw-r--r--kernel/irq/handle.c165
-rw-r--r--kernel/irq/internals.h10
-rw-r--r--kernel/irq/manage.c432
-rw-r--r--kernel/irq/migration.c12
-rw-r--r--kernel/irq/numa_migrate.c37
-rw-r--r--kernel/irq/pm.c79
-rw-r--r--kernel/irq/proc.c4
-rw-r--r--kernel/irq/spurious.c14
-rw-r--r--kernel/itimer.c11
-rw-r--r--kernel/kallsyms.c35
-rw-r--r--kernel/kexec.c28
-rw-r--r--kernel/kmod.c12
-rw-r--r--kernel/kprobes.c300
-rw-r--r--kernel/kthread.c4
-rw-r--r--kernel/latencytop.c83
-rw-r--r--kernel/lockdep.c560
-rw-r--r--kernel/lockdep_internals.h45
-rw-r--r--kernel/lockdep_proc.c22
-rw-r--r--kernel/lockdep_states.h9
-rw-r--r--kernel/module.c403
-rw-r--r--kernel/mutex-debug.c9
-rw-r--r--kernel/mutex-debug.h18
-rw-r--r--kernel/mutex.c121
-rw-r--r--kernel/mutex.h22
-rw-r--r--kernel/ns_cgroup.c14
-rw-r--r--kernel/panic.c123
-rw-r--r--kernel/params.c26
-rw-r--r--kernel/pid.c33
-rw-r--r--kernel/pid_namespace.c15
-rw-r--r--kernel/posix-cpu-timers.c196
-rw-r--r--kernel/posix-timers.c43
-rw-r--r--kernel/power/Makefile5
-rw-r--r--kernel/power/console.c6
-rw-r--r--kernel/power/disk.c165
-rw-r--r--kernel/power/main.c89
-rw-r--r--kernel/power/snapshot.c9
-rw-r--r--kernel/power/swap.c5
-rw-r--r--kernel/power/swsusp.c18
-rw-r--r--kernel/power/user.c8
-rw-r--r--kernel/printk.c48
-rw-r--r--kernel/profile.c3
-rw-r--r--kernel/ptrace.c105
-rw-r--r--kernel/rcuclassic.c29
-rw-r--r--kernel/rcupdate.c56
-rw-r--r--kernel/rcupreempt.c51
-rw-r--r--kernel/rcutorture.c25
-rw-r--r--kernel/rcutree.c26
-rw-r--r--kernel/rcutree.h10
-rw-r--r--kernel/rcutree_trace.c2
-rw-r--r--kernel/relay.c18
-rw-r--r--kernel/resource.c1
-rw-r--r--kernel/sched.c1249
-rw-r--r--kernel/sched_clock.c38
-rw-r--r--kernel/sched_cpupri.h2
-rw-r--r--kernel/sched_debug.c8
-rw-r--r--kernel/sched_fair.c117
-rw-r--r--kernel/sched_features.h4
-rw-r--r--kernel/sched_rt.c573
-rw-r--r--kernel/sched_stats.h55
-rw-r--r--kernel/seccomp.c7
-rw-r--r--kernel/signal.c140
-rw-r--r--kernel/slow-work.c640
-rw-r--r--kernel/smp.c408
-rw-r--r--kernel/softirq.c42
-rw-r--r--kernel/softlockup.c109
-rw-r--r--kernel/spinlock.c18
-rw-r--r--kernel/stop_machine.c4
-rw-r--r--kernel/sys.c122
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c86
-rw-r--r--kernel/sysctl_check.c1
-rw-r--r--kernel/time.c14
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/clockevents.c20
-rw-r--r--kernel/time/clocksource.c76
-rw-r--r--kernel/time/ntp.c444
-rw-r--r--kernel/time/tick-common.c26
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/timecompare.c191
-rw-r--r--kernel/timer.c196
-rw-r--r--kernel/trace/Kconfig155
-rw-r--r--kernel/trace/Makefile13
-rw-r--r--kernel/trace/blktrace.c1550
-rw-r--r--kernel/trace/events.c14
-rw-r--r--kernel/trace/ftrace.c1171
-rw-r--r--kernel/trace/kmemtrace.c464
-rw-r--r--kernel/trace/ring_buffer.c706
-rw-r--r--kernel/trace/trace.c3058
-rw-r--r--kernel/trace/trace.h321
-rw-r--r--kernel/trace/trace_boot.c36
-rw-r--r--kernel/trace/trace_branch.c278
-rw-r--r--kernel/trace/trace_clock.c109
-rw-r--r--kernel/trace/trace_event_profile.c31
-rw-r--r--kernel/trace/trace_event_types.h173
-rw-r--r--kernel/trace/trace_events.c824
-rw-r--r--kernel/trace/trace_events_filter.c427
-rw-r--r--kernel/trace/trace_events_stage_1.h39
-rw-r--r--kernel/trace/trace_events_stage_2.h176
-rw-r--r--kernel/trace/trace_events_stage_3.h281
-rw-r--r--kernel/trace/trace_export.c102
-rw-r--r--kernel/trace/trace_functions.c369
-rw-r--r--kernel/trace/trace_functions_graph.c635
-rw-r--r--kernel/trace/trace_hw_branches.c185
-rw-r--r--kernel/trace/trace_irqsoff.c55
-rw-r--r--kernel/trace/trace_mmiotrace.c59
-rw-r--r--kernel/trace/trace_nop.c6
-rw-r--r--kernel/trace/trace_output.c1017
-rw-r--r--kernel/trace/trace_output.h71
-rw-r--r--kernel/trace/trace_power.c194
-rw-r--r--kernel/trace/trace_printk.c270
-rw-r--r--kernel/trace/trace_sched_switch.c27
-rw-r--r--kernel/trace/trace_sched_wakeup.c103
-rw-r--r--kernel/trace/trace_selftest.c188
-rw-r--r--kernel/trace/trace_stack.c19
-rw-r--r--kernel/trace/trace_stat.c326
-rw-r--r--kernel/trace/trace_stat.h31
-rw-r--r--kernel/trace/trace_syscalls.c250
-rw-r--r--kernel/trace/trace_sysprof.c23
-rw-r--r--kernel/trace/trace_workqueue.c288
-rw-r--r--kernel/tracepoint.c7
-rw-r--r--kernel/tsacct.c6
-rw-r--r--kernel/uid16.c39
-rw-r--r--kernel/up.c1
-rw-r--r--kernel/user.c37
-rw-r--r--kernel/user_namespace.c21
-rw-r--r--kernel/utsname_sysctl.c2
-rw-r--r--kernel/wait.c59
-rw-r--r--kernel/workqueue.c83
151 files changed, 18776 insertions, 5747 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 2aebc4cd7878..42423665660a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -40,9 +40,8 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
40obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 40obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
41obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o 41obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
42obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 42obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
43ifeq ($(CONFIG_USE_GENERIC_SMP_HELPERS),y) 43obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
44obj-y += smp.o 44ifneq ($(CONFIG_SMP),y)
45else
46obj-y += up.o 45obj-y += up.o
47endif 46endif
48obj-$(CONFIG_SMP) += spinlock.o 47obj-$(CONFIG_SMP) += spinlock.o
@@ -52,6 +51,7 @@ obj-$(CONFIG_UID16) += uid16.o
52obj-$(CONFIG_MODULES) += module.o 51obj-$(CONFIG_MODULES) += module.o
53obj-$(CONFIG_KALLSYMS) += kallsyms.o 52obj-$(CONFIG_KALLSYMS) += kallsyms.o
54obj-$(CONFIG_PM) += power/ 53obj-$(CONFIG_PM) += power/
54obj-$(CONFIG_FREEZER) += power/
55obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 55obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
56obj-$(CONFIG_KEXEC) += kexec.o 56obj-$(CONFIG_KEXEC) += kexec.o
57obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 57obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
@@ -74,6 +74,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
74obj-$(CONFIG_KPROBES) += kprobes.o 74obj-$(CONFIG_KPROBES) += kprobes.o
75obj-$(CONFIG_KGDB) += kgdb.o 75obj-$(CONFIG_KGDB) += kgdb.o
76obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o 76obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
77obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
77obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 78obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
78obj-$(CONFIG_SECCOMP) += seccomp.o 79obj-$(CONFIG_SECCOMP) += seccomp.o
79obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 80obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
@@ -93,6 +94,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
93obj-$(CONFIG_FUNCTION_TRACER) += trace/ 94obj-$(CONFIG_FUNCTION_TRACER) += trace/
94obj-$(CONFIG_TRACING) += trace/ 95obj-$(CONFIG_TRACING) += trace/
95obj-$(CONFIG_SMP) += sched_cpupri.o 96obj-$(CONFIG_SMP) += sched_cpupri.o
97obj-$(CONFIG_SLOW_WORK) += slow-work.o
96 98
97ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 99ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
98# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 100# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index d57b7cbb98b6..7afa31564162 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -277,7 +277,7 @@ static int acct_on(char *name)
277 * should be written. If the filename is NULL, accounting will be 277 * should be written. If the filename is NULL, accounting will be
278 * shutdown. 278 * shutdown.
279 */ 279 */
280asmlinkage long sys_acct(const char __user *name) 280SYSCALL_DEFINE1(acct, const char __user *, name)
281{ 281{
282 int error; 282 int error;
283 283
diff --git a/kernel/async.c b/kernel/async.c
index 608b32b42812..968ef9457d4e 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,11 +49,13 @@ asynchronous and synchronous parts of the kernel.
49*/ 49*/
50 50
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/bug.h>
52#include <linux/module.h> 53#include <linux/module.h>
53#include <linux/wait.h> 54#include <linux/wait.h>
54#include <linux/sched.h> 55#include <linux/sched.h>
55#include <linux/init.h> 56#include <linux/init.h>
56#include <linux/kthread.h> 57#include <linux/kthread.h>
58#include <linux/delay.h>
57#include <asm/atomic.h> 59#include <asm/atomic.h>
58 60
59static async_cookie_t next_cookie = 1; 61static async_cookie_t next_cookie = 1;
@@ -132,21 +134,23 @@ static void run_one_entry(void)
132 entry = list_first_entry(&async_pending, struct async_entry, list); 134 entry = list_first_entry(&async_pending, struct async_entry, list);
133 135
134 /* 2) move it to the running queue */ 136 /* 2) move it to the running queue */
135 list_del(&entry->list); 137 list_move_tail(&entry->list, entry->running);
136 list_add_tail(&entry->list, &async_running);
137 spin_unlock_irqrestore(&async_lock, flags); 138 spin_unlock_irqrestore(&async_lock, flags);
138 139
139 /* 3) run it (and print duration)*/ 140 /* 3) run it (and print duration)*/
140 if (initcall_debug && system_state == SYSTEM_BOOTING) { 141 if (initcall_debug && system_state == SYSTEM_BOOTING) {
141 printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current)); 142 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
143 entry->func, task_pid_nr(current));
142 calltime = ktime_get(); 144 calltime = ktime_get();
143 } 145 }
144 entry->func(entry->data, entry->cookie); 146 entry->func(entry->data, entry->cookie);
145 if (initcall_debug && system_state == SYSTEM_BOOTING) { 147 if (initcall_debug && system_state == SYSTEM_BOOTING) {
146 rettime = ktime_get(); 148 rettime = ktime_get();
147 delta = ktime_sub(rettime, calltime); 149 delta = ktime_sub(rettime, calltime);
148 printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie, 150 printk("initcall %lli_%pF returned 0 after %lld usecs\n",
149 entry->func, ktime_to_ns(delta) >> 10); 151 (long long)entry->cookie,
152 entry->func,
153 (long long)ktime_to_ns(delta) >> 10);
150 } 154 }
151 155
152 /* 4) remove it from the running queue */ 156 /* 4) remove it from the running queue */
@@ -205,18 +209,44 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
205 return newcookie; 209 return newcookie;
206} 210}
207 211
212/**
213 * async_schedule - schedule a function for asynchronous execution
214 * @ptr: function to execute asynchronously
215 * @data: data pointer to pass to the function
216 *
217 * Returns an async_cookie_t that may be used for checkpointing later.
218 * Note: This function may be called from atomic or non-atomic contexts.
219 */
208async_cookie_t async_schedule(async_func_ptr *ptr, void *data) 220async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
209{ 221{
210 return __async_schedule(ptr, data, &async_pending); 222 return __async_schedule(ptr, data, &async_running);
211} 223}
212EXPORT_SYMBOL_GPL(async_schedule); 224EXPORT_SYMBOL_GPL(async_schedule);
213 225
214async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running) 226/**
227 * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
228 * @ptr: function to execute asynchronously
229 * @data: data pointer to pass to the function
230 * @running: running list for the domain
231 *
232 * Returns an async_cookie_t that may be used for checkpointing later.
233 * @running may be used in the async_synchronize_*_domain() functions
234 * to wait within a certain synchronization domain rather than globally.
235 * A synchronization domain is specified via the running queue @running to use.
236 * Note: This function may be called from atomic or non-atomic contexts.
237 */
238async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
239 struct list_head *running)
215{ 240{
216 return __async_schedule(ptr, data, running); 241 return __async_schedule(ptr, data, running);
217} 242}
218EXPORT_SYMBOL_GPL(async_schedule_special); 243EXPORT_SYMBOL_GPL(async_schedule_domain);
219 244
245/**
246 * async_synchronize_full - synchronize all asynchronous function calls
247 *
248 * This function waits until all asynchronous function calls have been done.
249 */
220void async_synchronize_full(void) 250void async_synchronize_full(void)
221{ 251{
222 do { 252 do {
@@ -225,13 +255,30 @@ void async_synchronize_full(void)
225} 255}
226EXPORT_SYMBOL_GPL(async_synchronize_full); 256EXPORT_SYMBOL_GPL(async_synchronize_full);
227 257
228void async_synchronize_full_special(struct list_head *list) 258/**
259 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
260 * @list: running list to synchronize on
261 *
262 * This function waits until all asynchronous function calls for the
263 * synchronization domain specified by the running list @list have been done.
264 */
265void async_synchronize_full_domain(struct list_head *list)
229{ 266{
230 async_synchronize_cookie_special(next_cookie, list); 267 async_synchronize_cookie_domain(next_cookie, list);
231} 268}
232EXPORT_SYMBOL_GPL(async_synchronize_full_special); 269EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
233 270
234void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running) 271/**
272 * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
273 * @cookie: async_cookie_t to use as checkpoint
274 * @running: running list to synchronize on
275 *
276 * This function waits until all asynchronous function calls for the
277 * synchronization domain specified by the running list @list submitted
278 * prior to @cookie have been done.
279 */
280void async_synchronize_cookie_domain(async_cookie_t cookie,
281 struct list_head *running)
235{ 282{
236 ktime_t starttime, delta, endtime; 283 ktime_t starttime, delta, endtime;
237 284
@@ -247,14 +294,22 @@ void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *r
247 delta = ktime_sub(endtime, starttime); 294 delta = ktime_sub(endtime, starttime);
248 295
249 printk("async_continuing @ %i after %lli usec\n", 296 printk("async_continuing @ %i after %lli usec\n",
250 task_pid_nr(current), ktime_to_ns(delta) >> 10); 297 task_pid_nr(current),
298 (long long)ktime_to_ns(delta) >> 10);
251 } 299 }
252} 300}
253EXPORT_SYMBOL_GPL(async_synchronize_cookie_special); 301EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
254 302
303/**
304 * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing
305 * @cookie: async_cookie_t to use as checkpoint
306 *
307 * This function waits until all asynchronous function calls prior to @cookie
308 * have been done.
309 */
255void async_synchronize_cookie(async_cookie_t cookie) 310void async_synchronize_cookie(async_cookie_t cookie)
256{ 311{
257 async_synchronize_cookie_special(cookie, &async_running); 312 async_synchronize_cookie_domain(cookie, &async_running);
258} 313}
259EXPORT_SYMBOL_GPL(async_synchronize_cookie); 314EXPORT_SYMBOL_GPL(async_synchronize_cookie);
260 315
@@ -315,7 +370,11 @@ static int async_manager_thread(void *unused)
315 ec = atomic_read(&entry_count); 370 ec = atomic_read(&entry_count);
316 371
317 while (tc < ec && tc < MAX_THREADS) { 372 while (tc < ec && tc < MAX_THREADS) {
318 kthread_run(async_thread, NULL, "async/%i", tc); 373 if (IS_ERR(kthread_run(async_thread, NULL, "async/%i",
374 tc))) {
375 msleep(100);
376 continue;
377 }
319 atomic_inc(&thread_count); 378 atomic_inc(&thread_count);
320 tc++; 379 tc++;
321 } 380 }
@@ -329,18 +388,11 @@ static int async_manager_thread(void *unused)
329 388
330static int __init async_init(void) 389static int __init async_init(void)
331{ 390{
332 if (async_enabled) 391 async_enabled =
333 kthread_run(async_manager_thread, NULL, "async/mgr"); 392 !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr"));
334 return 0;
335}
336 393
337static int __init setup_async(char *str) 394 WARN_ON(!async_enabled);
338{ 395 return 0;
339 async_enabled = 1;
340 return 1;
341} 396}
342 397
343__setup("fastboot", setup_async);
344
345
346core_initcall(async_init); 398core_initcall(async_init);
diff --git a/kernel/audit.c b/kernel/audit.c
index ce6d8ea3131e..9442c3533ba9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -766,6 +766,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
766 766
767 audit_log_format(ab, " msg="); 767 audit_log_format(ab, " msg=");
768 size = nlmsg_len(nlh); 768 size = nlmsg_len(nlh);
769 if (size > 0 &&
770 ((unsigned char *)data)[size - 1] == '\0')
771 size--;
769 audit_log_n_untrustedstring(ab, data, size); 772 audit_log_n_untrustedstring(ab, data, size);
770 } 773 }
771 audit_set_pid(ab, pid); 774 audit_set_pid(ab, pid);
@@ -1382,7 +1385,7 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string,
1382int audit_string_contains_control(const char *string, size_t len) 1385int audit_string_contains_control(const char *string, size_t len)
1383{ 1386{
1384 const unsigned char *p; 1387 const unsigned char *p;
1385 for (p = string; p < (const unsigned char *)string + len && *p; p++) { 1388 for (p = string; p < (const unsigned char *)string + len; p++) {
1386 if (*p == '"' || *p < 0x21 || *p > 0x7e) 1389 if (*p == '"' || *p < 0x21 || *p > 0x7e)
1387 return 1; 1390 return 1;
1388 } 1391 }
@@ -1437,13 +1440,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1437 /* We will allow 11 spaces for ' (deleted)' to be appended */ 1440 /* We will allow 11 spaces for ' (deleted)' to be appended */
1438 pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); 1441 pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
1439 if (!pathname) { 1442 if (!pathname) {
1440 audit_log_format(ab, "<no memory>"); 1443 audit_log_string(ab, "<no_memory>");
1441 return; 1444 return;
1442 } 1445 }
1443 p = d_path(path, pathname, PATH_MAX+11); 1446 p = d_path(path, pathname, PATH_MAX+11);
1444 if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ 1447 if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
1445 /* FIXME: can we save some information here? */ 1448 /* FIXME: can we save some information here? */
1446 audit_log_format(ab, "<too long>"); 1449 audit_log_string(ab, "<too_long>");
1447 } else 1450 } else
1448 audit_log_untrustedstring(ab, p); 1451 audit_log_untrustedstring(ab, p);
1449 kfree(pathname); 1452 kfree(pathname);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8ad9545b8db9..917ab9525568 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -385,6 +385,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
385 mutex_lock(&inode->inotify_mutex); 385 mutex_lock(&inode->inotify_mutex);
386 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { 386 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
387 mutex_unlock(&inode->inotify_mutex); 387 mutex_unlock(&inode->inotify_mutex);
388 put_inotify_watch(&old->watch);
388 free_chunk(chunk); 389 free_chunk(chunk);
389 return -ENOSPC; 390 return -ENOSPC;
390 } 391 }
@@ -394,6 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
394 chunk->dead = 1; 395 chunk->dead = 1;
395 inotify_evict_watch(&chunk->watch); 396 inotify_evict_watch(&chunk->watch);
396 mutex_unlock(&inode->inotify_mutex); 397 mutex_unlock(&inode->inotify_mutex);
398 put_inotify_watch(&old->watch);
397 put_inotify_watch(&chunk->watch); 399 put_inotify_watch(&chunk->watch);
398 return 0; 400 return 0;
399 } 401 }
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index fbf24d121d97..a6fe71fd5d1b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -135,18 +135,18 @@ static void audit_remove_watch(struct audit_watch *watch)
135static inline void audit_free_rule(struct audit_entry *e) 135static inline void audit_free_rule(struct audit_entry *e)
136{ 136{
137 int i; 137 int i;
138 138 struct audit_krule *erule = &e->rule;
139 /* some rules don't have associated watches */ 139 /* some rules don't have associated watches */
140 if (e->rule.watch) 140 if (erule->watch)
141 audit_put_watch(e->rule.watch); 141 audit_put_watch(erule->watch);
142 if (e->rule.fields) 142 if (erule->fields)
143 for (i = 0; i < e->rule.field_count; i++) { 143 for (i = 0; i < erule->field_count; i++) {
144 struct audit_field *f = &e->rule.fields[i]; 144 struct audit_field *f = &erule->fields[i];
145 kfree(f->lsm_str); 145 kfree(f->lsm_str);
146 security_audit_rule_free(f->lsm_rule); 146 security_audit_rule_free(f->lsm_rule);
147 } 147 }
148 kfree(e->rule.fields); 148 kfree(erule->fields);
149 kfree(e->rule.filterkey); 149 kfree(erule->filterkey);
150 kfree(e); 150 kfree(e);
151} 151}
152 152
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 8cbddff6c283..7d6ac7c1f414 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -66,6 +66,7 @@
66#include <linux/syscalls.h> 66#include <linux/syscalls.h>
67#include <linux/inotify.h> 67#include <linux/inotify.h>
68#include <linux/capability.h> 68#include <linux/capability.h>
69#include <linux/fs_struct.h>
69 70
70#include "audit.h" 71#include "audit.h"
71 72
@@ -328,6 +329,14 @@ static int audit_match_filetype(struct audit_context *ctx, int which)
328 */ 329 */
329 330
330#ifdef CONFIG_AUDIT_TREE 331#ifdef CONFIG_AUDIT_TREE
332static void audit_set_auditable(struct audit_context *ctx)
333{
334 if (!ctx->prio) {
335 ctx->prio = 1;
336 ctx->current_state = AUDIT_RECORD_CONTEXT;
337 }
338}
339
331static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk) 340static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
332{ 341{
333 struct audit_tree_refs *p = ctx->trees; 342 struct audit_tree_refs *p = ctx->trees;
@@ -741,17 +750,9 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
741 rcu_read_unlock(); 750 rcu_read_unlock();
742} 751}
743 752
744static void audit_set_auditable(struct audit_context *ctx)
745{
746 if (!ctx->prio) {
747 ctx->prio = 1;
748 ctx->current_state = AUDIT_RECORD_CONTEXT;
749 }
750}
751
752static inline struct audit_context *audit_get_context(struct task_struct *tsk, 753static inline struct audit_context *audit_get_context(struct task_struct *tsk,
753 int return_valid, 754 int return_valid,
754 int return_code) 755 long return_code)
755{ 756{
756 struct audit_context *context = tsk->audit_context; 757 struct audit_context *context = tsk->audit_context;
757 758
@@ -1023,7 +1024,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1023{ 1024{
1024 char arg_num_len_buf[12]; 1025 char arg_num_len_buf[12];
1025 const char __user *tmp_p = p; 1026 const char __user *tmp_p = p;
1026 /* how many digits are in arg_num? 3 is the length of a=\n */ 1027 /* how many digits are in arg_num? 3 is the length of " a=" */
1027 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; 1028 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3;
1028 size_t len, len_left, to_send; 1029 size_t len, len_left, to_send;
1029 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; 1030 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
@@ -1109,7 +1110,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1109 * so we can be sure nothing was lost. 1110 * so we can be sure nothing was lost.
1110 */ 1111 */
1111 if ((i == 0) && (too_long)) 1112 if ((i == 0) && (too_long))
1112 audit_log_format(*ab, "a%d_len=%zu ", arg_num, 1113 audit_log_format(*ab, " a%d_len=%zu", arg_num,
1113 has_cntl ? 2*len : len); 1114 has_cntl ? 2*len : len);
1114 1115
1115 /* 1116 /*
@@ -1129,7 +1130,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1129 buf[to_send] = '\0'; 1130 buf[to_send] = '\0';
1130 1131
1131 /* actually log it */ 1132 /* actually log it */
1132 audit_log_format(*ab, "a%d", arg_num); 1133 audit_log_format(*ab, " a%d", arg_num);
1133 if (too_long) 1134 if (too_long)
1134 audit_log_format(*ab, "[%d]", i); 1135 audit_log_format(*ab, "[%d]", i);
1135 audit_log_format(*ab, "="); 1136 audit_log_format(*ab, "=");
@@ -1137,7 +1138,6 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1137 audit_log_n_hex(*ab, buf, to_send); 1138 audit_log_n_hex(*ab, buf, to_send);
1138 else 1139 else
1139 audit_log_format(*ab, "\"%s\"", buf); 1140 audit_log_format(*ab, "\"%s\"", buf);
1140 audit_log_format(*ab, "\n");
1141 1141
1142 p += to_send; 1142 p += to_send;
1143 len_left -= to_send; 1143 len_left -= to_send;
@@ -1165,7 +1165,7 @@ static void audit_log_execve_info(struct audit_context *context,
1165 1165
1166 p = (const char __user *)axi->mm->arg_start; 1166 p = (const char __user *)axi->mm->arg_start;
1167 1167
1168 audit_log_format(*ab, "argc=%d ", axi->argc); 1168 audit_log_format(*ab, "argc=%d", axi->argc);
1169 1169
1170 /* 1170 /*
1171 * we need some kernel buffer to hold the userspace args. Just 1171 * we need some kernel buffer to hold the userspace args. Just
@@ -1478,7 +1478,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1478 case 0: 1478 case 0:
1479 /* name was specified as a relative path and the 1479 /* name was specified as a relative path and the
1480 * directory component is the cwd */ 1480 * directory component is the cwd */
1481 audit_log_d_path(ab, " name=", &context->pwd); 1481 audit_log_d_path(ab, "name=", &context->pwd);
1482 break; 1482 break;
1483 default: 1483 default:
1484 /* log the name's directory component */ 1484 /* log the name's directory component */
@@ -2149,7 +2149,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
2149 * __audit_mq_open - record audit data for a POSIX MQ open 2149 * __audit_mq_open - record audit data for a POSIX MQ open
2150 * @oflag: open flag 2150 * @oflag: open flag
2151 * @mode: mode bits 2151 * @mode: mode bits
2152 * @u_attr: queue attributes 2152 * @attr: queue attributes
2153 * 2153 *
2154 */ 2154 */
2155void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) 2155void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
@@ -2196,7 +2196,7 @@ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
2196/** 2196/**
2197 * __audit_mq_notify - record audit data for a POSIX MQ notify 2197 * __audit_mq_notify - record audit data for a POSIX MQ notify
2198 * @mqdes: MQ descriptor 2198 * @mqdes: MQ descriptor
2199 * @u_notification: Notification event 2199 * @notification: Notification event
2200 * 2200 *
2201 */ 2201 */
2202 2202
diff --git a/kernel/capability.c b/kernel/capability.c
index 688926e496be..4e17041963f5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -161,7 +161,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
161 * 161 *
162 * Returns 0 on success and < 0 on error. 162 * Returns 0 on success and < 0 on error.
163 */ 163 */
164asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) 164SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
165{ 165{
166 int ret = 0; 166 int ret = 0;
167 pid_t pid; 167 pid_t pid;
@@ -235,7 +235,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
235 * 235 *
236 * Returns 0 on success and < 0 on error. 236 * Returns 0 on success and < 0 on error.
237 */ 237 */
238asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) 238SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
239{ 239{
240 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 240 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
241 unsigned i, tocopy; 241 unsigned i, tocopy;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c29831076e7a..382109b5baeb 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -94,7 +94,6 @@ struct cgroupfs_root {
94 char release_agent_path[PATH_MAX]; 94 char release_agent_path[PATH_MAX];
95}; 95};
96 96
97
98/* 97/*
99 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 98 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
100 * subsystems that are otherwise unattached - it never has more than a 99 * subsystems that are otherwise unattached - it never has more than a
@@ -102,6 +101,39 @@ struct cgroupfs_root {
102 */ 101 */
103static struct cgroupfs_root rootnode; 102static struct cgroupfs_root rootnode;
104 103
104/*
105 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
106 * cgroup_subsys->use_id != 0.
107 */
108#define CSS_ID_MAX (65535)
109struct css_id {
110 /*
111 * The css to which this ID points. This pointer is set to valid value
112 * after cgroup is populated. If cgroup is removed, this will be NULL.
113 * This pointer is expected to be RCU-safe because destroy()
114 * is called after synchronize_rcu(). But for safe use, css_is_removed()
115 * css_tryget() should be used for avoiding race.
116 */
117 struct cgroup_subsys_state *css;
118 /*
119 * ID of this css.
120 */
121 unsigned short id;
122 /*
123 * Depth in hierarchy which this ID belongs to.
124 */
125 unsigned short depth;
126 /*
127 * ID is freed by RCU. (and lookup routine is RCU safe.)
128 */
129 struct rcu_head rcu_head;
130 /*
131 * Hierarchy of CSS ID belongs to.
132 */
133 unsigned short stack[0]; /* Array of Length (depth+1) */
134};
135
136
105/* The list of hierarchy roots */ 137/* The list of hierarchy roots */
106 138
107static LIST_HEAD(roots); 139static LIST_HEAD(roots);
@@ -185,6 +217,8 @@ struct cg_cgroup_link {
185static struct css_set init_css_set; 217static struct css_set init_css_set;
186static struct cg_cgroup_link init_css_set_link; 218static struct cg_cgroup_link init_css_set_link;
187 219
220static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
221
188/* css_set_lock protects the list of css_set objects, and the 222/* css_set_lock protects the list of css_set objects, and the
189 * chain of tasks off each css_set. Nests outside task->alloc_lock 223 * chain of tasks off each css_set. Nests outside task->alloc_lock
190 * due to cgroup_iter_start() */ 224 * due to cgroup_iter_start() */
@@ -567,6 +601,9 @@ static struct backing_dev_info cgroup_backing_dev_info = {
567 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 601 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
568}; 602};
569 603
604static int alloc_css_id(struct cgroup_subsys *ss,
605 struct cgroup *parent, struct cgroup *child);
606
570static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 607static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
571{ 608{
572 struct inode *inode = new_inode(sb); 609 struct inode *inode = new_inode(sb);
@@ -585,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
585 * Call subsys's pre_destroy handler. 622 * Call subsys's pre_destroy handler.
586 * This is called before css refcnt check. 623 * This is called before css refcnt check.
587 */ 624 */
588static void cgroup_call_pre_destroy(struct cgroup *cgrp) 625static int cgroup_call_pre_destroy(struct cgroup *cgrp)
589{ 626{
590 struct cgroup_subsys *ss; 627 struct cgroup_subsys *ss;
628 int ret = 0;
629
591 for_each_subsys(cgrp->root, ss) 630 for_each_subsys(cgrp->root, ss)
592 if (ss->pre_destroy) 631 if (ss->pre_destroy) {
593 ss->pre_destroy(ss, cgrp); 632 ret = ss->pre_destroy(ss, cgrp);
594 return; 633 if (ret)
634 break;
635 }
636 return ret;
595} 637}
596 638
597static void free_cgroup_rcu(struct rcu_head *obj) 639static void free_cgroup_rcu(struct rcu_head *obj)
@@ -685,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
685 remove_dir(dentry); 727 remove_dir(dentry);
686} 728}
687 729
730/*
731 * A queue for tasks waiting to rmdir() a cgroup. A task will sleep when
732 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
733 * reference to css->refcnt. In general, this refcnt is expected to go down
734 * to zero soon.
735 *
736 * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
737 */
738DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
739
740static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
741{
742 if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
743 wake_up_all(&cgroup_rmdir_waitq);
744}
745
688static int rebind_subsystems(struct cgroupfs_root *root, 746static int rebind_subsystems(struct cgroupfs_root *root,
689 unsigned long final_bits) 747 unsigned long final_bits)
690{ 748{
@@ -857,16 +915,16 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
857 } 915 }
858 916
859 ret = rebind_subsystems(root, opts.subsys_bits); 917 ret = rebind_subsystems(root, opts.subsys_bits);
918 if (ret)
919 goto out_unlock;
860 920
861 /* (re)populate subsystem files */ 921 /* (re)populate subsystem files */
862 if (!ret) 922 cgroup_populate_dir(cgrp);
863 cgroup_populate_dir(cgrp);
864 923
865 if (opts.release_agent) 924 if (opts.release_agent)
866 strcpy(root->release_agent_path, opts.release_agent); 925 strcpy(root->release_agent_path, opts.release_agent);
867 out_unlock: 926 out_unlock:
868 if (opts.release_agent) 927 kfree(opts.release_agent);
869 kfree(opts.release_agent);
870 mutex_unlock(&cgroup_mutex); 928 mutex_unlock(&cgroup_mutex);
871 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 929 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
872 return ret; 930 return ret;
@@ -969,15 +1027,13 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
969 /* First find the desired set of subsystems */ 1027 /* First find the desired set of subsystems */
970 ret = parse_cgroupfs_options(data, &opts); 1028 ret = parse_cgroupfs_options(data, &opts);
971 if (ret) { 1029 if (ret) {
972 if (opts.release_agent) 1030 kfree(opts.release_agent);
973 kfree(opts.release_agent);
974 return ret; 1031 return ret;
975 } 1032 }
976 1033
977 root = kzalloc(sizeof(*root), GFP_KERNEL); 1034 root = kzalloc(sizeof(*root), GFP_KERNEL);
978 if (!root) { 1035 if (!root) {
979 if (opts.release_agent) 1036 kfree(opts.release_agent);
980 kfree(opts.release_agent);
981 return -ENOMEM; 1037 return -ENOMEM;
982 } 1038 }
983 1039
@@ -1071,7 +1127,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1071 mutex_unlock(&cgroup_mutex); 1127 mutex_unlock(&cgroup_mutex);
1072 } 1128 }
1073 1129
1074 return simple_set_mnt(mnt, sb); 1130 simple_set_mnt(mnt, sb);
1131 return 0;
1075 1132
1076 free_cg_links: 1133 free_cg_links:
1077 free_cg_links(&tmp_cg_links); 1134 free_cg_links(&tmp_cg_links);
@@ -1115,13 +1172,15 @@ static void cgroup_kill_sb(struct super_block *sb) {
1115 } 1172 }
1116 write_unlock(&css_set_lock); 1173 write_unlock(&css_set_lock);
1117 1174
1118 list_del(&root->root_list); 1175 if (!list_empty(&root->root_list)) {
1119 root_count--; 1176 list_del(&root->root_list);
1177 root_count--;
1178 }
1120 1179
1121 mutex_unlock(&cgroup_mutex); 1180 mutex_unlock(&cgroup_mutex);
1122 1181
1123 kfree(root);
1124 kill_litter_super(sb); 1182 kill_litter_super(sb);
1183 kfree(root);
1125} 1184}
1126 1185
1127static struct file_system_type cgroup_fs_type = { 1186static struct file_system_type cgroup_fs_type = {
@@ -1277,6 +1336,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1277 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1336 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1278 synchronize_rcu(); 1337 synchronize_rcu();
1279 put_css_set(cg); 1338 put_css_set(cg);
1339
1340 /*
1341 * Wake up rmdir() waiter. The rmdir should fail since the cgroup
1342 * is no longer empty.
1343 */
1344 cgroup_wakeup_rmdir_waiters(cgrp);
1280 return 0; 1345 return 0;
1281} 1346}
1282 1347
@@ -1622,10 +1687,10 @@ static struct inode_operations cgroup_dir_inode_operations = {
1622 .rename = cgroup_rename, 1687 .rename = cgroup_rename,
1623}; 1688};
1624 1689
1625static int cgroup_create_file(struct dentry *dentry, int mode, 1690static int cgroup_create_file(struct dentry *dentry, mode_t mode,
1626 struct super_block *sb) 1691 struct super_block *sb)
1627{ 1692{
1628 static struct dentry_operations cgroup_dops = { 1693 static const struct dentry_operations cgroup_dops = {
1629 .d_iput = cgroup_diput, 1694 .d_iput = cgroup_diput,
1630 }; 1695 };
1631 1696
@@ -1668,7 +1733,7 @@ static int cgroup_create_file(struct dentry *dentry, int mode,
1668 * @mode: mode to set on new directory. 1733 * @mode: mode to set on new directory.
1669 */ 1734 */
1670static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, 1735static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1671 int mode) 1736 mode_t mode)
1672{ 1737{
1673 struct dentry *parent; 1738 struct dentry *parent;
1674 int error = 0; 1739 int error = 0;
@@ -1686,6 +1751,33 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1686 return error; 1751 return error;
1687} 1752}
1688 1753
1754/**
1755 * cgroup_file_mode - deduce file mode of a control file
1756 * @cft: the control file in question
1757 *
1758 * returns cft->mode if ->mode is not 0
1759 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
1760 * returns S_IRUGO if it has only a read handler
1761 * returns S_IWUSR if it has only a write handler
1762 */
1763static mode_t cgroup_file_mode(const struct cftype *cft)
1764{
1765 mode_t mode = 0;
1766
1767 if (cft->mode)
1768 return cft->mode;
1769
1770 if (cft->read || cft->read_u64 || cft->read_s64 ||
1771 cft->read_map || cft->read_seq_string)
1772 mode |= S_IRUGO;
1773
1774 if (cft->write || cft->write_u64 || cft->write_s64 ||
1775 cft->write_string || cft->trigger)
1776 mode |= S_IWUSR;
1777
1778 return mode;
1779}
1780
1689int cgroup_add_file(struct cgroup *cgrp, 1781int cgroup_add_file(struct cgroup *cgrp,
1690 struct cgroup_subsys *subsys, 1782 struct cgroup_subsys *subsys,
1691 const struct cftype *cft) 1783 const struct cftype *cft)
@@ -1693,6 +1785,7 @@ int cgroup_add_file(struct cgroup *cgrp,
1693 struct dentry *dir = cgrp->dentry; 1785 struct dentry *dir = cgrp->dentry;
1694 struct dentry *dentry; 1786 struct dentry *dentry;
1695 int error; 1787 int error;
1788 mode_t mode;
1696 1789
1697 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 1790 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
1698 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 1791 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -1703,7 +1796,8 @@ int cgroup_add_file(struct cgroup *cgrp,
1703 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); 1796 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
1704 dentry = lookup_one_len(name, dir, strlen(name)); 1797 dentry = lookup_one_len(name, dir, strlen(name));
1705 if (!IS_ERR(dentry)) { 1798 if (!IS_ERR(dentry)) {
1706 error = cgroup_create_file(dentry, 0644 | S_IFREG, 1799 mode = cgroup_file_mode(cft);
1800 error = cgroup_create_file(dentry, mode | S_IFREG,
1707 cgrp->root->sb); 1801 cgrp->root->sb);
1708 if (!error) 1802 if (!error)
1709 dentry->d_fsdata = (void *)cft; 1803 dentry->d_fsdata = (void *)cft;
@@ -2285,6 +2379,7 @@ static struct cftype files[] = {
2285 .write_u64 = cgroup_tasks_write, 2379 .write_u64 = cgroup_tasks_write,
2286 .release = cgroup_tasks_release, 2380 .release = cgroup_tasks_release,
2287 .private = FILE_TASKLIST, 2381 .private = FILE_TASKLIST,
2382 .mode = S_IRUGO | S_IWUSR,
2288 }, 2383 },
2289 2384
2290 { 2385 {
@@ -2324,6 +2419,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
2324 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) 2419 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
2325 return err; 2420 return err;
2326 } 2421 }
2422 /* This cgroup is ready now */
2423 for_each_subsys(cgrp->root, ss) {
2424 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2425 /*
2426 * Update id->css pointer and make this css visible from
2427 * CSS ID functions. This pointer will be dereferenced
2428 * from RCU-read-side without locks.
2429 */
2430 if (css->id)
2431 rcu_assign_pointer(css->id->css, css);
2432 }
2327 2433
2328 return 0; 2434 return 0;
2329} 2435}
@@ -2335,6 +2441,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2335 css->cgroup = cgrp; 2441 css->cgroup = cgrp;
2336 atomic_set(&css->refcnt, 1); 2442 atomic_set(&css->refcnt, 1);
2337 css->flags = 0; 2443 css->flags = 0;
2444 css->id = NULL;
2338 if (cgrp == dummytop) 2445 if (cgrp == dummytop)
2339 set_bit(CSS_ROOT, &css->flags); 2446 set_bit(CSS_ROOT, &css->flags);
2340 BUG_ON(cgrp->subsys[ss->subsys_id]); 2447 BUG_ON(cgrp->subsys[ss->subsys_id]);
@@ -2349,7 +2456,7 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2349 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2456 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2350 struct cgroup_subsys *ss = subsys[i]; 2457 struct cgroup_subsys *ss = subsys[i];
2351 if (ss->root == root) 2458 if (ss->root == root)
2352 mutex_lock_nested(&ss->hierarchy_mutex, i); 2459 mutex_lock(&ss->hierarchy_mutex);
2353 } 2460 }
2354} 2461}
2355 2462
@@ -2373,7 +2480,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2373 * Must be called with the mutex on the parent inode held 2480 * Must be called with the mutex on the parent inode held
2374 */ 2481 */
2375static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 2482static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2376 int mode) 2483 mode_t mode)
2377{ 2484{
2378 struct cgroup *cgrp; 2485 struct cgroup *cgrp;
2379 struct cgroupfs_root *root = parent->root; 2486 struct cgroupfs_root *root = parent->root;
@@ -2410,6 +2517,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2410 goto err_destroy; 2517 goto err_destroy;
2411 } 2518 }
2412 init_cgroup_css(css, ss, cgrp); 2519 init_cgroup_css(css, ss, cgrp);
2520 if (ss->use_id)
2521 if (alloc_css_id(ss, parent, cgrp))
2522 goto err_destroy;
2523 /* On error, the ->destroy() callback has to free the assigned ID. */
2413 } 2524 }
2414 2525
2415 cgroup_lock_hierarchy(root); 2526 cgroup_lock_hierarchy(root);
@@ -2434,7 +2545,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2434 2545
2435 err_remove: 2546 err_remove:
2436 2547
2548 cgroup_lock_hierarchy(root);
2437 list_del(&cgrp->sibling); 2549 list_del(&cgrp->sibling);
2550 cgroup_unlock_hierarchy(root);
2438 root->number_of_cgroups--; 2551 root->number_of_cgroups--;
2439 2552
2440 err_destroy: 2553 err_destroy:
@@ -2507,7 +2620,7 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
2507 for_each_subsys(cgrp->root, ss) { 2620 for_each_subsys(cgrp->root, ss) {
2508 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 2621 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2509 int refcnt; 2622 int refcnt;
2510 do { 2623 while (1) {
2511 /* We can only remove a CSS with a refcnt==1 */ 2624 /* We can only remove a CSS with a refcnt==1 */
2512 refcnt = atomic_read(&css->refcnt); 2625 refcnt = atomic_read(&css->refcnt);
2513 if (refcnt > 1) { 2626 if (refcnt > 1) {
@@ -2521,7 +2634,10 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
2521 * css_tryget() to spin until we set the 2634 * css_tryget() to spin until we set the
2522 * CSS_REMOVED bits or abort 2635 * CSS_REMOVED bits or abort
2523 */ 2636 */
2524 } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt); 2637 if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
2638 break;
2639 cpu_relax();
2640 }
2525 } 2641 }
2526 done: 2642 done:
2527 for_each_subsys(cgrp->root, ss) { 2643 for_each_subsys(cgrp->root, ss) {
@@ -2547,9 +2663,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2547 struct cgroup *cgrp = dentry->d_fsdata; 2663 struct cgroup *cgrp = dentry->d_fsdata;
2548 struct dentry *d; 2664 struct dentry *d;
2549 struct cgroup *parent; 2665 struct cgroup *parent;
2666 DEFINE_WAIT(wait);
2667 int ret;
2550 2668
2551 /* the vfs holds both inode->i_mutex already */ 2669 /* the vfs holds both inode->i_mutex already */
2552 2670again:
2553 mutex_lock(&cgroup_mutex); 2671 mutex_lock(&cgroup_mutex);
2554 if (atomic_read(&cgrp->count) != 0) { 2672 if (atomic_read(&cgrp->count) != 0) {
2555 mutex_unlock(&cgroup_mutex); 2673 mutex_unlock(&cgroup_mutex);
@@ -2565,17 +2683,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2565 * Call pre_destroy handlers of subsys. Notify subsystems 2683 * Call pre_destroy handlers of subsys. Notify subsystems
2566 * that rmdir() request comes. 2684 * that rmdir() request comes.
2567 */ 2685 */
2568 cgroup_call_pre_destroy(cgrp); 2686 ret = cgroup_call_pre_destroy(cgrp);
2687 if (ret)
2688 return ret;
2569 2689
2570 mutex_lock(&cgroup_mutex); 2690 mutex_lock(&cgroup_mutex);
2571 parent = cgrp->parent; 2691 parent = cgrp->parent;
2572 2692 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
2573 if (atomic_read(&cgrp->count)
2574 || !list_empty(&cgrp->children)
2575 || !cgroup_clear_css_refs(cgrp)) {
2576 mutex_unlock(&cgroup_mutex); 2693 mutex_unlock(&cgroup_mutex);
2577 return -EBUSY; 2694 return -EBUSY;
2578 } 2695 }
2696 /*
2697 * css_put/get is provided for subsys to grab refcnt to css. In typical
2698 * case, subsystem has no reference after pre_destroy(). But, under
2700 * hierarchy management, some *temporary* refcnt can be held.
2700 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
2701 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
2702 * is called when css_put() is called and refcnt goes down to 0.
2703 */
2704 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2705 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
2706
2707 if (!cgroup_clear_css_refs(cgrp)) {
2708 mutex_unlock(&cgroup_mutex);
2709 schedule();
2710 finish_wait(&cgroup_rmdir_waitq, &wait);
2711 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2712 if (signal_pending(current))
2713 return -EINTR;
2714 goto again;
2715 }
2716 /* No css_tryget() can succeed after this point. */
2717 finish_wait(&cgroup_rmdir_waitq, &wait);
2718 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2579 2719
2580 spin_lock(&release_list_lock); 2720 spin_lock(&release_list_lock);
2581 set_bit(CGRP_REMOVED, &cgrp->flags); 2721 set_bit(CGRP_REMOVED, &cgrp->flags);
@@ -2630,6 +2770,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2630 BUG_ON(!list_empty(&init_task.tasks)); 2770 BUG_ON(!list_empty(&init_task.tasks));
2631 2771
2632 mutex_init(&ss->hierarchy_mutex); 2772 mutex_init(&ss->hierarchy_mutex);
2773 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
2633 ss->active = 1; 2774 ss->active = 1;
2634} 2775}
2635 2776
@@ -2699,6 +2840,8 @@ int __init cgroup_init(void)
2699 struct cgroup_subsys *ss = subsys[i]; 2840 struct cgroup_subsys *ss = subsys[i];
2700 if (!ss->early_init) 2841 if (!ss->early_init)
2701 cgroup_init_subsys(ss); 2842 cgroup_init_subsys(ss);
2843 if (ss->use_id)
2844 cgroup_subsys_init_idr(ss);
2702 } 2845 }
2703 2846
2704 /* Add init_css_set to the hash table */ 2847 /* Add init_css_set to the hash table */
@@ -2991,20 +3134,21 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2991 mutex_unlock(&cgroup_mutex); 3134 mutex_unlock(&cgroup_mutex);
2992 return 0; 3135 return 0;
2993 } 3136 }
2994 task_lock(tsk);
2995 cg = tsk->cgroups;
2996 parent = task_cgroup(tsk, subsys->subsys_id);
2997 3137
2998 /* Pin the hierarchy */ 3138 /* Pin the hierarchy */
2999 if (!atomic_inc_not_zero(&parent->root->sb->s_active)) { 3139 if (!atomic_inc_not_zero(&root->sb->s_active)) {
3000 /* We race with the final deactivate_super() */ 3140 /* We race with the final deactivate_super() */
3001 mutex_unlock(&cgroup_mutex); 3141 mutex_unlock(&cgroup_mutex);
3002 return 0; 3142 return 0;
3003 } 3143 }
3004 3144
3005 /* Keep the cgroup alive */ 3145 /* Keep the cgroup alive */
3146 task_lock(tsk);
3147 parent = task_cgroup(tsk, subsys->subsys_id);
3148 cg = tsk->cgroups;
3006 get_css_set(cg); 3149 get_css_set(cg);
3007 task_unlock(tsk); 3150 task_unlock(tsk);
3151
3008 mutex_unlock(&cgroup_mutex); 3152 mutex_unlock(&cgroup_mutex);
3009 3153
3010 /* Now do the VFS work to create a cgroup */ 3154 /* Now do the VFS work to create a cgroup */
@@ -3043,7 +3187,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
3043 mutex_unlock(&inode->i_mutex); 3187 mutex_unlock(&inode->i_mutex);
3044 put_css_set(cg); 3188 put_css_set(cg);
3045 3189
3046 deactivate_super(parent->root->sb); 3190 deactivate_super(root->sb);
3047 /* The cgroup is still accessible in the VFS, but 3191 /* The cgroup is still accessible in the VFS, but
3048 * we're not going to try to rmdir() it at this 3192 * we're not going to try to rmdir() it at this
3049 * point. */ 3193 * point. */
@@ -3069,23 +3213,24 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
3069 mutex_lock(&cgroup_mutex); 3213 mutex_lock(&cgroup_mutex);
3070 put_css_set(cg); 3214 put_css_set(cg);
3071 mutex_unlock(&cgroup_mutex); 3215 mutex_unlock(&cgroup_mutex);
3072 deactivate_super(parent->root->sb); 3216 deactivate_super(root->sb);
3073 return ret; 3217 return ret;
3074} 3218}
3075 3219
3076/** 3220/**
3077 * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp 3221 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
3078 * @cgrp: the cgroup in question 3222 * @cgrp: the cgroup in question
3223 * @task: the task in question
3079 * 3224 *
3080 * See if @cgrp is a descendant of the current task's cgroup in 3225 * See if @cgrp is a descendant of @task's cgroup in the appropriate
3081 * the appropriate hierarchy. 3226 * hierarchy.
3082 * 3227 *
3083 * If we are sending in dummytop, then presumably we are creating 3228 * If we are sending in dummytop, then presumably we are creating
3084 * the top cgroup in the subsystem. 3229 * the top cgroup in the subsystem.
3085 * 3230 *
3086 * Called only by the ns (nsproxy) cgroup. 3231 * Called only by the ns (nsproxy) cgroup.
3087 */ 3232 */
3088int cgroup_is_descendant(const struct cgroup *cgrp) 3233int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
3089{ 3234{
3090 int ret; 3235 int ret;
3091 struct cgroup *target; 3236 struct cgroup *target;
@@ -3095,7 +3240,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp)
3095 return 1; 3240 return 1;
3096 3241
3097 get_first_subsys(cgrp, NULL, &subsys_id); 3242 get_first_subsys(cgrp, NULL, &subsys_id);
3098 target = task_cgroup(current, subsys_id); 3243 target = task_cgroup(task, subsys_id);
3099 while (cgrp != target && cgrp!= cgrp->top_cgroup) 3244 while (cgrp != target && cgrp!= cgrp->top_cgroup)
3100 cgrp = cgrp->parent; 3245 cgrp = cgrp->parent;
3101 ret = (cgrp == target); 3246 ret = (cgrp == target);
@@ -3128,10 +3273,12 @@ void __css_put(struct cgroup_subsys_state *css)
3128{ 3273{
3129 struct cgroup *cgrp = css->cgroup; 3274 struct cgroup *cgrp = css->cgroup;
3130 rcu_read_lock(); 3275 rcu_read_lock();
3131 if ((atomic_dec_return(&css->refcnt) == 1) && 3276 if (atomic_dec_return(&css->refcnt) == 1) {
3132 notify_on_release(cgrp)) { 3277 if (notify_on_release(cgrp)) {
3133 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3278 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3134 check_for_release(cgrp); 3279 check_for_release(cgrp);
3280 }
3281 cgroup_wakeup_rmdir_waiters(cgrp);
3135 } 3282 }
3136 rcu_read_unlock(); 3283 rcu_read_unlock();
3137} 3284}
@@ -3231,3 +3378,232 @@ static int __init cgroup_disable(char *str)
3231 return 1; 3378 return 1;
3232} 3379}
3233__setup("cgroup_disable=", cgroup_disable); 3380__setup("cgroup_disable=", cgroup_disable);
3381
3382/*
3383 * Functions for CSS ID.
3384 */
3385
3386/*
3387 * To get an ID other than 0, this should be called when !cgroup_is_removed().
3388 */
3389unsigned short css_id(struct cgroup_subsys_state *css)
3390{
3391 struct css_id *cssid = rcu_dereference(css->id);
3392
3393 if (cssid)
3394 return cssid->id;
3395 return 0;
3396}
3397
3398unsigned short css_depth(struct cgroup_subsys_state *css)
3399{
3400 struct css_id *cssid = rcu_dereference(css->id);
3401
3402 if (cssid)
3403 return cssid->depth;
3404 return 0;
3405}
3406
3407bool css_is_ancestor(struct cgroup_subsys_state *child,
3408 const struct cgroup_subsys_state *root)
3409{
3410 struct css_id *child_id = rcu_dereference(child->id);
3411 struct css_id *root_id = rcu_dereference(root->id);
3412
3413 if (!child_id || !root_id || (child_id->depth < root_id->depth))
3414 return false;
3415 return child_id->stack[root_id->depth] == root_id->id;
3416}
3417
3418static void __free_css_id_cb(struct rcu_head *head)
3419{
3420 struct css_id *id;
3421
3422 id = container_of(head, struct css_id, rcu_head);
3423 kfree(id);
3424}
3425
3426void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
3427{
3428 struct css_id *id = css->id;
3429 /* When this is called before css_id initialization, id can be NULL */
3430 if (!id)
3431 return;
3432
3433 BUG_ON(!ss->use_id);
3434
3435 rcu_assign_pointer(id->css, NULL);
3436 rcu_assign_pointer(css->id, NULL);
3437 spin_lock(&ss->id_lock);
3438 idr_remove(&ss->idr, id->id);
3439 spin_unlock(&ss->id_lock);
3440 call_rcu(&id->rcu_head, __free_css_id_cb);
3441}
3442
3443/*
3444 * This is called by init or create(). Hence, calls to this function are
3445 * always serialized (by cgroup_mutex at create()).
3446 */
3447
3448static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
3449{
3450 struct css_id *newid;
3451 int myid, error, size;
3452
3453 BUG_ON(!ss->use_id);
3454
3455 size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
3456 newid = kzalloc(size, GFP_KERNEL);
3457 if (!newid)
3458 return ERR_PTR(-ENOMEM);
3459 /* get id */
3460 if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
3461 error = -ENOMEM;
3462 goto err_out;
3463 }
3464 spin_lock(&ss->id_lock);
3465 /* Don't use 0. allocates an ID of 1-65535 */
3466 error = idr_get_new_above(&ss->idr, newid, 1, &myid);
3467 spin_unlock(&ss->id_lock);
3468
3469 /* Returns an error when there is no free space for a new ID. */
3470 if (error) {
3471 error = -ENOSPC;
3472 goto err_out;
3473 }
3474 if (myid > CSS_ID_MAX)
3475 goto remove_idr;
3476
3477 newid->id = myid;
3478 newid->depth = depth;
3479 return newid;
3480remove_idr:
3481 error = -ENOSPC;
3482 spin_lock(&ss->id_lock);
3483 idr_remove(&ss->idr, myid);
3484 spin_unlock(&ss->id_lock);
3485err_out:
3486 kfree(newid);
3487 return ERR_PTR(error);
3488
3489}
3490
3491static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
3492{
3493 struct css_id *newid;
3494 struct cgroup_subsys_state *rootcss;
3495
3496 spin_lock_init(&ss->id_lock);
3497 idr_init(&ss->idr);
3498
3499 rootcss = init_css_set.subsys[ss->subsys_id];
3500 newid = get_new_cssid(ss, 0);
3501 if (IS_ERR(newid))
3502 return PTR_ERR(newid);
3503
3504 newid->stack[0] = newid->id;
3505 newid->css = rootcss;
3506 rootcss->id = newid;
3507 return 0;
3508}
3509
3510static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
3511 struct cgroup *child)
3512{
3513 int subsys_id, i, depth = 0;
3514 struct cgroup_subsys_state *parent_css, *child_css;
3515 struct css_id *child_id, *parent_id = NULL;
3516
3517 subsys_id = ss->subsys_id;
3518 parent_css = parent->subsys[subsys_id];
3519 child_css = child->subsys[subsys_id];
3520 depth = css_depth(parent_css) + 1;
3521 parent_id = parent_css->id;
3522
3523 child_id = get_new_cssid(ss, depth);
3524 if (IS_ERR(child_id))
3525 return PTR_ERR(child_id);
3526
3527 for (i = 0; i < depth; i++)
3528 child_id->stack[i] = parent_id->stack[i];
3529 child_id->stack[depth] = child_id->id;
3530 /*
3531 * child_id->css pointer will be set after this cgroup is available
3532 * see cgroup_populate_dir()
3533 */
3534 rcu_assign_pointer(child_css->id, child_id);
3535
3536 return 0;
3537}
3538
3539/**
3540 * css_lookup - lookup css by id
3541 * @ss: cgroup subsys to be looked into.
3542 * @id: the id
3543 *
3544 * Returns pointer to cgroup_subsys_state if there is valid one with id.
3545 * NULL if not. Should be called under rcu_read_lock()
3546 */
3547struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
3548{
3549 struct css_id *cssid = NULL;
3550
3551 BUG_ON(!ss->use_id);
3552 cssid = idr_find(&ss->idr, id);
3553
3554 if (unlikely(!cssid))
3555 return NULL;
3556
3557 return rcu_dereference(cssid->css);
3558}
3559
3560/**
3561 * css_get_next - lookup next cgroup under specified hierarchy.
3562 * @ss: pointer to subsystem
3563 * @id: current position of iteration.
3564 * @root: pointer to css. search tree under this.
3565 * @foundid: position of found object.
3566 *
3567 * Search next css under the specified hierarchy of rootid. Calling under
3568 * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
3569 */
3570struct cgroup_subsys_state *
3571css_get_next(struct cgroup_subsys *ss, int id,
3572 struct cgroup_subsys_state *root, int *foundid)
3573{
3574 struct cgroup_subsys_state *ret = NULL;
3575 struct css_id *tmp;
3576 int tmpid;
3577 int rootid = css_id(root);
3578 int depth = css_depth(root);
3579
3580 if (!rootid)
3581 return NULL;
3582
3583 BUG_ON(!ss->use_id);
3584 /* fill start point for scan */
3585 tmpid = id;
3586 while (1) {
3587 /*
3588 * scan next entry from bitmap(tree), tmpid is updated after
3589 * idr_get_next().
3590 */
3591 spin_lock(&ss->id_lock);
3592 tmp = idr_get_next(&ss->idr, &tmpid);
3593 spin_unlock(&ss->id_lock);
3594
3595 if (!tmp)
3596 break;
3597 if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
3598 ret = rcu_dereference(tmp->css);
3599 if (ret) {
3600 *foundid = tmpid;
3601 break;
3602 }
3603 }
3604 /* continue to scan from next id */
3605 tmpid = tmpid + 1;
3606 }
3607 return ret;
3608}
3609
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index daca6209202d..0c92d797baa6 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -40,9 +40,7 @@ static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
40{ 40{
41 u64 count; 41 u64 count;
42 42
43 cgroup_lock();
44 count = cgroup_task_count(cont); 43 count = cgroup_task_count(cont);
45 cgroup_unlock();
46 return count; 44 return count;
47} 45}
48 46
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 79e40f00dcb8..395b6974dc8d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -281,7 +281,7 @@ int __ref cpu_down(unsigned int cpu)
281 goto out; 281 goto out;
282 } 282 }
283 283
284 cpu_clear(cpu, cpu_active_map); 284 set_cpu_active(cpu, false);
285 285
286 /* 286 /*
287 * Make sure the all cpus did the reschedule and are not 287 * Make sure the all cpus did the reschedule and are not
@@ -296,7 +296,7 @@ int __ref cpu_down(unsigned int cpu)
296 err = _cpu_down(cpu, 0); 296 err = _cpu_down(cpu, 0);
297 297
298 if (cpu_online(cpu)) 298 if (cpu_online(cpu))
299 cpu_set(cpu, cpu_active_map); 299 set_cpu_active(cpu, true);
300 300
301out: 301out:
302 cpu_maps_update_done(); 302 cpu_maps_update_done();
@@ -333,7 +333,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
333 goto out_notify; 333 goto out_notify;
334 BUG_ON(!cpu_online(cpu)); 334 BUG_ON(!cpu_online(cpu));
335 335
336 cpu_set(cpu, cpu_active_map); 336 set_cpu_active(cpu, true);
337 337
338 /* Now call notifier in preparation. */ 338 /* Now call notifier in preparation. */
339 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); 339 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 647c77a88fcb..026faccca869 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,6 +61,14 @@
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62 62
63/* 63/*
64 * Workqueue for cpuset related tasks.
65 *
66 * Using kevent workqueue may cause deadlock when memory_migrate
67 * is set. So we create a separate workqueue thread for cpuset.
68 */
69static struct workqueue_struct *cpuset_wq;
70
71/*
64 * Tracks how many cpusets are currently defined in system. 72 * Tracks how many cpusets are currently defined in system.
65 * When there is only one cpuset (the root cpuset) we can 73 * When there is only one cpuset (the root cpuset) we can
66 * short circuit some hooks. 74 * short circuit some hooks.
@@ -120,10 +128,6 @@ static inline struct cpuset *task_cs(struct task_struct *task)
120 return container_of(task_subsys_state(task, cpuset_subsys_id), 128 return container_of(task_subsys_state(task, cpuset_subsys_id),
121 struct cpuset, css); 129 struct cpuset, css);
122} 130}
123struct cpuset_hotplug_scanner {
124 struct cgroup_scanner scan;
125 struct cgroup *to;
126};
127 131
128/* bits in struct cpuset flags field */ 132/* bits in struct cpuset flags field */
129typedef enum { 133typedef enum {
@@ -513,6 +517,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
513 return 0; 517 return 0;
514} 518}
515 519
520#ifdef CONFIG_SMP
516/* 521/*
517 * Helper routine for generate_sched_domains(). 522 * Helper routine for generate_sched_domains().
518 * Do cpusets a, b have overlapping cpus_allowed masks? 523 * Do cpusets a, b have overlapping cpus_allowed masks?
@@ -568,7 +573,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
568 * load balancing domains (sched domains) as specified by that partial 573 * load balancing domains (sched domains) as specified by that partial
569 * partition. 574 * partition.
570 * 575 *
571 * See "What is sched_load_balance" in Documentation/cpusets.txt 576 * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
572 * for a background explanation of this. 577 * for a background explanation of this.
573 * 578 *
574 * Does not return errors, on the theory that the callers of this 579 * Does not return errors, on the theory that the callers of this
@@ -807,6 +812,18 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
807 812
808 put_online_cpus(); 813 put_online_cpus();
809} 814}
815#else /* !CONFIG_SMP */
816static void do_rebuild_sched_domains(struct work_struct *unused)
817{
818}
819
820static int generate_sched_domains(struct cpumask **domains,
821 struct sched_domain_attr **attributes)
822{
823 *domains = NULL;
824 return 1;
825}
826#endif /* CONFIG_SMP */
810 827
811static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); 828static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
812 829
@@ -831,7 +848,7 @@ static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
831 */ 848 */
832static void async_rebuild_sched_domains(void) 849static void async_rebuild_sched_domains(void)
833{ 850{
834 schedule_work(&rebuild_sched_domains_work); 851 queue_work(cpuset_wq, &rebuild_sched_domains_work);
835} 852}
836 853
837/* 854/*
@@ -1018,101 +1035,70 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1018 mutex_unlock(&callback_mutex); 1035 mutex_unlock(&callback_mutex);
1019} 1036}
1020 1037
1038/*
1039 * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
1040 * nodes if memory_migrate flag is set. Called with cgroup_mutex held.
1041 */
1042static void cpuset_change_nodemask(struct task_struct *p,
1043 struct cgroup_scanner *scan)
1044{
1045 struct mm_struct *mm;
1046 struct cpuset *cs;
1047 int migrate;
1048 const nodemask_t *oldmem = scan->data;
1049
1050 mm = get_task_mm(p);
1051 if (!mm)
1052 return;
1053
1054 cs = cgroup_cs(scan->cg);
1055 migrate = is_memory_migrate(cs);
1056
1057 mpol_rebind_mm(mm, &cs->mems_allowed);
1058 if (migrate)
1059 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
1060 mmput(mm);
1061}
1062
1021static void *cpuset_being_rebound; 1063static void *cpuset_being_rebound;
1022 1064
1023/** 1065/**
1024 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1066 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1025 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1067 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1026 * @oldmem: old mems_allowed of cpuset cs 1068 * @oldmem: old mems_allowed of cpuset cs
1069 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1027 * 1070 *
1028 * Called with cgroup_mutex held 1071 * Called with cgroup_mutex held
1029 * Return 0 if successful, -errno if not. 1072 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1073 * if @heap != NULL.
1030 */ 1074 */
1031static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) 1075static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1076 struct ptr_heap *heap)
1032{ 1077{
1033 struct task_struct *p; 1078 struct cgroup_scanner scan;
1034 struct mm_struct **mmarray;
1035 int i, n, ntasks;
1036 int migrate;
1037 int fudge;
1038 struct cgroup_iter it;
1039 int retval;
1040 1079
1041 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1080 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1042 1081
1043 fudge = 10; /* spare mmarray[] slots */ 1082 scan.cg = cs->css.cgroup;
1044 fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */ 1083 scan.test_task = NULL;
1045 retval = -ENOMEM; 1084 scan.process_task = cpuset_change_nodemask;
1046 1085 scan.heap = heap;
1047 /* 1086 scan.data = (nodemask_t *)oldmem;
1048 * Allocate mmarray[] to hold mm reference for each task
1049 * in cpuset cs. Can't kmalloc GFP_KERNEL while holding
1050 * tasklist_lock. We could use GFP_ATOMIC, but with a
1051 * few more lines of code, we can retry until we get a big
1052 * enough mmarray[] w/o using GFP_ATOMIC.
1053 */
1054 while (1) {
1055 ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
1056 ntasks += fudge;
1057 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
1058 if (!mmarray)
1059 goto done;
1060 read_lock(&tasklist_lock); /* block fork */
1061 if (cgroup_task_count(cs->css.cgroup) <= ntasks)
1062 break; /* got enough */
1063 read_unlock(&tasklist_lock); /* try again */
1064 kfree(mmarray);
1065 }
1066
1067 n = 0;
1068
1069 /* Load up mmarray[] with mm reference for each task in cpuset. */
1070 cgroup_iter_start(cs->css.cgroup, &it);
1071 while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
1072 struct mm_struct *mm;
1073
1074 if (n >= ntasks) {
1075 printk(KERN_WARNING
1076 "Cpuset mempolicy rebind incomplete.\n");
1077 break;
1078 }
1079 mm = get_task_mm(p);
1080 if (!mm)
1081 continue;
1082 mmarray[n++] = mm;
1083 }
1084 cgroup_iter_end(cs->css.cgroup, &it);
1085 read_unlock(&tasklist_lock);
1086 1087
1087 /* 1088 /*
1088 * Now that we've dropped the tasklist spinlock, we can 1089 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
1089 * rebind the vma mempolicies of each mm in mmarray[] to their 1090 * take while holding tasklist_lock. Forks can happen - the
1090 * new cpuset, and release that mm. The mpol_rebind_mm() 1091 * mpol_dup() cpuset_being_rebound check will catch such forks,
1091 * call takes mmap_sem, which we couldn't take while holding 1092 * and rebind their vma mempolicies too. Because we still hold
1092 * tasklist_lock. Forks can happen again now - the mpol_dup() 1093 * the global cgroup_mutex, we know that no other rebind effort
1093 * cpuset_being_rebound check will catch such forks, and rebind 1094 * will be contending for the global variable cpuset_being_rebound.
1094 * their vma mempolicies too. Because we still hold the global
1095 * cgroup_mutex, we know that no other rebind effort will
1096 * be contending for the global variable cpuset_being_rebound.
1097 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1095 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1098 * is idempotent. Also migrate pages in each mm to new nodes. 1096 * is idempotent. Also migrate pages in each mm to new nodes.
1099 */ 1097 */
1100 migrate = is_memory_migrate(cs); 1098 cgroup_scan_tasks(&scan);
1101 for (i = 0; i < n; i++) {
1102 struct mm_struct *mm = mmarray[i];
1103
1104 mpol_rebind_mm(mm, &cs->mems_allowed);
1105 if (migrate)
1106 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
1107 mmput(mm);
1108 }
1109 1099
1110 /* We're done rebinding vmas to this cpuset's new mems_allowed. */ 1100 /* We're done rebinding vmas to this cpuset's new mems_allowed. */
1111 kfree(mmarray);
1112 cpuset_being_rebound = NULL; 1101 cpuset_being_rebound = NULL;
1113 retval = 0;
1114done:
1115 return retval;
1116} 1102}
1117 1103
1118/* 1104/*
@@ -1133,6 +1119,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1133{ 1119{
1134 nodemask_t oldmem; 1120 nodemask_t oldmem;
1135 int retval; 1121 int retval;
1122 struct ptr_heap heap;
1136 1123
1137 /* 1124 /*
1138 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1125 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -1167,12 +1154,18 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1167 if (retval < 0) 1154 if (retval < 0)
1168 goto done; 1155 goto done;
1169 1156
1157 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1158 if (retval < 0)
1159 goto done;
1160
1170 mutex_lock(&callback_mutex); 1161 mutex_lock(&callback_mutex);
1171 cs->mems_allowed = trialcs->mems_allowed; 1162 cs->mems_allowed = trialcs->mems_allowed;
1172 cs->mems_generation = cpuset_mems_generation++; 1163 cs->mems_generation = cpuset_mems_generation++;
1173 mutex_unlock(&callback_mutex); 1164 mutex_unlock(&callback_mutex);
1174 1165
1175 retval = update_tasks_nodemask(cs, &oldmem); 1166 update_tasks_nodemask(cs, &oldmem, &heap);
1167
1168 heap_free(&heap);
1176done: 1169done:
1177 return retval; 1170 return retval;
1178} 1171}
@@ -1184,8 +1177,10 @@ int current_cpuset_is_being_rebound(void)
1184 1177
1185static int update_relax_domain_level(struct cpuset *cs, s64 val) 1178static int update_relax_domain_level(struct cpuset *cs, s64 val)
1186{ 1179{
1180#ifdef CONFIG_SMP
1187 if (val < -1 || val >= SD_LV_MAX) 1181 if (val < -1 || val >= SD_LV_MAX)
1188 return -EINVAL; 1182 return -EINVAL;
1183#endif
1189 1184
1190 if (val != cs->relax_domain_level) { 1185 if (val != cs->relax_domain_level) {
1191 cs->relax_domain_level = val; 1186 cs->relax_domain_level = val;
@@ -1347,19 +1342,22 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
1347 struct cgroup *cont, struct task_struct *tsk) 1342 struct cgroup *cont, struct task_struct *tsk)
1348{ 1343{
1349 struct cpuset *cs = cgroup_cs(cont); 1344 struct cpuset *cs = cgroup_cs(cont);
1350 int ret = 0;
1351 1345
1352 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1346 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1353 return -ENOSPC; 1347 return -ENOSPC;
1354 1348
1355 if (tsk->flags & PF_THREAD_BOUND) { 1349 /*
1356 mutex_lock(&callback_mutex); 1350 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
1357 if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed)) 1351 * cannot change their cpu affinity and isolating such threads by their
1358 ret = -EINVAL; 1352 * set of allowed nodes is unnecessary. Thus, cpusets are not
1359 mutex_unlock(&callback_mutex); 1353 * applicable for such threads. This prevents checking for success of
1360 } 1354 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
1355 * be changed.
1356 */
1357 if (tsk->flags & PF_THREAD_BOUND)
1358 return -EINVAL;
1361 1359
1362 return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL); 1360 return security_task_setscheduler(tsk, 0, NULL);
1363} 1361}
1364 1362
1365static void cpuset_attach(struct cgroup_subsys *ss, 1363static void cpuset_attach(struct cgroup_subsys *ss,
@@ -1698,6 +1696,7 @@ static struct cftype files[] = {
1698 .read_u64 = cpuset_read_u64, 1696 .read_u64 = cpuset_read_u64,
1699 .write_u64 = cpuset_write_u64, 1697 .write_u64 = cpuset_write_u64,
1700 .private = FILE_MEMORY_PRESSURE, 1698 .private = FILE_MEMORY_PRESSURE,
1699 .mode = S_IRUGO,
1701 }, 1700 },
1702 1701
1703 { 1702 {
@@ -1905,10 +1904,9 @@ int __init cpuset_init(void)
1905static void cpuset_do_move_task(struct task_struct *tsk, 1904static void cpuset_do_move_task(struct task_struct *tsk,
1906 struct cgroup_scanner *scan) 1905 struct cgroup_scanner *scan)
1907{ 1906{
1908 struct cpuset_hotplug_scanner *chsp; 1907 struct cgroup *new_cgroup = scan->data;
1909 1908
1910 chsp = container_of(scan, struct cpuset_hotplug_scanner, scan); 1909 cgroup_attach_task(new_cgroup, tsk);
1911 cgroup_attach_task(chsp->to, tsk);
1912} 1910}
1913 1911
1914/** 1912/**
@@ -1924,15 +1922,15 @@ static void cpuset_do_move_task(struct task_struct *tsk,
1924 */ 1922 */
1925static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) 1923static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1926{ 1924{
1927 struct cpuset_hotplug_scanner scan; 1925 struct cgroup_scanner scan;
1928 1926
1929 scan.scan.cg = from->css.cgroup; 1927 scan.cg = from->css.cgroup;
1930 scan.scan.test_task = NULL; /* select all tasks in cgroup */ 1928 scan.test_task = NULL; /* select all tasks in cgroup */
1931 scan.scan.process_task = cpuset_do_move_task; 1929 scan.process_task = cpuset_do_move_task;
1932 scan.scan.heap = NULL; 1930 scan.heap = NULL;
1933 scan.to = to->css.cgroup; 1931 scan.data = to->css.cgroup;
1934 1932
1935 if (cgroup_scan_tasks(&scan.scan)) 1933 if (cgroup_scan_tasks(&scan))
1936 printk(KERN_ERR "move_member_tasks_to_cpuset: " 1934 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1937 "cgroup_scan_tasks failed\n"); 1935 "cgroup_scan_tasks failed\n");
1938} 1936}
@@ -2025,7 +2023,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2025 remove_tasks_in_empty_cpuset(cp); 2023 remove_tasks_in_empty_cpuset(cp);
2026 else { 2024 else {
2027 update_tasks_cpumask(cp, NULL); 2025 update_tasks_cpumask(cp, NULL);
2028 update_tasks_nodemask(cp, &oldmems); 2026 update_tasks_nodemask(cp, &oldmems, NULL);
2029 } 2027 }
2030 } 2028 }
2031} 2029}
@@ -2061,7 +2059,9 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2061 } 2059 }
2062 2060
2063 cgroup_lock(); 2061 cgroup_lock();
2062 mutex_lock(&callback_mutex);
2064 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2063 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
2064 mutex_unlock(&callback_mutex);
2065 scan_for_empty_cpusets(&top_cpuset); 2065 scan_for_empty_cpusets(&top_cpuset);
2066 ndoms = generate_sched_domains(&doms, &attr); 2066 ndoms = generate_sched_domains(&doms, &attr);
2067 cgroup_unlock(); 2067 cgroup_unlock();
@@ -2084,11 +2084,12 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2084 cgroup_lock(); 2084 cgroup_lock();
2085 switch (action) { 2085 switch (action) {
2086 case MEM_ONLINE: 2086 case MEM_ONLINE:
2087 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2088 break;
2089 case MEM_OFFLINE: 2087 case MEM_OFFLINE:
2088 mutex_lock(&callback_mutex);
2090 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2089 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2091 scan_for_empty_cpusets(&top_cpuset); 2090 mutex_unlock(&callback_mutex);
2091 if (action == MEM_OFFLINE)
2092 scan_for_empty_cpusets(&top_cpuset);
2092 break; 2093 break;
2093 default: 2094 default:
2094 break; 2095 break;
@@ -2111,6 +2112,9 @@ void __init cpuset_init_smp(void)
2111 2112
2112 hotcpu_notifier(cpuset_track_online_cpus, 0); 2113 hotcpu_notifier(cpuset_track_online_cpus, 0);
2113 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2114 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2115
2116 cpuset_wq = create_singlethread_workqueue("cpuset");
2117 BUG_ON(!cpuset_wq);
2114} 2118}
2115 2119
2116/** 2120/**
@@ -2195,26 +2199,24 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2195} 2199}
2196 2200
2197/** 2201/**
2198 * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node? 2202 * cpuset_node_allowed_softwall - Can we allocate on a memory node?
2199 * @z: is this zone on an allowed node? 2203 * @node: is this an allowed node?
2200 * @gfp_mask: memory allocation flags 2204 * @gfp_mask: memory allocation flags
2201 * 2205 *
2202 * If we're in interrupt, yes, we can always allocate. If 2206 * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
2203 * __GFP_THISNODE is set, yes, we can always allocate. If zone 2207 * set, yes, we can always allocate. If node is in our task's mems_allowed,
2204 * z's node is in our tasks mems_allowed, yes. If it's not a 2208 * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest
2205 * __GFP_HARDWALL request and this zone's nodes is in the nearest 2209 * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been
2206 * hardwalled cpuset ancestor to this tasks cpuset, yes. 2210 * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
2207 * If the task has been OOM killed and has access to memory reserves 2211 * flag, yes.
2208 * as specified by the TIF_MEMDIE flag, yes.
2209 * Otherwise, no. 2212 * Otherwise, no.
2210 * 2213 *
2211 * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall() 2214 * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
2212 * reduces to cpuset_zone_allowed_hardwall(). Otherwise, 2215 * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
2213 * cpuset_zone_allowed_softwall() might sleep, and might allow a zone 2216 * might sleep, and might allow a node from an enclosing cpuset.
2214 * from an enclosing cpuset.
2215 * 2217 *
2216 * cpuset_zone_allowed_hardwall() only handles the simpler case of 2218 * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
2217 * hardwall cpusets, and never sleeps. 2219 * cpusets, and never sleeps.
2218 * 2220 *
2219 * The __GFP_THISNODE placement logic is really handled elsewhere, 2221 * The __GFP_THISNODE placement logic is really handled elsewhere,
2220 * by forcibly using a zonelist starting at a specified node, and by 2222 * by forcibly using a zonelist starting at a specified node, and by
@@ -2253,20 +2255,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2253 * GFP_USER - only nodes in current tasks mems allowed ok. 2255 * GFP_USER - only nodes in current tasks mems allowed ok.
2254 * 2256 *
2255 * Rule: 2257 * Rule:
2256 * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you 2258 * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
2257 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables 2259 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
2258 * the code that might scan up ancestor cpusets and sleep. 2260 * the code that might scan up ancestor cpusets and sleep.
2259 */ 2261 */
2260 2262int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2261int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2262{ 2263{
2263 int node; /* node that zone z is on */
2264 const struct cpuset *cs; /* current cpuset ancestors */ 2264 const struct cpuset *cs; /* current cpuset ancestors */
2265 int allowed; /* is allocation in zone z allowed? */ 2265 int allowed; /* is allocation in zone z allowed? */
2266 2266
2267 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2267 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2268 return 1; 2268 return 1;
2269 node = zone_to_nid(z);
2270 might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); 2269 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2271 if (node_isset(node, current->mems_allowed)) 2270 if (node_isset(node, current->mems_allowed))
2272 return 1; 2271 return 1;
@@ -2295,15 +2294,15 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2295} 2294}
2296 2295
2297/* 2296/*
2298 * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node? 2297 * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
2299 * @z: is this zone on an allowed node? 2298 * @node: is this an allowed node?
2300 * @gfp_mask: memory allocation flags 2299 * @gfp_mask: memory allocation flags
2301 * 2300 *
2302 * If we're in interrupt, yes, we can always allocate. 2301 * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
2303 * If __GFP_THISNODE is set, yes, we can always allocate. If zone 2302 * set, yes, we can always allocate. If node is in our task's mems_allowed,
2304 * z's node is in our tasks mems_allowed, yes. If the task has been 2303 * yes. If the task has been OOM killed and has access to memory reserves as
2305 * OOM killed and has access to memory reserves as specified by the 2304 * specified by the TIF_MEMDIE flag, yes.
2306 * TIF_MEMDIE flag, yes. Otherwise, no. 2305 * Otherwise, no.
2307 * 2306 *
2308 * The __GFP_THISNODE placement logic is really handled elsewhere, 2307 * The __GFP_THISNODE placement logic is really handled elsewhere,
2309 * by forcibly using a zonelist starting at a specified node, and by 2308 * by forcibly using a zonelist starting at a specified node, and by
@@ -2311,20 +2310,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2311 * any node on the zonelist except the first. By the time any such 2310 * any node on the zonelist except the first. By the time any such
2312 * calls get to this routine, we should just shut up and say 'yes'. 2311 * calls get to this routine, we should just shut up and say 'yes'.
2313 * 2312 *
2314 * Unlike the cpuset_zone_allowed_softwall() variant, above, 2313 * Unlike the cpuset_node_allowed_softwall() variant, above,
2315 * this variant requires that the zone be in the current tasks 2314 * this variant requires that the node be in the current task's
2316 * mems_allowed or that we're in interrupt. It does not scan up the 2315 * mems_allowed or that we're in interrupt. It does not scan up the
2317 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. 2316 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
2318 * It never sleeps. 2317 * It never sleeps.
2319 */ 2318 */
2320 2319int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2321int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
2322{ 2320{
2323 int node; /* node that zone z is on */
2324
2325 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2321 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2326 return 1; 2322 return 1;
2327 node = zone_to_nid(z);
2328 if (node_isset(node, current->mems_allowed)) 2323 if (node_isset(node, current->mems_allowed))
2329 return 1; 2324 return 1;
2330 /* 2325 /*
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index 038707404b76..962a3b574f21 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -98,7 +98,7 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
98 * @size: size of requested memory area 98 * @size: size of requested memory area
99 * @dma_handle: This will be filled with the correct dma handle 99 * @dma_handle: This will be filled with the correct dma handle
100 * @ret: This pointer will be filled with the virtual address 100 * @ret: This pointer will be filled with the virtual address
101 * to allocated area. 101 * to allocated area.
102 * 102 *
103 * This function should be only called from per-arch dma_alloc_coherent() 103 * This function should be only called from per-arch dma_alloc_coherent()
104 * to support allocation from per-device coherent memory pools. 104 * to support allocation from per-device coherent memory pools.
@@ -118,31 +118,32 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
118 mem = dev->dma_mem; 118 mem = dev->dma_mem;
119 if (!mem) 119 if (!mem)
120 return 0; 120 return 0;
121 if (unlikely(size > mem->size)) 121
122 return 0; 122 *ret = NULL;
123
124 if (unlikely(size > (mem->size << PAGE_SHIFT)))
125 goto err;
123 126
124 pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); 127 pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
125 if (pageno >= 0) { 128 if (unlikely(pageno < 0))
126 /* 129 goto err;
127 * Memory was found in the per-device arena. 130
128 */ 131 /*
129 *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); 132 * Memory was found in the per-device area.
130 *ret = mem->virt_base + (pageno << PAGE_SHIFT); 133 */
131 memset(*ret, 0, size); 134 *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
132 } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) { 135 *ret = mem->virt_base + (pageno << PAGE_SHIFT);
133 /* 136 memset(*ret, 0, size);
134 * The per-device arena is exhausted and we are not 137
135 * permitted to fall back to generic memory.
136 */
137 *ret = NULL;
138 } else {
139 /*
140 * The per-device arena is exhausted and we are
141 * permitted to fall back to generic memory.
142 */
143 return 0;
144 }
145 return 1; 138 return 1;
139
140err:
141 /*
142 * In the case where the allocation can not be satisfied from the
143 * per-device area, try to fall back to generic memory if the
144 * constraints allow it.
145 */
146 return mem->flags & DMA_MEMORY_EXCLUSIVE;
146} 147}
147EXPORT_SYMBOL(dma_alloc_from_coherent); 148EXPORT_SYMBOL(dma_alloc_from_coherent);
148 149
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 0511716e9424..c35452cadded 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -18,6 +18,7 @@
18#include <linux/syscalls.h> 18#include <linux/syscalls.h>
19#include <linux/sysctl.h> 19#include <linux/sysctl.h>
20#include <linux/types.h> 20#include <linux/types.h>
21#include <linux/fs_struct.h>
21 22
22 23
23static void default_handler(int, struct pt_regs *); 24static void default_handler(int, struct pt_regs *);
@@ -145,28 +146,6 @@ __set_personality(u_long personality)
145 return 0; 146 return 0;
146 } 147 }
147 148
148 if (atomic_read(&current->fs->count) != 1) {
149 struct fs_struct *fsp, *ofsp;
150
151 fsp = copy_fs_struct(current->fs);
152 if (fsp == NULL) {
153 module_put(ep->module);
154 return -ENOMEM;
155 }
156
157 task_lock(current);
158 ofsp = current->fs;
159 current->fs = fsp;
160 task_unlock(current);
161
162 put_fs_struct(ofsp);
163 }
164
165 /*
166 * At that point we are guaranteed to be the sole owner of
167 * current->fs.
168 */
169
170 current->personality = personality; 149 current->personality = personality;
171 oep = current_thread_info()->exec_domain; 150 oep = current_thread_info()->exec_domain;
172 current_thread_info()->exec_domain = ep; 151 current_thread_info()->exec_domain = ep;
@@ -209,8 +188,7 @@ static int __init proc_execdomains_init(void)
209module_init(proc_execdomains_init); 188module_init(proc_execdomains_init);
210#endif 189#endif
211 190
212asmlinkage long 191SYSCALL_DEFINE1(personality, u_long, personality)
213sys_personality(u_long personality)
214{ 192{
215 u_long old = current->personality; 193 u_long old = current->personality;
216 194
diff --git a/kernel/exit.c b/kernel/exit.c
index c7740fa3252c..abf9cf3b95c6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -46,6 +46,7 @@
46#include <linux/blkdev.h> 46#include <linux/blkdev.h>
47#include <linux/task_io_accounting_ops.h> 47#include <linux/task_io_accounting_ops.h>
48#include <linux/tracehook.h> 48#include <linux/tracehook.h>
49#include <linux/fs_struct.h>
49#include <linux/init_task.h> 50#include <linux/init_task.h>
50#include <trace/sched.h> 51#include <trace/sched.h>
51 52
@@ -61,11 +62,6 @@ DEFINE_TRACE(sched_process_wait);
61 62
62static void exit_mm(struct task_struct * tsk); 63static void exit_mm(struct task_struct * tsk);
63 64
64static inline int task_detached(struct task_struct *p)
65{
66 return p->exit_signal == -1;
67}
68
69static void __unhash_process(struct task_struct *p) 65static void __unhash_process(struct task_struct *p)
70{ 66{
71 nr_threads--; 67 nr_threads--;
@@ -118,6 +114,8 @@ static void __exit_signal(struct task_struct *tsk)
118 * We won't ever get here for the group leader, since it 114 * We won't ever get here for the group leader, since it
119 * will have been the last reference on the signal_struct. 115 * will have been the last reference on the signal_struct.
120 */ 116 */
117 sig->utime = cputime_add(sig->utime, task_utime(tsk));
118 sig->stime = cputime_add(sig->stime, task_stime(tsk));
121 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); 119 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
122 sig->min_flt += tsk->min_flt; 120 sig->min_flt += tsk->min_flt;
123 sig->maj_flt += tsk->maj_flt; 121 sig->maj_flt += tsk->maj_flt;
@@ -126,6 +124,7 @@ static void __exit_signal(struct task_struct *tsk)
126 sig->inblock += task_io_get_inblock(tsk); 124 sig->inblock += task_io_get_inblock(tsk);
127 sig->oublock += task_io_get_oublock(tsk); 125 sig->oublock += task_io_get_oublock(tsk);
128 task_io_accounting_add(&sig->ioac, &tsk->ioac); 126 task_io_accounting_add(&sig->ioac, &tsk->ioac);
127 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
129 sig = NULL; /* Marker for below. */ 128 sig = NULL; /* Marker for below. */
130 } 129 }
131 130
@@ -359,16 +358,12 @@ static void reparent_to_kthreadd(void)
359void __set_special_pids(struct pid *pid) 358void __set_special_pids(struct pid *pid)
360{ 359{
361 struct task_struct *curr = current->group_leader; 360 struct task_struct *curr = current->group_leader;
362 pid_t nr = pid_nr(pid);
363 361
364 if (task_session(curr) != pid) { 362 if (task_session(curr) != pid)
365 change_pid(curr, PIDTYPE_SID, pid); 363 change_pid(curr, PIDTYPE_SID, pid);
366 set_task_session(curr, nr); 364
367 } 365 if (task_pgrp(curr) != pid)
368 if (task_pgrp(curr) != pid) {
369 change_pid(curr, PIDTYPE_PGID, pid); 366 change_pid(curr, PIDTYPE_PGID, pid);
370 set_task_pgrp(curr, nr);
371 }
372} 367}
373 368
374static void set_special_pids(struct pid *pid) 369static void set_special_pids(struct pid *pid)
@@ -426,7 +421,6 @@ EXPORT_SYMBOL(disallow_signal);
426void daemonize(const char *name, ...) 421void daemonize(const char *name, ...)
427{ 422{
428 va_list args; 423 va_list args;
429 struct fs_struct *fs;
430 sigset_t blocked; 424 sigset_t blocked;
431 425
432 va_start(args, name); 426 va_start(args, name);
@@ -459,11 +453,7 @@ void daemonize(const char *name, ...)
459 453
460 /* Become as one with the init task */ 454 /* Become as one with the init task */
461 455
462 exit_fs(current); /* current->fs->count--; */ 456 daemonize_fs_struct();
463 fs = init_task.fs;
464 current->fs = fs;
465 atomic_inc(&fs->count);
466
467 exit_files(current); 457 exit_files(current);
468 current->files = init_task.files; 458 current->files = init_task.files;
469 atomic_inc(&current->files->count); 459 atomic_inc(&current->files->count);
@@ -562,30 +552,6 @@ void exit_files(struct task_struct *tsk)
562 } 552 }
563} 553}
564 554
565void put_fs_struct(struct fs_struct *fs)
566{
567 /* No need to hold fs->lock if we are killing it */
568 if (atomic_dec_and_test(&fs->count)) {
569 path_put(&fs->root);
570 path_put(&fs->pwd);
571 kmem_cache_free(fs_cachep, fs);
572 }
573}
574
575void exit_fs(struct task_struct *tsk)
576{
577 struct fs_struct * fs = tsk->fs;
578
579 if (fs) {
580 task_lock(tsk);
581 tsk->fs = NULL;
582 task_unlock(tsk);
583 put_fs_struct(fs);
584 }
585}
586
587EXPORT_SYMBOL_GPL(exit_fs);
588
589#ifdef CONFIG_MM_OWNER 555#ifdef CONFIG_MM_OWNER
590/* 556/*
591 * Task p is exiting and it owned mm, lets find a new owner for it 557 * Task p is exiting and it owned mm, lets find a new owner for it
@@ -729,119 +695,6 @@ static void exit_mm(struct task_struct * tsk)
729} 695}
730 696
731/* 697/*
732 * Return nonzero if @parent's children should reap themselves.
733 *
734 * Called with write_lock_irq(&tasklist_lock) held.
735 */
736static int ignoring_children(struct task_struct *parent)
737{
738 int ret;
739 struct sighand_struct *psig = parent->sighand;
740 unsigned long flags;
741 spin_lock_irqsave(&psig->siglock, flags);
742 ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
743 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
744 spin_unlock_irqrestore(&psig->siglock, flags);
745 return ret;
746}
747
748/*
749 * Detach all tasks we were using ptrace on.
750 * Any that need to be release_task'd are put on the @dead list.
751 *
752 * Called with write_lock(&tasklist_lock) held.
753 */
754static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
755{
756 struct task_struct *p, *n;
757 int ign = -1;
758
759 list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
760 __ptrace_unlink(p);
761
762 if (p->exit_state != EXIT_ZOMBIE)
763 continue;
764
765 /*
766 * If it's a zombie, our attachedness prevented normal
767 * parent notification or self-reaping. Do notification
768 * now if it would have happened earlier. If it should
769 * reap itself, add it to the @dead list. We can't call
770 * release_task() here because we already hold tasklist_lock.
771 *
772 * If it's our own child, there is no notification to do.
773 * But if our normal children self-reap, then this child
774 * was prevented by ptrace and we must reap it now.
775 */
776 if (!task_detached(p) && thread_group_empty(p)) {
777 if (!same_thread_group(p->real_parent, parent))
778 do_notify_parent(p, p->exit_signal);
779 else {
780 if (ign < 0)
781 ign = ignoring_children(parent);
782 if (ign)
783 p->exit_signal = -1;
784 }
785 }
786
787 if (task_detached(p)) {
788 /*
789 * Mark it as in the process of being reaped.
790 */
791 p->exit_state = EXIT_DEAD;
792 list_add(&p->ptrace_entry, dead);
793 }
794 }
795}
796
797/*
798 * Finish up exit-time ptrace cleanup.
799 *
800 * Called without locks.
801 */
802static void ptrace_exit_finish(struct task_struct *parent,
803 struct list_head *dead)
804{
805 struct task_struct *p, *n;
806
807 BUG_ON(!list_empty(&parent->ptraced));
808
809 list_for_each_entry_safe(p, n, dead, ptrace_entry) {
810 list_del_init(&p->ptrace_entry);
811 release_task(p);
812 }
813}
814
815static void reparent_thread(struct task_struct *p, struct task_struct *father)
816{
817 if (p->pdeath_signal)
818 /* We already hold the tasklist_lock here. */
819 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
820
821 list_move_tail(&p->sibling, &p->real_parent->children);
822
823 /* If this is a threaded reparent there is no need to
824 * notify anyone anything has happened.
825 */
826 if (same_thread_group(p->real_parent, father))
827 return;
828
829 /* We don't want people slaying init. */
830 if (!task_detached(p))
831 p->exit_signal = SIGCHLD;
832
833 /* If we'd notified the old parent about this child's death,
834 * also notify the new parent.
835 */
836 if (!ptrace_reparented(p) &&
837 p->exit_state == EXIT_ZOMBIE &&
838 !task_detached(p) && thread_group_empty(p))
839 do_notify_parent(p, p->exit_signal);
840
841 kill_orphaned_pgrp(p, father);
842}
843
844/*
845 * When we die, we re-parent all our children. 698 * When we die, we re-parent all our children.
846 * Try to give them to another thread in our thread 699 * Try to give them to another thread in our thread
847 * group, and if no such member exists, give it to 700 * group, and if no such member exists, give it to
@@ -880,17 +733,51 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
880 return pid_ns->child_reaper; 733 return pid_ns->child_reaper;
881} 734}
882 735
736/*
737* Any that need to be release_task'd are put on the @dead list.
738 */
739static void reparent_thread(struct task_struct *father, struct task_struct *p,
740 struct list_head *dead)
741{
742 if (p->pdeath_signal)
743 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
744
745 list_move_tail(&p->sibling, &p->real_parent->children);
746
747 if (task_detached(p))
748 return;
749 /*
750 * If this is a threaded reparent there is no need to
751 * notify anyone anything has happened.
752 */
753 if (same_thread_group(p->real_parent, father))
754 return;
755
756 /* We don't want people slaying init. */
757 p->exit_signal = SIGCHLD;
758
759 /* If it has exited notify the new parent about this child's death. */
760 if (!p->ptrace &&
761 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
762 do_notify_parent(p, p->exit_signal);
763 if (task_detached(p)) {
764 p->exit_state = EXIT_DEAD;
765 list_move_tail(&p->sibling, dead);
766 }
767 }
768
769 kill_orphaned_pgrp(p, father);
770}
771
883static void forget_original_parent(struct task_struct *father) 772static void forget_original_parent(struct task_struct *father)
884{ 773{
885 struct task_struct *p, *n, *reaper; 774 struct task_struct *p, *n, *reaper;
886 LIST_HEAD(ptrace_dead); 775 LIST_HEAD(dead_children);
776
777 exit_ptrace(father);
887 778
888 write_lock_irq(&tasklist_lock); 779 write_lock_irq(&tasklist_lock);
889 reaper = find_new_reaper(father); 780 reaper = find_new_reaper(father);
890 /*
891 * First clean up ptrace if we were using it.
892 */
893 ptrace_exit(father, &ptrace_dead);
894 781
895 list_for_each_entry_safe(p, n, &father->children, sibling) { 782 list_for_each_entry_safe(p, n, &father->children, sibling) {
896 p->real_parent = reaper; 783 p->real_parent = reaper;
@@ -898,13 +785,16 @@ static void forget_original_parent(struct task_struct *father)
898 BUG_ON(p->ptrace); 785 BUG_ON(p->ptrace);
899 p->parent = p->real_parent; 786 p->parent = p->real_parent;
900 } 787 }
901 reparent_thread(p, father); 788 reparent_thread(father, p, &dead_children);
902 } 789 }
903
904 write_unlock_irq(&tasklist_lock); 790 write_unlock_irq(&tasklist_lock);
791
905 BUG_ON(!list_empty(&father->children)); 792 BUG_ON(!list_empty(&father->children));
906 793
907 ptrace_exit_finish(father, &ptrace_dead); 794 list_for_each_entry_safe(p, n, &dead_children, sibling) {
795 list_del_init(&p->sibling);
796 release_task(p);
797 }
908} 798}
909 799
910/* 800/*
@@ -947,8 +837,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
947 */ 837 */
948 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && 838 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
949 (tsk->parent_exec_id != tsk->real_parent->self_exec_id || 839 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
950 tsk->self_exec_id != tsk->parent_exec_id) && 840 tsk->self_exec_id != tsk->parent_exec_id))
951 !capable(CAP_KILL))
952 tsk->exit_signal = SIGCHLD; 841 tsk->exit_signal = SIGCHLD;
953 842
954 signal = tracehook_notify_death(tsk, &cookie, group_dead); 843 signal = tracehook_notify_death(tsk, &cookie, group_dead);
@@ -977,12 +866,9 @@ static void check_stack_usage(void)
977{ 866{
978 static DEFINE_SPINLOCK(low_water_lock); 867 static DEFINE_SPINLOCK(low_water_lock);
979 static int lowest_to_date = THREAD_SIZE; 868 static int lowest_to_date = THREAD_SIZE;
980 unsigned long *n = end_of_stack(current);
981 unsigned long free; 869 unsigned long free;
982 870
983 while (*n == 0) 871 free = stack_not_used(current);
984 n++;
985 free = (unsigned long)n - (unsigned long)end_of_stack(current);
986 872
987 if (free >= lowest_to_date) 873 if (free >= lowest_to_date)
988 return; 874 return;
@@ -1037,6 +923,8 @@ NORET_TYPE void do_exit(long code)
1037 schedule(); 923 schedule();
1038 } 924 }
1039 925
926 exit_irq_thread();
927
1040 exit_signals(tsk); /* sets PF_EXITING */ 928 exit_signals(tsk); /* sets PF_EXITING */
1041 /* 929 /*
1042 * tsk->flags are checked in the futex code to protect against 930 * tsk->flags are checked in the futex code to protect against
@@ -1141,7 +1029,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code)
1141 1029
1142EXPORT_SYMBOL(complete_and_exit); 1030EXPORT_SYMBOL(complete_and_exit);
1143 1031
1144asmlinkage long sys_exit(int error_code) 1032SYSCALL_DEFINE1(exit, int, error_code)
1145{ 1033{
1146 do_exit((error_code&0xff)<<8); 1034 do_exit((error_code&0xff)<<8);
1147} 1035}
@@ -1182,9 +1070,11 @@ do_group_exit(int exit_code)
1182 * wait4()-ing process will get the correct exit code - even if this 1070 * wait4()-ing process will get the correct exit code - even if this
1183 * thread is not the thread group leader. 1071 * thread is not the thread group leader.
1184 */ 1072 */
1185asmlinkage void sys_exit_group(int error_code) 1073SYSCALL_DEFINE1(exit_group, int, error_code)
1186{ 1074{
1187 do_group_exit((error_code & 0xff) << 8); 1075 do_group_exit((error_code & 0xff) << 8);
1076 /* NOTREACHED */
1077 return 0;
1188} 1078}
1189 1079
1190static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1080static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
@@ -1415,6 +1305,18 @@ static int wait_task_zombie(struct task_struct *p, int options,
1415 return retval; 1305 return retval;
1416} 1306}
1417 1307
1308static int *task_stopped_code(struct task_struct *p, bool ptrace)
1309{
1310 if (ptrace) {
1311 if (task_is_stopped_or_traced(p))
1312 return &p->exit_code;
1313 } else {
1314 if (p->signal->flags & SIGNAL_STOP_STOPPED)
1315 return &p->signal->group_exit_code;
1316 }
1317 return NULL;
1318}
1319
1418/* 1320/*
1419 * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold 1321 * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
1420 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold 1322 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
@@ -1425,7 +1327,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1425 int options, struct siginfo __user *infop, 1327 int options, struct siginfo __user *infop,
1426 int __user *stat_addr, struct rusage __user *ru) 1328 int __user *stat_addr, struct rusage __user *ru)
1427{ 1329{
1428 int retval, exit_code, why; 1330 int retval, exit_code, *p_code, why;
1429 uid_t uid = 0; /* unneeded, required by compiler */ 1331 uid_t uid = 0; /* unneeded, required by compiler */
1430 pid_t pid; 1332 pid_t pid;
1431 1333
@@ -1435,22 +1337,16 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1435 exit_code = 0; 1337 exit_code = 0;
1436 spin_lock_irq(&p->sighand->siglock); 1338 spin_lock_irq(&p->sighand->siglock);
1437 1339
1438 if (unlikely(!task_is_stopped_or_traced(p))) 1340 p_code = task_stopped_code(p, ptrace);
1439 goto unlock_sig; 1341 if (unlikely(!p_code))
1440
1441 if (!ptrace && p->signal->group_stop_count > 0)
1442 /*
1443 * A group stop is in progress and this is the group leader.
1444 * We won't report until all threads have stopped.
1445 */
1446 goto unlock_sig; 1342 goto unlock_sig;
1447 1343
1448 exit_code = p->exit_code; 1344 exit_code = *p_code;
1449 if (!exit_code) 1345 if (!exit_code)
1450 goto unlock_sig; 1346 goto unlock_sig;
1451 1347
1452 if (!unlikely(options & WNOWAIT)) 1348 if (!unlikely(options & WNOWAIT))
1453 p->exit_code = 0; 1349 *p_code = 0;
1454 1350
1455 /* don't need the RCU readlock here as we're holding a spinlock */ 1351 /* don't need the RCU readlock here as we're holding a spinlock */
1456 uid = __task_cred(p)->uid; 1352 uid = __task_cred(p)->uid;
@@ -1606,7 +1502,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1606 */ 1502 */
1607 *notask_error = 0; 1503 *notask_error = 0;
1608 1504
1609 if (task_is_stopped_or_traced(p)) 1505 if (task_stopped_code(p, ptrace))
1610 return wait_task_stopped(ptrace, p, options, 1506 return wait_task_stopped(ptrace, p, options,
1611 infop, stat_addr, ru); 1507 infop, stat_addr, ru);
1612 1508
@@ -1752,9 +1648,8 @@ end:
1752 return retval; 1648 return retval;
1753} 1649}
1754 1650
1755asmlinkage long sys_waitid(int which, pid_t upid, 1651SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1756 struct siginfo __user *infop, int options, 1652 infop, int, options, struct rusage __user *, ru)
1757 struct rusage __user *ru)
1758{ 1653{
1759 struct pid *pid = NULL; 1654 struct pid *pid = NULL;
1760 enum pid_type type; 1655 enum pid_type type;
@@ -1793,8 +1688,8 @@ asmlinkage long sys_waitid(int which, pid_t upid,
1793 return ret; 1688 return ret;
1794} 1689}
1795 1690
1796asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, 1691SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1797 int options, struct rusage __user *ru) 1692 int, options, struct rusage __user *, ru)
1798{ 1693{
1799 struct pid *pid = NULL; 1694 struct pid *pid = NULL;
1800 enum pid_type type; 1695 enum pid_type type;
@@ -1811,7 +1706,7 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr,
1811 pid = find_get_pid(-upid); 1706 pid = find_get_pid(-upid);
1812 } else if (upid == 0) { 1707 } else if (upid == 0) {
1813 type = PIDTYPE_PGID; 1708 type = PIDTYPE_PGID;
1814 pid = get_pid(task_pgrp(current)); 1709 pid = get_task_pid(current, PIDTYPE_PGID);
1815 } else /* upid > 0 */ { 1710 } else /* upid > 0 */ {
1816 type = PIDTYPE_PID; 1711 type = PIDTYPE_PID;
1817 pid = find_get_pid(upid); 1712 pid = find_get_pid(upid);
@@ -1831,7 +1726,7 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr,
1831 * sys_waitpid() remains for compatibility. waitpid() should be 1726 * sys_waitpid() remains for compatibility. waitpid() should be
1832 * implemented by calling sys_wait4() from libc.a. 1727 * implemented by calling sys_wait4() from libc.a.
1833 */ 1728 */
1834asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options) 1729SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1835{ 1730{
1836 return sys_wait4(pid, stat_addr, options, NULL); 1731 return sys_wait4(pid, stat_addr, options, NULL);
1837} 1732}
diff --git a/kernel/extable.c b/kernel/extable.c
index e136ed8d82ba..7f8f263f8524 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -15,11 +15,22 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/ftrace.h>
19#include <linux/memory.h>
18#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/mutex.h>
19#include <linux/init.h> 22#include <linux/init.h>
20#include <linux/ftrace.h> 23
21#include <asm/uaccess.h>
22#include <asm/sections.h> 24#include <asm/sections.h>
25#include <asm/uaccess.h>
26
27/*
28 * mutex protecting text section modification (dynamic code patching).
29 * some users need to sleep (allocating memory...) while they hold this lock.
30 *
31 * NOT exported to modules - patching kernel text is a really delicate matter.
32 */
33DEFINE_MUTEX(text_mutex);
23 34
24extern struct exception_table_entry __start___ex_table[]; 35extern struct exception_table_entry __start___ex_table[];
25extern struct exception_table_entry __stop___ex_table[]; 36extern struct exception_table_entry __stop___ex_table[];
@@ -41,31 +52,50 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
41 return e; 52 return e;
42} 53}
43 54
44__notrace_funcgraph int core_kernel_text(unsigned long addr) 55static inline int init_kernel_text(unsigned long addr)
56{
57 if (addr >= (unsigned long)_sinittext &&
58 addr <= (unsigned long)_einittext)
59 return 1;
60 return 0;
61}
62
63int core_kernel_text(unsigned long addr)
45{ 64{
46 if (addr >= (unsigned long)_stext && 65 if (addr >= (unsigned long)_stext &&
47 addr <= (unsigned long)_etext) 66 addr <= (unsigned long)_etext)
48 return 1; 67 return 1;
49 68
50 if (system_state == SYSTEM_BOOTING && 69 if (system_state == SYSTEM_BOOTING &&
51 addr >= (unsigned long)_sinittext && 70 init_kernel_text(addr))
52 addr <= (unsigned long)_einittext)
53 return 1; 71 return 1;
54 return 0; 72 return 0;
55} 73}
56 74
57__notrace_funcgraph int __kernel_text_address(unsigned long addr) 75int __kernel_text_address(unsigned long addr)
58{ 76{
59 if (core_kernel_text(addr)) 77 if (core_kernel_text(addr))
60 return 1; 78 return 1;
61 return __module_text_address(addr) != NULL; 79 if (is_module_text_address(addr))
80 return 1;
81 /*
82 * There might be init symbols in saved stacktraces.
83 * Give those symbols a chance to be printed in
84 * backtraces (such as lockdep traces).
85 *
86 * Since we are after the module-symbols check, there's
87 * no danger of address overlap:
88 */
89 if (init_kernel_text(addr))
90 return 1;
91 return 0;
62} 92}
63 93
64int kernel_text_address(unsigned long addr) 94int kernel_text_address(unsigned long addr)
65{ 95{
66 if (core_kernel_text(addr)) 96 if (core_kernel_text(addr))
67 return 1; 97 return 1;
68 return module_text_address(addr) != NULL; 98 return is_module_text_address(addr);
69} 99}
70 100
71/* 101/*
@@ -81,5 +111,5 @@ int func_ptr_is_kernel_text(void *ptr)
81 addr = (unsigned long) dereference_function_descriptor(ptr); 111 addr = (unsigned long) dereference_function_descriptor(ptr);
82 if (core_kernel_text(addr)) 112 if (core_kernel_text(addr))
83 return 1; 113 return 1;
84 return module_text_address(addr) != NULL; 114 return is_module_text_address(addr);
85} 115}
diff --git a/kernel/fork.c b/kernel/fork.c
index 1d68f1255dd8..989c7c202b3d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -60,7 +60,9 @@
60#include <linux/tty.h> 60#include <linux/tty.h>
61#include <linux/proc_fs.h> 61#include <linux/proc_fs.h>
62#include <linux/blkdev.h> 62#include <linux/blkdev.h>
63#include <linux/fs_struct.h>
63#include <trace/sched.h> 64#include <trace/sched.h>
65#include <linux/magic.h>
64 66
65#include <asm/pgtable.h> 67#include <asm/pgtable.h>
66#include <asm/pgalloc.h> 68#include <asm/pgalloc.h>
@@ -212,6 +214,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
212{ 214{
213 struct task_struct *tsk; 215 struct task_struct *tsk;
214 struct thread_info *ti; 216 struct thread_info *ti;
217 unsigned long *stackend;
218
215 int err; 219 int err;
216 220
217 prepare_to_copy(orig); 221 prepare_to_copy(orig);
@@ -237,6 +241,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
237 goto out; 241 goto out;
238 242
239 setup_thread_stack(tsk, orig); 243 setup_thread_stack(tsk, orig);
244 stackend = end_of_stack(tsk);
245 *stackend = STACK_END_MAGIC; /* for overflow detection */
240 246
241#ifdef CONFIG_CC_STACKPROTECTOR 247#ifdef CONFIG_CC_STACKPROTECTOR
242 tsk->stack_canary = get_random_int(); 248 tsk->stack_canary = get_random_int();
@@ -279,7 +285,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
279 mm->free_area_cache = oldmm->mmap_base; 285 mm->free_area_cache = oldmm->mmap_base;
280 mm->cached_hole_size = ~0UL; 286 mm->cached_hole_size = ~0UL;
281 mm->map_count = 0; 287 mm->map_count = 0;
282 cpus_clear(mm->cpu_vm_mask); 288 cpumask_clear(mm_cpumask(mm));
283 mm->mm_rb = RB_ROOT; 289 mm->mm_rb = RB_ROOT;
284 rb_link = &mm->mm_rb.rb_node; 290 rb_link = &mm->mm_rb.rb_node;
285 rb_parent = NULL; 291 rb_parent = NULL;
@@ -639,6 +645,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
639 645
640 tsk->min_flt = tsk->maj_flt = 0; 646 tsk->min_flt = tsk->maj_flt = 0;
641 tsk->nvcsw = tsk->nivcsw = 0; 647 tsk->nvcsw = tsk->nivcsw = 0;
648#ifdef CONFIG_DETECT_HUNG_TASK
649 tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
650#endif
642 651
643 tsk->mm = NULL; 652 tsk->mm = NULL;
644 tsk->active_mm = NULL; 653 tsk->active_mm = NULL;
@@ -676,38 +685,21 @@ fail_nomem:
676 return retval; 685 return retval;
677} 686}
678 687
679static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
680{
681 struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
682 /* We don't need to lock fs - think why ;-) */
683 if (fs) {
684 atomic_set(&fs->count, 1);
685 rwlock_init(&fs->lock);
686 fs->umask = old->umask;
687 read_lock(&old->lock);
688 fs->root = old->root;
689 path_get(&old->root);
690 fs->pwd = old->pwd;
691 path_get(&old->pwd);
692 read_unlock(&old->lock);
693 }
694 return fs;
695}
696
697struct fs_struct *copy_fs_struct(struct fs_struct *old)
698{
699 return __copy_fs_struct(old);
700}
701
702EXPORT_SYMBOL_GPL(copy_fs_struct);
703
704static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) 688static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
705{ 689{
690 struct fs_struct *fs = current->fs;
706 if (clone_flags & CLONE_FS) { 691 if (clone_flags & CLONE_FS) {
707 atomic_inc(&current->fs->count); 692 /* tsk->fs is already what we want */
693 write_lock(&fs->lock);
694 if (fs->in_exec) {
695 write_unlock(&fs->lock);
696 return -EAGAIN;
697 }
698 fs->users++;
699 write_unlock(&fs->lock);
708 return 0; 700 return 0;
709 } 701 }
710 tsk->fs = __copy_fs_struct(current->fs); 702 tsk->fs = copy_fs_struct(fs);
711 if (!tsk->fs) 703 if (!tsk->fs)
712 return -ENOMEM; 704 return -ENOMEM;
713 return 0; 705 return 0;
@@ -817,17 +809,17 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
817static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) 809static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
818{ 810{
819 struct signal_struct *sig; 811 struct signal_struct *sig;
820 int ret;
821 812
822 if (clone_flags & CLONE_THREAD) { 813 if (clone_flags & CLONE_THREAD) {
823 ret = thread_group_cputime_clone_thread(current); 814 atomic_inc(&current->signal->count);
824 if (likely(!ret)) { 815 atomic_inc(&current->signal->live);
825 atomic_inc(&current->signal->count); 816 return 0;
826 atomic_inc(&current->signal->live);
827 }
828 return ret;
829 } 817 }
830 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 818 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
819
820 if (sig)
821 posix_cpu_timers_init_group(sig);
822
831 tsk->signal = sig; 823 tsk->signal = sig;
832 if (!sig) 824 if (!sig)
833 return -ENOMEM; 825 return -ENOMEM;
@@ -836,6 +828,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
836 atomic_set(&sig->live, 1); 828 atomic_set(&sig->live, 1);
837 init_waitqueue_head(&sig->wait_chldexit); 829 init_waitqueue_head(&sig->wait_chldexit);
838 sig->flags = 0; 830 sig->flags = 0;
831 if (clone_flags & CLONE_NEWPID)
832 sig->flags |= SIGNAL_UNKILLABLE;
839 sig->group_exit_code = 0; 833 sig->group_exit_code = 0;
840 sig->group_exit_task = NULL; 834 sig->group_exit_task = NULL;
841 sig->group_stop_count = 0; 835 sig->group_stop_count = 0;
@@ -851,21 +845,20 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
851 sig->tty_old_pgrp = NULL; 845 sig->tty_old_pgrp = NULL;
852 sig->tty = NULL; 846 sig->tty = NULL;
853 847
854 sig->cutime = sig->cstime = cputime_zero; 848 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
855 sig->gtime = cputime_zero; 849 sig->gtime = cputime_zero;
856 sig->cgtime = cputime_zero; 850 sig->cgtime = cputime_zero;
857 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 851 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
858 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 852 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
859 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 853 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
860 task_io_accounting_init(&sig->ioac); 854 task_io_accounting_init(&sig->ioac);
855 sig->sum_sched_runtime = 0;
861 taskstats_tgid_init(sig); 856 taskstats_tgid_init(sig);
862 857
863 task_lock(current->group_leader); 858 task_lock(current->group_leader);
864 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 859 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
865 task_unlock(current->group_leader); 860 task_unlock(current->group_leader);
866 861
867 posix_cpu_timers_init_group(sig);
868
869 acct_init_pacct(&sig->pacct); 862 acct_init_pacct(&sig->pacct);
870 863
871 tty_audit_fork(sig); 864 tty_audit_fork(sig);
@@ -901,7 +894,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
901 clear_freeze_flag(p); 894 clear_freeze_flag(p);
902} 895}
903 896
904asmlinkage long sys_set_tid_address(int __user *tidptr) 897SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
905{ 898{
906 current->clear_child_tid = tidptr; 899 current->clear_child_tid = tidptr;
907 900
@@ -1007,6 +1000,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1007 * triggers too late. This doesn't hurt, the check is only there 1000 * triggers too late. This doesn't hurt, the check is only there
1008 * to stop root fork bombs. 1001 * to stop root fork bombs.
1009 */ 1002 */
1003 retval = -EAGAIN;
1010 if (nr_threads >= max_threads) 1004 if (nr_threads >= max_threads)
1011 goto bad_fork_cleanup_count; 1005 goto bad_fork_cleanup_count;
1012 1006
@@ -1041,11 +1035,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1041 1035
1042 p->default_timer_slack_ns = current->timer_slack_ns; 1036 p->default_timer_slack_ns = current->timer_slack_ns;
1043 1037
1044#ifdef CONFIG_DETECT_SOFTLOCKUP
1045 p->last_switch_count = 0;
1046 p->last_switch_timestamp = 0;
1047#endif
1048
1049 task_io_accounting_init(&p->ioac); 1038 task_io_accounting_init(&p->ioac);
1050 acct_clear_integrals(p); 1039 acct_clear_integrals(p);
1051 1040
@@ -1095,7 +1084,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1095#ifdef CONFIG_DEBUG_MUTEXES 1084#ifdef CONFIG_DEBUG_MUTEXES
1096 p->blocked_on = NULL; /* not blocked yet */ 1085 p->blocked_on = NULL; /* not blocked yet */
1097#endif 1086#endif
1098 if (unlikely(ptrace_reparented(current))) 1087 if (unlikely(current->ptrace))
1099 ptrace_fork(p, clone_flags); 1088 ptrace_fork(p, clone_flags);
1100 1089
1101 /* Perform scheduler related setup. Assign this task to a CPU. */ 1090 /* Perform scheduler related setup. Assign this task to a CPU. */
@@ -1120,7 +1109,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1120 goto bad_fork_cleanup_mm; 1109 goto bad_fork_cleanup_mm;
1121 if ((retval = copy_io(clone_flags, p))) 1110 if ((retval = copy_io(clone_flags, p)))
1122 goto bad_fork_cleanup_namespaces; 1111 goto bad_fork_cleanup_namespaces;
1123 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); 1112 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
1124 if (retval) 1113 if (retval)
1125 goto bad_fork_cleanup_io; 1114 goto bad_fork_cleanup_io;
1126 1115
@@ -1179,10 +1168,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1179#endif 1168#endif
1180 clear_all_latency_tracing(p); 1169 clear_all_latency_tracing(p);
1181 1170
1182 /* Our parent execution domain becomes current domain
1183 These must match for thread signalling to apply */
1184 p->parent_exec_id = p->self_exec_id;
1185
1186 /* ok, now we should be set up.. */ 1171 /* ok, now we should be set up.. */
1187 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); 1172 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
1188 p->pdeath_signal = 0; 1173 p->pdeath_signal = 0;
@@ -1220,10 +1205,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1220 set_task_cpu(p, smp_processor_id()); 1205 set_task_cpu(p, smp_processor_id());
1221 1206
1222 /* CLONE_PARENT re-uses the old parent */ 1207 /* CLONE_PARENT re-uses the old parent */
1223 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) 1208 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
1224 p->real_parent = current->real_parent; 1209 p->real_parent = current->real_parent;
1225 else 1210 p->parent_exec_id = current->parent_exec_id;
1211 } else {
1226 p->real_parent = current; 1212 p->real_parent = current;
1213 p->parent_exec_id = current->self_exec_id;
1214 }
1227 1215
1228 spin_lock(&current->sighand->siglock); 1216 spin_lock(&current->sighand->siglock);
1229 1217
@@ -1259,8 +1247,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1259 p->signal->leader_pid = pid; 1247 p->signal->leader_pid = pid;
1260 tty_kref_put(p->signal->tty); 1248 tty_kref_put(p->signal->tty);
1261 p->signal->tty = tty_kref_get(current->signal->tty); 1249 p->signal->tty = tty_kref_get(current->signal->tty);
1262 set_task_pgrp(p, task_pgrp_nr(current));
1263 set_task_session(p, task_session_nr(current));
1264 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1250 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1265 attach_pid(p, PIDTYPE_SID, task_session(current)); 1251 attach_pid(p, PIDTYPE_SID, task_session(current));
1266 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1252 list_add_tail_rcu(&p->tasks, &init_task.tasks);
@@ -1484,6 +1470,7 @@ void __init proc_caches_init(void)
1484 mm_cachep = kmem_cache_create("mm_struct", 1470 mm_cachep = kmem_cache_create("mm_struct",
1485 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1471 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1486 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1472 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1473 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1487 mmap_init(); 1474 mmap_init();
1488} 1475}
1489 1476
@@ -1539,12 +1526,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1539{ 1526{
1540 struct fs_struct *fs = current->fs; 1527 struct fs_struct *fs = current->fs;
1541 1528
1542 if ((unshare_flags & CLONE_FS) && 1529 if (!(unshare_flags & CLONE_FS) || !fs)
1543 (fs && atomic_read(&fs->count) > 1)) { 1530 return 0;
1544 *new_fsp = __copy_fs_struct(current->fs); 1531
1545 if (!*new_fsp) 1532 /* don't need lock here; in the worst case we'll do useless copy */
1546 return -ENOMEM; 1533 if (fs->users == 1)
1547 } 1534 return 0;
1535
1536 *new_fsp = copy_fs_struct(fs);
1537 if (!*new_fsp)
1538 return -ENOMEM;
1548 1539
1549 return 0; 1540 return 0;
1550} 1541}
@@ -1603,7 +1594,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
1603 * constructed. Here we are modifying the current, active, 1594 * constructed. Here we are modifying the current, active,
1604 * task_struct. 1595 * task_struct.
1605 */ 1596 */
1606asmlinkage long sys_unshare(unsigned long unshare_flags) 1597SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1607{ 1598{
1608 int err = 0; 1599 int err = 0;
1609 struct fs_struct *fs, *new_fs = NULL; 1600 struct fs_struct *fs, *new_fs = NULL;
@@ -1660,8 +1651,13 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1660 1651
1661 if (new_fs) { 1652 if (new_fs) {
1662 fs = current->fs; 1653 fs = current->fs;
1654 write_lock(&fs->lock);
1663 current->fs = new_fs; 1655 current->fs = new_fs;
1664 new_fs = fs; 1656 if (--fs->users)
1657 new_fs = NULL;
1658 else
1659 new_fs = fs;
1660 write_unlock(&fs->lock);
1665 } 1661 }
1666 1662
1667 if (new_mm) { 1663 if (new_mm) {
@@ -1700,7 +1696,7 @@ bad_unshare_cleanup_sigh:
1700 1696
1701bad_unshare_cleanup_fs: 1697bad_unshare_cleanup_fs:
1702 if (new_fs) 1698 if (new_fs)
1703 put_fs_struct(new_fs); 1699 free_fs_struct(new_fs);
1704 1700
1705bad_unshare_cleanup_thread: 1701bad_unshare_cleanup_thread:
1706bad_unshare_out: 1702bad_unshare_out:
diff --git a/kernel/futex.c b/kernel/futex.c
index 002aa189eb09..6b50a024bca2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -114,7 +114,9 @@ struct futex_q {
114}; 114};
115 115
116/* 116/*
117 * Split the global futex_lock into every hash list lock. 117 * Hash buckets are shared by all the futex_keys that hash to the same
118 * location. Each key may have multiple futex_q structures, one for each task
119 * waiting on a futex.
118 */ 120 */
119struct futex_hash_bucket { 121struct futex_hash_bucket {
120 spinlock_t lock; 122 spinlock_t lock;
@@ -189,8 +191,7 @@ static void drop_futex_key_refs(union futex_key *key)
189/** 191/**
190 * get_futex_key - Get parameters which are the keys for a futex. 192 * get_futex_key - Get parameters which are the keys for a futex.
191 * @uaddr: virtual address of the futex 193 * @uaddr: virtual address of the futex
192 * @shared: NULL for a PROCESS_PRIVATE futex, 194 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
193 * &current->mm->mmap_sem for a PROCESS_SHARED futex
194 * @key: address where result is stored. 195 * @key: address where result is stored.
195 * 196 *
196 * Returns a negative error code or 0 197 * Returns a negative error code or 0
@@ -200,9 +201,7 @@ static void drop_futex_key_refs(union futex_key *key)
200 * offset_within_page). For private mappings, it's (uaddr, current->mm). 201 * offset_within_page). For private mappings, it's (uaddr, current->mm).
201 * We can usually work out the index without swapping in the page. 202 * We can usually work out the index without swapping in the page.
202 * 203 *
203 * fshared is NULL for PROCESS_PRIVATE futexes 204 * lock_page() might sleep, the caller should not hold a spinlock.
204 * For other futexes, it points to &current->mm->mmap_sem and
205 * caller must have taken the reader lock. but NOT any spinlocks.
206 */ 205 */
207static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) 206static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
208{ 207{
@@ -299,41 +298,6 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
299 return ret ? -EFAULT : 0; 298 return ret ? -EFAULT : 0;
300} 299}
301 300
302/*
303 * Fault handling.
304 */
305static int futex_handle_fault(unsigned long address, int attempt)
306{
307 struct vm_area_struct * vma;
308 struct mm_struct *mm = current->mm;
309 int ret = -EFAULT;
310
311 if (attempt > 2)
312 return ret;
313
314 down_read(&mm->mmap_sem);
315 vma = find_vma(mm, address);
316 if (vma && address >= vma->vm_start &&
317 (vma->vm_flags & VM_WRITE)) {
318 int fault;
319 fault = handle_mm_fault(mm, vma, address, 1);
320 if (unlikely((fault & VM_FAULT_ERROR))) {
321#if 0
322 /* XXX: let's do this when we verify it is OK */
323 if (ret & VM_FAULT_OOM)
324 ret = -ENOMEM;
325#endif
326 } else {
327 ret = 0;
328 if (fault & VM_FAULT_MAJOR)
329 current->maj_flt++;
330 else
331 current->min_flt++;
332 }
333 }
334 up_read(&mm->mmap_sem);
335 return ret;
336}
337 301
338/* 302/*
339 * PI code: 303 * PI code:
@@ -589,10 +553,9 @@ static void wake_futex(struct futex_q *q)
589 * The waiting task can free the futex_q as soon as this is written, 553 * The waiting task can free the futex_q as soon as this is written,
590 * without taking any locks. This must come last. 554 * without taking any locks. This must come last.
591 * 555 *
592 * A memory barrier is required here to prevent the following store 556 * A memory barrier is required here to prevent the following store to
593 * to lock_ptr from getting ahead of the wakeup. Clearing the lock 557 * lock_ptr from getting ahead of the wakeup. Clearing the lock at the
594 * at the end of wake_up_all() does not prevent this store from 558 * end of wake_up() does not prevent this store from moving.
595 * moving.
596 */ 559 */
597 smp_wmb(); 560 smp_wmb();
598 q->lock_ptr = NULL; 561 q->lock_ptr = NULL;
@@ -692,9 +655,16 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
692 } 655 }
693} 656}
694 657
658static inline void
659double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
660{
661 spin_unlock(&hb1->lock);
662 if (hb1 != hb2)
663 spin_unlock(&hb2->lock);
664}
665
695/* 666/*
696 * Wake up all waiters hashed on the physical page that is mapped 667 * Wake up waiters matching bitset queued on this futex (uaddr).
697 * to this virtual address:
698 */ 668 */
699static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) 669static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
700{ 670{
@@ -750,9 +720,9 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
750 struct futex_hash_bucket *hb1, *hb2; 720 struct futex_hash_bucket *hb1, *hb2;
751 struct plist_head *head; 721 struct plist_head *head;
752 struct futex_q *this, *next; 722 struct futex_q *this, *next;
753 int ret, op_ret, attempt = 0; 723 int ret, op_ret;
754 724
755retryfull: 725retry:
756 ret = get_futex_key(uaddr1, fshared, &key1); 726 ret = get_futex_key(uaddr1, fshared, &key1);
757 if (unlikely(ret != 0)) 727 if (unlikely(ret != 0))
758 goto out; 728 goto out;
@@ -763,16 +733,13 @@ retryfull:
763 hb1 = hash_futex(&key1); 733 hb1 = hash_futex(&key1);
764 hb2 = hash_futex(&key2); 734 hb2 = hash_futex(&key2);
765 735
766retry:
767 double_lock_hb(hb1, hb2); 736 double_lock_hb(hb1, hb2);
768 737retry_private:
769 op_ret = futex_atomic_op_inuser(op, uaddr2); 738 op_ret = futex_atomic_op_inuser(op, uaddr2);
770 if (unlikely(op_ret < 0)) { 739 if (unlikely(op_ret < 0)) {
771 u32 dummy; 740 u32 dummy;
772 741
773 spin_unlock(&hb1->lock); 742 double_unlock_hb(hb1, hb2);
774 if (hb1 != hb2)
775 spin_unlock(&hb2->lock);
776 743
777#ifndef CONFIG_MMU 744#ifndef CONFIG_MMU
778 /* 745 /*
@@ -788,26 +755,16 @@ retry:
788 goto out_put_keys; 755 goto out_put_keys;
789 } 756 }
790 757
791 /*
792 * futex_atomic_op_inuser needs to both read and write
793 * *(int __user *)uaddr2, but we can't modify it
794 * non-atomically. Therefore, if get_user below is not
795 * enough, we need to handle the fault ourselves, while
796 * still holding the mmap_sem.
797 */
798 if (attempt++) {
799 ret = futex_handle_fault((unsigned long)uaddr2,
800 attempt);
801 if (ret)
802 goto out_put_keys;
803 goto retry;
804 }
805
806 ret = get_user(dummy, uaddr2); 758 ret = get_user(dummy, uaddr2);
807 if (ret) 759 if (ret)
808 return ret; 760 goto out_put_keys;
761
762 if (!fshared)
763 goto retry_private;
809 764
810 goto retryfull; 765 put_futex_key(fshared, &key2);
766 put_futex_key(fshared, &key1);
767 goto retry;
811 } 768 }
812 769
813 head = &hb1->chain; 770 head = &hb1->chain;
@@ -834,9 +791,7 @@ retry:
834 ret += op_ret; 791 ret += op_ret;
835 } 792 }
836 793
837 spin_unlock(&hb1->lock); 794 double_unlock_hb(hb1, hb2);
838 if (hb1 != hb2)
839 spin_unlock(&hb2->lock);
840out_put_keys: 795out_put_keys:
841 put_futex_key(fshared, &key2); 796 put_futex_key(fshared, &key2);
842out_put_key1: 797out_put_key1:
@@ -869,6 +824,7 @@ retry:
869 hb1 = hash_futex(&key1); 824 hb1 = hash_futex(&key1);
870 hb2 = hash_futex(&key2); 825 hb2 = hash_futex(&key2);
871 826
827retry_private:
872 double_lock_hb(hb1, hb2); 828 double_lock_hb(hb1, hb2);
873 829
874 if (likely(cmpval != NULL)) { 830 if (likely(cmpval != NULL)) {
@@ -877,16 +833,18 @@ retry:
877 ret = get_futex_value_locked(&curval, uaddr1); 833 ret = get_futex_value_locked(&curval, uaddr1);
878 834
879 if (unlikely(ret)) { 835 if (unlikely(ret)) {
880 spin_unlock(&hb1->lock); 836 double_unlock_hb(hb1, hb2);
881 if (hb1 != hb2)
882 spin_unlock(&hb2->lock);
883 837
884 ret = get_user(curval, uaddr1); 838 ret = get_user(curval, uaddr1);
839 if (ret)
840 goto out_put_keys;
885 841
886 if (!ret) 842 if (!fshared)
887 goto retry; 843 goto retry_private;
888 844
889 goto out_put_keys; 845 put_futex_key(fshared, &key2);
846 put_futex_key(fshared, &key1);
847 goto retry;
890 } 848 }
891 if (curval != *cmpval) { 849 if (curval != *cmpval) {
892 ret = -EAGAIN; 850 ret = -EAGAIN;
@@ -923,9 +881,7 @@ retry:
923 } 881 }
924 882
925out_unlock: 883out_unlock:
926 spin_unlock(&hb1->lock); 884 double_unlock_hb(hb1, hb2);
927 if (hb1 != hb2)
928 spin_unlock(&hb2->lock);
929 885
930 /* drop_futex_key_refs() must be called outside the spinlocks. */ 886 /* drop_futex_key_refs() must be called outside the spinlocks. */
931 while (--drop_count >= 0) 887 while (--drop_count >= 0)
@@ -1063,7 +1019,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1063 struct futex_pi_state *pi_state = q->pi_state; 1019 struct futex_pi_state *pi_state = q->pi_state;
1064 struct task_struct *oldowner = pi_state->owner; 1020 struct task_struct *oldowner = pi_state->owner;
1065 u32 uval, curval, newval; 1021 u32 uval, curval, newval;
1066 int ret, attempt = 0; 1022 int ret;
1067 1023
1068 /* Owner died? */ 1024 /* Owner died? */
1069 if (!pi_state->owner) 1025 if (!pi_state->owner)
@@ -1076,11 +1032,9 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1076 * in the user space variable. This must be atomic as we have 1032 * in the user space variable. This must be atomic as we have
1077 * to preserve the owner died bit here. 1033 * to preserve the owner died bit here.
1078 * 1034 *
1079 * Note: We write the user space value _before_ changing the 1035 * Note: We write the user space value _before_ changing the pi_state
1080 * pi_state because we can fault here. Imagine swapped out 1036 * because we can fault here. Imagine swapped out pages or a fork
1081 * pages or a fork, which was running right before we acquired 1037 * that marked all the anonymous memory readonly for cow.
1082 * mmap_sem, that marked all the anonymous memory readonly for
1083 * cow.
1084 * 1038 *
1085 * Modifying pi_state _before_ the user space value would 1039 * Modifying pi_state _before_ the user space value would
1086 * leave the pi_state in an inconsistent state when we fault 1040 * leave the pi_state in an inconsistent state when we fault
@@ -1136,7 +1090,7 @@ retry:
1136handle_fault: 1090handle_fault:
1137 spin_unlock(q->lock_ptr); 1091 spin_unlock(q->lock_ptr);
1138 1092
1139 ret = futex_handle_fault((unsigned long)uaddr, attempt++); 1093 ret = get_user(uval, uaddr);
1140 1094
1141 spin_lock(q->lock_ptr); 1095 spin_lock(q->lock_ptr);
1142 1096
@@ -1165,6 +1119,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1165 u32 val, ktime_t *abs_time, u32 bitset, int clockrt) 1119 u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
1166{ 1120{
1167 struct task_struct *curr = current; 1121 struct task_struct *curr = current;
1122 struct restart_block *restart;
1168 DECLARE_WAITQUEUE(wait, curr); 1123 DECLARE_WAITQUEUE(wait, curr);
1169 struct futex_hash_bucket *hb; 1124 struct futex_hash_bucket *hb;
1170 struct futex_q q; 1125 struct futex_q q;
@@ -1184,10 +1139,11 @@ retry:
1184 if (unlikely(ret != 0)) 1139 if (unlikely(ret != 0))
1185 goto out; 1140 goto out;
1186 1141
1142retry_private:
1187 hb = queue_lock(&q); 1143 hb = queue_lock(&q);
1188 1144
1189 /* 1145 /*
1190 * Access the page AFTER the futex is queued. 1146 * Access the page AFTER the hash-bucket is locked.
1191 * Order is important: 1147 * Order is important:
1192 * 1148 *
1193 * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); 1149 * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
@@ -1203,24 +1159,29 @@ retry:
1203 * a wakeup when *uaddr != val on entry to the syscall. This is 1159 * a wakeup when *uaddr != val on entry to the syscall. This is
1204 * rare, but normal. 1160 * rare, but normal.
1205 * 1161 *
1206 * for shared futexes, we hold the mmap semaphore, so the mapping 1162 * For shared futexes, we hold the mmap semaphore, so the mapping
1207 * cannot have changed since we looked it up in get_futex_key. 1163 * cannot have changed since we looked it up in get_futex_key.
1208 */ 1164 */
1209 ret = get_futex_value_locked(&uval, uaddr); 1165 ret = get_futex_value_locked(&uval, uaddr);
1210 1166
1211 if (unlikely(ret)) { 1167 if (unlikely(ret)) {
1212 queue_unlock(&q, hb); 1168 queue_unlock(&q, hb);
1213 put_futex_key(fshared, &q.key);
1214 1169
1215 ret = get_user(uval, uaddr); 1170 ret = get_user(uval, uaddr);
1171 if (ret)
1172 goto out_put_key;
1216 1173
1217 if (!ret) 1174 if (!fshared)
1218 goto retry; 1175 goto retry_private;
1219 return ret; 1176
1177 put_futex_key(fshared, &q.key);
1178 goto retry;
1220 } 1179 }
1221 ret = -EWOULDBLOCK; 1180 ret = -EWOULDBLOCK;
1222 if (uval != val) 1181 if (unlikely(uval != val)) {
1223 goto out_unlock_put_key; 1182 queue_unlock(&q, hb);
1183 goto out_put_key;
1184 }
1224 1185
1225 /* Only actually queue if *uaddr contained val. */ 1186 /* Only actually queue if *uaddr contained val. */
1226 queue_me(&q, hb); 1187 queue_me(&q, hb);
@@ -1245,16 +1206,13 @@ retry:
1245 if (!abs_time) 1206 if (!abs_time)
1246 schedule(); 1207 schedule();
1247 else { 1208 else {
1248 unsigned long slack;
1249 slack = current->timer_slack_ns;
1250 if (rt_task(current))
1251 slack = 0;
1252 hrtimer_init_on_stack(&t.timer, 1209 hrtimer_init_on_stack(&t.timer,
1253 clockrt ? CLOCK_REALTIME : 1210 clockrt ? CLOCK_REALTIME :
1254 CLOCK_MONOTONIC, 1211 CLOCK_MONOTONIC,
1255 HRTIMER_MODE_ABS); 1212 HRTIMER_MODE_ABS);
1256 hrtimer_init_sleeper(&t, current); 1213 hrtimer_init_sleeper(&t, current);
1257 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); 1214 hrtimer_set_expires_range_ns(&t.timer, *abs_time,
1215 current->timer_slack_ns);
1258 1216
1259 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); 1217 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
1260 if (!hrtimer_active(&t.timer)) 1218 if (!hrtimer_active(&t.timer))
@@ -1284,38 +1242,38 @@ retry:
1284 */ 1242 */
1285 1243
1286 /* If we were woken (and unqueued), we succeeded, whatever. */ 1244 /* If we were woken (and unqueued), we succeeded, whatever. */
1245 ret = 0;
1287 if (!unqueue_me(&q)) 1246 if (!unqueue_me(&q))
1288 return 0; 1247 goto out_put_key;
1248 ret = -ETIMEDOUT;
1289 if (rem) 1249 if (rem)
1290 return -ETIMEDOUT; 1250 goto out_put_key;
1291 1251
1292 /* 1252 /*
1293 * We expect signal_pending(current), but another thread may 1253 * We expect signal_pending(current), but another thread may
1294 * have handled it for us already. 1254 * have handled it for us already.
1295 */ 1255 */
1256 ret = -ERESTARTSYS;
1296 if (!abs_time) 1257 if (!abs_time)
1297 return -ERESTARTSYS; 1258 goto out_put_key;
1298 else {
1299 struct restart_block *restart;
1300 restart = &current_thread_info()->restart_block;
1301 restart->fn = futex_wait_restart;
1302 restart->futex.uaddr = (u32 *)uaddr;
1303 restart->futex.val = val;
1304 restart->futex.time = abs_time->tv64;
1305 restart->futex.bitset = bitset;
1306 restart->futex.flags = 0;
1307
1308 if (fshared)
1309 restart->futex.flags |= FLAGS_SHARED;
1310 if (clockrt)
1311 restart->futex.flags |= FLAGS_CLOCKRT;
1312 return -ERESTART_RESTARTBLOCK;
1313 }
1314 1259
1315out_unlock_put_key: 1260 restart = &current_thread_info()->restart_block;
1316 queue_unlock(&q, hb); 1261 restart->fn = futex_wait_restart;
1317 put_futex_key(fshared, &q.key); 1262 restart->futex.uaddr = (u32 *)uaddr;
1263 restart->futex.val = val;
1264 restart->futex.time = abs_time->tv64;
1265 restart->futex.bitset = bitset;
1266 restart->futex.flags = 0;
1267
1268 if (fshared)
1269 restart->futex.flags |= FLAGS_SHARED;
1270 if (clockrt)
1271 restart->futex.flags |= FLAGS_CLOCKRT;
1272
1273 ret = -ERESTART_RESTARTBLOCK;
1318 1274
1275out_put_key:
1276 put_futex_key(fshared, &q.key);
1319out: 1277out:
1320 return ret; 1278 return ret;
1321} 1279}
@@ -1351,7 +1309,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1351 struct futex_hash_bucket *hb; 1309 struct futex_hash_bucket *hb;
1352 u32 uval, newval, curval; 1310 u32 uval, newval, curval;
1353 struct futex_q q; 1311 struct futex_q q;
1354 int ret, lock_taken, ownerdied = 0, attempt = 0; 1312 int ret, lock_taken, ownerdied = 0;
1355 1313
1356 if (refill_pi_state_cache()) 1314 if (refill_pi_state_cache())
1357 return -ENOMEM; 1315 return -ENOMEM;
@@ -1371,7 +1329,7 @@ retry:
1371 if (unlikely(ret != 0)) 1329 if (unlikely(ret != 0))
1372 goto out; 1330 goto out;
1373 1331
1374retry_unlocked: 1332retry_private:
1375 hb = queue_lock(&q); 1333 hb = queue_lock(&q);
1376 1334
1377retry_locked: 1335retry_locked:
@@ -1455,6 +1413,7 @@ retry_locked:
1455 * exit to complete. 1413 * exit to complete.
1456 */ 1414 */
1457 queue_unlock(&q, hb); 1415 queue_unlock(&q, hb);
1416 put_futex_key(fshared, &q.key);
1458 cond_resched(); 1417 cond_resched();
1459 goto retry; 1418 goto retry;
1460 1419
@@ -1561,6 +1520,13 @@ retry_locked:
1561 } 1520 }
1562 } 1521 }
1563 1522
1523 /*
1524 * If fixup_pi_state_owner() faulted and was unable to handle the
1525 * fault, unlock it and return the fault to userspace.
1526 */
1527 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
1528 rt_mutex_unlock(&q.pi_state->pi_mutex);
1529
1564 /* Unqueue and drop the lock */ 1530 /* Unqueue and drop the lock */
1565 unqueue_me_pi(&q); 1531 unqueue_me_pi(&q);
1566 1532
@@ -1588,22 +1554,18 @@ uaddr_faulted:
1588 */ 1554 */
1589 queue_unlock(&q, hb); 1555 queue_unlock(&q, hb);
1590 1556
1591 if (attempt++) {
1592 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1593 if (ret)
1594 goto out_put_key;
1595 goto retry_unlocked;
1596 }
1597
1598 ret = get_user(uval, uaddr); 1557 ret = get_user(uval, uaddr);
1599 if (!ret) 1558 if (ret)
1600 goto retry; 1559 goto out_put_key;
1601 1560
1602 if (to) 1561 if (!fshared)
1603 destroy_hrtimer_on_stack(&to->timer); 1562 goto retry_private;
1604 return ret; 1563
1564 put_futex_key(fshared, &q.key);
1565 goto retry;
1605} 1566}
1606 1567
1568
1607/* 1569/*
1608 * Userspace attempted a TID -> 0 atomic transition, and failed. 1570 * Userspace attempted a TID -> 0 atomic transition, and failed.
1609 * This is the in-kernel slowpath: we look up the PI state (if any), 1571 * This is the in-kernel slowpath: we look up the PI state (if any),
@@ -1616,7 +1578,7 @@ static int futex_unlock_pi(u32 __user *uaddr, int fshared)
1616 u32 uval; 1578 u32 uval;
1617 struct plist_head *head; 1579 struct plist_head *head;
1618 union futex_key key = FUTEX_KEY_INIT; 1580 union futex_key key = FUTEX_KEY_INIT;
1619 int ret, attempt = 0; 1581 int ret;
1620 1582
1621retry: 1583retry:
1622 if (get_user(uval, uaddr)) 1584 if (get_user(uval, uaddr))
@@ -1632,7 +1594,6 @@ retry:
1632 goto out; 1594 goto out;
1633 1595
1634 hb = hash_futex(&key); 1596 hb = hash_futex(&key);
1635retry_unlocked:
1636 spin_lock(&hb->lock); 1597 spin_lock(&hb->lock);
1637 1598
1638 /* 1599 /*
@@ -1697,14 +1658,7 @@ pi_faulted:
1697 * we have to drop the mmap_sem in order to call get_user(). 1658 * we have to drop the mmap_sem in order to call get_user().
1698 */ 1659 */
1699 spin_unlock(&hb->lock); 1660 spin_unlock(&hb->lock);
1700 1661 put_futex_key(fshared, &key);
1701 if (attempt++) {
1702 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1703 if (ret)
1704 goto out;
1705 uval = 0;
1706 goto retry_unlocked;
1707 }
1708 1662
1709 ret = get_user(uval, uaddr); 1663 ret = get_user(uval, uaddr);
1710 if (!ret) 1664 if (!ret)
@@ -1733,9 +1687,8 @@ pi_faulted:
1733 * @head: pointer to the list-head 1687 * @head: pointer to the list-head
1734 * @len: length of the list-head, as userspace expects 1688 * @len: length of the list-head, as userspace expects
1735 */ 1689 */
1736asmlinkage long 1690SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
1737sys_set_robust_list(struct robust_list_head __user *head, 1691 size_t, len)
1738 size_t len)
1739{ 1692{
1740 if (!futex_cmpxchg_enabled) 1693 if (!futex_cmpxchg_enabled)
1741 return -ENOSYS; 1694 return -ENOSYS;
@@ -1756,9 +1709,9 @@ sys_set_robust_list(struct robust_list_head __user *head,
1756 * @head_ptr: pointer to a list-head pointer, the kernel fills it in 1709 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
1757 * @len_ptr: pointer to a length field, the kernel fills in the header size 1710 * @len_ptr: pointer to a length field, the kernel fills in the header size
1758 */ 1711 */
1759asmlinkage long 1712SYSCALL_DEFINE3(get_robust_list, int, pid,
1760sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, 1713 struct robust_list_head __user * __user *, head_ptr,
1761 size_t __user *len_ptr) 1714 size_t __user *, len_ptr)
1762{ 1715{
1763 struct robust_list_head __user *head; 1716 struct robust_list_head __user *head;
1764 unsigned long ret; 1717 unsigned long ret;
@@ -1978,9 +1931,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1978} 1931}
1979 1932
1980 1933
1981asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, 1934SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
1982 struct timespec __user *utime, u32 __user *uaddr2, 1935 struct timespec __user *, utime, u32 __user *, uaddr2,
1983 u32 val3) 1936 u32, val3)
1984{ 1937{
1985 struct timespec ts; 1938 struct timespec ts;
1986 ktime_t t, *tp = NULL; 1939 ktime_t t, *tp = NULL;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1455b7651b6b..cb8a15c19583 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -501,6 +501,13 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
501 continue; 501 continue;
502 timer = rb_entry(base->first, struct hrtimer, node); 502 timer = rb_entry(base->first, struct hrtimer, node);
503 expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 503 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
504 /*
505 * clock_was_set() has changed base->offset so the
506 * result might be negative. Fix it up to prevent a
507 * false positive in clockevents_program_event()
508 */
509 if (expires.tv64 < 0)
510 expires.tv64 = 0;
504 if (expires.tv64 < cpu_base->expires_next.tv64) 511 if (expires.tv64 < cpu_base->expires_next.tv64)
505 cpu_base->expires_next = expires; 512 cpu_base->expires_next = expires;
506 } 513 }
@@ -614,7 +621,9 @@ void clock_was_set(void)
614 */ 621 */
615void hres_timers_resume(void) 622void hres_timers_resume(void)
616{ 623{
617 /* Retrigger the CPU local events: */ 624 WARN_ONCE(!irqs_disabled(),
625 KERN_INFO "hres_timers_resume() called with IRQs enabled!");
626
618 retrigger_next_event(NULL); 627 retrigger_next_event(NULL);
619} 628}
620 629
@@ -642,14 +651,20 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
642 * and expiry check is done in the hrtimer_interrupt or in the softirq. 651 * and expiry check is done in the hrtimer_interrupt or in the softirq.
643 */ 652 */
644static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 653static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
645 struct hrtimer_clock_base *base) 654 struct hrtimer_clock_base *base,
655 int wakeup)
646{ 656{
647 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 657 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
648 spin_unlock(&base->cpu_base->lock); 658 if (wakeup) {
649 raise_softirq_irqoff(HRTIMER_SOFTIRQ); 659 spin_unlock(&base->cpu_base->lock);
650 spin_lock(&base->cpu_base->lock); 660 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
661 spin_lock(&base->cpu_base->lock);
662 } else
663 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
664
651 return 1; 665 return 1;
652 } 666 }
667
653 return 0; 668 return 0;
654} 669}
655 670
@@ -694,7 +709,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
694static inline int hrtimer_switch_to_hres(void) { return 0; } 709static inline int hrtimer_switch_to_hres(void) { return 0; }
695static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } 710static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
696static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 711static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
697 struct hrtimer_clock_base *base) 712 struct hrtimer_clock_base *base,
713 int wakeup)
698{ 714{
699 return 0; 715 return 0;
700} 716}
@@ -877,20 +893,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
877 return 0; 893 return 0;
878} 894}
879 895
880/** 896int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
881 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU 897 unsigned long delta_ns, const enum hrtimer_mode mode,
882 * @timer: the timer to be added 898 int wakeup)
883 * @tim: expiry time
884 * @delta_ns: "slack" range for the timer
885 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
886 *
887 * Returns:
888 * 0 on success
889 * 1 when the timer was active
890 */
891int
892hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
893 const enum hrtimer_mode mode)
894{ 899{
895 struct hrtimer_clock_base *base, *new_base; 900 struct hrtimer_clock_base *base, *new_base;
896 unsigned long flags; 901 unsigned long flags;
@@ -931,12 +936,29 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
931 * XXX send_remote_softirq() ? 936 * XXX send_remote_softirq() ?
932 */ 937 */
933 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) 938 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
934 hrtimer_enqueue_reprogram(timer, new_base); 939 hrtimer_enqueue_reprogram(timer, new_base, wakeup);
935 940
936 unlock_hrtimer_base(timer, &flags); 941 unlock_hrtimer_base(timer, &flags);
937 942
938 return ret; 943 return ret;
939} 944}
945
946/**
947 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
948 * @timer: the timer to be added
949 * @tim: expiry time
950 * @delta_ns: "slack" range for the timer
951 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
952 *
953 * Returns:
954 * 0 on success
955 * 1 when the timer was active
956 */
957int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
958 unsigned long delta_ns, const enum hrtimer_mode mode)
959{
960 return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
961}
940EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); 962EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
941 963
942/** 964/**
@@ -952,7 +974,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
952int 974int
953hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) 975hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
954{ 976{
955 return hrtimer_start_range_ns(timer, tim, 0, mode); 977 return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
956} 978}
957EXPORT_SYMBOL_GPL(hrtimer_start); 979EXPORT_SYMBOL_GPL(hrtimer_start);
958 980
@@ -1156,6 +1178,29 @@ static void __run_hrtimer(struct hrtimer *timer)
1156 1178
1157#ifdef CONFIG_HIGH_RES_TIMERS 1179#ifdef CONFIG_HIGH_RES_TIMERS
1158 1180
1181static int force_clock_reprogram;
1182
1183/*
1184 * After 5 iterations, we consider that hrtimer_interrupt()
1185 * is hanging, which could happen with something that slows the interrupt
1186 * such as tracing. We then force clock reprogramming for all future
1187 * hrtimer interrupts to avoid infinite loops, using the min_delta_ns
1188 * threshold, which we overwrite here.
1189 * The next tick event will be scheduled at 3 times the time we currently
1190 * spend in hrtimer_interrupt(). This is a good compromise: the cpus will
1191 * spend 1/4 of their time processing hrtimer interrupts, which is enough
1192 * to keep the system running without serious starvation.
1193 */
1194
1195static inline void
1196hrtimer_interrupt_hanging(struct clock_event_device *dev,
1197 ktime_t try_time)
1198{
1199 force_clock_reprogram = 1;
1200 dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
1201 printk(KERN_WARNING "hrtimer: interrupt too slow, "
1202 "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
1203}
1159/* 1204/*
1160 * High resolution timer interrupt 1205 * High resolution timer interrupt
1161 * Called with interrupts disabled 1206 * Called with interrupts disabled
@@ -1165,6 +1210,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1165 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1210 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1166 struct hrtimer_clock_base *base; 1211 struct hrtimer_clock_base *base;
1167 ktime_t expires_next, now; 1212 ktime_t expires_next, now;
1213 int nr_retries = 0;
1168 int i; 1214 int i;
1169 1215
1170 BUG_ON(!cpu_base->hres_active); 1216 BUG_ON(!cpu_base->hres_active);
@@ -1172,6 +1218,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1172 dev->next_event.tv64 = KTIME_MAX; 1218 dev->next_event.tv64 = KTIME_MAX;
1173 1219
1174 retry: 1220 retry:
1221 /* 5 retries is enough to notice a hang */
1222 if (!(++nr_retries % 5))
1223 hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
1224
1175 now = ktime_get(); 1225 now = ktime_get();
1176 1226
1177 expires_next.tv64 = KTIME_MAX; 1227 expires_next.tv64 = KTIME_MAX;
@@ -1224,7 +1274,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1224 1274
1225 /* Reprogramming necessary ? */ 1275 /* Reprogramming necessary ? */
1226 if (expires_next.tv64 != KTIME_MAX) { 1276 if (expires_next.tv64 != KTIME_MAX) {
1227 if (tick_program_event(expires_next, 0)) 1277 if (tick_program_event(expires_next, force_clock_reprogram))
1228 goto retry; 1278 goto retry;
1229 } 1279 }
1230} 1280}
@@ -1467,8 +1517,8 @@ out:
1467 return ret; 1517 return ret;
1468} 1518}
1469 1519
1470asmlinkage long 1520SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
1471sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) 1521 struct timespec __user *, rmtp)
1472{ 1522{
1473 struct timespec tu; 1523 struct timespec tu;
1474 1524
@@ -1578,6 +1628,10 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1578 break; 1628 break;
1579 1629
1580#ifdef CONFIG_HOTPLUG_CPU 1630#ifdef CONFIG_HOTPLUG_CPU
1631 case CPU_DYING:
1632 case CPU_DYING_FROZEN:
1633 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
1634 break;
1581 case CPU_DEAD: 1635 case CPU_DEAD:
1582 case CPU_DEAD_FROZEN: 1636 case CPU_DEAD_FROZEN:
1583 { 1637 {
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
new file mode 100644
index 000000000000..022a4927b785
--- /dev/null
+++ b/kernel/hung_task.c
@@ -0,0 +1,217 @@
1/*
2 * Detect Hung Task
3 *
4 * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
5 *
6 */
7
8#include <linux/mm.h>
9#include <linux/cpu.h>
10#include <linux/nmi.h>
11#include <linux/init.h>
12#include <linux/delay.h>
13#include <linux/freezer.h>
14#include <linux/kthread.h>
15#include <linux/lockdep.h>
16#include <linux/module.h>
17#include <linux/sysctl.h>
18
19/*
20 * The number of tasks checked:
21 */
22unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
23
24/*
25 * Limit number of tasks checked in a batch.
26 *
27 * This value controls the preemptibility of khungtaskd since preemption
28 * is disabled during the critical section. It also controls the size of
29 * the RCU grace period. So it needs to be upper-bounded.
30 */
31#define HUNG_TASK_BATCHING 1024
32
33/*
34 * Zero means infinite timeout - no checking done:
35 */
36unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
37
38unsigned long __read_mostly sysctl_hung_task_warnings = 10;
39
40static int __read_mostly did_panic;
41
42static struct task_struct *watchdog_task;
43
44/*
45 * Should we panic (and reboot, if panic_timeout= is set) when a
46 * hung task is detected:
47 */
48unsigned int __read_mostly sysctl_hung_task_panic =
49 CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
50
51static int __init hung_task_panic_setup(char *str)
52{
53 sysctl_hung_task_panic = simple_strtoul(str, NULL, 0);
54
55 return 1;
56}
57__setup("hung_task_panic=", hung_task_panic_setup);
58
59static int
60hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
61{
62 did_panic = 1;
63
64 return NOTIFY_DONE;
65}
66
67static struct notifier_block panic_block = {
68 .notifier_call = hung_task_panic,
69};
70
71static void check_hung_task(struct task_struct *t, unsigned long timeout)
72{
73 unsigned long switch_count = t->nvcsw + t->nivcsw;
74
75 /*
76 * Ensure the task is not frozen.
77 * Also, when a freshly created task is scheduled once and changes
78 * its state to TASK_UNINTERRUPTIBLE without having ever been
79 * switched out, it mustn't be checked.
80 */
81 if (unlikely(t->flags & PF_FROZEN || !switch_count))
82 return;
83
84 if (switch_count != t->last_switch_count) {
85 t->last_switch_count = switch_count;
86 return;
87 }
88 if (!sysctl_hung_task_warnings)
89 return;
90 sysctl_hung_task_warnings--;
91
92 /*
93 * Ok, the task did not get scheduled for more than the timeout,
94 * complain:
95 */
96 printk(KERN_ERR "INFO: task %s:%d blocked for more than "
97 "%ld seconds.\n", t->comm, t->pid, timeout);
98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
99 " disables this message.\n");
100 sched_show_task(t);
101 __debug_show_held_locks(t);
102
103 touch_nmi_watchdog();
104
105 if (sysctl_hung_task_panic)
106 panic("hung_task: blocked tasks");
107}
108
109/*
110 * To avoid extending the RCU grace period for an unbounded amount of time,
111 * periodically exit the critical section and enter a new one.
112 *
113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
114 * to exit the grace period. For classic RCU, a reschedule is required.
115 */
116static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
117{
118 get_task_struct(g);
119 get_task_struct(t);
120 rcu_read_unlock();
121 cond_resched();
122 rcu_read_lock();
123 put_task_struct(t);
124 put_task_struct(g);
125}
126
127/*
128 * Check whether a TASK_UNINTERRUPTIBLE task has not been woken up for
129 * a really long time (120 seconds by default). If that happens, print
130 * out a warning.
131 */
132static void check_hung_uninterruptible_tasks(unsigned long timeout)
133{
134 int max_count = sysctl_hung_task_check_count;
135 int batch_count = HUNG_TASK_BATCHING;
136 struct task_struct *g, *t;
137
138 /*
139 * If the system crashed already then all bets are off,
140 * do not report extra hung tasks:
141 */
142 if (test_taint(TAINT_DIE) || did_panic)
143 return;
144
145 rcu_read_lock();
146 do_each_thread(g, t) {
147 if (!--max_count)
148 goto unlock;
149 if (!--batch_count) {
150 batch_count = HUNG_TASK_BATCHING;
151 rcu_lock_break(g, t);
152 /* Exit if t or g was unhashed during refresh. */
153 if (t->state == TASK_DEAD || g->state == TASK_DEAD)
154 goto unlock;
155 }
156 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
157 if (t->state == TASK_UNINTERRUPTIBLE)
158 check_hung_task(t, timeout);
159 } while_each_thread(g, t);
160 unlock:
161 rcu_read_unlock();
162}
163
164static unsigned long timeout_jiffies(unsigned long timeout)
165{
166 /* timeout of 0 will disable the watchdog */
167 return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT;
168}
169
170/*
171 * Process updating of timeout sysctl
172 */
173int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
174 struct file *filp, void __user *buffer,
175 size_t *lenp, loff_t *ppos)
176{
177 int ret;
178
179 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
180
181 if (ret || !write)
182 goto out;
183
184 wake_up_process(watchdog_task);
185
186 out:
187 return ret;
188}
189
190/*
191 * kthread which checks for tasks stuck in D state
192 */
193static int watchdog(void *dummy)
194{
195 set_user_nice(current, 0);
196
197 for ( ; ; ) {
198 unsigned long timeout = sysctl_hung_task_timeout_secs;
199
200 while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
201 timeout = sysctl_hung_task_timeout_secs;
202
203 check_hung_uninterruptible_tasks(timeout);
204 }
205
206 return 0;
207}
208
209static int __init hung_task_init(void)
210{
211 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
212 watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
213
214 return 0;
215}
216
217module_init(hung_task_init);
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 4dd5b1edac98..3394f8f52964 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o 6obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
7obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f63c706d25e1..c687ba4363f2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -46,7 +46,10 @@ void dynamic_irq_init(unsigned int irq)
46 desc->irq_count = 0; 46 desc->irq_count = 0;
47 desc->irqs_unhandled = 0; 47 desc->irqs_unhandled = 0;
48#ifdef CONFIG_SMP 48#ifdef CONFIG_SMP
49 cpumask_setall(&desc->affinity); 49 cpumask_setall(desc->affinity);
50#ifdef CONFIG_GENERIC_PENDING_IRQ
51 cpumask_clear(desc->pending_mask);
52#endif
50#endif 53#endif
51 spin_unlock_irqrestore(&desc->lock, flags); 54 spin_unlock_irqrestore(&desc->lock, flags);
52} 55}
@@ -78,6 +81,7 @@ void dynamic_irq_cleanup(unsigned int irq)
78 desc->handle_irq = handle_bad_irq; 81 desc->handle_irq = handle_bad_irq;
79 desc->chip = &no_irq_chip; 82 desc->chip = &no_irq_chip;
80 desc->name = NULL; 83 desc->name = NULL;
84 clear_kstat_irqs(desc);
81 spin_unlock_irqrestore(&desc->lock, flags); 85 spin_unlock_irqrestore(&desc->lock, flags);
82} 86}
83 87
@@ -290,7 +294,8 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
290 desc->chip->mask_ack(irq); 294 desc->chip->mask_ack(irq);
291 else { 295 else {
292 desc->chip->mask(irq); 296 desc->chip->mask(irq);
293 desc->chip->ack(irq); 297 if (desc->chip->ack)
298 desc->chip->ack(irq);
294 } 299 }
295} 300}
296 301
@@ -383,6 +388,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
383out_unlock: 388out_unlock:
384 spin_unlock(&desc->lock); 389 spin_unlock(&desc->lock);
385} 390}
391EXPORT_SYMBOL_GPL(handle_level_irq);
386 392
387/** 393/**
388 * handle_fasteoi_irq - irq handler for transparent controllers 394 * handle_fasteoi_irq - irq handler for transparent controllers
@@ -475,7 +481,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
475 kstat_incr_irqs_this_cpu(irq, desc); 481 kstat_incr_irqs_this_cpu(irq, desc);
476 482
477 /* Start handling the irq */ 483 /* Start handling the irq */
478 desc->chip->ack(irq); 484 if (desc->chip->ack)
485 desc->chip->ack(irq);
479 desc = irq_remap_to_desc(irq, desc); 486 desc = irq_remap_to_desc(irq, desc);
480 487
481 /* Mark the IRQ currently in progress.*/ 488 /* Mark the IRQ currently in progress.*/
@@ -593,6 +600,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
593 } 600 }
594 spin_unlock_irqrestore(&desc->lock, flags); 601 spin_unlock_irqrestore(&desc->lock, flags);
595} 602}
603EXPORT_SYMBOL_GPL(__set_irq_handler);
596 604
597void 605void
598set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, 606set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 38a25b8d8bff..d06df9c41cba 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -26,10 +26,12 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
26} 26}
27 27
28/** 28/**
29 * devm_request_irq - allocate an interrupt line for a managed device 29 * devm_request_threaded_irq - allocate an interrupt line for a managed device
30 * @dev: device to request interrupt for 30 * @dev: device to request interrupt for
31 * @irq: Interrupt line to allocate 31 * @irq: Interrupt line to allocate
32 * @handler: Function to be called when the IRQ occurs 32 * @handler: Function to be called when the IRQ occurs
33 * @thread_fn: function to be called in a threaded interrupt context. NULL
34 * for devices which handle everything in @handler
33 * @irqflags: Interrupt type flags 35 * @irqflags: Interrupt type flags
34 * @devname: An ascii name for the claiming device 36 * @devname: An ascii name for the claiming device
35 * @dev_id: A cookie passed back to the handler function 37 * @dev_id: A cookie passed back to the handler function
@@ -42,9 +44,10 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
42 * If an IRQ allocated with this function needs to be freed 44 * If an IRQ allocated with this function needs to be freed
43 * separately, dev_free_irq() must be used. 45 * separately, dev_free_irq() must be used.
44 */ 46 */
45int devm_request_irq(struct device *dev, unsigned int irq, 47int devm_request_threaded_irq(struct device *dev, unsigned int irq,
46 irq_handler_t handler, unsigned long irqflags, 48 irq_handler_t handler, irq_handler_t thread_fn,
47 const char *devname, void *dev_id) 49 unsigned long irqflags, const char *devname,
50 void *dev_id)
48{ 51{
49 struct irq_devres *dr; 52 struct irq_devres *dr;
50 int rc; 53 int rc;
@@ -54,7 +57,8 @@ int devm_request_irq(struct device *dev, unsigned int irq,
54 if (!dr) 57 if (!dr)
55 return -ENOMEM; 58 return -ENOMEM;
56 59
57 rc = request_irq(irq, handler, irqflags, devname, dev_id); 60 rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname,
61 dev_id);
58 if (rc) { 62 if (rc) {
59 devres_free(dr); 63 devres_free(dr);
60 return rc; 64 return rc;
@@ -66,7 +70,7 @@ int devm_request_irq(struct device *dev, unsigned int irq,
66 70
67 return 0; 71 return 0;
68} 72}
69EXPORT_SYMBOL(devm_request_irq); 73EXPORT_SYMBOL(devm_request_threaded_irq);
70 74
71/** 75/**
72 * devm_free_irq - free an interrupt 76 * devm_free_irq - free an interrupt
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c20db0be9173..d82142be8dd2 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -17,6 +17,8 @@
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18#include <linux/rculist.h> 18#include <linux/rculist.h>
19#include <linux/hash.h> 19#include <linux/hash.h>
20#include <trace/irq.h>
21#include <linux/bootmem.h>
20 22
21#include "internals.h" 23#include "internals.h"
22 24
@@ -39,6 +41,18 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
39 ack_bad_irq(irq); 41 ack_bad_irq(irq);
40} 42}
41 43
44#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
45static void __init init_irq_default_affinity(void)
46{
47 alloc_bootmem_cpumask_var(&irq_default_affinity);
48 cpumask_setall(irq_default_affinity);
49}
50#else
51static void __init init_irq_default_affinity(void)
52{
53}
54#endif
55
42/* 56/*
43 * Linux has a controller-independent interrupt architecture. 57 * Linux has a controller-independent interrupt architecture.
44 * Every controller has a 'controller-template', that is used 58 * Every controller has a 'controller-template', that is used
@@ -57,6 +71,7 @@ int nr_irqs = NR_IRQS;
57EXPORT_SYMBOL_GPL(nr_irqs); 71EXPORT_SYMBOL_GPL(nr_irqs);
58 72
59#ifdef CONFIG_SPARSE_IRQ 73#ifdef CONFIG_SPARSE_IRQ
74
60static struct irq_desc irq_desc_init = { 75static struct irq_desc irq_desc_init = {
61 .irq = -1, 76 .irq = -1,
62 .status = IRQ_DISABLED, 77 .status = IRQ_DISABLED,
@@ -64,26 +79,25 @@ static struct irq_desc irq_desc_init = {
64 .handle_irq = handle_bad_irq, 79 .handle_irq = handle_bad_irq,
65 .depth = 1, 80 .depth = 1,
66 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 81 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
67#ifdef CONFIG_SMP
68 .affinity = CPU_MASK_ALL
69#endif
70}; 82};
71 83
72void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) 84void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
73{ 85{
74 unsigned long bytes;
75 char *ptr;
76 int node; 86 int node;
77 87 void *ptr;
78 /* Compute how many bytes we need per irq and allocate them */
79 bytes = nr * sizeof(unsigned int);
80 88
81 node = cpu_to_node(cpu); 89 node = cpu_to_node(cpu);
82 ptr = kzalloc_node(bytes, GFP_ATOMIC, node); 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node);
83 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", cpu, node);
84 91
85 if (ptr) 92 /*
86 desc->kstat_irqs = (unsigned int *)ptr; 93 * don't overwrite if can not get new one
94 * init_copy_kstat_irqs() could still use old one
95 */
96 if (ptr) {
97 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n",
98 cpu, node);
99 desc->kstat_irqs = ptr;
100 }
87} 101}
88 102
89static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) 103static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
@@ -101,6 +115,10 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
101 printk(KERN_ERR "can not alloc kstat_irqs\n"); 115 printk(KERN_ERR "can not alloc kstat_irqs\n");
102 BUG_ON(1); 116 BUG_ON(1);
103 } 117 }
118 if (!init_alloc_desc_masks(desc, cpu, false)) {
119 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
120 BUG_ON(1);
121 }
104 arch_init_chip_data(desc, cpu); 122 arch_init_chip_data(desc, cpu);
105} 123}
106 124
@@ -109,7 +127,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
109 */ 127 */
110DEFINE_SPINLOCK(sparse_irq_lock); 128DEFINE_SPINLOCK(sparse_irq_lock);
111 129
112struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly; 130struct irq_desc **irq_desc_ptrs __read_mostly;
113 131
114static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { 132static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
115 [0 ... NR_IRQS_LEGACY-1] = { 133 [0 ... NR_IRQS_LEGACY-1] = {
@@ -119,14 +137,10 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
119 .handle_irq = handle_bad_irq, 137 .handle_irq = handle_bad_irq,
120 .depth = 1, 138 .depth = 1,
121 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 139 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
122#ifdef CONFIG_SMP
123 .affinity = CPU_MASK_ALL
124#endif
125 } 140 }
126}; 141};
127 142
128/* FIXME: use bootmem alloc ...*/ 143static unsigned int *kstat_irqs_legacy;
129static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
130 144
131int __init early_irq_init(void) 145int __init early_irq_init(void)
132{ 146{
@@ -134,18 +148,32 @@ int __init early_irq_init(void)
134 int legacy_count; 148 int legacy_count;
135 int i; 149 int i;
136 150
151 init_irq_default_affinity();
152
153 /* initialize nr_irqs based on nr_cpu_ids */
154 arch_probe_nr_irqs();
155 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
156
137 desc = irq_desc_legacy; 157 desc = irq_desc_legacy;
138 legacy_count = ARRAY_SIZE(irq_desc_legacy); 158 legacy_count = ARRAY_SIZE(irq_desc_legacy);
139 159
160 /* allocate irq_desc_ptrs array based on nr_irqs */
161 irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
162
163 /* allocate based on nr_cpu_ids */
164 /* FIXME: invert kstat_irqs, and it'd be a per_cpu_alloc'd thing */
165 kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
166 sizeof(int));
167
140 for (i = 0; i < legacy_count; i++) { 168 for (i = 0; i < legacy_count; i++) {
141 desc[i].irq = i; 169 desc[i].irq = i;
142 desc[i].kstat_irqs = kstat_irqs_legacy[i]; 170 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
143 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 171 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
144 172 init_alloc_desc_masks(&desc[i], 0, true);
145 irq_desc_ptrs[i] = desc + i; 173 irq_desc_ptrs[i] = desc + i;
146 } 174 }
147 175
148 for (i = legacy_count; i < NR_IRQS; i++) 176 for (i = legacy_count; i < nr_irqs; i++)
149 irq_desc_ptrs[i] = NULL; 177 irq_desc_ptrs[i] = NULL;
150 178
151 return arch_early_irq_init(); 179 return arch_early_irq_init();
@@ -153,7 +181,10 @@ int __init early_irq_init(void)
153 181
154struct irq_desc *irq_to_desc(unsigned int irq) 182struct irq_desc *irq_to_desc(unsigned int irq)
155{ 183{
156 return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL; 184 if (irq_desc_ptrs && irq < nr_irqs)
185 return irq_desc_ptrs[irq];
186
187 return NULL;
157} 188}
158 189
159struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) 190struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
@@ -162,10 +193,9 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
162 unsigned long flags; 193 unsigned long flags;
163 int node; 194 int node;
164 195
165 if (irq >= NR_IRQS) { 196 if (irq >= nr_irqs) {
166 printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n", 197 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
167 irq, NR_IRQS); 198 irq, nr_irqs);
168 WARN_ON(1);
169 return NULL; 199 return NULL;
170 } 200 }
171 201
@@ -207,24 +237,28 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
207 .handle_irq = handle_bad_irq, 237 .handle_irq = handle_bad_irq,
208 .depth = 1, 238 .depth = 1,
209 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), 239 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
210#ifdef CONFIG_SMP
211 .affinity = CPU_MASK_ALL
212#endif
213 } 240 }
214}; 241};
215 242
243static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
216int __init early_irq_init(void) 244int __init early_irq_init(void)
217{ 245{
218 struct irq_desc *desc; 246 struct irq_desc *desc;
219 int count; 247 int count;
220 int i; 248 int i;
221 249
250 init_irq_default_affinity();
251
252 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
253
222 desc = irq_desc; 254 desc = irq_desc;
223 count = ARRAY_SIZE(irq_desc); 255 count = ARRAY_SIZE(irq_desc);
224 256
225 for (i = 0; i < count; i++) 257 for (i = 0; i < count; i++) {
226 desc[i].irq = i; 258 desc[i].irq = i;
227 259 init_alloc_desc_masks(&desc[i], 0, true);
260 desc[i].kstat_irqs = kstat_irqs_all[i];
261 }
228 return arch_early_irq_init(); 262 return arch_early_irq_init();
229} 263}
230 264
@@ -239,6 +273,11 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
239} 273}
240#endif /* !CONFIG_SPARSE_IRQ */ 274#endif /* !CONFIG_SPARSE_IRQ */
241 275
276void clear_kstat_irqs(struct irq_desc *desc)
277{
278 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
279}
280
242/* 281/*
243 * What should we do if we get a hw irq event on an illegal vector? 282 * What should we do if we get a hw irq event on an illegal vector?
244 * Each architecture has to answer this themself. 283 * Each architecture has to answer this themself.
@@ -300,6 +339,18 @@ irqreturn_t no_action(int cpl, void *dev_id)
300 return IRQ_NONE; 339 return IRQ_NONE;
301} 340}
302 341
342static void warn_no_thread(unsigned int irq, struct irqaction *action)
343{
344 if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags))
345 return;
346
347 printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD "
348 "but no thread function available.", irq, action->name);
349}
350
351DEFINE_TRACE(irq_handler_entry);
352DEFINE_TRACE(irq_handler_exit);
353
303/** 354/**
304 * handle_IRQ_event - irq action chain handler 355 * handle_IRQ_event - irq action chain handler
305 * @irq: the interrupt number 356 * @irq: the interrupt number
@@ -312,13 +363,56 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
312 irqreturn_t ret, retval = IRQ_NONE; 363 irqreturn_t ret, retval = IRQ_NONE;
313 unsigned int status = 0; 364 unsigned int status = 0;
314 365
366 WARN_ONCE(!in_irq(), "BUG: IRQ handler called from non-hardirq context!");
367
315 if (!(action->flags & IRQF_DISABLED)) 368 if (!(action->flags & IRQF_DISABLED))
316 local_irq_enable_in_hardirq(); 369 local_irq_enable_in_hardirq();
317 370
318 do { 371 do {
372 trace_irq_handler_entry(irq, action);
319 ret = action->handler(irq, action->dev_id); 373 ret = action->handler(irq, action->dev_id);
320 if (ret == IRQ_HANDLED) 374 trace_irq_handler_exit(irq, action, ret);
375
376 switch (ret) {
377 case IRQ_WAKE_THREAD:
378 /*
379 * Set result to handled so the spurious check
380 * does not trigger.
381 */
382 ret = IRQ_HANDLED;
383
384 /*
385 * Catch drivers which return WAKE_THREAD but
386 * did not set up a thread function
387 */
388 if (unlikely(!action->thread_fn)) {
389 warn_no_thread(irq, action);
390 break;
391 }
392
393 /*
394 * Wake up the handler thread for this
395 * action. In case the thread crashed and was
396 * killed we just pretend that we handled the
397 * interrupt. The hardirq handler above has
398 * disabled the device interrupt, so no irq
399 * storm is lurking.
400 */
401 if (likely(!test_bit(IRQTF_DIED,
402 &action->thread_flags))) {
403 set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
404 wake_up_process(action->thread);
405 }
406
407 /* Fall through to add to randomness */
408 case IRQ_HANDLED:
321 status |= action->flags; 409 status |= action->flags;
410 break;
411
412 default:
413 break;
414 }
415
322 retval |= ret; 416 retval |= ret;
323 action = action->next; 417 action = action->next;
324 } while (action); 418 } while (action);
@@ -331,6 +425,11 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
331} 425}
332 426
333#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ 427#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
428
429#ifdef CONFIG_ENABLE_WARN_DEPRECATED
430# warning __do_IRQ is deprecated. Please convert to proper flow handlers
431#endif
432
334/** 433/**
335 * __do_IRQ - original all in one highlevel IRQ handler 434 * __do_IRQ - original all in one highlevel IRQ handler
336 * @irq: the interrupt number 435 * @irq: the interrupt number
@@ -451,12 +550,10 @@ void early_init_irq_lock_class(void)
451 } 550 }
452} 551}
453 552
454#ifdef CONFIG_SPARSE_IRQ
455unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 553unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
456{ 554{
457 struct irq_desc *desc = irq_to_desc(irq); 555 struct irq_desc *desc = irq_to_desc(irq);
458 return desc ? desc->kstat_irqs[cpu] : 0; 556 return desc ? desc->kstat_irqs[cpu] : 0;
459} 557}
460#endif
461EXPORT_SYMBOL(kstat_irqs_cpu); 558EXPORT_SYMBOL(kstat_irqs_cpu);
462 559
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e6d0a43cc125..01ce20eab38f 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -12,11 +12,21 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
12 12
13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
14 unsigned long flags); 14 unsigned long flags);
15extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
15 17
16extern struct lock_class_key irq_desc_lock_class; 18extern struct lock_class_key irq_desc_lock_class;
17extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); 19extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc);
18extern spinlock_t sparse_irq_lock; 21extern spinlock_t sparse_irq_lock;
22
23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */
25extern struct irq_desc **irq_desc_ptrs;
26#else
27/* irq_desc_ptrs is a fixed size array */
19extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; 28extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
29#endif
20 30
21#ifdef CONFIG_PROC_FS 31#ifdef CONFIG_PROC_FS
22extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 32extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index cd0cd8dcb345..7e2e7dd4cd2f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -8,24 +8,15 @@
8 */ 8 */
9 9
10#include <linux/irq.h> 10#include <linux/irq.h>
11#include <linux/kthread.h>
11#include <linux/module.h> 12#include <linux/module.h>
12#include <linux/random.h> 13#include <linux/random.h>
13#include <linux/interrupt.h> 14#include <linux/interrupt.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/sched.h>
15 17
16#include "internals.h" 18#include "internals.h"
17 19
18#ifdef CONFIG_SMP
19cpumask_var_t irq_default_affinity;
20
21static int init_irq_default_affinity(void)
22{
23 alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL);
24 cpumask_setall(irq_default_affinity);
25 return 0;
26}
27core_initcall(init_irq_default_affinity);
28
29/** 20/**
30 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 21 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
31 * @irq: interrupt number to wait for 22 * @irq: interrupt number to wait for
@@ -61,9 +52,18 @@ void synchronize_irq(unsigned int irq)
61 52
62 /* Oops, that failed? */ 53 /* Oops, that failed? */
63 } while (status & IRQ_INPROGRESS); 54 } while (status & IRQ_INPROGRESS);
55
56 /*
57 * We made sure that no hardirq handler is running. Now verify
58 * that no threaded handlers are active.
59 */
60 wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active));
64} 61}
65EXPORT_SYMBOL(synchronize_irq); 62EXPORT_SYMBOL(synchronize_irq);
66 63
64#ifdef CONFIG_SMP
65cpumask_var_t irq_default_affinity;
66
67/** 67/**
68 * irq_can_set_affinity - Check if the affinity of a given irq can be set 68 * irq_can_set_affinity - Check if the affinity of a given irq can be set
69 * @irq: Interrupt to check 69 * @irq: Interrupt to check
@@ -80,6 +80,18 @@ int irq_can_set_affinity(unsigned int irq)
80 return 1; 80 return 1;
81} 81}
82 82
83static void
84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
85{
86 struct irqaction *action = desc->action;
87
88 while (action) {
89 if (action->thread)
90 set_cpus_allowed_ptr(action->thread, cpumask);
91 action = action->next;
92 }
93}
94
83/** 95/**
84 * irq_set_affinity - Set the irq affinity of a given irq 96 * irq_set_affinity - Set the irq affinity of a given irq
85 * @irq: Interrupt to set affinity 97 * @irq: Interrupt to set affinity
@@ -98,16 +110,17 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
98 110
99#ifdef CONFIG_GENERIC_PENDING_IRQ 111#ifdef CONFIG_GENERIC_PENDING_IRQ
100 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { 112 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
101 cpumask_copy(&desc->affinity, cpumask); 113 cpumask_copy(desc->affinity, cpumask);
102 desc->chip->set_affinity(irq, cpumask); 114 desc->chip->set_affinity(irq, cpumask);
103 } else { 115 } else {
104 desc->status |= IRQ_MOVE_PENDING; 116 desc->status |= IRQ_MOVE_PENDING;
105 cpumask_copy(&desc->pending_mask, cpumask); 117 cpumask_copy(desc->pending_mask, cpumask);
106 } 118 }
107#else 119#else
108 cpumask_copy(&desc->affinity, cpumask); 120 cpumask_copy(desc->affinity, cpumask);
109 desc->chip->set_affinity(irq, cpumask); 121 desc->chip->set_affinity(irq, cpumask);
110#endif 122#endif
123 irq_set_thread_affinity(desc, cpumask);
111 desc->status |= IRQ_AFFINITY_SET; 124 desc->status |= IRQ_AFFINITY_SET;
112 spin_unlock_irqrestore(&desc->lock, flags); 125 spin_unlock_irqrestore(&desc->lock, flags);
113 return 0; 126 return 0;
@@ -117,7 +130,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
117/* 130/*
118 * Generic version of the affinity autoselector. 131 * Generic version of the affinity autoselector.
119 */ 132 */
120int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) 133static int setup_affinity(unsigned int irq, struct irq_desc *desc)
121{ 134{
122 if (!irq_can_set_affinity(irq)) 135 if (!irq_can_set_affinity(irq))
123 return 0; 136 return 0;
@@ -127,21 +140,21 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
127 * one of the targets is online. 140 * one of the targets is online.
128 */ 141 */
129 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 142 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
130 if (cpumask_any_and(&desc->affinity, cpu_online_mask) 143 if (cpumask_any_and(desc->affinity, cpu_online_mask)
131 < nr_cpu_ids) 144 < nr_cpu_ids)
132 goto set_affinity; 145 goto set_affinity;
133 else 146 else
134 desc->status &= ~IRQ_AFFINITY_SET; 147 desc->status &= ~IRQ_AFFINITY_SET;
135 } 148 }
136 149
137 cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity); 150 cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
138set_affinity: 151set_affinity:
139 desc->chip->set_affinity(irq, &desc->affinity); 152 desc->chip->set_affinity(irq, desc->affinity);
140 153
141 return 0; 154 return 0;
142} 155}
143#else 156#else
144static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d) 157static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
145{ 158{
146 return irq_select_affinity(irq); 159 return irq_select_affinity(irq);
147} 160}
@@ -157,19 +170,35 @@ int irq_select_affinity_usr(unsigned int irq)
157 int ret; 170 int ret;
158 171
159 spin_lock_irqsave(&desc->lock, flags); 172 spin_lock_irqsave(&desc->lock, flags);
160 ret = do_irq_select_affinity(irq, desc); 173 ret = setup_affinity(irq, desc);
174 if (!ret)
175 irq_set_thread_affinity(desc, desc->affinity);
161 spin_unlock_irqrestore(&desc->lock, flags); 176 spin_unlock_irqrestore(&desc->lock, flags);
162 177
163 return ret; 178 return ret;
164} 179}
165 180
166#else 181#else
167static inline int do_irq_select_affinity(int irq, struct irq_desc *desc) 182static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
168{ 183{
169 return 0; 184 return 0;
170} 185}
171#endif 186#endif
172 187
188void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
189{
190 if (suspend) {
191 if (!desc->action || (desc->action->flags & IRQF_TIMER))
192 return;
193 desc->status |= IRQ_SUSPENDED;
194 }
195
196 if (!desc->depth++) {
197 desc->status |= IRQ_DISABLED;
198 desc->chip->disable(irq);
199 }
200}
201
173/** 202/**
174 * disable_irq_nosync - disable an irq without waiting 203 * disable_irq_nosync - disable an irq without waiting
175 * @irq: Interrupt to disable 204 * @irq: Interrupt to disable
@@ -190,10 +219,7 @@ void disable_irq_nosync(unsigned int irq)
190 return; 219 return;
191 220
192 spin_lock_irqsave(&desc->lock, flags); 221 spin_lock_irqsave(&desc->lock, flags);
193 if (!desc->depth++) { 222 __disable_irq(desc, irq, false);
194 desc->status |= IRQ_DISABLED;
195 desc->chip->disable(irq);
196 }
197 spin_unlock_irqrestore(&desc->lock, flags); 223 spin_unlock_irqrestore(&desc->lock, flags);
198} 224}
199EXPORT_SYMBOL(disable_irq_nosync); 225EXPORT_SYMBOL(disable_irq_nosync);
@@ -223,15 +249,21 @@ void disable_irq(unsigned int irq)
223} 249}
224EXPORT_SYMBOL(disable_irq); 250EXPORT_SYMBOL(disable_irq);
225 251
226static void __enable_irq(struct irq_desc *desc, unsigned int irq) 252void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
227{ 253{
254 if (resume)
255 desc->status &= ~IRQ_SUSPENDED;
256
228 switch (desc->depth) { 257 switch (desc->depth) {
229 case 0: 258 case 0:
259 err_out:
230 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); 260 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
231 break; 261 break;
232 case 1: { 262 case 1: {
233 unsigned int status = desc->status & ~IRQ_DISABLED; 263 unsigned int status = desc->status & ~IRQ_DISABLED;
234 264
265 if (desc->status & IRQ_SUSPENDED)
266 goto err_out;
235 /* Prevent probing on this irq: */ 267 /* Prevent probing on this irq: */
236 desc->status = status | IRQ_NOPROBE; 268 desc->status = status | IRQ_NOPROBE;
237 check_irq_resend(desc, irq); 269 check_irq_resend(desc, irq);
@@ -261,7 +293,7 @@ void enable_irq(unsigned int irq)
261 return; 293 return;
262 294
263 spin_lock_irqsave(&desc->lock, flags); 295 spin_lock_irqsave(&desc->lock, flags);
264 __enable_irq(desc, irq); 296 __enable_irq(desc, irq, false);
265 spin_unlock_irqrestore(&desc->lock, flags); 297 spin_unlock_irqrestore(&desc->lock, flags);
266} 298}
267EXPORT_SYMBOL(enable_irq); 299EXPORT_SYMBOL(enable_irq);
@@ -392,14 +424,98 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
392 return ret; 424 return ret;
393} 425}
394 426
427static int irq_wait_for_interrupt(struct irqaction *action)
428{
429 while (!kthread_should_stop()) {
430 set_current_state(TASK_INTERRUPTIBLE);
431
432 if (test_and_clear_bit(IRQTF_RUNTHREAD,
433 &action->thread_flags)) {
434 __set_current_state(TASK_RUNNING);
435 return 0;
436 }
437 schedule();
438 }
439 return -1;
440}
441
442/*
443 * Interrupt handler thread
444 */
445static int irq_thread(void *data)
446{
447 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
448 struct irqaction *action = data;
449 struct irq_desc *desc = irq_to_desc(action->irq);
450 int wake;
451
452 sched_setscheduler(current, SCHED_FIFO, &param);
453 current->irqaction = action;
454
455 while (!irq_wait_for_interrupt(action)) {
456
457 atomic_inc(&desc->threads_active);
458
459 spin_lock_irq(&desc->lock);
460 if (unlikely(desc->status & IRQ_DISABLED)) {
461 /*
462 * CHECKME: We might need a dedicated
463 * IRQ_THREAD_PENDING flag here, which
464 * retriggers the thread in check_irq_resend()
465 * but AFAICT IRQ_PENDING should be fine as it
466 * retriggers the interrupt itself --- tglx
467 */
468 desc->status |= IRQ_PENDING;
469 spin_unlock_irq(&desc->lock);
470 } else {
471 spin_unlock_irq(&desc->lock);
472
473 action->thread_fn(action->irq, action->dev_id);
474 }
475
476 wake = atomic_dec_and_test(&desc->threads_active);
477
478 if (wake && waitqueue_active(&desc->wait_for_threads))
479 wake_up(&desc->wait_for_threads);
480 }
481
482 /*
483 * Clear irqaction. Otherwise exit_irq_thread() would make
484 * fuzz about an active irq thread going into nirvana.
485 */
486 current->irqaction = NULL;
487 return 0;
488}
489
490/*
491 * Called from do_exit()
492 */
493void exit_irq_thread(void)
494{
495 struct task_struct *tsk = current;
496
497 if (!tsk->irqaction)
498 return;
499
500 printk(KERN_ERR
501 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
502 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
503
504 /*
505 * Set the THREAD DIED flag to prevent further wakeups of the
506 * soon to be gone threaded handler.
507 */
508 set_bit(IRQTF_DIED, &tsk->irqaction->flags);
509}
510
395/* 511/*
396 * Internal function to register an irqaction - typically used to 512 * Internal function to register an irqaction - typically used to
397 * allocate special interrupts that are part of the architecture. 513 * allocate special interrupts that are part of the architecture.
398 */ 514 */
399static int 515static int
400__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) 516__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
401{ 517{
402 struct irqaction *old, **p; 518 struct irqaction *old, **old_ptr;
403 const char *old_name = NULL; 519 const char *old_name = NULL;
404 unsigned long flags; 520 unsigned long flags;
405 int shared = 0; 521 int shared = 0;
@@ -428,11 +544,31 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
428 } 544 }
429 545
430 /* 546 /*
547 * Threaded handler ?
548 */
549 if (new->thread_fn) {
550 struct task_struct *t;
551
552 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
553 new->name);
554 if (IS_ERR(t))
555 return PTR_ERR(t);
556 /*
557 * We keep the reference to the task struct even if
558 * the thread dies to avoid that the interrupt code
559 * references an already freed task_struct.
560 */
561 get_task_struct(t);
562 new->thread = t;
563 wake_up_process(t);
564 }
565
566 /*
431 * The following block of code has to be executed atomically 567 * The following block of code has to be executed atomically
432 */ 568 */
433 spin_lock_irqsave(&desc->lock, flags); 569 spin_lock_irqsave(&desc->lock, flags);
434 p = &desc->action; 570 old_ptr = &desc->action;
435 old = *p; 571 old = *old_ptr;
436 if (old) { 572 if (old) {
437 /* 573 /*
438 * Can't share interrupts unless both agree to and are 574 * Can't share interrupts unless both agree to and are
@@ -455,8 +591,8 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
455 591
456 /* add new interrupt at end of irq queue */ 592 /* add new interrupt at end of irq queue */
457 do { 593 do {
458 p = &old->next; 594 old_ptr = &old->next;
459 old = *p; 595 old = *old_ptr;
460 } while (old); 596 } while (old);
461 shared = 1; 597 shared = 1;
462 } 598 }
@@ -464,15 +600,15 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
464 if (!shared) { 600 if (!shared) {
465 irq_chip_set_defaults(desc->chip); 601 irq_chip_set_defaults(desc->chip);
466 602
603 init_waitqueue_head(&desc->wait_for_threads);
604
467 /* Setup the type (level, edge polarity) if configured: */ 605 /* Setup the type (level, edge polarity) if configured: */
468 if (new->flags & IRQF_TRIGGER_MASK) { 606 if (new->flags & IRQF_TRIGGER_MASK) {
469 ret = __irq_set_trigger(desc, irq, 607 ret = __irq_set_trigger(desc, irq,
470 new->flags & IRQF_TRIGGER_MASK); 608 new->flags & IRQF_TRIGGER_MASK);
471 609
472 if (ret) { 610 if (ret)
473 spin_unlock_irqrestore(&desc->lock, flags); 611 goto out_thread;
474 return ret;
475 }
476 } else 612 } else
477 compat_irq_chip_set_default_handler(desc); 613 compat_irq_chip_set_default_handler(desc);
478#if defined(CONFIG_IRQ_PER_CPU) 614#if defined(CONFIG_IRQ_PER_CPU)
@@ -496,7 +632,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
496 desc->status |= IRQ_NO_BALANCING; 632 desc->status |= IRQ_NO_BALANCING;
497 633
498 /* Set default affinity mask once everything is setup */ 634 /* Set default affinity mask once everything is setup */
499 do_irq_select_affinity(irq, desc); 635 setup_affinity(irq, desc);
500 636
501 } else if ((new->flags & IRQF_TRIGGER_MASK) 637 } else if ((new->flags & IRQF_TRIGGER_MASK)
502 && (new->flags & IRQF_TRIGGER_MASK) 638 && (new->flags & IRQF_TRIGGER_MASK)
@@ -507,7 +643,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
507 (int)(new->flags & IRQF_TRIGGER_MASK)); 643 (int)(new->flags & IRQF_TRIGGER_MASK));
508 } 644 }
509 645
510 *p = new; 646 *old_ptr = new;
511 647
512 /* Reset broken irq detection when installing new handler */ 648 /* Reset broken irq detection when installing new handler */
513 desc->irq_count = 0; 649 desc->irq_count = 0;
@@ -519,7 +655,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
519 */ 655 */
520 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { 656 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {
521 desc->status &= ~IRQ_SPURIOUS_DISABLED; 657 desc->status &= ~IRQ_SPURIOUS_DISABLED;
522 __enable_irq(desc, irq); 658 __enable_irq(desc, irq, false);
523 } 659 }
524 660
525 spin_unlock_irqrestore(&desc->lock, flags); 661 spin_unlock_irqrestore(&desc->lock, flags);
@@ -540,8 +676,19 @@ mismatch:
540 dump_stack(); 676 dump_stack();
541 } 677 }
542#endif 678#endif
679 ret = -EBUSY;
680
681out_thread:
543 spin_unlock_irqrestore(&desc->lock, flags); 682 spin_unlock_irqrestore(&desc->lock, flags);
544 return -EBUSY; 683 if (new->thread) {
684 struct task_struct *t = new->thread;
685
686 new->thread = NULL;
687 if (likely(!test_bit(IRQTF_DIED, &new->thread_flags)))
688 kthread_stop(t);
689 put_task_struct(t);
690 }
691 return ret;
545} 692}
546 693
547/** 694/**
@@ -557,97 +704,138 @@ int setup_irq(unsigned int irq, struct irqaction *act)
557 704
558 return __setup_irq(irq, desc, act); 705 return __setup_irq(irq, desc, act);
559} 706}
707EXPORT_SYMBOL_GPL(setup_irq);
560 708
561/** 709 /*
562 * free_irq - free an interrupt 710 * Internal function to unregister an irqaction - used to free
563 * @irq: Interrupt line to free 711 * regular and special interrupts that are part of the architecture.
564 * @dev_id: Device identity to free
565 *
566 * Remove an interrupt handler. The handler is removed and if the
567 * interrupt line is no longer in use by any driver it is disabled.
568 * On a shared IRQ the caller must ensure the interrupt is disabled
569 * on the card it drives before calling this function. The function
570 * does not return until any executing interrupts for this IRQ
571 * have completed.
572 *
573 * This function must not be called from interrupt context.
574 */ 712 */
575void free_irq(unsigned int irq, void *dev_id) 713static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
576{ 714{
577 struct irq_desc *desc = irq_to_desc(irq); 715 struct irq_desc *desc = irq_to_desc(irq);
578 struct irqaction **p; 716 struct irqaction *action, **action_ptr;
717 struct task_struct *irqthread;
579 unsigned long flags; 718 unsigned long flags;
580 719
581 WARN_ON(in_interrupt()); 720 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
582 721
583 if (!desc) 722 if (!desc)
584 return; 723 return NULL;
585 724
586 spin_lock_irqsave(&desc->lock, flags); 725 spin_lock_irqsave(&desc->lock, flags);
587 p = &desc->action; 726
727 /*
728 * There can be multiple actions per IRQ descriptor, find the right
729 * one based on the dev_id:
730 */
731 action_ptr = &desc->action;
588 for (;;) { 732 for (;;) {
589 struct irqaction *action = *p; 733 action = *action_ptr;
590 734
591 if (action) { 735 if (!action) {
592 struct irqaction **pp = p; 736 WARN(1, "Trying to free already-free IRQ %d\n", irq);
737 spin_unlock_irqrestore(&desc->lock, flags);
593 738
594 p = &action->next; 739 return NULL;
595 if (action->dev_id != dev_id) 740 }
596 continue; 741
742 if (action->dev_id == dev_id)
743 break;
744 action_ptr = &action->next;
745 }
597 746
598 /* Found it - now remove it from the list of entries */ 747 /* Found it - now remove it from the list of entries: */
599 *pp = action->next; 748 *action_ptr = action->next;
600 749
601 /* Currently used only by UML, might disappear one day.*/ 750 /* Currently used only by UML, might disappear one day: */
602#ifdef CONFIG_IRQ_RELEASE_METHOD 751#ifdef CONFIG_IRQ_RELEASE_METHOD
603 if (desc->chip->release) 752 if (desc->chip->release)
604 desc->chip->release(irq, dev_id); 753 desc->chip->release(irq, dev_id);
605#endif 754#endif
606 755
607 if (!desc->action) { 756 /* If this was the last handler, shut down the IRQ line: */
608 desc->status |= IRQ_DISABLED; 757 if (!desc->action) {
609 if (desc->chip->shutdown) 758 desc->status |= IRQ_DISABLED;
610 desc->chip->shutdown(irq); 759 if (desc->chip->shutdown)
611 else 760 desc->chip->shutdown(irq);
612 desc->chip->disable(irq); 761 else
613 } 762 desc->chip->disable(irq);
614 spin_unlock_irqrestore(&desc->lock, flags); 763 }
615 unregister_handler_proc(irq, action); 764
765 irqthread = action->thread;
766 action->thread = NULL;
767
768 spin_unlock_irqrestore(&desc->lock, flags);
769
770 unregister_handler_proc(irq, action);
771
772 /* Make sure it's not being used on another CPU: */
773 synchronize_irq(irq);
774
775 if (irqthread) {
776 if (!test_bit(IRQTF_DIED, &action->thread_flags))
777 kthread_stop(irqthread);
778 put_task_struct(irqthread);
779 }
616 780
617 /* Make sure it's not being used on another CPU */
618 synchronize_irq(irq);
619#ifdef CONFIG_DEBUG_SHIRQ
620 /*
621 * It's a shared IRQ -- the driver ought to be
622 * prepared for it to happen even now it's
623 * being freed, so let's make sure.... We do
624 * this after actually deregistering it, to
625 * make sure that a 'real' IRQ doesn't run in
626 * parallel with our fake
627 */
628 if (action->flags & IRQF_SHARED) {
629 local_irq_save(flags);
630 action->handler(irq, dev_id);
631 local_irq_restore(flags);
632 }
633#endif
634 kfree(action);
635 return;
636 }
637 printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
638#ifdef CONFIG_DEBUG_SHIRQ 781#ifdef CONFIG_DEBUG_SHIRQ
639 dump_stack(); 782 /*
640#endif 783 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
641 spin_unlock_irqrestore(&desc->lock, flags); 784 * event to happen even now it's being freed, so let's make sure that
642 return; 785 * is so by doing an extra call to the handler ....
786 *
787 * ( We do this after actually deregistering it, to make sure that a
788 * 'real' IRQ doesn't run in parallel with our fake. )
789 */
790 if (action->flags & IRQF_SHARED) {
791 local_irq_save(flags);
792 action->handler(irq, dev_id);
793 local_irq_restore(flags);
643 } 794 }
795#endif
796 return action;
797}
798
799/**
800 * remove_irq - free an interrupt
801 * @irq: Interrupt line to free
802 * @act: irqaction for the interrupt
803 *
804 * Used to remove interrupts statically setup by the early boot process.
805 */
806void remove_irq(unsigned int irq, struct irqaction *act)
807{
808 __free_irq(irq, act->dev_id);
809}
810EXPORT_SYMBOL_GPL(remove_irq);
811
812/**
813 * free_irq - free an interrupt allocated with request_irq
814 * @irq: Interrupt line to free
815 * @dev_id: Device identity to free
816 *
817 * Remove an interrupt handler. The handler is removed and if the
818 * interrupt line is no longer in use by any driver it is disabled.
819 * On a shared IRQ the caller must ensure the interrupt is disabled
820 * on the card it drives before calling this function. The function
821 * does not return until any executing interrupts for this IRQ
822 * have completed.
823 *
824 * This function must not be called from interrupt context.
825 */
826void free_irq(unsigned int irq, void *dev_id)
827{
828 kfree(__free_irq(irq, dev_id));
644} 829}
645EXPORT_SYMBOL(free_irq); 830EXPORT_SYMBOL(free_irq);
646 831
647/** 832/**
648 * request_irq - allocate an interrupt line 833 * request_threaded_irq - allocate an interrupt line
649 * @irq: Interrupt line to allocate 834 * @irq: Interrupt line to allocate
650 * @handler: Function to be called when the IRQ occurs 835 * @handler: Function to be called when the IRQ occurs.
836 * Primary handler for threaded interrupts
837 * @thread_fn: Function called from the irq handler thread
838 * If NULL, no irq thread is created
651 * @irqflags: Interrupt type flags 839 * @irqflags: Interrupt type flags
652 * @devname: An ascii name for the claiming device 840 * @devname: An ascii name for the claiming device
653 * @dev_id: A cookie passed back to the handler function 841 * @dev_id: A cookie passed back to the handler function
@@ -659,6 +847,15 @@ EXPORT_SYMBOL(free_irq);
659 * raises, you must take care both to initialise your hardware 847 * raises, you must take care both to initialise your hardware
660 * and to set up the interrupt handler in the right order. 848 * and to set up the interrupt handler in the right order.
661 * 849 *
850 * If you want to set up a threaded irq handler for your device
851 * then you need to supply @handler and @thread_fn. @handler is
852 * still called in hard interrupt context and has to check
853 * whether the interrupt originates from the device. If yes it
854 * needs to disable the interrupt on the device and return
855 * IRQ_WAKE_THREAD which will wake up the handler thread and run
856 * @thread_fn. This split handler design is necessary to support
857 * shared interrupts.
858 *
662 * Dev_id must be globally unique. Normally the address of the 859 * Dev_id must be globally unique. Normally the address of the
663 * device data structure is used as the cookie. Since the handler 860 * device data structure is used as the cookie. Since the handler
664 * receives this value it makes sense to use it. 861 * receives this value it makes sense to use it.
@@ -674,8 +871,9 @@ EXPORT_SYMBOL(free_irq);
674 * IRQF_TRIGGER_* Specify active edge(s) or level 871 * IRQF_TRIGGER_* Specify active edge(s) or level
675 * 872 *
676 */ 873 */
677int request_irq(unsigned int irq, irq_handler_t handler, 874int request_threaded_irq(unsigned int irq, irq_handler_t handler,
678 unsigned long irqflags, const char *devname, void *dev_id) 875 irq_handler_t thread_fn, unsigned long irqflags,
876 const char *devname, void *dev_id)
679{ 877{
680 struct irqaction *action; 878 struct irqaction *action;
681 struct irq_desc *desc; 879 struct irq_desc *desc;
@@ -687,11 +885,12 @@ int request_irq(unsigned int irq, irq_handler_t handler,
687 * the behavior is classified as "will not fix" so we need to 885 * the behavior is classified as "will not fix" so we need to
688 * start nudging drivers away from using that idiom. 886 * start nudging drivers away from using that idiom.
689 */ 887 */
690 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) 888 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) ==
691 == (IRQF_SHARED|IRQF_DISABLED)) 889 (IRQF_SHARED|IRQF_DISABLED)) {
692 pr_warning("IRQ %d/%s: IRQF_DISABLED is not " 890 pr_warning(
693 "guaranteed on shared IRQs\n", 891 "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n",
694 irq, devname); 892 irq, devname);
893 }
695 894
696#ifdef CONFIG_LOCKDEP 895#ifdef CONFIG_LOCKDEP
697 /* 896 /*
@@ -717,15 +916,14 @@ int request_irq(unsigned int irq, irq_handler_t handler,
717 if (!handler) 916 if (!handler)
718 return -EINVAL; 917 return -EINVAL;
719 918
720 action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC); 919 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
721 if (!action) 920 if (!action)
722 return -ENOMEM; 921 return -ENOMEM;
723 922
724 action->handler = handler; 923 action->handler = handler;
924 action->thread_fn = thread_fn;
725 action->flags = irqflags; 925 action->flags = irqflags;
726 cpus_clear(action->mask);
727 action->name = devname; 926 action->name = devname;
728 action->next = NULL;
729 action->dev_id = dev_id; 927 action->dev_id = dev_id;
730 928
731 retval = __setup_irq(irq, desc, action); 929 retval = __setup_irq(irq, desc, action);
@@ -753,4 +951,4 @@ int request_irq(unsigned int irq, irq_handler_t handler,
753#endif 951#endif
754 return retval; 952 return retval;
755} 953}
756EXPORT_SYMBOL(request_irq); 954EXPORT_SYMBOL(request_threaded_irq);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index bd72329e630c..e05ad9be43b7 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -18,7 +18,7 @@ void move_masked_irq(int irq)
18 18
19 desc->status &= ~IRQ_MOVE_PENDING; 19 desc->status &= ~IRQ_MOVE_PENDING;
20 20
21 if (unlikely(cpumask_empty(&desc->pending_mask))) 21 if (unlikely(cpumask_empty(desc->pending_mask)))
22 return; 22 return;
23 23
24 if (!desc->chip->set_affinity) 24 if (!desc->chip->set_affinity)
@@ -38,13 +38,13 @@ void move_masked_irq(int irq)
38 * For correct operation this depends on the caller 38 * For correct operation this depends on the caller
39 * masking the irqs. 39 * masking the irqs.
40 */ 40 */
41 if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask) 41 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
42 < nr_cpu_ids)) { 42 < nr_cpu_ids)) {
43 cpumask_and(&desc->affinity, 43 cpumask_and(desc->affinity,
44 &desc->pending_mask, cpu_online_mask); 44 desc->pending_mask, cpu_online_mask);
45 desc->chip->set_affinity(irq, &desc->affinity); 45 desc->chip->set_affinity(irq, desc->affinity);
46 } 46 }
47 cpumask_clear(&desc->pending_mask); 47 cpumask_clear(desc->pending_mask);
48} 48}
49 49
50void move_native_irq(int irq) 50void move_native_irq(int irq)
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index ecf765c6a77a..243d6121e50e 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -17,16 +17,11 @@ static void init_copy_kstat_irqs(struct irq_desc *old_desc,
17 struct irq_desc *desc, 17 struct irq_desc *desc,
18 int cpu, int nr) 18 int cpu, int nr)
19{ 19{
20 unsigned long bytes;
21
22 init_kstat_irqs(desc, cpu, nr); 20 init_kstat_irqs(desc, cpu, nr);
23 21
24 if (desc->kstat_irqs != old_desc->kstat_irqs) { 22 if (desc->kstat_irqs != old_desc->kstat_irqs)
25 /* Compute how many bytes we need per irq and allocate them */ 23 memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
26 bytes = nr * sizeof(unsigned int); 24 nr * sizeof(*desc->kstat_irqs));
27
28 memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
29 }
30} 25}
31 26
32static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) 27static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
@@ -38,15 +33,22 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
38 old_desc->kstat_irqs = NULL; 33 old_desc->kstat_irqs = NULL;
39} 34}
40 35
41static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, 36static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
42 struct irq_desc *desc, int cpu) 37 struct irq_desc *desc, int cpu)
43{ 38{
44 memcpy(desc, old_desc, sizeof(struct irq_desc)); 39 memcpy(desc, old_desc, sizeof(struct irq_desc));
40 if (!init_alloc_desc_masks(desc, cpu, false)) {
41 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
42 "for migration.\n", irq);
43 return false;
44 }
45 spin_lock_init(&desc->lock); 45 spin_lock_init(&desc->lock);
46 desc->cpu = cpu; 46 desc->cpu = cpu;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 47 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); 48 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
49 init_copy_desc_masks(old_desc, desc);
49 arch_init_copy_chip_data(old_desc, desc, cpu); 50 arch_init_copy_chip_data(old_desc, desc, cpu);
51 return true;
50} 52}
51 53
52static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) 54static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
@@ -71,23 +73,34 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
71 desc = irq_desc_ptrs[irq]; 73 desc = irq_desc_ptrs[irq];
72 74
73 if (desc && old_desc != desc) 75 if (desc && old_desc != desc)
74 goto out_unlock; 76 goto out_unlock;
75 77
76 node = cpu_to_node(cpu); 78 node = cpu_to_node(cpu);
77 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 79 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
78 if (!desc) { 80 if (!desc) {
79 printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq); 81 printk(KERN_ERR "irq %d: can not get new irq_desc "
82 "for migration.\n", irq);
83 /* still use old one */
84 desc = old_desc;
85 goto out_unlock;
86 }
87 if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {
80 /* still use old one */ 88 /* still use old one */
89 kfree(desc);
81 desc = old_desc; 90 desc = old_desc;
82 goto out_unlock; 91 goto out_unlock;
83 } 92 }
84 init_copy_one_irq_desc(irq, old_desc, desc, cpu);
85 93
86 irq_desc_ptrs[irq] = desc; 94 irq_desc_ptrs[irq] = desc;
95 spin_unlock_irqrestore(&sparse_irq_lock, flags);
87 96
88 /* free the old one */ 97 /* free the old one */
89 free_one_irq_desc(old_desc, desc); 98 free_one_irq_desc(old_desc, desc);
99 spin_unlock(&old_desc->lock);
90 kfree(old_desc); 100 kfree(old_desc);
101 spin_lock(&desc->lock);
102
103 return desc;
91 104
92out_unlock: 105out_unlock:
93 spin_unlock_irqrestore(&sparse_irq_lock, flags); 106 spin_unlock_irqrestore(&sparse_irq_lock, flags);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
new file mode 100644
index 000000000000..638d8bedec14
--- /dev/null
+++ b/kernel/irq/pm.c
@@ -0,0 +1,79 @@
1/*
2 * linux/kernel/irq/pm.c
3 *
4 * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
5 *
6 * This file contains power management functions related to interrupts.
7 */
8
9#include <linux/irq.h>
10#include <linux/module.h>
11#include <linux/interrupt.h>
12
13#include "internals.h"
14
15/**
16 * suspend_device_irqs - disable all currently enabled interrupt lines
17 *
18 * During system-wide suspend or hibernation device interrupts need to be
19 * disabled at the chip level and this function is provided for this purpose.
20 * It disables all interrupt lines that are enabled at the moment and sets the
21 * IRQ_SUSPENDED flag for them.
22 */
23void suspend_device_irqs(void)
24{
25 struct irq_desc *desc;
26 int irq;
27
28 for_each_irq_desc(irq, desc) {
29 unsigned long flags;
30
31 spin_lock_irqsave(&desc->lock, flags);
32 __disable_irq(desc, irq, true);
33 spin_unlock_irqrestore(&desc->lock, flags);
34 }
35
36 for_each_irq_desc(irq, desc)
37 if (desc->status & IRQ_SUSPENDED)
38 synchronize_irq(irq);
39}
40EXPORT_SYMBOL_GPL(suspend_device_irqs);
41
42/**
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQ_SUSPENDED flag set.
47 */
48void resume_device_irqs(void)
49{
50 struct irq_desc *desc;
51 int irq;
52
53 for_each_irq_desc(irq, desc) {
54 unsigned long flags;
55
56 if (!(desc->status & IRQ_SUSPENDED))
57 continue;
58
59 spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true);
61 spin_unlock_irqrestore(&desc->lock, flags);
62 }
63}
64EXPORT_SYMBOL_GPL(resume_device_irqs);
65
66/**
67 * check_wakeup_irqs - check if any wake-up interrupts are pending
68 */
69int check_wakeup_irqs(void)
70{
71 struct irq_desc *desc;
72 int irq;
73
74 for_each_irq_desc(irq, desc)
75 if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING))
76 return -EBUSY;
77
78 return 0;
79}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index aae3f742bcec..692363dd591f 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -20,11 +20,11 @@ static struct proc_dir_entry *root_irq_dir;
20static int irq_affinity_proc_show(struct seq_file *m, void *v) 20static int irq_affinity_proc_show(struct seq_file *m, void *v)
21{ 21{
22 struct irq_desc *desc = irq_to_desc((long)m->private); 22 struct irq_desc *desc = irq_to_desc((long)m->private);
23 const struct cpumask *mask = &desc->affinity; 23 const struct cpumask *mask = desc->affinity;
24 24
25#ifdef CONFIG_GENERIC_PENDING_IRQ 25#ifdef CONFIG_GENERIC_PENDING_IRQ
26 if (desc->status & IRQ_MOVE_PENDING) 26 if (desc->status & IRQ_MOVE_PENDING)
27 mask = &desc->pending_mask; 27 mask = desc->pending_mask;
28#endif 28#endif
29 seq_cpumask(m, mask); 29 seq_cpumask(m, mask);
30 seq_putc(m, '\n'); 30 seq_putc(m, '\n');
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dd364c11e56e..4d568294de3e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -104,7 +104,7 @@ static int misrouted_irq(int irq)
104 return ok; 104 return ok;
105} 105}
106 106
107static void poll_spurious_irqs(unsigned long dummy) 107static void poll_all_shared_irqs(void)
108{ 108{
109 struct irq_desc *desc; 109 struct irq_desc *desc;
110 int i; 110 int i;
@@ -123,11 +123,23 @@ static void poll_spurious_irqs(unsigned long dummy)
123 123
124 try_one_irq(i, desc); 124 try_one_irq(i, desc);
125 } 125 }
126}
127
128static void poll_spurious_irqs(unsigned long dummy)
129{
130 poll_all_shared_irqs();
126 131
127 mod_timer(&poll_spurious_irq_timer, 132 mod_timer(&poll_spurious_irq_timer,
128 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 133 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
129} 134}
130 135
136#ifdef CONFIG_DEBUG_SHIRQ
137void debug_poll_all_shared_irqs(void)
138{
139 poll_all_shared_irqs();
140}
141#endif
142
131/* 143/*
132 * If 99,900 of the previous 100,000 interrupts have not been handled 144 * If 99,900 of the previous 100,000 interrupts have not been handled
133 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 145 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
diff --git a/kernel/itimer.c b/kernel/itimer.c
index db7c358b9a02..58762f7077ec 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -62,7 +62,7 @@ int do_getitimer(int which, struct itimerval *value)
62 struct task_cputime cputime; 62 struct task_cputime cputime;
63 cputime_t utime; 63 cputime_t utime;
64 64
65 thread_group_cputime(tsk, &cputime); 65 thread_group_cputimer(tsk, &cputime);
66 utime = cputime.utime; 66 utime = cputime.utime;
67 if (cputime_le(cval, utime)) { /* about to fire */ 67 if (cputime_le(cval, utime)) { /* about to fire */
68 cval = jiffies_to_cputime(1); 68 cval = jiffies_to_cputime(1);
@@ -82,7 +82,7 @@ int do_getitimer(int which, struct itimerval *value)
82 struct task_cputime times; 82 struct task_cputime times;
83 cputime_t ptime; 83 cputime_t ptime;
84 84
85 thread_group_cputime(tsk, &times); 85 thread_group_cputimer(tsk, &times);
86 ptime = cputime_add(times.utime, times.stime); 86 ptime = cputime_add(times.utime, times.stime);
87 if (cputime_le(cval, ptime)) { /* about to fire */ 87 if (cputime_le(cval, ptime)) { /* about to fire */
88 cval = jiffies_to_cputime(1); 88 cval = jiffies_to_cputime(1);
@@ -100,7 +100,7 @@ int do_getitimer(int which, struct itimerval *value)
100 return 0; 100 return 0;
101} 101}
102 102
103asmlinkage long sys_getitimer(int which, struct itimerval __user *value) 103SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value)
104{ 104{
105 int error = -EFAULT; 105 int error = -EFAULT;
106 struct itimerval get_buffer; 106 struct itimerval get_buffer;
@@ -260,9 +260,8 @@ unsigned int alarm_setitimer(unsigned int seconds)
260 return it_old.it_value.tv_sec; 260 return it_old.it_value.tv_sec;
261} 261}
262 262
263asmlinkage long sys_setitimer(int which, 263SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
264 struct itimerval __user *value, 264 struct itimerval __user *, ovalue)
265 struct itimerval __user *ovalue)
266{ 265{
267 struct itimerval set_buffer, get_buffer; 266 struct itimerval set_buffer, get_buffer;
268 int error; 267 int error;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index e694afa0eb8c..374faf9bfdc7 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,19 +30,20 @@
30#define all_var 0 30#define all_var 0
31#endif 31#endif
32 32
33extern const unsigned long kallsyms_addresses[]; 33/* These will be re-linked against their real values during the second link stage */
34extern const u8 kallsyms_names[]; 34extern const unsigned long kallsyms_addresses[] __attribute__((weak));
35extern const u8 kallsyms_names[] __attribute__((weak));
35 36
36/* tell the compiler that the count isn't in the small data section if the arch 37/* tell the compiler that the count isn't in the small data section if the arch
37 * has one (eg: FRV) 38 * has one (eg: FRV)
38 */ 39 */
39extern const unsigned long kallsyms_num_syms 40extern const unsigned long kallsyms_num_syms
40 __attribute__((__section__(".rodata"))); 41__attribute__((weak, section(".rodata")));
41 42
42extern const u8 kallsyms_token_table[]; 43extern const u8 kallsyms_token_table[] __attribute__((weak));
43extern const u16 kallsyms_token_index[]; 44extern const u16 kallsyms_token_index[] __attribute__((weak));
44 45
45extern const unsigned long kallsyms_markers[]; 46extern const unsigned long kallsyms_markers[] __attribute__((weak));
46 47
47static inline int is_kernel_inittext(unsigned long addr) 48static inline int is_kernel_inittext(unsigned long addr)
48{ 49{
@@ -160,6 +161,25 @@ unsigned long kallsyms_lookup_name(const char *name)
160 return module_kallsyms_lookup_name(name); 161 return module_kallsyms_lookup_name(name);
161} 162}
162 163
164int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
165 unsigned long),
166 void *data)
167{
168 char namebuf[KSYM_NAME_LEN];
169 unsigned long i;
170 unsigned int off;
171 int ret;
172
173 for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
174 off = kallsyms_expand_symbol(off, namebuf);
175 ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
176 if (ret != 0)
177 return ret;
178 }
179 return module_kallsyms_on_each_symbol(fn, data);
180}
181EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol);
182
163static unsigned long get_symbol_pos(unsigned long addr, 183static unsigned long get_symbol_pos(unsigned long addr,
164 unsigned long *symbolsize, 184 unsigned long *symbolsize,
165 unsigned long *offset) 185 unsigned long *offset)
@@ -167,6 +187,9 @@ static unsigned long get_symbol_pos(unsigned long addr,
167 unsigned long symbol_start = 0, symbol_end = 0; 187 unsigned long symbol_start = 0, symbol_end = 0;
168 unsigned long i, low, high, mid; 188 unsigned long i, low, high, mid;
169 189
190 /* This kernel should never have been booted. */
191 BUG_ON(!kallsyms_addresses);
192
170 /* do a binary search on the sorted kallsyms_addresses array */ 193 /* do a binary search on the sorted kallsyms_addresses array */
171 low = 0; 194 low = 0;
172 high = kallsyms_num_syms; 195 high = kallsyms_num_syms;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 3fb855ad6aa0..5a758c6e4950 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -42,7 +42,7 @@
42note_buf_t* crash_notes; 42note_buf_t* crash_notes;
43 43
44/* vmcoreinfo stuff */ 44/* vmcoreinfo stuff */
45unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 45static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
46u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; 46u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
47size_t vmcoreinfo_size; 47size_t vmcoreinfo_size;
48size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); 48size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
@@ -934,9 +934,8 @@ struct kimage *kexec_crash_image;
934 934
935static DEFINE_MUTEX(kexec_mutex); 935static DEFINE_MUTEX(kexec_mutex);
936 936
937asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 937SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
938 struct kexec_segment __user *segments, 938 struct kexec_segment __user *, segments, unsigned long, flags)
939 unsigned long flags)
940{ 939{
941 struct kimage **dest_image, *image; 940 struct kimage **dest_image, *image;
942 int result; 941 int result;
@@ -1131,7 +1130,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
1131 return; 1130 return;
1132 memset(&prstatus, 0, sizeof(prstatus)); 1131 memset(&prstatus, 0, sizeof(prstatus));
1133 prstatus.pr_pid = current->pid; 1132 prstatus.pr_pid = current->pid;
1134 elf_core_copy_regs(&prstatus.pr_reg, regs); 1133 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
1135 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, 1134 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1136 &prstatus, sizeof(prstatus)); 1135 &prstatus, sizeof(prstatus));
1137 final_note(buf); 1136 final_note(buf);
@@ -1410,6 +1409,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1410 VMCOREINFO_OFFSET(list_head, prev); 1409 VMCOREINFO_OFFSET(list_head, prev);
1411 VMCOREINFO_OFFSET(vm_struct, addr); 1410 VMCOREINFO_OFFSET(vm_struct, addr);
1412 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1411 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1412 log_buf_kexec_setup();
1413 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1413 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1414 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1414 VMCOREINFO_NUMBER(NR_FREE_PAGES);
1415 VMCOREINFO_NUMBER(PG_lru); 1415 VMCOREINFO_NUMBER(PG_lru);
@@ -1451,11 +1451,7 @@ int kernel_kexec(void)
1451 error = device_suspend(PMSG_FREEZE); 1451 error = device_suspend(PMSG_FREEZE);
1452 if (error) 1452 if (error)
1453 goto Resume_console; 1453 goto Resume_console;
1454 error = disable_nonboot_cpus();
1455 if (error)
1456 goto Resume_devices;
1457 device_pm_lock(); 1454 device_pm_lock();
1458 local_irq_disable();
1459 /* At this point, device_suspend() has been called, 1455 /* At this point, device_suspend() has been called,
1460 * but *not* device_power_down(). We *must* 1456 * but *not* device_power_down(). We *must*
1461 * device_power_down() now. Otherwise, drivers for 1457 * device_power_down() now. Otherwise, drivers for
@@ -1465,6 +1461,14 @@ int kernel_kexec(void)
1465 */ 1461 */
1466 error = device_power_down(PMSG_FREEZE); 1462 error = device_power_down(PMSG_FREEZE);
1467 if (error) 1463 if (error)
1464 goto Resume_devices;
1465 error = disable_nonboot_cpus();
1466 if (error)
1467 goto Enable_cpus;
1468 local_irq_disable();
1469 /* Suspend system devices */
1470 error = sysdev_suspend(PMSG_FREEZE);
1471 if (error)
1468 goto Enable_irqs; 1472 goto Enable_irqs;
1469 } else 1473 } else
1470#endif 1474#endif
@@ -1478,12 +1482,14 @@ int kernel_kexec(void)
1478 1482
1479#ifdef CONFIG_KEXEC_JUMP 1483#ifdef CONFIG_KEXEC_JUMP
1480 if (kexec_image->preserve_context) { 1484 if (kexec_image->preserve_context) {
1481 device_power_up(PMSG_RESTORE); 1485 sysdev_resume();
1482 Enable_irqs: 1486 Enable_irqs:
1483 local_irq_enable(); 1487 local_irq_enable();
1484 device_pm_unlock(); 1488 Enable_cpus:
1485 enable_nonboot_cpus(); 1489 enable_nonboot_cpus();
1490 device_power_up(PMSG_RESTORE);
1486 Resume_devices: 1491 Resume_devices:
1492 device_pm_unlock();
1487 device_resume(PMSG_RESTORE); 1493 device_resume(PMSG_RESTORE);
1488 Resume_console: 1494 Resume_console:
1489 resume_console(); 1495 resume_console();
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a27a5f64443d..b750675251e5 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -50,7 +50,8 @@ static struct workqueue_struct *khelper_wq;
50char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; 50char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
51 51
52/** 52/**
53 * request_module - try to load a kernel module 53 * __request_module - try to load a kernel module
54 * @wait: wait (or not) for the operation to complete
54 * @fmt: printf style format string for the name of the module 55 * @fmt: printf style format string for the name of the module
55 * @...: arguments as specified in the format string 56 * @...: arguments as specified in the format string
56 * 57 *
@@ -63,7 +64,7 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
63 * If module auto-loading support is disabled then this function 64 * If module auto-loading support is disabled then this function
64 * becomes a no-operation. 65 * becomes a no-operation.
65 */ 66 */
66int request_module(const char *fmt, ...) 67int __request_module(bool wait, const char *fmt, ...)
67{ 68{
68 va_list args; 69 va_list args;
69 char module_name[MODULE_NAME_LEN]; 70 char module_name[MODULE_NAME_LEN];
@@ -108,11 +109,12 @@ int request_module(const char *fmt, ...)
108 return -ENOMEM; 109 return -ENOMEM;
109 } 110 }
110 111
111 ret = call_usermodehelper(modprobe_path, argv, envp, 1); 112 ret = call_usermodehelper(modprobe_path, argv, envp,
113 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
112 atomic_dec(&kmod_concurrent); 114 atomic_dec(&kmod_concurrent);
113 return ret; 115 return ret;
114} 116}
115EXPORT_SYMBOL(request_module); 117EXPORT_SYMBOL(__request_module);
116#endif /* CONFIG_MODULES */ 118#endif /* CONFIG_MODULES */
117 119
118struct subprocess_info { 120struct subprocess_info {
@@ -167,7 +169,7 @@ static int ____call_usermodehelper(void *data)
167 } 169 }
168 170
169 /* We can run anywhere, unlike our parent keventd(). */ 171 /* We can run anywhere, unlike our parent keventd(). */
170 set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); 172 set_cpus_allowed_ptr(current, cpu_all_mask);
171 173
172 /* 174 /*
173 * Our parent is keventd, which runs with elevated scheduling priority. 175 * Our parent is keventd, which runs with elevated scheduling priority.
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1b9cbdc0127a..a5e74ddee0e2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -43,6 +43,7 @@
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/kdebug.h> 45#include <linux/kdebug.h>
46#include <linux/memory.h>
46 47
47#include <asm-generic/sections.h> 48#include <asm-generic/sections.h>
48#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
@@ -67,7 +68,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
67static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 68static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
68 69
69/* NOTE: change this value only with kprobe_mutex held */ 70/* NOTE: change this value only with kprobe_mutex held */
70static bool kprobe_enabled; 71static bool kprobes_all_disarmed;
71 72
72static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 73static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 74static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
@@ -123,7 +124,7 @@ static int collect_garbage_slots(void);
123static int __kprobes check_safety(void) 124static int __kprobes check_safety(void)
124{ 125{
125 int ret = 0; 126 int ret = 0;
126#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM) 127#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER)
127 ret = freeze_processes(); 128 ret = freeze_processes();
128 if (ret == 0) { 129 if (ret == 0) {
129 struct task_struct *p, *q; 130 struct task_struct *p, *q;
@@ -327,7 +328,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
327 struct kprobe *kp; 328 struct kprobe *kp;
328 329
329 list_for_each_entry_rcu(kp, &p->list, list) { 330 list_for_each_entry_rcu(kp, &p->list, list) {
330 if (kp->pre_handler && !kprobe_gone(kp)) { 331 if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
331 set_kprobe_instance(kp); 332 set_kprobe_instance(kp);
332 if (kp->pre_handler(kp, regs)) 333 if (kp->pre_handler(kp, regs))
333 return 1; 334 return 1;
@@ -343,7 +344,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
343 struct kprobe *kp; 344 struct kprobe *kp;
344 345
345 list_for_each_entry_rcu(kp, &p->list, list) { 346 list_for_each_entry_rcu(kp, &p->list, list) {
346 if (kp->post_handler && !kprobe_gone(kp)) { 347 if (kp->post_handler && likely(!kprobe_disabled(kp))) {
347 set_kprobe_instance(kp); 348 set_kprobe_instance(kp);
348 kp->post_handler(kp, regs, flags); 349 kp->post_handler(kp, regs, flags);
349 reset_kprobe_instance(); 350 reset_kprobe_instance();
@@ -517,20 +518,28 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
517} 518}
518 519
519/* 520/*
520* Add the new probe to old_p->list. Fail if this is the 521* Add the new probe to ap->list. Fail if this is the
521* second jprobe at the address - two jprobes can't coexist 522* second jprobe at the address - two jprobes can't coexist
522*/ 523*/
523static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) 524static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
524{ 525{
526 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
525 if (p->break_handler) { 527 if (p->break_handler) {
526 if (old_p->break_handler) 528 if (ap->break_handler)
527 return -EEXIST; 529 return -EEXIST;
528 list_add_tail_rcu(&p->list, &old_p->list); 530 list_add_tail_rcu(&p->list, &ap->list);
529 old_p->break_handler = aggr_break_handler; 531 ap->break_handler = aggr_break_handler;
530 } else 532 } else
531 list_add_rcu(&p->list, &old_p->list); 533 list_add_rcu(&p->list, &ap->list);
532 if (p->post_handler && !old_p->post_handler) 534 if (p->post_handler && !ap->post_handler)
533 old_p->post_handler = aggr_post_handler; 535 ap->post_handler = aggr_post_handler;
536
537 if (kprobe_disabled(ap) && !kprobe_disabled(p)) {
538 ap->flags &= ~KPROBE_FLAG_DISABLED;
539 if (!kprobes_all_disarmed)
540 /* Arm the breakpoint again. */
541 arch_arm_kprobe(ap);
542 }
534 return 0; 543 return 0;
535} 544}
536 545
@@ -543,6 +552,7 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
543 copy_kprobe(p, ap); 552 copy_kprobe(p, ap);
544 flush_insn_slot(ap); 553 flush_insn_slot(ap);
545 ap->addr = p->addr; 554 ap->addr = p->addr;
555 ap->flags = p->flags;
546 ap->pre_handler = aggr_pre_handler; 556 ap->pre_handler = aggr_pre_handler;
547 ap->fault_handler = aggr_fault_handler; 557 ap->fault_handler = aggr_fault_handler;
548 /* We don't care the kprobe which has gone. */ 558 /* We don't care the kprobe which has gone. */
@@ -565,44 +575,59 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
565 struct kprobe *p) 575 struct kprobe *p)
566{ 576{
567 int ret = 0; 577 int ret = 0;
568 struct kprobe *ap; 578 struct kprobe *ap = old_p;
569 579
570 if (kprobe_gone(old_p)) { 580 if (old_p->pre_handler != aggr_pre_handler) {
581 /* If old_p is not an aggr_probe, create new aggr_kprobe. */
582 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
583 if (!ap)
584 return -ENOMEM;
585 add_aggr_kprobe(ap, old_p);
586 }
587
588 if (kprobe_gone(ap)) {
571 /* 589 /*
572 * Attempting to insert new probe at the same location that 590 * Attempting to insert new probe at the same location that
573 * had a probe in the module vaddr area which already 591 * had a probe in the module vaddr area which already
574 * freed. So, the instruction slot has already been 592 * freed. So, the instruction slot has already been
575 * released. We need a new slot for the new probe. 593 * released. We need a new slot for the new probe.
576 */ 594 */
577 ret = arch_prepare_kprobe(old_p); 595 ret = arch_prepare_kprobe(ap);
578 if (ret) 596 if (ret)
597 /*
598 * Even if fail to allocate new slot, don't need to
599 * free aggr_probe. It will be used next time, or
600 * freed by unregister_kprobe.
601 */
579 return ret; 602 return ret;
580 } 603
581 if (old_p->pre_handler == aggr_pre_handler) {
582 copy_kprobe(old_p, p);
583 ret = add_new_kprobe(old_p, p);
584 ap = old_p;
585 } else {
586 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
587 if (!ap) {
588 if (kprobe_gone(old_p))
589 arch_remove_kprobe(old_p);
590 return -ENOMEM;
591 }
592 add_aggr_kprobe(ap, old_p);
593 copy_kprobe(ap, p);
594 ret = add_new_kprobe(ap, p);
595 }
596 if (kprobe_gone(old_p)) {
597 /* 604 /*
598 * If the old_p has gone, its breakpoint has been disarmed. 605 * Clear gone flag to prevent allocating new slot again, and
599 * We have to arm it again after preparing real kprobes. 606 * set disabled flag because it is not armed yet.
600 */ 607 */
601 ap->flags &= ~KPROBE_FLAG_GONE; 608 ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
602 if (kprobe_enabled) 609 | KPROBE_FLAG_DISABLED;
603 arch_arm_kprobe(ap);
604 } 610 }
605 return ret; 611
612 copy_kprobe(ap, p);
613 return add_new_kprobe(ap, p);
614}
615
616/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
617static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
618{
619 struct kprobe *kp;
620
621 list_for_each_entry_rcu(kp, &p->list, list) {
622 if (!kprobe_disabled(kp))
623 /*
624 * There is an active probe on the list.
625 * We can't disable aggr_kprobe.
626 */
627 return 0;
628 }
629 p->flags |= KPROBE_FLAG_DISABLED;
630 return 1;
606} 631}
607 632
608static int __kprobes in_kprobes_functions(unsigned long addr) 633static int __kprobes in_kprobes_functions(unsigned long addr)
@@ -663,7 +688,9 @@ int __kprobes register_kprobe(struct kprobe *p)
663 return -EINVAL; 688 return -EINVAL;
664 } 689 }
665 690
666 p->flags = 0; 691 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
692 p->flags &= KPROBE_FLAG_DISABLED;
693
667 /* 694 /*
668 * Check if are we probing a module. 695 * Check if are we probing a module.
669 */ 696 */
@@ -699,17 +726,20 @@ int __kprobes register_kprobe(struct kprobe *p)
699 goto out; 726 goto out;
700 } 727 }
701 728
729 mutex_lock(&text_mutex);
702 ret = arch_prepare_kprobe(p); 730 ret = arch_prepare_kprobe(p);
703 if (ret) 731 if (ret)
704 goto out; 732 goto out_unlock_text;
705 733
706 INIT_HLIST_NODE(&p->hlist); 734 INIT_HLIST_NODE(&p->hlist);
707 hlist_add_head_rcu(&p->hlist, 735 hlist_add_head_rcu(&p->hlist,
708 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 736 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
709 737
710 if (kprobe_enabled) 738 if (!kprobes_all_disarmed && !kprobe_disabled(p))
711 arch_arm_kprobe(p); 739 arch_arm_kprobe(p);
712 740
741out_unlock_text:
742 mutex_unlock(&text_mutex);
713out: 743out:
714 mutex_unlock(&kprobe_mutex); 744 mutex_unlock(&kprobe_mutex);
715 745
@@ -718,26 +748,39 @@ out:
718 748
719 return ret; 749 return ret;
720} 750}
751EXPORT_SYMBOL_GPL(register_kprobe);
721 752
722/* 753/* Check passed kprobe is valid and return kprobe in kprobe_table. */
723 * Unregister a kprobe without a scheduler synchronization. 754static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
724 */
725static int __kprobes __unregister_kprobe_top(struct kprobe *p)
726{ 755{
727 struct kprobe *old_p, *list_p; 756 struct kprobe *old_p, *list_p;
728 757
729 old_p = get_kprobe(p->addr); 758 old_p = get_kprobe(p->addr);
730 if (unlikely(!old_p)) 759 if (unlikely(!old_p))
731 return -EINVAL; 760 return NULL;
732 761
733 if (p != old_p) { 762 if (p != old_p) {
734 list_for_each_entry_rcu(list_p, &old_p->list, list) 763 list_for_each_entry_rcu(list_p, &old_p->list, list)
735 if (list_p == p) 764 if (list_p == p)
736 /* kprobe p is a valid probe */ 765 /* kprobe p is a valid probe */
737 goto valid_p; 766 goto valid;
738 return -EINVAL; 767 return NULL;
739 } 768 }
740valid_p: 769valid:
770 return old_p;
771}
772
773/*
774 * Unregister a kprobe without a scheduler synchronization.
775 */
776static int __kprobes __unregister_kprobe_top(struct kprobe *p)
777{
778 struct kprobe *old_p, *list_p;
779
780 old_p = __get_valid_kprobe(p);
781 if (old_p == NULL)
782 return -EINVAL;
783
741 if (old_p == p || 784 if (old_p == p ||
742 (old_p->pre_handler == aggr_pre_handler && 785 (old_p->pre_handler == aggr_pre_handler &&
743 list_is_singular(&old_p->list))) { 786 list_is_singular(&old_p->list))) {
@@ -746,8 +789,11 @@ valid_p:
746 * enabled and not gone - otherwise, the breakpoint would 789 * enabled and not gone - otherwise, the breakpoint would
747 * already have been removed. We save on flushing icache. 790 * already have been removed. We save on flushing icache.
748 */ 791 */
749 if (kprobe_enabled && !kprobe_gone(old_p)) 792 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) {
793 mutex_lock(&text_mutex);
750 arch_disarm_kprobe(p); 794 arch_disarm_kprobe(p);
795 mutex_unlock(&text_mutex);
796 }
751 hlist_del_rcu(&old_p->hlist); 797 hlist_del_rcu(&old_p->hlist);
752 } else { 798 } else {
753 if (p->break_handler && !kprobe_gone(p)) 799 if (p->break_handler && !kprobe_gone(p))
@@ -761,6 +807,11 @@ valid_p:
761 } 807 }
762noclean: 808noclean:
763 list_del_rcu(&p->list); 809 list_del_rcu(&p->list);
810 if (!kprobe_disabled(old_p)) {
811 try_to_disable_aggr_kprobe(old_p);
812 if (!kprobes_all_disarmed && kprobe_disabled(old_p))
813 arch_disarm_kprobe(old_p);
814 }
764 } 815 }
765 return 0; 816 return 0;
766} 817}
@@ -796,11 +847,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num)
796 } 847 }
797 return ret; 848 return ret;
798} 849}
850EXPORT_SYMBOL_GPL(register_kprobes);
799 851
800void __kprobes unregister_kprobe(struct kprobe *p) 852void __kprobes unregister_kprobe(struct kprobe *p)
801{ 853{
802 unregister_kprobes(&p, 1); 854 unregister_kprobes(&p, 1);
803} 855}
856EXPORT_SYMBOL_GPL(unregister_kprobe);
804 857
805void __kprobes unregister_kprobes(struct kprobe **kps, int num) 858void __kprobes unregister_kprobes(struct kprobe **kps, int num)
806{ 859{
@@ -819,6 +872,7 @@ void __kprobes unregister_kprobes(struct kprobe **kps, int num)
819 if (kps[i]->addr) 872 if (kps[i]->addr)
820 __unregister_kprobe_bottom(kps[i]); 873 __unregister_kprobe_bottom(kps[i]);
821} 874}
875EXPORT_SYMBOL_GPL(unregister_kprobes);
822 876
823static struct notifier_block kprobe_exceptions_nb = { 877static struct notifier_block kprobe_exceptions_nb = {
824 .notifier_call = kprobe_exceptions_notify, 878 .notifier_call = kprobe_exceptions_notify,
@@ -858,16 +912,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
858 } 912 }
859 return ret; 913 return ret;
860} 914}
915EXPORT_SYMBOL_GPL(register_jprobes);
861 916
862int __kprobes register_jprobe(struct jprobe *jp) 917int __kprobes register_jprobe(struct jprobe *jp)
863{ 918{
864 return register_jprobes(&jp, 1); 919 return register_jprobes(&jp, 1);
865} 920}
921EXPORT_SYMBOL_GPL(register_jprobe);
866 922
867void __kprobes unregister_jprobe(struct jprobe *jp) 923void __kprobes unregister_jprobe(struct jprobe *jp)
868{ 924{
869 unregister_jprobes(&jp, 1); 925 unregister_jprobes(&jp, 1);
870} 926}
927EXPORT_SYMBOL_GPL(unregister_jprobe);
871 928
872void __kprobes unregister_jprobes(struct jprobe **jps, int num) 929void __kprobes unregister_jprobes(struct jprobe **jps, int num)
873{ 930{
@@ -887,6 +944,7 @@ void __kprobes unregister_jprobes(struct jprobe **jps, int num)
887 __unregister_kprobe_bottom(&jps[i]->kp); 944 __unregister_kprobe_bottom(&jps[i]->kp);
888 } 945 }
889} 946}
947EXPORT_SYMBOL_GPL(unregister_jprobes);
890 948
891#ifdef CONFIG_KRETPROBES 949#ifdef CONFIG_KRETPROBES
892/* 950/*
@@ -912,10 +970,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
912 ri->rp = rp; 970 ri->rp = rp;
913 ri->task = current; 971 ri->task = current;
914 972
915 if (rp->entry_handler && rp->entry_handler(ri, regs)) { 973 if (rp->entry_handler && rp->entry_handler(ri, regs))
916 spin_unlock_irqrestore(&rp->lock, flags);
917 return 0; 974 return 0;
918 }
919 975
920 arch_prepare_kretprobe(ri, regs); 976 arch_prepare_kretprobe(ri, regs);
921 977
@@ -982,6 +1038,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
982 free_rp_inst(rp); 1038 free_rp_inst(rp);
983 return ret; 1039 return ret;
984} 1040}
1041EXPORT_SYMBOL_GPL(register_kretprobe);
985 1042
986int __kprobes register_kretprobes(struct kretprobe **rps, int num) 1043int __kprobes register_kretprobes(struct kretprobe **rps, int num)
987{ 1044{
@@ -999,11 +1056,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num)
999 } 1056 }
1000 return ret; 1057 return ret;
1001} 1058}
1059EXPORT_SYMBOL_GPL(register_kretprobes);
1002 1060
1003void __kprobes unregister_kretprobe(struct kretprobe *rp) 1061void __kprobes unregister_kretprobe(struct kretprobe *rp)
1004{ 1062{
1005 unregister_kretprobes(&rp, 1); 1063 unregister_kretprobes(&rp, 1);
1006} 1064}
1065EXPORT_SYMBOL_GPL(unregister_kretprobe);
1007 1066
1008void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) 1067void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
1009{ 1068{
@@ -1025,24 +1084,30 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
1025 } 1084 }
1026 } 1085 }
1027} 1086}
1087EXPORT_SYMBOL_GPL(unregister_kretprobes);
1028 1088
1029#else /* CONFIG_KRETPROBES */ 1089#else /* CONFIG_KRETPROBES */
1030int __kprobes register_kretprobe(struct kretprobe *rp) 1090int __kprobes register_kretprobe(struct kretprobe *rp)
1031{ 1091{
1032 return -ENOSYS; 1092 return -ENOSYS;
1033} 1093}
1094EXPORT_SYMBOL_GPL(register_kretprobe);
1034 1095
1035int __kprobes register_kretprobes(struct kretprobe **rps, int num) 1096int __kprobes register_kretprobes(struct kretprobe **rps, int num)
1036{ 1097{
1037 return -ENOSYS; 1098 return -ENOSYS;
1038} 1099}
1100EXPORT_SYMBOL_GPL(register_kretprobes);
1101
1039void __kprobes unregister_kretprobe(struct kretprobe *rp) 1102void __kprobes unregister_kretprobe(struct kretprobe *rp)
1040{ 1103{
1041} 1104}
1105EXPORT_SYMBOL_GPL(unregister_kretprobe);
1042 1106
1043void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) 1107void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
1044{ 1108{
1045} 1109}
1110EXPORT_SYMBOL_GPL(unregister_kretprobes);
1046 1111
1047static int __kprobes pre_handler_kretprobe(struct kprobe *p, 1112static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1048 struct pt_regs *regs) 1113 struct pt_regs *regs)
@@ -1056,6 +1121,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1056static void __kprobes kill_kprobe(struct kprobe *p) 1121static void __kprobes kill_kprobe(struct kprobe *p)
1057{ 1122{
1058 struct kprobe *kp; 1123 struct kprobe *kp;
1124
1059 p->flags |= KPROBE_FLAG_GONE; 1125 p->flags |= KPROBE_FLAG_GONE;
1060 if (p->pre_handler == aggr_pre_handler) { 1126 if (p->pre_handler == aggr_pre_handler) {
1061 /* 1127 /*
@@ -1168,8 +1234,8 @@ static int __init init_kprobes(void)
1168 } 1234 }
1169 } 1235 }
1170 1236
1171 /* By default, kprobes are enabled */ 1237 /* By default, kprobes are armed */
1172 kprobe_enabled = true; 1238 kprobes_all_disarmed = false;
1173 1239
1174 err = arch_init_kprobes(); 1240 err = arch_init_kprobes();
1175 if (!err) 1241 if (!err)
@@ -1197,12 +1263,18 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1197 else 1263 else
1198 kprobe_type = "k"; 1264 kprobe_type = "k";
1199 if (sym) 1265 if (sym)
1200 seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type, 1266 seq_printf(pi, "%p %s %s+0x%x %s %s%s\n",
1201 sym, offset, (modname ? modname : " "), 1267 p->addr, kprobe_type, sym, offset,
1202 (kprobe_gone(p) ? "[GONE]" : "")); 1268 (modname ? modname : " "),
1269 (kprobe_gone(p) ? "[GONE]" : ""),
1270 ((kprobe_disabled(p) && !kprobe_gone(p)) ?
1271 "[DISABLED]" : ""));
1203 else 1272 else
1204 seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr, 1273 seq_printf(pi, "%p %s %p %s%s\n",
1205 (kprobe_gone(p) ? "[GONE]" : "")); 1274 p->addr, kprobe_type, p->addr,
1275 (kprobe_gone(p) ? "[GONE]" : ""),
1276 ((kprobe_disabled(p) && !kprobe_gone(p)) ?
1277 "[DISABLED]" : ""));
1206} 1278}
1207 1279
1208static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1280static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1267,7 +1339,72 @@ static struct file_operations debugfs_kprobes_operations = {
1267 .release = seq_release, 1339 .release = seq_release,
1268}; 1340};
1269 1341
1270static void __kprobes enable_all_kprobes(void) 1342/* Disable one kprobe */
1343int __kprobes disable_kprobe(struct kprobe *kp)
1344{
1345 int ret = 0;
1346 struct kprobe *p;
1347
1348 mutex_lock(&kprobe_mutex);
1349
1350 /* Check whether specified probe is valid. */
1351 p = __get_valid_kprobe(kp);
1352 if (unlikely(p == NULL)) {
1353 ret = -EINVAL;
1354 goto out;
1355 }
1356
1357 /* If the probe is already disabled (or gone), just return */
1358 if (kprobe_disabled(kp))
1359 goto out;
1360
1361 kp->flags |= KPROBE_FLAG_DISABLED;
1362 if (p != kp)
1363 /* When kp != p, p is always enabled. */
1364 try_to_disable_aggr_kprobe(p);
1365
1366 if (!kprobes_all_disarmed && kprobe_disabled(p))
1367 arch_disarm_kprobe(p);
1368out:
1369 mutex_unlock(&kprobe_mutex);
1370 return ret;
1371}
1372EXPORT_SYMBOL_GPL(disable_kprobe);
1373
1374/* Enable one kprobe */
1375int __kprobes enable_kprobe(struct kprobe *kp)
1376{
1377 int ret = 0;
1378 struct kprobe *p;
1379
1380 mutex_lock(&kprobe_mutex);
1381
1382 /* Check whether specified probe is valid. */
1383 p = __get_valid_kprobe(kp);
1384 if (unlikely(p == NULL)) {
1385 ret = -EINVAL;
1386 goto out;
1387 }
1388
1389 if (kprobe_gone(kp)) {
1390 /* This kprobe is gone, so it cannot be enabled. */
1391 ret = -EINVAL;
1392 goto out;
1393 }
1394
1395 if (!kprobes_all_disarmed && kprobe_disabled(p))
1396 arch_arm_kprobe(p);
1397
1398 p->flags &= ~KPROBE_FLAG_DISABLED;
1399 if (p != kp)
1400 kp->flags &= ~KPROBE_FLAG_DISABLED;
1401out:
1402 mutex_unlock(&kprobe_mutex);
1403 return ret;
1404}
1405EXPORT_SYMBOL_GPL(enable_kprobe);
1406
1407static void __kprobes arm_all_kprobes(void)
1271{ 1408{
1272 struct hlist_head *head; 1409 struct hlist_head *head;
1273 struct hlist_node *node; 1410 struct hlist_node *node;
@@ -1276,18 +1413,20 @@ static void __kprobes enable_all_kprobes(void)
1276 1413
1277 mutex_lock(&kprobe_mutex); 1414 mutex_lock(&kprobe_mutex);
1278 1415
1279 /* If kprobes are already enabled, just return */ 1416 /* If kprobes are armed, just return */
1280 if (kprobe_enabled) 1417 if (!kprobes_all_disarmed)
1281 goto already_enabled; 1418 goto already_enabled;
1282 1419
1420 mutex_lock(&text_mutex);
1283 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1421 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1284 head = &kprobe_table[i]; 1422 head = &kprobe_table[i];
1285 hlist_for_each_entry_rcu(p, node, head, hlist) 1423 hlist_for_each_entry_rcu(p, node, head, hlist)
1286 if (!kprobe_gone(p)) 1424 if (!kprobe_disabled(p))
1287 arch_arm_kprobe(p); 1425 arch_arm_kprobe(p);
1288 } 1426 }
1427 mutex_unlock(&text_mutex);
1289 1428
1290 kprobe_enabled = true; 1429 kprobes_all_disarmed = false;
1291 printk(KERN_INFO "Kprobes globally enabled\n"); 1430 printk(KERN_INFO "Kprobes globally enabled\n");
1292 1431
1293already_enabled: 1432already_enabled:
@@ -1295,7 +1434,7 @@ already_enabled:
1295 return; 1434 return;
1296} 1435}
1297 1436
1298static void __kprobes disable_all_kprobes(void) 1437static void __kprobes disarm_all_kprobes(void)
1299{ 1438{
1300 struct hlist_head *head; 1439 struct hlist_head *head;
1301 struct hlist_node *node; 1440 struct hlist_node *node;
@@ -1304,20 +1443,22 @@ static void __kprobes disable_all_kprobes(void)
1304 1443
1305 mutex_lock(&kprobe_mutex); 1444 mutex_lock(&kprobe_mutex);
1306 1445
1307 /* If kprobes are already disabled, just return */ 1446 /* If kprobes are already disarmed, just return */
1308 if (!kprobe_enabled) 1447 if (kprobes_all_disarmed)
1309 goto already_disabled; 1448 goto already_disabled;
1310 1449
1311 kprobe_enabled = false; 1450 kprobes_all_disarmed = true;
1312 printk(KERN_INFO "Kprobes globally disabled\n"); 1451 printk(KERN_INFO "Kprobes globally disabled\n");
1452 mutex_lock(&text_mutex);
1313 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1453 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1314 head = &kprobe_table[i]; 1454 head = &kprobe_table[i];
1315 hlist_for_each_entry_rcu(p, node, head, hlist) { 1455 hlist_for_each_entry_rcu(p, node, head, hlist) {
1316 if (!arch_trampoline_kprobe(p) && !kprobe_gone(p)) 1456 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1317 arch_disarm_kprobe(p); 1457 arch_disarm_kprobe(p);
1318 } 1458 }
1319 } 1459 }
1320 1460
1461 mutex_unlock(&text_mutex);
1321 mutex_unlock(&kprobe_mutex); 1462 mutex_unlock(&kprobe_mutex);
1322 /* Allow all currently running kprobes to complete */ 1463 /* Allow all currently running kprobes to complete */
1323 synchronize_sched(); 1464 synchronize_sched();
@@ -1338,7 +1479,7 @@ static ssize_t read_enabled_file_bool(struct file *file,
1338{ 1479{
1339 char buf[3]; 1480 char buf[3];
1340 1481
1341 if (kprobe_enabled) 1482 if (!kprobes_all_disarmed)
1342 buf[0] = '1'; 1483 buf[0] = '1';
1343 else 1484 else
1344 buf[0] = '0'; 1485 buf[0] = '0';
@@ -1361,12 +1502,12 @@ static ssize_t write_enabled_file_bool(struct file *file,
1361 case 'y': 1502 case 'y':
1362 case 'Y': 1503 case 'Y':
1363 case '1': 1504 case '1':
1364 enable_all_kprobes(); 1505 arm_all_kprobes();
1365 break; 1506 break;
1366 case 'n': 1507 case 'n':
1367 case 'N': 1508 case 'N':
1368 case '0': 1509 case '0':
1369 disable_all_kprobes(); 1510 disarm_all_kprobes();
1370 break; 1511 break;
1371 } 1512 }
1372 1513
@@ -1409,16 +1550,5 @@ late_initcall(debugfs_kprobe_init);
1409 1550
1410module_init(init_kprobes); 1551module_init(init_kprobes);
1411 1552
1412EXPORT_SYMBOL_GPL(register_kprobe); 1553/* defined in arch/.../kernel/kprobes.c */
1413EXPORT_SYMBOL_GPL(unregister_kprobe);
1414EXPORT_SYMBOL_GPL(register_kprobes);
1415EXPORT_SYMBOL_GPL(unregister_kprobes);
1416EXPORT_SYMBOL_GPL(register_jprobe);
1417EXPORT_SYMBOL_GPL(unregister_jprobe);
1418EXPORT_SYMBOL_GPL(register_jprobes);
1419EXPORT_SYMBOL_GPL(unregister_jprobes);
1420EXPORT_SYMBOL_GPL(jprobe_return); 1554EXPORT_SYMBOL_GPL(jprobe_return);
1421EXPORT_SYMBOL_GPL(register_kretprobe);
1422EXPORT_SYMBOL_GPL(unregister_kretprobe);
1423EXPORT_SYMBOL_GPL(register_kretprobes);
1424EXPORT_SYMBOL_GPL(unregister_kretprobes);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4fbc456f393d..84bbadd4d021 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -110,7 +110,7 @@ static void create_kthread(struct kthread_create_info *create)
110 */ 110 */
111 sched_setscheduler(create->result, SCHED_NORMAL, &param); 111 sched_setscheduler(create->result, SCHED_NORMAL, &param);
112 set_user_nice(create->result, KTHREAD_NICE_LEVEL); 112 set_user_nice(create->result, KTHREAD_NICE_LEVEL);
113 set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR); 113 set_cpus_allowed_ptr(create->result, cpu_all_mask);
114 } 114 }
115 complete(&create->done); 115 complete(&create->done);
116} 116}
@@ -240,7 +240,7 @@ int kthreadd(void *unused)
240 set_task_comm(tsk, "kthreadd"); 240 set_task_comm(tsk, "kthreadd");
241 ignore_signals(tsk); 241 ignore_signals(tsk);
242 set_user_nice(tsk, KTHREAD_NICE_LEVEL); 242 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
243 set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR); 243 set_cpus_allowed_ptr(tsk, cpu_all_mask);
244 244
245 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 245 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
246 246
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 449db466bdbc..ca07c5c0c914 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -9,6 +9,44 @@
9 * as published by the Free Software Foundation; version 2 9 * as published by the Free Software Foundation; version 2
10 * of the License. 10 * of the License.
11 */ 11 */
12
13/*
14 * CONFIG_LATENCYTOP enables a kernel latency tracking infrastructure that is
15 * used by the "latencytop" userspace tool. The latency that is tracked is not
16 * the 'traditional' interrupt latency (which is primarily caused by something
17 * else consuming CPU), but instead, it is the latency an application encounters
18 * because the kernel sleeps on its behalf for various reasons.
19 *
20 * This code tracks 2 levels of statistics:
21 * 1) System level latency
22 * 2) Per process latency
23 *
24 * The latency is stored in fixed sized data structures in an accumulated form;
25 * if the "same" latency cause is hit twice, this will be tracked as one entry
26 * in the data structure. Both the count, total accumulated latency and maximum
27 * latency are tracked in this data structure. When the fixed size structure is
28 * full, no new causes are tracked until the buffer is flushed by writing to
29 * the /proc file; the userspace tool does this on a regular basis.
30 *
31 * A latency cause is identified by a stringified backtrace at the point that
32 * the scheduler gets invoked. The userland tool will use this string to
33 * identify the cause of the latency in human readable form.
34 *
35 * The information is exported via /proc/latency_stats and /proc/<pid>/latency.
36 * These files look like this:
37 *
38 * Latency Top version : v0.1
39 * 70 59433 4897 i915_irq_wait drm_ioctl vfs_ioctl do_vfs_ioctl sys_ioctl
40 * | | | |
41 * | | | +----> the stringified backtrace
42 * | | +---------> The maximum latency for this entry in microseconds
43 * | +--------------> The accumulated latency for this entry (microseconds)
44 * +-------------------> The number of times this entry is hit
45 *
46 * (note: the average latency is the accumulated latency divided by the number
47 * of times)
48 */
49
12#include <linux/latencytop.h> 50#include <linux/latencytop.h>
13#include <linux/kallsyms.h> 51#include <linux/kallsyms.h>
14#include <linux/seq_file.h> 52#include <linux/seq_file.h>
@@ -72,7 +110,7 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
72 firstnonnull = i; 110 firstnonnull = i;
73 continue; 111 continue;
74 } 112 }
75 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { 113 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
76 unsigned long record = lat->backtrace[q]; 114 unsigned long record = lat->backtrace[q];
77 115
78 if (latency_record[i].backtrace[q] != record) { 116 if (latency_record[i].backtrace[q] != record) {
@@ -101,31 +139,52 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
101 memcpy(&latency_record[i], lat, sizeof(struct latency_record)); 139 memcpy(&latency_record[i], lat, sizeof(struct latency_record));
102} 140}
103 141
104static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat) 142/*
143 * Iterator to store a backtrace into a latency record entry
144 */
145static inline void store_stacktrace(struct task_struct *tsk,
146 struct latency_record *lat)
105{ 147{
106 struct stack_trace trace; 148 struct stack_trace trace;
107 149
108 memset(&trace, 0, sizeof(trace)); 150 memset(&trace, 0, sizeof(trace));
109 trace.max_entries = LT_BACKTRACEDEPTH; 151 trace.max_entries = LT_BACKTRACEDEPTH;
110 trace.entries = &lat->backtrace[0]; 152 trace.entries = &lat->backtrace[0];
111 trace.skip = 0;
112 save_stack_trace_tsk(tsk, &trace); 153 save_stack_trace_tsk(tsk, &trace);
113} 154}
114 155
156/**
157 * __account_scheduler_latency - record an occurred latency
158 * @tsk: the task struct of the task hitting the latency
159 * @usecs: the duration of the latency in microseconds
160 * @inter: 1 if the sleep was interruptible, 0 if uninterruptible
161 *
162 * This function is the main entry point for recording latency entries
163 * as called by the scheduler.
164 *
165 * This function has a few special cases to deal with normal 'non-latency'
166 * sleeps: specifically, interruptible sleep longer than 5 msec is skipped
167 * since this usually is caused by waiting for events via select() and co.
168 *
169 * Negative latencies (caused by time going backwards) are also explicitly
170 * skipped.
171 */
115void __sched 172void __sched
116account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) 173__account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
117{ 174{
118 unsigned long flags; 175 unsigned long flags;
119 int i, q; 176 int i, q;
120 struct latency_record lat; 177 struct latency_record lat;
121 178
122 if (!latencytop_enabled)
123 return;
124
125 /* Long interruptible waits are generally user requested... */ 179 /* Long interruptible waits are generally user requested... */
126 if (inter && usecs > 5000) 180 if (inter && usecs > 5000)
127 return; 181 return;
128 182
183 /* Negative sleeps are time going backwards */
184 /* Zero-time sleeps are non-interesting */
185 if (usecs <= 0)
186 return;
187
129 memset(&lat, 0, sizeof(lat)); 188 memset(&lat, 0, sizeof(lat));
130 lat.count = 1; 189 lat.count = 1;
131 lat.time = usecs; 190 lat.time = usecs;
@@ -143,12 +202,12 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
143 if (tsk->latency_record_count >= LT_SAVECOUNT) 202 if (tsk->latency_record_count >= LT_SAVECOUNT)
144 goto out_unlock; 203 goto out_unlock;
145 204
146 for (i = 0; i < LT_SAVECOUNT ; i++) { 205 for (i = 0; i < LT_SAVECOUNT; i++) {
147 struct latency_record *mylat; 206 struct latency_record *mylat;
148 int same = 1; 207 int same = 1;
149 208
150 mylat = &tsk->latency_record[i]; 209 mylat = &tsk->latency_record[i];
151 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { 210 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
152 unsigned long record = lat.backtrace[q]; 211 unsigned long record = lat.backtrace[q];
153 212
154 if (mylat->backtrace[q] != record) { 213 if (mylat->backtrace[q] != record) {
@@ -186,7 +245,7 @@ static int lstats_show(struct seq_file *m, void *v)
186 for (i = 0; i < MAXLR; i++) { 245 for (i = 0; i < MAXLR; i++) {
187 if (latency_record[i].backtrace[0]) { 246 if (latency_record[i].backtrace[0]) {
188 int q; 247 int q;
189 seq_printf(m, "%i %li %li ", 248 seq_printf(m, "%i %lu %lu ",
190 latency_record[i].count, 249 latency_record[i].count,
191 latency_record[i].time, 250 latency_record[i].time,
192 latency_record[i].max); 251 latency_record[i].max);
@@ -223,7 +282,7 @@ static int lstats_open(struct inode *inode, struct file *filp)
223 return single_open(filp, lstats_show, NULL); 282 return single_open(filp, lstats_show, NULL);
224} 283}
225 284
226static struct file_operations lstats_fops = { 285static const struct file_operations lstats_fops = {
227 .open = lstats_open, 286 .open = lstats_open,
228 .read = seq_read, 287 .read = seq_read,
229 .write = lstats_write, 288 .write = lstats_write,
@@ -236,4 +295,4 @@ static int __init init_lstats_procfs(void)
236 proc_create("latency_stats", 0644, NULL, &lstats_fops); 295 proc_create("latency_stats", 0644, NULL, &lstats_fops);
237 return 0; 296 return 0;
238} 297}
239__initcall(init_lstats_procfs); 298device_initcall(init_lstats_procfs);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 06b0c3568f0b..b0f011866969 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -41,6 +41,8 @@
41#include <linux/utsname.h> 41#include <linux/utsname.h>
42#include <linux/hash.h> 42#include <linux/hash.h>
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h>
45#include <trace/lockdep.h>
44 46
45#include <asm/sections.h> 47#include <asm/sections.h>
46 48
@@ -310,12 +312,14 @@ EXPORT_SYMBOL(lockdep_on);
310#if VERBOSE 312#if VERBOSE
311# define HARDIRQ_VERBOSE 1 313# define HARDIRQ_VERBOSE 1
312# define SOFTIRQ_VERBOSE 1 314# define SOFTIRQ_VERBOSE 1
315# define RECLAIM_VERBOSE 1
313#else 316#else
314# define HARDIRQ_VERBOSE 0 317# define HARDIRQ_VERBOSE 0
315# define SOFTIRQ_VERBOSE 0 318# define SOFTIRQ_VERBOSE 0
319# define RECLAIM_VERBOSE 0
316#endif 320#endif
317 321
318#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE 322#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE
319/* 323/*
320 * Quick filtering for interesting events: 324 * Quick filtering for interesting events:
321 */ 325 */
@@ -430,30 +434,24 @@ atomic_t nr_find_usage_forwards_checks;
430atomic_t nr_find_usage_forwards_recursions; 434atomic_t nr_find_usage_forwards_recursions;
431atomic_t nr_find_usage_backwards_checks; 435atomic_t nr_find_usage_backwards_checks;
432atomic_t nr_find_usage_backwards_recursions; 436atomic_t nr_find_usage_backwards_recursions;
433# define debug_atomic_inc(ptr) atomic_inc(ptr)
434# define debug_atomic_dec(ptr) atomic_dec(ptr)
435# define debug_atomic_read(ptr) atomic_read(ptr)
436#else
437# define debug_atomic_inc(ptr) do { } while (0)
438# define debug_atomic_dec(ptr) do { } while (0)
439# define debug_atomic_read(ptr) 0
440#endif 437#endif
441 438
442/* 439/*
443 * Locking printouts: 440 * Locking printouts:
444 */ 441 */
445 442
443#define __USAGE(__STATE) \
444 [LOCK_USED_IN_##__STATE] = "IN-"__stringify(__STATE)"-W", \
445 [LOCK_ENABLED_##__STATE] = __stringify(__STATE)"-ON-W", \
446 [LOCK_USED_IN_##__STATE##_READ] = "IN-"__stringify(__STATE)"-R",\
447 [LOCK_ENABLED_##__STATE##_READ] = __stringify(__STATE)"-ON-R",
448
446static const char *usage_str[] = 449static const char *usage_str[] =
447{ 450{
448 [LOCK_USED] = "initial-use ", 451#define LOCKDEP_STATE(__STATE) __USAGE(__STATE)
449 [LOCK_USED_IN_HARDIRQ] = "in-hardirq-W", 452#include "lockdep_states.h"
450 [LOCK_USED_IN_SOFTIRQ] = "in-softirq-W", 453#undef LOCKDEP_STATE
451 [LOCK_ENABLED_SOFTIRQS] = "softirq-on-W", 454 [LOCK_USED] = "INITIAL USE",
452 [LOCK_ENABLED_HARDIRQS] = "hardirq-on-W",
453 [LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R",
454 [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R",
455 [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R",
456 [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R",
457}; 455};
458 456
459const char * __get_key_name(struct lockdep_subclass_key *key, char *str) 457const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
@@ -461,46 +459,45 @@ const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
461 return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); 459 return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
462} 460}
463 461
464void 462static inline unsigned long lock_flag(enum lock_usage_bit bit)
465get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4)
466{ 463{
467 *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.'; 464 return 1UL << bit;
468 465}
469 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
470 *c1 = '+';
471 else
472 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
473 *c1 = '-';
474 466
475 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) 467static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
476 *c2 = '+'; 468{
477 else 469 char c = '.';
478 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
479 *c2 = '-';
480 470
481 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) 471 if (class->usage_mask & lock_flag(bit + 2))
482 *c3 = '-'; 472 c = '+';
483 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) { 473 if (class->usage_mask & lock_flag(bit)) {
484 *c3 = '+'; 474 c = '-';
485 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) 475 if (class->usage_mask & lock_flag(bit + 2))
486 *c3 = '?'; 476 c = '?';
487 } 477 }
488 478
489 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) 479 return c;
490 *c4 = '-'; 480}
491 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) { 481
492 *c4 = '+'; 482void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
493 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) 483{
494 *c4 = '?'; 484 int i = 0;
495 } 485
486#define LOCKDEP_STATE(__STATE) \
487 usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE); \
488 usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ);
489#include "lockdep_states.h"
490#undef LOCKDEP_STATE
491
492 usage[i] = '\0';
496} 493}
497 494
498static void print_lock_name(struct lock_class *class) 495static void print_lock_name(struct lock_class *class)
499{ 496{
500 char str[KSYM_NAME_LEN], c1, c2, c3, c4; 497 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
501 const char *name; 498 const char *name;
502 499
503 get_usage_chars(class, &c1, &c2, &c3, &c4); 500 get_usage_chars(class, usage);
504 501
505 name = class->name; 502 name = class->name;
506 if (!name) { 503 if (!name) {
@@ -513,7 +510,7 @@ static void print_lock_name(struct lock_class *class)
513 if (class->subclass) 510 if (class->subclass)
514 printk("/%d", class->subclass); 511 printk("/%d", class->subclass);
515 } 512 }
516 printk("){%c%c%c%c}", c1, c2, c3, c4); 513 printk("){%s}", usage);
517} 514}
518 515
519static void print_lockdep_cache(struct lockdep_map *lock) 516static void print_lockdep_cache(struct lockdep_map *lock)
@@ -796,6 +793,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
796 793
797 printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); 794 printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
798 printk("turning off the locking correctness validator.\n"); 795 printk("turning off the locking correctness validator.\n");
796 dump_stack();
799 return NULL; 797 return NULL;
800 } 798 }
801 class = lock_classes + nr_lock_classes++; 799 class = lock_classes + nr_lock_classes++;
@@ -859,6 +857,7 @@ static struct lock_list *alloc_list_entry(void)
859 857
860 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); 858 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
861 printk("turning off the locking correctness validator.\n"); 859 printk("turning off the locking correctness validator.\n");
860 dump_stack();
862 return NULL; 861 return NULL;
863 } 862 }
864 return list_entries + nr_list_entries++; 863 return list_entries + nr_list_entries++;
@@ -1263,9 +1262,49 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
1263 bit_backwards, bit_forwards, irqclass); 1262 bit_backwards, bit_forwards, irqclass);
1264} 1263}
1265 1264
1266static int 1265static const char *state_names[] = {
1267check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, 1266#define LOCKDEP_STATE(__STATE) \
1268 struct held_lock *next) 1267 __stringify(__STATE),
1268#include "lockdep_states.h"
1269#undef LOCKDEP_STATE
1270};
1271
1272static const char *state_rnames[] = {
1273#define LOCKDEP_STATE(__STATE) \
1274 __stringify(__STATE)"-READ",
1275#include "lockdep_states.h"
1276#undef LOCKDEP_STATE
1277};
1278
1279static inline const char *state_name(enum lock_usage_bit bit)
1280{
1281 return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2];
1282}
1283
1284static int exclusive_bit(int new_bit)
1285{
1286 /*
1287 * USED_IN
1288 * USED_IN_READ
1289 * ENABLED
1290 * ENABLED_READ
1291 *
1292 * bit 0 - write/read
1293 * bit 1 - used_in/enabled
1294 * bit 2+ state
1295 */
1296
1297 int state = new_bit & ~3;
1298 int dir = new_bit & 2;
1299
1300 /*
1301 * keep state, bit flip the direction and strip read.
1302 */
1303 return state | (dir ^ 2);
1304}
1305
1306static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
1307 struct held_lock *next, enum lock_usage_bit bit)
1269{ 1308{
1270 /* 1309 /*
1271 * Prove that the new dependency does not connect a hardirq-safe 1310 * Prove that the new dependency does not connect a hardirq-safe
@@ -1273,38 +1312,34 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
1273 * the backwards-subgraph starting at <prev>, and the 1312 * the backwards-subgraph starting at <prev>, and the
1274 * forwards-subgraph starting at <next>: 1313 * forwards-subgraph starting at <next>:
1275 */ 1314 */
1276 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, 1315 if (!check_usage(curr, prev, next, bit,
1277 LOCK_ENABLED_HARDIRQS, "hard")) 1316 exclusive_bit(bit), state_name(bit)))
1278 return 0; 1317 return 0;
1279 1318
1319 bit++; /* _READ */
1320
1280 /* 1321 /*
1281 * Prove that the new dependency does not connect a hardirq-safe-read 1322 * Prove that the new dependency does not connect a hardirq-safe-read
1282 * lock with a hardirq-unsafe lock - to achieve this we search 1323 * lock with a hardirq-unsafe lock - to achieve this we search
1283 * the backwards-subgraph starting at <prev>, and the 1324 * the backwards-subgraph starting at <prev>, and the
1284 * forwards-subgraph starting at <next>: 1325 * forwards-subgraph starting at <next>:
1285 */ 1326 */
1286 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, 1327 if (!check_usage(curr, prev, next, bit,
1287 LOCK_ENABLED_HARDIRQS, "hard-read")) 1328 exclusive_bit(bit), state_name(bit)))
1288 return 0; 1329 return 0;
1289 1330
1290 /* 1331 return 1;
1291 * Prove that the new dependency does not connect a softirq-safe 1332}
1292 * lock with a softirq-unsafe lock - to achieve this we search 1333
1293 * the backwards-subgraph starting at <prev>, and the 1334static int
1294 * forwards-subgraph starting at <next>: 1335check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
1295 */ 1336 struct held_lock *next)
1296 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, 1337{
1297 LOCK_ENABLED_SOFTIRQS, "soft")) 1338#define LOCKDEP_STATE(__STATE) \
1298 return 0; 1339 if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \
1299 /*
1300 * Prove that the new dependency does not connect a softirq-safe-read
1301 * lock with a softirq-unsafe lock - to achieve this we search
1302 * the backwards-subgraph starting at <prev>, and the
1303 * forwards-subgraph starting at <next>:
1304 */
1305 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
1306 LOCK_ENABLED_SOFTIRQS, "soft"))
1307 return 0; 1340 return 0;
1341#include "lockdep_states.h"
1342#undef LOCKDEP_STATE
1308 1343
1309 return 1; 1344 return 1;
1310} 1345}
@@ -1649,6 +1684,7 @@ cache_hit:
1649 1684
1650 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); 1685 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
1651 printk("turning off the locking correctness validator.\n"); 1686 printk("turning off the locking correctness validator.\n");
1687 dump_stack();
1652 return 0; 1688 return 0;
1653 } 1689 }
1654 chain = lock_chains + nr_lock_chains++; 1690 chain = lock_chains + nr_lock_chains++;
@@ -1861,9 +1897,9 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1861 curr->comm, task_pid_nr(curr)); 1897 curr->comm, task_pid_nr(curr));
1862 print_lock(this); 1898 print_lock(this);
1863 if (forwards) 1899 if (forwards)
1864 printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); 1900 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
1865 else 1901 else
1866 printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass); 1902 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
1867 print_lock_name(other); 1903 print_lock_name(other);
1868 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 1904 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
1869 1905
@@ -1933,7 +1969,7 @@ void print_irqtrace_events(struct task_struct *curr)
1933 print_ip_sym(curr->softirq_disable_ip); 1969 print_ip_sym(curr->softirq_disable_ip);
1934} 1970}
1935 1971
1936static int hardirq_verbose(struct lock_class *class) 1972static int HARDIRQ_verbose(struct lock_class *class)
1937{ 1973{
1938#if HARDIRQ_VERBOSE 1974#if HARDIRQ_VERBOSE
1939 return class_filter(class); 1975 return class_filter(class);
@@ -1941,7 +1977,7 @@ static int hardirq_verbose(struct lock_class *class)
1941 return 0; 1977 return 0;
1942} 1978}
1943 1979
1944static int softirq_verbose(struct lock_class *class) 1980static int SOFTIRQ_verbose(struct lock_class *class)
1945{ 1981{
1946#if SOFTIRQ_VERBOSE 1982#if SOFTIRQ_VERBOSE
1947 return class_filter(class); 1983 return class_filter(class);
@@ -1949,185 +1985,95 @@ static int softirq_verbose(struct lock_class *class)
1949 return 0; 1985 return 0;
1950} 1986}
1951 1987
1988static int RECLAIM_FS_verbose(struct lock_class *class)
1989{
1990#if RECLAIM_VERBOSE
1991 return class_filter(class);
1992#endif
1993 return 0;
1994}
1995
1952#define STRICT_READ_CHECKS 1 1996#define STRICT_READ_CHECKS 1
1953 1997
1954static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, 1998static int (*state_verbose_f[])(struct lock_class *class) = {
1999#define LOCKDEP_STATE(__STATE) \
2000 __STATE##_verbose,
2001#include "lockdep_states.h"
2002#undef LOCKDEP_STATE
2003};
2004
2005static inline int state_verbose(enum lock_usage_bit bit,
2006 struct lock_class *class)
2007{
2008 return state_verbose_f[bit >> 2](class);
2009}
2010
2011typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
2012 enum lock_usage_bit bit, const char *name);
2013
2014static int
2015mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1955 enum lock_usage_bit new_bit) 2016 enum lock_usage_bit new_bit)
1956{ 2017{
1957 int ret = 1; 2018 int excl_bit = exclusive_bit(new_bit);
2019 int read = new_bit & 1;
2020 int dir = new_bit & 2;
1958 2021
1959 switch(new_bit) { 2022 /*
1960 case LOCK_USED_IN_HARDIRQ: 2023 * mark USED_IN has to look forwards -- to ensure no dependency
1961 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) 2024 * has ENABLED state, which would allow recursion deadlocks.
1962 return 0; 2025 *
1963 if (!valid_state(curr, this, new_bit, 2026 * mark ENABLED has to look backwards -- to ensure no dependee
1964 LOCK_ENABLED_HARDIRQS_READ)) 2027 * has USED_IN state, which, again, would allow recursion deadlocks.
1965 return 0; 2028 */
1966 /* 2029 check_usage_f usage = dir ?
1967 * just marked it hardirq-safe, check that this lock 2030 check_usage_backwards : check_usage_forwards;
1968 * took no hardirq-unsafe lock in the past: 2031
1969 */ 2032 /*
1970 if (!check_usage_forwards(curr, this, 2033 * Validate that this particular lock does not have conflicting
1971 LOCK_ENABLED_HARDIRQS, "hard")) 2034 * usage states.
1972 return 0; 2035 */
1973#if STRICT_READ_CHECKS 2036 if (!valid_state(curr, this, new_bit, excl_bit))
1974 /* 2037 return 0;
1975 * just marked it hardirq-safe, check that this lock 2038
1976 * took no hardirq-unsafe-read lock in the past: 2039 /*
1977 */ 2040 * Validate that the lock dependencies don't have conflicting usage
1978 if (!check_usage_forwards(curr, this, 2041 * states.
1979 LOCK_ENABLED_HARDIRQS_READ, "hard-read")) 2042 */
1980 return 0; 2043 if ((!read || !dir || STRICT_READ_CHECKS) &&
1981#endif 2044 !usage(curr, this, excl_bit, state_name(new_bit & ~1)))
1982 if (hardirq_verbose(hlock_class(this))) 2045 return 0;
1983 ret = 2; 2046
1984 break; 2047 /*
1985 case LOCK_USED_IN_SOFTIRQ: 2048 * Check for read in write conflicts
1986 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) 2049 */
1987 return 0; 2050 if (!read) {
1988 if (!valid_state(curr, this, new_bit, 2051 if (!valid_state(curr, this, new_bit, excl_bit + 1))
1989 LOCK_ENABLED_SOFTIRQS_READ))
1990 return 0;
1991 /*
1992 * just marked it softirq-safe, check that this lock
1993 * took no softirq-unsafe lock in the past:
1994 */
1995 if (!check_usage_forwards(curr, this,
1996 LOCK_ENABLED_SOFTIRQS, "soft"))
1997 return 0;
1998#if STRICT_READ_CHECKS
1999 /*
2000 * just marked it softirq-safe, check that this lock
2001 * took no softirq-unsafe-read lock in the past:
2002 */
2003 if (!check_usage_forwards(curr, this,
2004 LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
2005 return 0;
2006#endif
2007 if (softirq_verbose(hlock_class(this)))
2008 ret = 2;
2009 break;
2010 case LOCK_USED_IN_HARDIRQ_READ:
2011 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
2012 return 0;
2013 /*
2014 * just marked it hardirq-read-safe, check that this lock
2015 * took no hardirq-unsafe lock in the past:
2016 */
2017 if (!check_usage_forwards(curr, this,
2018 LOCK_ENABLED_HARDIRQS, "hard"))
2019 return 0;
2020 if (hardirq_verbose(hlock_class(this)))
2021 ret = 2;
2022 break;
2023 case LOCK_USED_IN_SOFTIRQ_READ:
2024 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
2025 return 0;
2026 /*
2027 * just marked it softirq-read-safe, check that this lock
2028 * took no softirq-unsafe lock in the past:
2029 */
2030 if (!check_usage_forwards(curr, this,
2031 LOCK_ENABLED_SOFTIRQS, "soft"))
2032 return 0;
2033 if (softirq_verbose(hlock_class(this)))
2034 ret = 2;
2035 break;
2036 case LOCK_ENABLED_HARDIRQS:
2037 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
2038 return 0;
2039 if (!valid_state(curr, this, new_bit,
2040 LOCK_USED_IN_HARDIRQ_READ))
2041 return 0;
2042 /*
2043 * just marked it hardirq-unsafe, check that no hardirq-safe
2044 * lock in the system ever took it in the past:
2045 */
2046 if (!check_usage_backwards(curr, this,
2047 LOCK_USED_IN_HARDIRQ, "hard"))
2048 return 0;
2049#if STRICT_READ_CHECKS
2050 /*
2051 * just marked it hardirq-unsafe, check that no
2052 * hardirq-safe-read lock in the system ever took
2053 * it in the past:
2054 */
2055 if (!check_usage_backwards(curr, this,
2056 LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
2057 return 0;
2058#endif
2059 if (hardirq_verbose(hlock_class(this)))
2060 ret = 2;
2061 break;
2062 case LOCK_ENABLED_SOFTIRQS:
2063 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
2064 return 0;
2065 if (!valid_state(curr, this, new_bit,
2066 LOCK_USED_IN_SOFTIRQ_READ))
2067 return 0;
2068 /*
2069 * just marked it softirq-unsafe, check that no softirq-safe
2070 * lock in the system ever took it in the past:
2071 */
2072 if (!check_usage_backwards(curr, this,
2073 LOCK_USED_IN_SOFTIRQ, "soft"))
2074 return 0;
2075#if STRICT_READ_CHECKS
2076 /*
2077 * just marked it softirq-unsafe, check that no
2078 * softirq-safe-read lock in the system ever took
2079 * it in the past:
2080 */
2081 if (!check_usage_backwards(curr, this,
2082 LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
2083 return 0;
2084#endif
2085 if (softirq_verbose(hlock_class(this)))
2086 ret = 2;
2087 break;
2088 case LOCK_ENABLED_HARDIRQS_READ:
2089 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
2090 return 0;
2091#if STRICT_READ_CHECKS
2092 /*
2093 * just marked it hardirq-read-unsafe, check that no
2094 * hardirq-safe lock in the system ever took it in the past:
2095 */
2096 if (!check_usage_backwards(curr, this,
2097 LOCK_USED_IN_HARDIRQ, "hard"))
2098 return 0;
2099#endif
2100 if (hardirq_verbose(hlock_class(this)))
2101 ret = 2;
2102 break;
2103 case LOCK_ENABLED_SOFTIRQS_READ:
2104 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
2105 return 0; 2052 return 0;
2106#if STRICT_READ_CHECKS 2053
2107 /* 2054 if (STRICT_READ_CHECKS &&
2108 * just marked it softirq-read-unsafe, check that no 2055 !usage(curr, this, excl_bit + 1,
2109 * softirq-safe lock in the system ever took it in the past: 2056 state_name(new_bit + 1)))
2110 */
2111 if (!check_usage_backwards(curr, this,
2112 LOCK_USED_IN_SOFTIRQ, "soft"))
2113 return 0; 2057 return 0;
2114#endif
2115 if (softirq_verbose(hlock_class(this)))
2116 ret = 2;
2117 break;
2118 default:
2119 WARN_ON(1);
2120 break;
2121 } 2058 }
2122 2059
2123 return ret; 2060 if (state_verbose(new_bit, hlock_class(this)))
2061 return 2;
2062
2063 return 1;
2124} 2064}
2125 2065
2066enum mark_type {
2067#define LOCKDEP_STATE(__STATE) __STATE,
2068#include "lockdep_states.h"
2069#undef LOCKDEP_STATE
2070};
2071
2126/* 2072/*
2127 * Mark all held locks with a usage bit: 2073 * Mark all held locks with a usage bit:
2128 */ 2074 */
2129static int 2075static int
2130mark_held_locks(struct task_struct *curr, int hardirq) 2076mark_held_locks(struct task_struct *curr, enum mark_type mark)
2131{ 2077{
2132 enum lock_usage_bit usage_bit; 2078 enum lock_usage_bit usage_bit;
2133 struct held_lock *hlock; 2079 struct held_lock *hlock;
@@ -2136,17 +2082,12 @@ mark_held_locks(struct task_struct *curr, int hardirq)
2136 for (i = 0; i < curr->lockdep_depth; i++) { 2082 for (i = 0; i < curr->lockdep_depth; i++) {
2137 hlock = curr->held_locks + i; 2083 hlock = curr->held_locks + i;
2138 2084
2139 if (hardirq) { 2085 usage_bit = 2 + (mark << 2); /* ENABLED */
2140 if (hlock->read) 2086 if (hlock->read)
2141 usage_bit = LOCK_ENABLED_HARDIRQS_READ; 2087 usage_bit += 1; /* READ */
2142 else 2088
2143 usage_bit = LOCK_ENABLED_HARDIRQS; 2089 BUG_ON(usage_bit >= LOCK_USAGE_STATES);
2144 } else { 2090
2145 if (hlock->read)
2146 usage_bit = LOCK_ENABLED_SOFTIRQS_READ;
2147 else
2148 usage_bit = LOCK_ENABLED_SOFTIRQS;
2149 }
2150 if (!mark_lock(curr, hlock, usage_bit)) 2091 if (!mark_lock(curr, hlock, usage_bit))
2151 return 0; 2092 return 0;
2152 } 2093 }
@@ -2200,7 +2141,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2200 * We are going to turn hardirqs on, so set the 2141 * We are going to turn hardirqs on, so set the
2201 * usage bit for all held locks: 2142 * usage bit for all held locks:
2202 */ 2143 */
2203 if (!mark_held_locks(curr, 1)) 2144 if (!mark_held_locks(curr, HARDIRQ))
2204 return; 2145 return;
2205 /* 2146 /*
2206 * If we have softirqs enabled, then set the usage 2147 * If we have softirqs enabled, then set the usage
@@ -2208,7 +2149,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2208 * this bit from being set before) 2149 * this bit from being set before)
2209 */ 2150 */
2210 if (curr->softirqs_enabled) 2151 if (curr->softirqs_enabled)
2211 if (!mark_held_locks(curr, 0)) 2152 if (!mark_held_locks(curr, SOFTIRQ))
2212 return; 2153 return;
2213 2154
2214 curr->hardirq_enable_ip = ip; 2155 curr->hardirq_enable_ip = ip;
@@ -2288,7 +2229,7 @@ void trace_softirqs_on(unsigned long ip)
2288 * enabled too: 2229 * enabled too:
2289 */ 2230 */
2290 if (curr->hardirqs_enabled) 2231 if (curr->hardirqs_enabled)
2291 mark_held_locks(curr, 0); 2232 mark_held_locks(curr, SOFTIRQ);
2292} 2233}
2293 2234
2294/* 2235/*
@@ -2317,6 +2258,48 @@ void trace_softirqs_off(unsigned long ip)
2317 debug_atomic_inc(&redundant_softirqs_off); 2258 debug_atomic_inc(&redundant_softirqs_off);
2318} 2259}
2319 2260
2261static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2262{
2263 struct task_struct *curr = current;
2264
2265 if (unlikely(!debug_locks))
2266 return;
2267
2268 /* no reclaim without waiting on it */
2269 if (!(gfp_mask & __GFP_WAIT))
2270 return;
2271
2272 /* this guy won't enter reclaim */
2273 if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
2274 return;
2275
2276 /* We're only interested in __GFP_FS allocations for now */
2277 if (!(gfp_mask & __GFP_FS))
2278 return;
2279
2280 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
2281 return;
2282
2283 mark_held_locks(curr, RECLAIM_FS);
2284}
2285
2286static void check_flags(unsigned long flags);
2287
2288void lockdep_trace_alloc(gfp_t gfp_mask)
2289{
2290 unsigned long flags;
2291
2292 if (unlikely(current->lockdep_recursion))
2293 return;
2294
2295 raw_local_irq_save(flags);
2296 check_flags(flags);
2297 current->lockdep_recursion = 1;
2298 __lockdep_trace_alloc(gfp_mask, flags);
2299 current->lockdep_recursion = 0;
2300 raw_local_irq_restore(flags);
2301}
2302
2320static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) 2303static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
2321{ 2304{
2322 /* 2305 /*
@@ -2345,19 +2328,35 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
2345 if (!hlock->hardirqs_off) { 2328 if (!hlock->hardirqs_off) {
2346 if (hlock->read) { 2329 if (hlock->read) {
2347 if (!mark_lock(curr, hlock, 2330 if (!mark_lock(curr, hlock,
2348 LOCK_ENABLED_HARDIRQS_READ)) 2331 LOCK_ENABLED_HARDIRQ_READ))
2349 return 0; 2332 return 0;
2350 if (curr->softirqs_enabled) 2333 if (curr->softirqs_enabled)
2351 if (!mark_lock(curr, hlock, 2334 if (!mark_lock(curr, hlock,
2352 LOCK_ENABLED_SOFTIRQS_READ)) 2335 LOCK_ENABLED_SOFTIRQ_READ))
2353 return 0; 2336 return 0;
2354 } else { 2337 } else {
2355 if (!mark_lock(curr, hlock, 2338 if (!mark_lock(curr, hlock,
2356 LOCK_ENABLED_HARDIRQS)) 2339 LOCK_ENABLED_HARDIRQ))
2357 return 0; 2340 return 0;
2358 if (curr->softirqs_enabled) 2341 if (curr->softirqs_enabled)
2359 if (!mark_lock(curr, hlock, 2342 if (!mark_lock(curr, hlock,
2360 LOCK_ENABLED_SOFTIRQS)) 2343 LOCK_ENABLED_SOFTIRQ))
2344 return 0;
2345 }
2346 }
2347
2348 /*
2349 * We reuse the irq context infrastructure more broadly as a general
2350 * context checking code. This tests GFP_FS recursion (a lock taken
2351 * during reclaim for a GFP_FS allocation is held over a GFP_FS
2352 * allocation).
2353 */
2354 if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
2355 if (hlock->read) {
2356 if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
2357 return 0;
2358 } else {
2359 if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
2361 return 0; 2360 return 0;
2362 } 2361 }
2363 } 2362 }
@@ -2412,6 +2411,10 @@ static inline int separate_irq_context(struct task_struct *curr,
2412 return 0; 2411 return 0;
2413} 2412}
2414 2413
2414void lockdep_trace_alloc(gfp_t gfp_mask)
2415{
2416}
2417
2415#endif 2418#endif
2416 2419
2417/* 2420/*
@@ -2445,14 +2448,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2445 return 0; 2448 return 0;
2446 2449
2447 switch (new_bit) { 2450 switch (new_bit) {
2448 case LOCK_USED_IN_HARDIRQ: 2451#define LOCKDEP_STATE(__STATE) \
2449 case LOCK_USED_IN_SOFTIRQ: 2452 case LOCK_USED_IN_##__STATE: \
2450 case LOCK_USED_IN_HARDIRQ_READ: 2453 case LOCK_USED_IN_##__STATE##_READ: \
2451 case LOCK_USED_IN_SOFTIRQ_READ: 2454 case LOCK_ENABLED_##__STATE: \
2452 case LOCK_ENABLED_HARDIRQS: 2455 case LOCK_ENABLED_##__STATE##_READ:
2453 case LOCK_ENABLED_SOFTIRQS: 2456#include "lockdep_states.h"
2454 case LOCK_ENABLED_HARDIRQS_READ: 2457#undef LOCKDEP_STATE
2455 case LOCK_ENABLED_SOFTIRQS_READ:
2456 ret = mark_lock_irq(curr, this, new_bit); 2458 ret = mark_lock_irq(curr, this, new_bit);
2457 if (!ret) 2459 if (!ret)
2458 return 0; 2460 return 0;
@@ -2542,6 +2544,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2542 debug_locks_off(); 2544 debug_locks_off();
2543 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); 2545 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
2544 printk("turning off the locking correctness validator.\n"); 2546 printk("turning off the locking correctness validator.\n");
2547 dump_stack();
2545 return 0; 2548 return 0;
2546 } 2549 }
2547 2550
@@ -2638,6 +2641,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2638 debug_locks_off(); 2641 debug_locks_off();
2639 printk("BUG: MAX_LOCK_DEPTH too low!\n"); 2642 printk("BUG: MAX_LOCK_DEPTH too low!\n");
2640 printk("turning off the locking correctness validator.\n"); 2643 printk("turning off the locking correctness validator.\n");
2644 dump_stack();
2641 return 0; 2645 return 0;
2642 } 2646 }
2643 2647
@@ -2925,6 +2929,8 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
2925} 2929}
2926EXPORT_SYMBOL_GPL(lock_set_class); 2930EXPORT_SYMBOL_GPL(lock_set_class);
2927 2931
2932DEFINE_TRACE(lock_acquire);
2933
2928/* 2934/*
2929 * We are not always called with irqs disabled - do that here, 2935 * We are not always called with irqs disabled - do that here,
2930 * and also avoid lockdep recursion: 2936 * and also avoid lockdep recursion:
@@ -2935,6 +2941,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2935{ 2941{
2936 unsigned long flags; 2942 unsigned long flags;
2937 2943
2944 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
2945
2938 if (unlikely(current->lockdep_recursion)) 2946 if (unlikely(current->lockdep_recursion))
2939 return; 2947 return;
2940 2948
@@ -2949,11 +2957,15 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2949} 2957}
2950EXPORT_SYMBOL_GPL(lock_acquire); 2958EXPORT_SYMBOL_GPL(lock_acquire);
2951 2959
2960DEFINE_TRACE(lock_release);
2961
2952void lock_release(struct lockdep_map *lock, int nested, 2962void lock_release(struct lockdep_map *lock, int nested,
2953 unsigned long ip) 2963 unsigned long ip)
2954{ 2964{
2955 unsigned long flags; 2965 unsigned long flags;
2956 2966
2967 trace_lock_release(lock, nested, ip);
2968
2957 if (unlikely(current->lockdep_recursion)) 2969 if (unlikely(current->lockdep_recursion))
2958 return; 2970 return;
2959 2971
@@ -2966,6 +2978,16 @@ void lock_release(struct lockdep_map *lock, int nested,
2966} 2978}
2967EXPORT_SYMBOL_GPL(lock_release); 2979EXPORT_SYMBOL_GPL(lock_release);
2968 2980
2981void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
2982{
2983 current->lockdep_reclaim_gfp = gfp_mask;
2984}
2985
2986void lockdep_clear_current_reclaim_state(void)
2987{
2988 current->lockdep_reclaim_gfp = 0;
2989}
2990
2969#ifdef CONFIG_LOCK_STAT 2991#ifdef CONFIG_LOCK_STAT
2970static int 2992static int
2971print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, 2993print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
@@ -3092,10 +3114,14 @@ found_it:
3092 lock->ip = ip; 3114 lock->ip = ip;
3093} 3115}
3094 3116
3117DEFINE_TRACE(lock_contended);
3118
3095void lock_contended(struct lockdep_map *lock, unsigned long ip) 3119void lock_contended(struct lockdep_map *lock, unsigned long ip)
3096{ 3120{
3097 unsigned long flags; 3121 unsigned long flags;
3098 3122
3123 trace_lock_contended(lock, ip);
3124
3099 if (unlikely(!lock_stat)) 3125 if (unlikely(!lock_stat))
3100 return; 3126 return;
3101 3127
@@ -3111,10 +3137,14 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3111} 3137}
3112EXPORT_SYMBOL_GPL(lock_contended); 3138EXPORT_SYMBOL_GPL(lock_contended);
3113 3139
3140DEFINE_TRACE(lock_acquired);
3141
3114void lock_acquired(struct lockdep_map *lock, unsigned long ip) 3142void lock_acquired(struct lockdep_map *lock, unsigned long ip)
3115{ 3143{
3116 unsigned long flags; 3144 unsigned long flags;
3117 3145
3146 trace_lock_acquired(lock, ip);
3147
3118 if (unlikely(!lock_stat)) 3148 if (unlikely(!lock_stat))
3119 return; 3149 return;
3120 3150
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 56b196932c08..a2cc7e9a6e84 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -7,6 +7,45 @@
7 */ 7 */
8 8
9/* 9/*
10 * Lock-class usage-state bits:
11 */
12enum lock_usage_bit {
13#define LOCKDEP_STATE(__STATE) \
14 LOCK_USED_IN_##__STATE, \
15 LOCK_USED_IN_##__STATE##_READ, \
16 LOCK_ENABLED_##__STATE, \
17 LOCK_ENABLED_##__STATE##_READ,
18#include "lockdep_states.h"
19#undef LOCKDEP_STATE
20 LOCK_USED,
21 LOCK_USAGE_STATES
22};
23
24/*
25 * Usage-state bitmasks:
26 */
27#define __LOCKF(__STATE) LOCKF_##__STATE = (1 << LOCK_##__STATE),
28
29enum {
30#define LOCKDEP_STATE(__STATE) \
31 __LOCKF(USED_IN_##__STATE) \
32 __LOCKF(USED_IN_##__STATE##_READ) \
33 __LOCKF(ENABLED_##__STATE) \
34 __LOCKF(ENABLED_##__STATE##_READ)
35#include "lockdep_states.h"
36#undef LOCKDEP_STATE
37 __LOCKF(USED)
38};
39
40#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
41#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
42
43#define LOCKF_ENABLED_IRQ_READ \
44 (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
45#define LOCKF_USED_IN_IRQ_READ \
46 (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
47
48/*
10 * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies 49 * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
11 * we track. 50 * we track.
12 * 51 *
@@ -31,8 +70,10 @@
31extern struct list_head all_lock_classes; 70extern struct list_head all_lock_classes;
32extern struct lock_chain lock_chains[]; 71extern struct lock_chain lock_chains[];
33 72
34extern void 73#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2)
35get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); 74
75extern void get_usage_chars(struct lock_class *class,
76 char usage[LOCK_USAGE_CHARS]);
36 77
37extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); 78extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
38 79
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 13716b813896..d7135aa2d2c4 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -84,7 +84,7 @@ static int l_show(struct seq_file *m, void *v)
84{ 84{
85 struct lock_class *class = v; 85 struct lock_class *class = v;
86 struct lock_list *entry; 86 struct lock_list *entry;
87 char c1, c2, c3, c4; 87 char usage[LOCK_USAGE_CHARS];
88 88
89 if (v == SEQ_START_TOKEN) { 89 if (v == SEQ_START_TOKEN) {
90 seq_printf(m, "all lock classes:\n"); 90 seq_printf(m, "all lock classes:\n");
@@ -100,8 +100,8 @@ static int l_show(struct seq_file *m, void *v)
100 seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); 100 seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
101#endif 101#endif
102 102
103 get_usage_chars(class, &c1, &c2, &c3, &c4); 103 get_usage_chars(class, usage);
104 seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); 104 seq_printf(m, " %s", usage);
105 105
106 seq_printf(m, ": "); 106 seq_printf(m, ": ");
107 print_name(m, class); 107 print_name(m, class);
@@ -300,27 +300,27 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
300 nr_uncategorized++; 300 nr_uncategorized++;
301 if (class->usage_mask & LOCKF_USED_IN_IRQ) 301 if (class->usage_mask & LOCKF_USED_IN_IRQ)
302 nr_irq_safe++; 302 nr_irq_safe++;
303 if (class->usage_mask & LOCKF_ENABLED_IRQS) 303 if (class->usage_mask & LOCKF_ENABLED_IRQ)
304 nr_irq_unsafe++; 304 nr_irq_unsafe++;
305 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) 305 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
306 nr_softirq_safe++; 306 nr_softirq_safe++;
307 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) 307 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ)
308 nr_softirq_unsafe++; 308 nr_softirq_unsafe++;
309 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) 309 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
310 nr_hardirq_safe++; 310 nr_hardirq_safe++;
311 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) 311 if (class->usage_mask & LOCKF_ENABLED_HARDIRQ)
312 nr_hardirq_unsafe++; 312 nr_hardirq_unsafe++;
313 if (class->usage_mask & LOCKF_USED_IN_IRQ_READ) 313 if (class->usage_mask & LOCKF_USED_IN_IRQ_READ)
314 nr_irq_read_safe++; 314 nr_irq_read_safe++;
315 if (class->usage_mask & LOCKF_ENABLED_IRQS_READ) 315 if (class->usage_mask & LOCKF_ENABLED_IRQ_READ)
316 nr_irq_read_unsafe++; 316 nr_irq_read_unsafe++;
317 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) 317 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ)
318 nr_softirq_read_safe++; 318 nr_softirq_read_safe++;
319 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) 319 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
320 nr_softirq_read_unsafe++; 320 nr_softirq_read_unsafe++;
321 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) 321 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ)
322 nr_hardirq_read_safe++; 322 nr_hardirq_read_safe++;
323 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) 323 if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
324 nr_hardirq_read_unsafe++; 324 nr_hardirq_read_unsafe++;
325 325
326#ifdef CONFIG_PROVE_LOCKING 326#ifdef CONFIG_PROVE_LOCKING
@@ -601,6 +601,10 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
601static void seq_header(struct seq_file *m) 601static void seq_header(struct seq_file *m)
602{ 602{
603 seq_printf(m, "lock_stat version 0.3\n"); 603 seq_printf(m, "lock_stat version 0.3\n");
604
605 if (unlikely(!debug_locks))
606 seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n");
607
604 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 608 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
605 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " 609 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
606 "%14s %14s\n", 610 "%14s %14s\n",
diff --git a/kernel/lockdep_states.h b/kernel/lockdep_states.h
new file mode 100644
index 000000000000..995b0cc2b84c
--- /dev/null
+++ b/kernel/lockdep_states.h
@@ -0,0 +1,9 @@
1/*
2 * Lockdep states,
3 *
4 * please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever
5 * you add one, or come up with a nice dynamic solution.
6 */
7LOCKDEP_STATE(HARDIRQ)
8LOCKDEP_STATE(SOFTIRQ)
9LOCKDEP_STATE(RECLAIM_FS)
diff --git a/kernel/module.c b/kernel/module.c
index c9332c90d5a0..05f014efa32c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -51,6 +51,7 @@
51#include <linux/tracepoint.h> 51#include <linux/tracepoint.h>
52#include <linux/ftrace.h> 52#include <linux/ftrace.h>
53#include <linux/async.h> 53#include <linux/async.h>
54#include <linux/percpu.h>
54 55
55#if 0 56#if 0
56#define DEBUGP printk 57#define DEBUGP printk
@@ -67,7 +68,8 @@
67 68
68/* List of modules, protected by module_mutex or preempt_disable 69/* List of modules, protected by module_mutex or preempt_disable
69 * (delete uses stop_machine/add uses RCU list operations). */ 70 * (delete uses stop_machine/add uses RCU list operations). */
70static DEFINE_MUTEX(module_mutex); 71DEFINE_MUTEX(module_mutex);
72EXPORT_SYMBOL_GPL(module_mutex);
71static LIST_HEAD(modules); 73static LIST_HEAD(modules);
72 74
73/* Waiting for a module to finish initializing? */ 75/* Waiting for a module to finish initializing? */
@@ -75,7 +77,7 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
75 77
76static BLOCKING_NOTIFIER_HEAD(module_notify_list); 78static BLOCKING_NOTIFIER_HEAD(module_notify_list);
77 79
78/* Bounds of module allocation, for speeding __module_text_address */ 80/* Bounds of module allocation, for speeding __module_address */
79static unsigned long module_addr_min = -1UL, module_addr_max = 0; 81static unsigned long module_addr_min = -1UL, module_addr_max = 0;
80 82
81int register_module_notifier(struct notifier_block * nb) 83int register_module_notifier(struct notifier_block * nb)
@@ -185,17 +187,6 @@ extern const unsigned long __start___kcrctab_unused_gpl[];
185#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) 187#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
186#endif 188#endif
187 189
188struct symsearch {
189 const struct kernel_symbol *start, *stop;
190 const unsigned long *crcs;
191 enum {
192 NOT_GPL_ONLY,
193 GPL_ONLY,
194 WILL_BE_GPL_ONLY,
195 } licence;
196 bool unused;
197};
198
199static bool each_symbol_in_section(const struct symsearch *arr, 190static bool each_symbol_in_section(const struct symsearch *arr,
200 unsigned int arrsize, 191 unsigned int arrsize,
201 struct module *owner, 192 struct module *owner,
@@ -216,10 +207,8 @@ static bool each_symbol_in_section(const struct symsearch *arr,
216} 207}
217 208
218/* Returns true as soon as fn returns true, otherwise false. */ 209/* Returns true as soon as fn returns true, otherwise false. */
219static bool each_symbol(bool (*fn)(const struct symsearch *arr, 210bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
220 struct module *owner, 211 unsigned int symnum, void *data), void *data)
221 unsigned int symnum, void *data),
222 void *data)
223{ 212{
224 struct module *mod; 213 struct module *mod;
225 const struct symsearch arr[] = { 214 const struct symsearch arr[] = {
@@ -272,6 +261,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr,
272 } 261 }
273 return false; 262 return false;
274} 263}
264EXPORT_SYMBOL_GPL(each_symbol);
275 265
276struct find_symbol_arg { 266struct find_symbol_arg {
277 /* Input */ 267 /* Input */
@@ -282,7 +272,7 @@ struct find_symbol_arg {
282 /* Output */ 272 /* Output */
283 struct module *owner; 273 struct module *owner;
284 const unsigned long *crc; 274 const unsigned long *crc;
285 unsigned long value; 275 const struct kernel_symbol *sym;
286}; 276};
287 277
288static bool find_symbol_in_section(const struct symsearch *syms, 278static bool find_symbol_in_section(const struct symsearch *syms,
@@ -323,17 +313,17 @@ static bool find_symbol_in_section(const struct symsearch *syms,
323 313
324 fsa->owner = owner; 314 fsa->owner = owner;
325 fsa->crc = symversion(syms->crcs, symnum); 315 fsa->crc = symversion(syms->crcs, symnum);
326 fsa->value = syms->start[symnum].value; 316 fsa->sym = &syms->start[symnum];
327 return true; 317 return true;
328} 318}
329 319
330/* Find a symbol, return value, (optional) crc and (optional) module 320/* Find a symbol and return it, along with, (optional) crc and
331 * which owns it */ 321 * (optional) module which owns it */
332static unsigned long find_symbol(const char *name, 322const struct kernel_symbol *find_symbol(const char *name,
333 struct module **owner, 323 struct module **owner,
334 const unsigned long **crc, 324 const unsigned long **crc,
335 bool gplok, 325 bool gplok,
336 bool warn) 326 bool warn)
337{ 327{
338 struct find_symbol_arg fsa; 328 struct find_symbol_arg fsa;
339 329
@@ -346,15 +336,16 @@ static unsigned long find_symbol(const char *name,
346 *owner = fsa.owner; 336 *owner = fsa.owner;
347 if (crc) 337 if (crc)
348 *crc = fsa.crc; 338 *crc = fsa.crc;
349 return fsa.value; 339 return fsa.sym;
350 } 340 }
351 341
352 DEBUGP("Failed to find symbol %s\n", name); 342 DEBUGP("Failed to find symbol %s\n", name);
353 return -ENOENT; 343 return NULL;
354} 344}
345EXPORT_SYMBOL_GPL(find_symbol);
355 346
356/* Search for module by name: must hold module_mutex. */ 347/* Search for module by name: must hold module_mutex. */
357static struct module *find_module(const char *name) 348struct module *find_module(const char *name)
358{ 349{
359 struct module *mod; 350 struct module *mod;
360 351
@@ -364,8 +355,37 @@ static struct module *find_module(const char *name)
364 } 355 }
365 return NULL; 356 return NULL;
366} 357}
358EXPORT_SYMBOL_GPL(find_module);
367 359
368#ifdef CONFIG_SMP 360#ifdef CONFIG_SMP
361
362#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
363
364static void *percpu_modalloc(unsigned long size, unsigned long align,
365 const char *name)
366{
367 void *ptr;
368
369 if (align > PAGE_SIZE) {
370 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
371 name, align, PAGE_SIZE);
372 align = PAGE_SIZE;
373 }
374
375 ptr = __alloc_reserved_percpu(size, align);
376 if (!ptr)
377 printk(KERN_WARNING
378 "Could not allocate %lu bytes percpu data\n", size);
379 return ptr;
380}
381
382static void percpu_modfree(void *freeme)
383{
384 free_percpu(freeme);
385}
386
387#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
388
369/* Number of blocks used and allocated. */ 389/* Number of blocks used and allocated. */
370static unsigned int pcpu_num_used, pcpu_num_allocated; 390static unsigned int pcpu_num_used, pcpu_num_allocated;
371/* Size of each block. -ve means used. */ 391/* Size of each block. -ve means used. */
@@ -480,21 +500,6 @@ static void percpu_modfree(void *freeme)
480 } 500 }
481} 501}
482 502
483static unsigned int find_pcpusec(Elf_Ehdr *hdr,
484 Elf_Shdr *sechdrs,
485 const char *secstrings)
486{
487 return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
488}
489
490static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
491{
492 int cpu;
493
494 for_each_possible_cpu(cpu)
495 memcpy(pcpudest + per_cpu_offset(cpu), from, size);
496}
497
498static int percpu_modinit(void) 503static int percpu_modinit(void)
499{ 504{
500 pcpu_num_used = 2; 505 pcpu_num_used = 2;
@@ -513,7 +518,26 @@ static int percpu_modinit(void)
513 return 0; 518 return 0;
514} 519}
515__initcall(percpu_modinit); 520__initcall(percpu_modinit);
521
522#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
523
524static unsigned int find_pcpusec(Elf_Ehdr *hdr,
525 Elf_Shdr *sechdrs,
526 const char *secstrings)
527{
528 return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
529}
530
531static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
532{
533 int cpu;
534
535 for_each_possible_cpu(cpu)
536 memcpy(pcpudest + per_cpu_offset(cpu), from, size);
537}
538
516#else /* ... !CONFIG_SMP */ 539#else /* ... !CONFIG_SMP */
540
517static inline void *percpu_modalloc(unsigned long size, unsigned long align, 541static inline void *percpu_modalloc(unsigned long size, unsigned long align,
518 const char *name) 542 const char *name)
519{ 543{
@@ -535,6 +559,7 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
535 /* pcpusec should be 0, and size of that section should be 0. */ 559 /* pcpusec should be 0, and size of that section should be 0. */
536 BUG_ON(size != 0); 560 BUG_ON(size != 0);
537} 561}
562
538#endif /* CONFIG_SMP */ 563#endif /* CONFIG_SMP */
539 564
540#define MODINFO_ATTR(field) \ 565#define MODINFO_ATTR(field) \
@@ -573,13 +598,13 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
573/* Init the unload section of the module. */ 598/* Init the unload section of the module. */
574static void module_unload_init(struct module *mod) 599static void module_unload_init(struct module *mod)
575{ 600{
576 unsigned int i; 601 int cpu;
577 602
578 INIT_LIST_HEAD(&mod->modules_which_use_me); 603 INIT_LIST_HEAD(&mod->modules_which_use_me);
579 for (i = 0; i < NR_CPUS; i++) 604 for_each_possible_cpu(cpu)
580 local_set(&mod->ref[i].count, 0); 605 local_set(__module_ref_addr(mod, cpu), 0);
581 /* Hold reference count during initialization. */ 606 /* Hold reference count during initialization. */
582 local_set(&mod->ref[raw_smp_processor_id()].count, 1); 607 local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
583 /* Backwards compatibility macros put refcount during init. */ 608 /* Backwards compatibility macros put refcount during init. */
584 mod->waiter = current; 609 mod->waiter = current;
585} 610}
@@ -607,7 +632,7 @@ static int already_uses(struct module *a, struct module *b)
607} 632}
608 633
609/* Module a uses b */ 634/* Module a uses b */
610static int use_module(struct module *a, struct module *b) 635int use_module(struct module *a, struct module *b)
611{ 636{
612 struct module_use *use; 637 struct module_use *use;
613 int no_warn, err; 638 int no_warn, err;
@@ -640,6 +665,7 @@ static int use_module(struct module *a, struct module *b)
640 no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); 665 no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
641 return 1; 666 return 1;
642} 667}
668EXPORT_SYMBOL_GPL(use_module);
643 669
644/* Clear the unload stuff of the module. */ 670/* Clear the unload stuff of the module. */
645static void module_unload_free(struct module *mod) 671static void module_unload_free(struct module *mod)
@@ -717,10 +743,11 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
717 743
718unsigned int module_refcount(struct module *mod) 744unsigned int module_refcount(struct module *mod)
719{ 745{
720 unsigned int i, total = 0; 746 unsigned int total = 0;
747 int cpu;
721 748
722 for (i = 0; i < NR_CPUS; i++) 749 for_each_possible_cpu(cpu)
723 total += local_read(&mod->ref[i].count); 750 total += local_read(__module_ref_addr(mod, cpu));
724 return total; 751 return total;
725} 752}
726EXPORT_SYMBOL(module_refcount); 753EXPORT_SYMBOL(module_refcount);
@@ -743,8 +770,8 @@ static void wait_for_zero_refcount(struct module *mod)
743 mutex_lock(&module_mutex); 770 mutex_lock(&module_mutex);
744} 771}
745 772
746asmlinkage long 773SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
747sys_delete_module(const char __user *name_user, unsigned int flags) 774 unsigned int, flags)
748{ 775{
749 struct module *mod; 776 struct module *mod;
750 char name[MODULE_NAME_LEN]; 777 char name[MODULE_NAME_LEN];
@@ -821,7 +848,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
821 mutex_lock(&module_mutex); 848 mutex_lock(&module_mutex);
822 /* Store the name of the last unloaded module for diagnostic purposes */ 849 /* Store the name of the last unloaded module for diagnostic purposes */
823 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 850 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
824 unregister_dynamic_debug_module(mod->name); 851 ddebug_remove_module(mod->name);
825 free_module(mod); 852 free_module(mod);
826 853
827 out: 854 out:
@@ -859,7 +886,7 @@ void __symbol_put(const char *symbol)
859 struct module *owner; 886 struct module *owner;
860 887
861 preempt_disable(); 888 preempt_disable();
862 if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false))) 889 if (!find_symbol(symbol, &owner, NULL, true, false))
863 BUG(); 890 BUG();
864 module_put(owner); 891 module_put(owner);
865 preempt_enable(); 892 preempt_enable();
@@ -873,8 +900,10 @@ void symbol_put_addr(void *addr)
873 if (core_kernel_text((unsigned long)addr)) 900 if (core_kernel_text((unsigned long)addr))
874 return; 901 return;
875 902
876 if (!(modaddr = module_text_address((unsigned long)addr))) 903 /* module_text_address is safe here: we're supposed to have reference
877 BUG(); 904 * to module from symbol_get, so it can't go away. */
905 modaddr = __module_text_address((unsigned long)addr);
906 BUG_ON(!modaddr);
878 module_put(modaddr); 907 module_put(modaddr);
879} 908}
880EXPORT_SYMBOL_GPL(symbol_put_addr); 909EXPORT_SYMBOL_GPL(symbol_put_addr);
@@ -894,7 +923,7 @@ void module_put(struct module *module)
894{ 923{
895 if (module) { 924 if (module) {
896 unsigned int cpu = get_cpu(); 925 unsigned int cpu = get_cpu();
897 local_dec(&module->ref[cpu].count); 926 local_dec(__module_ref_addr(module, cpu));
898 /* Maybe they're waiting for us to drop reference? */ 927 /* Maybe they're waiting for us to drop reference? */
899 if (unlikely(!module_is_live(module))) 928 if (unlikely(!module_is_live(module)))
900 wake_up_process(module->waiter); 929 wake_up_process(module->waiter);
@@ -914,10 +943,11 @@ static inline void module_unload_free(struct module *mod)
914{ 943{
915} 944}
916 945
917static inline int use_module(struct module *a, struct module *b) 946int use_module(struct module *a, struct module *b)
918{ 947{
919 return strong_try_module_get(b) == 0; 948 return strong_try_module_get(b) == 0;
920} 949}
950EXPORT_SYMBOL_GPL(use_module);
921 951
922static inline void module_unload_init(struct module *mod) 952static inline void module_unload_init(struct module *mod)
923{ 953{
@@ -960,12 +990,12 @@ static struct module_attribute *modinfo_attrs[] = {
960 990
961static const char vermagic[] = VERMAGIC_STRING; 991static const char vermagic[] = VERMAGIC_STRING;
962 992
963static int try_to_force_load(struct module *mod, const char *symname) 993static int try_to_force_load(struct module *mod, const char *reason)
964{ 994{
965#ifdef CONFIG_MODULE_FORCE_LOAD 995#ifdef CONFIG_MODULE_FORCE_LOAD
966 if (!test_taint(TAINT_FORCED_MODULE)) 996 if (!test_taint(TAINT_FORCED_MODULE))
967 printk("%s: no version for \"%s\" found: kernel tainted.\n", 997 printk(KERN_WARNING "%s: %s: kernel tainted.\n",
968 mod->name, symname); 998 mod->name, reason);
969 add_taint_module(mod, TAINT_FORCED_MODULE); 999 add_taint_module(mod, TAINT_FORCED_MODULE);
970 return 0; 1000 return 0;
971#else 1001#else
@@ -1022,9 +1052,9 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1022{ 1052{
1023 const unsigned long *crc; 1053 const unsigned long *crc;
1024 1054
1025 if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false))) 1055 if (!find_symbol("module_layout", NULL, &crc, true, false))
1026 BUG(); 1056 BUG();
1027 return check_version(sechdrs, versindex, "struct_module", mod, crc); 1057 return check_version(sechdrs, versindex, "module_layout", mod, crc);
1028} 1058}
1029 1059
1030/* First part is kernel version, which we ignore if module has crcs. */ 1060/* First part is kernel version, which we ignore if module has crcs. */
@@ -1063,25 +1093,25 @@ static inline int same_magic(const char *amagic, const char *bmagic,
1063 1093
1064/* Resolve a symbol for this module. I.e. if we find one, record usage. 1094/* Resolve a symbol for this module. I.e. if we find one, record usage.
1065 Must be holding module_mutex. */ 1095 Must be holding module_mutex. */
1066static unsigned long resolve_symbol(Elf_Shdr *sechdrs, 1096static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1067 unsigned int versindex, 1097 unsigned int versindex,
1068 const char *name, 1098 const char *name,
1069 struct module *mod) 1099 struct module *mod)
1070{ 1100{
1071 struct module *owner; 1101 struct module *owner;
1072 unsigned long ret; 1102 const struct kernel_symbol *sym;
1073 const unsigned long *crc; 1103 const unsigned long *crc;
1074 1104
1075 ret = find_symbol(name, &owner, &crc, 1105 sym = find_symbol(name, &owner, &crc,
1076 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); 1106 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
1077 if (!IS_ERR_VALUE(ret)) { 1107 /* use_module can fail due to OOM,
1078 /* use_module can fail due to OOM, 1108 or module initialization or unloading */
1079 or module initialization or unloading */ 1109 if (sym) {
1080 if (!check_version(sechdrs, versindex, name, mod, crc) || 1110 if (!check_version(sechdrs, versindex, name, mod, crc) ||
1081 !use_module(mod, owner)) 1111 !use_module(mod, owner))
1082 ret = -EINVAL; 1112 sym = NULL;
1083 } 1113 }
1084 return ret; 1114 return sym;
1085} 1115}
1086 1116
1087/* 1117/*
@@ -1456,6 +1486,9 @@ static void free_module(struct module *mod)
1456 /* Module unload stuff */ 1486 /* Module unload stuff */
1457 module_unload_free(mod); 1487 module_unload_free(mod);
1458 1488
1489 /* Free any allocated parameters. */
1490 destroy_params(mod->kp, mod->num_kp);
1491
1459 /* release any pointers to mcount in this module */ 1492 /* release any pointers to mcount in this module */
1460 ftrace_release(mod->module_core, mod->core_size); 1493 ftrace_release(mod->module_core, mod->core_size);
1461 1494
@@ -1464,7 +1497,10 @@ static void free_module(struct module *mod)
1464 kfree(mod->args); 1497 kfree(mod->args);
1465 if (mod->percpu) 1498 if (mod->percpu)
1466 percpu_modfree(mod->percpu); 1499 percpu_modfree(mod->percpu);
1467 1500#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
1501 if (mod->refptr)
1502 percpu_modfree(mod->refptr);
1503#endif
1468 /* Free lock-classes: */ 1504 /* Free lock-classes: */
1469 lockdep_free_key_range(mod->module_core, mod->core_size); 1505 lockdep_free_key_range(mod->module_core, mod->core_size);
1470 1506
@@ -1475,17 +1511,15 @@ static void free_module(struct module *mod)
1475void *__symbol_get(const char *symbol) 1511void *__symbol_get(const char *symbol)
1476{ 1512{
1477 struct module *owner; 1513 struct module *owner;
1478 unsigned long value; 1514 const struct kernel_symbol *sym;
1479 1515
1480 preempt_disable(); 1516 preempt_disable();
1481 value = find_symbol(symbol, &owner, NULL, true, true); 1517 sym = find_symbol(symbol, &owner, NULL, true, true);
1482 if (IS_ERR_VALUE(value)) 1518 if (sym && strong_try_module_get(owner))
1483 value = 0; 1519 sym = NULL;
1484 else if (strong_try_module_get(owner))
1485 value = 0;
1486 preempt_enable(); 1520 preempt_enable();
1487 1521
1488 return (void *)value; 1522 return sym ? (void *)sym->value : NULL;
1489} 1523}
1490EXPORT_SYMBOL_GPL(__symbol_get); 1524EXPORT_SYMBOL_GPL(__symbol_get);
1491 1525
@@ -1513,8 +1547,7 @@ static int verify_export_symbols(struct module *mod)
1513 1547
1514 for (i = 0; i < ARRAY_SIZE(arr); i++) { 1548 for (i = 0; i < ARRAY_SIZE(arr); i++) {
1515 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { 1549 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
1516 if (!IS_ERR_VALUE(find_symbol(s->name, &owner, 1550 if (find_symbol(s->name, &owner, NULL, true, false)) {
1517 NULL, true, false))) {
1518 printk(KERN_ERR 1551 printk(KERN_ERR
1519 "%s: exports duplicate symbol %s" 1552 "%s: exports duplicate symbol %s"
1520 " (owned by %s)\n", 1553 " (owned by %s)\n",
@@ -1538,6 +1571,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1538 unsigned long secbase; 1571 unsigned long secbase;
1539 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1572 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
1540 int ret = 0; 1573 int ret = 0;
1574 const struct kernel_symbol *ksym;
1541 1575
1542 for (i = 1; i < n; i++) { 1576 for (i = 1; i < n; i++) {
1543 switch (sym[i].st_shndx) { 1577 switch (sym[i].st_shndx) {
@@ -1557,13 +1591,14 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1557 break; 1591 break;
1558 1592
1559 case SHN_UNDEF: 1593 case SHN_UNDEF:
1560 sym[i].st_value 1594 ksym = resolve_symbol(sechdrs, versindex,
1561 = resolve_symbol(sechdrs, versindex, 1595 strtab + sym[i].st_name, mod);
1562 strtab + sym[i].st_name, mod);
1563
1564 /* Ok if resolved. */ 1596 /* Ok if resolved. */
1565 if (!IS_ERR_VALUE(sym[i].st_value)) 1597 if (ksym) {
1598 sym[i].st_value = ksym->value;
1566 break; 1599 break;
1600 }
1601
1567 /* Ok if weak. */ 1602 /* Ok if weak. */
1568 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) 1603 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
1569 break; 1604 break;
@@ -1638,8 +1673,7 @@ static void layout_sections(struct module *mod,
1638 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1673 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1639 || (s->sh_flags & masks[m][1]) 1674 || (s->sh_flags & masks[m][1])
1640 || s->sh_entsize != ~0UL 1675 || s->sh_entsize != ~0UL
1641 || strncmp(secstrings + s->sh_name, 1676 || strstarts(secstrings + s->sh_name, ".init"))
1642 ".init", 5) == 0)
1643 continue; 1677 continue;
1644 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1678 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1645 DEBUGP("\t%s\n", secstrings + s->sh_name); 1679 DEBUGP("\t%s\n", secstrings + s->sh_name);
@@ -1656,8 +1690,7 @@ static void layout_sections(struct module *mod,
1656 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1690 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1657 || (s->sh_flags & masks[m][1]) 1691 || (s->sh_flags & masks[m][1])
1658 || s->sh_entsize != ~0UL 1692 || s->sh_entsize != ~0UL
1659 || strncmp(secstrings + s->sh_name, 1693 || !strstarts(secstrings + s->sh_name, ".init"))
1660 ".init", 5) != 0)
1661 continue; 1694 continue;
1662 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 1695 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
1663 | INIT_OFFSET_MASK); 1696 | INIT_OFFSET_MASK);
@@ -1790,8 +1823,7 @@ static char elf_type(const Elf_Sym *sym,
1790 else 1823 else
1791 return 'b'; 1824 return 'b';
1792 } 1825 }
1793 if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name, 1826 if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug"))
1794 ".debug", strlen(".debug")) == 0)
1795 return 'n'; 1827 return 'n';
1796 return '?'; 1828 return '?';
1797} 1829}
@@ -1823,19 +1855,13 @@ static inline void add_kallsyms(struct module *mod,
1823} 1855}
1824#endif /* CONFIG_KALLSYMS */ 1856#endif /* CONFIG_KALLSYMS */
1825 1857
1826static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num) 1858static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
1827{ 1859{
1828#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG 1860#ifdef CONFIG_DYNAMIC_DEBUG
1829 unsigned int i; 1861 if (ddebug_add_module(debug, num, debug->modname))
1830 1862 printk(KERN_ERR "dynamic debug error adding module: %s\n",
1831 for (i = 0; i < num; i++) { 1863 debug->modname);
1832 register_dynamic_debug_module(debug[i].modname, 1864#endif
1833 debug[i].type,
1834 debug[i].logical_modname,
1835 debug[i].flag_names,
1836 debug[i].hash, debug[i].hash2);
1837 }
1838#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
1839} 1865}
1840 1866
1841static void *module_alloc_update_bounds(unsigned long size) 1867static void *module_alloc_update_bounds(unsigned long size)
@@ -1866,8 +1892,7 @@ static noinline struct module *load_module(void __user *umod,
1866 unsigned int symindex = 0; 1892 unsigned int symindex = 0;
1867 unsigned int strindex = 0; 1893 unsigned int strindex = 0;
1868 unsigned int modindex, versindex, infoindex, pcpuindex; 1894 unsigned int modindex, versindex, infoindex, pcpuindex;
1869 unsigned int num_kp, num_mcount; 1895 unsigned int num_mcount;
1870 struct kernel_param *kp;
1871 struct module *mod; 1896 struct module *mod;
1872 long err = 0; 1897 long err = 0;
1873 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1898 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1884,12 +1909,6 @@ static noinline struct module *load_module(void __user *umod,
1884 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) 1909 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
1885 return ERR_PTR(-ENOMEM); 1910 return ERR_PTR(-ENOMEM);
1886 1911
1887 /* Create stop_machine threads since the error path relies on
1888 * a non-failing stop_machine call. */
1889 err = stop_machine_create();
1890 if (err)
1891 goto free_hdr;
1892
1893 if (copy_from_user(hdr, umod, len) != 0) { 1912 if (copy_from_user(hdr, umod, len) != 0) {
1894 err = -EFAULT; 1913 err = -EFAULT;
1895 goto free_hdr; 1914 goto free_hdr;
@@ -1930,7 +1949,7 @@ static noinline struct module *load_module(void __user *umod,
1930 } 1949 }
1931#ifndef CONFIG_MODULE_UNLOAD 1950#ifndef CONFIG_MODULE_UNLOAD
1932 /* Don't load .exit sections */ 1951 /* Don't load .exit sections */
1933 if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0) 1952 if (strstarts(secstrings+sechdrs[i].sh_name, ".exit"))
1934 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; 1953 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
1935#endif 1954#endif
1936 } 1955 }
@@ -1974,7 +1993,7 @@ static noinline struct module *load_module(void __user *umod,
1974 modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); 1993 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
1975 /* This is allowed: modprobe --force will invalidate it. */ 1994 /* This is allowed: modprobe --force will invalidate it. */
1976 if (!modmagic) { 1995 if (!modmagic) {
1977 err = try_to_force_load(mod, "magic"); 1996 err = try_to_force_load(mod, "bad vermagic");
1978 if (err) 1997 if (err)
1979 goto free_hdr; 1998 goto free_hdr;
1980 } else if (!same_magic(modmagic, vermagic, versindex)) { 1999 } else if (!same_magic(modmagic, vermagic, versindex)) {
@@ -2070,6 +2089,14 @@ static noinline struct module *load_module(void __user *umod,
2070 /* Module has been moved. */ 2089 /* Module has been moved. */
2071 mod = (void *)sechdrs[modindex].sh_addr; 2090 mod = (void *)sechdrs[modindex].sh_addr;
2072 2091
2092#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2093 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
2094 mod->name);
2095 if (!mod->refptr) {
2096 err = -ENOMEM;
2097 goto free_init;
2098 }
2099#endif
2073 /* Now we've moved module, initialize linked lists, etc. */ 2100 /* Now we've moved module, initialize linked lists, etc. */
2074 module_unload_init(mod); 2101 module_unload_init(mod);
2075 2102
@@ -2104,8 +2131,8 @@ static noinline struct module *load_module(void __user *umod,
2104 2131
2105 /* Now we've got everything in the final locations, we can 2132 /* Now we've got everything in the final locations, we can
2106 * find optional sections. */ 2133 * find optional sections. */
2107 kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp), 2134 mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
2108 &num_kp); 2135 sizeof(*mod->kp), &mod->num_kp);
2109 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab", 2136 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
2110 sizeof(*mod->syms), &mod->num_syms); 2137 sizeof(*mod->syms), &mod->num_syms);
2111 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab"); 2138 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
@@ -2155,8 +2182,8 @@ static noinline struct module *load_module(void __user *umod,
2155 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) 2182 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
2156#endif 2183#endif
2157 ) { 2184 ) {
2158 printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); 2185 err = try_to_force_load(mod,
2159 err = try_to_force_load(mod, "nocrc"); 2186 "no versions for exported symbols");
2160 if (err) 2187 if (err)
2161 goto cleanup; 2188 goto cleanup;
2162 } 2189 }
@@ -2201,12 +2228,13 @@ static noinline struct module *load_module(void __user *umod,
2201 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); 2228 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
2202 2229
2203 if (!mod->taints) { 2230 if (!mod->taints) {
2204 struct mod_debug *debug; 2231 struct _ddebug *debug;
2205 unsigned int num_debug; 2232 unsigned int num_debug;
2206 2233
2207 debug = section_objs(hdr, sechdrs, secstrings, "__verbose", 2234 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2208 sizeof(*debug), &num_debug); 2235 sizeof(*debug), &num_debug);
2209 dynamic_printk_setup(debug, num_debug); 2236 if (debug)
2237 dynamic_debug_setup(debug, num_debug);
2210 } 2238 }
2211 2239
2212 /* sechdrs[0].sh_size is always zero */ 2240 /* sechdrs[0].sh_size is always zero */
@@ -2250,11 +2278,11 @@ static noinline struct module *load_module(void __user *umod,
2250 */ 2278 */
2251 list_add_rcu(&mod->list, &modules); 2279 list_add_rcu(&mod->list, &modules);
2252 2280
2253 err = parse_args(mod->name, mod->args, kp, num_kp, NULL); 2281 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
2254 if (err < 0) 2282 if (err < 0)
2255 goto unlink; 2283 goto unlink;
2256 2284
2257 err = mod_sysfs_setup(mod, kp, num_kp); 2285 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp);
2258 if (err < 0) 2286 if (err < 0)
2259 goto unlink; 2287 goto unlink;
2260 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2288 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
@@ -2263,12 +2291,13 @@ static noinline struct module *load_module(void __user *umod,
2263 /* Get rid of temporary copy */ 2291 /* Get rid of temporary copy */
2264 vfree(hdr); 2292 vfree(hdr);
2265 2293
2266 stop_machine_destroy();
2267 /* Done! */ 2294 /* Done! */
2268 return mod; 2295 return mod;
2269 2296
2270 unlink: 2297 unlink:
2271 stop_machine(__unlink_module, mod, NULL); 2298 /* Unlink carefully: kallsyms could be walking list. */
2299 list_del_rcu(&mod->list);
2300 synchronize_sched();
2272 module_arch_cleanup(mod); 2301 module_arch_cleanup(mod);
2273 cleanup: 2302 cleanup:
2274 kobject_del(&mod->mkobj.kobj); 2303 kobject_del(&mod->mkobj.kobj);
@@ -2276,9 +2305,14 @@ static noinline struct module *load_module(void __user *umod,
2276 ftrace_release(mod->module_core, mod->core_size); 2305 ftrace_release(mod->module_core, mod->core_size);
2277 free_unload: 2306 free_unload:
2278 module_unload_free(mod); 2307 module_unload_free(mod);
2308#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2309 free_init:
2310 percpu_modfree(mod->refptr);
2311#endif
2279 module_free(mod, mod->module_init); 2312 module_free(mod, mod->module_init);
2280 free_core: 2313 free_core:
2281 module_free(mod, mod->module_core); 2314 module_free(mod, mod->module_core);
2315 /* mod will be freed with core. Don't access it beyond this line! */
2282 free_percpu: 2316 free_percpu:
2283 if (percpu) 2317 if (percpu)
2284 percpu_modfree(percpu); 2318 percpu_modfree(percpu);
@@ -2286,7 +2320,6 @@ static noinline struct module *load_module(void __user *umod,
2286 kfree(args); 2320 kfree(args);
2287 free_hdr: 2321 free_hdr:
2288 vfree(hdr); 2322 vfree(hdr);
2289 stop_machine_destroy();
2290 return ERR_PTR(err); 2323 return ERR_PTR(err);
2291 2324
2292 truncated: 2325 truncated:
@@ -2296,10 +2329,8 @@ static noinline struct module *load_module(void __user *umod,
2296} 2329}
2297 2330
2298/* This is where the real work happens */ 2331/* This is where the real work happens */
2299asmlinkage long 2332SYSCALL_DEFINE3(init_module, void __user *, umod,
2300sys_init_module(void __user *umod, 2333 unsigned long, len, const char __user *, uargs)
2301 unsigned long len,
2302 const char __user *uargs)
2303{ 2334{
2304 struct module *mod; 2335 struct module *mod;
2305 int ret = 0; 2336 int ret = 0;
@@ -2565,6 +2596,25 @@ unsigned long module_kallsyms_lookup_name(const char *name)
2565 preempt_enable(); 2596 preempt_enable();
2566 return ret; 2597 return ret;
2567} 2598}
2599
2600int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
2601 struct module *, unsigned long),
2602 void *data)
2603{
2604 struct module *mod;
2605 unsigned int i;
2606 int ret;
2607
2608 list_for_each_entry(mod, &modules, list) {
2609 for (i = 0; i < mod->num_symtab; i++) {
2610 ret = fn(data, mod->strtab + mod->symtab[i].st_name,
2611 mod, mod->symtab[i].st_value);
2612 if (ret != 0)
2613 return ret;
2614 }
2615 }
2616 return 0;
2617}
2568#endif /* CONFIG_KALLSYMS */ 2618#endif /* CONFIG_KALLSYMS */
2569 2619
2570static char *module_flags(struct module *mod, char *buf) 2620static char *module_flags(struct module *mod, char *buf)
@@ -2700,29 +2750,31 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2700} 2750}
2701 2751
2702/* 2752/*
2703 * Is this a valid module address? 2753 * is_module_address - is this address inside a module?
2754 * @addr: the address to check.
2755 *
2756 * See is_module_text_address() if you simply want to see if the address
2757 * is code (not data).
2704 */ 2758 */
2705int is_module_address(unsigned long addr) 2759bool is_module_address(unsigned long addr)
2706{ 2760{
2707 struct module *mod; 2761 bool ret;
2708 2762
2709 preempt_disable(); 2763 preempt_disable();
2710 2764 ret = __module_address(addr) != NULL;
2711 list_for_each_entry_rcu(mod, &modules, list) {
2712 if (within_module_core(addr, mod)) {
2713 preempt_enable();
2714 return 1;
2715 }
2716 }
2717
2718 preempt_enable(); 2765 preempt_enable();
2719 2766
2720 return 0; 2767 return ret;
2721} 2768}
2722 2769
2723 2770/*
2724/* Is this a valid kernel address? */ 2771 * __module_address - get the module which contains an address.
2725__notrace_funcgraph struct module *__module_text_address(unsigned long addr) 2772 * @addr: the address.
2773 *
2774 * Must be called with preempt disabled or module mutex held so that
2775 * module doesn't get freed during this.
2776 */
2777struct module *__module_address(unsigned long addr)
2726{ 2778{
2727 struct module *mod; 2779 struct module *mod;
2728 2780
@@ -2730,22 +2782,51 @@ __notrace_funcgraph struct module *__module_text_address(unsigned long addr)
2730 return NULL; 2782 return NULL;
2731 2783
2732 list_for_each_entry_rcu(mod, &modules, list) 2784 list_for_each_entry_rcu(mod, &modules, list)
2733 if (within(addr, mod->module_init, mod->init_text_size) 2785 if (within_module_core(addr, mod)
2734 || within(addr, mod->module_core, mod->core_text_size)) 2786 || within_module_init(addr, mod))
2735 return mod; 2787 return mod;
2736 return NULL; 2788 return NULL;
2737} 2789}
2790EXPORT_SYMBOL_GPL(__module_address);
2738 2791
2739struct module *module_text_address(unsigned long addr) 2792/*
2793 * is_module_text_address - is this address inside module code?
2794 * @addr: the address to check.
2795 *
2796 * See is_module_address() if you simply want to see if the address is
2797 * anywhere in a module. See kernel_text_address() for testing if an
2798 * address corresponds to kernel or module code.
2799 */
2800bool is_module_text_address(unsigned long addr)
2740{ 2801{
2741 struct module *mod; 2802 bool ret;
2742 2803
2743 preempt_disable(); 2804 preempt_disable();
2744 mod = __module_text_address(addr); 2805 ret = __module_text_address(addr) != NULL;
2745 preempt_enable(); 2806 preempt_enable();
2746 2807
2808 return ret;
2809}
2810
2811/*
2812 * __module_text_address - get the module whose code contains an address.
2813 * @addr: the address.
2814 *
2815 * Must be called with preempt disabled or module mutex held so that
2816 * module doesn't get freed during this.
2817 */
2818struct module *__module_text_address(unsigned long addr)
2819{
2820 struct module *mod = __module_address(addr);
2821 if (mod) {
2822 /* Make sure it's within the text section. */
2823 if (!within(addr, mod->module_init, mod->init_text_size)
2824 && !within(addr, mod->module_core, mod->core_text_size))
2825 mod = NULL;
2826 }
2747 return mod; 2827 return mod;
2748} 2828}
2829EXPORT_SYMBOL_GPL(__module_text_address);
2749 2830
2750/* Don't grab lock, we're oopsing. */ 2831/* Don't grab lock, we're oopsing. */
2751void print_modules(void) 2832void print_modules(void)
@@ -2765,9 +2846,17 @@ void print_modules(void)
2765} 2846}
2766 2847
2767#ifdef CONFIG_MODVERSIONS 2848#ifdef CONFIG_MODVERSIONS
2768/* Generate the signature for struct module here, too, for modversions. */ 2849/* Generate the signature for all relevant module structures here.
2769void struct_module(struct module *mod) { return; } 2850 * If these change, we don't want to try to parse the module. */
2770EXPORT_SYMBOL(struct_module); 2851void module_layout(struct module *mod,
2852 struct modversion_info *ver,
2853 struct kernel_param *kp,
2854 struct kernel_symbol *ks,
2855 struct marker *marker,
2856 struct tracepoint *tp)
2857{
2858}
2859EXPORT_SYMBOL(module_layout);
2771#endif 2860#endif
2772 2861
2773#ifdef CONFIG_MARKERS 2862#ifdef CONFIG_MARKERS
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 1d94160eb532..50d022e5a560 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -26,11 +26,6 @@
26/* 26/*
27 * Must be called with lock->wait_lock held. 27 * Must be called with lock->wait_lock held.
28 */ 28 */
29void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner)
30{
31 lock->owner = new_owner;
32}
33
34void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) 29void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
35{ 30{
36 memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); 31 memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
@@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
59 54
60 /* Mark the current thread as blocked on the lock: */ 55 /* Mark the current thread as blocked on the lock: */
61 ti->task->blocked_on = waiter; 56 ti->task->blocked_on = waiter;
62 waiter->lock = lock;
63} 57}
64 58
65void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, 59void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
@@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lock)
82 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 76 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
83 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 77 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
84 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 78 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
85 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 79 mutex_clear_owner(lock);
86} 80}
87 81
88void debug_mutex_init(struct mutex *lock, const char *name, 82void debug_mutex_init(struct mutex *lock, const char *name,
@@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock, const char *name,
95 debug_check_no_locks_freed((void *)lock, sizeof(*lock)); 89 debug_check_no_locks_freed((void *)lock, sizeof(*lock));
96 lockdep_init_map(&lock->dep_map, name, key, 0); 90 lockdep_init_map(&lock->dep_map, name, key, 0);
97#endif 91#endif
98 lock->owner = NULL;
99 lock->magic = lock; 92 lock->magic = lock;
100} 93}
101 94
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index babfbdfc534b..6b2d735846a5 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -13,14 +13,6 @@
13/* 13/*
14 * This must be called with lock->wait_lock held. 14 * This must be called with lock->wait_lock held.
15 */ 15 */
16extern void
17debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner);
18
19static inline void debug_mutex_clear_owner(struct mutex *lock)
20{
21 lock->owner = NULL;
22}
23
24extern void debug_mutex_lock_common(struct mutex *lock, 16extern void debug_mutex_lock_common(struct mutex *lock,
25 struct mutex_waiter *waiter); 17 struct mutex_waiter *waiter);
26extern void debug_mutex_wake_waiter(struct mutex *lock, 18extern void debug_mutex_wake_waiter(struct mutex *lock,
@@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mutex *lock);
35extern void debug_mutex_init(struct mutex *lock, const char *name, 27extern void debug_mutex_init(struct mutex *lock, const char *name,
36 struct lock_class_key *key); 28 struct lock_class_key *key);
37 29
30static inline void mutex_set_owner(struct mutex *lock)
31{
32 lock->owner = current_thread_info();
33}
34
35static inline void mutex_clear_owner(struct mutex *lock)
36{
37 lock->owner = NULL;
38}
39
38#define spin_lock_mutex(lock, flags) \ 40#define spin_lock_mutex(lock, flags) \
39 do { \ 41 do { \
40 struct mutex *l = container_of(lock, struct mutex, wait_lock); \ 42 struct mutex *l = container_of(lock, struct mutex, wait_lock); \
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 4f45d4b658ef..5d79781394a3 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -10,6 +10,11 @@
10 * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and 10 * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
11 * David Howells for suggestions and improvements. 11 * David Howells for suggestions and improvements.
12 * 12 *
13 * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline
14 * from the -rt tree, where it was originally implemented for rtmutexes
15 * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
16 * and Sven Dietrich.
17 *
13 * Also see Documentation/mutex-design.txt. 18 * Also see Documentation/mutex-design.txt.
14 */ 19 */
15#include <linux/mutex.h> 20#include <linux/mutex.h>
@@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
46 atomic_set(&lock->count, 1); 51 atomic_set(&lock->count, 1);
47 spin_lock_init(&lock->wait_lock); 52 spin_lock_init(&lock->wait_lock);
48 INIT_LIST_HEAD(&lock->wait_list); 53 INIT_LIST_HEAD(&lock->wait_list);
54 mutex_clear_owner(lock);
49 55
50 debug_mutex_init(lock, name, key); 56 debug_mutex_init(lock, name, key);
51} 57}
@@ -91,6 +97,7 @@ void inline __sched mutex_lock(struct mutex *lock)
91 * 'unlocked' into 'locked' state. 97 * 'unlocked' into 'locked' state.
92 */ 98 */
93 __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); 99 __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
100 mutex_set_owner(lock);
94} 101}
95 102
96EXPORT_SYMBOL(mutex_lock); 103EXPORT_SYMBOL(mutex_lock);
@@ -115,6 +122,14 @@ void __sched mutex_unlock(struct mutex *lock)
115 * The unlocking fastpath is the 0->1 transition from 'locked' 122 * The unlocking fastpath is the 0->1 transition from 'locked'
116 * into 'unlocked' state: 123 * into 'unlocked' state:
117 */ 124 */
125#ifndef CONFIG_DEBUG_MUTEXES
126 /*
127 * When debugging is enabled we must not clear the owner before time,
128 * the slow path will always be taken, and that clears the owner field
129 * after verifying that it was indeed current.
130 */
131 mutex_clear_owner(lock);
132#endif
118 __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); 133 __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
119} 134}
120 135
@@ -129,21 +144,75 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
129{ 144{
130 struct task_struct *task = current; 145 struct task_struct *task = current;
131 struct mutex_waiter waiter; 146 struct mutex_waiter waiter;
132 unsigned int old_val;
133 unsigned long flags; 147 unsigned long flags;
134 148
149 preempt_disable();
150 mutex_acquire(&lock->dep_map, subclass, 0, ip);
151#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES)
152 /*
153 * Optimistic spinning.
154 *
155 * We try to spin for acquisition when we find that there are no
156 * pending waiters and the lock owner is currently running on a
157 * (different) CPU.
158 *
159 * The rationale is that if the lock owner is running, it is likely to
160 * release the lock soon.
161 *
162 * Since this needs the lock owner, and this mutex implementation
163 * doesn't track the owner atomically in the lock field, we need to
164 * track it non-atomically.
165 *
166 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
167 * to serialize everything.
168 */
169
170 for (;;) {
171 struct thread_info *owner;
172
173 /*
174 * If there's an owner, wait for it to either
175 * release the lock or go to sleep.
176 */
177 owner = ACCESS_ONCE(lock->owner);
178 if (owner && !mutex_spin_on_owner(lock, owner))
179 break;
180
181 if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
182 lock_acquired(&lock->dep_map, ip);
183 mutex_set_owner(lock);
184 preempt_enable();
185 return 0;
186 }
187
188 /*
189 * When there's no owner, we might have preempted between the
190 * owner acquiring the lock and setting the owner field. If
191 * we're an RT task that will live-lock because we won't let
192 * the owner complete.
193 */
194 if (!owner && (need_resched() || rt_task(task)))
195 break;
196
197 /*
198 * The cpu_relax() call is a compiler barrier which forces
199 * everything in this loop to be re-loaded. We don't need
200 * memory barriers as we'll eventually observe the right
201 * values at the cost of a few extra spins.
202 */
203 cpu_relax();
204 }
205#endif
135 spin_lock_mutex(&lock->wait_lock, flags); 206 spin_lock_mutex(&lock->wait_lock, flags);
136 207
137 debug_mutex_lock_common(lock, &waiter); 208 debug_mutex_lock_common(lock, &waiter);
138 mutex_acquire(&lock->dep_map, subclass, 0, ip);
139 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); 209 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
140 210
141 /* add waiting tasks to the end of the waitqueue (FIFO): */ 211 /* add waiting tasks to the end of the waitqueue (FIFO): */
142 list_add_tail(&waiter.list, &lock->wait_list); 212 list_add_tail(&waiter.list, &lock->wait_list);
143 waiter.task = task; 213 waiter.task = task;
144 214
145 old_val = atomic_xchg(&lock->count, -1); 215 if (atomic_xchg(&lock->count, -1) == 1)
146 if (old_val == 1)
147 goto done; 216 goto done;
148 217
149 lock_contended(&lock->dep_map, ip); 218 lock_contended(&lock->dep_map, ip);
@@ -158,8 +227,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
158 * that when we release the lock, we properly wake up the 227 * that when we release the lock, we properly wake up the
159 * other waiters: 228 * other waiters:
160 */ 229 */
161 old_val = atomic_xchg(&lock->count, -1); 230 if (atomic_xchg(&lock->count, -1) == 1)
162 if (old_val == 1)
163 break; 231 break;
164 232
165 /* 233 /*
@@ -173,21 +241,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
173 spin_unlock_mutex(&lock->wait_lock, flags); 241 spin_unlock_mutex(&lock->wait_lock, flags);
174 242
175 debug_mutex_free_waiter(&waiter); 243 debug_mutex_free_waiter(&waiter);
244 preempt_enable();
176 return -EINTR; 245 return -EINTR;
177 } 246 }
178 __set_task_state(task, state); 247 __set_task_state(task, state);
179 248
180 /* didnt get the lock, go to sleep: */ 249 /* didnt get the lock, go to sleep: */
181 spin_unlock_mutex(&lock->wait_lock, flags); 250 spin_unlock_mutex(&lock->wait_lock, flags);
182 schedule(); 251 __schedule();
183 spin_lock_mutex(&lock->wait_lock, flags); 252 spin_lock_mutex(&lock->wait_lock, flags);
184 } 253 }
185 254
186done: 255done:
187 lock_acquired(&lock->dep_map, ip); 256 lock_acquired(&lock->dep_map, ip);
188 /* got the lock - rejoice! */ 257 /* got the lock - rejoice! */
189 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 258 mutex_remove_waiter(lock, &waiter, current_thread_info());
190 debug_mutex_set_owner(lock, task_thread_info(task)); 259 mutex_set_owner(lock);
191 260
192 /* set it to 0 if there are no waiters left: */ 261 /* set it to 0 if there are no waiters left: */
193 if (likely(list_empty(&lock->wait_list))) 262 if (likely(list_empty(&lock->wait_list)))
@@ -196,6 +265,7 @@ done:
196 spin_unlock_mutex(&lock->wait_lock, flags); 265 spin_unlock_mutex(&lock->wait_lock, flags);
197 266
198 debug_mutex_free_waiter(&waiter); 267 debug_mutex_free_waiter(&waiter);
268 preempt_enable();
199 269
200 return 0; 270 return 0;
201} 271}
@@ -222,7 +292,8 @@ int __sched
222mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) 292mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
223{ 293{
224 might_sleep(); 294 might_sleep();
225 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_); 295 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
296 subclass, _RET_IP_);
226} 297}
227 298
228EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 299EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -260,8 +331,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
260 wake_up_process(waiter->task); 331 wake_up_process(waiter->task);
261 } 332 }
262 333
263 debug_mutex_clear_owner(lock);
264
265 spin_unlock_mutex(&lock->wait_lock, flags); 334 spin_unlock_mutex(&lock->wait_lock, flags);
266} 335}
267 336
@@ -298,18 +367,30 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count);
298 */ 367 */
299int __sched mutex_lock_interruptible(struct mutex *lock) 368int __sched mutex_lock_interruptible(struct mutex *lock)
300{ 369{
370 int ret;
371
301 might_sleep(); 372 might_sleep();
302 return __mutex_fastpath_lock_retval 373 ret = __mutex_fastpath_lock_retval
303 (&lock->count, __mutex_lock_interruptible_slowpath); 374 (&lock->count, __mutex_lock_interruptible_slowpath);
375 if (!ret)
376 mutex_set_owner(lock);
377
378 return ret;
304} 379}
305 380
306EXPORT_SYMBOL(mutex_lock_interruptible); 381EXPORT_SYMBOL(mutex_lock_interruptible);
307 382
308int __sched mutex_lock_killable(struct mutex *lock) 383int __sched mutex_lock_killable(struct mutex *lock)
309{ 384{
385 int ret;
386
310 might_sleep(); 387 might_sleep();
311 return __mutex_fastpath_lock_retval 388 ret = __mutex_fastpath_lock_retval
312 (&lock->count, __mutex_lock_killable_slowpath); 389 (&lock->count, __mutex_lock_killable_slowpath);
390 if (!ret)
391 mutex_set_owner(lock);
392
393 return ret;
313} 394}
314EXPORT_SYMBOL(mutex_lock_killable); 395EXPORT_SYMBOL(mutex_lock_killable);
315 396
@@ -352,9 +433,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
352 433
353 prev = atomic_xchg(&lock->count, -1); 434 prev = atomic_xchg(&lock->count, -1);
354 if (likely(prev == 1)) { 435 if (likely(prev == 1)) {
355 debug_mutex_set_owner(lock, current_thread_info()); 436 mutex_set_owner(lock);
356 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); 437 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
357 } 438 }
439
358 /* Set it back to 0 if there are no waiters: */ 440 /* Set it back to 0 if there are no waiters: */
359 if (likely(list_empty(&lock->wait_list))) 441 if (likely(list_empty(&lock->wait_list)))
360 atomic_set(&lock->count, 0); 442 atomic_set(&lock->count, 0);
@@ -380,8 +462,13 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
380 */ 462 */
381int __sched mutex_trylock(struct mutex *lock) 463int __sched mutex_trylock(struct mutex *lock)
382{ 464{
383 return __mutex_fastpath_trylock(&lock->count, 465 int ret;
384 __mutex_trylock_slowpath); 466
467 ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath);
468 if (ret)
469 mutex_set_owner(lock);
470
471 return ret;
385} 472}
386 473
387EXPORT_SYMBOL(mutex_trylock); 474EXPORT_SYMBOL(mutex_trylock);
diff --git a/kernel/mutex.h b/kernel/mutex.h
index a075dafbb290..67578ca48f94 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -16,8 +16,26 @@
16#define mutex_remove_waiter(lock, waiter, ti) \ 16#define mutex_remove_waiter(lock, waiter, ti) \
17 __list_del((waiter)->list.prev, (waiter)->list.next) 17 __list_del((waiter)->list.prev, (waiter)->list.next)
18 18
19#define debug_mutex_set_owner(lock, new_owner) do { } while (0) 19#ifdef CONFIG_SMP
20#define debug_mutex_clear_owner(lock) do { } while (0) 20static inline void mutex_set_owner(struct mutex *lock)
21{
22 lock->owner = current_thread_info();
23}
24
25static inline void mutex_clear_owner(struct mutex *lock)
26{
27 lock->owner = NULL;
28}
29#else
30static inline void mutex_set_owner(struct mutex *lock)
31{
32}
33
34static inline void mutex_clear_owner(struct mutex *lock)
35{
36}
37#endif
38
21#define debug_mutex_wake_waiter(lock, waiter) do { } while (0) 39#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
22#define debug_mutex_free_waiter(waiter) do { } while (0) 40#define debug_mutex_free_waiter(waiter) do { } while (0)
23#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) 41#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 78bc3fdac0d2..5aa854f9e5ae 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -34,7 +34,7 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
34 34
35/* 35/*
36 * Rules: 36 * Rules:
37 * 1. you can only enter a cgroup which is a child of your current 37 * 1. you can only enter a cgroup which is a descendant of your current
38 * cgroup 38 * cgroup
39 * 2. you can only place another process into a cgroup if 39 * 2. you can only place another process into a cgroup if
40 * a. you have CAP_SYS_ADMIN 40 * a. you have CAP_SYS_ADMIN
@@ -45,21 +45,15 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
45static int ns_can_attach(struct cgroup_subsys *ss, 45static int ns_can_attach(struct cgroup_subsys *ss,
46 struct cgroup *new_cgroup, struct task_struct *task) 46 struct cgroup *new_cgroup, struct task_struct *task)
47{ 47{
48 struct cgroup *orig;
49
50 if (current != task) { 48 if (current != task) {
51 if (!capable(CAP_SYS_ADMIN)) 49 if (!capable(CAP_SYS_ADMIN))
52 return -EPERM; 50 return -EPERM;
53 51
54 if (!cgroup_is_descendant(new_cgroup)) 52 if (!cgroup_is_descendant(new_cgroup, current))
55 return -EPERM; 53 return -EPERM;
56 } 54 }
57 55
58 if (atomic_read(&new_cgroup->count) != 0) 56 if (!cgroup_is_descendant(new_cgroup, task))
59 return -EPERM;
60
61 orig = task_cgroup(task, ns_subsys_id);
62 if (orig && orig != new_cgroup->parent)
63 return -EPERM; 57 return -EPERM;
64 58
65 return 0; 59 return 0;
@@ -77,7 +71,7 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
77 71
78 if (!capable(CAP_SYS_ADMIN)) 72 if (!capable(CAP_SYS_ADMIN))
79 return ERR_PTR(-EPERM); 73 return ERR_PTR(-EPERM);
80 if (!cgroup_is_descendant(cgroup)) 74 if (!cgroup_is_descendant(cgroup, current))
81 return ERR_PTR(-EPERM); 75 return ERR_PTR(-EPERM);
82 76
83 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); 77 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
diff --git a/kernel/panic.c b/kernel/panic.c
index 2a2ff36ff44d..3fd8c5bf8b39 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -8,19 +8,19 @@
8 * This function is used through-out the kernel (including mm and fs) 8 * This function is used through-out the kernel (including mm and fs)
9 * to indicate a major problem. 9 * to indicate a major problem.
10 */ 10 */
11#include <linux/debug_locks.h>
12#include <linux/interrupt.h>
13#include <linux/kallsyms.h>
14#include <linux/notifier.h>
11#include <linux/module.h> 15#include <linux/module.h>
12#include <linux/sched.h> 16#include <linux/random.h>
13#include <linux/delay.h>
14#include <linux/reboot.h> 17#include <linux/reboot.h>
15#include <linux/notifier.h> 18#include <linux/delay.h>
16#include <linux/init.h> 19#include <linux/kexec.h>
20#include <linux/sched.h>
17#include <linux/sysrq.h> 21#include <linux/sysrq.h>
18#include <linux/interrupt.h> 22#include <linux/init.h>
19#include <linux/nmi.h> 23#include <linux/nmi.h>
20#include <linux/kexec.h>
21#include <linux/debug_locks.h>
22#include <linux/random.h>
23#include <linux/kallsyms.h>
24#include <linux/dmi.h> 24#include <linux/dmi.h>
25 25
26int panic_on_oops; 26int panic_on_oops;
@@ -52,19 +52,15 @@ EXPORT_SYMBOL(panic_blink);
52 * 52 *
53 * This function never returns. 53 * This function never returns.
54 */ 54 */
55
56NORET_TYPE void panic(const char * fmt, ...) 55NORET_TYPE void panic(const char * fmt, ...)
57{ 56{
58 long i;
59 static char buf[1024]; 57 static char buf[1024];
60 va_list args; 58 va_list args;
61#if defined(CONFIG_S390) 59 long i;
62 unsigned long caller = (unsigned long) __builtin_return_address(0);
63#endif
64 60
65 /* 61 /*
66 * It's possible to come here directly from a panic-assertion and not 62 * It's possible to come here directly from a panic-assertion and
67 * have preempt disabled. Some functions called from here want 63 * not have preempt disabled. Some functions called from here want
68 * preempt to be disabled. No point enabling it later though... 64 * preempt to be disabled. No point enabling it later though...
69 */ 65 */
70 preempt_disable(); 66 preempt_disable();
@@ -74,7 +70,9 @@ NORET_TYPE void panic(const char * fmt, ...)
74 vsnprintf(buf, sizeof(buf), fmt, args); 70 vsnprintf(buf, sizeof(buf), fmt, args);
75 va_end(args); 71 va_end(args);
76 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 72 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
77 bust_spinlocks(0); 73#ifdef CONFIG_DEBUG_BUGVERBOSE
74 dump_stack();
75#endif
78 76
79 /* 77 /*
80 * If we have crashed and we have a crash kernel loaded let it handle 78 * If we have crashed and we have a crash kernel loaded let it handle
@@ -83,14 +81,12 @@ NORET_TYPE void panic(const char * fmt, ...)
83 */ 81 */
84 crash_kexec(NULL); 82 crash_kexec(NULL);
85 83
86#ifdef CONFIG_SMP
87 /* 84 /*
88 * Note smp_send_stop is the usual smp shutdown function, which 85 * Note smp_send_stop is the usual smp shutdown function, which
89 * unfortunately means it may not be hardened to work in a panic 86 * unfortunately means it may not be hardened to work in a panic
90 * situation. 87 * situation.
91 */ 88 */
92 smp_send_stop(); 89 smp_send_stop();
93#endif
94 90
95 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 91 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
96 92
@@ -99,19 +95,21 @@ NORET_TYPE void panic(const char * fmt, ...)
99 95
100 if (panic_timeout > 0) { 96 if (panic_timeout > 0) {
101 /* 97 /*
102 * Delay timeout seconds before rebooting the machine. 98 * Delay timeout seconds before rebooting the machine.
103 * We can't use the "normal" timers since we just panicked.. 99 * We can't use the "normal" timers since we just panicked.
104 */ 100 */
105 printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout); 101 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
102
106 for (i = 0; i < panic_timeout*1000; ) { 103 for (i = 0; i < panic_timeout*1000; ) {
107 touch_nmi_watchdog(); 104 touch_nmi_watchdog();
108 i += panic_blink(i); 105 i += panic_blink(i);
109 mdelay(1); 106 mdelay(1);
110 i++; 107 i++;
111 } 108 }
112 /* This will not be a clean reboot, with everything 109 /*
113 * shutting down. But if there is a chance of 110 * This will not be a clean reboot, with everything
114 * rebooting the system it will be rebooted. 111 * shutting down. But if there is a chance of
112 * rebooting the system it will be rebooted.
115 */ 113 */
116 emergency_restart(); 114 emergency_restart();
117 } 115 }
@@ -124,38 +122,44 @@ NORET_TYPE void panic(const char * fmt, ...)
124 } 122 }
125#endif 123#endif
126#if defined(CONFIG_S390) 124#if defined(CONFIG_S390)
127 disabled_wait(caller); 125 {
126 unsigned long caller;
127
128 caller = (unsigned long)__builtin_return_address(0);
129 disabled_wait(caller);
130 }
128#endif 131#endif
129 local_irq_enable(); 132 local_irq_enable();
130 for (i = 0;;) { 133 for (i = 0; ; ) {
131 touch_softlockup_watchdog(); 134 touch_softlockup_watchdog();
132 i += panic_blink(i); 135 i += panic_blink(i);
133 mdelay(1); 136 mdelay(1);
134 i++; 137 i++;
135 } 138 }
139 bust_spinlocks(0);
136} 140}
137 141
138EXPORT_SYMBOL(panic); 142EXPORT_SYMBOL(panic);
139 143
140 144
141struct tnt { 145struct tnt {
142 u8 bit; 146 u8 bit;
143 char true; 147 char true;
144 char false; 148 char false;
145}; 149};
146 150
147static const struct tnt tnts[] = { 151static const struct tnt tnts[] = {
148 { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, 152 { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
149 { TAINT_FORCED_MODULE, 'F', ' ' }, 153 { TAINT_FORCED_MODULE, 'F', ' ' },
150 { TAINT_UNSAFE_SMP, 'S', ' ' }, 154 { TAINT_UNSAFE_SMP, 'S', ' ' },
151 { TAINT_FORCED_RMMOD, 'R', ' ' }, 155 { TAINT_FORCED_RMMOD, 'R', ' ' },
152 { TAINT_MACHINE_CHECK, 'M', ' ' }, 156 { TAINT_MACHINE_CHECK, 'M', ' ' },
153 { TAINT_BAD_PAGE, 'B', ' ' }, 157 { TAINT_BAD_PAGE, 'B', ' ' },
154 { TAINT_USER, 'U', ' ' }, 158 { TAINT_USER, 'U', ' ' },
155 { TAINT_DIE, 'D', ' ' }, 159 { TAINT_DIE, 'D', ' ' },
156 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, 160 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
157 { TAINT_WARN, 'W', ' ' }, 161 { TAINT_WARN, 'W', ' ' },
158 { TAINT_CRAP, 'C', ' ' }, 162 { TAINT_CRAP, 'C', ' ' },
159}; 163};
160 164
161/** 165/**
@@ -192,7 +196,8 @@ const char *print_tainted(void)
192 *s = 0; 196 *s = 0;
193 } else 197 } else
194 snprintf(buf, sizeof(buf), "Not tainted"); 198 snprintf(buf, sizeof(buf), "Not tainted");
195 return(buf); 199
200 return buf;
196} 201}
197 202
198int test_taint(unsigned flag) 203int test_taint(unsigned flag)
@@ -208,7 +213,8 @@ unsigned long get_taint(void)
208 213
209void add_taint(unsigned flag) 214void add_taint(unsigned flag)
210{ 215{
211 debug_locks = 0; /* can't trust the integrity of the kernel anymore */ 216 /* can't trust the integrity of the kernel anymore: */
217 debug_locks = 0;
212 set_bit(flag, &tainted_mask); 218 set_bit(flag, &tainted_mask);
213} 219}
214EXPORT_SYMBOL(add_taint); 220EXPORT_SYMBOL(add_taint);
@@ -263,8 +269,8 @@ static void do_oops_enter_exit(void)
263} 269}
264 270
265/* 271/*
266 * Return true if the calling CPU is allowed to print oops-related info. This 272 * Return true if the calling CPU is allowed to print oops-related info.
267 * is a bit racy.. 273 * This is a bit racy..
268 */ 274 */
269int oops_may_print(void) 275int oops_may_print(void)
270{ 276{
@@ -273,20 +279,22 @@ int oops_may_print(void)
273 279
274/* 280/*
275 * Called when the architecture enters its oops handler, before it prints 281 * Called when the architecture enters its oops handler, before it prints
276 * anything. If this is the first CPU to oops, and it's oopsing the first time 282 * anything. If this is the first CPU to oops, and it's oopsing the first
277 * then let it proceed. 283 * time then let it proceed.
278 * 284 *
279 * This is all enabled by the pause_on_oops kernel boot option. We do all this 285 * This is all enabled by the pause_on_oops kernel boot option. We do all
280 * to ensure that oopses don't scroll off the screen. It has the side-effect 286 * this to ensure that oopses don't scroll off the screen. It has the
281 * of preventing later-oopsing CPUs from mucking up the display, too. 287 * side-effect of preventing later-oopsing CPUs from mucking up the display,
288 * too.
282 * 289 *
283 * It turns out that the CPU which is allowed to print ends up pausing for the 290 * It turns out that the CPU which is allowed to print ends up pausing for
284 * right duration, whereas all the other CPUs pause for twice as long: once in 291 * the right duration, whereas all the other CPUs pause for twice as long:
285 * oops_enter(), once in oops_exit(). 292 * once in oops_enter(), once in oops_exit().
286 */ 293 */
287void oops_enter(void) 294void oops_enter(void)
288{ 295{
289 debug_locks_off(); /* can't trust the integrity of the kernel anymore */ 296 /* can't trust the integrity of the kernel anymore: */
297 debug_locks_off();
290 do_oops_enter_exit(); 298 do_oops_enter_exit();
291} 299}
292 300
@@ -355,15 +363,18 @@ EXPORT_SYMBOL(warn_slowpath);
355#endif 363#endif
356 364
357#ifdef CONFIG_CC_STACKPROTECTOR 365#ifdef CONFIG_CC_STACKPROTECTOR
366
358/* 367/*
359 * Called when gcc's -fstack-protector feature is used, and 368 * Called when gcc's -fstack-protector feature is used, and
360 * gcc detects corruption of the on-stack canary value 369 * gcc detects corruption of the on-stack canary value
361 */ 370 */
362void __stack_chk_fail(void) 371void __stack_chk_fail(void)
363{ 372{
364 panic("stack-protector: Kernel stack is corrupted"); 373 panic("stack-protector: Kernel stack is corrupted in: %p\n",
374 __builtin_return_address(0));
365} 375}
366EXPORT_SYMBOL(__stack_chk_fail); 376EXPORT_SYMBOL(__stack_chk_fail);
377
367#endif 378#endif
368 379
369core_param(panic, panic_timeout, int, 0644); 380core_param(panic, panic_timeout, int, 0644);
diff --git a/kernel/params.c b/kernel/params.c
index a1e3025b19a9..de273ec85bd2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,6 +24,9 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26 26
27/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */
28#define KPARAM_KMALLOCED 0x80000000
29
27#if 0 30#if 0
28#define DEBUGP printk 31#define DEBUGP printk
29#else 32#else
@@ -217,7 +220,19 @@ int param_set_charp(const char *val, struct kernel_param *kp)
217 return -ENOSPC; 220 return -ENOSPC;
218 } 221 }
219 222
220 *(char **)kp->arg = (char *)val; 223 if (kp->perm & KPARAM_KMALLOCED)
224 kfree(*(char **)kp->arg);
225
226 /* This is a hack. We can't need to strdup in early boot, and we
227 * don't need to; this mangled commandline is preserved. */
228 if (slab_is_available()) {
229 kp->perm |= KPARAM_KMALLOCED;
230 *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
231 if (!kp->arg)
232 return -ENOMEM;
233 } else
234 *(const char **)kp->arg = val;
235
221 return 0; 236 return 0;
222} 237}
223 238
@@ -571,6 +586,15 @@ void module_param_sysfs_remove(struct module *mod)
571} 586}
572#endif 587#endif
573 588
589void destroy_params(const struct kernel_param *params, unsigned num)
590{
591 unsigned int i;
592
593 for (i = 0; i < num; i++)
594 if (params[i].perm & KPARAM_KMALLOCED)
595 kfree(*(char **)params[i].arg);
596}
597
574static void __init kernel_add_sysfs_param(const char *name, 598static void __init kernel_add_sysfs_param(const char *name,
575 struct kernel_param *kparam, 599 struct kernel_param *kparam,
576 unsigned int name_skip) 600 unsigned int name_skip)
diff --git a/kernel/pid.c b/kernel/pid.c
index 1b3586fe753a..b2e5f78fd281 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -403,6 +403,8 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
403{ 403{
404 struct pid *pid; 404 struct pid *pid;
405 rcu_read_lock(); 405 rcu_read_lock();
406 if (type != PIDTYPE_PID)
407 task = task->group_leader;
406 pid = get_pid(task->pids[type].pid); 408 pid = get_pid(task->pids[type].pid);
407 rcu_read_unlock(); 409 rcu_read_unlock();
408 return pid; 410 return pid;
@@ -450,11 +452,24 @@ pid_t pid_vnr(struct pid *pid)
450} 452}
451EXPORT_SYMBOL_GPL(pid_vnr); 453EXPORT_SYMBOL_GPL(pid_vnr);
452 454
453pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) 455pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
456 struct pid_namespace *ns)
454{ 457{
455 return pid_nr_ns(task_pid(tsk), ns); 458 pid_t nr = 0;
459
460 rcu_read_lock();
461 if (!ns)
462 ns = current->nsproxy->pid_ns;
463 if (likely(pid_alive(task))) {
464 if (type != PIDTYPE_PID)
465 task = task->group_leader;
466 nr = pid_nr_ns(task->pids[type].pid, ns);
467 }
468 rcu_read_unlock();
469
470 return nr;
456} 471}
457EXPORT_SYMBOL(task_pid_nr_ns); 472EXPORT_SYMBOL(__task_pid_nr_ns);
458 473
459pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) 474pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
460{ 475{
@@ -462,18 +477,6 @@ pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
462} 477}
463EXPORT_SYMBOL(task_tgid_nr_ns); 478EXPORT_SYMBOL(task_tgid_nr_ns);
464 479
465pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
466{
467 return pid_nr_ns(task_pgrp(tsk), ns);
468}
469EXPORT_SYMBOL(task_pgrp_nr_ns);
470
471pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
472{
473 return pid_nr_ns(task_session(tsk), ns);
474}
475EXPORT_SYMBOL(task_session_nr_ns);
476
477struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) 480struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
478{ 481{
479 return ns_of_pid(task_pid(tsk)); 482 return ns_of_pid(task_pid(tsk));
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index fab8ea86fac3..2d1001b4858d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -152,6 +152,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
152{ 152{
153 int nr; 153 int nr;
154 int rc; 154 int rc;
155 struct task_struct *task;
155 156
156 /* 157 /*
157 * The last thread in the cgroup-init thread group is terminating. 158 * The last thread in the cgroup-init thread group is terminating.
@@ -169,7 +170,19 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
169 read_lock(&tasklist_lock); 170 read_lock(&tasklist_lock);
170 nr = next_pidmap(pid_ns, 1); 171 nr = next_pidmap(pid_ns, 1);
171 while (nr > 0) { 172 while (nr > 0) {
172 kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr); 173 rcu_read_lock();
174
175 /*
176 * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring
177 * any nested-container's init processes don't ignore the
178 * signal
179 */
180 task = pid_task(find_vpid(nr), PIDTYPE_PID);
181 if (task)
182 force_sig(SIGKILL, task);
183
184 rcu_read_unlock();
185
173 nr = next_pidmap(pid_ns, nr); 186 nr = next_pidmap(pid_ns, nr);
174 } 187 }
175 read_unlock(&tasklist_lock); 188 read_unlock(&tasklist_lock);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 157de3a47832..8e5d9a68b022 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -10,76 +10,6 @@
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11 11
12/* 12/*
13 * Allocate the thread_group_cputime structure appropriately and fill in the
14 * current values of the fields. Called from copy_signal() via
15 * thread_group_cputime_clone_thread() when adding a second or subsequent
16 * thread to a thread group. Assumes interrupts are enabled when called.
17 */
18int thread_group_cputime_alloc(struct task_struct *tsk)
19{
20 struct signal_struct *sig = tsk->signal;
21 struct task_cputime *cputime;
22
23 /*
24 * If we have multiple threads and we don't already have a
25 * per-CPU task_cputime struct (checked in the caller), allocate
26 * one and fill it in with the times accumulated so far. We may
27 * race with another thread so recheck after we pick up the sighand
28 * lock.
29 */
30 cputime = alloc_percpu(struct task_cputime);
31 if (cputime == NULL)
32 return -ENOMEM;
33 spin_lock_irq(&tsk->sighand->siglock);
34 if (sig->cputime.totals) {
35 spin_unlock_irq(&tsk->sighand->siglock);
36 free_percpu(cputime);
37 return 0;
38 }
39 sig->cputime.totals = cputime;
40 cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id());
41 cputime->utime = tsk->utime;
42 cputime->stime = tsk->stime;
43 cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
44 spin_unlock_irq(&tsk->sighand->siglock);
45 return 0;
46}
47
48/**
49 * thread_group_cputime - Sum the thread group time fields across all CPUs.
50 *
51 * @tsk: The task we use to identify the thread group.
52 * @times: task_cputime structure in which we return the summed fields.
53 *
54 * Walk the list of CPUs to sum the per-CPU time fields in the thread group
55 * time structure.
56 */
57void thread_group_cputime(
58 struct task_struct *tsk,
59 struct task_cputime *times)
60{
61 struct task_cputime *totals, *tot;
62 int i;
63
64 totals = tsk->signal->cputime.totals;
65 if (!totals) {
66 times->utime = tsk->utime;
67 times->stime = tsk->stime;
68 times->sum_exec_runtime = tsk->se.sum_exec_runtime;
69 return;
70 }
71
72 times->stime = times->utime = cputime_zero;
73 times->sum_exec_runtime = 0;
74 for_each_possible_cpu(i) {
75 tot = per_cpu_ptr(totals, i);
76 times->utime = cputime_add(times->utime, tot->utime);
77 times->stime = cputime_add(times->stime, tot->stime);
78 times->sum_exec_runtime += tot->sum_exec_runtime;
79 }
80}
81
82/*
83 * Called after updating RLIMIT_CPU to set timer expiration if necessary. 13 * Called after updating RLIMIT_CPU to set timer expiration if necessary.
84 */ 14 */
85void update_rlimit_cpu(unsigned long rlim_new) 15void update_rlimit_cpu(unsigned long rlim_new)
@@ -300,6 +230,71 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
300 return 0; 230 return 0;
301} 231}
302 232
233void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
234{
235 struct sighand_struct *sighand;
236 struct signal_struct *sig;
237 struct task_struct *t;
238
239 *times = INIT_CPUTIME;
240
241 rcu_read_lock();
242 sighand = rcu_dereference(tsk->sighand);
243 if (!sighand)
244 goto out;
245
246 sig = tsk->signal;
247
248 t = tsk;
249 do {
250 times->utime = cputime_add(times->utime, t->utime);
251 times->stime = cputime_add(times->stime, t->stime);
252 times->sum_exec_runtime += t->se.sum_exec_runtime;
253
254 t = next_thread(t);
255 } while (t != tsk);
256
257 times->utime = cputime_add(times->utime, sig->utime);
258 times->stime = cputime_add(times->stime, sig->stime);
259 times->sum_exec_runtime += sig->sum_sched_runtime;
260out:
261 rcu_read_unlock();
262}
263
264static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
265{
266 if (cputime_gt(b->utime, a->utime))
267 a->utime = b->utime;
268
269 if (cputime_gt(b->stime, a->stime))
270 a->stime = b->stime;
271
272 if (b->sum_exec_runtime > a->sum_exec_runtime)
273 a->sum_exec_runtime = b->sum_exec_runtime;
274}
275
276void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
277{
278 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
279 struct task_cputime sum;
280 unsigned long flags;
281
282 spin_lock_irqsave(&cputimer->lock, flags);
283 if (!cputimer->running) {
284 cputimer->running = 1;
285 /*
286 * The POSIX timer interface allows for absolute time expiry
287 * values through the TIMER_ABSTIME flag, therefore we have
288 * to synchronize the timer to the clock every time we start
289 * it.
290 */
291 thread_group_cputime(tsk, &sum);
292 update_gt_cputime(&cputimer->cputime, &sum);
293 }
294 *times = cputimer->cputime;
295 spin_unlock_irqrestore(&cputimer->lock, flags);
296}
297
303/* 298/*
304 * Sample a process (thread group) clock for the given group_leader task. 299 * Sample a process (thread group) clock for the given group_leader task.
305 * Must be called with tasklist_lock held for reading. 300 * Must be called with tasklist_lock held for reading.
@@ -527,7 +522,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
527{ 522{
528 struct task_cputime cputime; 523 struct task_cputime cputime;
529 524
530 thread_group_cputime(tsk, &cputime); 525 thread_group_cputimer(tsk, &cputime);
531 cleanup_timers(tsk->signal->cpu_timers, 526 cleanup_timers(tsk->signal->cpu_timers,
532 cputime.utime, cputime.stime, cputime.sum_exec_runtime); 527 cputime.utime, cputime.stime, cputime.sum_exec_runtime);
533} 528}
@@ -686,6 +681,33 @@ static void cpu_timer_fire(struct k_itimer *timer)
686} 681}
687 682
688/* 683/*
684 * Sample a process (thread group) timer for the given group_leader task.
685 * Must be called with tasklist_lock held for reading.
686 */
687static int cpu_timer_sample_group(const clockid_t which_clock,
688 struct task_struct *p,
689 union cpu_time_count *cpu)
690{
691 struct task_cputime cputime;
692
693 thread_group_cputimer(p, &cputime);
694 switch (CPUCLOCK_WHICH(which_clock)) {
695 default:
696 return -EINVAL;
697 case CPUCLOCK_PROF:
698 cpu->cpu = cputime_add(cputime.utime, cputime.stime);
699 break;
700 case CPUCLOCK_VIRT:
701 cpu->cpu = cputime.utime;
702 break;
703 case CPUCLOCK_SCHED:
704 cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
705 break;
706 }
707 return 0;
708}
709
710/*
689 * Guts of sys_timer_settime for CPU timers. 711 * Guts of sys_timer_settime for CPU timers.
690 * This is called with the timer locked and interrupts disabled. 712 * This is called with the timer locked and interrupts disabled.
691 * If we return TIMER_RETRY, it's necessary to release the timer's lock 713 * If we return TIMER_RETRY, it's necessary to release the timer's lock
@@ -746,7 +768,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
746 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 768 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
747 cpu_clock_sample(timer->it_clock, p, &val); 769 cpu_clock_sample(timer->it_clock, p, &val);
748 } else { 770 } else {
749 cpu_clock_sample_group(timer->it_clock, p, &val); 771 cpu_timer_sample_group(timer->it_clock, p, &val);
750 } 772 }
751 773
752 if (old) { 774 if (old) {
@@ -894,7 +916,7 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
894 read_unlock(&tasklist_lock); 916 read_unlock(&tasklist_lock);
895 goto dead; 917 goto dead;
896 } else { 918 } else {
897 cpu_clock_sample_group(timer->it_clock, p, &now); 919 cpu_timer_sample_group(timer->it_clock, p, &now);
898 clear_dead = (unlikely(p->exit_state) && 920 clear_dead = (unlikely(p->exit_state) &&
899 thread_group_empty(p)); 921 thread_group_empty(p));
900 } 922 }
@@ -1034,6 +1056,19 @@ static void check_thread_timers(struct task_struct *tsk,
1034 } 1056 }
1035} 1057}
1036 1058
1059static void stop_process_timers(struct task_struct *tsk)
1060{
1061 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
1062 unsigned long flags;
1063
1064 if (!cputimer->running)
1065 return;
1066
1067 spin_lock_irqsave(&cputimer->lock, flags);
1068 cputimer->running = 0;
1069 spin_unlock_irqrestore(&cputimer->lock, flags);
1070}
1071
1037/* 1072/*
1038 * Check for any per-thread CPU timers that have fired and move them 1073 * Check for any per-thread CPU timers that have fired and move them
1039 * off the tsk->*_timers list onto the firing list. Per-thread timers 1074 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1057,13 +1092,15 @@ static void check_process_timers(struct task_struct *tsk,
1057 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && 1092 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
1058 list_empty(&timers[CPUCLOCK_VIRT]) && 1093 list_empty(&timers[CPUCLOCK_VIRT]) &&
1059 cputime_eq(sig->it_virt_expires, cputime_zero) && 1094 cputime_eq(sig->it_virt_expires, cputime_zero) &&
1060 list_empty(&timers[CPUCLOCK_SCHED])) 1095 list_empty(&timers[CPUCLOCK_SCHED])) {
1096 stop_process_timers(tsk);
1061 return; 1097 return;
1098 }
1062 1099
1063 /* 1100 /*
1064 * Collect the current process totals. 1101 * Collect the current process totals.
1065 */ 1102 */
1066 thread_group_cputime(tsk, &cputime); 1103 thread_group_cputimer(tsk, &cputime);
1067 utime = cputime.utime; 1104 utime = cputime.utime;
1068 ptime = cputime_add(utime, cputime.stime); 1105 ptime = cputime_add(utime, cputime.stime);
1069 sum_sched_runtime = cputime.sum_exec_runtime; 1106 sum_sched_runtime = cputime.sum_exec_runtime;
@@ -1234,7 +1271,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1234 clear_dead_task(timer, now); 1271 clear_dead_task(timer, now);
1235 goto out_unlock; 1272 goto out_unlock;
1236 } 1273 }
1237 cpu_clock_sample_group(timer->it_clock, p, &now); 1274 cpu_timer_sample_group(timer->it_clock, p, &now);
1238 bump_cpu_timer(timer, now); 1275 bump_cpu_timer(timer, now);
1239 /* Leave the tasklist_lock locked for the call below. */ 1276 /* Leave the tasklist_lock locked for the call below. */
1240 } 1277 }
@@ -1329,11 +1366,12 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1329 if (!task_cputime_zero(&sig->cputime_expires)) { 1366 if (!task_cputime_zero(&sig->cputime_expires)) {
1330 struct task_cputime group_sample; 1367 struct task_cputime group_sample;
1331 1368
1332 thread_group_cputime(tsk, &group_sample); 1369 thread_group_cputimer(tsk, &group_sample);
1333 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1370 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1334 return 1; 1371 return 1;
1335 } 1372 }
1336 return 0; 1373
1374 return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY;
1337} 1375}
1338 1376
1339/* 1377/*
@@ -1411,7 +1449,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1411 struct list_head *head; 1449 struct list_head *head;
1412 1450
1413 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1451 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1414 cpu_clock_sample_group(clock_idx, tsk, &now); 1452 cpu_timer_sample_group(clock_idx, tsk, &now);
1415 1453
1416 if (oldval) { 1454 if (oldval) {
1417 if (!cputime_eq(*oldval, cputime_zero)) { 1455 if (!cputime_eq(*oldval, cputime_zero)) {
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 887c63787de6..052ec4d195c7 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -477,10 +477,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
477 477
478/* Create a POSIX.1b interval timer. */ 478/* Create a POSIX.1b interval timer. */
479 479
480asmlinkage long 480SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
481sys_timer_create(const clockid_t which_clock, 481 struct sigevent __user *, timer_event_spec,
482 struct sigevent __user *timer_event_spec, 482 timer_t __user *, created_timer_id)
483 timer_t __user * created_timer_id)
484{ 483{
485 struct k_itimer *new_timer; 484 struct k_itimer *new_timer;
486 int error, new_timer_id; 485 int error, new_timer_id;
@@ -661,8 +660,8 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
661} 660}
662 661
663/* Get the time remaining on a POSIX.1b interval timer. */ 662/* Get the time remaining on a POSIX.1b interval timer. */
664asmlinkage long 663SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
665sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting) 664 struct itimerspec __user *, setting)
666{ 665{
667 struct k_itimer *timr; 666 struct k_itimer *timr;
668 struct itimerspec cur_setting; 667 struct itimerspec cur_setting;
@@ -691,8 +690,7 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting)
691 * the call back to do_schedule_next_timer(). So all we need to do is 690 * the call back to do_schedule_next_timer(). So all we need to do is
692 * to pick up the frozen overrun. 691 * to pick up the frozen overrun.
693 */ 692 */
694asmlinkage long 693SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
695sys_timer_getoverrun(timer_t timer_id)
696{ 694{
697 struct k_itimer *timr; 695 struct k_itimer *timr;
698 int overrun; 696 int overrun;
@@ -760,10 +758,9 @@ common_timer_set(struct k_itimer *timr, int flags,
760} 758}
761 759
762/* Set a POSIX.1b interval timer */ 760/* Set a POSIX.1b interval timer */
763asmlinkage long 761SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
764sys_timer_settime(timer_t timer_id, int flags, 762 const struct itimerspec __user *, new_setting,
765 const struct itimerspec __user *new_setting, 763 struct itimerspec __user *, old_setting)
766 struct itimerspec __user *old_setting)
767{ 764{
768 struct k_itimer *timr; 765 struct k_itimer *timr;
769 struct itimerspec new_spec, old_spec; 766 struct itimerspec new_spec, old_spec;
@@ -816,8 +813,7 @@ static inline int timer_delete_hook(struct k_itimer *timer)
816} 813}
817 814
818/* Delete a POSIX.1b interval timer. */ 815/* Delete a POSIX.1b interval timer. */
819asmlinkage long 816SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
820sys_timer_delete(timer_t timer_id)
821{ 817{
822 struct k_itimer *timer; 818 struct k_itimer *timer;
823 unsigned long flags; 819 unsigned long flags;
@@ -903,8 +899,8 @@ int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
903} 899}
904EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); 900EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
905 901
906asmlinkage long sys_clock_settime(const clockid_t which_clock, 902SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
907 const struct timespec __user *tp) 903 const struct timespec __user *, tp)
908{ 904{
909 struct timespec new_tp; 905 struct timespec new_tp;
910 906
@@ -916,8 +912,8 @@ asmlinkage long sys_clock_settime(const clockid_t which_clock,
916 return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); 912 return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp));
917} 913}
918 914
919asmlinkage long 915SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
920sys_clock_gettime(const clockid_t which_clock, struct timespec __user *tp) 916 struct timespec __user *,tp)
921{ 917{
922 struct timespec kernel_tp; 918 struct timespec kernel_tp;
923 int error; 919 int error;
@@ -933,8 +929,8 @@ sys_clock_gettime(const clockid_t which_clock, struct timespec __user *tp)
933 929
934} 930}
935 931
936asmlinkage long 932SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
937sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp) 933 struct timespec __user *, tp)
938{ 934{
939 struct timespec rtn_tp; 935 struct timespec rtn_tp;
940 int error; 936 int error;
@@ -963,10 +959,9 @@ static int common_nsleep(const clockid_t which_clock, int flags,
963 which_clock); 959 which_clock);
964} 960}
965 961
966asmlinkage long 962SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
967sys_clock_nanosleep(const clockid_t which_clock, int flags, 963 const struct timespec __user *, rqtp,
968 const struct timespec __user *rqtp, 964 struct timespec __user *, rmtp)
969 struct timespec __user *rmtp)
970{ 965{
971 struct timespec t; 966 struct timespec t;
972 967
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 597823b5b700..720ea4f781bd 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -3,8 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y)
3EXTRA_CFLAGS += -DDEBUG 3EXTRA_CFLAGS += -DDEBUG
4endif 4endif
5 5
6obj-y := main.o 6obj-$(CONFIG_PM) += main.o
7obj-$(CONFIG_PM_SLEEP) += process.o console.o 7obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o
8obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o 9obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o
9 10
10obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index b8628be2a465..a3961b205de7 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -78,6 +78,12 @@ void pm_restore_console(void)
78 } 78 }
79 set_console(orig_fgconsole); 79 set_console(orig_fgconsole);
80 release_console_sem(); 80 release_console_sem();
81
82 if (vt_waitactive(orig_fgconsole)) {
83 pr_debug("Resume: Can't switch VCs.");
84 return;
85 }
86
81 kmsg_redirect = orig_kmsg; 87 kmsg_redirect = orig_kmsg;
82} 88}
83#endif 89#endif
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 45e8541ab7e3..5f21ab2bbcdf 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,6 +22,7 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <asm/suspend.h>
25 26
26#include "power.h" 27#include "power.h"
27 28
@@ -71,6 +72,14 @@ void hibernation_set_ops(struct platform_hibernation_ops *ops)
71 mutex_unlock(&pm_mutex); 72 mutex_unlock(&pm_mutex);
72} 73}
73 74
75static bool entering_platform_hibernation;
76
77bool system_entering_hibernation(void)
78{
79 return entering_platform_hibernation;
80}
81EXPORT_SYMBOL(system_entering_hibernation);
82
74#ifdef CONFIG_PM_DEBUG 83#ifdef CONFIG_PM_DEBUG
75static void hibernation_debug_sleep(void) 84static void hibernation_debug_sleep(void)
76{ 85{
@@ -206,7 +215,7 @@ static int create_image(int platform_mode)
206 return error; 215 return error;
207 216
208 device_pm_lock(); 217 device_pm_lock();
209 local_irq_disable(); 218
210 /* At this point, device_suspend() has been called, but *not* 219 /* At this point, device_suspend() has been called, but *not*
211 * device_power_down(). We *must* call device_power_down() now. 220 * device_power_down(). We *must* call device_power_down() now.
212 * Otherwise, drivers for some devices (e.g. interrupt controllers) 221 * Otherwise, drivers for some devices (e.g. interrupt controllers)
@@ -217,6 +226,24 @@ static int create_image(int platform_mode)
217 if (error) { 226 if (error) {
218 printk(KERN_ERR "PM: Some devices failed to power down, " 227 printk(KERN_ERR "PM: Some devices failed to power down, "
219 "aborting hibernation\n"); 228 "aborting hibernation\n");
229 goto Unlock;
230 }
231
232 error = platform_pre_snapshot(platform_mode);
233 if (error || hibernation_test(TEST_PLATFORM))
234 goto Platform_finish;
235
236 error = disable_nonboot_cpus();
237 if (error || hibernation_test(TEST_CPUS)
238 || hibernation_testmode(HIBERNATION_TEST))
239 goto Enable_cpus;
240
241 local_irq_disable();
242
243 sysdev_suspend(PMSG_FREEZE);
244 if (error) {
245 printk(KERN_ERR "PM: Some devices failed to power down, "
246 "aborting hibernation\n");
220 goto Enable_irqs; 247 goto Enable_irqs;
221 } 248 }
222 249
@@ -233,15 +260,28 @@ static int create_image(int platform_mode)
233 restore_processor_state(); 260 restore_processor_state();
234 if (!in_suspend) 261 if (!in_suspend)
235 platform_leave(platform_mode); 262 platform_leave(platform_mode);
263
236 Power_up: 264 Power_up:
265 sysdev_resume();
237 /* NOTE: device_power_up() is just a resume() for devices 266 /* NOTE: device_power_up() is just a resume() for devices
238 * that suspended with irqs off ... no overall powerup. 267 * that suspended with irqs off ... no overall powerup.
239 */ 268 */
240 device_power_up(in_suspend ? 269
241 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
242 Enable_irqs: 270 Enable_irqs:
243 local_irq_enable(); 271 local_irq_enable();
272
273 Enable_cpus:
274 enable_nonboot_cpus();
275
276 Platform_finish:
277 platform_finish(platform_mode);
278
279 device_power_up(in_suspend ?
280 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
281
282 Unlock:
244 device_pm_unlock(); 283 device_pm_unlock();
284
245 return error; 285 return error;
246} 286}
247 287
@@ -249,7 +289,7 @@ static int create_image(int platform_mode)
249 * hibernation_snapshot - quiesce devices and create the hibernation 289 * hibernation_snapshot - quiesce devices and create the hibernation
250 * snapshot image. 290 * snapshot image.
251 * @platform_mode - if set, use the platform driver, if available, to 291 * @platform_mode - if set, use the platform driver, if available, to
252 * prepare the platform frimware for the power transition. 292 * prepare the platform firmware for the power transition.
253 * 293 *
254 * Must be called with pm_mutex held 294 * Must be called with pm_mutex held
255 */ 295 */
@@ -275,25 +315,9 @@ int hibernation_snapshot(int platform_mode)
275 if (hibernation_test(TEST_DEVICES)) 315 if (hibernation_test(TEST_DEVICES))
276 goto Recover_platform; 316 goto Recover_platform;
277 317
278 error = platform_pre_snapshot(platform_mode); 318 error = create_image(platform_mode);
279 if (error || hibernation_test(TEST_PLATFORM)) 319 /* Control returns here after successful restore */
280 goto Finish;
281
282 error = disable_nonboot_cpus();
283 if (!error) {
284 if (hibernation_test(TEST_CPUS))
285 goto Enable_cpus;
286
287 if (hibernation_testmode(HIBERNATION_TEST))
288 goto Enable_cpus;
289 320
290 error = create_image(platform_mode);
291 /* Control returns here after successful restore */
292 }
293 Enable_cpus:
294 enable_nonboot_cpus();
295 Finish:
296 platform_finish(platform_mode);
297 Resume_devices: 321 Resume_devices:
298 device_resume(in_suspend ? 322 device_resume(in_suspend ?
299 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 323 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
@@ -315,18 +339,33 @@ int hibernation_snapshot(int platform_mode)
315 * kernel. 339 * kernel.
316 */ 340 */
317 341
318static int resume_target_kernel(void) 342static int resume_target_kernel(bool platform_mode)
319{ 343{
320 int error; 344 int error;
321 345
322 device_pm_lock(); 346 device_pm_lock();
323 local_irq_disable(); 347
324 error = device_power_down(PMSG_QUIESCE); 348 error = device_power_down(PMSG_QUIESCE);
325 if (error) { 349 if (error) {
326 printk(KERN_ERR "PM: Some devices failed to power down, " 350 printk(KERN_ERR "PM: Some devices failed to power down, "
327 "aborting resume\n"); 351 "aborting resume\n");
328 goto Enable_irqs; 352 goto Unlock;
329 } 353 }
354
355 error = platform_pre_restore(platform_mode);
356 if (error)
357 goto Cleanup;
358
359 error = disable_nonboot_cpus();
360 if (error)
361 goto Enable_cpus;
362
363 local_irq_disable();
364
365 error = sysdev_suspend(PMSG_QUIESCE);
366 if (error)
367 goto Enable_irqs;
368
330 /* We'll ignore saved state, but this gets preempt count (etc) right */ 369 /* We'll ignore saved state, but this gets preempt count (etc) right */
331 save_processor_state(); 370 save_processor_state();
332 error = restore_highmem(); 371 error = restore_highmem();
@@ -349,10 +388,23 @@ static int resume_target_kernel(void)
349 swsusp_free(); 388 swsusp_free();
350 restore_processor_state(); 389 restore_processor_state();
351 touch_softlockup_watchdog(); 390 touch_softlockup_watchdog();
352 device_power_up(PMSG_RECOVER); 391
392 sysdev_resume();
393
353 Enable_irqs: 394 Enable_irqs:
354 local_irq_enable(); 395 local_irq_enable();
396
397 Enable_cpus:
398 enable_nonboot_cpus();
399
400 Cleanup:
401 platform_restore_cleanup(platform_mode);
402
403 device_power_up(PMSG_RECOVER);
404
405 Unlock:
355 device_pm_unlock(); 406 device_pm_unlock();
407
356 return error; 408 return error;
357} 409}
358 410
@@ -360,7 +412,7 @@ static int resume_target_kernel(void)
360 * hibernation_restore - quiesce devices and restore the hibernation 412 * hibernation_restore - quiesce devices and restore the hibernation
361 * snapshot image. If successful, control returns in hibernation_snaphot() 413 * snapshot image. If successful, control returns in hibernation_snaphot()
362 * @platform_mode - if set, use the platform driver, if available, to 414 * @platform_mode - if set, use the platform driver, if available, to
363 * prepare the platform frimware for the transition. 415 * prepare the platform firmware for the transition.
364 * 416 *
365 * Must be called with pm_mutex held 417 * Must be called with pm_mutex held
366 */ 418 */
@@ -372,19 +424,10 @@ int hibernation_restore(int platform_mode)
372 pm_prepare_console(); 424 pm_prepare_console();
373 suspend_console(); 425 suspend_console();
374 error = device_suspend(PMSG_QUIESCE); 426 error = device_suspend(PMSG_QUIESCE);
375 if (error)
376 goto Finish;
377
378 error = platform_pre_restore(platform_mode);
379 if (!error) { 427 if (!error) {
380 error = disable_nonboot_cpus(); 428 error = resume_target_kernel(platform_mode);
381 if (!error) 429 device_resume(PMSG_RECOVER);
382 error = resume_target_kernel();
383 enable_nonboot_cpus();
384 } 430 }
385 platform_restore_cleanup(platform_mode);
386 device_resume(PMSG_RECOVER);
387 Finish:
388 resume_console(); 431 resume_console();
389 pm_restore_console(); 432 pm_restore_console();
390 return error; 433 return error;
@@ -411,6 +454,7 @@ int hibernation_platform_enter(void)
411 if (error) 454 if (error)
412 goto Close; 455 goto Close;
413 456
457 entering_platform_hibernation = true;
414 suspend_console(); 458 suspend_console();
415 error = device_suspend(PMSG_HIBERNATE); 459 error = device_suspend(PMSG_HIBERNATE);
416 if (error) { 460 if (error) {
@@ -419,36 +463,46 @@ int hibernation_platform_enter(void)
419 goto Resume_devices; 463 goto Resume_devices;
420 } 464 }
421 465
466 device_pm_lock();
467
468 error = device_power_down(PMSG_HIBERNATE);
469 if (error)
470 goto Unlock;
471
422 error = hibernation_ops->prepare(); 472 error = hibernation_ops->prepare();
423 if (error) 473 if (error)
424 goto Resume_devices; 474 goto Platofrm_finish;
425 475
426 error = disable_nonboot_cpus(); 476 error = disable_nonboot_cpus();
427 if (error) 477 if (error)
428 goto Finish; 478 goto Platofrm_finish;
429 479
430 device_pm_lock();
431 local_irq_disable(); 480 local_irq_disable();
432 error = device_power_down(PMSG_HIBERNATE); 481 sysdev_suspend(PMSG_HIBERNATE);
433 if (!error) { 482 hibernation_ops->enter();
434 hibernation_ops->enter(); 483 /* We should never get here */
435 /* We should never get here */ 484 while (1);
436 while (1);
437 }
438 local_irq_enable();
439 device_pm_unlock();
440 485
441 /* 486 /*
442 * We don't need to reenable the nonboot CPUs or resume consoles, since 487 * We don't need to reenable the nonboot CPUs or resume consoles, since
443 * the system is going to be halted anyway. 488 * the system is going to be halted anyway.
444 */ 489 */
445 Finish: 490 Platofrm_finish:
446 hibernation_ops->finish(); 491 hibernation_ops->finish();
492
493 device_power_up(PMSG_RESTORE);
494
495 Unlock:
496 device_pm_unlock();
497
447 Resume_devices: 498 Resume_devices:
499 entering_platform_hibernation = false;
448 device_resume(PMSG_RESTORE); 500 device_resume(PMSG_RESTORE);
449 resume_console(); 501 resume_console();
502
450 Close: 503 Close:
451 hibernation_ops->end(); 504 hibernation_ops->end();
505
452 return error; 506 return error;
453} 507}
454 508
@@ -585,6 +639,12 @@ static int software_resume(void)
585 unsigned int flags; 639 unsigned int flags;
586 640
587 /* 641 /*
642 * If the user said "noresume".. bail out early.
643 */
644 if (noresume)
645 return 0;
646
647 /*
588 * name_to_dev_t() below takes a sysfs buffer mutex when sysfs 648 * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
589 * is configured into the kernel. Since the regular hibernate 649 * is configured into the kernel. Since the regular hibernate
590 * trigger path is via sysfs which takes a buffer mutex before 650 * trigger path is via sysfs which takes a buffer mutex before
@@ -600,6 +660,11 @@ static int software_resume(void)
600 mutex_unlock(&pm_mutex); 660 mutex_unlock(&pm_mutex);
601 return -ENOENT; 661 return -ENOENT;
602 } 662 }
663 /*
664 * Some device discovery might still be in progress; we need
665 * to wait for this to finish.
666 */
667 wait_for_device_probe();
603 swsusp_resume_device = name_to_dev_t(resume_file); 668 swsusp_resume_device = name_to_dev_t(resume_file);
604 pr_debug("PM: Resume from partition %s\n", resume_file); 669 pr_debug("PM: Resume from partition %s\n", resume_file);
605 } else { 670 } else {
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 239988873971..f172f41858bb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -57,16 +57,6 @@ int pm_notifier_call_chain(unsigned long val)
57#ifdef CONFIG_PM_DEBUG 57#ifdef CONFIG_PM_DEBUG
58int pm_test_level = TEST_NONE; 58int pm_test_level = TEST_NONE;
59 59
60static int suspend_test(int level)
61{
62 if (pm_test_level == level) {
63 printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
64 mdelay(5000);
65 return 1;
66 }
67 return 0;
68}
69
70static const char * const pm_tests[__TEST_AFTER_LAST] = { 60static const char * const pm_tests[__TEST_AFTER_LAST] = {
71 [TEST_NONE] = "none", 61 [TEST_NONE] = "none",
72 [TEST_CORE] = "core", 62 [TEST_CORE] = "core",
@@ -125,14 +115,24 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
125} 115}
126 116
127power_attr(pm_test); 117power_attr(pm_test);
128#else /* !CONFIG_PM_DEBUG */ 118#endif /* CONFIG_PM_DEBUG */
129static inline int suspend_test(int level) { return 0; }
130#endif /* !CONFIG_PM_DEBUG */
131 119
132#endif /* CONFIG_PM_SLEEP */ 120#endif /* CONFIG_PM_SLEEP */
133 121
134#ifdef CONFIG_SUSPEND 122#ifdef CONFIG_SUSPEND
135 123
124static int suspend_test(int level)
125{
126#ifdef CONFIG_PM_DEBUG
127 if (pm_test_level == level) {
128 printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
129 mdelay(5000);
130 return 1;
131 }
132#endif /* !CONFIG_PM_DEBUG */
133 return 0;
134}
135
136#ifdef CONFIG_PM_TEST_SUSPEND 136#ifdef CONFIG_PM_TEST_SUSPEND
137 137
138/* 138/*
@@ -287,25 +287,55 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
287 */ 287 */
288static int suspend_enter(suspend_state_t state) 288static int suspend_enter(suspend_state_t state)
289{ 289{
290 int error = 0; 290 int error;
291 291
292 device_pm_lock(); 292 device_pm_lock();
293 arch_suspend_disable_irqs();
294 BUG_ON(!irqs_disabled());
295 293
296 if ((error = device_power_down(PMSG_SUSPEND))) { 294 error = device_power_down(PMSG_SUSPEND);
295 if (error) {
297 printk(KERN_ERR "PM: Some devices failed to power down\n"); 296 printk(KERN_ERR "PM: Some devices failed to power down\n");
298 goto Done; 297 goto Done;
299 } 298 }
300 299
301 if (!suspend_test(TEST_CORE)) 300 if (suspend_ops->prepare) {
302 error = suspend_ops->enter(state); 301 error = suspend_ops->prepare();
302 if (error)
303 goto Power_up_devices;
304 }
305
306 if (suspend_test(TEST_PLATFORM))
307 goto Platfrom_finish;
308
309 error = disable_nonboot_cpus();
310 if (error || suspend_test(TEST_CPUS))
311 goto Enable_cpus;
312
313 arch_suspend_disable_irqs();
314 BUG_ON(!irqs_disabled());
315
316 error = sysdev_suspend(PMSG_SUSPEND);
317 if (!error) {
318 if (!suspend_test(TEST_CORE))
319 error = suspend_ops->enter(state);
320 sysdev_resume();
321 }
303 322
304 device_power_up(PMSG_RESUME);
305 Done:
306 arch_suspend_enable_irqs(); 323 arch_suspend_enable_irqs();
307 BUG_ON(irqs_disabled()); 324 BUG_ON(irqs_disabled());
325
326 Enable_cpus:
327 enable_nonboot_cpus();
328
329 Platfrom_finish:
330 if (suspend_ops->finish)
331 suspend_ops->finish();
332
333 Power_up_devices:
334 device_power_up(PMSG_RESUME);
335
336 Done:
308 device_pm_unlock(); 337 device_pm_unlock();
338
309 return error; 339 return error;
310} 340}
311 341
@@ -337,23 +367,8 @@ int suspend_devices_and_enter(suspend_state_t state)
337 if (suspend_test(TEST_DEVICES)) 367 if (suspend_test(TEST_DEVICES))
338 goto Recover_platform; 368 goto Recover_platform;
339 369
340 if (suspend_ops->prepare) { 370 suspend_enter(state);
341 error = suspend_ops->prepare();
342 if (error)
343 goto Resume_devices;
344 }
345
346 if (suspend_test(TEST_PLATFORM))
347 goto Finish;
348
349 error = disable_nonboot_cpus();
350 if (!error && !suspend_test(TEST_CPUS))
351 suspend_enter(state);
352 371
353 enable_nonboot_cpus();
354 Finish:
355 if (suspend_ops->finish)
356 suspend_ops->finish();
357 Resume_devices: 372 Resume_devices:
358 suspend_test_start(); 373 suspend_test_start();
359 device_resume(PMSG_RESUME); 374 device_resume(PMSG_RESUME);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f5fc2d7680f2..33e2e4a819f9 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -321,13 +321,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
321 321
322 INIT_LIST_HEAD(list); 322 INIT_LIST_HEAD(list);
323 323
324 for_each_zone(zone) { 324 for_each_populated_zone(zone) {
325 unsigned long zone_start, zone_end; 325 unsigned long zone_start, zone_end;
326 struct mem_extent *ext, *cur, *aux; 326 struct mem_extent *ext, *cur, *aux;
327 327
328 if (!populated_zone(zone))
329 continue;
330
331 zone_start = zone->zone_start_pfn; 328 zone_start = zone->zone_start_pfn;
332 zone_end = zone->zone_start_pfn + zone->spanned_pages; 329 zone_end = zone->zone_start_pfn + zone->spanned_pages;
333 330
@@ -804,8 +801,8 @@ static unsigned int count_free_highmem_pages(void)
804 struct zone *zone; 801 struct zone *zone;
805 unsigned int cnt = 0; 802 unsigned int cnt = 0;
806 803
807 for_each_zone(zone) 804 for_each_populated_zone(zone)
808 if (populated_zone(zone) && is_highmem(zone)) 805 if (is_highmem(zone))
809 cnt += zone_page_state(zone, NR_FREE_PAGES); 806 cnt += zone_page_state(zone, NR_FREE_PAGES);
810 807
811 return cnt; 808 return cnt;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 6da14358537c..505f319e489c 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -60,6 +60,7 @@ static struct block_device *resume_bdev;
60static int submit(int rw, pgoff_t page_off, struct page *page, 60static int submit(int rw, pgoff_t page_off, struct page *page,
61 struct bio **bio_chain) 61 struct bio **bio_chain)
62{ 62{
63 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
63 struct bio *bio; 64 struct bio *bio;
64 65
65 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 66 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
@@ -80,7 +81,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
80 bio_get(bio); 81 bio_get(bio);
81 82
82 if (bio_chain == NULL) { 83 if (bio_chain == NULL) {
83 submit_bio(rw | (1 << BIO_RW_SYNC), bio); 84 submit_bio(bio_rw, bio);
84 wait_on_page_locked(page); 85 wait_on_page_locked(page);
85 if (rw == READ) 86 if (rw == READ)
86 bio_set_pages_dirty(bio); 87 bio_set_pages_dirty(bio);
@@ -90,7 +91,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
90 get_page(page); /* These pages are freed later */ 91 get_page(page); /* These pages are freed later */
91 bio->bi_private = *bio_chain; 92 bio->bi_private = *bio_chain;
92 *bio_chain = bio; 93 *bio_chain = bio;
93 submit_bio(rw | (1 << BIO_RW_SYNC), bio); 94 submit_bio(bio_rw, bio);
94 } 95 }
95 return 0; 96 return 0;
96} 97}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index a92c91451559..78c35047586d 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -51,6 +51,7 @@
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/time.h> 52#include <linux/time.h>
53#include <linux/rbtree.h> 53#include <linux/rbtree.h>
54#include <linux/io.h>
54 55
55#include "power.h" 56#include "power.h"
56 57
@@ -229,17 +230,16 @@ int swsusp_shrink_memory(void)
229 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; 230 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
230 tmp = size; 231 tmp = size;
231 size += highmem_size; 232 size += highmem_size;
232 for_each_zone (zone) 233 for_each_populated_zone(zone) {
233 if (populated_zone(zone)) { 234 tmp += snapshot_additional_pages(zone);
234 tmp += snapshot_additional_pages(zone); 235 if (is_highmem(zone)) {
235 if (is_highmem(zone)) { 236 highmem_size -=
236 highmem_size -=
237 zone_page_state(zone, NR_FREE_PAGES); 237 zone_page_state(zone, NR_FREE_PAGES);
238 } else { 238 } else {
239 tmp -= zone_page_state(zone, NR_FREE_PAGES); 239 tmp -= zone_page_state(zone, NR_FREE_PAGES);
240 tmp += zone->lowmem_reserve[ZONE_NORMAL]; 240 tmp += zone->lowmem_reserve[ZONE_NORMAL];
241 }
242 } 241 }
242 }
243 243
244 if (highmem_size < 0) 244 if (highmem_size < 0)
245 highmem_size = 0; 245 highmem_size = 0;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 005b93d839ba..6c85359364f2 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -95,15 +95,15 @@ static int snapshot_open(struct inode *inode, struct file *filp)
95 data->swap = swsusp_resume_device ? 95 data->swap = swsusp_resume_device ?
96 swap_type_of(swsusp_resume_device, 0, NULL) : -1; 96 swap_type_of(swsusp_resume_device, 0, NULL) : -1;
97 data->mode = O_RDONLY; 97 data->mode = O_RDONLY;
98 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 98 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
99 if (error) 99 if (error)
100 pm_notifier_call_chain(PM_POST_RESTORE); 100 pm_notifier_call_chain(PM_POST_HIBERNATION);
101 } else { 101 } else {
102 data->swap = -1; 102 data->swap = -1;
103 data->mode = O_WRONLY; 103 data->mode = O_WRONLY;
104 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); 104 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
105 if (error) 105 if (error)
106 pm_notifier_call_chain(PM_POST_HIBERNATION); 106 pm_notifier_call_chain(PM_POST_RESTORE);
107 } 107 }
108 if (error) 108 if (error)
109 atomic_inc(&snapshot_device_available); 109 atomic_inc(&snapshot_device_available);
diff --git a/kernel/printk.c b/kernel/printk.c
index 7015733793e8..5052b5497c67 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -32,6 +32,7 @@
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h>
35 36
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
37 38
@@ -73,7 +74,6 @@ EXPORT_SYMBOL(oops_in_progress);
73 * driver system. 74 * driver system.
74 */ 75 */
75static DECLARE_MUTEX(console_sem); 76static DECLARE_MUTEX(console_sem);
76static DECLARE_MUTEX(secondary_console_sem);
77struct console *console_drivers; 77struct console *console_drivers;
78EXPORT_SYMBOL_GPL(console_drivers); 78EXPORT_SYMBOL_GPL(console_drivers);
79 79
@@ -136,6 +136,24 @@ static char *log_buf = __log_buf;
136static int log_buf_len = __LOG_BUF_LEN; 136static int log_buf_len = __LOG_BUF_LEN;
137static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 137static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
138 138
139#ifdef CONFIG_KEXEC
140/*
141 * This appends the listed symbols to /proc/vmcoreinfo
142 *
143 * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to
144 * obtain access to symbols that are otherwise very difficult to locate. These
145 * symbols are specifically used so that utilities can access and extract the
146 * dmesg log from a vmcore file after a crash.
147 */
148void log_buf_kexec_setup(void)
149{
150 VMCOREINFO_SYMBOL(log_buf);
151 VMCOREINFO_SYMBOL(log_end);
152 VMCOREINFO_SYMBOL(log_buf_len);
153 VMCOREINFO_SYMBOL(logged_chars);
154}
155#endif
156
139static int __init log_buf_len_setup(char *str) 157static int __init log_buf_len_setup(char *str)
140{ 158{
141 unsigned size = memparse(str, &str); 159 unsigned size = memparse(str, &str);
@@ -382,7 +400,7 @@ out:
382 return error; 400 return error;
383} 401}
384 402
385asmlinkage long sys_syslog(int type, char __user *buf, int len) 403SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
386{ 404{
387 return do_syslog(type, buf, len); 405 return do_syslog(type, buf, len);
388} 406}
@@ -742,11 +760,6 @@ EXPORT_SYMBOL(vprintk);
742 760
743#else 761#else
744 762
745asmlinkage long sys_syslog(int type, char __user *buf, int len)
746{
747 return -ENOSYS;
748}
749
750static void call_console_drivers(unsigned start, unsigned end) 763static void call_console_drivers(unsigned start, unsigned end)
751{ 764{
752} 765}
@@ -896,12 +909,14 @@ void suspend_console(void)
896 printk("Suspending console(s) (use no_console_suspend to debug)\n"); 909 printk("Suspending console(s) (use no_console_suspend to debug)\n");
897 acquire_console_sem(); 910 acquire_console_sem();
898 console_suspended = 1; 911 console_suspended = 1;
912 up(&console_sem);
899} 913}
900 914
901void resume_console(void) 915void resume_console(void)
902{ 916{
903 if (!console_suspend_enabled) 917 if (!console_suspend_enabled)
904 return; 918 return;
919 down(&console_sem);
905 console_suspended = 0; 920 console_suspended = 0;
906 release_console_sem(); 921 release_console_sem();
907} 922}
@@ -917,11 +932,9 @@ void resume_console(void)
917void acquire_console_sem(void) 932void acquire_console_sem(void)
918{ 933{
919 BUG_ON(in_interrupt()); 934 BUG_ON(in_interrupt());
920 if (console_suspended) {
921 down(&secondary_console_sem);
922 return;
923 }
924 down(&console_sem); 935 down(&console_sem);
936 if (console_suspended)
937 return;
925 console_locked = 1; 938 console_locked = 1;
926 console_may_schedule = 1; 939 console_may_schedule = 1;
927} 940}
@@ -931,6 +944,10 @@ int try_acquire_console_sem(void)
931{ 944{
932 if (down_trylock(&console_sem)) 945 if (down_trylock(&console_sem))
933 return -1; 946 return -1;
947 if (console_suspended) {
948 up(&console_sem);
949 return -1;
950 }
934 console_locked = 1; 951 console_locked = 1;
935 console_may_schedule = 0; 952 console_may_schedule = 0;
936 return 0; 953 return 0;
@@ -984,7 +1001,7 @@ void release_console_sem(void)
984 unsigned wake_klogd = 0; 1001 unsigned wake_klogd = 0;
985 1002
986 if (console_suspended) { 1003 if (console_suspended) {
987 up(&secondary_console_sem); 1004 up(&console_sem);
988 return; 1005 return;
989 } 1006 }
990 1007
@@ -1294,8 +1311,11 @@ EXPORT_SYMBOL(printk_ratelimit);
1294bool printk_timed_ratelimit(unsigned long *caller_jiffies, 1311bool printk_timed_ratelimit(unsigned long *caller_jiffies,
1295 unsigned int interval_msecs) 1312 unsigned int interval_msecs)
1296{ 1313{
1297 if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) { 1314 if (*caller_jiffies == 0
1298 *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs); 1315 || !time_in_range(jiffies, *caller_jiffies,
1316 *caller_jiffies
1317 + msecs_to_jiffies(interval_msecs))) {
1318 *caller_jiffies = jiffies;
1299 return true; 1319 return true;
1300 } 1320 }
1301 return false; 1321 return false;
diff --git a/kernel/profile.c b/kernel/profile.c
index 784933acf5b8..7724e0409bae 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -114,12 +114,15 @@ int __ref profile_init(void)
114 if (!slab_is_available()) { 114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes); 115 prof_buffer = alloc_bootmem(buffer_bytes);
116 alloc_bootmem_cpumask_var(&prof_cpu_mask); 116 alloc_bootmem_cpumask_var(&prof_cpu_mask);
117 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
117 return 0; 118 return 0;
118 } 119 }
119 120
120 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) 121 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
121 return -ENOMEM; 122 return -ENOMEM;
122 123
124 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
125
123 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); 126 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
124 if (prof_buffer) 127 if (prof_buffer)
125 return 0; 128 return 0;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 29dc700e198c..aaad0ec34194 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -60,11 +60,15 @@ static void ptrace_untrace(struct task_struct *child)
60{ 60{
61 spin_lock(&child->sighand->siglock); 61 spin_lock(&child->sighand->siglock);
62 if (task_is_traced(child)) { 62 if (task_is_traced(child)) {
63 if (child->signal->flags & SIGNAL_STOP_STOPPED) { 63 /*
64 * If the group stop is completed or in progress,
65 * this thread was already counted as stopped.
66 */
67 if (child->signal->flags & SIGNAL_STOP_STOPPED ||
68 child->signal->group_stop_count)
64 __set_task_state(child, TASK_STOPPED); 69 __set_task_state(child, TASK_STOPPED);
65 } else { 70 else
66 signal_wake_up(child, 1); 71 signal_wake_up(child, 1);
67 }
68 } 72 }
69 spin_unlock(&child->sighand->siglock); 73 spin_unlock(&child->sighand->siglock);
70} 74}
@@ -235,18 +239,58 @@ out:
235 return retval; 239 return retval;
236} 240}
237 241
238static inline void __ptrace_detach(struct task_struct *child, unsigned int data) 242/*
243 * Called with irqs disabled, returns true if childs should reap themselves.
244 */
245static int ignoring_children(struct sighand_struct *sigh)
239{ 246{
240 child->exit_code = data; 247 int ret;
241 /* .. re-parent .. */ 248 spin_lock(&sigh->siglock);
242 __ptrace_unlink(child); 249 ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) ||
243 /* .. and wake it up. */ 250 (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT);
244 if (child->exit_state != EXIT_ZOMBIE) 251 spin_unlock(&sigh->siglock);
245 wake_up_process(child); 252 return ret;
253}
254
255/*
256 * Called with tasklist_lock held for writing.
257 * Unlink a traced task, and clean it up if it was a traced zombie.
258 * Return true if it needs to be reaped with release_task().
259 * (We can't call release_task() here because we already hold tasklist_lock.)
260 *
261 * If it's a zombie, our attachedness prevented normal parent notification
262 * or self-reaping. Do notification now if it would have happened earlier.
263 * If it should reap itself, return true.
264 *
265 * If it's our own child, there is no notification to do.
266 * But if our normal children self-reap, then this child
267 * was prevented by ptrace and we must reap it now.
268 */
269static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
270{
271 __ptrace_unlink(p);
272
273 if (p->exit_state == EXIT_ZOMBIE) {
274 if (!task_detached(p) && thread_group_empty(p)) {
275 if (!same_thread_group(p->real_parent, tracer))
276 do_notify_parent(p, p->exit_signal);
277 else if (ignoring_children(tracer->sighand))
278 p->exit_signal = -1;
279 }
280 if (task_detached(p)) {
281 /* Mark it as in the process of being reaped. */
282 p->exit_state = EXIT_DEAD;
283 return true;
284 }
285 }
286
287 return false;
246} 288}
247 289
248int ptrace_detach(struct task_struct *child, unsigned int data) 290int ptrace_detach(struct task_struct *child, unsigned int data)
249{ 291{
292 bool dead = false;
293
250 if (!valid_signal(data)) 294 if (!valid_signal(data))
251 return -EIO; 295 return -EIO;
252 296
@@ -255,14 +299,45 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
255 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); 299 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
256 300
257 write_lock_irq(&tasklist_lock); 301 write_lock_irq(&tasklist_lock);
258 /* protect against de_thread()->release_task() */ 302 /*
259 if (child->ptrace) 303 * This child can be already killed. Make sure de_thread() or
260 __ptrace_detach(child, data); 304 * our sub-thread doing do_wait() didn't do release_task() yet.
305 */
306 if (child->ptrace) {
307 child->exit_code = data;
308 dead = __ptrace_detach(current, child);
309 }
261 write_unlock_irq(&tasklist_lock); 310 write_unlock_irq(&tasklist_lock);
262 311
312 if (unlikely(dead))
313 release_task(child);
314
263 return 0; 315 return 0;
264} 316}
265 317
318/*
319 * Detach all tasks we were using ptrace on.
320 */
321void exit_ptrace(struct task_struct *tracer)
322{
323 struct task_struct *p, *n;
324 LIST_HEAD(ptrace_dead);
325
326 write_lock_irq(&tasklist_lock);
327 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
328 if (__ptrace_detach(tracer, p))
329 list_add(&p->ptrace_entry, &ptrace_dead);
330 }
331 write_unlock_irq(&tasklist_lock);
332
333 BUG_ON(!list_empty(&tracer->ptraced));
334
335 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
336 list_del_init(&p->ptrace_entry);
337 release_task(p);
338 }
339}
340
266int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 341int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
267{ 342{
268 int copied = 0; 343 int copied = 0;
@@ -574,7 +649,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
574#define arch_ptrace_attach(child) do { } while (0) 649#define arch_ptrace_attach(child) do { } while (0)
575#endif 650#endif
576 651
577asmlinkage long sys_ptrace(long request, long pid, long addr, long data) 652SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
578{ 653{
579 struct task_struct *child; 654 struct task_struct *child;
580 long ret; 655 long ret;
@@ -612,8 +687,6 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
612 goto out_put_task_struct; 687 goto out_put_task_struct;
613 688
614 ret = arch_ptrace(child, request, addr, data); 689 ret = arch_ptrace(child, request, addr, data);
615 if (ret < 0)
616 goto out_put_task_struct;
617 690
618 out_put_task_struct: 691 out_put_task_struct:
619 put_task_struct(child); 692 put_task_struct(child);
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 490934fc7ac3..0f2b0b311304 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -65,6 +65,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), 65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
66 .cpumask = CPU_BITS_NONE, 66 .cpumask = CPU_BITS_NONE,
67}; 67};
68
68static struct rcu_ctrlblk rcu_bh_ctrlblk = { 69static struct rcu_ctrlblk rcu_bh_ctrlblk = {
69 .cur = -300, 70 .cur = -300,
70 .completed = -300, 71 .completed = -300,
@@ -73,8 +74,26 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
73 .cpumask = CPU_BITS_NONE, 74 .cpumask = CPU_BITS_NONE,
74}; 75};
75 76
76DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; 77static DEFINE_PER_CPU(struct rcu_data, rcu_data);
77DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; 78static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
79
80/*
81 * Increment the quiescent state counter.
82 * The counter is a bit degenerated: We do not need to know
83 * how many quiescent states passed, just if there was at least
84 * one since the start of the grace period. Thus just a flag.
85 */
86void rcu_qsctr_inc(int cpu)
87{
88 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
89 rdp->passed_quiesc = 1;
90}
91
92void rcu_bh_qsctr_inc(int cpu)
93{
94 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
95 rdp->passed_quiesc = 1;
96}
78 97
79static int blimit = 10; 98static int blimit = 10;
80static int qhimark = 10000; 99static int qhimark = 10000;
@@ -679,8 +698,8 @@ int rcu_needs_cpu(int cpu)
679void rcu_check_callbacks(int cpu, int user) 698void rcu_check_callbacks(int cpu, int user)
680{ 699{
681 if (user || 700 if (user ||
682 (idle_cpu(cpu) && !in_softirq() && 701 (idle_cpu(cpu) && rcu_scheduler_active &&
683 hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 702 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
684 703
685 /* 704 /*
686 * Get here if this CPU took its interrupt from user 705 * Get here if this CPU took its interrupt from user
@@ -716,7 +735,7 @@ void rcu_check_callbacks(int cpu, int user)
716 raise_rcu_softirq(); 735 raise_rcu_softirq();
717} 736}
718 737
719static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, 738static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
720 struct rcu_data *rdp) 739 struct rcu_data *rdp)
721{ 740{
722 unsigned long flags; 741 unsigned long flags;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index d92a76a881aa..2c7b8457d0d2 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,6 +44,7 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
47 48
48enum rcu_barrier { 49enum rcu_barrier {
49 RCU_BARRIER_STD, 50 RCU_BARRIER_STD,
@@ -55,6 +56,7 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
55static atomic_t rcu_barrier_cpu_count; 56static atomic_t rcu_barrier_cpu_count;
56static DEFINE_MUTEX(rcu_barrier_mutex); 57static DEFINE_MUTEX(rcu_barrier_mutex);
57static struct completion rcu_barrier_completion; 58static struct completion rcu_barrier_completion;
59int rcu_scheduler_active __read_mostly;
58 60
59/* 61/*
60 * Awaken the corresponding synchronize_rcu() instance now that a 62 * Awaken the corresponding synchronize_rcu() instance now that a
@@ -80,6 +82,10 @@ void wakeme_after_rcu(struct rcu_head *head)
80void synchronize_rcu(void) 82void synchronize_rcu(void)
81{ 83{
82 struct rcu_synchronize rcu; 84 struct rcu_synchronize rcu;
85
86 if (rcu_blocking_is_gp())
87 return;
88
83 init_completion(&rcu.completion); 89 init_completion(&rcu.completion);
84 /* Will wake me after RCU finished. */ 90 /* Will wake me after RCU finished. */
85 call_rcu(&rcu.head, wakeme_after_rcu); 91 call_rcu(&rcu.head, wakeme_after_rcu);
@@ -116,6 +122,8 @@ static void rcu_barrier_func(void *type)
116 } 122 }
117} 123}
118 124
125static inline void wait_migrated_callbacks(void);
126
119/* 127/*
120 * Orchestrate the specified type of RCU barrier, waiting for all 128 * Orchestrate the specified type of RCU barrier, waiting for all
121 * RCU callbacks of the specified type to complete. 129 * RCU callbacks of the specified type to complete.
@@ -141,6 +149,7 @@ static void _rcu_barrier(enum rcu_barrier type)
141 complete(&rcu_barrier_completion); 149 complete(&rcu_barrier_completion);
142 wait_for_completion(&rcu_barrier_completion); 150 wait_for_completion(&rcu_barrier_completion);
143 mutex_unlock(&rcu_barrier_mutex); 151 mutex_unlock(&rcu_barrier_mutex);
152 wait_migrated_callbacks();
144} 153}
145 154
146/** 155/**
@@ -170,8 +179,55 @@ void rcu_barrier_sched(void)
170} 179}
171EXPORT_SYMBOL_GPL(rcu_barrier_sched); 180EXPORT_SYMBOL_GPL(rcu_barrier_sched);
172 181
182static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
183static struct rcu_head rcu_migrate_head[3];
184static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
185
186static void rcu_migrate_callback(struct rcu_head *notused)
187{
188 if (atomic_dec_and_test(&rcu_migrate_type_count))
189 wake_up(&rcu_migrate_wq);
190}
191
192static inline void wait_migrated_callbacks(void)
193{
194 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
195}
196
197static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
198 unsigned long action, void *hcpu)
199{
200 if (action == CPU_DYING) {
201 /*
202 * preempt_disable() in on_each_cpu() prevents stop_machine(),
203 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
204 * returns, all online cpus have queued rcu_barrier_func(),
205 * and the dead cpu(if it exist) queues rcu_migrate_callback()s.
206 *
207 * These callbacks ensure _rcu_barrier() waits for all
208 * RCU callbacks of the specified type to complete.
209 */
210 atomic_set(&rcu_migrate_type_count, 3);
211 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
212 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
213 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
214 } else if (action == CPU_POST_DEAD) {
215 /* rcu_migrate_head is protected by cpu_add_remove_lock */
216 wait_migrated_callbacks();
217 }
218
219 return NOTIFY_OK;
220}
221
173void __init rcu_init(void) 222void __init rcu_init(void)
174{ 223{
175 __rcu_init(); 224 __rcu_init();
225 hotcpu_notifier(rcu_barrier_cpu_hotplug, 0);
176} 226}
177 227
228void rcu_scheduler_starting(void)
229{
230 WARN_ON(num_online_cpus() != 1);
231 WARN_ON(nr_context_switches() > 0);
232 rcu_scheduler_active = 1;
233}
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 33cfc50781f9..ce97a4df64d3 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -147,7 +147,51 @@ struct rcu_ctrlblk {
147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */ 147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
148}; 148};
149 149
150struct rcu_dyntick_sched {
151 int dynticks;
152 int dynticks_snap;
153 int sched_qs;
154 int sched_qs_snap;
155 int sched_dynticks_snap;
156};
157
158static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
159 .dynticks = 1,
160};
161
162void rcu_qsctr_inc(int cpu)
163{
164 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
165
166 rdssp->sched_qs++;
167}
168
169#ifdef CONFIG_NO_HZ
170
171void rcu_enter_nohz(void)
172{
173 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
174
175 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
176 __get_cpu_var(rcu_dyntick_sched).dynticks++;
177 WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
178}
179
180void rcu_exit_nohz(void)
181{
182 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
183
184 __get_cpu_var(rcu_dyntick_sched).dynticks++;
185 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
186 WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
187 &rs);
188}
189
190#endif /* CONFIG_NO_HZ */
191
192
150static DEFINE_PER_CPU(struct rcu_data, rcu_data); 193static DEFINE_PER_CPU(struct rcu_data, rcu_data);
194
151static struct rcu_ctrlblk rcu_ctrlblk = { 195static struct rcu_ctrlblk rcu_ctrlblk = {
152 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), 196 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
153 .completed = 0, 197 .completed = 0,
@@ -427,10 +471,6 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
427 } 471 }
428} 472}
429 473
430DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
431 .dynticks = 1,
432};
433
434#ifdef CONFIG_NO_HZ 474#ifdef CONFIG_NO_HZ
435static DEFINE_PER_CPU(int, rcu_update_flag); 475static DEFINE_PER_CPU(int, rcu_update_flag);
436 476
@@ -1181,6 +1221,9 @@ void __synchronize_sched(void)
1181{ 1221{
1182 struct rcu_synchronize rcu; 1222 struct rcu_synchronize rcu;
1183 1223
1224 if (num_online_cpus() == 1)
1225 return; /* blocking is gp if only one CPU! */
1226
1184 init_completion(&rcu.completion); 1227 init_completion(&rcu.completion);
1185 /* Will wake me after RCU finished. */ 1228 /* Will wake me after RCU finished. */
1186 call_rcu_sched(&rcu.head, wakeme_after_rcu); 1229 call_rcu_sched(&rcu.head, wakeme_after_rcu);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 7c4142a79f0a..9b4a975a4b4a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -126,6 +126,7 @@ static atomic_t n_rcu_torture_mberror;
126static atomic_t n_rcu_torture_error; 126static atomic_t n_rcu_torture_error;
127static long n_rcu_torture_timers = 0; 127static long n_rcu_torture_timers = 0;
128static struct list_head rcu_torture_removed; 128static struct list_head rcu_torture_removed;
129static cpumask_var_t shuffle_tmp_mask;
129 130
130static int stutter_pause_test = 0; 131static int stutter_pause_test = 0;
131 132
@@ -889,10 +890,9 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
889 */ 890 */
890static void rcu_torture_shuffle_tasks(void) 891static void rcu_torture_shuffle_tasks(void)
891{ 892{
892 cpumask_t tmp_mask;
893 int i; 893 int i;
894 894
895 cpus_setall(tmp_mask); 895 cpumask_setall(shuffle_tmp_mask);
896 get_online_cpus(); 896 get_online_cpus();
897 897
898 /* No point in shuffling if there is only one online CPU (ex: UP) */ 898 /* No point in shuffling if there is only one online CPU (ex: UP) */
@@ -902,29 +902,29 @@ static void rcu_torture_shuffle_tasks(void)
902 } 902 }
903 903
904 if (rcu_idle_cpu != -1) 904 if (rcu_idle_cpu != -1)
905 cpu_clear(rcu_idle_cpu, tmp_mask); 905 cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask);
906 906
907 set_cpus_allowed_ptr(current, &tmp_mask); 907 set_cpus_allowed_ptr(current, shuffle_tmp_mask);
908 908
909 if (reader_tasks) { 909 if (reader_tasks) {
910 for (i = 0; i < nrealreaders; i++) 910 for (i = 0; i < nrealreaders; i++)
911 if (reader_tasks[i]) 911 if (reader_tasks[i])
912 set_cpus_allowed_ptr(reader_tasks[i], 912 set_cpus_allowed_ptr(reader_tasks[i],
913 &tmp_mask); 913 shuffle_tmp_mask);
914 } 914 }
915 915
916 if (fakewriter_tasks) { 916 if (fakewriter_tasks) {
917 for (i = 0; i < nfakewriters; i++) 917 for (i = 0; i < nfakewriters; i++)
918 if (fakewriter_tasks[i]) 918 if (fakewriter_tasks[i])
919 set_cpus_allowed_ptr(fakewriter_tasks[i], 919 set_cpus_allowed_ptr(fakewriter_tasks[i],
920 &tmp_mask); 920 shuffle_tmp_mask);
921 } 921 }
922 922
923 if (writer_task) 923 if (writer_task)
924 set_cpus_allowed_ptr(writer_task, &tmp_mask); 924 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
925 925
926 if (stats_task) 926 if (stats_task)
927 set_cpus_allowed_ptr(stats_task, &tmp_mask); 927 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
928 928
929 if (rcu_idle_cpu == -1) 929 if (rcu_idle_cpu == -1)
930 rcu_idle_cpu = num_online_cpus() - 1; 930 rcu_idle_cpu = num_online_cpus() - 1;
@@ -1012,6 +1012,7 @@ rcu_torture_cleanup(void)
1012 if (shuffler_task) { 1012 if (shuffler_task) {
1013 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); 1013 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
1014 kthread_stop(shuffler_task); 1014 kthread_stop(shuffler_task);
1015 free_cpumask_var(shuffle_tmp_mask);
1015 } 1016 }
1016 shuffler_task = NULL; 1017 shuffler_task = NULL;
1017 1018
@@ -1190,10 +1191,18 @@ rcu_torture_init(void)
1190 } 1191 }
1191 if (test_no_idle_hz) { 1192 if (test_no_idle_hz) {
1192 rcu_idle_cpu = num_online_cpus() - 1; 1193 rcu_idle_cpu = num_online_cpus() - 1;
1194
1195 if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
1196 firsterr = -ENOMEM;
1197 VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask");
1198 goto unwind;
1199 }
1200
1193 /* Create the shuffler thread */ 1201 /* Create the shuffler thread */
1194 shuffler_task = kthread_run(rcu_torture_shuffle, NULL, 1202 shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
1195 "rcu_torture_shuffle"); 1203 "rcu_torture_shuffle");
1196 if (IS_ERR(shuffler_task)) { 1204 if (IS_ERR(shuffler_task)) {
1205 free_cpumask_var(shuffle_tmp_mask);
1197 firsterr = PTR_ERR(shuffler_task); 1206 firsterr = PTR_ERR(shuffler_task);
1198 VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); 1207 VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
1199 shuffler_task = NULL; 1208 shuffler_task = NULL;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f2d8638e6c60..7f3266922572 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -78,6 +78,26 @@ DEFINE_PER_CPU(struct rcu_data, rcu_data);
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80 80
81/*
82 * Increment the quiescent state counter.
83 * The counter is a bit degenerated: We do not need to know
84 * how many quiescent states passed, just if there was at least
85 * one since the start of the grace period. Thus just a flag.
86 */
87void rcu_qsctr_inc(int cpu)
88{
89 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
90 rdp->passed_quiesc = 1;
91 rdp->passed_quiesc_completed = rdp->completed;
92}
93
94void rcu_bh_qsctr_inc(int cpu)
95{
96 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
97 rdp->passed_quiesc = 1;
98 rdp->passed_quiesc_completed = rdp->completed;
99}
100
81#ifdef CONFIG_NO_HZ 101#ifdef CONFIG_NO_HZ
82DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 102DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
83 .dynticks_nesting = 1, 103 .dynticks_nesting = 1,
@@ -948,8 +968,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
948void rcu_check_callbacks(int cpu, int user) 968void rcu_check_callbacks(int cpu, int user)
949{ 969{
950 if (user || 970 if (user ||
951 (idle_cpu(cpu) && !in_softirq() && 971 (idle_cpu(cpu) && rcu_scheduler_active &&
952 hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 972 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
953 973
954 /* 974 /*
955 * Get here if this CPU took its interrupt from user 975 * Get here if this CPU took its interrupt from user
@@ -1314,7 +1334,7 @@ int rcu_needs_cpu(int cpu)
1314 * access due to the fact that this CPU cannot possibly have any RCU 1334 * access due to the fact that this CPU cannot possibly have any RCU
1315 * callbacks in flight yet. 1335 * callbacks in flight yet.
1316 */ 1336 */
1317static void 1337static void __cpuinit
1318rcu_init_percpu_data(int cpu, struct rcu_state *rsp) 1338rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1319{ 1339{
1320 unsigned long flags; 1340 unsigned long flags;
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
new file mode 100644
index 000000000000..5e872bbf07f5
--- /dev/null
+++ b/kernel/rcutree.h
@@ -0,0 +1,10 @@
1
2/*
3 * RCU implementation internal declarations:
4 */
5extern struct rcu_state rcu_state;
6DECLARE_PER_CPU(struct rcu_data, rcu_data);
7
8extern struct rcu_state rcu_bh_state;
9DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
10
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d6db3e837826..4ee954f6a8d5 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -43,6 +43,8 @@
43#include <linux/debugfs.h> 43#include <linux/debugfs.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#include "rcutree.h"
47
46static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 48static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
47{ 49{
48 if (!rdp->beenonline) 50 if (!rdp->beenonline)
diff --git a/kernel/relay.c b/kernel/relay.c
index 09ac2008f77b..bc188549788f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -663,8 +663,10 @@ int relay_late_setup_files(struct rchan *chan,
663 663
664 mutex_lock(&relay_channels_mutex); 664 mutex_lock(&relay_channels_mutex);
665 /* Is chan already set up? */ 665 /* Is chan already set up? */
666 if (unlikely(chan->has_base_filename)) 666 if (unlikely(chan->has_base_filename)) {
667 mutex_unlock(&relay_channels_mutex);
667 return -EEXIST; 668 return -EEXIST;
669 }
668 chan->has_base_filename = 1; 670 chan->has_base_filename = 1;
669 chan->parent = parent; 671 chan->parent = parent;
670 curr_cpu = get_cpu(); 672 curr_cpu = get_cpu();
@@ -675,9 +677,7 @@ int relay_late_setup_files(struct rchan *chan,
675 */ 677 */
676 for_each_online_cpu(i) { 678 for_each_online_cpu(i) {
677 if (unlikely(!chan->buf[i])) { 679 if (unlikely(!chan->buf[i])) {
678 printk(KERN_ERR "relay_late_setup_files: CPU %u " 680 WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
679 "has no buffer, it must have!\n", i);
680 BUG();
681 err = -EINVAL; 681 err = -EINVAL;
682 break; 682 break;
683 } 683 }
@@ -748,7 +748,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
748 * from the scheduler (trying to re-grab 748 * from the scheduler (trying to re-grab
749 * rq->lock), so defer it. 749 * rq->lock), so defer it.
750 */ 750 */
751 __mod_timer(&buf->timer, jiffies + 1); 751 mod_timer(&buf->timer, jiffies + 1);
752 } 752 }
753 753
754 old = buf->data; 754 old = buf->data;
@@ -795,13 +795,15 @@ void relay_subbufs_consumed(struct rchan *chan,
795 if (!chan) 795 if (!chan)
796 return; 796 return;
797 797
798 if (cpu >= NR_CPUS || !chan->buf[cpu]) 798 if (cpu >= NR_CPUS || !chan->buf[cpu] ||
799 subbufs_consumed > chan->n_subbufs)
799 return; 800 return;
800 801
801 buf = chan->buf[cpu]; 802 buf = chan->buf[cpu];
802 buf->subbufs_consumed += subbufs_consumed; 803 if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
803 if (buf->subbufs_consumed > buf->subbufs_produced)
804 buf->subbufs_consumed = buf->subbufs_produced; 804 buf->subbufs_consumed = buf->subbufs_produced;
805 else
806 buf->subbufs_consumed += subbufs_consumed;
805} 807}
806EXPORT_SYMBOL_GPL(relay_subbufs_consumed); 808EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
807 809
diff --git a/kernel/resource.c b/kernel/resource.c
index ca6a1536b205..fd5d7d574bb9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -620,6 +620,7 @@ resource_size_t resource_alignment(struct resource *res)
620 * @start: resource start address 620 * @start: resource start address
621 * @n: resource region size 621 * @n: resource region size
622 * @name: reserving caller's ID string 622 * @name: reserving caller's ID string
623 * @flags: IO resource flags
623 */ 624 */
624struct resource * __request_region(struct resource *parent, 625struct resource * __request_region(struct resource *parent,
625 resource_size_t start, resource_size_t n, 626 resource_size_t start, resource_size_t n,
diff --git a/kernel/sched.c b/kernel/sched.c
index 8be2c13b50d0..6cc1fd5d5072 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
223{ 223{
224 ktime_t now; 224 ktime_t now;
225 225
226 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) 226 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
227 return; 227 return;
228 228
229 if (hrtimer_active(&rt_b->rt_period_timer)) 229 if (hrtimer_active(&rt_b->rt_period_timer))
@@ -231,13 +231,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
231 231
232 spin_lock(&rt_b->rt_runtime_lock); 232 spin_lock(&rt_b->rt_runtime_lock);
233 for (;;) { 233 for (;;) {
234 unsigned long delta;
235 ktime_t soft, hard;
236
234 if (hrtimer_active(&rt_b->rt_period_timer)) 237 if (hrtimer_active(&rt_b->rt_period_timer))
235 break; 238 break;
236 239
237 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 240 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
238 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 241 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
239 hrtimer_start_expires(&rt_b->rt_period_timer, 242
240 HRTIMER_MODE_ABS); 243 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
244 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
245 delta = ktime_to_ns(ktime_sub(hard, soft));
246 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
247 HRTIMER_MODE_ABS, 0);
241 } 248 }
242 spin_unlock(&rt_b->rt_runtime_lock); 249 spin_unlock(&rt_b->rt_runtime_lock);
243} 250}
@@ -331,6 +338,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
331 */ 338 */
332static DEFINE_SPINLOCK(task_group_lock); 339static DEFINE_SPINLOCK(task_group_lock);
333 340
341#ifdef CONFIG_SMP
342static int root_task_group_empty(void)
343{
344 return list_empty(&root_task_group.children);
345}
346#endif
347
334#ifdef CONFIG_FAIR_GROUP_SCHED 348#ifdef CONFIG_FAIR_GROUP_SCHED
335#ifdef CONFIG_USER_SCHED 349#ifdef CONFIG_USER_SCHED
336# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 350# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@ -391,6 +405,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
391 405
392#else 406#else
393 407
408#ifdef CONFIG_SMP
409static int root_task_group_empty(void)
410{
411 return 1;
412}
413#endif
414
394static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 415static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
395static inline struct task_group *task_group(struct task_struct *p) 416static inline struct task_group *task_group(struct task_struct *p)
396{ 417{
@@ -467,11 +488,17 @@ struct rt_rq {
467 struct rt_prio_array active; 488 struct rt_prio_array active;
468 unsigned long rt_nr_running; 489 unsigned long rt_nr_running;
469#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 490#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
470 int highest_prio; /* highest queued rt task prio */ 491 struct {
492 int curr; /* highest queued rt task prio */
493#ifdef CONFIG_SMP
494 int next; /* next highest */
495#endif
496 } highest_prio;
471#endif 497#endif
472#ifdef CONFIG_SMP 498#ifdef CONFIG_SMP
473 unsigned long rt_nr_migratory; 499 unsigned long rt_nr_migratory;
474 int overloaded; 500 int overloaded;
501 struct plist_head pushable_tasks;
475#endif 502#endif
476 int rt_throttled; 503 int rt_throttled;
477 u64 rt_time; 504 u64 rt_time;
@@ -549,7 +576,6 @@ struct rq {
549 unsigned long nr_running; 576 unsigned long nr_running;
550 #define CPU_LOAD_IDX_MAX 5 577 #define CPU_LOAD_IDX_MAX 5
551 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 578 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
552 unsigned char idle_at_tick;
553#ifdef CONFIG_NO_HZ 579#ifdef CONFIG_NO_HZ
554 unsigned long last_tick_seen; 580 unsigned long last_tick_seen;
555 unsigned char in_nohz_recently; 581 unsigned char in_nohz_recently;
@@ -590,6 +616,7 @@ struct rq {
590 struct root_domain *rd; 616 struct root_domain *rd;
591 struct sched_domain *sd; 617 struct sched_domain *sd;
592 618
619 unsigned char idle_at_tick;
593 /* For active balancing */ 620 /* For active balancing */
594 int active_balance; 621 int active_balance;
595 int push_cpu; 622 int push_cpu;
@@ -618,9 +645,6 @@ struct rq {
618 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 645 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
619 646
620 /* sys_sched_yield() stats */ 647 /* sys_sched_yield() stats */
621 unsigned int yld_exp_empty;
622 unsigned int yld_act_empty;
623 unsigned int yld_both_empty;
624 unsigned int yld_count; 648 unsigned int yld_count;
625 649
626 /* schedule() stats */ 650 /* schedule() stats */
@@ -1093,7 +1117,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
1093 if (rq == this_rq()) { 1117 if (rq == this_rq()) {
1094 hrtimer_restart(timer); 1118 hrtimer_restart(timer);
1095 } else if (!rq->hrtick_csd_pending) { 1119 } else if (!rq->hrtick_csd_pending) {
1096 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); 1120 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1097 rq->hrtick_csd_pending = 1; 1121 rq->hrtick_csd_pending = 1;
1098 } 1122 }
1099} 1123}
@@ -1129,7 +1153,8 @@ static __init void init_hrtick(void)
1129 */ 1153 */
1130static void hrtick_start(struct rq *rq, u64 delay) 1154static void hrtick_start(struct rq *rq, u64 delay)
1131{ 1155{
1132 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); 1156 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1157 HRTIMER_MODE_REL, 0);
1133} 1158}
1134 1159
1135static inline void init_hrtick(void) 1160static inline void init_hrtick(void)
@@ -1183,10 +1208,10 @@ static void resched_task(struct task_struct *p)
1183 1208
1184 assert_spin_locked(&task_rq(p)->lock); 1209 assert_spin_locked(&task_rq(p)->lock);
1185 1210
1186 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 1211 if (test_tsk_need_resched(p))
1187 return; 1212 return;
1188 1213
1189 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 1214 set_tsk_need_resched(p);
1190 1215
1191 cpu = task_cpu(p); 1216 cpu = task_cpu(p);
1192 if (cpu == smp_processor_id()) 1217 if (cpu == smp_processor_id())
@@ -1242,7 +1267,7 @@ void wake_up_idle_cpu(int cpu)
1242 * lockless. The worst case is that the other CPU runs the 1267 * lockless. The worst case is that the other CPU runs the
1243 * idle task through an additional NOOP schedule() 1268 * idle task through an additional NOOP schedule()
1244 */ 1269 */
1245 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); 1270 set_tsk_need_resched(rq->idle);
1246 1271
1247 /* NEED_RESCHED must be visible before we test polling */ 1272 /* NEED_RESCHED must be visible before we test polling */
1248 smp_mb(); 1273 smp_mb();
@@ -1323,8 +1348,8 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1323 * slice expiry etc. 1348 * slice expiry etc.
1324 */ 1349 */
1325 1350
1326#define WEIGHT_IDLEPRIO 2 1351#define WEIGHT_IDLEPRIO 3
1327#define WMULT_IDLEPRIO (1 << 31) 1352#define WMULT_IDLEPRIO 1431655765
1328 1353
1329/* 1354/*
1330 * Nice levels are multiplicative, with a gentle 10% change for every 1355 * Nice levels are multiplicative, with a gentle 10% change for every
@@ -1610,21 +1635,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1610 1635
1611#endif 1636#endif
1612 1637
1638#ifdef CONFIG_PREEMPT
1639
1613/* 1640/*
1614 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1641 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1642 * way at the expense of forcing extra atomic operations in all
1643 * invocations. This assures that the double_lock is acquired using the
1644 * same underlying policy as the spinlock_t on this architecture, which
1645 * reduces latency compared to the unfair variant below. However, it
1646 * also adds more overhead and therefore may reduce throughput.
1615 */ 1647 */
1616static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1648static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1649 __releases(this_rq->lock)
1650 __acquires(busiest->lock)
1651 __acquires(this_rq->lock)
1652{
1653 spin_unlock(&this_rq->lock);
1654 double_rq_lock(this_rq, busiest);
1655
1656 return 1;
1657}
1658
1659#else
1660/*
1661 * Unfair double_lock_balance: Optimizes throughput at the expense of
1662 * latency by eliminating extra atomic operations when the locks are
1663 * already in proper order on entry. This favors lower cpu-ids and will
1664 * grant the double lock to lower cpus over higher ids under contention,
1665 * regardless of entry order into the function.
1666 */
1667static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1617 __releases(this_rq->lock) 1668 __releases(this_rq->lock)
1618 __acquires(busiest->lock) 1669 __acquires(busiest->lock)
1619 __acquires(this_rq->lock) 1670 __acquires(this_rq->lock)
1620{ 1671{
1621 int ret = 0; 1672 int ret = 0;
1622 1673
1623 if (unlikely(!irqs_disabled())) {
1624 /* printk() doesn't work good under rq->lock */
1625 spin_unlock(&this_rq->lock);
1626 BUG_ON(1);
1627 }
1628 if (unlikely(!spin_trylock(&busiest->lock))) { 1674 if (unlikely(!spin_trylock(&busiest->lock))) {
1629 if (busiest < this_rq) { 1675 if (busiest < this_rq) {
1630 spin_unlock(&this_rq->lock); 1676 spin_unlock(&this_rq->lock);
@@ -1637,6 +1683,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1637 return ret; 1683 return ret;
1638} 1684}
1639 1685
1686#endif /* CONFIG_PREEMPT */
1687
1688/*
1689 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1690 */
1691static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1692{
1693 if (unlikely(!irqs_disabled())) {
1694 /* printk() doesn't work good under rq->lock */
1695 spin_unlock(&this_rq->lock);
1696 BUG_ON(1);
1697 }
1698
1699 return _double_lock_balance(this_rq, busiest);
1700}
1701
1640static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1702static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1641 __releases(busiest->lock) 1703 __releases(busiest->lock)
1642{ 1704{
@@ -1705,6 +1767,9 @@ static void update_avg(u64 *avg, u64 sample)
1705 1767
1706static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1768static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1707{ 1769{
1770 if (wakeup)
1771 p->se.start_runtime = p->se.sum_exec_runtime;
1772
1708 sched_info_queued(p); 1773 sched_info_queued(p);
1709 p->sched_class->enqueue_task(rq, p, wakeup); 1774 p->sched_class->enqueue_task(rq, p, wakeup);
1710 p->se.on_rq = 1; 1775 p->se.on_rq = 1;
@@ -1712,10 +1777,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1712 1777
1713static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1778static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1714{ 1779{
1715 if (sleep && p->se.last_wakeup) { 1780 if (sleep) {
1716 update_avg(&p->se.avg_overlap, 1781 if (p->se.last_wakeup) {
1717 p->se.sum_exec_runtime - p->se.last_wakeup); 1782 update_avg(&p->se.avg_overlap,
1718 p->se.last_wakeup = 0; 1783 p->se.sum_exec_runtime - p->se.last_wakeup);
1784 p->se.last_wakeup = 0;
1785 } else {
1786 update_avg(&p->se.avg_wakeup,
1787 sysctl_sched_wakeup_granularity);
1788 }
1719 } 1789 }
1720 1790
1721 sched_info_dequeued(p); 1791 sched_info_dequeued(p);
@@ -2017,7 +2087,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2017 * it must be off the runqueue _entirely_, and not 2087 * it must be off the runqueue _entirely_, and not
2018 * preempted! 2088 * preempted!
2019 * 2089 *
2020 * So if it wa still runnable (but just not actively 2090 * So if it was still runnable (but just not actively
2021 * running right now), it's preempted, and we should 2091 * running right now), it's preempted, and we should
2022 * yield - it could be a while. 2092 * yield - it could be a while.
2023 */ 2093 */
@@ -2267,7 +2337,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2267 sync = 0; 2337 sync = 0;
2268 2338
2269#ifdef CONFIG_SMP 2339#ifdef CONFIG_SMP
2270 if (sched_feat(LB_WAKEUP_UPDATE)) { 2340 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2271 struct sched_domain *sd; 2341 struct sched_domain *sd;
2272 2342
2273 this_cpu = raw_smp_processor_id(); 2343 this_cpu = raw_smp_processor_id();
@@ -2345,6 +2415,22 @@ out_activate:
2345 activate_task(rq, p, 1); 2415 activate_task(rq, p, 1);
2346 success = 1; 2416 success = 1;
2347 2417
2418 /*
2419 * Only attribute actual wakeups done by this task.
2420 */
2421 if (!in_interrupt()) {
2422 struct sched_entity *se = &current->se;
2423 u64 sample = se->sum_exec_runtime;
2424
2425 if (se->last_wakeup)
2426 sample -= se->last_wakeup;
2427 else
2428 sample -= se->start_runtime;
2429 update_avg(&se->avg_wakeup, sample);
2430
2431 se->last_wakeup = se->sum_exec_runtime;
2432 }
2433
2348out_running: 2434out_running:
2349 trace_sched_wakeup(rq, p, success); 2435 trace_sched_wakeup(rq, p, success);
2350 check_preempt_curr(rq, p, sync); 2436 check_preempt_curr(rq, p, sync);
@@ -2355,8 +2441,6 @@ out_running:
2355 p->sched_class->task_wake_up(rq, p); 2441 p->sched_class->task_wake_up(rq, p);
2356#endif 2442#endif
2357out: 2443out:
2358 current->se.last_wakeup = current->se.sum_exec_runtime;
2359
2360 task_rq_unlock(rq, &flags); 2444 task_rq_unlock(rq, &flags);
2361 2445
2362 return success; 2446 return success;
@@ -2386,6 +2470,8 @@ static void __sched_fork(struct task_struct *p)
2386 p->se.prev_sum_exec_runtime = 0; 2470 p->se.prev_sum_exec_runtime = 0;
2387 p->se.last_wakeup = 0; 2471 p->se.last_wakeup = 0;
2388 p->se.avg_overlap = 0; 2472 p->se.avg_overlap = 0;
2473 p->se.start_runtime = 0;
2474 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2389 2475
2390#ifdef CONFIG_SCHEDSTATS 2476#ifdef CONFIG_SCHEDSTATS
2391 p->se.wait_start = 0; 2477 p->se.wait_start = 0;
@@ -2448,6 +2534,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
2448 /* Want to start with kernel preemption disabled. */ 2534 /* Want to start with kernel preemption disabled. */
2449 task_thread_info(p)->preempt_count = 1; 2535 task_thread_info(p)->preempt_count = 1;
2450#endif 2536#endif
2537 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2538
2451 put_cpu(); 2539 put_cpu();
2452} 2540}
2453 2541
@@ -2491,7 +2579,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2491#ifdef CONFIG_PREEMPT_NOTIFIERS 2579#ifdef CONFIG_PREEMPT_NOTIFIERS
2492 2580
2493/** 2581/**
2494 * preempt_notifier_register - tell me when current is being being preempted & rescheduled 2582 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2495 * @notifier: notifier struct to register 2583 * @notifier: notifier struct to register
2496 */ 2584 */
2497void preempt_notifier_register(struct preempt_notifier *notifier) 2585void preempt_notifier_register(struct preempt_notifier *notifier)
@@ -2588,6 +2676,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2588{ 2676{
2589 struct mm_struct *mm = rq->prev_mm; 2677 struct mm_struct *mm = rq->prev_mm;
2590 long prev_state; 2678 long prev_state;
2679#ifdef CONFIG_SMP
2680 int post_schedule = 0;
2681
2682 if (current->sched_class->needs_post_schedule)
2683 post_schedule = current->sched_class->needs_post_schedule(rq);
2684#endif
2591 2685
2592 rq->prev_mm = NULL; 2686 rq->prev_mm = NULL;
2593 2687
@@ -2606,7 +2700,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2606 finish_arch_switch(prev); 2700 finish_arch_switch(prev);
2607 finish_lock_switch(rq, prev); 2701 finish_lock_switch(rq, prev);
2608#ifdef CONFIG_SMP 2702#ifdef CONFIG_SMP
2609 if (current->sched_class->post_schedule) 2703 if (post_schedule)
2610 current->sched_class->post_schedule(rq); 2704 current->sched_class->post_schedule(rq);
2611#endif 2705#endif
2612 2706
@@ -2913,6 +3007,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2913 struct sched_domain *sd, enum cpu_idle_type idle, 3007 struct sched_domain *sd, enum cpu_idle_type idle,
2914 int *all_pinned) 3008 int *all_pinned)
2915{ 3009{
3010 int tsk_cache_hot = 0;
2916 /* 3011 /*
2917 * We do not migrate tasks that are: 3012 * We do not migrate tasks that are:
2918 * 1) running (obviously), or 3013 * 1) running (obviously), or
@@ -2936,10 +3031,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2936 * 2) too many balance attempts have failed. 3031 * 2) too many balance attempts have failed.
2937 */ 3032 */
2938 3033
2939 if (!task_hot(p, rq->clock, sd) || 3034 tsk_cache_hot = task_hot(p, rq->clock, sd);
2940 sd->nr_balance_failed > sd->cache_nice_tries) { 3035 if (!tsk_cache_hot ||
3036 sd->nr_balance_failed > sd->cache_nice_tries) {
2941#ifdef CONFIG_SCHEDSTATS 3037#ifdef CONFIG_SCHEDSTATS
2942 if (task_hot(p, rq->clock, sd)) { 3038 if (tsk_cache_hot) {
2943 schedstat_inc(sd, lb_hot_gained[idle]); 3039 schedstat_inc(sd, lb_hot_gained[idle]);
2944 schedstat_inc(p, se.nr_forced_migrations); 3040 schedstat_inc(p, se.nr_forced_migrations);
2945 } 3041 }
@@ -2947,7 +3043,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2947 return 1; 3043 return 1;
2948 } 3044 }
2949 3045
2950 if (task_hot(p, rq->clock, sd)) { 3046 if (tsk_cache_hot) {
2951 schedstat_inc(p, se.nr_failed_migrations_hot); 3047 schedstat_inc(p, se.nr_failed_migrations_hot);
2952 return 0; 3048 return 0;
2953 } 3049 }
@@ -2987,6 +3083,16 @@ next:
2987 pulled++; 3083 pulled++;
2988 rem_load_move -= p->se.load.weight; 3084 rem_load_move -= p->se.load.weight;
2989 3085
3086#ifdef CONFIG_PREEMPT
3087 /*
3088 * NEWIDLE balancing is a source of latency, so preemptible kernels
3089 * will stop after the first task is pulled to minimize the critical
3090 * section.
3091 */
3092 if (idle == CPU_NEWLY_IDLE)
3093 goto out;
3094#endif
3095
2990 /* 3096 /*
2991 * We only want to steal up to the prescribed amount of weighted load. 3097 * We only want to steal up to the prescribed amount of weighted load.
2992 */ 3098 */
@@ -3033,9 +3139,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3033 sd, idle, all_pinned, &this_best_prio); 3139 sd, idle, all_pinned, &this_best_prio);
3034 class = class->next; 3140 class = class->next;
3035 3141
3142#ifdef CONFIG_PREEMPT
3143 /*
3144 * NEWIDLE balancing is a source of latency, so preemptible
3145 * kernels will stop after the first task is pulled to minimize
3146 * the critical section.
3147 */
3036 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3148 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3037 break; 3149 break;
3038 3150#endif
3039 } while (class && max_load_move > total_load_moved); 3151 } while (class && max_load_move > total_load_moved);
3040 3152
3041 return total_load_moved > 0; 3153 return total_load_moved > 0;
@@ -3085,246 +3197,480 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3085 3197
3086 return 0; 3198 return 0;
3087} 3199}
3088 3200/********** Helpers for find_busiest_group ************************/
3089/* 3201/*
3090 * find_busiest_group finds and returns the busiest CPU group within the 3202 * sd_lb_stats - Structure to store the statistics of a sched_domain
3091 * domain. It calculates and returns the amount of weighted load which 3203 * during load balancing.
3092 * should be moved to restore balance via the imbalance parameter.
3093 */ 3204 */
3094static struct sched_group * 3205struct sd_lb_stats {
3095find_busiest_group(struct sched_domain *sd, int this_cpu, 3206 struct sched_group *busiest; /* Busiest group in this sd */
3096 unsigned long *imbalance, enum cpu_idle_type idle, 3207 struct sched_group *this; /* Local group in this sd */
3097 int *sd_idle, const struct cpumask *cpus, int *balance) 3208 unsigned long total_load; /* Total load of all groups in sd */
3098{ 3209 unsigned long total_pwr; /* Total power of all groups in sd */
3099 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3210 unsigned long avg_load; /* Average load across all groups in sd */
3100 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3211
3101 unsigned long max_pull; 3212 /** Statistics of this group */
3102 unsigned long busiest_load_per_task, busiest_nr_running; 3213 unsigned long this_load;
3103 unsigned long this_load_per_task, this_nr_running; 3214 unsigned long this_load_per_task;
3104 int load_idx, group_imb = 0; 3215 unsigned long this_nr_running;
3216
3217 /* Statistics of the busiest group */
3218 unsigned long max_load;
3219 unsigned long busiest_load_per_task;
3220 unsigned long busiest_nr_running;
3221
3222 int group_imb; /* Is there imbalance in this sd */
3105#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3223#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3106 int power_savings_balance = 1; 3224 int power_savings_balance; /* Is powersave balance needed for this sd */
3107 unsigned long leader_nr_running = 0, min_load_per_task = 0; 3225 struct sched_group *group_min; /* Least loaded group in sd */
3108 unsigned long min_nr_running = ULONG_MAX; 3226 struct sched_group *group_leader; /* Group which relieves group_min */
3109 struct sched_group *group_min = NULL, *group_leader = NULL; 3227 unsigned long min_load_per_task; /* load_per_task in group_min */
3228 unsigned long leader_nr_running; /* Nr running of group_leader */
3229 unsigned long min_nr_running; /* Nr running of group_min */
3110#endif 3230#endif
3231};
3111 3232
3112 max_load = this_load = total_load = total_pwr = 0; 3233/*
3113 busiest_load_per_task = busiest_nr_running = 0; 3234 * sg_lb_stats - stats of a sched_group required for load_balancing
3114 this_load_per_task = this_nr_running = 0; 3235 */
3236struct sg_lb_stats {
3237 unsigned long avg_load; /* Avg load across the CPUs of the group */
3238 unsigned long group_load; /* Total load over the CPUs of the group */
3239 unsigned long sum_nr_running; /* Nr tasks running in the group */
3240 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3241 unsigned long group_capacity;
3242 int group_imb; /* Is there an imbalance in the group ? */
3243};
3115 3244
3116 if (idle == CPU_NOT_IDLE) 3245/**
3246 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3247 * @group: The group whose first cpu is to be returned.
3248 */
3249static inline unsigned int group_first_cpu(struct sched_group *group)
3250{
3251 return cpumask_first(sched_group_cpus(group));
3252}
3253
3254/**
3255 * get_sd_load_idx - Obtain the load index for a given sched domain.
3256 * @sd: The sched_domain whose load_idx is to be obtained.
3257 * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
3258 */
3259static inline int get_sd_load_idx(struct sched_domain *sd,
3260 enum cpu_idle_type idle)
3261{
3262 int load_idx;
3263
3264 switch (idle) {
3265 case CPU_NOT_IDLE:
3117 load_idx = sd->busy_idx; 3266 load_idx = sd->busy_idx;
3118 else if (idle == CPU_NEWLY_IDLE) 3267 break;
3268
3269 case CPU_NEWLY_IDLE:
3119 load_idx = sd->newidle_idx; 3270 load_idx = sd->newidle_idx;
3120 else 3271 break;
3272 default:
3121 load_idx = sd->idle_idx; 3273 load_idx = sd->idle_idx;
3274 break;
3275 }
3122 3276
3123 do { 3277 return load_idx;
3124 unsigned long load, group_capacity, max_cpu_load, min_cpu_load; 3278}
3125 int local_group;
3126 int i;
3127 int __group_imb = 0;
3128 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3129 unsigned long sum_nr_running, sum_weighted_load;
3130 unsigned long sum_avg_load_per_task;
3131 unsigned long avg_load_per_task;
3132 3279
3133 local_group = cpumask_test_cpu(this_cpu,
3134 sched_group_cpus(group));
3135 3280
3136 if (local_group) 3281#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3137 balance_cpu = cpumask_first(sched_group_cpus(group)); 3282/**
3283 * init_sd_power_savings_stats - Initialize power savings statistics for
3284 * the given sched_domain, during load balancing.
3285 *
3286 * @sd: Sched domain whose power-savings statistics are to be initialized.
3287 * @sds: Variable containing the statistics for sd.
3288 * @idle: Idle status of the CPU at which we're performing load-balancing.
3289 */
3290static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3291 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3292{
3293 /*
3294 * Busy processors will not participate in power savings
3295 * balance.
3296 */
3297 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3298 sds->power_savings_balance = 0;
3299 else {
3300 sds->power_savings_balance = 1;
3301 sds->min_nr_running = ULONG_MAX;
3302 sds->leader_nr_running = 0;
3303 }
3304}
3138 3305
3139 /* Tally up the load of all CPUs in the group */ 3306/**
3140 sum_weighted_load = sum_nr_running = avg_load = 0; 3307 * update_sd_power_savings_stats - Update the power saving stats for a
3141 sum_avg_load_per_task = avg_load_per_task = 0; 3308 * sched_domain while performing load balancing.
3309 *
3310 * @group: sched_group belonging to the sched_domain under consideration.
3311 * @sds: Variable containing the statistics of the sched_domain
3312 * @local_group: Does group contain the CPU for which we're performing
3313 * load balancing ?
3314 * @sgs: Variable containing the statistics of the group.
3315 */
3316static inline void update_sd_power_savings_stats(struct sched_group *group,
3317 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3318{
3142 3319
3143 max_cpu_load = 0; 3320 if (!sds->power_savings_balance)
3144 min_cpu_load = ~0UL; 3321 return;
3145 3322
3146 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3323 /*
3147 struct rq *rq = cpu_rq(i); 3324 * If the local group is idle or completely loaded
3325 * no need to do power savings balance at this domain
3326 */
3327 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3328 !sds->this_nr_running))
3329 sds->power_savings_balance = 0;
3148 3330
3149 if (*sd_idle && rq->nr_running) 3331 /*
3150 *sd_idle = 0; 3332 * If a group is already running at full capacity or idle,
3333 * don't include that group in power savings calculations
3334 */
3335 if (!sds->power_savings_balance ||
3336 sgs->sum_nr_running >= sgs->group_capacity ||
3337 !sgs->sum_nr_running)
3338 return;
3151 3339
3152 /* Bias balancing toward cpus of our domain */ 3340 /*
3153 if (local_group) { 3341 * Calculate the group which has the least non-idle load.
3154 if (idle_cpu(i) && !first_idle_cpu) { 3342 * This is the group from where we need to pick up the load
3155 first_idle_cpu = 1; 3343 * for saving power
3156 balance_cpu = i; 3344 */
3157 } 3345 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3346 (sgs->sum_nr_running == sds->min_nr_running &&
3347 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3348 sds->group_min = group;
3349 sds->min_nr_running = sgs->sum_nr_running;
3350 sds->min_load_per_task = sgs->sum_weighted_load /
3351 sgs->sum_nr_running;
3352 }
3158 3353
3159 load = target_load(i, load_idx); 3354 /*
3160 } else { 3355 * Calculate the group which is almost near its
3161 load = source_load(i, load_idx); 3356 * capacity but still has some space to pick up some load
3162 if (load > max_cpu_load) 3357 * from other group and save more power
3163 max_cpu_load = load; 3358 */
3164 if (min_cpu_load > load) 3359 if (sgs->sum_nr_running > sgs->group_capacity - 1)
3165 min_cpu_load = load; 3360 return;
3166 }
3167 3361
3168 avg_load += load; 3362 if (sgs->sum_nr_running > sds->leader_nr_running ||
3169 sum_nr_running += rq->nr_running; 3363 (sgs->sum_nr_running == sds->leader_nr_running &&
3170 sum_weighted_load += weighted_cpuload(i); 3364 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3365 sds->group_leader = group;
3366 sds->leader_nr_running = sgs->sum_nr_running;
3367 }
3368}
3171 3369
3172 sum_avg_load_per_task += cpu_avg_load_per_task(i); 3370/**
3173 } 3371 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3372 * @sds: Variable containing the statistics of the sched_domain
3373 * under consideration.
3374 * @this_cpu: Cpu at which we're currently performing load-balancing.
3375 * @imbalance: Variable to store the imbalance.
3376 *
3377 * Description:
3378 * Check if we have potential to perform some power-savings balance.
3379 * If yes, set the busiest group to be the least loaded group in the
3380 * sched_domain, so that its CPUs can be put to idle.
3381 *
3382 * Returns 1 if there is potential to perform power-savings balance.
3383 * Else returns 0.
3384 */
3385static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3386 int this_cpu, unsigned long *imbalance)
3387{
3388 if (!sds->power_savings_balance)
3389 return 0;
3174 3390
3175 /* 3391 if (sds->this != sds->group_leader ||
3176 * First idle cpu or the first cpu(busiest) in this sched group 3392 sds->group_leader == sds->group_min)
3177 * is eligible for doing load balancing at this and above 3393 return 0;
3178 * domains. In the newly idle case, we will allow all the cpu's
3179 * to do the newly idle load balance.
3180 */
3181 if (idle != CPU_NEWLY_IDLE && local_group &&
3182 balance_cpu != this_cpu && balance) {
3183 *balance = 0;
3184 goto ret;
3185 }
3186 3394
3187 total_load += avg_load; 3395 *imbalance = sds->min_load_per_task;
3188 total_pwr += group->__cpu_power; 3396 sds->busiest = sds->group_min;
3189 3397
3190 /* Adjust by relative CPU power of the group */ 3398 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3191 avg_load = sg_div_cpu_power(group, 3399 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3192 avg_load * SCHED_LOAD_SCALE); 3400 group_first_cpu(sds->group_leader);
3401 }
3193 3402
3403 return 1;
3194 3404
3195 /* 3405}
3196 * Consider the group unbalanced when the imbalance is larger 3406#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3197 * than the average weight of two tasks. 3407static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3198 * 3408 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3199 * APZ: with cgroup the avg task weight can vary wildly and 3409{
3200 * might not be a suitable number - should we keep a 3410 return;
3201 * normalized nr_running number somewhere that negates 3411}
3202 * the hierarchy?
3203 */
3204 avg_load_per_task = sg_div_cpu_power(group,
3205 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3206 3412
3207 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3413static inline void update_sd_power_savings_stats(struct sched_group *group,
3208 __group_imb = 1; 3414 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3415{
3416 return;
3417}
3209 3418
3210 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3419static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3420 int this_cpu, unsigned long *imbalance)
3421{
3422 return 0;
3423}
3424#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3425
3426
3427/**
3428 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3429 * @group: sched_group whose statistics are to be updated.
3430 * @this_cpu: Cpu for which load balance is currently performed.
3431 * @idle: Idle status of this_cpu
3432 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3433 * @sd_idle: Idle status of the sched_domain containing group.
3434 * @local_group: Does group contain this_cpu.
3435 * @cpus: Set of cpus considered for load balancing.
3436 * @balance: Should we balance.
3437 * @sgs: variable to hold the statistics for this group.
3438 */
3439static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3440 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3441 int local_group, const struct cpumask *cpus,
3442 int *balance, struct sg_lb_stats *sgs)
3443{
3444 unsigned long load, max_cpu_load, min_cpu_load;
3445 int i;
3446 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3447 unsigned long sum_avg_load_per_task;
3448 unsigned long avg_load_per_task;
3449
3450 if (local_group)
3451 balance_cpu = group_first_cpu(group);
3211 3452
3453 /* Tally up the load of all CPUs in the group */
3454 sum_avg_load_per_task = avg_load_per_task = 0;
3455 max_cpu_load = 0;
3456 min_cpu_load = ~0UL;
3457
3458 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3459 struct rq *rq = cpu_rq(i);
3460
3461 if (*sd_idle && rq->nr_running)
3462 *sd_idle = 0;
3463
3464 /* Bias balancing toward cpus of our domain */
3212 if (local_group) { 3465 if (local_group) {
3213 this_load = avg_load; 3466 if (idle_cpu(i) && !first_idle_cpu) {
3214 this = group; 3467 first_idle_cpu = 1;
3215 this_nr_running = sum_nr_running; 3468 balance_cpu = i;
3216 this_load_per_task = sum_weighted_load; 3469 }
3217 } else if (avg_load > max_load && 3470
3218 (sum_nr_running > group_capacity || __group_imb)) { 3471 load = target_load(i, load_idx);
3219 max_load = avg_load; 3472 } else {
3220 busiest = group; 3473 load = source_load(i, load_idx);
3221 busiest_nr_running = sum_nr_running; 3474 if (load > max_cpu_load)
3222 busiest_load_per_task = sum_weighted_load; 3475 max_cpu_load = load;
3223 group_imb = __group_imb; 3476 if (min_cpu_load > load)
3477 min_cpu_load = load;
3224 } 3478 }
3225 3479
3226#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3480 sgs->group_load += load;
3227 /* 3481 sgs->sum_nr_running += rq->nr_running;
3228 * Busy processors will not participate in power savings 3482 sgs->sum_weighted_load += weighted_cpuload(i);
3229 * balance.
3230 */
3231 if (idle == CPU_NOT_IDLE ||
3232 !(sd->flags & SD_POWERSAVINGS_BALANCE))
3233 goto group_next;
3234 3483
3235 /* 3484 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3236 * If the local group is idle or completely loaded 3485 }
3237 * no need to do power savings balance at this domain
3238 */
3239 if (local_group && (this_nr_running >= group_capacity ||
3240 !this_nr_running))
3241 power_savings_balance = 0;
3242 3486
3243 /* 3487 /*
3244 * If a group is already running at full capacity or idle, 3488 * First idle cpu or the first cpu(busiest) in this sched group
3245 * don't include that group in power savings calculations 3489 * is eligible for doing load balancing at this and above
3246 */ 3490 * domains. In the newly idle case, we will allow all the cpu's
3247 if (!power_savings_balance || sum_nr_running >= group_capacity 3491 * to do the newly idle load balance.
3248 || !sum_nr_running) 3492 */
3249 goto group_next; 3493 if (idle != CPU_NEWLY_IDLE && local_group &&
3494 balance_cpu != this_cpu && balance) {
3495 *balance = 0;
3496 return;
3497 }
3250 3498
3251 /* 3499 /* Adjust by relative CPU power of the group */
3252 * Calculate the group which has the least non-idle load. 3500 sgs->avg_load = sg_div_cpu_power(group,
3253 * This is the group from where we need to pick up the load 3501 sgs->group_load * SCHED_LOAD_SCALE);
3254 * for saving power
3255 */
3256 if ((sum_nr_running < min_nr_running) ||
3257 (sum_nr_running == min_nr_running &&
3258 cpumask_first(sched_group_cpus(group)) >
3259 cpumask_first(sched_group_cpus(group_min)))) {
3260 group_min = group;
3261 min_nr_running = sum_nr_running;
3262 min_load_per_task = sum_weighted_load /
3263 sum_nr_running;
3264 }
3265 3502
3266 /* 3503
3267 * Calculate the group which is almost near its 3504 /*
3268 * capacity but still has some space to pick up some load 3505 * Consider the group unbalanced when the imbalance is larger
3269 * from other group and save more power 3506 * than the average weight of two tasks.
3270 */ 3507 *
3271 if (sum_nr_running <= group_capacity - 1) { 3508 * APZ: with cgroup the avg task weight can vary wildly and
3272 if (sum_nr_running > leader_nr_running || 3509 * might not be a suitable number - should we keep a
3273 (sum_nr_running == leader_nr_running && 3510 * normalized nr_running number somewhere that negates
3274 cpumask_first(sched_group_cpus(group)) < 3511 * the hierarchy?
3275 cpumask_first(sched_group_cpus(group_leader)))) { 3512 */
3276 group_leader = group; 3513 avg_load_per_task = sg_div_cpu_power(group,
3277 leader_nr_running = sum_nr_running; 3514 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3278 } 3515
3516 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3517 sgs->group_imb = 1;
3518
3519 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3520
3521}
3522
3523/**
3524 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
3525 * @sd: sched_domain whose statistics are to be updated.
3526 * @this_cpu: Cpu for which load balance is currently performed.
3527 * @idle: Idle status of this_cpu
3528 * @sd_idle: Idle status of the sched_domain containing group.
3529 * @cpus: Set of cpus considered for load balancing.
3530 * @balance: Should we balance.
3531 * @sds: variable to hold the statistics for this sched_domain.
3532 */
3533static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3534 enum cpu_idle_type idle, int *sd_idle,
3535 const struct cpumask *cpus, int *balance,
3536 struct sd_lb_stats *sds)
3537{
3538 struct sched_group *group = sd->groups;
3539 struct sg_lb_stats sgs;
3540 int load_idx;
3541
3542 init_sd_power_savings_stats(sd, sds, idle);
3543 load_idx = get_sd_load_idx(sd, idle);
3544
3545 do {
3546 int local_group;
3547
3548 local_group = cpumask_test_cpu(this_cpu,
3549 sched_group_cpus(group));
3550 memset(&sgs, 0, sizeof(sgs));
3551 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
3552 local_group, cpus, balance, &sgs);
3553
3554 if (local_group && balance && !(*balance))
3555 return;
3556
3557 sds->total_load += sgs.group_load;
3558 sds->total_pwr += group->__cpu_power;
3559
3560 if (local_group) {
3561 sds->this_load = sgs.avg_load;
3562 sds->this = group;
3563 sds->this_nr_running = sgs.sum_nr_running;
3564 sds->this_load_per_task = sgs.sum_weighted_load;
3565 } else if (sgs.avg_load > sds->max_load &&
3566 (sgs.sum_nr_running > sgs.group_capacity ||
3567 sgs.group_imb)) {
3568 sds->max_load = sgs.avg_load;
3569 sds->busiest = group;
3570 sds->busiest_nr_running = sgs.sum_nr_running;
3571 sds->busiest_load_per_task = sgs.sum_weighted_load;
3572 sds->group_imb = sgs.group_imb;
3279 } 3573 }
3280group_next: 3574
3281#endif 3575 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3282 group = group->next; 3576 group = group->next;
3283 } while (group != sd->groups); 3577 } while (group != sd->groups);
3284 3578
3285 if (!busiest || this_load >= max_load || busiest_nr_running == 0) 3579}
3286 goto out_balanced;
3287
3288 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
3289 3580
3290 if (this_load >= avg_load || 3581/**
3291 100*max_load <= sd->imbalance_pct*this_load) 3582 * fix_small_imbalance - Calculate the minor imbalance that exists
3292 goto out_balanced; 3583 * amongst the groups of a sched_domain, during
3584 * load balancing.
3585 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3586 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3587 * @imbalance: Variable to store the imbalance.
3588 */
3589static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3590 int this_cpu, unsigned long *imbalance)
3591{
3592 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3593 unsigned int imbn = 2;
3594
3595 if (sds->this_nr_running) {
3596 sds->this_load_per_task /= sds->this_nr_running;
3597 if (sds->busiest_load_per_task >
3598 sds->this_load_per_task)
3599 imbn = 1;
3600 } else
3601 sds->this_load_per_task =
3602 cpu_avg_load_per_task(this_cpu);
3293 3603
3294 busiest_load_per_task /= busiest_nr_running; 3604 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3295 if (group_imb) 3605 sds->busiest_load_per_task * imbn) {
3296 busiest_load_per_task = min(busiest_load_per_task, avg_load); 3606 *imbalance = sds->busiest_load_per_task;
3607 return;
3608 }
3297 3609
3298 /* 3610 /*
3299 * We're trying to get all the cpus to the average_load, so we don't 3611 * OK, we don't have enough imbalance to justify moving tasks,
3300 * want to push ourselves above the average load, nor do we wish to 3612 * however we may be able to increase total CPU power used by
3301 * reduce the max loaded cpu below the average load, as either of these 3613 * moving them.
3302 * actions would just result in more rebalancing later, and ping-pong
3303 * tasks around. Thus we look for the minimum possible imbalance.
3304 * Negative imbalances (*we* are more loaded than anyone else) will
3305 * be counted as no imbalance for these purposes -- we can't fix that
3306 * by pulling tasks to us. Be careful of negative numbers as they'll
3307 * appear as very large values with unsigned longs.
3308 */ 3614 */
3309 if (max_load <= busiest_load_per_task)
3310 goto out_balanced;
3311 3615
3616 pwr_now += sds->busiest->__cpu_power *
3617 min(sds->busiest_load_per_task, sds->max_load);
3618 pwr_now += sds->this->__cpu_power *
3619 min(sds->this_load_per_task, sds->this_load);
3620 pwr_now /= SCHED_LOAD_SCALE;
3621
3622 /* Amount of load we'd subtract */
3623 tmp = sg_div_cpu_power(sds->busiest,
3624 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3625 if (sds->max_load > tmp)
3626 pwr_move += sds->busiest->__cpu_power *
3627 min(sds->busiest_load_per_task, sds->max_load - tmp);
3628
3629 /* Amount of load we'd add */
3630 if (sds->max_load * sds->busiest->__cpu_power <
3631 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3632 tmp = sg_div_cpu_power(sds->this,
3633 sds->max_load * sds->busiest->__cpu_power);
3634 else
3635 tmp = sg_div_cpu_power(sds->this,
3636 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3637 pwr_move += sds->this->__cpu_power *
3638 min(sds->this_load_per_task, sds->this_load + tmp);
3639 pwr_move /= SCHED_LOAD_SCALE;
3640
3641 /* Move if we gain throughput */
3642 if (pwr_move > pwr_now)
3643 *imbalance = sds->busiest_load_per_task;
3644}
3645
3646/**
3647 * calculate_imbalance - Calculate the amount of imbalance present within the
3648 * groups of a given sched_domain during load balance.
3649 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3650 * @this_cpu: Cpu for which currently load balance is being performed.
3651 * @imbalance: The variable to store the imbalance.
3652 */
3653static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3654 unsigned long *imbalance)
3655{
3656 unsigned long max_pull;
3312 /* 3657 /*
3313 * In the presence of smp nice balancing, certain scenarios can have 3658 * In the presence of smp nice balancing, certain scenarios can have
3314 * max load less than avg load(as we skip the groups at or below 3659 * max load less than avg load(as we skip the groups at or below
3315 * its cpu_power, while calculating max_load..) 3660 * its cpu_power, while calculating max_load..)
3316 */ 3661 */
3317 if (max_load < avg_load) { 3662 if (sds->max_load < sds->avg_load) {
3318 *imbalance = 0; 3663 *imbalance = 0;
3319 goto small_imbalance; 3664 return fix_small_imbalance(sds, this_cpu, imbalance);
3320 } 3665 }
3321 3666
3322 /* Don't want to pull so many tasks that a group would go idle */ 3667 /* Don't want to pull so many tasks that a group would go idle */
3323 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); 3668 max_pull = min(sds->max_load - sds->avg_load,
3669 sds->max_load - sds->busiest_load_per_task);
3324 3670
3325 /* How much load to actually move to equalise the imbalance */ 3671 /* How much load to actually move to equalise the imbalance */
3326 *imbalance = min(max_pull * busiest->__cpu_power, 3672 *imbalance = min(max_pull * sds->busiest->__cpu_power,
3327 (avg_load - this_load) * this->__cpu_power) 3673 (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
3328 / SCHED_LOAD_SCALE; 3674 / SCHED_LOAD_SCALE;
3329 3675
3330 /* 3676 /*
@@ -3333,78 +3679,110 @@ group_next:
3333 * a think about bumping its value to force at least one task to be 3679 * a think about bumping its value to force at least one task to be
3334 * moved 3680 * moved
3335 */ 3681 */
3336 if (*imbalance < busiest_load_per_task) { 3682 if (*imbalance < sds->busiest_load_per_task)
3337 unsigned long tmp, pwr_now, pwr_move; 3683 return fix_small_imbalance(sds, this_cpu, imbalance);
3338 unsigned int imbn;
3339
3340small_imbalance:
3341 pwr_move = pwr_now = 0;
3342 imbn = 2;
3343 if (this_nr_running) {
3344 this_load_per_task /= this_nr_running;
3345 if (busiest_load_per_task > this_load_per_task)
3346 imbn = 1;
3347 } else
3348 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3349 3684
3350 if (max_load - this_load + busiest_load_per_task >= 3685}
3351 busiest_load_per_task * imbn) { 3686/******* find_busiest_group() helpers end here *********************/
3352 *imbalance = busiest_load_per_task;
3353 return busiest;
3354 }
3355 3687
3356 /* 3688/**
3357 * OK, we don't have enough imbalance to justify moving tasks, 3689 * find_busiest_group - Returns the busiest group within the sched_domain
3358 * however we may be able to increase total CPU power used by 3690 * if there is an imbalance. If there isn't an imbalance, and
3359 * moving them. 3691 * the user has opted for power-savings, it returns a group whose
3360 */ 3692 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3693 * such a group exists.
3694 *
3695 * Also calculates the amount of weighted load which should be moved
3696 * to restore balance.
3697 *
3698 * @sd: The sched_domain whose busiest group is to be returned.
3699 * @this_cpu: The cpu for which load balancing is currently being performed.
3700 * @imbalance: Variable which stores amount of weighted load which should
3701 * be moved to restore balance/put a group to idle.
3702 * @idle: The idle status of this_cpu.
3703 * @sd_idle: The idleness of sd
3704 * @cpus: The set of CPUs under consideration for load-balancing.
3705 * @balance: Pointer to a variable indicating if this_cpu
3706 * is the appropriate cpu to perform load balancing at this_level.
3707 *
3708 * Returns: - the busiest group if imbalance exists.
3709 * - If no imbalance and user has opted for power-savings balance,
3710 * return the least loaded group whose CPUs can be
3711 * put to idle by rebalancing its tasks onto our group.
3712 */
3713static struct sched_group *
3714find_busiest_group(struct sched_domain *sd, int this_cpu,
3715 unsigned long *imbalance, enum cpu_idle_type idle,
3716 int *sd_idle, const struct cpumask *cpus, int *balance)
3717{
3718 struct sd_lb_stats sds;
3361 3719
3362 pwr_now += busiest->__cpu_power * 3720 memset(&sds, 0, sizeof(sds));
3363 min(busiest_load_per_task, max_load);
3364 pwr_now += this->__cpu_power *
3365 min(this_load_per_task, this_load);
3366 pwr_now /= SCHED_LOAD_SCALE;
3367
3368 /* Amount of load we'd subtract */
3369 tmp = sg_div_cpu_power(busiest,
3370 busiest_load_per_task * SCHED_LOAD_SCALE);
3371 if (max_load > tmp)
3372 pwr_move += busiest->__cpu_power *
3373 min(busiest_load_per_task, max_load - tmp);
3374
3375 /* Amount of load we'd add */
3376 if (max_load * busiest->__cpu_power <
3377 busiest_load_per_task * SCHED_LOAD_SCALE)
3378 tmp = sg_div_cpu_power(this,
3379 max_load * busiest->__cpu_power);
3380 else
3381 tmp = sg_div_cpu_power(this,
3382 busiest_load_per_task * SCHED_LOAD_SCALE);
3383 pwr_move += this->__cpu_power *
3384 min(this_load_per_task, this_load + tmp);
3385 pwr_move /= SCHED_LOAD_SCALE;
3386 3721
3387 /* Move if we gain throughput */ 3722 /*
3388 if (pwr_move > pwr_now) 3723 * Compute the various statistics relevant for load balancing at
3389 *imbalance = busiest_load_per_task; 3724 * this level.
3390 } 3725 */
3726 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
3727 balance, &sds);
3728
3729 /* Cases where imbalance does not exist from POV of this_cpu */
3730 /* 1) this_cpu is not the appropriate cpu to perform load balancing
3731 * at this level.
3732 * 2) There is no busy sibling group to pull from.
3733 * 3) This group is the busiest group.
3734 * 4) This group is more busy than the avg busyness at this
3735 * sched_domain.
3736 * 5) The imbalance is within the specified limit.
3737 * 6) Any rebalance would lead to ping-pong
3738 */
3739 if (balance && !(*balance))
3740 goto ret;
3391 3741
3392 return busiest; 3742 if (!sds.busiest || sds.busiest_nr_running == 0)
3743 goto out_balanced;
3393 3744
3394out_balanced: 3745 if (sds.this_load >= sds.max_load)
3395#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3746 goto out_balanced;
3396 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3397 goto ret;
3398 3747
3399 if (this == group_leader && group_leader != group_min) { 3748 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3400 *imbalance = min_load_per_task; 3749
3401 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { 3750 if (sds.this_load >= sds.avg_load)
3402 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = 3751 goto out_balanced;
3403 cpumask_first(sched_group_cpus(group_leader)); 3752
3404 } 3753 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3405 return group_min; 3754 goto out_balanced;
3406 } 3755
3407#endif 3756 sds.busiest_load_per_task /= sds.busiest_nr_running;
3757 if (sds.group_imb)
3758 sds.busiest_load_per_task =
3759 min(sds.busiest_load_per_task, sds.avg_load);
3760
3761 /*
3762 * We're trying to get all the cpus to the average_load, so we don't
3763 * want to push ourselves above the average load, nor do we wish to
3764 * reduce the max loaded cpu below the average load, as either of these
3765 * actions would just result in more rebalancing later, and ping-pong
3766 * tasks around. Thus we look for the minimum possible imbalance.
3767 * Negative imbalances (*we* are more loaded than anyone else) will
3768 * be counted as no imbalance for these purposes -- we can't fix that
3769 * by pulling tasks to us. Be careful of negative numbers as they'll
3770 * appear as very large values with unsigned longs.
3771 */
3772 if (sds.max_load <= sds.busiest_load_per_task)
3773 goto out_balanced;
3774
3775 /* Looks like there is an imbalance. Compute it */
3776 calculate_imbalance(&sds, this_cpu, imbalance);
3777 return sds.busiest;
3778
3779out_balanced:
3780 /*
3781 * There is no obvious imbalance. But check if we can do some balancing
3782 * to save power.
3783 */
3784 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
3785 return sds.busiest;
3408ret: 3786ret:
3409 *imbalance = 0; 3787 *imbalance = 0;
3410 return NULL; 3788 return NULL;
@@ -3448,19 +3826,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3448 */ 3826 */
3449#define MAX_PINNED_INTERVAL 512 3827#define MAX_PINNED_INTERVAL 512
3450 3828
3829/* Working cpumask for load_balance and load_balance_newidle. */
3830static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3831
3451/* 3832/*
3452 * Check this_cpu to ensure it is balanced within domain. Attempt to move 3833 * Check this_cpu to ensure it is balanced within domain. Attempt to move
3453 * tasks if there is an imbalance. 3834 * tasks if there is an imbalance.
3454 */ 3835 */
3455static int load_balance(int this_cpu, struct rq *this_rq, 3836static int load_balance(int this_cpu, struct rq *this_rq,
3456 struct sched_domain *sd, enum cpu_idle_type idle, 3837 struct sched_domain *sd, enum cpu_idle_type idle,
3457 int *balance, struct cpumask *cpus) 3838 int *balance)
3458{ 3839{
3459 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3840 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3460 struct sched_group *group; 3841 struct sched_group *group;
3461 unsigned long imbalance; 3842 unsigned long imbalance;
3462 struct rq *busiest; 3843 struct rq *busiest;
3463 unsigned long flags; 3844 unsigned long flags;
3845 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3464 3846
3465 cpumask_setall(cpus); 3847 cpumask_setall(cpus);
3466 3848
@@ -3615,8 +3997,7 @@ out:
3615 * this_rq is locked. 3997 * this_rq is locked.
3616 */ 3998 */
3617static int 3999static int
3618load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, 4000load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
3619 struct cpumask *cpus)
3620{ 4001{
3621 struct sched_group *group; 4002 struct sched_group *group;
3622 struct rq *busiest = NULL; 4003 struct rq *busiest = NULL;
@@ -3624,6 +4005,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3624 int ld_moved = 0; 4005 int ld_moved = 0;
3625 int sd_idle = 0; 4006 int sd_idle = 0;
3626 int all_pinned = 0; 4007 int all_pinned = 0;
4008 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3627 4009
3628 cpumask_setall(cpus); 4010 cpumask_setall(cpus);
3629 4011
@@ -3764,10 +4146,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3764 struct sched_domain *sd; 4146 struct sched_domain *sd;
3765 int pulled_task = 0; 4147 int pulled_task = 0;
3766 unsigned long next_balance = jiffies + HZ; 4148 unsigned long next_balance = jiffies + HZ;
3767 cpumask_var_t tmpmask;
3768
3769 if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
3770 return;
3771 4149
3772 for_each_domain(this_cpu, sd) { 4150 for_each_domain(this_cpu, sd) {
3773 unsigned long interval; 4151 unsigned long interval;
@@ -3778,7 +4156,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3778 if (sd->flags & SD_BALANCE_NEWIDLE) 4156 if (sd->flags & SD_BALANCE_NEWIDLE)
3779 /* If we've pulled tasks over stop searching: */ 4157 /* If we've pulled tasks over stop searching: */
3780 pulled_task = load_balance_newidle(this_cpu, this_rq, 4158 pulled_task = load_balance_newidle(this_cpu, this_rq,
3781 sd, tmpmask); 4159 sd);
3782 4160
3783 interval = msecs_to_jiffies(sd->balance_interval); 4161 interval = msecs_to_jiffies(sd->balance_interval);
3784 if (time_after(next_balance, sd->last_balance + interval)) 4162 if (time_after(next_balance, sd->last_balance + interval))
@@ -3793,7 +4171,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3793 */ 4171 */
3794 this_rq->next_balance = next_balance; 4172 this_rq->next_balance = next_balance;
3795 } 4173 }
3796 free_cpumask_var(tmpmask);
3797} 4174}
3798 4175
3799/* 4176/*
@@ -3880,19 +4257,24 @@ int select_nohz_load_balancer(int stop_tick)
3880 int cpu = smp_processor_id(); 4257 int cpu = smp_processor_id();
3881 4258
3882 if (stop_tick) { 4259 if (stop_tick) {
3883 cpumask_set_cpu(cpu, nohz.cpu_mask);
3884 cpu_rq(cpu)->in_nohz_recently = 1; 4260 cpu_rq(cpu)->in_nohz_recently = 1;
3885 4261
3886 /* 4262 if (!cpu_active(cpu)) {
3887 * If we are going offline and still the leader, give up! 4263 if (atomic_read(&nohz.load_balancer) != cpu)
3888 */ 4264 return 0;
3889 if (!cpu_active(cpu) && 4265
3890 atomic_read(&nohz.load_balancer) == cpu) { 4266 /*
4267 * If we are going offline and still the leader,
4268 * give up!
4269 */
3891 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 4270 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3892 BUG(); 4271 BUG();
4272
3893 return 0; 4273 return 0;
3894 } 4274 }
3895 4275
4276 cpumask_set_cpu(cpu, nohz.cpu_mask);
4277
3896 /* time for ilb owner also to sleep */ 4278 /* time for ilb owner also to sleep */
3897 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4279 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3898 if (atomic_read(&nohz.load_balancer) == cpu) 4280 if (atomic_read(&nohz.load_balancer) == cpu)
@@ -3938,11 +4320,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3938 unsigned long next_balance = jiffies + 60*HZ; 4320 unsigned long next_balance = jiffies + 60*HZ;
3939 int update_next_balance = 0; 4321 int update_next_balance = 0;
3940 int need_serialize; 4322 int need_serialize;
3941 cpumask_var_t tmp;
3942
3943 /* Fails alloc? Rebalancing probably not a priority right now. */
3944 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
3945 return;
3946 4323
3947 for_each_domain(cpu, sd) { 4324 for_each_domain(cpu, sd) {
3948 if (!(sd->flags & SD_LOAD_BALANCE)) 4325 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3967,7 +4344,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3967 } 4344 }
3968 4345
3969 if (time_after_eq(jiffies, sd->last_balance + interval)) { 4346 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3970 if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { 4347 if (load_balance(cpu, rq, sd, idle, &balance)) {
3971 /* 4348 /*
3972 * We've pulled tasks over so either we're no 4349 * We've pulled tasks over so either we're no
3973 * longer idle, or one of our SMT siblings is 4350 * longer idle, or one of our SMT siblings is
@@ -4001,8 +4378,6 @@ out:
4001 */ 4378 */
4002 if (likely(update_next_balance)) 4379 if (likely(update_next_balance))
4003 rq->next_balance = next_balance; 4380 rq->next_balance = next_balance;
4004
4005 free_cpumask_var(tmp);
4006} 4381}
4007 4382
4008/* 4383/*
@@ -4052,6 +4427,11 @@ static void run_rebalance_domains(struct softirq_action *h)
4052#endif 4427#endif
4053} 4428}
4054 4429
4430static inline int on_null_domain(int cpu)
4431{
4432 return !rcu_dereference(cpu_rq(cpu)->sd);
4433}
4434
4055/* 4435/*
4056 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 4436 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4057 * 4437 *
@@ -4109,7 +4489,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4109 cpumask_test_cpu(cpu, nohz.cpu_mask)) 4489 cpumask_test_cpu(cpu, nohz.cpu_mask))
4110 return; 4490 return;
4111#endif 4491#endif
4112 if (time_after_eq(jiffies, rq->next_balance)) 4492 /* Don't need to rebalance while attached to NULL domain */
4493 if (time_after_eq(jiffies, rq->next_balance) &&
4494 likely(!on_null_domain(cpu)))
4113 raise_softirq(SCHED_SOFTIRQ); 4495 raise_softirq(SCHED_SOFTIRQ);
4114} 4496}
4115 4497
@@ -4399,10 +4781,7 @@ void scheduler_tick(void)
4399#endif 4781#endif
4400} 4782}
4401 4783
4402#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 4784unsigned long get_parent_ip(unsigned long addr)
4403 defined(CONFIG_PREEMPT_TRACER))
4404
4405static inline unsigned long get_parent_ip(unsigned long addr)
4406{ 4785{
4407 if (in_lock_functions(addr)) { 4786 if (in_lock_functions(addr)) {
4408 addr = CALLER_ADDR2; 4787 addr = CALLER_ADDR2;
@@ -4412,6 +4791,9 @@ static inline unsigned long get_parent_ip(unsigned long addr)
4412 return addr; 4791 return addr;
4413} 4792}
4414 4793
4794#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4795 defined(CONFIG_PREEMPT_TRACER))
4796
4415void __kprobes add_preempt_count(int val) 4797void __kprobes add_preempt_count(int val)
4416{ 4798{
4417#ifdef CONFIG_DEBUG_PREEMPT 4799#ifdef CONFIG_DEBUG_PREEMPT
@@ -4440,7 +4822,7 @@ void __kprobes sub_preempt_count(int val)
4440 /* 4822 /*
4441 * Underflow? 4823 * Underflow?
4442 */ 4824 */
4443 if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked()))) 4825 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4444 return; 4826 return;
4445 /* 4827 /*
4446 * Is the spinlock portion underflowing? 4828 * Is the spinlock portion underflowing?
@@ -4503,11 +4885,33 @@ static inline void schedule_debug(struct task_struct *prev)
4503#endif 4885#endif
4504} 4886}
4505 4887
4888static void put_prev_task(struct rq *rq, struct task_struct *prev)
4889{
4890 if (prev->state == TASK_RUNNING) {
4891 u64 runtime = prev->se.sum_exec_runtime;
4892
4893 runtime -= prev->se.prev_sum_exec_runtime;
4894 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
4895
4896 /*
4897 * In order to avoid avg_overlap growing stale when we are
4898 * indeed overlapping and hence not getting put to sleep, grow
4899 * the avg_overlap on preemption.
4900 *
4901 * We use the average preemption runtime because that
4902 * correlates to the amount of cache footprint a task can
4903 * build up.
4904 */
4905 update_avg(&prev->se.avg_overlap, runtime);
4906 }
4907 prev->sched_class->put_prev_task(rq, prev);
4908}
4909
4506/* 4910/*
4507 * Pick up the highest-prio task: 4911 * Pick up the highest-prio task:
4508 */ 4912 */
4509static inline struct task_struct * 4913static inline struct task_struct *
4510pick_next_task(struct rq *rq, struct task_struct *prev) 4914pick_next_task(struct rq *rq)
4511{ 4915{
4512 const struct sched_class *class; 4916 const struct sched_class *class;
4513 struct task_struct *p; 4917 struct task_struct *p;
@@ -4538,15 +4942,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
4538/* 4942/*
4539 * schedule() is the main scheduler function. 4943 * schedule() is the main scheduler function.
4540 */ 4944 */
4541asmlinkage void __sched schedule(void) 4945asmlinkage void __sched __schedule(void)
4542{ 4946{
4543 struct task_struct *prev, *next; 4947 struct task_struct *prev, *next;
4544 unsigned long *switch_count; 4948 unsigned long *switch_count;
4545 struct rq *rq; 4949 struct rq *rq;
4546 int cpu; 4950 int cpu;
4547 4951
4548need_resched:
4549 preempt_disable();
4550 cpu = smp_processor_id(); 4952 cpu = smp_processor_id();
4551 rq = cpu_rq(cpu); 4953 rq = cpu_rq(cpu);
4552 rcu_qsctr_inc(cpu); 4954 rcu_qsctr_inc(cpu);
@@ -4581,8 +4983,8 @@ need_resched_nonpreemptible:
4581 if (unlikely(!rq->nr_running)) 4983 if (unlikely(!rq->nr_running))
4582 idle_balance(cpu, rq); 4984 idle_balance(cpu, rq);
4583 4985
4584 prev->sched_class->put_prev_task(rq, prev); 4986 put_prev_task(rq, prev);
4585 next = pick_next_task(rq, prev); 4987 next = pick_next_task(rq);
4586 4988
4587 if (likely(prev != next)) { 4989 if (likely(prev != next)) {
4588 sched_info_switch(prev, next); 4990 sched_info_switch(prev, next);
@@ -4603,13 +5005,80 @@ need_resched_nonpreemptible:
4603 5005
4604 if (unlikely(reacquire_kernel_lock(current) < 0)) 5006 if (unlikely(reacquire_kernel_lock(current) < 0))
4605 goto need_resched_nonpreemptible; 5007 goto need_resched_nonpreemptible;
5008}
4606 5009
5010asmlinkage void __sched schedule(void)
5011{
5012need_resched:
5013 preempt_disable();
5014 __schedule();
4607 preempt_enable_no_resched(); 5015 preempt_enable_no_resched();
4608 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 5016 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
4609 goto need_resched; 5017 goto need_resched;
4610} 5018}
4611EXPORT_SYMBOL(schedule); 5019EXPORT_SYMBOL(schedule);
4612 5020
5021#ifdef CONFIG_SMP
5022/*
5023 * Look out! "owner" is an entirely speculative pointer
5024 * access and not reliable.
5025 */
5026int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5027{
5028 unsigned int cpu;
5029 struct rq *rq;
5030
5031 if (!sched_feat(OWNER_SPIN))
5032 return 0;
5033
5034#ifdef CONFIG_DEBUG_PAGEALLOC
5035 /*
5036 * Need to access the cpu field knowing that
5037 * DEBUG_PAGEALLOC could have unmapped it if
5038 * the mutex owner just released it and exited.
5039 */
5040 if (probe_kernel_address(&owner->cpu, cpu))
5041 goto out;
5042#else
5043 cpu = owner->cpu;
5044#endif
5045
5046 /*
5047 * Even if the access succeeded (likely case),
5048 * the cpu field may no longer be valid.
5049 */
5050 if (cpu >= nr_cpumask_bits)
5051 goto out;
5052
5053 /*
5054 * We need to validate that we can do a
5055 * get_cpu() and that we have the percpu area.
5056 */
5057 if (!cpu_online(cpu))
5058 goto out;
5059
5060 rq = cpu_rq(cpu);
5061
5062 for (;;) {
5063 /*
5064 * Owner changed, break to re-assess state.
5065 */
5066 if (lock->owner != owner)
5067 break;
5068
5069 /*
5070 * Is that owner really running on that cpu?
5071 */
5072 if (task_thread_info(rq->curr) != owner || need_resched())
5073 return 0;
5074
5075 cpu_relax();
5076 }
5077out:
5078 return 1;
5079}
5080#endif
5081
4613#ifdef CONFIG_PREEMPT 5082#ifdef CONFIG_PREEMPT
4614/* 5083/*
4615 * this is the entry point to schedule() from in-kernel preemption 5084 * this is the entry point to schedule() from in-kernel preemption
@@ -4637,7 +5106,7 @@ asmlinkage void __sched preempt_schedule(void)
4637 * between schedule and now. 5106 * between schedule and now.
4638 */ 5107 */
4639 barrier(); 5108 barrier();
4640 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 5109 } while (need_resched());
4641} 5110}
4642EXPORT_SYMBOL(preempt_schedule); 5111EXPORT_SYMBOL(preempt_schedule);
4643 5112
@@ -4666,7 +5135,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
4666 * between schedule and now. 5135 * between schedule and now.
4667 */ 5136 */
4668 barrier(); 5137 barrier();
4669 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 5138 } while (need_resched());
4670} 5139}
4671 5140
4672#endif /* CONFIG_PREEMPT */ 5141#endif /* CONFIG_PREEMPT */
@@ -4687,8 +5156,8 @@ EXPORT_SYMBOL(default_wake_function);
4687 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 5156 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
4688 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5157 * zero in this (rare) case, and we handle it by continuing to scan the queue.
4689 */ 5158 */
4690static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5159void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4691 int nr_exclusive, int sync, void *key) 5160 int nr_exclusive, int sync, void *key)
4692{ 5161{
4693 wait_queue_t *curr, *next; 5162 wait_queue_t *curr, *next;
4694 5163
@@ -4727,11 +5196,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4727 __wake_up_common(q, mode, 1, 0, NULL); 5196 __wake_up_common(q, mode, 1, 0, NULL);
4728} 5197}
4729 5198
5199void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5200{
5201 __wake_up_common(q, mode, 1, 0, key);
5202}
5203
4730/** 5204/**
4731 * __wake_up_sync - wake up threads blocked on a waitqueue. 5205 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
4732 * @q: the waitqueue 5206 * @q: the waitqueue
4733 * @mode: which threads 5207 * @mode: which threads
4734 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5208 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5209 * @key: opaque value to be passed to wakeup targets
4735 * 5210 *
4736 * The sync wakeup differs in that the waker knows that it will schedule 5211 * The sync wakeup differs in that the waker knows that it will schedule
4737 * away soon, so while the target thread will be woken up, it will not 5212 * away soon, so while the target thread will be woken up, it will not
@@ -4740,8 +5215,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4740 * 5215 *
4741 * On UP it can prevent extra preemption. 5216 * On UP it can prevent extra preemption.
4742 */ 5217 */
4743void 5218void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
4744__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 5219 int nr_exclusive, void *key)
4745{ 5220{
4746 unsigned long flags; 5221 unsigned long flags;
4747 int sync = 1; 5222 int sync = 1;
@@ -4753,9 +5228,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4753 sync = 0; 5228 sync = 0;
4754 5229
4755 spin_lock_irqsave(&q->lock, flags); 5230 spin_lock_irqsave(&q->lock, flags);
4756 __wake_up_common(q, mode, nr_exclusive, sync, NULL); 5231 __wake_up_common(q, mode, nr_exclusive, sync, key);
4757 spin_unlock_irqrestore(&q->lock, flags); 5232 spin_unlock_irqrestore(&q->lock, flags);
4758} 5233}
5234EXPORT_SYMBOL_GPL(__wake_up_sync_key);
5235
5236/*
5237 * __wake_up_sync - see __wake_up_sync_key()
5238 */
5239void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
5240{
5241 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
5242}
4759EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 5243EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4760 5244
4761/** 5245/**
@@ -5126,7 +5610,7 @@ int can_nice(const struct task_struct *p, const int nice)
5126 * sys_setpriority is a more generic, but much slower function that 5610 * sys_setpriority is a more generic, but much slower function that
5127 * does similar things. 5611 * does similar things.
5128 */ 5612 */
5129asmlinkage long sys_nice(int increment) 5613SYSCALL_DEFINE1(nice, int, increment)
5130{ 5614{
5131 long nice, retval; 5615 long nice, retval;
5132 5616
@@ -5140,7 +5624,7 @@ asmlinkage long sys_nice(int increment)
5140 if (increment > 40) 5624 if (increment > 40)
5141 increment = 40; 5625 increment = 40;
5142 5626
5143 nice = PRIO_TO_NICE(current->static_prio) + increment; 5627 nice = TASK_NICE(current) + increment;
5144 if (nice < -20) 5628 if (nice < -20)
5145 nice = -20; 5629 nice = -20;
5146 if (nice > 19) 5630 if (nice > 19)
@@ -5433,8 +5917,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5433 * @policy: new policy. 5917 * @policy: new policy.
5434 * @param: structure containing the new RT priority. 5918 * @param: structure containing the new RT priority.
5435 */ 5919 */
5436asmlinkage long 5920SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
5437sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 5921 struct sched_param __user *, param)
5438{ 5922{
5439 /* negative values for policy are not valid */ 5923 /* negative values for policy are not valid */
5440 if (policy < 0) 5924 if (policy < 0)
@@ -5448,7 +5932,7 @@ sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5448 * @pid: the pid in question. 5932 * @pid: the pid in question.
5449 * @param: structure containing the new RT priority. 5933 * @param: structure containing the new RT priority.
5450 */ 5934 */
5451asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) 5935SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
5452{ 5936{
5453 return do_sched_setscheduler(pid, -1, param); 5937 return do_sched_setscheduler(pid, -1, param);
5454} 5938}
@@ -5457,7 +5941,7 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
5457 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 5941 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
5458 * @pid: the pid in question. 5942 * @pid: the pid in question.
5459 */ 5943 */
5460asmlinkage long sys_sched_getscheduler(pid_t pid) 5944SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5461{ 5945{
5462 struct task_struct *p; 5946 struct task_struct *p;
5463 int retval; 5947 int retval;
@@ -5482,7 +5966,7 @@ asmlinkage long sys_sched_getscheduler(pid_t pid)
5482 * @pid: the pid in question. 5966 * @pid: the pid in question.
5483 * @param: structure containing the RT priority. 5967 * @param: structure containing the RT priority.
5484 */ 5968 */
5485asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) 5969SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5486{ 5970{
5487 struct sched_param lp; 5971 struct sched_param lp;
5488 struct task_struct *p; 5972 struct task_struct *p;
@@ -5600,8 +6084,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5600 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 6084 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5601 * @user_mask_ptr: user-space pointer to the new cpu mask 6085 * @user_mask_ptr: user-space pointer to the new cpu mask
5602 */ 6086 */
5603asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 6087SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5604 unsigned long __user *user_mask_ptr) 6088 unsigned long __user *, user_mask_ptr)
5605{ 6089{
5606 cpumask_var_t new_mask; 6090 cpumask_var_t new_mask;
5607 int retval; 6091 int retval;
@@ -5648,8 +6132,8 @@ out_unlock:
5648 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 6132 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5649 * @user_mask_ptr: user-space pointer to hold the current cpu mask 6133 * @user_mask_ptr: user-space pointer to hold the current cpu mask
5650 */ 6134 */
5651asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, 6135SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5652 unsigned long __user *user_mask_ptr) 6136 unsigned long __user *, user_mask_ptr)
5653{ 6137{
5654 int ret; 6138 int ret;
5655 cpumask_var_t mask; 6139 cpumask_var_t mask;
@@ -5678,7 +6162,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
5678 * This function yields the current CPU to other tasks. If there are no 6162 * This function yields the current CPU to other tasks. If there are no
5679 * other threads running on this CPU then this function will return. 6163 * other threads running on this CPU then this function will return.
5680 */ 6164 */
5681asmlinkage long sys_sched_yield(void) 6165SYSCALL_DEFINE0(sched_yield)
5682{ 6166{
5683 struct rq *rq = this_rq_lock(); 6167 struct rq *rq = this_rq_lock();
5684 6168
@@ -5819,7 +6303,7 @@ long __sched io_schedule_timeout(long timeout)
5819 * this syscall returns the maximum rt_priority that can be used 6303 * this syscall returns the maximum rt_priority that can be used
5820 * by a given scheduling class. 6304 * by a given scheduling class.
5821 */ 6305 */
5822asmlinkage long sys_sched_get_priority_max(int policy) 6306SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5823{ 6307{
5824 int ret = -EINVAL; 6308 int ret = -EINVAL;
5825 6309
@@ -5844,7 +6328,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
5844 * this syscall returns the minimum rt_priority that can be used 6328 * this syscall returns the minimum rt_priority that can be used
5845 * by a given scheduling class. 6329 * by a given scheduling class.
5846 */ 6330 */
5847asmlinkage long sys_sched_get_priority_min(int policy) 6331SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5848{ 6332{
5849 int ret = -EINVAL; 6333 int ret = -EINVAL;
5850 6334
@@ -5869,8 +6353,8 @@ asmlinkage long sys_sched_get_priority_min(int policy)
5869 * this syscall writes the default timeslice value of a given process 6353 * this syscall writes the default timeslice value of a given process
5870 * into the user-space timespec buffer. A value of '0' means infinity. 6354 * into the user-space timespec buffer. A value of '0' means infinity.
5871 */ 6355 */
5872asmlinkage 6356SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5873long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 6357 struct timespec __user *, interval)
5874{ 6358{
5875 struct task_struct *p; 6359 struct task_struct *p;
5876 unsigned int time_slice; 6360 unsigned int time_slice;
@@ -5939,12 +6423,7 @@ void sched_show_task(struct task_struct *p)
5939 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 6423 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5940#endif 6424#endif
5941#ifdef CONFIG_DEBUG_STACK_USAGE 6425#ifdef CONFIG_DEBUG_STACK_USAGE
5942 { 6426 free = stack_not_used(p);
5943 unsigned long *n = end_of_stack(p);
5944 while (!*n)
5945 n++;
5946 free = (unsigned long)n - (unsigned long)end_of_stack(p);
5947 }
5948#endif 6427#endif
5949 printk(KERN_CONT "%5lu %5d %6d\n", free, 6428 printk(KERN_CONT "%5lu %5d %6d\n", free,
5950 task_pid_nr(p), task_pid_nr(p->real_parent)); 6429 task_pid_nr(p), task_pid_nr(p->real_parent));
@@ -6418,7 +6897,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
6418 if (!rq->nr_running) 6897 if (!rq->nr_running)
6419 break; 6898 break;
6420 update_rq_clock(rq); 6899 update_rq_clock(rq);
6421 next = pick_next_task(rq, rq->curr); 6900 next = pick_next_task(rq);
6422 if (!next) 6901 if (!next)
6423 break; 6902 break;
6424 next->sched_class->put_prev_task(rq, next); 6903 next->sched_class->put_prev_task(rq, next);
@@ -6939,20 +7418,26 @@ static void free_rootdomain(struct root_domain *rd)
6939 7418
6940static void rq_attach_root(struct rq *rq, struct root_domain *rd) 7419static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6941{ 7420{
7421 struct root_domain *old_rd = NULL;
6942 unsigned long flags; 7422 unsigned long flags;
6943 7423
6944 spin_lock_irqsave(&rq->lock, flags); 7424 spin_lock_irqsave(&rq->lock, flags);
6945 7425
6946 if (rq->rd) { 7426 if (rq->rd) {
6947 struct root_domain *old_rd = rq->rd; 7427 old_rd = rq->rd;
6948 7428
6949 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 7429 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6950 set_rq_offline(rq); 7430 set_rq_offline(rq);
6951 7431
6952 cpumask_clear_cpu(rq->cpu, old_rd->span); 7432 cpumask_clear_cpu(rq->cpu, old_rd->span);
6953 7433
6954 if (atomic_dec_and_test(&old_rd->refcount)) 7434 /*
6955 free_rootdomain(old_rd); 7435 * If we don't want to free the old_rd yet then
7436 * set old_rd to NULL to skip the freeing later
7437 * in this function:
7438 */
7439 if (!atomic_dec_and_test(&old_rd->refcount))
7440 old_rd = NULL;
6956 } 7441 }
6957 7442
6958 atomic_inc(&rd->refcount); 7443 atomic_inc(&rd->refcount);
@@ -6963,6 +7448,9 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6963 set_rq_online(rq); 7448 set_rq_online(rq);
6964 7449
6965 spin_unlock_irqrestore(&rq->lock, flags); 7450 spin_unlock_irqrestore(&rq->lock, flags);
7451
7452 if (old_rd)
7453 free_rootdomain(old_rd);
6966} 7454}
6967 7455
6968static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) 7456static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
@@ -7240,7 +7728,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7240{ 7728{
7241 int group; 7729 int group;
7242 7730
7243 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); 7731 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7244 group = cpumask_first(mask); 7732 group = cpumask_first(mask);
7245 if (sg) 7733 if (sg)
7246 *sg = &per_cpu(sched_group_core, group).sg; 7734 *sg = &per_cpu(sched_group_core, group).sg;
@@ -7269,7 +7757,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
7269 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 7757 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
7270 group = cpumask_first(mask); 7758 group = cpumask_first(mask);
7271#elif defined(CONFIG_SCHED_SMT) 7759#elif defined(CONFIG_SCHED_SMT)
7272 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); 7760 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7273 group = cpumask_first(mask); 7761 group = cpumask_first(mask);
7274#else 7762#else
7275 group = cpu; 7763 group = cpu;
@@ -7612,7 +8100,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7612 SD_INIT(sd, SIBLING); 8100 SD_INIT(sd, SIBLING);
7613 set_domain_attribute(sd, attr); 8101 set_domain_attribute(sd, attr);
7614 cpumask_and(sched_domain_span(sd), 8102 cpumask_and(sched_domain_span(sd),
7615 &per_cpu(cpu_sibling_map, i), cpu_map); 8103 topology_thread_cpumask(i), cpu_map);
7616 sd->parent = p; 8104 sd->parent = p;
7617 p->child = sd; 8105 p->child = sd;
7618 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); 8106 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7623,7 +8111,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7623 /* Set up CPU (sibling) groups */ 8111 /* Set up CPU (sibling) groups */
7624 for_each_cpu(i, cpu_map) { 8112 for_each_cpu(i, cpu_map) {
7625 cpumask_and(this_sibling_map, 8113 cpumask_and(this_sibling_map,
7626 &per_cpu(cpu_sibling_map, i), cpu_map); 8114 topology_thread_cpumask(i), cpu_map);
7627 if (i != cpumask_first(this_sibling_map)) 8115 if (i != cpumask_first(this_sibling_map))
7628 continue; 8116 continue;
7629 8117
@@ -8204,11 +8692,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8204 __set_bit(MAX_RT_PRIO, array->bitmap); 8692 __set_bit(MAX_RT_PRIO, array->bitmap);
8205 8693
8206#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 8694#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
8207 rt_rq->highest_prio = MAX_RT_PRIO; 8695 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8696#ifdef CONFIG_SMP
8697 rt_rq->highest_prio.next = MAX_RT_PRIO;
8698#endif
8208#endif 8699#endif
8209#ifdef CONFIG_SMP 8700#ifdef CONFIG_SMP
8210 rt_rq->rt_nr_migratory = 0; 8701 rt_rq->rt_nr_migratory = 0;
8211 rt_rq->overloaded = 0; 8702 rt_rq->overloaded = 0;
8703 plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
8212#endif 8704#endif
8213 8705
8214 rt_rq->rt_time = 0; 8706 rt_rq->rt_time = 0;
@@ -8295,6 +8787,9 @@ void __init sched_init(void)
8295#ifdef CONFIG_USER_SCHED 8787#ifdef CONFIG_USER_SCHED
8296 alloc_size *= 2; 8788 alloc_size *= 2;
8297#endif 8789#endif
8790#ifdef CONFIG_CPUMASK_OFFSTACK
8791 alloc_size += num_possible_cpus() * cpumask_size();
8792#endif
8298 /* 8793 /*
8299 * As sched_init() is called before page_alloc is setup, 8794 * As sched_init() is called before page_alloc is setup,
8300 * we use alloc_bootmem(). 8795 * we use alloc_bootmem().
@@ -8332,6 +8827,12 @@ void __init sched_init(void)
8332 ptr += nr_cpu_ids * sizeof(void **); 8827 ptr += nr_cpu_ids * sizeof(void **);
8333#endif /* CONFIG_USER_SCHED */ 8828#endif /* CONFIG_USER_SCHED */
8334#endif /* CONFIG_RT_GROUP_SCHED */ 8829#endif /* CONFIG_RT_GROUP_SCHED */
8830#ifdef CONFIG_CPUMASK_OFFSTACK
8831 for_each_possible_cpu(i) {
8832 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
8833 ptr += cpumask_size();
8834 }
8835#endif /* CONFIG_CPUMASK_OFFSTACK */
8335 } 8836 }
8336 8837
8337#ifdef CONFIG_SMP 8838#ifdef CONFIG_SMP
@@ -9050,6 +9551,13 @@ static int tg_schedulable(struct task_group *tg, void *data)
9050 runtime = d->rt_runtime; 9551 runtime = d->rt_runtime;
9051 } 9552 }
9052 9553
9554#ifdef CONFIG_USER_SCHED
9555 if (tg == &root_task_group) {
9556 period = global_rt_period();
9557 runtime = global_rt_runtime();
9558 }
9559#endif
9560
9053 /* 9561 /*
9054 * Cannot have more runtime than the period. 9562 * Cannot have more runtime than the period.
9055 */ 9563 */
@@ -9203,6 +9711,16 @@ static int sched_rt_global_constraints(void)
9203 9711
9204 return ret; 9712 return ret;
9205} 9713}
9714
9715int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
9716{
9717 /* Don't accept realtime tasks when there is no way for them to run */
9718 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
9719 return 0;
9720
9721 return 1;
9722}
9723
9206#else /* !CONFIG_RT_GROUP_SCHED */ 9724#else /* !CONFIG_RT_GROUP_SCHED */
9207static int sched_rt_global_constraints(void) 9725static int sched_rt_global_constraints(void)
9208{ 9726{
@@ -9296,8 +9814,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9296 struct task_struct *tsk) 9814 struct task_struct *tsk)
9297{ 9815{
9298#ifdef CONFIG_RT_GROUP_SCHED 9816#ifdef CONFIG_RT_GROUP_SCHED
9299 /* Don't accept realtime tasks when there is no way for them to run */ 9817 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
9300 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
9301 return -EINVAL; 9818 return -EINVAL;
9302#else 9819#else
9303 /* We don't support RT-tasks being in separate groups */ 9820 /* We don't support RT-tasks being in separate groups */
@@ -9460,7 +9977,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9460 9977
9461static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 9978static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9462{ 9979{
9463 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 9980 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9464 u64 data; 9981 u64 data;
9465 9982
9466#ifndef CONFIG_64BIT 9983#ifndef CONFIG_64BIT
@@ -9479,7 +9996,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9479 9996
9480static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 9997static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9481{ 9998{
9482 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 9999 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9483 10000
9484#ifndef CONFIG_64BIT 10001#ifndef CONFIG_64BIT
9485 /* 10002 /*
@@ -9568,14 +10085,14 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9568 struct cpuacct *ca; 10085 struct cpuacct *ca;
9569 int cpu; 10086 int cpu;
9570 10087
9571 if (!cpuacct_subsys.active) 10088 if (unlikely(!cpuacct_subsys.active))
9572 return; 10089 return;
9573 10090
9574 cpu = task_cpu(tsk); 10091 cpu = task_cpu(tsk);
9575 ca = task_ca(tsk); 10092 ca = task_ca(tsk);
9576 10093
9577 for (; ca; ca = ca->parent) { 10094 for (; ca; ca = ca->parent) {
9578 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 10095 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9579 *cpuusage += cputime; 10096 *cpuusage += cputime;
9580 } 10097 }
9581} 10098}
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index a0b0852414cc..819f17ac796e 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -24,11 +24,12 @@
24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat 24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
25 * consistent between cpus (never more than 2 jiffies difference). 25 * consistent between cpus (never more than 2 jiffies difference).
26 */ 26 */
27#include <linux/sched.h>
28#include <linux/percpu.h>
29#include <linux/spinlock.h> 27#include <linux/spinlock.h>
30#include <linux/ktime.h> 28#include <linux/hardirq.h>
31#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/percpu.h>
31#include <linux/ktime.h>
32#include <linux/sched.h>
32 33
33/* 34/*
34 * Scheduler clock - returns current time in nanosec units. 35 * Scheduler clock - returns current time in nanosec units.
@@ -43,6 +44,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
43static __read_mostly int sched_clock_running; 44static __read_mostly int sched_clock_running;
44 45
45#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 46#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
47__read_mostly int sched_clock_stable;
46 48
47struct sched_clock_data { 49struct sched_clock_data {
48 /* 50 /*
@@ -87,7 +89,7 @@ void sched_clock_init(void)
87} 89}
88 90
89/* 91/*
90 * min,max except they take wrapping into account 92 * min, max except they take wrapping into account
91 */ 93 */
92 94
93static inline u64 wrap_min(u64 x, u64 y) 95static inline u64 wrap_min(u64 x, u64 y)
@@ -111,15 +113,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
111 s64 delta = now - scd->tick_raw; 113 s64 delta = now - scd->tick_raw;
112 u64 clock, min_clock, max_clock; 114 u64 clock, min_clock, max_clock;
113 115
114 WARN_ON_ONCE(!irqs_disabled());
115
116 if (unlikely(delta < 0)) 116 if (unlikely(delta < 0))
117 delta = 0; 117 delta = 0;
118 118
119 /* 119 /*
120 * scd->clock = clamp(scd->tick_gtod + delta, 120 * scd->clock = clamp(scd->tick_gtod + delta,
121 * max(scd->tick_gtod, scd->clock), 121 * max(scd->tick_gtod, scd->clock),
122 * scd->tick_gtod + TICK_NSEC); 122 * scd->tick_gtod + TICK_NSEC);
123 */ 123 */
124 124
125 clock = scd->tick_gtod + delta; 125 clock = scd->tick_gtod + delta;
@@ -148,8 +148,20 @@ static void lock_double_clock(struct sched_clock_data *data1,
148 148
149u64 sched_clock_cpu(int cpu) 149u64 sched_clock_cpu(int cpu)
150{ 150{
151 struct sched_clock_data *scd = cpu_sdc(cpu);
152 u64 now, clock, this_clock, remote_clock; 151 u64 now, clock, this_clock, remote_clock;
152 struct sched_clock_data *scd;
153
154 if (sched_clock_stable)
155 return sched_clock();
156
157 scd = cpu_sdc(cpu);
158
159 /*
160 * Normally this is not called in NMI context - but if it is,
161 * trying to do any locking here is totally lethal.
162 */
163 if (unlikely(in_nmi()))
164 return scd->clock;
153 165
154 if (unlikely(!sched_clock_running)) 166 if (unlikely(!sched_clock_running))
155 return 0ull; 167 return 0ull;
@@ -195,14 +207,18 @@ u64 sched_clock_cpu(int cpu)
195 207
196void sched_clock_tick(void) 208void sched_clock_tick(void)
197{ 209{
198 struct sched_clock_data *scd = this_scd(); 210 struct sched_clock_data *scd;
199 u64 now, now_gtod; 211 u64 now, now_gtod;
200 212
213 if (sched_clock_stable)
214 return;
215
201 if (unlikely(!sched_clock_running)) 216 if (unlikely(!sched_clock_running))
202 return; 217 return;
203 218
204 WARN_ON_ONCE(!irqs_disabled()); 219 WARN_ON_ONCE(!irqs_disabled());
205 220
221 scd = this_scd();
206 now_gtod = ktime_to_ns(ktime_get()); 222 now_gtod = ktime_to_ns(ktime_get());
207 now = sched_clock(); 223 now = sched_clock();
208 224
@@ -250,7 +266,7 @@ u64 sched_clock_cpu(int cpu)
250 return sched_clock(); 266 return sched_clock();
251} 267}
252 268
253#endif 269#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
254 270
255unsigned long long cpu_clock(int cpu) 271unsigned long long cpu_clock(int cpu)
256{ 272{
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 642a94ef8a0a..9a7e859b8fbf 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -25,7 +25,7 @@ struct cpupri {
25 25
26#ifdef CONFIG_SMP 26#ifdef CONFIG_SMP
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, cpumask_t *lowest_mask); 28 struct task_struct *p, struct cpumask *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30int cpupri_init(struct cpupri *cp, bool bootmem); 30int cpupri_init(struct cpupri *cp, bool bootmem);
31void cpupri_cleanup(struct cpupri *cp); 31void cpupri_cleanup(struct cpupri *cp);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 16eeba4e4169..467ca72f1657 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -272,7 +272,6 @@ static void print_cpu(struct seq_file *m, int cpu)
272 P(nr_switches); 272 P(nr_switches);
273 P(nr_load_updates); 273 P(nr_load_updates);
274 P(nr_uninterruptible); 274 P(nr_uninterruptible);
275 SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
276 PN(next_balance); 275 PN(next_balance);
277 P(curr->pid); 276 P(curr->pid);
278 PN(clock); 277 PN(clock);
@@ -287,9 +286,6 @@ static void print_cpu(struct seq_file *m, int cpu)
287#ifdef CONFIG_SCHEDSTATS 286#ifdef CONFIG_SCHEDSTATS
288#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); 287#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
289 288
290 P(yld_exp_empty);
291 P(yld_act_empty);
292 P(yld_both_empty);
293 P(yld_count); 289 P(yld_count);
294 290
295 P(sched_switch); 291 P(sched_switch);
@@ -314,7 +310,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
314 u64 now = ktime_to_ns(ktime_get()); 310 u64 now = ktime_to_ns(ktime_get());
315 int cpu; 311 int cpu;
316 312
317 SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n", 313 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n",
318 init_utsname()->release, 314 init_utsname()->release,
319 (int)strcspn(init_utsname()->version, " "), 315 (int)strcspn(init_utsname()->version, " "),
320 init_utsname()->version); 316 init_utsname()->version);
@@ -325,6 +321,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
325 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) 321 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
326#define PN(x) \ 322#define PN(x) \
327 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 323 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
324 P(jiffies);
328 PN(sysctl_sched_latency); 325 PN(sysctl_sched_latency);
329 PN(sysctl_sched_min_granularity); 326 PN(sysctl_sched_min_granularity);
330 PN(sysctl_sched_wakeup_granularity); 327 PN(sysctl_sched_wakeup_granularity);
@@ -397,6 +394,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
397 PN(se.vruntime); 394 PN(se.vruntime);
398 PN(se.sum_exec_runtime); 395 PN(se.sum_exec_runtime);
399 PN(se.avg_overlap); 396 PN(se.avg_overlap);
397 PN(se.avg_wakeup);
400 398
401 nr_switches = p->nvcsw + p->nivcsw; 399 nr_switches = p->nvcsw + p->nivcsw;
402 400
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8e1352c75557..3816f217f119 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -283,7 +283,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
283 struct sched_entity, 283 struct sched_entity,
284 run_node); 284 run_node);
285 285
286 if (vruntime == cfs_rq->min_vruntime) 286 if (!cfs_rq->curr)
287 vruntime = se->vruntime; 287 vruntime = se->vruntime;
288 else 288 else
289 vruntime = min_vruntime(vruntime, se->vruntime); 289 vruntime = min_vruntime(vruntime, se->vruntime);
@@ -429,7 +429,10 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
429 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); 429 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
430 430
431 for_each_sched_entity(se) { 431 for_each_sched_entity(se) {
432 struct load_weight *load = &cfs_rq->load; 432 struct load_weight *load;
433
434 cfs_rq = cfs_rq_of(se);
435 load = &cfs_rq->load;
433 436
434 if (unlikely(!se->on_rq)) { 437 if (unlikely(!se->on_rq)) {
435 struct load_weight lw = cfs_rq->load; 438 struct load_weight lw = cfs_rq->load;
@@ -677,9 +680,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
677 unsigned long thresh = sysctl_sched_latency; 680 unsigned long thresh = sysctl_sched_latency;
678 681
679 /* 682 /*
680 * convert the sleeper threshold into virtual time 683 * Convert the sleeper threshold into virtual time.
684 * SCHED_IDLE is a special sub-class. We care about
685 * fairness only relative to other SCHED_IDLE tasks,
686 * all of which have the same weight.
681 */ 687 */
682 if (sched_feat(NORMALIZED_SLEEPER)) 688 if (sched_feat(NORMALIZED_SLEEPER) &&
689 task_of(se)->policy != SCHED_IDLE)
683 thresh = calc_delta_fair(thresh, se); 690 thresh = calc_delta_fair(thresh, se);
684 691
685 vruntime -= thresh; 692 vruntime -= thresh;
@@ -712,7 +719,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
712 __enqueue_entity(cfs_rq, se); 719 __enqueue_entity(cfs_rq, se);
713} 720}
714 721
715static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 722static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
716{ 723{
717 if (cfs_rq->last == se) 724 if (cfs_rq->last == se)
718 cfs_rq->last = NULL; 725 cfs_rq->last = NULL;
@@ -721,6 +728,12 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
721 cfs_rq->next = NULL; 728 cfs_rq->next = NULL;
722} 729}
723 730
731static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
732{
733 for_each_sched_entity(se)
734 __clear_buddies(cfs_rq_of(se), se);
735}
736
724static void 737static void
725dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 738dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
726{ 739{
@@ -761,8 +774,14 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
761 774
762 ideal_runtime = sched_slice(cfs_rq, curr); 775 ideal_runtime = sched_slice(cfs_rq, curr);
763 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 776 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
764 if (delta_exec > ideal_runtime) 777 if (delta_exec > ideal_runtime) {
765 resched_task(rq_of(cfs_rq)->curr); 778 resched_task(rq_of(cfs_rq)->curr);
779 /*
780 * The current task ran long enough, ensure it doesn't get
781 * re-elected due to buddy favours.
782 */
783 clear_buddies(cfs_rq, curr);
784 }
766} 785}
767 786
768static void 787static void
@@ -1295,16 +1314,63 @@ out:
1295} 1314}
1296#endif /* CONFIG_SMP */ 1315#endif /* CONFIG_SMP */
1297 1316
1298static unsigned long wakeup_gran(struct sched_entity *se) 1317/*
1318 * Adaptive granularity
1319 *
1320 * se->avg_wakeup gives the average time a task runs until it does a wakeup,
1321 * with the limit of wakeup_gran -- when it never does a wakeup.
1322 *
1323 * So the smaller avg_wakeup is the faster we want this task to preempt,
1324 * but we don't want to treat the preemptee unfairly and therefore allow it
1325 * to run for at least the amount of time we'd like to run.
1326 *
1327 * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
1328 *
1329 * NOTE: we use *nr_running to scale with load, this nicely matches the
1330 * degrading latency on load.
1331 */
1332static unsigned long
1333adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
1334{
1335 u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1336 u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
1337 u64 gran = 0;
1338
1339 if (this_run < expected_wakeup)
1340 gran = expected_wakeup - this_run;
1341
1342 return min_t(s64, gran, sysctl_sched_wakeup_granularity);
1343}
1344
1345static unsigned long
1346wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1299{ 1347{
1300 unsigned long gran = sysctl_sched_wakeup_granularity; 1348 unsigned long gran = sysctl_sched_wakeup_granularity;
1301 1349
1350 if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
1351 gran = adaptive_gran(curr, se);
1352
1302 /* 1353 /*
1303 * More easily preempt - nice tasks, while not making it harder for 1354 * Since it's curr running now, convert the gran from real-time
1304 * + nice tasks. 1355 * to virtual-time in its units.
1305 */ 1356 */
1306 if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD) 1357 if (sched_feat(ASYM_GRAN)) {
1307 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); 1358 /*
1359 * By using 'se' instead of 'curr' we penalize light tasks, so
1360 * they get preempted easier. That is, if 'se' < 'curr' then
1361 * the resulting gran will be larger, therefore penalizing the
1362 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1363 * be smaller, again penalizing the lighter task.
1364 *
1365 * This is especially important for buddies when the leftmost
1366 * task is higher priority than the buddy.
1367 */
1368 if (unlikely(se->load.weight != NICE_0_LOAD))
1369 gran = calc_delta_fair(gran, se);
1370 } else {
1371 if (unlikely(curr->load.weight != NICE_0_LOAD))
1372 gran = calc_delta_fair(gran, curr);
1373 }
1308 1374
1309 return gran; 1375 return gran;
1310} 1376}
@@ -1331,7 +1397,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1331 if (vdiff <= 0) 1397 if (vdiff <= 0)
1332 return -1; 1398 return -1;
1333 1399
1334 gran = wakeup_gran(curr); 1400 gran = wakeup_gran(curr, se);
1335 if (vdiff > gran) 1401 if (vdiff > gran)
1336 return 1; 1402 return 1;
1337 1403
@@ -1340,14 +1406,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1340 1406
1341static void set_last_buddy(struct sched_entity *se) 1407static void set_last_buddy(struct sched_entity *se)
1342{ 1408{
1343 for_each_sched_entity(se) 1409 if (likely(task_of(se)->policy != SCHED_IDLE)) {
1344 cfs_rq_of(se)->last = se; 1410 for_each_sched_entity(se)
1411 cfs_rq_of(se)->last = se;
1412 }
1345} 1413}
1346 1414
1347static void set_next_buddy(struct sched_entity *se) 1415static void set_next_buddy(struct sched_entity *se)
1348{ 1416{
1349 for_each_sched_entity(se) 1417 if (likely(task_of(se)->policy != SCHED_IDLE)) {
1350 cfs_rq_of(se)->next = se; 1418 for_each_sched_entity(se)
1419 cfs_rq_of(se)->next = se;
1420 }
1351} 1421}
1352 1422
1353/* 1423/*
@@ -1393,12 +1463,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1393 return; 1463 return;
1394 1464
1395 /* 1465 /*
1396 * Batch tasks do not preempt (their preemption is driven by 1466 * Batch and idle tasks do not preempt (their preemption is driven by
1397 * the tick): 1467 * the tick):
1398 */ 1468 */
1399 if (unlikely(p->policy == SCHED_BATCH)) 1469 if (unlikely(p->policy != SCHED_NORMAL))
1400 return; 1470 return;
1401 1471
1472 /* Idle tasks are by definition preempted by everybody. */
1473 if (unlikely(curr->policy == SCHED_IDLE)) {
1474 resched_task(curr);
1475 return;
1476 }
1477
1402 if (!sched_feat(WAKEUP_PREEMPT)) 1478 if (!sched_feat(WAKEUP_PREEMPT))
1403 return; 1479 return;
1404 1480
@@ -1435,6 +1511,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1435 1511
1436 do { 1512 do {
1437 se = pick_next_entity(cfs_rq); 1513 se = pick_next_entity(cfs_rq);
1514 /*
1515 * If se was a buddy, clear it so that it will have to earn
1516 * the favour again.
1517 */
1518 __clear_buddies(cfs_rq, se);
1438 set_next_entity(cfs_rq, se); 1519 set_next_entity(cfs_rq, se);
1439 cfs_rq = group_cfs_rq(se); 1520 cfs_rq = group_cfs_rq(se);
1440 } while (cfs_rq); 1521 } while (cfs_rq);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index da5d93b5d2c6..4569bfa7df9b 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,5 +1,6 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) 1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
2SCHED_FEAT(NORMALIZED_SLEEPER, 1) 2SCHED_FEAT(NORMALIZED_SLEEPER, 0)
3SCHED_FEAT(ADAPTIVE_GRAN, 1)
3SCHED_FEAT(WAKEUP_PREEMPT, 1) 4SCHED_FEAT(WAKEUP_PREEMPT, 1)
4SCHED_FEAT(START_DEBIT, 1) 5SCHED_FEAT(START_DEBIT, 1)
5SCHED_FEAT(AFFINE_WAKEUPS, 1) 6SCHED_FEAT(AFFINE_WAKEUPS, 1)
@@ -13,3 +14,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
13SCHED_FEAT(ASYM_EFF_LOAD, 1) 14SCHED_FEAT(ASYM_EFF_LOAD, 1)
14SCHED_FEAT(WAKEUP_OVERLAP, 0) 15SCHED_FEAT(WAKEUP_OVERLAP, 0)
15SCHED_FEAT(LAST_BUDDY, 1) 16SCHED_FEAT(LAST_BUDDY, 1)
17SCHED_FEAT(OWNER_SPIN, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 954e1a81b796..299d012b4394 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,6 +3,40 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
7{
8 return container_of(rt_se, struct task_struct, rt);
9}
10
11#ifdef CONFIG_RT_GROUP_SCHED
12
13static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
14{
15 return rt_rq->rq;
16}
17
18static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
19{
20 return rt_se->rt_rq;
21}
22
23#else /* CONFIG_RT_GROUP_SCHED */
24
25static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
26{
27 return container_of(rt_rq, struct rq, rt);
28}
29
30static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
31{
32 struct task_struct *p = rt_task_of(rt_se);
33 struct rq *rq = task_rq(p);
34
35 return &rq->rt;
36}
37
38#endif /* CONFIG_RT_GROUP_SCHED */
39
6#ifdef CONFIG_SMP 40#ifdef CONFIG_SMP
7 41
8static inline int rt_overloaded(struct rq *rq) 42static inline int rt_overloaded(struct rq *rq)
@@ -37,25 +71,69 @@ static inline void rt_clear_overload(struct rq *rq)
37 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); 71 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
38} 72}
39 73
40static void update_rt_migration(struct rq *rq) 74static void update_rt_migration(struct rt_rq *rt_rq)
41{ 75{
42 if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) { 76 if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) {
43 if (!rq->rt.overloaded) { 77 if (!rt_rq->overloaded) {
44 rt_set_overload(rq); 78 rt_set_overload(rq_of_rt_rq(rt_rq));
45 rq->rt.overloaded = 1; 79 rt_rq->overloaded = 1;
46 } 80 }
47 } else if (rq->rt.overloaded) { 81 } else if (rt_rq->overloaded) {
48 rt_clear_overload(rq); 82 rt_clear_overload(rq_of_rt_rq(rt_rq));
49 rq->rt.overloaded = 0; 83 rt_rq->overloaded = 0;
50 } 84 }
51} 85}
52#endif /* CONFIG_SMP */
53 86
54static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 87static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
88{
89 if (rt_se->nr_cpus_allowed > 1)
90 rt_rq->rt_nr_migratory++;
91
92 update_rt_migration(rt_rq);
93}
94
95static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
96{
97 if (rt_se->nr_cpus_allowed > 1)
98 rt_rq->rt_nr_migratory--;
99
100 update_rt_migration(rt_rq);
101}
102
103static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
104{
105 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
106 plist_node_init(&p->pushable_tasks, p->prio);
107 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
108}
109
110static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
111{
112 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
113}
114
115#else
116
117static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
55{ 118{
56 return container_of(rt_se, struct task_struct, rt);
57} 119}
58 120
121static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
122{
123}
124
125static inline
126void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
127{
128}
129
130static inline
131void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
132{
133}
134
135#endif /* CONFIG_SMP */
136
59static inline int on_rt_rq(struct sched_rt_entity *rt_se) 137static inline int on_rt_rq(struct sched_rt_entity *rt_se)
60{ 138{
61 return !list_empty(&rt_se->run_list); 139 return !list_empty(&rt_se->run_list);
@@ -79,16 +157,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
79#define for_each_leaf_rt_rq(rt_rq, rq) \ 157#define for_each_leaf_rt_rq(rt_rq, rq) \
80 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 158 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
81 159
82static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
83{
84 return rt_rq->rq;
85}
86
87static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
88{
89 return rt_se->rt_rq;
90}
91
92#define for_each_sched_rt_entity(rt_se) \ 160#define for_each_sched_rt_entity(rt_se) \
93 for (; rt_se; rt_se = rt_se->parent) 161 for (; rt_se; rt_se = rt_se->parent)
94 162
@@ -108,7 +176,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
108 if (rt_rq->rt_nr_running) { 176 if (rt_rq->rt_nr_running) {
109 if (rt_se && !on_rt_rq(rt_se)) 177 if (rt_se && !on_rt_rq(rt_se))
110 enqueue_rt_entity(rt_se); 178 enqueue_rt_entity(rt_se);
111 if (rt_rq->highest_prio < curr->prio) 179 if (rt_rq->highest_prio.curr < curr->prio)
112 resched_task(curr); 180 resched_task(curr);
113 } 181 }
114} 182}
@@ -176,19 +244,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
176#define for_each_leaf_rt_rq(rt_rq, rq) \ 244#define for_each_leaf_rt_rq(rt_rq, rq) \
177 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 245 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
178 246
179static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
180{
181 return container_of(rt_rq, struct rq, rt);
182}
183
184static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
185{
186 struct task_struct *p = rt_task_of(rt_se);
187 struct rq *rq = task_rq(p);
188
189 return &rq->rt;
190}
191
192#define for_each_sched_rt_entity(rt_se) \ 247#define for_each_sched_rt_entity(rt_se) \
193 for (; rt_se; rt_se = NULL) 248 for (; rt_se; rt_se = NULL)
194 249
@@ -473,7 +528,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
473 struct rt_rq *rt_rq = group_rt_rq(rt_se); 528 struct rt_rq *rt_rq = group_rt_rq(rt_se);
474 529
475 if (rt_rq) 530 if (rt_rq)
476 return rt_rq->highest_prio; 531 return rt_rq->highest_prio.curr;
477#endif 532#endif
478 533
479 return rt_task_of(rt_se)->prio; 534 return rt_task_of(rt_se)->prio;
@@ -547,91 +602,174 @@ static void update_curr_rt(struct rq *rq)
547 } 602 }
548} 603}
549 604
550static inline 605#if defined CONFIG_SMP
551void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 606
607static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
608
609static inline int next_prio(struct rq *rq)
552{ 610{
553 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 611 struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
554 rt_rq->rt_nr_running++; 612
555#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 613 if (next && rt_prio(next->prio))
556 if (rt_se_prio(rt_se) < rt_rq->highest_prio) { 614 return next->prio;
557#ifdef CONFIG_SMP 615 else
558 struct rq *rq = rq_of_rt_rq(rt_rq); 616 return MAX_RT_PRIO;
559#endif 617}
618
619static void
620inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
621{
622 struct rq *rq = rq_of_rt_rq(rt_rq);
623
624 if (prio < prev_prio) {
625
626 /*
627 * If the new task is higher in priority than anything on the
628 * run-queue, we know that the previous high becomes our
629 * next-highest.
630 */
631 rt_rq->highest_prio.next = prev_prio;
560 632
561 rt_rq->highest_prio = rt_se_prio(rt_se);
562#ifdef CONFIG_SMP
563 if (rq->online) 633 if (rq->online)
564 cpupri_set(&rq->rd->cpupri, rq->cpu, 634 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
565 rt_se_prio(rt_se));
566#endif
567 }
568#endif
569#ifdef CONFIG_SMP
570 if (rt_se->nr_cpus_allowed > 1) {
571 struct rq *rq = rq_of_rt_rq(rt_rq);
572 635
573 rq->rt.rt_nr_migratory++; 636 } else if (prio == rt_rq->highest_prio.curr)
574 } 637 /*
638 * If the next task is equal in priority to the highest on
639 * the run-queue, then we implicitly know that the next highest
640 * task cannot be any lower than current
641 */
642 rt_rq->highest_prio.next = prio;
643 else if (prio < rt_rq->highest_prio.next)
644 /*
645 * Otherwise, we need to recompute next-highest
646 */
647 rt_rq->highest_prio.next = next_prio(rq);
648}
575 649
576 update_rt_migration(rq_of_rt_rq(rt_rq)); 650static void
577#endif 651dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
578#ifdef CONFIG_RT_GROUP_SCHED 652{
579 if (rt_se_boosted(rt_se)) 653 struct rq *rq = rq_of_rt_rq(rt_rq);
580 rt_rq->rt_nr_boosted++;
581 654
582 if (rt_rq->tg) 655 if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
583 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); 656 rt_rq->highest_prio.next = next_prio(rq);
584#else 657
585 start_rt_bandwidth(&def_rt_bandwidth); 658 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
586#endif 659 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
587} 660}
588 661
662#else /* CONFIG_SMP */
663
589static inline 664static inline
590void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 665void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
591{ 666static inline
592#ifdef CONFIG_SMP 667void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
593 int highest_prio = rt_rq->highest_prio; 668
594#endif 669#endif /* CONFIG_SMP */
595 670
596 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
597 WARN_ON(!rt_rq->rt_nr_running);
598 rt_rq->rt_nr_running--;
599#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 671#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
672static void
673inc_rt_prio(struct rt_rq *rt_rq, int prio)
674{
675 int prev_prio = rt_rq->highest_prio.curr;
676
677 if (prio < prev_prio)
678 rt_rq->highest_prio.curr = prio;
679
680 inc_rt_prio_smp(rt_rq, prio, prev_prio);
681}
682
683static void
684dec_rt_prio(struct rt_rq *rt_rq, int prio)
685{
686 int prev_prio = rt_rq->highest_prio.curr;
687
600 if (rt_rq->rt_nr_running) { 688 if (rt_rq->rt_nr_running) {
601 struct rt_prio_array *array;
602 689
603 WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio); 690 WARN_ON(prio < prev_prio);
604 if (rt_se_prio(rt_se) == rt_rq->highest_prio) { 691
605 /* recalculate */ 692 /*
606 array = &rt_rq->active; 693 * This may have been our highest task, and therefore
607 rt_rq->highest_prio = 694 * we may have some recomputation to do
695 */
696 if (prio == prev_prio) {
697 struct rt_prio_array *array = &rt_rq->active;
698
699 rt_rq->highest_prio.curr =
608 sched_find_first_bit(array->bitmap); 700 sched_find_first_bit(array->bitmap);
609 } /* otherwise leave rq->highest prio alone */ 701 }
702
610 } else 703 } else
611 rt_rq->highest_prio = MAX_RT_PRIO; 704 rt_rq->highest_prio.curr = MAX_RT_PRIO;
612#endif
613#ifdef CONFIG_SMP
614 if (rt_se->nr_cpus_allowed > 1) {
615 struct rq *rq = rq_of_rt_rq(rt_rq);
616 rq->rt.rt_nr_migratory--;
617 }
618 705
619 if (rt_rq->highest_prio != highest_prio) { 706 dec_rt_prio_smp(rt_rq, prio, prev_prio);
620 struct rq *rq = rq_of_rt_rq(rt_rq); 707}
621 708
622 if (rq->online) 709#else
623 cpupri_set(&rq->rd->cpupri, rq->cpu, 710
624 rt_rq->highest_prio); 711static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
625 } 712static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
713
714#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
626 715
627 update_rt_migration(rq_of_rt_rq(rt_rq));
628#endif /* CONFIG_SMP */
629#ifdef CONFIG_RT_GROUP_SCHED 716#ifdef CONFIG_RT_GROUP_SCHED
717
718static void
719inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
720{
721 if (rt_se_boosted(rt_se))
722 rt_rq->rt_nr_boosted++;
723
724 if (rt_rq->tg)
725 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
726}
727
728static void
729dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
730{
630 if (rt_se_boosted(rt_se)) 731 if (rt_se_boosted(rt_se))
631 rt_rq->rt_nr_boosted--; 732 rt_rq->rt_nr_boosted--;
632 733
633 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); 734 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
634#endif 735}
736
737#else /* CONFIG_RT_GROUP_SCHED */
738
739static void
740inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
741{
742 start_rt_bandwidth(&def_rt_bandwidth);
743}
744
745static inline
746void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
747
748#endif /* CONFIG_RT_GROUP_SCHED */
749
750static inline
751void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
752{
753 int prio = rt_se_prio(rt_se);
754
755 WARN_ON(!rt_prio(prio));
756 rt_rq->rt_nr_running++;
757
758 inc_rt_prio(rt_rq, prio);
759 inc_rt_migration(rt_se, rt_rq);
760 inc_rt_group(rt_se, rt_rq);
761}
762
763static inline
764void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
765{
766 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
767 WARN_ON(!rt_rq->rt_nr_running);
768 rt_rq->rt_nr_running--;
769
770 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
771 dec_rt_migration(rt_se, rt_rq);
772 dec_rt_group(rt_se, rt_rq);
635} 773}
636 774
637static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 775static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -718,6 +856,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
718 856
719 enqueue_rt_entity(rt_se); 857 enqueue_rt_entity(rt_se);
720 858
859 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
860 enqueue_pushable_task(rq, p);
861
721 inc_cpu_load(rq, p->se.load.weight); 862 inc_cpu_load(rq, p->se.load.weight);
722} 863}
723 864
@@ -728,6 +869,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
728 update_curr_rt(rq); 869 update_curr_rt(rq);
729 dequeue_rt_entity(rt_se); 870 dequeue_rt_entity(rt_se);
730 871
872 dequeue_pushable_task(rq, p);
873
731 dec_cpu_load(rq, p->se.load.weight); 874 dec_cpu_load(rq, p->se.load.weight);
732} 875}
733 876
@@ -878,7 +1021,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
878 return next; 1021 return next;
879} 1022}
880 1023
881static struct task_struct *pick_next_task_rt(struct rq *rq) 1024static struct task_struct *_pick_next_task_rt(struct rq *rq)
882{ 1025{
883 struct sched_rt_entity *rt_se; 1026 struct sched_rt_entity *rt_se;
884 struct task_struct *p; 1027 struct task_struct *p;
@@ -900,6 +1043,18 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
900 1043
901 p = rt_task_of(rt_se); 1044 p = rt_task_of(rt_se);
902 p->se.exec_start = rq->clock; 1045 p->se.exec_start = rq->clock;
1046
1047 return p;
1048}
1049
1050static struct task_struct *pick_next_task_rt(struct rq *rq)
1051{
1052 struct task_struct *p = _pick_next_task_rt(rq);
1053
1054 /* The running task is never eligible for pushing */
1055 if (p)
1056 dequeue_pushable_task(rq, p);
1057
903 return p; 1058 return p;
904} 1059}
905 1060
@@ -907,6 +1062,13 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
907{ 1062{
908 update_curr_rt(rq); 1063 update_curr_rt(rq);
909 p->se.exec_start = 0; 1064 p->se.exec_start = 0;
1065
1066 /*
1067 * The previous task needs to be made eligible for pushing
1068 * if it is still active
1069 */
1070 if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
1071 enqueue_pushable_task(rq, p);
910} 1072}
911 1073
912#ifdef CONFIG_SMP 1074#ifdef CONFIG_SMP
@@ -960,16 +1122,17 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
960 1122
961static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 1123static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
962 1124
963static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) 1125static inline int pick_optimal_cpu(int this_cpu,
1126 const struct cpumask *mask)
964{ 1127{
965 int first; 1128 int first;
966 1129
967 /* "this_cpu" is cheaper to preempt than a remote processor */ 1130 /* "this_cpu" is cheaper to preempt than a remote processor */
968 if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) 1131 if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
969 return this_cpu; 1132 return this_cpu;
970 1133
971 first = first_cpu(*mask); 1134 first = cpumask_first(mask);
972 if (first != NR_CPUS) 1135 if (first < nr_cpu_ids)
973 return first; 1136 return first;
974 1137
975 return -1; 1138 return -1;
@@ -981,6 +1144,7 @@ static int find_lowest_rq(struct task_struct *task)
981 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); 1144 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
982 int this_cpu = smp_processor_id(); 1145 int this_cpu = smp_processor_id();
983 int cpu = task_cpu(task); 1146 int cpu = task_cpu(task);
1147 cpumask_var_t domain_mask;
984 1148
985 if (task->rt.nr_cpus_allowed == 1) 1149 if (task->rt.nr_cpus_allowed == 1)
986 return -1; /* No other targets possible */ 1150 return -1; /* No other targets possible */
@@ -1013,19 +1177,25 @@ static int find_lowest_rq(struct task_struct *task)
1013 if (this_cpu == cpu) 1177 if (this_cpu == cpu)
1014 this_cpu = -1; /* Skip this_cpu opt if the same */ 1178 this_cpu = -1; /* Skip this_cpu opt if the same */
1015 1179
1016 for_each_domain(cpu, sd) { 1180 if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
1017 if (sd->flags & SD_WAKE_AFFINE) { 1181 for_each_domain(cpu, sd) {
1018 cpumask_t domain_mask; 1182 if (sd->flags & SD_WAKE_AFFINE) {
1019 int best_cpu; 1183 int best_cpu;
1184
1185 cpumask_and(domain_mask,
1186 sched_domain_span(sd),
1187 lowest_mask);
1020 1188
1021 cpumask_and(&domain_mask, sched_domain_span(sd), 1189 best_cpu = pick_optimal_cpu(this_cpu,
1022 lowest_mask); 1190 domain_mask);
1023 1191
1024 best_cpu = pick_optimal_cpu(this_cpu, 1192 if (best_cpu != -1) {
1025 &domain_mask); 1193 free_cpumask_var(domain_mask);
1026 if (best_cpu != -1) 1194 return best_cpu;
1027 return best_cpu; 1195 }
1196 }
1028 } 1197 }
1198 free_cpumask_var(domain_mask);
1029 } 1199 }
1030 1200
1031 /* 1201 /*
@@ -1072,7 +1242,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1072 } 1242 }
1073 1243
1074 /* If this rq is still suitable use it. */ 1244 /* If this rq is still suitable use it. */
1075 if (lowest_rq->rt.highest_prio > task->prio) 1245 if (lowest_rq->rt.highest_prio.curr > task->prio)
1076 break; 1246 break;
1077 1247
1078 /* try again */ 1248 /* try again */
@@ -1083,6 +1253,31 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1083 return lowest_rq; 1253 return lowest_rq;
1084} 1254}
1085 1255
1256static inline int has_pushable_tasks(struct rq *rq)
1257{
1258 return !plist_head_empty(&rq->rt.pushable_tasks);
1259}
1260
1261static struct task_struct *pick_next_pushable_task(struct rq *rq)
1262{
1263 struct task_struct *p;
1264
1265 if (!has_pushable_tasks(rq))
1266 return NULL;
1267
1268 p = plist_first_entry(&rq->rt.pushable_tasks,
1269 struct task_struct, pushable_tasks);
1270
1271 BUG_ON(rq->cpu != task_cpu(p));
1272 BUG_ON(task_current(rq, p));
1273 BUG_ON(p->rt.nr_cpus_allowed <= 1);
1274
1275 BUG_ON(!p->se.on_rq);
1276 BUG_ON(!rt_task(p));
1277
1278 return p;
1279}
1280
1086/* 1281/*
1087 * If the current CPU has more than one RT task, see if the non 1282 * If the current CPU has more than one RT task, see if the non
1088 * running task can migrate over to a CPU that is running a task 1283 * running task can migrate over to a CPU that is running a task
@@ -1092,13 +1287,11 @@ static int push_rt_task(struct rq *rq)
1092{ 1287{
1093 struct task_struct *next_task; 1288 struct task_struct *next_task;
1094 struct rq *lowest_rq; 1289 struct rq *lowest_rq;
1095 int ret = 0;
1096 int paranoid = RT_MAX_TRIES;
1097 1290
1098 if (!rq->rt.overloaded) 1291 if (!rq->rt.overloaded)
1099 return 0; 1292 return 0;
1100 1293
1101 next_task = pick_next_highest_task_rt(rq, -1); 1294 next_task = pick_next_pushable_task(rq);
1102 if (!next_task) 1295 if (!next_task)
1103 return 0; 1296 return 0;
1104 1297
@@ -1127,16 +1320,34 @@ static int push_rt_task(struct rq *rq)
1127 struct task_struct *task; 1320 struct task_struct *task;
1128 /* 1321 /*
1129 * find lock_lowest_rq releases rq->lock 1322 * find lock_lowest_rq releases rq->lock
1130 * so it is possible that next_task has changed. 1323 * so it is possible that next_task has migrated.
1131 * If it has, then try again. 1324 *
1325 * We need to make sure that the task is still on the same
1326 * run-queue and is also still the next task eligible for
1327 * pushing.
1132 */ 1328 */
1133 task = pick_next_highest_task_rt(rq, -1); 1329 task = pick_next_pushable_task(rq);
1134 if (unlikely(task != next_task) && task && paranoid--) { 1330 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1135 put_task_struct(next_task); 1331 /*
1136 next_task = task; 1332 * If we get here, the task hasnt moved at all, but
1137 goto retry; 1333 * it has failed to push. We will not try again,
1334 * since the other cpus will pull from us when they
1335 * are ready.
1336 */
1337 dequeue_pushable_task(rq, next_task);
1338 goto out;
1138 } 1339 }
1139 goto out; 1340
1341 if (!task)
1342 /* No more tasks, just exit */
1343 goto out;
1344
1345 /*
1346 * Something has shifted, try again.
1347 */
1348 put_task_struct(next_task);
1349 next_task = task;
1350 goto retry;
1140 } 1351 }
1141 1352
1142 deactivate_task(rq, next_task, 0); 1353 deactivate_task(rq, next_task, 0);
@@ -1147,23 +1358,12 @@ static int push_rt_task(struct rq *rq)
1147 1358
1148 double_unlock_balance(rq, lowest_rq); 1359 double_unlock_balance(rq, lowest_rq);
1149 1360
1150 ret = 1;
1151out: 1361out:
1152 put_task_struct(next_task); 1362 put_task_struct(next_task);
1153 1363
1154 return ret; 1364 return 1;
1155} 1365}
1156 1366
1157/*
1158 * TODO: Currently we just use the second highest prio task on
1159 * the queue, and stop when it can't migrate (or there's
1160 * no more RT tasks). There may be a case where a lower
1161 * priority RT task has a different affinity than the
1162 * higher RT task. In this case the lower RT task could
1163 * possibly be able to migrate where as the higher priority
1164 * RT task could not. We currently ignore this issue.
1165 * Enhancements are welcome!
1166 */
1167static void push_rt_tasks(struct rq *rq) 1367static void push_rt_tasks(struct rq *rq)
1168{ 1368{
1169 /* push_rt_task will return true if it moved an RT */ 1369 /* push_rt_task will return true if it moved an RT */
@@ -1174,33 +1374,35 @@ static void push_rt_tasks(struct rq *rq)
1174static int pull_rt_task(struct rq *this_rq) 1374static int pull_rt_task(struct rq *this_rq)
1175{ 1375{
1176 int this_cpu = this_rq->cpu, ret = 0, cpu; 1376 int this_cpu = this_rq->cpu, ret = 0, cpu;
1177 struct task_struct *p, *next; 1377 struct task_struct *p;
1178 struct rq *src_rq; 1378 struct rq *src_rq;
1179 1379
1180 if (likely(!rt_overloaded(this_rq))) 1380 if (likely(!rt_overloaded(this_rq)))
1181 return 0; 1381 return 0;
1182 1382
1183 next = pick_next_task_rt(this_rq);
1184
1185 for_each_cpu(cpu, this_rq->rd->rto_mask) { 1383 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1186 if (this_cpu == cpu) 1384 if (this_cpu == cpu)
1187 continue; 1385 continue;
1188 1386
1189 src_rq = cpu_rq(cpu); 1387 src_rq = cpu_rq(cpu);
1388
1389 /*
1390 * Don't bother taking the src_rq->lock if the next highest
1391 * task is known to be lower-priority than our current task.
1392 * This may look racy, but if this value is about to go
1393 * logically higher, the src_rq will push this task away.
1394 * And if its going logically lower, we do not care
1395 */
1396 if (src_rq->rt.highest_prio.next >=
1397 this_rq->rt.highest_prio.curr)
1398 continue;
1399
1190 /* 1400 /*
1191 * We can potentially drop this_rq's lock in 1401 * We can potentially drop this_rq's lock in
1192 * double_lock_balance, and another CPU could 1402 * double_lock_balance, and another CPU could
1193 * steal our next task - hence we must cause 1403 * alter this_rq
1194 * the caller to recalculate the next task
1195 * in that case:
1196 */ 1404 */
1197 if (double_lock_balance(this_rq, src_rq)) { 1405 double_lock_balance(this_rq, src_rq);
1198 struct task_struct *old_next = next;
1199
1200 next = pick_next_task_rt(this_rq);
1201 if (next != old_next)
1202 ret = 1;
1203 }
1204 1406
1205 /* 1407 /*
1206 * Are there still pullable RT tasks? 1408 * Are there still pullable RT tasks?
@@ -1214,7 +1416,7 @@ static int pull_rt_task(struct rq *this_rq)
1214 * Do we have an RT task that preempts 1416 * Do we have an RT task that preempts
1215 * the to-be-scheduled task? 1417 * the to-be-scheduled task?
1216 */ 1418 */
1217 if (p && (!next || (p->prio < next->prio))) { 1419 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1218 WARN_ON(p == src_rq->curr); 1420 WARN_ON(p == src_rq->curr);
1219 WARN_ON(!p->se.on_rq); 1421 WARN_ON(!p->se.on_rq);
1220 1422
@@ -1224,12 +1426,9 @@ static int pull_rt_task(struct rq *this_rq)
1224 * This is just that p is wakeing up and hasn't 1426 * This is just that p is wakeing up and hasn't
1225 * had a chance to schedule. We only pull 1427 * had a chance to schedule. We only pull
1226 * p if it is lower in priority than the 1428 * p if it is lower in priority than the
1227 * current task on the run queue or 1429 * current task on the run queue
1228 * this_rq next task is lower in prio than
1229 * the current task on that rq.
1230 */ 1430 */
1231 if (p->prio < src_rq->curr->prio || 1431 if (p->prio < src_rq->curr->prio)
1232 (next && next->prio < src_rq->curr->prio))
1233 goto skip; 1432 goto skip;
1234 1433
1235 ret = 1; 1434 ret = 1;
@@ -1242,13 +1441,7 @@ static int pull_rt_task(struct rq *this_rq)
1242 * case there's an even higher prio task 1441 * case there's an even higher prio task
1243 * in another runqueue. (low likelyhood 1442 * in another runqueue. (low likelyhood
1244 * but possible) 1443 * but possible)
1245 *
1246 * Update next so that we won't pick a task
1247 * on another cpu with a priority lower (or equal)
1248 * than the one we just picked.
1249 */ 1444 */
1250 next = p;
1251
1252 } 1445 }
1253 skip: 1446 skip:
1254 double_unlock_balance(this_rq, src_rq); 1447 double_unlock_balance(this_rq, src_rq);
@@ -1260,24 +1453,27 @@ static int pull_rt_task(struct rq *this_rq)
1260static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) 1453static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1261{ 1454{
1262 /* Try to pull RT tasks here if we lower this rq's prio */ 1455 /* Try to pull RT tasks here if we lower this rq's prio */
1263 if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) 1456 if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
1264 pull_rt_task(rq); 1457 pull_rt_task(rq);
1265} 1458}
1266 1459
1460/*
1461 * assumes rq->lock is held
1462 */
1463static int needs_post_schedule_rt(struct rq *rq)
1464{
1465 return has_pushable_tasks(rq);
1466}
1467
1267static void post_schedule_rt(struct rq *rq) 1468static void post_schedule_rt(struct rq *rq)
1268{ 1469{
1269 /* 1470 /*
1270 * If we have more than one rt_task queued, then 1471 * This is only called if needs_post_schedule_rt() indicates that
1271 * see if we can push the other rt_tasks off to other CPUS. 1472 * we need to push tasks away
1272 * Note we may release the rq lock, and since
1273 * the lock was owned by prev, we need to release it
1274 * first via finish_lock_switch and then reaquire it here.
1275 */ 1473 */
1276 if (unlikely(rq->rt.overloaded)) { 1474 spin_lock_irq(&rq->lock);
1277 spin_lock_irq(&rq->lock); 1475 push_rt_tasks(rq);
1278 push_rt_tasks(rq); 1476 spin_unlock_irq(&rq->lock);
1279 spin_unlock_irq(&rq->lock);
1280 }
1281} 1477}
1282 1478
1283/* 1479/*
@@ -1288,7 +1484,8 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
1288{ 1484{
1289 if (!task_running(rq, p) && 1485 if (!task_running(rq, p) &&
1290 !test_tsk_need_resched(rq->curr) && 1486 !test_tsk_need_resched(rq->curr) &&
1291 rq->rt.overloaded) 1487 has_pushable_tasks(rq) &&
1488 p->rt.nr_cpus_allowed > 1)
1292 push_rt_tasks(rq); 1489 push_rt_tasks(rq);
1293} 1490}
1294 1491
@@ -1324,6 +1521,24 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1324 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { 1521 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
1325 struct rq *rq = task_rq(p); 1522 struct rq *rq = task_rq(p);
1326 1523
1524 if (!task_current(rq, p)) {
1525 /*
1526 * Make sure we dequeue this task from the pushable list
1527 * before going further. It will either remain off of
1528 * the list because we are no longer pushable, or it
1529 * will be requeued.
1530 */
1531 if (p->rt.nr_cpus_allowed > 1)
1532 dequeue_pushable_task(rq, p);
1533
1534 /*
1535 * Requeue if our weight is changing and still > 1
1536 */
1537 if (weight > 1)
1538 enqueue_pushable_task(rq, p);
1539
1540 }
1541
1327 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { 1542 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
1328 rq->rt.rt_nr_migratory++; 1543 rq->rt.rt_nr_migratory++;
1329 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { 1544 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
@@ -1331,7 +1546,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1331 rq->rt.rt_nr_migratory--; 1546 rq->rt.rt_nr_migratory--;
1332 } 1547 }
1333 1548
1334 update_rt_migration(rq); 1549 update_rt_migration(&rq->rt);
1335 } 1550 }
1336 1551
1337 cpumask_copy(&p->cpus_allowed, new_mask); 1552 cpumask_copy(&p->cpus_allowed, new_mask);
@@ -1346,7 +1561,7 @@ static void rq_online_rt(struct rq *rq)
1346 1561
1347 __enable_runtime(rq); 1562 __enable_runtime(rq);
1348 1563
1349 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); 1564 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
1350} 1565}
1351 1566
1352/* Assumes rq->lock is held */ 1567/* Assumes rq->lock is held */
@@ -1438,7 +1653,7 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
1438 * can release the rq lock and p could migrate. 1653 * can release the rq lock and p could migrate.
1439 * Only reschedule if p is still on the same runqueue. 1654 * Only reschedule if p is still on the same runqueue.
1440 */ 1655 */
1441 if (p->prio > rq->rt.highest_prio && rq->curr == p) 1656 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
1442 resched_task(p); 1657 resched_task(p);
1443#else 1658#else
1444 /* For UP simply resched on drop of prio */ 1659 /* For UP simply resched on drop of prio */
@@ -1509,6 +1724,9 @@ static void set_curr_task_rt(struct rq *rq)
1509 struct task_struct *p = rq->curr; 1724 struct task_struct *p = rq->curr;
1510 1725
1511 p->se.exec_start = rq->clock; 1726 p->se.exec_start = rq->clock;
1727
1728 /* The running task is never eligible for pushing */
1729 dequeue_pushable_task(rq, p);
1512} 1730}
1513 1731
1514static const struct sched_class rt_sched_class = { 1732static const struct sched_class rt_sched_class = {
@@ -1531,6 +1749,7 @@ static const struct sched_class rt_sched_class = {
1531 .rq_online = rq_online_rt, 1749 .rq_online = rq_online_rt,
1532 .rq_offline = rq_offline_rt, 1750 .rq_offline = rq_offline_rt,
1533 .pre_schedule = pre_schedule_rt, 1751 .pre_schedule = pre_schedule_rt,
1752 .needs_post_schedule = needs_post_schedule_rt,
1534 .post_schedule = post_schedule_rt, 1753 .post_schedule = post_schedule_rt,
1535 .task_wake_up = task_wake_up_rt, 1754 .task_wake_up = task_wake_up_rt,
1536 .switched_from = switched_from_rt, 1755 .switched_from = switched_from_rt,
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index f2773b5d1226..32d2bd4061b0 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -4,7 +4,7 @@
4 * bump this up when changing the output format or the meaning of an existing 4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort) 5 * format, so that tools can adapt (or abort)
6 */ 6 */
7#define SCHEDSTAT_VERSION 14 7#define SCHEDSTAT_VERSION 15
8 8
9static int show_schedstat(struct seq_file *seq, void *v) 9static int show_schedstat(struct seq_file *seq, void *v)
10{ 10{
@@ -26,9 +26,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
26 26
27 /* runqueue-specific stats */ 27 /* runqueue-specific stats */
28 seq_printf(seq, 28 seq_printf(seq,
29 "cpu%d %u %u %u %u %u %u %u %u %u %llu %llu %lu", 29 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
30 cpu, rq->yld_both_empty, 30 cpu, rq->yld_count,
31 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
32 rq->sched_switch, rq->sched_count, rq->sched_goidle, 31 rq->sched_switch, rq->sched_count, rq->sched_goidle,
33 rq->ttwu_count, rq->ttwu_local, 32 rq->ttwu_count, rq->ttwu_local,
34 rq->rq_cpu_time, 33 rq->rq_cpu_time,
@@ -296,20 +295,21 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
296static inline void account_group_user_time(struct task_struct *tsk, 295static inline void account_group_user_time(struct task_struct *tsk,
297 cputime_t cputime) 296 cputime_t cputime)
298{ 297{
299 struct signal_struct *sig; 298 struct thread_group_cputimer *cputimer;
300 299
301 /* tsk == current, ensure it is safe to use ->signal */ 300 /* tsk == current, ensure it is safe to use ->signal */
302 if (unlikely(tsk->exit_state)) 301 if (unlikely(tsk->exit_state))
303 return; 302 return;
304 303
305 sig = tsk->signal; 304 cputimer = &tsk->signal->cputimer;
306 if (sig->cputime.totals) {
307 struct task_cputime *times;
308 305
309 times = per_cpu_ptr(sig->cputime.totals, get_cpu()); 306 if (!cputimer->running)
310 times->utime = cputime_add(times->utime, cputime); 307 return;
311 put_cpu_no_resched(); 308
312 } 309 spin_lock(&cputimer->lock);
310 cputimer->cputime.utime =
311 cputime_add(cputimer->cputime.utime, cputime);
312 spin_unlock(&cputimer->lock);
313} 313}
314 314
315/** 315/**
@@ -325,20 +325,21 @@ static inline void account_group_user_time(struct task_struct *tsk,
325static inline void account_group_system_time(struct task_struct *tsk, 325static inline void account_group_system_time(struct task_struct *tsk,
326 cputime_t cputime) 326 cputime_t cputime)
327{ 327{
328 struct signal_struct *sig; 328 struct thread_group_cputimer *cputimer;
329 329
330 /* tsk == current, ensure it is safe to use ->signal */ 330 /* tsk == current, ensure it is safe to use ->signal */
331 if (unlikely(tsk->exit_state)) 331 if (unlikely(tsk->exit_state))
332 return; 332 return;
333 333
334 sig = tsk->signal; 334 cputimer = &tsk->signal->cputimer;
335 if (sig->cputime.totals) {
336 struct task_cputime *times;
337 335
338 times = per_cpu_ptr(sig->cputime.totals, get_cpu()); 336 if (!cputimer->running)
339 times->stime = cputime_add(times->stime, cputime); 337 return;
340 put_cpu_no_resched(); 338
341 } 339 spin_lock(&cputimer->lock);
340 cputimer->cputime.stime =
341 cputime_add(cputimer->cputime.stime, cputime);
342 spin_unlock(&cputimer->lock);
342} 343}
343 344
344/** 345/**
@@ -354,6 +355,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
354static inline void account_group_exec_runtime(struct task_struct *tsk, 355static inline void account_group_exec_runtime(struct task_struct *tsk,
355 unsigned long long ns) 356 unsigned long long ns)
356{ 357{
358 struct thread_group_cputimer *cputimer;
357 struct signal_struct *sig; 359 struct signal_struct *sig;
358 360
359 sig = tsk->signal; 361 sig = tsk->signal;
@@ -362,11 +364,12 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
362 if (unlikely(!sig)) 364 if (unlikely(!sig))
363 return; 365 return;
364 366
365 if (sig->cputime.totals) { 367 cputimer = &sig->cputimer;
366 struct task_cputime *times;
367 368
368 times = per_cpu_ptr(sig->cputime.totals, get_cpu()); 369 if (!cputimer->running)
369 times->sum_exec_runtime += ns; 370 return;
370 put_cpu_no_resched(); 371
371 } 372 spin_lock(&cputimer->lock);
373 cputimer->cputime.sum_exec_runtime += ns;
374 spin_unlock(&cputimer->lock);
372} 375}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ad64fcb731f2..57d4b13b631d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/seccomp.h> 9#include <linux/seccomp.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/compat.h>
11 12
12/* #define SECCOMP_DEBUG 1 */ 13/* #define SECCOMP_DEBUG 1 */
13#define NR_SECCOMP_MODES 1 14#define NR_SECCOMP_MODES 1
@@ -22,7 +23,7 @@ static int mode1_syscalls[] = {
22 0, /* null terminated */ 23 0, /* null terminated */
23}; 24};
24 25
25#ifdef TIF_32BIT 26#ifdef CONFIG_COMPAT
26static int mode1_syscalls_32[] = { 27static int mode1_syscalls_32[] = {
27 __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32, 28 __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
28 0, /* null terminated */ 29 0, /* null terminated */
@@ -37,8 +38,8 @@ void __secure_computing(int this_syscall)
37 switch (mode) { 38 switch (mode) {
38 case 1: 39 case 1:
39 syscall = mode1_syscalls; 40 syscall = mode1_syscalls;
40#ifdef TIF_32BIT 41#ifdef CONFIG_COMPAT
41 if (test_thread_flag(TIF_32BIT)) 42 if (is_compat_task())
42 syscall = mode1_syscalls_32; 43 syscall = mode1_syscalls_32;
43#endif 44#endif
44 do { 45 do {
diff --git a/kernel/signal.c b/kernel/signal.c
index 3152ac3b62e2..d8034737db4c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -55,10 +55,22 @@ static int sig_handler_ignored(void __user *handler, int sig)
55 (handler == SIG_DFL && sig_kernel_ignore(sig)); 55 (handler == SIG_DFL && sig_kernel_ignore(sig));
56} 56}
57 57
58static int sig_ignored(struct task_struct *t, int sig) 58static int sig_task_ignored(struct task_struct *t, int sig,
59 int from_ancestor_ns)
59{ 60{
60 void __user *handler; 61 void __user *handler;
61 62
63 handler = sig_handler(t, sig);
64
65 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
66 handler == SIG_DFL && !from_ancestor_ns)
67 return 1;
68
69 return sig_handler_ignored(handler, sig);
70}
71
72static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
73{
62 /* 74 /*
63 * Blocked signals are never ignored, since the 75 * Blocked signals are never ignored, since the
64 * signal handler may change by the time it is 76 * signal handler may change by the time it is
@@ -67,14 +79,13 @@ static int sig_ignored(struct task_struct *t, int sig)
67 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 79 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
68 return 0; 80 return 0;
69 81
70 handler = sig_handler(t, sig); 82 if (!sig_task_ignored(t, sig, from_ancestor_ns))
71 if (!sig_handler_ignored(handler, sig))
72 return 0; 83 return 0;
73 84
74 /* 85 /*
75 * Tracers may want to know about even ignored signals. 86 * Tracers may want to know about even ignored signals.
76 */ 87 */
77 return !tracehook_consider_ignored_signal(t, sig, handler); 88 return !tracehook_consider_ignored_signal(t, sig);
78} 89}
79 90
80/* 91/*
@@ -318,7 +329,7 @@ int unhandled_signal(struct task_struct *tsk, int sig)
318 return 1; 329 return 1;
319 if (handler != SIG_IGN && handler != SIG_DFL) 330 if (handler != SIG_IGN && handler != SIG_DFL)
320 return 0; 331 return 0;
321 return !tracehook_consider_fatal_signal(tsk, sig, handler); 332 return !tracehook_consider_fatal_signal(tsk, sig);
322} 333}
323 334
324 335
@@ -624,7 +635,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
624 * Returns true if the signal should be actually delivered, otherwise 635 * Returns true if the signal should be actually delivered, otherwise
625 * it should be dropped. 636 * it should be dropped.
626 */ 637 */
627static int prepare_signal(int sig, struct task_struct *p) 638static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
628{ 639{
629 struct signal_struct *signal = p->signal; 640 struct signal_struct *signal = p->signal;
630 struct task_struct *t; 641 struct task_struct *t;
@@ -708,7 +719,7 @@ static int prepare_signal(int sig, struct task_struct *p)
708 } 719 }
709 } 720 }
710 721
711 return !sig_ignored(p, sig); 722 return !sig_ignored(p, sig, from_ancestor_ns);
712} 723}
713 724
714/* 725/*
@@ -777,7 +788,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
777 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && 788 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
778 !sigismember(&t->real_blocked, sig) && 789 !sigismember(&t->real_blocked, sig) &&
779 (sig == SIGKILL || 790 (sig == SIGKILL ||
780 !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) { 791 !tracehook_consider_fatal_signal(t, sig))) {
781 /* 792 /*
782 * This signal will be fatal to the whole group. 793 * This signal will be fatal to the whole group.
783 */ 794 */
@@ -813,8 +824,8 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
813 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); 824 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
814} 825}
815 826
816static int send_signal(int sig, struct siginfo *info, struct task_struct *t, 827static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
817 int group) 828 int group, int from_ancestor_ns)
818{ 829{
819 struct sigpending *pending; 830 struct sigpending *pending;
820 struct sigqueue *q; 831 struct sigqueue *q;
@@ -822,7 +833,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
822 trace_sched_signal_send(sig, t); 833 trace_sched_signal_send(sig, t);
823 834
824 assert_spin_locked(&t->sighand->siglock); 835 assert_spin_locked(&t->sighand->siglock);
825 if (!prepare_signal(sig, t)) 836
837 if (!prepare_signal(sig, t, from_ancestor_ns))
826 return 0; 838 return 0;
827 839
828 pending = group ? &t->signal->shared_pending : &t->pending; 840 pending = group ? &t->signal->shared_pending : &t->pending;
@@ -871,6 +883,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
871 break; 883 break;
872 default: 884 default:
873 copy_siginfo(&q->info, info); 885 copy_siginfo(&q->info, info);
886 if (from_ancestor_ns)
887 q->info.si_pid = 0;
874 break; 888 break;
875 } 889 }
876 } else if (!is_si_special(info)) { 890 } else if (!is_si_special(info)) {
@@ -889,6 +903,20 @@ out_set:
889 return 0; 903 return 0;
890} 904}
891 905
906static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
907 int group)
908{
909 int from_ancestor_ns = 0;
910
911#ifdef CONFIG_PID_NS
912 if (!is_si_special(info) && SI_FROMUSER(info) &&
913 task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0)
914 from_ancestor_ns = 1;
915#endif
916
917 return __send_signal(sig, info, t, group, from_ancestor_ns);
918}
919
892int print_fatal_signals; 920int print_fatal_signals;
893 921
894static void print_fatal_signal(struct pt_regs *regs, int signr) 922static void print_fatal_signal(struct pt_regs *regs, int signr)
@@ -909,7 +937,9 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
909 } 937 }
910#endif 938#endif
911 printk("\n"); 939 printk("\n");
940 preempt_disable();
912 show_regs(regs); 941 show_regs(regs);
942 preempt_enable();
913} 943}
914 944
915static int __init setup_print_fatal_signals(char *str) 945static int __init setup_print_fatal_signals(char *str)
@@ -1131,7 +1161,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1131 if (sig && p->sighand) { 1161 if (sig && p->sighand) {
1132 unsigned long flags; 1162 unsigned long flags;
1133 spin_lock_irqsave(&p->sighand->siglock, flags); 1163 spin_lock_irqsave(&p->sighand->siglock, flags);
1134 ret = __group_send_sig_info(sig, info, p); 1164 ret = __send_signal(sig, info, p, 1, 0);
1135 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1165 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1136 } 1166 }
1137out_unlock: 1167out_unlock:
@@ -1318,7 +1348,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1318 goto ret; 1348 goto ret;
1319 1349
1320 ret = 1; /* the signal is ignored */ 1350 ret = 1; /* the signal is ignored */
1321 if (!prepare_signal(sig, t)) 1351 if (!prepare_signal(sig, t, 0))
1322 goto out; 1352 goto out;
1323 1353
1324 ret = 0; 1354 ret = 0;
@@ -1365,7 +1395,6 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1365 struct siginfo info; 1395 struct siginfo info;
1366 unsigned long flags; 1396 unsigned long flags;
1367 struct sighand_struct *psig; 1397 struct sighand_struct *psig;
1368 struct task_cputime cputime;
1369 int ret = sig; 1398 int ret = sig;
1370 1399
1371 BUG_ON(sig == -1); 1400 BUG_ON(sig == -1);
@@ -1395,9 +1424,10 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1395 info.si_uid = __task_cred(tsk)->uid; 1424 info.si_uid = __task_cred(tsk)->uid;
1396 rcu_read_unlock(); 1425 rcu_read_unlock();
1397 1426
1398 thread_group_cputime(tsk, &cputime); 1427 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
1399 info.si_utime = cputime_to_jiffies(cputime.utime); 1428 tsk->signal->utime));
1400 info.si_stime = cputime_to_jiffies(cputime.stime); 1429 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
1430 tsk->signal->stime));
1401 1431
1402 info.si_status = tsk->exit_code & 0x7f; 1432 info.si_status = tsk->exit_code & 0x7f;
1403 if (tsk->exit_code & 0x80) 1433 if (tsk->exit_code & 0x80)
@@ -1573,7 +1603,15 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1573 read_lock(&tasklist_lock); 1603 read_lock(&tasklist_lock);
1574 if (may_ptrace_stop()) { 1604 if (may_ptrace_stop()) {
1575 do_notify_parent_cldstop(current, CLD_TRAPPED); 1605 do_notify_parent_cldstop(current, CLD_TRAPPED);
1606 /*
1607 * Don't want to allow preemption here, because
1608 * sys_ptrace() needs this task to be inactive.
1609 *
1610 * XXX: implement read_unlock_no_resched().
1611 */
1612 preempt_disable();
1576 read_unlock(&tasklist_lock); 1613 read_unlock(&tasklist_lock);
1614 preempt_enable_no_resched();
1577 schedule(); 1615 schedule();
1578 } else { 1616 } else {
1579 /* 1617 /*
@@ -1834,9 +1872,16 @@ relock:
1834 1872
1835 /* 1873 /*
1836 * Global init gets no signals it doesn't want. 1874 * Global init gets no signals it doesn't want.
1875 * Container-init gets no signals it doesn't want from same
1876 * container.
1877 *
1878 * Note that if global/container-init sees a sig_kernel_only()
1879 * signal here, the signal must have been generated internally
1880 * or must have come from an ancestor namespace. In either
1881 * case, the signal cannot be dropped.
1837 */ 1882 */
1838 if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && 1883 if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
1839 !signal_group_exit(signal)) 1884 !sig_kernel_only(signr))
1840 continue; 1885 continue;
1841 1886
1842 if (sig_kernel_stop(signr)) { 1887 if (sig_kernel_stop(signr)) {
@@ -1961,7 +2006,7 @@ EXPORT_SYMBOL(unblock_all_signals);
1961 * System call entry points. 2006 * System call entry points.
1962 */ 2007 */
1963 2008
1964asmlinkage long sys_restart_syscall(void) 2009SYSCALL_DEFINE0(restart_syscall)
1965{ 2010{
1966 struct restart_block *restart = &current_thread_info()->restart_block; 2011 struct restart_block *restart = &current_thread_info()->restart_block;
1967 return restart->fn(restart); 2012 return restart->fn(restart);
@@ -2014,8 +2059,8 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2014 return error; 2059 return error;
2015} 2060}
2016 2061
2017asmlinkage long 2062SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set,
2018sys_rt_sigprocmask(int how, sigset_t __user *set, sigset_t __user *oset, size_t sigsetsize) 2063 sigset_t __user *, oset, size_t, sigsetsize)
2019{ 2064{
2020 int error = -EINVAL; 2065 int error = -EINVAL;
2021 sigset_t old_set, new_set; 2066 sigset_t old_set, new_set;
@@ -2074,8 +2119,7 @@ out:
2074 return error; 2119 return error;
2075} 2120}
2076 2121
2077asmlinkage long 2122SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize)
2078sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize)
2079{ 2123{
2080 return do_sigpending(set, sigsetsize); 2124 return do_sigpending(set, sigsetsize);
2081} 2125}
@@ -2146,11 +2190,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2146 2190
2147#endif 2191#endif
2148 2192
2149asmlinkage long 2193SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2150sys_rt_sigtimedwait(const sigset_t __user *uthese, 2194 siginfo_t __user *, uinfo, const struct timespec __user *, uts,
2151 siginfo_t __user *uinfo, 2195 size_t, sigsetsize)
2152 const struct timespec __user *uts,
2153 size_t sigsetsize)
2154{ 2196{
2155 int ret, sig; 2197 int ret, sig;
2156 sigset_t these; 2198 sigset_t these;
@@ -2223,8 +2265,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2223 return ret; 2265 return ret;
2224} 2266}
2225 2267
2226asmlinkage long 2268SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2227sys_kill(pid_t pid, int sig)
2228{ 2269{
2229 struct siginfo info; 2270 struct siginfo info;
2230 2271
@@ -2283,7 +2324,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2283 * exists but it's not belonging to the target process anymore. This 2324 * exists but it's not belonging to the target process anymore. This
2284 * method solves the problem of threads exiting and PIDs getting reused. 2325 * method solves the problem of threads exiting and PIDs getting reused.
2285 */ 2326 */
2286asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig) 2327SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
2287{ 2328{
2288 /* This is only valid for single tasks */ 2329 /* This is only valid for single tasks */
2289 if (pid <= 0 || tgid <= 0) 2330 if (pid <= 0 || tgid <= 0)
@@ -2295,8 +2336,7 @@ asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig)
2295/* 2336/*
2296 * Send a signal to only one task, even if it's a CLONE_THREAD task. 2337 * Send a signal to only one task, even if it's a CLONE_THREAD task.
2297 */ 2338 */
2298asmlinkage long 2339SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
2299sys_tkill(pid_t pid, int sig)
2300{ 2340{
2301 /* This is only valid for single tasks */ 2341 /* This is only valid for single tasks */
2302 if (pid <= 0) 2342 if (pid <= 0)
@@ -2305,8 +2345,8 @@ sys_tkill(pid_t pid, int sig)
2305 return do_tkill(0, pid, sig); 2345 return do_tkill(0, pid, sig);
2306} 2346}
2307 2347
2308asmlinkage long 2348SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2309sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo) 2349 siginfo_t __user *, uinfo)
2310{ 2350{
2311 siginfo_t info; 2351 siginfo_t info;
2312 2352
@@ -2434,8 +2474,7 @@ out:
2434 2474
2435#ifdef __ARCH_WANT_SYS_SIGPENDING 2475#ifdef __ARCH_WANT_SYS_SIGPENDING
2436 2476
2437asmlinkage long 2477SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
2438sys_sigpending(old_sigset_t __user *set)
2439{ 2478{
2440 return do_sigpending(set, sizeof(*set)); 2479 return do_sigpending(set, sizeof(*set));
2441} 2480}
@@ -2446,8 +2485,8 @@ sys_sigpending(old_sigset_t __user *set)
2446/* Some platforms have their own version with special arguments others 2485/* Some platforms have their own version with special arguments others
2447 support only sys_rt_sigprocmask. */ 2486 support only sys_rt_sigprocmask. */
2448 2487
2449asmlinkage long 2488SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set,
2450sys_sigprocmask(int how, old_sigset_t __user *set, old_sigset_t __user *oset) 2489 old_sigset_t __user *, oset)
2451{ 2490{
2452 int error; 2491 int error;
2453 old_sigset_t old_set, new_set; 2492 old_sigset_t old_set, new_set;
@@ -2497,11 +2536,10 @@ out:
2497#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ 2536#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
2498 2537
2499#ifdef __ARCH_WANT_SYS_RT_SIGACTION 2538#ifdef __ARCH_WANT_SYS_RT_SIGACTION
2500asmlinkage long 2539SYSCALL_DEFINE4(rt_sigaction, int, sig,
2501sys_rt_sigaction(int sig, 2540 const struct sigaction __user *, act,
2502 const struct sigaction __user *act, 2541 struct sigaction __user *, oact,
2503 struct sigaction __user *oact, 2542 size_t, sigsetsize)
2504 size_t sigsetsize)
2505{ 2543{
2506 struct k_sigaction new_sa, old_sa; 2544 struct k_sigaction new_sa, old_sa;
2507 int ret = -EINVAL; 2545 int ret = -EINVAL;
@@ -2531,15 +2569,13 @@ out:
2531/* 2569/*
2532 * For backwards compatibility. Functionality superseded by sigprocmask. 2570 * For backwards compatibility. Functionality superseded by sigprocmask.
2533 */ 2571 */
2534asmlinkage long 2572SYSCALL_DEFINE0(sgetmask)
2535sys_sgetmask(void)
2536{ 2573{
2537 /* SMP safe */ 2574 /* SMP safe */
2538 return current->blocked.sig[0]; 2575 return current->blocked.sig[0];
2539} 2576}
2540 2577
2541asmlinkage long 2578SYSCALL_DEFINE1(ssetmask, int, newmask)
2542sys_ssetmask(int newmask)
2543{ 2579{
2544 int old; 2580 int old;
2545 2581
@@ -2559,8 +2595,7 @@ sys_ssetmask(int newmask)
2559/* 2595/*
2560 * For backwards compatibility. Functionality superseded by sigaction. 2596 * For backwards compatibility. Functionality superseded by sigaction.
2561 */ 2597 */
2562asmlinkage unsigned long 2598SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
2563sys_signal(int sig, __sighandler_t handler)
2564{ 2599{
2565 struct k_sigaction new_sa, old_sa; 2600 struct k_sigaction new_sa, old_sa;
2566 int ret; 2601 int ret;
@@ -2577,8 +2612,7 @@ sys_signal(int sig, __sighandler_t handler)
2577 2612
2578#ifdef __ARCH_WANT_SYS_PAUSE 2613#ifdef __ARCH_WANT_SYS_PAUSE
2579 2614
2580asmlinkage long 2615SYSCALL_DEFINE0(pause)
2581sys_pause(void)
2582{ 2616{
2583 current->state = TASK_INTERRUPTIBLE; 2617 current->state = TASK_INTERRUPTIBLE;
2584 schedule(); 2618 schedule();
@@ -2588,7 +2622,7 @@ sys_pause(void)
2588#endif 2622#endif
2589 2623
2590#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND 2624#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
2591asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize) 2625SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
2592{ 2626{
2593 sigset_t newset; 2627 sigset_t newset;
2594 2628
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
new file mode 100644
index 000000000000..cf2bc01186ef
--- /dev/null
+++ b/kernel/slow-work.c
@@ -0,0 +1,640 @@
1/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 *
11 * See Documentation/slow-work.txt
12 */
13
14#include <linux/module.h>
15#include <linux/slow-work.h>
16#include <linux/kthread.h>
17#include <linux/freezer.h>
18#include <linux/wait.h>
19
/* Surplus threads are culled this long after the pool runs out of things to
 * do. */
#define SLOW_WORK_CULL_TIMEOUT (5 * HZ)

/* New threads can't be started for this long after an OOM. */
#define SLOW_WORK_OOM_TIMEOUT (5 * HZ)

/* Timer expiry handlers; the timers themselves are defined below. */
static void slow_work_cull_timeout(unsigned long);
static void slow_work_oom_timeout(unsigned long);

#ifdef CONFIG_SYSCTL
/* Handlers behind the "min-threads" and "max-threads" sysctl entries. */
static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *,
					void __user *, size_t *, loff_t *);
static int slow_work_max_threads_sysctl(struct ctl_table *, int, struct file *,
					void __user *, size_t *, loff_t *);
#endif

/*
 * Pool sizing.
 *
 * Whilst the facility has at least one user, the pool holds no fewer than
 * slow_work_min_threads threads and may grow to slow_work_max_threads.
 *
 * vslow_work_proportion is the percentage of the threads that may be busy
 * processing very slow operations at any one time.
 */
static unsigned int slow_work_min_threads = 2;
static unsigned int slow_work_max_threads = 4;
static unsigned int vslow_work_proportion = 50;
46
#ifdef CONFIG_SYSCTL
/* Fixed range limits referenced from the table's extra1/extra2 fields. */
static const int slow_work_min_min_threads = 2;
static int slow_work_max_max_threads = 255;
static const int slow_work_min_vslow = 1;
static const int slow_work_max_vslow = 99;

/*
 * Sysctl table for the slow work facility.  min-threads and max-threads bound
 * each other: each entry uses the other's current value as one of its limits.
 */
ctl_table slow_work_sysctls[] = {
	{
		/* lower bound fixed, upper bound is the current maximum */
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "min-threads",
		.mode		= 0644,
		.data		= &slow_work_min_threads,
		.maxlen		= sizeof(slow_work_min_threads),
		.proc_handler	= slow_work_min_threads_sysctl,
		.extra1		= (void *) &slow_work_min_min_threads,
		.extra2		= &slow_work_max_threads,
	},
	{
		/* lower bound is the current minimum, upper bound fixed */
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "max-threads",
		.mode		= 0644,
		.data		= &slow_work_max_threads,
		.maxlen		= sizeof(slow_work_max_threads),
		.proc_handler	= slow_work_max_threads_sysctl,
		.extra1		= &slow_work_min_threads,
		.extra2		= (void *) &slow_work_max_max_threads,
	},
	{
		/* plain range-checked integer; no custom handler needed */
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "vslow-percentage",
		.mode		= 0644,
		.data		= &vslow_work_proportion,
		.maxlen		= sizeof(vslow_work_proportion),
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= (void *) &slow_work_min_vslow,
		.extra2		= (void *) &slow_work_max_vslow,
	},
	{ .ctl_name = 0 }	/* terminator */
};
#endif
87
88/*
89 * The active state of the thread pool
90 */
91static atomic_t slow_work_thread_count;
92static atomic_t vslow_work_executing_count;
93
94static bool slow_work_may_not_start_new_thread;
95static bool slow_work_cull; /* cull a thread due to lack of activity */
96static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
97static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
98static struct slow_work slow_work_new_thread; /* new thread starter */
99
100/*
101 * The queues of work items and the lock governing access to them. These are
102 * shared between all the CPUs. It doesn't make sense to have per-CPU queues
103 * as the number of threads bears no relation to the number of CPUs.
104 *
105 * There are two queues of work items: one for slow work items, and one for
106 * very slow work items.
107 */
108static LIST_HEAD(slow_work_queue);
109static LIST_HEAD(vslow_work_queue);
110static DEFINE_SPINLOCK(slow_work_queue_lock);
111
112/*
113 * The thread controls. A variable used to signal to the threads that they
114 * should exit when the queue is empty, a waitqueue used by the threads to wait
115 * for signals, and a completion set by the last thread to exit.
116 */
117static bool slow_work_threads_should_exit;
118static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
119static DECLARE_COMPLETION(slow_work_last_thread_exited);
120
121/*
122 * The number of users of the thread pool and its lock. Whilst this is zero we
123 * have no threads hanging around, and when this reaches zero, we wait for all
124 * active or queued work items to complete and kill all the threads we do have.
125 */
126static int slow_work_user_count;
127static DEFINE_MUTEX(slow_work_user_lock);
128
129/*
130 * Calculate the maximum number of active threads in the pool that are
131 * permitted to process very slow work items.
132 *
133 * The answer is rounded up to at least 1, but may not equal or exceed the
134 * maximum number of the threads in the pool. This means we always have at
135 * least one thread that can process slow work items, and we always have at
136 * least one thread that won't get tied up doing so.
137 */
138static unsigned slow_work_calc_vsmax(void)
139{
140 unsigned vsmax;
141
142 vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
143 vsmax /= 100;
144 vsmax = max(vsmax, 1U);
145 return min(vsmax, slow_work_max_threads - 1);
146}
147
148/*
149 * Attempt to execute stuff queued on a slow thread. Return true if we managed
150 * it, false if there was nothing to do.
151 */
152static bool slow_work_execute(void)
153{
154 struct slow_work *work = NULL;
155 unsigned vsmax;
156 bool very_slow;
157
158 vsmax = slow_work_calc_vsmax();
159
160 /* see if we can schedule a new thread to be started if we're not
161 * keeping up with the work */
162 if (!waitqueue_active(&slow_work_thread_wq) &&
163 (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
164 atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
165 !slow_work_may_not_start_new_thread)
166 slow_work_enqueue(&slow_work_new_thread);
167
168 /* find something to execute */
169 spin_lock_irq(&slow_work_queue_lock);
170 if (!list_empty(&vslow_work_queue) &&
171 atomic_read(&vslow_work_executing_count) < vsmax) {
172 work = list_entry(vslow_work_queue.next,
173 struct slow_work, link);
174 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
175 BUG();
176 list_del_init(&work->link);
177 atomic_inc(&vslow_work_executing_count);
178 very_slow = true;
179 } else if (!list_empty(&slow_work_queue)) {
180 work = list_entry(slow_work_queue.next,
181 struct slow_work, link);
182 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
183 BUG();
184 list_del_init(&work->link);
185 very_slow = false;
186 } else {
187 very_slow = false; /* avoid the compiler warning */
188 }
189 spin_unlock_irq(&slow_work_queue_lock);
190
191 if (!work)
192 return false;
193
194 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
195 BUG();
196
197 work->ops->execute(work);
198
199 if (very_slow)
200 atomic_dec(&vslow_work_executing_count);
201 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
202
203 /* if someone tried to enqueue the item whilst we were executing it,
204 * then it'll be left unenqueued to avoid multiple threads trying to
205 * execute it simultaneously
206 *
207 * there is, however, a race between us testing the pending flag and
208 * getting the spinlock, and between the enqueuer setting the pending
209 * flag and getting the spinlock, so we use a deferral bit to tell us
210 * if the enqueuer got there first
211 */
212 if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
213 spin_lock_irq(&slow_work_queue_lock);
214
215 if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
216 test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
217 goto auto_requeue;
218
219 spin_unlock_irq(&slow_work_queue_lock);
220 }
221
222 work->ops->put_ref(work);
223 return true;
224
225auto_requeue:
226 /* we must complete the enqueue operation
227 * - we transfer our ref on the item back to the appropriate queue
228 * - don't wake another thread up as we're awake already
229 */
230 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
231 list_add_tail(&work->link, &vslow_work_queue);
232 else
233 list_add_tail(&work->link, &slow_work_queue);
234 spin_unlock_irq(&slow_work_queue_lock);
235 return true;
236}
237
238/**
239 * slow_work_enqueue - Schedule a slow work item for processing
240 * @work: The work item to queue
241 *
242 * Schedule a slow work item for processing. If the item is already undergoing
243 * execution, this guarantees not to re-enter the execution routine until the
244 * first execution finishes.
245 *
246 * The item is pinned by this function as it retains a reference to it, managed
247 * through the item operations. The item is unpinned once it has been
248 * executed.
249 *
250 * An item may hog the thread that is running it for a relatively large amount
251 * of time, sufficient, for example, to perform several lookup, mkdir, create
252 * and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
253 *
254 * Conversely, if a number of items are awaiting processing, it may take some
255 * time before any given item is given attention. The number of threads in the
256 * pool may be increased to deal with demand, but only up to a limit.
257 *
258 * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
259 * the very slow queue, from which only a portion of the threads will be
260 * allowed to pick items to execute. This ensures that very slow items won't
261 * overly block ones that are just ordinarily slow.
262 *
263 * Returns 0 if successful, -EAGAIN if not.
264 */
265int slow_work_enqueue(struct slow_work *work)
266{
267 unsigned long flags;
268
269 BUG_ON(slow_work_user_count <= 0);
270 BUG_ON(!work);
271 BUG_ON(!work->ops);
272 BUG_ON(!work->ops->get_ref);
273
274 /* when honouring an enqueue request, we only promise that we will run
275 * the work function in the future; we do not promise to run it once
276 * per enqueue request
277 *
278 * we use the PENDING bit to merge together repeat requests without
279 * having to disable IRQs and take the spinlock, whilst still
280 * maintaining our promise
281 */
282 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
283 spin_lock_irqsave(&slow_work_queue_lock, flags);
284
285 /* we promise that we will not attempt to execute the work
286 * function in more than one thread simultaneously
287 *
288 * this, however, leaves us with a problem if we're asked to
289 * enqueue the work whilst someone is executing the work
290 * function as simply queueing the work immediately means that
291 * another thread may try executing it whilst it is already
292 * under execution
293 *
294 * to deal with this, we set the ENQ_DEFERRED bit instead of
295 * enqueueing, and the thread currently executing the work
296 * function will enqueue the work item when the work function
297 * returns and it has cleared the EXECUTING bit
298 */
299 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
300 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
301 } else {
302 if (work->ops->get_ref(work) < 0)
303 goto cant_get_ref;
304 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
305 list_add_tail(&work->link, &vslow_work_queue);
306 else
307 list_add_tail(&work->link, &slow_work_queue);
308 wake_up(&slow_work_thread_wq);
309 }
310
311 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
312 }
313 return 0;
314
315cant_get_ref:
316 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
317 return -EAGAIN;
318}
319EXPORT_SYMBOL(slow_work_enqueue);
320
321/*
322 * Worker thread culling algorithm
323 */
324static bool slow_work_cull_thread(void)
325{
326 unsigned long flags;
327 bool do_cull = false;
328
329 spin_lock_irqsave(&slow_work_queue_lock, flags);
330
331 if (slow_work_cull) {
332 slow_work_cull = false;
333
334 if (list_empty(&slow_work_queue) &&
335 list_empty(&vslow_work_queue) &&
336 atomic_read(&slow_work_thread_count) >
337 slow_work_min_threads) {
338 mod_timer(&slow_work_cull_timer,
339 jiffies + SLOW_WORK_CULL_TIMEOUT);
340 do_cull = true;
341 }
342 }
343
344 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
345 return do_cull;
346}
347
348/*
349 * Determine if there is slow work available for dispatch
350 */
351static inline bool slow_work_available(int vsmax)
352{
353 return !list_empty(&slow_work_queue) ||
354 (!list_empty(&vslow_work_queue) &&
355 atomic_read(&vslow_work_executing_count) < vsmax);
356}
357
358/*
359 * Worker thread dispatcher
360 */
361static int slow_work_thread(void *_data)
362{
363 int vsmax;
364
365 DEFINE_WAIT(wait);
366
367 set_freezable();
368 set_user_nice(current, -5);
369
370 for (;;) {
371 vsmax = vslow_work_proportion;
372 vsmax *= atomic_read(&slow_work_thread_count);
373 vsmax /= 100;
374
375 prepare_to_wait(&slow_work_thread_wq, &wait,
376 TASK_INTERRUPTIBLE);
377 if (!freezing(current) &&
378 !slow_work_threads_should_exit &&
379 !slow_work_available(vsmax) &&
380 !slow_work_cull)
381 schedule();
382 finish_wait(&slow_work_thread_wq, &wait);
383
384 try_to_freeze();
385
386 vsmax = vslow_work_proportion;
387 vsmax *= atomic_read(&slow_work_thread_count);
388 vsmax /= 100;
389
390 if (slow_work_available(vsmax) && slow_work_execute()) {
391 cond_resched();
392 if (list_empty(&slow_work_queue) &&
393 list_empty(&vslow_work_queue) &&
394 atomic_read(&slow_work_thread_count) >
395 slow_work_min_threads)
396 mod_timer(&slow_work_cull_timer,
397 jiffies + SLOW_WORK_CULL_TIMEOUT);
398 continue;
399 }
400
401 if (slow_work_threads_should_exit)
402 break;
403
404 if (slow_work_cull && slow_work_cull_thread())
405 break;
406 }
407
408 if (atomic_dec_and_test(&slow_work_thread_count))
409 complete_and_exit(&slow_work_last_thread_exited, 0);
410 return 0;
411}
412
413/*
414 * Handle thread cull timer expiration
415 */
416static void slow_work_cull_timeout(unsigned long data)
417{
418 slow_work_cull = true;
419 wake_up(&slow_work_thread_wq);
420}
421
/*
 * get_ref operation for the thread starter item.  No reference is actually
 * taken; this always reports success.
 */
static int slow_work_new_thread_get_ref(struct slow_work *work)
{
	return 0;
}
429
/*
 * put_ref operation for the thread starter item.  Nothing to release, so this
 * does nothing.
 */
static void slow_work_new_thread_put_ref(struct slow_work *work)
{
}
436
437/*
438 * Start a new slow work thread
439 */
440static void slow_work_new_thread_execute(struct slow_work *work)
441{
442 struct task_struct *p;
443
444 if (slow_work_threads_should_exit)
445 return;
446
447 if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
448 return;
449
450 if (!mutex_trylock(&slow_work_user_lock))
451 return;
452
453 slow_work_may_not_start_new_thread = true;
454 atomic_inc(&slow_work_thread_count);
455 p = kthread_run(slow_work_thread, NULL, "kslowd");
456 if (IS_ERR(p)) {
457 printk(KERN_DEBUG "Slow work thread pool: OOM\n");
458 if (atomic_dec_and_test(&slow_work_thread_count))
459 BUG(); /* we're running on a slow work thread... */
460 mod_timer(&slow_work_oom_timer,
461 jiffies + SLOW_WORK_OOM_TIMEOUT);
462 } else {
463 /* ratelimit the starting of new threads */
464 mod_timer(&slow_work_oom_timer, jiffies + 1);
465 }
466
467 mutex_unlock(&slow_work_user_lock);
468}
469
470static const struct slow_work_ops slow_work_new_thread_ops = {
471 .get_ref = slow_work_new_thread_get_ref,
472 .put_ref = slow_work_new_thread_put_ref,
473 .execute = slow_work_new_thread_execute,
474};
475
476/*
477 * post-OOM new thread start suppression expiration
478 */
479static void slow_work_oom_timeout(unsigned long data)
480{
481 slow_work_may_not_start_new_thread = false;
482}
483
484#ifdef CONFIG_SYSCTL
485/*
486 * Handle adjustment of the minimum number of threads
487 */
488static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
489 struct file *filp, void __user *buffer,
490 size_t *lenp, loff_t *ppos)
491{
492 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
493 int n;
494
495 if (ret == 0) {
496 mutex_lock(&slow_work_user_lock);
497 if (slow_work_user_count > 0) {
498 /* see if we need to start or stop threads */
499 n = atomic_read(&slow_work_thread_count) -
500 slow_work_min_threads;
501
502 if (n < 0 && !slow_work_may_not_start_new_thread)
503 slow_work_enqueue(&slow_work_new_thread);
504 else if (n > 0)
505 mod_timer(&slow_work_cull_timer,
506 jiffies + SLOW_WORK_CULL_TIMEOUT);
507 }
508 mutex_unlock(&slow_work_user_lock);
509 }
510
511 return ret;
512}
513
514/*
515 * Handle adjustment of the maximum number of threads
516 */
517static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
518 struct file *filp, void __user *buffer,
519 size_t *lenp, loff_t *ppos)
520{
521 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
522 int n;
523
524 if (ret == 0) {
525 mutex_lock(&slow_work_user_lock);
526 if (slow_work_user_count > 0) {
527 /* see if we need to stop threads */
528 n = slow_work_max_threads -
529 atomic_read(&slow_work_thread_count);
530
531 if (n < 0)
532 mod_timer(&slow_work_cull_timer,
533 jiffies + SLOW_WORK_CULL_TIMEOUT);
534 }
535 mutex_unlock(&slow_work_user_lock);
536 }
537
538 return ret;
539}
540#endif /* CONFIG_SYSCTL */
541
542/**
543 * slow_work_register_user - Register a user of the facility
544 *
545 * Register a user of the facility, starting up the initial threads if there
546 * aren't any other users at this point. This will return 0 if successful, or
547 * an error if not.
548 */
549int slow_work_register_user(void)
550{
551 struct task_struct *p;
552 int loop;
553
554 mutex_lock(&slow_work_user_lock);
555
556 if (slow_work_user_count == 0) {
557 printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
558 init_completion(&slow_work_last_thread_exited);
559
560 slow_work_threads_should_exit = false;
561 slow_work_init(&slow_work_new_thread,
562 &slow_work_new_thread_ops);
563 slow_work_may_not_start_new_thread = false;
564 slow_work_cull = false;
565
566 /* start the minimum number of threads */
567 for (loop = 0; loop < slow_work_min_threads; loop++) {
568 atomic_inc(&slow_work_thread_count);
569 p = kthread_run(slow_work_thread, NULL, "kslowd");
570 if (IS_ERR(p))
571 goto error;
572 }
573 printk(KERN_NOTICE "Slow work thread pool: Ready\n");
574 }
575
576 slow_work_user_count++;
577 mutex_unlock(&slow_work_user_lock);
578 return 0;
579
580error:
581 if (atomic_dec_and_test(&slow_work_thread_count))
582 complete(&slow_work_last_thread_exited);
583 if (loop > 0) {
584 printk(KERN_ERR "Slow work thread pool:"
585 " Aborting startup on ENOMEM\n");
586 slow_work_threads_should_exit = true;
587 wake_up_all(&slow_work_thread_wq);
588 wait_for_completion(&slow_work_last_thread_exited);
589 printk(KERN_ERR "Slow work thread pool: Aborted\n");
590 }
591 mutex_unlock(&slow_work_user_lock);
592 return PTR_ERR(p);
593}
594EXPORT_SYMBOL(slow_work_register_user);
595
596/**
597 * slow_work_unregister_user - Unregister a user of the facility
598 *
599 * Unregister a user of the facility, killing all the threads if this was the
600 * last one.
601 */
602void slow_work_unregister_user(void)
603{
604 mutex_lock(&slow_work_user_lock);
605
606 BUG_ON(slow_work_user_count <= 0);
607
608 slow_work_user_count--;
609 if (slow_work_user_count == 0) {
610 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
611 slow_work_threads_should_exit = true;
612 wake_up_all(&slow_work_thread_wq);
613 wait_for_completion(&slow_work_last_thread_exited);
614 printk(KERN_NOTICE "Slow work thread pool:"
615 " Shut down complete\n");
616 }
617
618 del_timer_sync(&slow_work_cull_timer);
619
620 mutex_unlock(&slow_work_user_lock);
621}
622EXPORT_SYMBOL(slow_work_unregister_user);
623
624/*
625 * Initialise the slow work facility
626 */
627static int __init init_slow_work(void)
628{
629 unsigned nr_cpus = num_possible_cpus();
630
631 if (slow_work_max_threads < nr_cpus)
632 slow_work_max_threads = nr_cpus;
633#ifdef CONFIG_SYSCTL
634 if (slow_work_max_max_threads < nr_cpus * 2)
635 slow_work_max_max_threads = nr_cpus * 2;
636#endif
637 return 0;
638}
639
640subsys_initcall(init_slow_work);
diff --git a/kernel/smp.c b/kernel/smp.c
index 5cfa0e5e3e88..858baac568ee 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -2,39 +2,82 @@
2 * Generic helpers for smp ipi calls 2 * Generic helpers for smp ipi calls
3 * 3 *
4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008 4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008
5 *
6 */ 5 */
7#include <linux/init.h>
8#include <linux/module.h>
9#include <linux/percpu.h>
10#include <linux/rcupdate.h> 6#include <linux/rcupdate.h>
11#include <linux/rculist.h> 7#include <linux/rculist.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/percpu.h>
11#include <linux/init.h>
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/cpu.h>
13 14
14static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); 15static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
15static LIST_HEAD(call_function_queue); 16
16__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock); 17static struct {
18 struct list_head queue;
19 spinlock_t lock;
20} call_function __cacheline_aligned_in_smp =
21 {
22 .queue = LIST_HEAD_INIT(call_function.queue),
23 .lock = __SPIN_LOCK_UNLOCKED(call_function.lock),
24 };
17 25
18enum { 26enum {
19 CSD_FLAG_WAIT = 0x01, 27 CSD_FLAG_LOCK = 0x01,
20 CSD_FLAG_ALLOC = 0x02,
21}; 28};
22 29
23struct call_function_data { 30struct call_function_data {
24 struct call_single_data csd; 31 struct call_single_data csd;
25 spinlock_t lock; 32 spinlock_t lock;
26 unsigned int refs; 33 unsigned int refs;
27 struct rcu_head rcu_head; 34 cpumask_var_t cpumask;
28 unsigned long cpumask_bits[];
29}; 35};
30 36
31struct call_single_queue { 37struct call_single_queue {
32 struct list_head list; 38 struct list_head list;
33 spinlock_t lock; 39 spinlock_t lock;
40};
41
42static DEFINE_PER_CPU(struct call_function_data, cfd_data) = {
43 .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
44};
45
46static int
47hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
48{
49 long cpu = (long)hcpu;
50 struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
51
52 switch (action) {
53 case CPU_UP_PREPARE:
54 case CPU_UP_PREPARE_FROZEN:
55 if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
56 cpu_to_node(cpu)))
57 return NOTIFY_BAD;
58 break;
59
60#ifdef CONFIG_CPU_HOTPLUG
61 case CPU_UP_CANCELED:
62 case CPU_UP_CANCELED_FROZEN:
63
64 case CPU_DEAD:
65 case CPU_DEAD_FROZEN:
66 free_cpumask_var(cfd->cpumask);
67 break;
68#endif
69 };
70
71 return NOTIFY_OK;
72}
73
74static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
75 .notifier_call = hotplug_cfd,
34}; 76};
35 77
36static int __cpuinit init_call_single_data(void) 78static int __cpuinit init_call_single_data(void)
37{ 79{
80 void *cpu = (void *)(long)smp_processor_id();
38 int i; 81 int i;
39 82
40 for_each_possible_cpu(i) { 83 for_each_possible_cpu(i) {
@@ -43,29 +86,63 @@ static int __cpuinit init_call_single_data(void)
43 spin_lock_init(&q->lock); 86 spin_lock_init(&q->lock);
44 INIT_LIST_HEAD(&q->list); 87 INIT_LIST_HEAD(&q->list);
45 } 88 }
89
90 hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
91 register_cpu_notifier(&hotplug_cfd_notifier);
92
46 return 0; 93 return 0;
47} 94}
48early_initcall(init_call_single_data); 95early_initcall(init_call_single_data);
49 96
50static void csd_flag_wait(struct call_single_data *data) 97/*
98 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
99 *
100 * For non-synchronous ipi calls the csd can still be in use by the
101 * previous function call. For multi-cpu calls its even more interesting
102 * as we'll have to ensure no other cpu is observing our csd.
103 */
104static void csd_lock_wait(struct call_single_data *data)
51{ 105{
52 /* Wait for response */ 106 while (data->flags & CSD_FLAG_LOCK)
53 do {
54 if (!(data->flags & CSD_FLAG_WAIT))
55 break;
56 cpu_relax(); 107 cpu_relax();
57 } while (1); 108}
109
110static void csd_lock(struct call_single_data *data)
111{
112 csd_lock_wait(data);
113 data->flags = CSD_FLAG_LOCK;
114
115 /*
116 * prevent CPU from reordering the above assignment
117 * to ->flags with any subsequent assignments to other
118 * fields of the specified call_single_data structure:
119 */
120 smp_mb();
121}
122
123static void csd_unlock(struct call_single_data *data)
124{
125 WARN_ON(!(data->flags & CSD_FLAG_LOCK));
126
127 /*
128 * ensure we're all done before releasing data:
129 */
130 smp_mb();
131
132 data->flags &= ~CSD_FLAG_LOCK;
58} 133}
59 134
60/* 135/*
61 * Insert a previously allocated call_single_data element for execution 136 * Insert a previously allocated call_single_data element
62 * on the given CPU. data must already have ->func, ->info, and ->flags set. 137 * for execution on the given CPU. data must already have
138 * ->func, ->info, and ->flags set.
63 */ 139 */
64static void generic_exec_single(int cpu, struct call_single_data *data) 140static
141void generic_exec_single(int cpu, struct call_single_data *data, int wait)
65{ 142{
66 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); 143 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
67 int wait = data->flags & CSD_FLAG_WAIT, ipi;
68 unsigned long flags; 144 unsigned long flags;
145 int ipi;
69 146
70 spin_lock_irqsave(&dst->lock, flags); 147 spin_lock_irqsave(&dst->lock, flags);
71 ipi = list_empty(&dst->list); 148 ipi = list_empty(&dst->list);
@@ -73,24 +150,21 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
73 spin_unlock_irqrestore(&dst->lock, flags); 150 spin_unlock_irqrestore(&dst->lock, flags);
74 151
75 /* 152 /*
76 * Make the list addition visible before sending the ipi. 153 * The list addition should be visible before sending the IPI
154 * handler locks the list to pull the entry off it because of
155 * normal cache coherency rules implied by spinlocks.
156 *
157 * If IPIs can go out of order to the cache coherency protocol
158 * in an architecture, sufficient synchronisation should be added
159 * to arch code to make it appear to obey cache coherency WRT
160 * locking and barrier primitives. Generic code isn't really
161 * equipped to do the right thing...
77 */ 162 */
78 smp_mb();
79
80 if (ipi) 163 if (ipi)
81 arch_send_call_function_single_ipi(cpu); 164 arch_send_call_function_single_ipi(cpu);
82 165
83 if (wait) 166 if (wait)
84 csd_flag_wait(data); 167 csd_lock_wait(data);
85}
86
87static void rcu_free_call_data(struct rcu_head *head)
88{
89 struct call_function_data *data;
90
91 data = container_of(head, struct call_function_data, rcu_head);
92
93 kfree(data);
94} 168}
95 169
96/* 170/*
@@ -103,99 +177,88 @@ void generic_smp_call_function_interrupt(void)
103 int cpu = get_cpu(); 177 int cpu = get_cpu();
104 178
105 /* 179 /*
106 * It's ok to use list_for_each_rcu() here even though we may delete 180 * Ensure entry is visible on call_function_queue after we have
107 * 'pos', since list_del_rcu() doesn't clear ->next 181 * entered the IPI. See comment in smp_call_function_many.
182 * If we don't have this, then we may miss an entry on the list
183 * and never get another IPI to process it.
184 */
185 smp_mb();
186
187 /*
188 * It's ok to use list_for_each_rcu() here even though we may
189 * delete 'pos', since list_del_rcu() doesn't clear ->next
108 */ 190 */
109 rcu_read_lock(); 191 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
110 list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
111 int refs; 192 int refs;
112 193
113 if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits))) 194 spin_lock(&data->lock);
195 if (!cpumask_test_cpu(cpu, data->cpumask)) {
196 spin_unlock(&data->lock);
114 continue; 197 continue;
198 }
199 cpumask_clear_cpu(cpu, data->cpumask);
200 spin_unlock(&data->lock);
115 201
116 data->csd.func(data->csd.info); 202 data->csd.func(data->csd.info);
117 203
118 spin_lock(&data->lock); 204 spin_lock(&data->lock);
119 cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits));
120 WARN_ON(data->refs == 0); 205 WARN_ON(data->refs == 0);
121 data->refs--; 206 refs = --data->refs;
122 refs = data->refs; 207 if (!refs) {
208 spin_lock(&call_function.lock);
209 list_del_rcu(&data->csd.list);
210 spin_unlock(&call_function.lock);
211 }
123 spin_unlock(&data->lock); 212 spin_unlock(&data->lock);
124 213
125 if (refs) 214 if (refs)
126 continue; 215 continue;
127 216
128 spin_lock(&call_function_lock); 217 csd_unlock(&data->csd);
129 list_del_rcu(&data->csd.list);
130 spin_unlock(&call_function_lock);
131
132 if (data->csd.flags & CSD_FLAG_WAIT) {
133 /*
134 * serialize stores to data with the flag clear
135 * and wakeup
136 */
137 smp_wmb();
138 data->csd.flags &= ~CSD_FLAG_WAIT;
139 }
140 if (data->csd.flags & CSD_FLAG_ALLOC)
141 call_rcu(&data->rcu_head, rcu_free_call_data);
142 } 218 }
143 rcu_read_unlock();
144 219
145 put_cpu(); 220 put_cpu();
146} 221}
147 222
148/* 223/*
149 * Invoked by arch to handle an IPI for call function single. Must be called 224 * Invoked by arch to handle an IPI for call function single. Must be
150 * from the arch with interrupts disabled. 225 * called from the arch with interrupts disabled.
151 */ 226 */
152void generic_smp_call_function_single_interrupt(void) 227void generic_smp_call_function_single_interrupt(void)
153{ 228{
154 struct call_single_queue *q = &__get_cpu_var(call_single_queue); 229 struct call_single_queue *q = &__get_cpu_var(call_single_queue);
230 unsigned int data_flags;
155 LIST_HEAD(list); 231 LIST_HEAD(list);
156 232
157 /* 233 spin_lock(&q->lock);
158 * Need to see other stores to list head for checking whether 234 list_replace_init(&q->list, &list);
159 * list is empty without holding q->lock 235 spin_unlock(&q->lock);
160 */ 236
161 smp_read_barrier_depends(); 237 while (!list_empty(&list)) {
162 while (!list_empty(&q->list)) { 238 struct call_single_data *data;
163 unsigned int data_flags; 239
164 240 data = list_entry(list.next, struct call_single_data, list);
165 spin_lock(&q->lock); 241 list_del(&data->list);
166 list_replace_init(&q->list, &list); 242
167 spin_unlock(&q->lock);
168
169 while (!list_empty(&list)) {
170 struct call_single_data *data;
171
172 data = list_entry(list.next, struct call_single_data,
173 list);
174 list_del(&data->list);
175
176 /*
177 * 'data' can be invalid after this call if
178 * flags == 0 (when called through
179 * generic_exec_single(), so save them away before
180 * making the call.
181 */
182 data_flags = data->flags;
183
184 data->func(data->info);
185
186 if (data_flags & CSD_FLAG_WAIT) {
187 smp_wmb();
188 data->flags &= ~CSD_FLAG_WAIT;
189 } else if (data_flags & CSD_FLAG_ALLOC)
190 kfree(data);
191 }
192 /* 243 /*
193 * See comment on outer loop 244 * 'data' can be invalid after this call if flags == 0
245 * (when called through generic_exec_single()),
246 * so save them away before making the call:
194 */ 247 */
195 smp_read_barrier_depends(); 248 data_flags = data->flags;
249
250 data->func(data->info);
251
252 /*
253 * Unlocked CSDs are valid through generic_exec_single():
254 */
255 if (data_flags & CSD_FLAG_LOCK)
256 csd_unlock(data);
196 } 257 }
197} 258}
198 259
260static DEFINE_PER_CPU(struct call_single_data, csd_data);
261
199/* 262/*
200 * smp_call_function_single - Run a function on a specific CPU 263 * smp_call_function_single - Run a function on a specific CPU
201 * @func: The function to run. This must be fast and non-blocking. 264 * @func: The function to run. This must be fast and non-blocking.
@@ -209,41 +272,45 @@ void generic_smp_call_function_single_interrupt(void)
209int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 272int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
210 int wait) 273 int wait)
211{ 274{
212 struct call_single_data d; 275 struct call_single_data d = {
276 .flags = 0,
277 };
213 unsigned long flags; 278 unsigned long flags;
214 /* prevent preemption and reschedule on another processor, 279 int this_cpu;
215 as well as CPU removal */
216 int me = get_cpu();
217 int err = 0; 280 int err = 0;
218 281
282 /*
283 * prevent preemption and reschedule on another processor,
284 * as well as CPU removal
285 */
286 this_cpu = get_cpu();
287
219 /* Can deadlock when called with interrupts disabled */ 288 /* Can deadlock when called with interrupts disabled */
220 WARN_ON(irqs_disabled()); 289 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
221 290
222 if (cpu == me) { 291 if (cpu == this_cpu) {
223 local_irq_save(flags); 292 local_irq_save(flags);
224 func(info); 293 func(info);
225 local_irq_restore(flags); 294 local_irq_restore(flags);
226 } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { 295 } else {
227 struct call_single_data *data = NULL; 296 if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
297 struct call_single_data *data = &d;
228 298
229 if (!wait) { 299 if (!wait)
230 data = kmalloc(sizeof(*data), GFP_ATOMIC); 300 data = &__get_cpu_var(csd_data);
231 if (data)
232 data->flags = CSD_FLAG_ALLOC;
233 }
234 if (!data) {
235 data = &d;
236 data->flags = CSD_FLAG_WAIT;
237 }
238 301
239 data->func = func; 302 csd_lock(data);
240 data->info = info; 303
241 generic_exec_single(cpu, data); 304 data->func = func;
242 } else { 305 data->info = info;
243 err = -ENXIO; /* CPU not online */ 306 generic_exec_single(cpu, data, wait);
307 } else {
308 err = -ENXIO; /* CPU not online */
309 }
244 } 310 }
245 311
246 put_cpu(); 312 put_cpu();
313
247 return err; 314 return err;
248} 315}
249EXPORT_SYMBOL(smp_call_function_single); 316EXPORT_SYMBOL(smp_call_function_single);
@@ -253,23 +320,26 @@ EXPORT_SYMBOL(smp_call_function_single);
253 * @cpu: The CPU to run on. 320 * @cpu: The CPU to run on.
254 * @data: Pre-allocated and setup data structure 321 * @data: Pre-allocated and setup data structure
255 * 322 *
256 * Like smp_call_function_single(), but allow caller to pass in a pre-allocated 323 * Like smp_call_function_single(), but allow caller to pass in a
257 * data structure. Useful for embedding @data inside other structures, for 324 * pre-allocated data structure. Useful for embedding @data inside
258 * instance. 325 * other structures, for instance.
259 *
260 */ 326 */
261void __smp_call_function_single(int cpu, struct call_single_data *data) 327void __smp_call_function_single(int cpu, struct call_single_data *data,
328 int wait)
262{ 329{
330 csd_lock(data);
331
263 /* Can deadlock when called with interrupts disabled */ 332 /* Can deadlock when called with interrupts disabled */
264 WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled()); 333 WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress);
265 334
266 generic_exec_single(cpu, data); 335 generic_exec_single(cpu, data, wait);
267} 336}
268 337
269/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */ 338/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
339
270#ifndef arch_send_call_function_ipi_mask 340#ifndef arch_send_call_function_ipi_mask
271#define arch_send_call_function_ipi_mask(maskp) \ 341# define arch_send_call_function_ipi_mask(maskp) \
272 arch_send_call_function_ipi(*(maskp)) 342 arch_send_call_function_ipi(*(maskp))
273#endif 343#endif
274 344
275/** 345/**
@@ -277,7 +347,8 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
277 * @mask: The set of cpus to run on (only runs on online subset). 347 * @mask: The set of cpus to run on (only runs on online subset).
278 * @func: The function to run. This must be fast and non-blocking. 348 * @func: The function to run. This must be fast and non-blocking.
279 * @info: An arbitrary pointer to pass to the function. 349 * @info: An arbitrary pointer to pass to the function.
280 * @wait: If true, wait (atomically) until function has completed on other CPUs. 350 * @wait: If true, wait (atomically) until function has completed
351 * on other CPUs.
281 * 352 *
282 * If @wait is true, then returns once @func has returned. Note that @wait 353 * If @wait is true, then returns once @func has returned. Note that @wait
283 * will be implicitly turned on in case of allocation failures, since 354 * will be implicitly turned on in case of allocation failures, since
@@ -288,27 +359,27 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
288 * must be disabled when calling this function. 359 * must be disabled when calling this function.
289 */ 360 */
290void smp_call_function_many(const struct cpumask *mask, 361void smp_call_function_many(const struct cpumask *mask,
291 void (*func)(void *), void *info, 362 void (*func)(void *), void *info, bool wait)
292 bool wait)
293{ 363{
294 struct call_function_data *data; 364 struct call_function_data *data;
295 unsigned long flags; 365 unsigned long flags;
296 int cpu, next_cpu; 366 int cpu, next_cpu, this_cpu = smp_processor_id();
297 367
298 /* Can deadlock when called with interrupts disabled */ 368 /* Can deadlock when called with interrupts disabled */
299 WARN_ON(irqs_disabled()); 369 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
300 370
301 /* So, what's a CPU they want? Ignoring this one. */ 371 /* So, what's a CPU they want? Ignoring this one. */
302 cpu = cpumask_first_and(mask, cpu_online_mask); 372 cpu = cpumask_first_and(mask, cpu_online_mask);
303 if (cpu == smp_processor_id()) 373 if (cpu == this_cpu)
304 cpu = cpumask_next_and(cpu, mask, cpu_online_mask); 374 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
375
305 /* No online cpus? We're done. */ 376 /* No online cpus? We're done. */
306 if (cpu >= nr_cpu_ids) 377 if (cpu >= nr_cpu_ids)
307 return; 378 return;
308 379
309 /* Do we have another CPU which isn't us? */ 380 /* Do we have another CPU which isn't us? */
310 next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); 381 next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
311 if (next_cpu == smp_processor_id()) 382 if (next_cpu == this_cpu)
312 next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask); 383 next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
313 384
314 /* Fastpath: do that cpu by itself. */ 385 /* Fastpath: do that cpu by itself. */
@@ -317,43 +388,40 @@ void smp_call_function_many(const struct cpumask *mask,
317 return; 388 return;
318 } 389 }
319 390
320 data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC); 391 data = &__get_cpu_var(cfd_data);
321 if (unlikely(!data)) { 392 csd_lock(&data->csd);
322 /* Slow path. */
323 for_each_online_cpu(cpu) {
324 if (cpu == smp_processor_id())
325 continue;
326 if (cpumask_test_cpu(cpu, mask))
327 smp_call_function_single(cpu, func, info, wait);
328 }
329 return;
330 }
331 393
332 spin_lock_init(&data->lock); 394 spin_lock_irqsave(&data->lock, flags);
333 data->csd.flags = CSD_FLAG_ALLOC;
334 if (wait)
335 data->csd.flags |= CSD_FLAG_WAIT;
336 data->csd.func = func; 395 data->csd.func = func;
337 data->csd.info = info; 396 data->csd.info = info;
338 cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask); 397 cpumask_and(data->cpumask, mask, cpu_online_mask);
339 cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits)); 398 cpumask_clear_cpu(this_cpu, data->cpumask);
340 data->refs = cpumask_weight(to_cpumask(data->cpumask_bits)); 399 data->refs = cpumask_weight(data->cpumask);
341 400
342 spin_lock_irqsave(&call_function_lock, flags); 401 spin_lock(&call_function.lock);
343 list_add_tail_rcu(&data->csd.list, &call_function_queue); 402 /*
344 spin_unlock_irqrestore(&call_function_lock, flags); 403 * Place entry at the _HEAD_ of the list, so that any cpu still
404 * observing the entry in generic_smp_call_function_interrupt()
405 * will not miss any other list entries:
406 */
407 list_add_rcu(&data->csd.list, &call_function.queue);
408 spin_unlock(&call_function.lock);
409
410 spin_unlock_irqrestore(&data->lock, flags);
345 411
346 /* 412 /*
347 * Make the list addition visible before sending the ipi. 413 * Make the list addition visible before sending the ipi.
414 * (IPIs must obey or appear to obey normal Linux cache
415 * coherency rules -- see comment in generic_exec_single).
348 */ 416 */
349 smp_mb(); 417 smp_mb();
350 418
351 /* Send a message to all CPUs in the map */ 419 /* Send a message to all CPUs in the map */
352 arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits)); 420 arch_send_call_function_ipi_mask(data->cpumask);
353 421
354 /* optionally wait for the CPUs to complete */ 422 /* Optionally wait for the CPUs to complete */
355 if (wait) 423 if (wait)
356 csd_flag_wait(&data->csd); 424 csd_lock_wait(&data->csd);
357} 425}
358EXPORT_SYMBOL(smp_call_function_many); 426EXPORT_SYMBOL(smp_call_function_many);
359 427
@@ -361,7 +429,8 @@ EXPORT_SYMBOL(smp_call_function_many);
361 * smp_call_function(): Run a function on all other CPUs. 429 * smp_call_function(): Run a function on all other CPUs.
362 * @func: The function to run. This must be fast and non-blocking. 430 * @func: The function to run. This must be fast and non-blocking.
363 * @info: An arbitrary pointer to pass to the function. 431 * @info: An arbitrary pointer to pass to the function.
364 * @wait: If true, wait (atomically) until function has completed on other CPUs. 432 * @wait: If true, wait (atomically) until function has completed
433 * on other CPUs.
365 * 434 *
366 * Returns 0. 435 * Returns 0.
367 * 436 *
@@ -377,26 +446,27 @@ int smp_call_function(void (*func)(void *), void *info, int wait)
377 preempt_disable(); 446 preempt_disable();
378 smp_call_function_many(cpu_online_mask, func, info, wait); 447 smp_call_function_many(cpu_online_mask, func, info, wait);
379 preempt_enable(); 448 preempt_enable();
449
380 return 0; 450 return 0;
381} 451}
382EXPORT_SYMBOL(smp_call_function); 452EXPORT_SYMBOL(smp_call_function);
383 453
384void ipi_call_lock(void) 454void ipi_call_lock(void)
385{ 455{
386 spin_lock(&call_function_lock); 456 spin_lock(&call_function.lock);
387} 457}
388 458
389void ipi_call_unlock(void) 459void ipi_call_unlock(void)
390{ 460{
391 spin_unlock(&call_function_lock); 461 spin_unlock(&call_function.lock);
392} 462}
393 463
394void ipi_call_lock_irq(void) 464void ipi_call_lock_irq(void)
395{ 465{
396 spin_lock_irq(&call_function_lock); 466 spin_lock_irq(&call_function.lock);
397} 467}
398 468
399void ipi_call_unlock_irq(void) 469void ipi_call_unlock_irq(void)
400{ 470{
401 spin_unlock_irq(&call_function_lock); 471 spin_unlock_irq(&call_function.lock);
402} 472}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de9cd8d..2fecefacdc5b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -21,8 +21,10 @@
21#include <linux/freezer.h> 21#include <linux/freezer.h>
22#include <linux/kthread.h> 22#include <linux/kthread.h>
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/ftrace.h>
24#include <linux/smp.h> 25#include <linux/smp.h>
25#include <linux/tick.h> 26#include <linux/tick.h>
27#include <trace/irq.h>
26 28
27#include <asm/irq.h> 29#include <asm/irq.h>
28/* 30/*
@@ -52,13 +54,18 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
52 54
53static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 55static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
54 56
57char *softirq_to_name[NR_SOFTIRQS] = {
58 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK",
59 "TASKLET", "SCHED", "HRTIMER", "RCU"
60};
61
55/* 62/*
56 * we cannot loop indefinitely here to avoid userspace starvation, 63 * we cannot loop indefinitely here to avoid userspace starvation,
57 * but we also don't want to introduce a worst case 1/HZ latency 64 * but we also don't want to introduce a worst case 1/HZ latency
58 * to the pending events, so lets the scheduler to balance 65 * to the pending events, so lets the scheduler to balance
59 * the softirq load for us. 66 * the softirq load for us.
60 */ 67 */
61static inline void wakeup_softirqd(void) 68void wakeup_softirqd(void)
62{ 69{
63 /* Interrupts are disabled: no need to stop preemption */ 70 /* Interrupts are disabled: no need to stop preemption */
64 struct task_struct *tsk = __get_cpu_var(ksoftirqd); 71 struct task_struct *tsk = __get_cpu_var(ksoftirqd);
@@ -79,13 +86,23 @@ static void __local_bh_disable(unsigned long ip)
79 WARN_ON_ONCE(in_irq()); 86 WARN_ON_ONCE(in_irq());
80 87
81 raw_local_irq_save(flags); 88 raw_local_irq_save(flags);
82 add_preempt_count(SOFTIRQ_OFFSET); 89 /*
90 * The preempt tracer hooks into add_preempt_count and will break
91 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
92 * is set and before current->softirq_enabled is cleared.
93 * We must manually increment preempt_count here and manually
94 * call the trace_preempt_off later.
95 */
96 preempt_count() += SOFTIRQ_OFFSET;
83 /* 97 /*
84 * Were softirqs turned off above: 98 * Were softirqs turned off above:
85 */ 99 */
86 if (softirq_count() == SOFTIRQ_OFFSET) 100 if (softirq_count() == SOFTIRQ_OFFSET)
87 trace_softirqs_off(ip); 101 trace_softirqs_off(ip);
88 raw_local_irq_restore(flags); 102 raw_local_irq_restore(flags);
103
104 if (preempt_count() == SOFTIRQ_OFFSET)
105 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
89} 106}
90#else /* !CONFIG_TRACE_IRQFLAGS */ 107#else /* !CONFIG_TRACE_IRQFLAGS */
91static inline void __local_bh_disable(unsigned long ip) 108static inline void __local_bh_disable(unsigned long ip)
@@ -169,6 +186,9 @@ EXPORT_SYMBOL(local_bh_enable_ip);
169 */ 186 */
170#define MAX_SOFTIRQ_RESTART 10 187#define MAX_SOFTIRQ_RESTART 10
171 188
189DEFINE_TRACE(softirq_entry);
190DEFINE_TRACE(softirq_exit);
191
172asmlinkage void __do_softirq(void) 192asmlinkage void __do_softirq(void)
173{ 193{
174 struct softirq_action *h; 194 struct softirq_action *h;
@@ -180,7 +200,7 @@ asmlinkage void __do_softirq(void)
180 account_system_vtime(current); 200 account_system_vtime(current);
181 201
182 __local_bh_disable((unsigned long)__builtin_return_address(0)); 202 __local_bh_disable((unsigned long)__builtin_return_address(0));
183 trace_softirq_enter(); 203 lockdep_softirq_enter();
184 204
185 cpu = smp_processor_id(); 205 cpu = smp_processor_id();
186restart: 206restart:
@@ -195,12 +215,14 @@ restart:
195 if (pending & 1) { 215 if (pending & 1) {
196 int prev_count = preempt_count(); 216 int prev_count = preempt_count();
197 217
218 trace_softirq_entry(h, softirq_vec);
198 h->action(h); 219 h->action(h);
199 220 trace_softirq_exit(h, softirq_vec);
200 if (unlikely(prev_count != preempt_count())) { 221 if (unlikely(prev_count != preempt_count())) {
201 printk(KERN_ERR "huh, entered softirq %td %p" 222 printk(KERN_ERR "huh, entered softirq %td %s %p"
202 "with preempt_count %08x," 223 "with preempt_count %08x,"
203 " exited with %08x?\n", h - softirq_vec, 224 " exited with %08x?\n", h - softirq_vec,
225 softirq_to_name[h - softirq_vec],
204 h->action, prev_count, preempt_count()); 226 h->action, prev_count, preempt_count());
205 preempt_count() = prev_count; 227 preempt_count() = prev_count;
206 } 228 }
@@ -220,7 +242,7 @@ restart:
220 if (pending) 242 if (pending)
221 wakeup_softirqd(); 243 wakeup_softirqd();
222 244
223 trace_softirq_exit(); 245 lockdep_softirq_exit();
224 246
225 account_system_vtime(current); 247 account_system_vtime(current);
226 _local_bh_enable(); 248 _local_bh_enable();
@@ -496,7 +518,7 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir
496 cp->flags = 0; 518 cp->flags = 0;
497 cp->priv = softirq; 519 cp->priv = softirq;
498 520
499 __smp_call_function_single(cpu, cp); 521 __smp_call_function_single(cpu, cp, 0);
500 return 0; 522 return 0;
501 } 523 }
502 return 1; 524 return 1;
@@ -626,6 +648,7 @@ static int ksoftirqd(void * __bind_cpu)
626 preempt_enable_no_resched(); 648 preempt_enable_no_resched();
627 cond_resched(); 649 cond_resched();
628 preempt_disable(); 650 preempt_disable();
651 rcu_qsctr_inc((long)__bind_cpu);
629 } 652 }
630 preempt_enable(); 653 preempt_enable();
631 set_current_state(TASK_INTERRUPTIBLE); 654 set_current_state(TASK_INTERRUPTIBLE);
@@ -795,6 +818,11 @@ int __init __weak early_irq_init(void)
795 return 0; 818 return 0;
796} 819}
797 820
821int __init __weak arch_probe_nr_irqs(void)
822{
823 return 0;
824}
825
798int __init __weak arch_early_irq_init(void) 826int __init __weak arch_early_irq_init(void)
799{ 827{
800 return 0; 828 return 0;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d9188c66278a..88796c330838 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -16,6 +16,7 @@
16#include <linux/lockdep.h> 16#include <linux/lockdep.h>
17#include <linux/notifier.h> 17#include <linux/notifier.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/sysctl.h>
19 20
20#include <asm/irq_regs.h> 21#include <asm/irq_regs.h>
21 22
@@ -88,6 +89,14 @@ void touch_all_softlockup_watchdogs(void)
88} 89}
89EXPORT_SYMBOL(touch_all_softlockup_watchdogs); 90EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
90 91
92int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
93 struct file *filp, void __user *buffer,
94 size_t *lenp, loff_t *ppos)
95{
96 touch_all_softlockup_watchdogs();
97 return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
98}
99
91/* 100/*
92 * This callback runs from the timer interrupt, and checks 101 * This callback runs from the timer interrupt, and checks
93 * whether the watchdog thread has hung or not: 102 * whether the watchdog thread has hung or not:
@@ -157,97 +166,11 @@ void softlockup_tick(void)
157} 166}
158 167
159/* 168/*
160 * Have a reasonable limit on the number of tasks checked:
161 */
162unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
163
164/*
165 * Zero means infinite timeout - no checking done:
166 */
167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
168
169unsigned long __read_mostly sysctl_hung_task_warnings = 10;
170
171/*
172 * Only do the hung-tasks check on one CPU:
173 */
174static int check_cpu __read_mostly = -1;
175
176static void check_hung_task(struct task_struct *t, unsigned long now)
177{
178 unsigned long switch_count = t->nvcsw + t->nivcsw;
179
180 if (t->flags & PF_FROZEN)
181 return;
182
183 if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
184 t->last_switch_count = switch_count;
185 t->last_switch_timestamp = now;
186 return;
187 }
188 if ((long)(now - t->last_switch_timestamp) <
189 sysctl_hung_task_timeout_secs)
190 return;
191 if (!sysctl_hung_task_warnings)
192 return;
193 sysctl_hung_task_warnings--;
194
195 /*
196 * Ok, the task did not get scheduled for more than 2 minutes,
197 * complain:
198 */
199 printk(KERN_ERR "INFO: task %s:%d blocked for more than "
200 "%ld seconds.\n", t->comm, t->pid,
201 sysctl_hung_task_timeout_secs);
202 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
203 " disables this message.\n");
204 sched_show_task(t);
205 __debug_show_held_locks(t);
206
207 t->last_switch_timestamp = now;
208 touch_nmi_watchdog();
209
210 if (softlockup_panic)
211 panic("softlockup: blocked tasks");
212}
213
214/*
215 * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
216 * a really long time (120 seconds). If that happens, print out
217 * a warning.
218 */
219static void check_hung_uninterruptible_tasks(int this_cpu)
220{
221 int max_count = sysctl_hung_task_check_count;
222 unsigned long now = get_timestamp(this_cpu);
223 struct task_struct *g, *t;
224
225 /*
226 * If the system crashed already then all bets are off,
227 * do not report extra hung tasks:
228 */
229 if (test_taint(TAINT_DIE) || did_panic)
230 return;
231
232 read_lock(&tasklist_lock);
233 do_each_thread(g, t) {
234 if (!--max_count)
235 goto unlock;
236 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
237 if (t->state == TASK_UNINTERRUPTIBLE)
238 check_hung_task(t, now);
239 } while_each_thread(g, t);
240 unlock:
241 read_unlock(&tasklist_lock);
242}
243
244/*
245 * The watchdog thread - runs every second and touches the timestamp. 169 * The watchdog thread - runs every second and touches the timestamp.
246 */ 170 */
247static int watchdog(void *__bind_cpu) 171static int watchdog(void *__bind_cpu)
248{ 172{
249 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 173 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
250 int this_cpu = (long)__bind_cpu;
251 174
252 sched_setscheduler(current, SCHED_FIFO, &param); 175 sched_setscheduler(current, SCHED_FIFO, &param);
253 176
@@ -267,11 +190,6 @@ static int watchdog(void *__bind_cpu)
267 if (kthread_should_stop()) 190 if (kthread_should_stop())
268 break; 191 break;
269 192
270 if (this_cpu == check_cpu) {
271 if (sysctl_hung_task_timeout_secs)
272 check_hung_uninterruptible_tasks(this_cpu);
273 }
274
275 set_current_state(TASK_INTERRUPTIBLE); 193 set_current_state(TASK_INTERRUPTIBLE);
276 } 194 }
277 __set_current_state(TASK_RUNNING); 195 __set_current_state(TASK_RUNNING);
@@ -303,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
303 break; 221 break;
304 case CPU_ONLINE: 222 case CPU_ONLINE:
305 case CPU_ONLINE_FROZEN: 223 case CPU_ONLINE_FROZEN:
306 check_cpu = cpumask_any(cpu_online_mask);
307 wake_up_process(per_cpu(watchdog_task, hotcpu)); 224 wake_up_process(per_cpu(watchdog_task, hotcpu));
308 break; 225 break;
309#ifdef CONFIG_HOTPLUG_CPU 226#ifdef CONFIG_HOTPLUG_CPU
310 case CPU_DOWN_PREPARE:
311 case CPU_DOWN_PREPARE_FROZEN:
312 if (hotcpu == check_cpu) {
313 /* Pick any other online cpu. */
314 check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
315 }
316 break;
317
318 case CPU_UP_CANCELED: 227 case CPU_UP_CANCELED:
319 case CPU_UP_CANCELED_FROZEN: 228 case CPU_UP_CANCELED_FROZEN:
320 if (!per_cpu(watchdog_task, hotcpu)) 229 if (!per_cpu(watchdog_task, hotcpu))
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 29ab20749dd3..7932653c4ebd 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -121,7 +121,8 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
121 local_irq_save(flags); 121 local_irq_save(flags);
122 preempt_disable(); 122 preempt_disable();
123 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); 123 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
124 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); 124 LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
125 _raw_read_lock_flags, &flags);
125 return flags; 126 return flags;
126} 127}
127EXPORT_SYMBOL(_read_lock_irqsave); 128EXPORT_SYMBOL(_read_lock_irqsave);
@@ -151,7 +152,8 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
151 local_irq_save(flags); 152 local_irq_save(flags);
152 preempt_disable(); 153 preempt_disable();
153 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); 154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
154 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); 155 LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
156 _raw_write_lock_flags, &flags);
155 return flags; 157 return flags;
156} 158}
157EXPORT_SYMBOL(_write_lock_irqsave); 159EXPORT_SYMBOL(_write_lock_irqsave);
@@ -299,16 +301,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
299 local_irq_save(flags); 301 local_irq_save(flags);
300 preempt_disable(); 302 preempt_disable();
301 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 303 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
302 /* 304 LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock,
303 * On lockdep we dont want the hand-coded irq-enable of 305 _raw_spin_lock_flags, &flags);
304 * _raw_spin_lock_flags() code, because lockdep assumes
305 * that interrupts are not re-enabled during lock-acquire:
306 */
307#ifdef CONFIG_LOCKDEP
308 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
309#else
310 _raw_spin_lock_flags(lock, &flags);
311#endif
312 return flags; 306 return flags;
313} 307}
314EXPORT_SYMBOL(_spin_lock_irqsave_nested); 308EXPORT_SYMBOL(_spin_lock_irqsave_nested);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 0cd415ee62a2..912823e2a11b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -44,7 +44,7 @@ static DEFINE_MUTEX(setup_lock);
44static int refcount; 44static int refcount;
45static struct workqueue_struct *stop_machine_wq; 45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle; 46static struct stop_machine_data active, idle;
47static const cpumask_t *active_cpus; 47static const struct cpumask *active_cpus;
48static void *stop_machine_work; 48static void *stop_machine_work;
49 49
50static void set_state(enum stopmachine_state newstate) 50static void set_state(enum stopmachine_state newstate)
@@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
170 * doesn't hit this CPU until we're ready. */ 170 * doesn't hit this CPU until we're ready. */
171 get_cpu(); 171 get_cpu();
172 for_each_online_cpu(i) { 172 for_each_online_cpu(i) {
173 sm_work = percpu_ptr(stop_machine_work, i); 173 sm_work = per_cpu_ptr(stop_machine_work, i);
174 INIT_WORK(sm_work, stop_cpu); 174 INIT_WORK(sm_work, stop_cpu);
175 queue_work_on(i, stop_machine_wq, sm_work); 175 queue_work_on(i, stop_machine_wq, sm_work);
176 } 176 }
diff --git a/kernel/sys.c b/kernel/sys.c
index 763c3c17ded3..51dbb55604e8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -34,6 +34,7 @@
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/ptrace.h> 36#include <linux/ptrace.h>
37#include <linux/fs_struct.h>
37 38
38#include <linux/compat.h> 39#include <linux/compat.h>
39#include <linux/syscalls.h> 40#include <linux/syscalls.h>
@@ -143,7 +144,7 @@ out:
143 return error; 144 return error;
144} 145}
145 146
146asmlinkage long sys_setpriority(int which, int who, int niceval) 147SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
147{ 148{
148 struct task_struct *g, *p; 149 struct task_struct *g, *p;
149 struct user_struct *user; 150 struct user_struct *user;
@@ -208,7 +209,7 @@ out:
208 * has been offset by 20 (ie it returns 40..1 instead of -20..19) 209 * has been offset by 20 (ie it returns 40..1 instead of -20..19)
209 * to stay compatible. 210 * to stay compatible.
210 */ 211 */
211asmlinkage long sys_getpriority(int which, int who) 212SYSCALL_DEFINE2(getpriority, int, which, int, who)
212{ 213{
213 struct task_struct *g, *p; 214 struct task_struct *g, *p;
214 struct user_struct *user; 215 struct user_struct *user;
@@ -355,7 +356,8 @@ EXPORT_SYMBOL_GPL(kernel_power_off);
355 * 356 *
356 * reboot doesn't sync: do that yourself before calling this. 357 * reboot doesn't sync: do that yourself before calling this.
357 */ 358 */
358asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user * arg) 359SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
360 void __user *, arg)
359{ 361{
360 char buffer[256]; 362 char buffer[256];
361 363
@@ -478,7 +480,7 @@ void ctrl_alt_del(void)
478 * SMP: There are not races, the GIDs are checked only by filesystem 480 * SMP: There are not races, the GIDs are checked only by filesystem
479 * operations (as far as semantic preservation is concerned). 481 * operations (as far as semantic preservation is concerned).
480 */ 482 */
481asmlinkage long sys_setregid(gid_t rgid, gid_t egid) 483SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
482{ 484{
483 const struct cred *old; 485 const struct cred *old;
484 struct cred *new; 486 struct cred *new;
@@ -529,7 +531,7 @@ error:
529 * 531 *
530 * SMP: Same implicit races as above. 532 * SMP: Same implicit races as above.
531 */ 533 */
532asmlinkage long sys_setgid(gid_t gid) 534SYSCALL_DEFINE1(setgid, gid_t, gid)
533{ 535{
534 const struct cred *old; 536 const struct cred *old;
535 struct cred *new; 537 struct cred *new;
@@ -558,7 +560,7 @@ error:
558 abort_creds(new); 560 abort_creds(new);
559 return retval; 561 return retval;
560} 562}
561 563
562/* 564/*
563 * change the user struct in a credentials set to match the new UID 565 * change the user struct in a credentials set to match the new UID
564 */ 566 */
@@ -570,6 +572,11 @@ static int set_user(struct cred *new)
570 if (!new_user) 572 if (!new_user)
571 return -EAGAIN; 573 return -EAGAIN;
572 574
575 if (!task_can_switch_user(new_user, current)) {
576 free_uid(new_user);
577 return -EINVAL;
578 }
579
573 if (atomic_read(&new_user->processes) >= 580 if (atomic_read(&new_user->processes) >=
574 current->signal->rlim[RLIMIT_NPROC].rlim_cur && 581 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
575 new_user != INIT_USER) { 582 new_user != INIT_USER) {
@@ -597,7 +604,7 @@ static int set_user(struct cred *new)
597 * 100% compatible with BSD. A program which uses just setuid() will be 604 * 100% compatible with BSD. A program which uses just setuid() will be
598 * 100% compatible with POSIX with saved IDs. 605 * 100% compatible with POSIX with saved IDs.
599 */ 606 */
600asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) 607SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
601{ 608{
602 const struct cred *old; 609 const struct cred *old;
603 struct cred *new; 610 struct cred *new;
@@ -630,10 +637,11 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
630 goto error; 637 goto error;
631 } 638 }
632 639
633 retval = -EAGAIN; 640 if (new->uid != old->uid) {
634 if (new->uid != old->uid && set_user(new) < 0) 641 retval = set_user(new);
635 goto error; 642 if (retval < 0)
636 643 goto error;
644 }
637 if (ruid != (uid_t) -1 || 645 if (ruid != (uid_t) -1 ||
638 (euid != (uid_t) -1 && euid != old->uid)) 646 (euid != (uid_t) -1 && euid != old->uid))
639 new->suid = new->euid; 647 new->suid = new->euid;
@@ -661,7 +669,7 @@ error:
661 * will allow a root program to temporarily drop privileges and be able to 669 * will allow a root program to temporarily drop privileges and be able to
662 * regain them by swapping the real and effective uid. 670 * regain them by swapping the real and effective uid.
663 */ 671 */
664asmlinkage long sys_setuid(uid_t uid) 672SYSCALL_DEFINE1(setuid, uid_t, uid)
665{ 673{
666 const struct cred *old; 674 const struct cred *old;
667 struct cred *new; 675 struct cred *new;
@@ -679,9 +687,10 @@ asmlinkage long sys_setuid(uid_t uid)
679 retval = -EPERM; 687 retval = -EPERM;
680 if (capable(CAP_SETUID)) { 688 if (capable(CAP_SETUID)) {
681 new->suid = new->uid = uid; 689 new->suid = new->uid = uid;
682 if (uid != old->uid && set_user(new) < 0) { 690 if (uid != old->uid) {
683 retval = -EAGAIN; 691 retval = set_user(new);
684 goto error; 692 if (retval < 0)
693 goto error;
685 } 694 }
686 } else if (uid != old->uid && uid != new->suid) { 695 } else if (uid != old->uid && uid != new->suid) {
687 goto error; 696 goto error;
@@ -705,7 +714,7 @@ error:
705 * This function implements a generic ability to update ruid, euid, 714 * This function implements a generic ability to update ruid, euid,
706 * and suid. This allows you to implement the 4.4 compatible seteuid(). 715 * and suid. This allows you to implement the 4.4 compatible seteuid().
707 */ 716 */
708asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) 717SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
709{ 718{
710 const struct cred *old; 719 const struct cred *old;
711 struct cred *new; 720 struct cred *new;
@@ -733,11 +742,13 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
733 goto error; 742 goto error;
734 } 743 }
735 744
736 retval = -EAGAIN;
737 if (ruid != (uid_t) -1) { 745 if (ruid != (uid_t) -1) {
738 new->uid = ruid; 746 new->uid = ruid;
739 if (ruid != old->uid && set_user(new) < 0) 747 if (ruid != old->uid) {
740 goto error; 748 retval = set_user(new);
749 if (retval < 0)
750 goto error;
751 }
741 } 752 }
742 if (euid != (uid_t) -1) 753 if (euid != (uid_t) -1)
743 new->euid = euid; 754 new->euid = euid;
@@ -756,7 +767,7 @@ error:
756 return retval; 767 return retval;
757} 768}
758 769
759asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) 770SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid)
760{ 771{
761 const struct cred *cred = current_cred(); 772 const struct cred *cred = current_cred();
762 int retval; 773 int retval;
@@ -771,7 +782,7 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us
771/* 782/*
772 * Same as above, but for rgid, egid, sgid. 783 * Same as above, but for rgid, egid, sgid.
773 */ 784 */
774asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) 785SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
775{ 786{
776 const struct cred *old; 787 const struct cred *old;
777 struct cred *new; 788 struct cred *new;
@@ -814,7 +825,7 @@ error:
814 return retval; 825 return retval;
815} 826}
816 827
817asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) 828SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid)
818{ 829{
819 const struct cred *cred = current_cred(); 830 const struct cred *cred = current_cred();
820 int retval; 831 int retval;
@@ -833,7 +844,7 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us
833 * whatever uid it wants to). It normally shadows "euid", except when 844 * whatever uid it wants to). It normally shadows "euid", except when
834 * explicitly set by setfsuid() or for access.. 845 * explicitly set by setfsuid() or for access..
835 */ 846 */
836asmlinkage long sys_setfsuid(uid_t uid) 847SYSCALL_DEFINE1(setfsuid, uid_t, uid)
837{ 848{
838 const struct cred *old; 849 const struct cred *old;
839 struct cred *new; 850 struct cred *new;
@@ -870,7 +881,7 @@ change_okay:
870/* 881/*
871 * Samma på svenska.. 882 * Samma på svenska..
872 */ 883 */
873asmlinkage long sys_setfsgid(gid_t gid) 884SYSCALL_DEFINE1(setfsgid, gid_t, gid)
874{ 885{
875 const struct cred *old; 886 const struct cred *old;
876 struct cred *new; 887 struct cred *new;
@@ -919,7 +930,7 @@ void do_sys_times(struct tms *tms)
919 tms->tms_cstime = cputime_to_clock_t(cstime); 930 tms->tms_cstime = cputime_to_clock_t(cstime);
920} 931}
921 932
922asmlinkage long sys_times(struct tms __user * tbuf) 933SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
923{ 934{
924 if (tbuf) { 935 if (tbuf) {
925 struct tms tmp; 936 struct tms tmp;
@@ -944,7 +955,7 @@ asmlinkage long sys_times(struct tms __user * tbuf)
944 * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. 955 * Auch. Had to add the 'did_exec' flag to conform completely to POSIX.
945 * LBT 04.03.94 956 * LBT 04.03.94
946 */ 957 */
947asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) 958SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
948{ 959{
949 struct task_struct *p; 960 struct task_struct *p;
950 struct task_struct *group_leader = current->group_leader; 961 struct task_struct *group_leader = current->group_leader;
@@ -1003,10 +1014,8 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1003 if (err) 1014 if (err)
1004 goto out; 1015 goto out;
1005 1016
1006 if (task_pgrp(p) != pgrp) { 1017 if (task_pgrp(p) != pgrp)
1007 change_pid(p, PIDTYPE_PGID, pgrp); 1018 change_pid(p, PIDTYPE_PGID, pgrp);
1008 set_task_pgrp(p, pid_nr(pgrp));
1009 }
1010 1019
1011 err = 0; 1020 err = 0;
1012out: 1021out:
@@ -1015,7 +1024,7 @@ out:
1015 return err; 1024 return err;
1016} 1025}
1017 1026
1018asmlinkage long sys_getpgid(pid_t pid) 1027SYSCALL_DEFINE1(getpgid, pid_t, pid)
1019{ 1028{
1020 struct task_struct *p; 1029 struct task_struct *p;
1021 struct pid *grp; 1030 struct pid *grp;
@@ -1045,14 +1054,14 @@ out:
1045 1054
1046#ifdef __ARCH_WANT_SYS_GETPGRP 1055#ifdef __ARCH_WANT_SYS_GETPGRP
1047 1056
1048asmlinkage long sys_getpgrp(void) 1057SYSCALL_DEFINE0(getpgrp)
1049{ 1058{
1050 return sys_getpgid(0); 1059 return sys_getpgid(0);
1051} 1060}
1052 1061
1053#endif 1062#endif
1054 1063
1055asmlinkage long sys_getsid(pid_t pid) 1064SYSCALL_DEFINE1(getsid, pid_t, pid)
1056{ 1065{
1057 struct task_struct *p; 1066 struct task_struct *p;
1058 struct pid *sid; 1067 struct pid *sid;
@@ -1080,7 +1089,7 @@ out:
1080 return retval; 1089 return retval;
1081} 1090}
1082 1091
1083asmlinkage long sys_setsid(void) 1092SYSCALL_DEFINE0(setsid)
1084{ 1093{
1085 struct task_struct *group_leader = current->group_leader; 1094 struct task_struct *group_leader = current->group_leader;
1086 struct pid *sid = task_pid(group_leader); 1095 struct pid *sid = task_pid(group_leader);
@@ -1311,7 +1320,7 @@ int set_current_groups(struct group_info *group_info)
1311 1320
1312EXPORT_SYMBOL(set_current_groups); 1321EXPORT_SYMBOL(set_current_groups);
1313 1322
1314asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) 1323SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
1315{ 1324{
1316 const struct cred *cred = current_cred(); 1325 const struct cred *cred = current_cred();
1317 int i; 1326 int i;
@@ -1340,7 +1349,7 @@ out:
1340 * without another task interfering. 1349 * without another task interfering.
1341 */ 1350 */
1342 1351
1343asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) 1352SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
1344{ 1353{
1345 struct group_info *group_info; 1354 struct group_info *group_info;
1346 int retval; 1355 int retval;
@@ -1394,7 +1403,7 @@ EXPORT_SYMBOL(in_egroup_p);
1394 1403
1395DECLARE_RWSEM(uts_sem); 1404DECLARE_RWSEM(uts_sem);
1396 1405
1397asmlinkage long sys_newuname(struct new_utsname __user * name) 1406SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1398{ 1407{
1399 int errno = 0; 1408 int errno = 0;
1400 1409
@@ -1405,7 +1414,7 @@ asmlinkage long sys_newuname(struct new_utsname __user * name)
1405 return errno; 1414 return errno;
1406} 1415}
1407 1416
1408asmlinkage long sys_sethostname(char __user *name, int len) 1417SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1409{ 1418{
1410 int errno; 1419 int errno;
1411 char tmp[__NEW_UTS_LEN]; 1420 char tmp[__NEW_UTS_LEN];
@@ -1429,7 +1438,7 @@ asmlinkage long sys_sethostname(char __user *name, int len)
1429 1438
1430#ifdef __ARCH_WANT_SYS_GETHOSTNAME 1439#ifdef __ARCH_WANT_SYS_GETHOSTNAME
1431 1440
1432asmlinkage long sys_gethostname(char __user *name, int len) 1441SYSCALL_DEFINE2(gethostname, char __user *, name, int, len)
1433{ 1442{
1434 int i, errno; 1443 int i, errno;
1435 struct new_utsname *u; 1444 struct new_utsname *u;
@@ -1454,7 +1463,7 @@ asmlinkage long sys_gethostname(char __user *name, int len)
1454 * Only setdomainname; getdomainname can be implemented by calling 1463 * Only setdomainname; getdomainname can be implemented by calling
1455 * uname() 1464 * uname()
1456 */ 1465 */
1457asmlinkage long sys_setdomainname(char __user *name, int len) 1466SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1458{ 1467{
1459 int errno; 1468 int errno;
1460 char tmp[__NEW_UTS_LEN]; 1469 char tmp[__NEW_UTS_LEN];
@@ -1477,7 +1486,7 @@ asmlinkage long sys_setdomainname(char __user *name, int len)
1477 return errno; 1486 return errno;
1478} 1487}
1479 1488
1480asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim) 1489SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1481{ 1490{
1482 if (resource >= RLIM_NLIMITS) 1491 if (resource >= RLIM_NLIMITS)
1483 return -EINVAL; 1492 return -EINVAL;
@@ -1496,7 +1505,8 @@ asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim)
1496 * Back compatibility for getrlimit. Needed for some apps. 1505 * Back compatibility for getrlimit. Needed for some apps.
1497 */ 1506 */
1498 1507
1499asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim) 1508SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1509 struct rlimit __user *, rlim)
1500{ 1510{
1501 struct rlimit x; 1511 struct rlimit x;
1502 if (resource >= RLIM_NLIMITS) 1512 if (resource >= RLIM_NLIMITS)
@@ -1514,7 +1524,7 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
1514 1524
1515#endif 1525#endif
1516 1526
1517asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) 1527SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1518{ 1528{
1519 struct rlimit new_rlim, *old_rlim; 1529 struct rlimit new_rlim, *old_rlim;
1520 int retval; 1530 int retval;
@@ -1523,22 +1533,14 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1523 return -EINVAL; 1533 return -EINVAL;
1524 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1534 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1525 return -EFAULT; 1535 return -EFAULT;
1536 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1537 return -EINVAL;
1526 old_rlim = current->signal->rlim + resource; 1538 old_rlim = current->signal->rlim + resource;
1527 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1539 if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
1528 !capable(CAP_SYS_RESOURCE)) 1540 !capable(CAP_SYS_RESOURCE))
1529 return -EPERM; 1541 return -EPERM;
1530 1542 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
1531 if (resource == RLIMIT_NOFILE) { 1543 return -EPERM;
1532 if (new_rlim.rlim_max == RLIM_INFINITY)
1533 new_rlim.rlim_max = sysctl_nr_open;
1534 if (new_rlim.rlim_cur == RLIM_INFINITY)
1535 new_rlim.rlim_cur = sysctl_nr_open;
1536 if (new_rlim.rlim_max > sysctl_nr_open)
1537 return -EPERM;
1538 }
1539
1540 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1541 return -EINVAL;
1542 1544
1543 retval = security_task_setrlimit(resource, &new_rlim); 1545 retval = security_task_setrlimit(resource, &new_rlim);
1544 if (retval) 1546 if (retval)
@@ -1687,7 +1689,7 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1687 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; 1689 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1688} 1690}
1689 1691
1690asmlinkage long sys_getrusage(int who, struct rusage __user *ru) 1692SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
1691{ 1693{
1692 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && 1694 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
1693 who != RUSAGE_THREAD) 1695 who != RUSAGE_THREAD)
@@ -1695,14 +1697,14 @@ asmlinkage long sys_getrusage(int who, struct rusage __user *ru)
1695 return getrusage(current, who, ru); 1697 return getrusage(current, who, ru);
1696} 1698}
1697 1699
1698asmlinkage long sys_umask(int mask) 1700SYSCALL_DEFINE1(umask, int, mask)
1699{ 1701{
1700 mask = xchg(&current->fs->umask, mask & S_IRWXUGO); 1702 mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
1701 return mask; 1703 return mask;
1702} 1704}
1703 1705
1704asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, 1706SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1705 unsigned long arg4, unsigned long arg5) 1707 unsigned long, arg4, unsigned long, arg5)
1706{ 1708{
1707 struct task_struct *me = current; 1709 struct task_struct *me = current;
1708 unsigned char comm[sizeof(me->comm)]; 1710 unsigned char comm[sizeof(me->comm)];
@@ -1815,8 +1817,8 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1815 return error; 1817 return error;
1816} 1818}
1817 1819
1818asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, 1820SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
1819 struct getcpu_cache __user *unused) 1821 struct getcpu_cache __user *, unused)
1820{ 1822{
1821 int err = 0; 1823 int err = 0;
1822 int cpu = raw_smp_processor_id(); 1824 int cpu = raw_smp_processor_id();
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e14a23281707..27dad2967387 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -131,6 +131,7 @@ cond_syscall(sys_io_destroy);
131cond_syscall(sys_io_submit); 131cond_syscall(sys_io_submit);
132cond_syscall(sys_io_cancel); 132cond_syscall(sys_io_cancel);
133cond_syscall(sys_io_getevents); 133cond_syscall(sys_io_getevents);
134cond_syscall(sys_syslog);
134 135
135/* arch-specific weak syscall entries */ 136/* arch-specific weak syscall entries */
136cond_syscall(sys_pciconfig_read); 137cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 89d74436318c..4286b62b34a0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -48,6 +48,7 @@
48#include <linux/acpi.h> 48#include <linux/acpi.h>
49#include <linux/reboot.h> 49#include <linux/reboot.h>
50#include <linux/ftrace.h> 50#include <linux/ftrace.h>
51#include <linux/slow-work.h>
51 52
52#include <asm/uaccess.h> 53#include <asm/uaccess.h>
53#include <asm/processor.h> 54#include <asm/processor.h>
@@ -95,13 +96,12 @@ static int sixty = 60;
95static int neg_one = -1; 96static int neg_one = -1;
96#endif 97#endif
97 98
98#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
99static int two = 2;
100#endif
101
102static int zero; 99static int zero;
103static int one = 1; 100static int __maybe_unused one = 1;
101static int __maybe_unused two = 2;
102static unsigned long one_ul = 1;
104static int one_hundred = 100; 103static int one_hundred = 100;
104static int one_thousand = 1000;
105 105
106/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 106/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
107static int maxolduid = 65535; 107static int maxolduid = 65535;
@@ -144,6 +144,7 @@ extern int acct_parm[];
144 144
145#ifdef CONFIG_IA64 145#ifdef CONFIG_IA64
146extern int no_unaligned_warning; 146extern int no_unaligned_warning;
147extern int unaligned_dump_stack;
147#endif 148#endif
148 149
149#ifdef CONFIG_RT_MUTEXES 150#ifdef CONFIG_RT_MUTEXES
@@ -781,6 +782,14 @@ static struct ctl_table kern_table[] = {
781 .mode = 0644, 782 .mode = 0644,
782 .proc_handler = &proc_dointvec, 783 .proc_handler = &proc_dointvec,
783 }, 784 },
785 {
786 .ctl_name = CTL_UNNUMBERED,
787 .procname = "unaligned-dump-stack",
788 .data = &unaligned_dump_stack,
789 .maxlen = sizeof (int),
790 .mode = 0644,
791 .proc_handler = &proc_dointvec,
792 },
784#endif 793#endif
785#ifdef CONFIG_DETECT_SOFTLOCKUP 794#ifdef CONFIG_DETECT_SOFTLOCKUP
786 { 795 {
@@ -800,11 +809,24 @@ static struct ctl_table kern_table[] = {
800 .data = &softlockup_thresh, 809 .data = &softlockup_thresh,
801 .maxlen = sizeof(int), 810 .maxlen = sizeof(int),
802 .mode = 0644, 811 .mode = 0644,
803 .proc_handler = &proc_dointvec_minmax, 812 .proc_handler = &proc_dosoftlockup_thresh,
804 .strategy = &sysctl_intvec, 813 .strategy = &sysctl_intvec,
805 .extra1 = &neg_one, 814 .extra1 = &neg_one,
806 .extra2 = &sixty, 815 .extra2 = &sixty,
807 }, 816 },
817#endif
818#ifdef CONFIG_DETECT_HUNG_TASK
819 {
820 .ctl_name = CTL_UNNUMBERED,
821 .procname = "hung_task_panic",
822 .data = &sysctl_hung_task_panic,
823 .maxlen = sizeof(int),
824 .mode = 0644,
825 .proc_handler = &proc_dointvec_minmax,
826 .strategy = &sysctl_intvec,
827 .extra1 = &zero,
828 .extra2 = &one,
829 },
808 { 830 {
809 .ctl_name = CTL_UNNUMBERED, 831 .ctl_name = CTL_UNNUMBERED,
810 .procname = "hung_task_check_count", 832 .procname = "hung_task_check_count",
@@ -820,7 +842,7 @@ static struct ctl_table kern_table[] = {
820 .data = &sysctl_hung_task_timeout_secs, 842 .data = &sysctl_hung_task_timeout_secs,
821 .maxlen = sizeof(unsigned long), 843 .maxlen = sizeof(unsigned long),
822 .mode = 0644, 844 .mode = 0644,
823 .proc_handler = &proc_doulongvec_minmax, 845 .proc_handler = &proc_dohung_task_timeout_secs,
824 .strategy = &sysctl_intvec, 846 .strategy = &sysctl_intvec,
825 }, 847 },
826 { 848 {
@@ -890,6 +912,14 @@ static struct ctl_table kern_table[] = {
890 .proc_handler = &scan_unevictable_handler, 912 .proc_handler = &scan_unevictable_handler,
891 }, 913 },
892#endif 914#endif
915#ifdef CONFIG_SLOW_WORK
916 {
917 .ctl_name = CTL_UNNUMBERED,
918 .procname = "slow-work",
919 .mode = 0555,
920 .child = slow_work_sysctls,
921 },
922#endif
893/* 923/*
894 * NOTE: do not add new entries to this table unless you have read 924 * NOTE: do not add new entries to this table unless you have read
895 * Documentation/sysctl/ctl_unnumbered.txt 925 * Documentation/sysctl/ctl_unnumbered.txt
@@ -965,7 +995,7 @@ static struct ctl_table vm_table[] = {
965 .mode = 0644, 995 .mode = 0644,
966 .proc_handler = &dirty_background_bytes_handler, 996 .proc_handler = &dirty_background_bytes_handler,
967 .strategy = &sysctl_intvec, 997 .strategy = &sysctl_intvec,
968 .extra1 = &one, 998 .extra1 = &one_ul,
969 }, 999 },
970 { 1000 {
971 .ctl_name = VM_DIRTY_RATIO, 1001 .ctl_name = VM_DIRTY_RATIO,
@@ -986,7 +1016,7 @@ static struct ctl_table vm_table[] = {
986 .mode = 0644, 1016 .mode = 0644,
987 .proc_handler = &dirty_bytes_handler, 1017 .proc_handler = &dirty_bytes_handler,
988 .strategy = &sysctl_intvec, 1018 .strategy = &sysctl_intvec,
989 .extra1 = &one, 1019 .extra1 = &one_ul,
990 }, 1020 },
991 { 1021 {
992 .procname = "dirty_writeback_centisecs", 1022 .procname = "dirty_writeback_centisecs",
@@ -1000,7 +1030,7 @@ static struct ctl_table vm_table[] = {
1000 .data = &dirty_expire_interval, 1030 .data = &dirty_expire_interval,
1001 .maxlen = sizeof(dirty_expire_interval), 1031 .maxlen = sizeof(dirty_expire_interval),
1002 .mode = 0644, 1032 .mode = 0644,
1003 .proc_handler = &proc_dointvec_userhz_jiffies, 1033 .proc_handler = &proc_dointvec,
1004 }, 1034 },
1005 { 1035 {
1006 .ctl_name = VM_NR_PDFLUSH_THREADS, 1036 .ctl_name = VM_NR_PDFLUSH_THREADS,
@@ -1011,6 +1041,28 @@ static struct ctl_table vm_table[] = {
1011 .proc_handler = &proc_dointvec, 1041 .proc_handler = &proc_dointvec,
1012 }, 1042 },
1013 { 1043 {
1044 .ctl_name = CTL_UNNUMBERED,
1045 .procname = "nr_pdflush_threads_min",
1046 .data = &nr_pdflush_threads_min,
1047 .maxlen = sizeof nr_pdflush_threads_min,
1048 .mode = 0644 /* read-write */,
1049 .proc_handler = &proc_dointvec_minmax,
1050 .strategy = &sysctl_intvec,
1051 .extra1 = &one,
1052 .extra2 = &nr_pdflush_threads_max,
1053 },
1054 {
1055 .ctl_name = CTL_UNNUMBERED,
1056 .procname = "nr_pdflush_threads_max",
1057 .data = &nr_pdflush_threads_max,
1058 .maxlen = sizeof nr_pdflush_threads_max,
1059 .mode = 0644 /* read-write */,
1060 .proc_handler = &proc_dointvec_minmax,
1061 .strategy = &sysctl_intvec,
1062 .extra1 = &nr_pdflush_threads_min,
1063 .extra2 = &one_thousand,
1064 },
1065 {
1014 .ctl_name = VM_SWAPPINESS, 1066 .ctl_name = VM_SWAPPINESS,
1015 .procname = "swappiness", 1067 .procname = "swappiness",
1016 .data = &vm_swappiness, 1068 .data = &vm_swappiness,
@@ -1363,10 +1415,7 @@ static struct ctl_table fs_table[] = {
1363 .data = &lease_break_time, 1415 .data = &lease_break_time,
1364 .maxlen = sizeof(int), 1416 .maxlen = sizeof(int),
1365 .mode = 0644, 1417 .mode = 0644,
1366 .proc_handler = &proc_dointvec_minmax, 1418 .proc_handler = &proc_dointvec,
1367 .strategy = &sysctl_intvec,
1368 .extra1 = &zero,
1369 .extra2 = &two,
1370 }, 1419 },
1371#endif 1420#endif
1372#ifdef CONFIG_AIO 1421#ifdef CONFIG_AIO
@@ -1407,7 +1456,10 @@ static struct ctl_table fs_table[] = {
1407 .data = &suid_dumpable, 1456 .data = &suid_dumpable,
1408 .maxlen = sizeof(int), 1457 .maxlen = sizeof(int),
1409 .mode = 0644, 1458 .mode = 0644,
1410 .proc_handler = &proc_dointvec, 1459 .proc_handler = &proc_dointvec_minmax,
1460 .strategy = &sysctl_intvec,
1461 .extra1 = &zero,
1462 .extra2 = &two,
1411 }, 1463 },
1412#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) 1464#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
1413 { 1465 {
@@ -1688,7 +1740,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1688 return error; 1740 return error;
1689} 1741}
1690 1742
1691asmlinkage long sys_sysctl(struct __sysctl_args __user *args) 1743SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
1692{ 1744{
1693 struct __sysctl_args tmp; 1745 struct __sysctl_args tmp;
1694 int error; 1746 int error;
@@ -2989,7 +3041,7 @@ int sysctl_ms_jiffies(struct ctl_table *table,
2989#else /* CONFIG_SYSCTL_SYSCALL */ 3041#else /* CONFIG_SYSCTL_SYSCALL */
2990 3042
2991 3043
2992asmlinkage long sys_sysctl(struct __sysctl_args __user *args) 3044SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
2993{ 3045{
2994 struct __sysctl_args tmp; 3046 struct __sysctl_args tmp;
2995 int error; 3047 int error;
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index fafeb48f27c0..b38423ca711a 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -219,6 +219,7 @@ static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
219 { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, 219 { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
220 { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, 220 { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
221 { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, 221 { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
222 { NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
222 {} 223 {}
223}; 224};
224 225
diff --git a/kernel/time.c b/kernel/time.c
index 4886e3ce83a4..29511943871a 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -60,7 +60,7 @@ EXPORT_SYMBOL(sys_tz);
60 * why not move it into the appropriate arch directory (for those 60 * why not move it into the appropriate arch directory (for those
61 * architectures that need it). 61 * architectures that need it).
62 */ 62 */
63asmlinkage long sys_time(time_t __user * tloc) 63SYSCALL_DEFINE1(time, time_t __user *, tloc)
64{ 64{
65 time_t i = get_seconds(); 65 time_t i = get_seconds();
66 66
@@ -79,7 +79,7 @@ asmlinkage long sys_time(time_t __user * tloc)
79 * architectures that need it). 79 * architectures that need it).
80 */ 80 */
81 81
82asmlinkage long sys_stime(time_t __user *tptr) 82SYSCALL_DEFINE1(stime, time_t __user *, tptr)
83{ 83{
84 struct timespec tv; 84 struct timespec tv;
85 int err; 85 int err;
@@ -99,8 +99,8 @@ asmlinkage long sys_stime(time_t __user *tptr)
99 99
100#endif /* __ARCH_WANT_SYS_TIME */ 100#endif /* __ARCH_WANT_SYS_TIME */
101 101
102asmlinkage long sys_gettimeofday(struct timeval __user *tv, 102SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
103 struct timezone __user *tz) 103 struct timezone __user *, tz)
104{ 104{
105 if (likely(tv != NULL)) { 105 if (likely(tv != NULL)) {
106 struct timeval ktv; 106 struct timeval ktv;
@@ -184,8 +184,8 @@ int do_sys_settimeofday(struct timespec *tv, struct timezone *tz)
184 return 0; 184 return 0;
185} 185}
186 186
187asmlinkage long sys_settimeofday(struct timeval __user *tv, 187SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
188 struct timezone __user *tz) 188 struct timezone __user *, tz)
189{ 189{
190 struct timeval user_tv; 190 struct timeval user_tv;
191 struct timespec new_ts; 191 struct timespec new_ts;
@@ -205,7 +205,7 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv,
205 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); 205 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
206} 206}
207 207
208asmlinkage long sys_adjtimex(struct timex __user *txc_p) 208SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
209{ 209{
210 struct timex txc; /* Local copy of parameter */ 210 struct timex txc; /* Local copy of parameter */
211 int ret; 211 int ret;
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 905b0b50792d..0b0a6366c9d4 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2 2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index ea2f48af83cf..d13be216a790 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -68,6 +68,17 @@ void clockevents_set_mode(struct clock_event_device *dev,
68 if (dev->mode != mode) { 68 if (dev->mode != mode) {
69 dev->set_mode(mode, dev); 69 dev->set_mode(mode, dev);
70 dev->mode = mode; 70 dev->mode = mode;
71
72 /*
73 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
74 * on it, so fix it up and emit a warning:
75 */
76 if (mode == CLOCK_EVT_MODE_ONESHOT) {
77 if (unlikely(!dev->mult)) {
78 dev->mult = 1;
79 WARN_ON(1);
80 }
81 }
71 } 82 }
72} 83}
73 84
@@ -168,15 +179,6 @@ void clockevents_register_device(struct clock_event_device *dev)
168 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 179 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
169 BUG_ON(!dev->cpumask); 180 BUG_ON(!dev->cpumask);
170 181
171 /*
172 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
173 * on it, so fix it up and emit a warning:
174 */
175 if (unlikely(!dev->mult)) {
176 dev->mult = 1;
177 WARN_ON(1);
178 }
179
180 spin_lock(&clockevents_lock); 182 spin_lock(&clockevents_lock);
181 183
182 list_add(&dev->list, &clockevent_devices); 184 list_add(&dev->list, &clockevent_devices);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ca89e1593f08..c46c931a7fe7 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -31,6 +31,82 @@
31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
32#include <linux/tick.h> 32#include <linux/tick.h>
33 33
34void timecounter_init(struct timecounter *tc,
35 const struct cyclecounter *cc,
36 u64 start_tstamp)
37{
38 tc->cc = cc;
39 tc->cycle_last = cc->read(cc);
40 tc->nsec = start_tstamp;
41}
42EXPORT_SYMBOL(timecounter_init);
43
44/**
45 * timecounter_read_delta - get nanoseconds since last call of this function
46 * @tc: Pointer to time counter
47 *
48 * When the underlying cycle counter runs over, this will be handled
49 * correctly as long as it does not run over more than once between
50 * calls.
51 *
52 * The first call to this function for a new time counter initializes
53 * the time tracking and returns an undefined result.
54 */
55static u64 timecounter_read_delta(struct timecounter *tc)
56{
57 cycle_t cycle_now, cycle_delta;
58 u64 ns_offset;
59
60 /* read cycle counter: */
61 cycle_now = tc->cc->read(tc->cc);
62
63 /* calculate the delta since the last timecounter_read_delta(): */
64 cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
65
66 /* convert to nanoseconds: */
67 ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta);
68
69 /* update time stamp of timecounter_read_delta() call: */
70 tc->cycle_last = cycle_now;
71
72 return ns_offset;
73}
74
75u64 timecounter_read(struct timecounter *tc)
76{
77 u64 nsec;
78
79 /* increment time by nanoseconds since last call */
80 nsec = timecounter_read_delta(tc);
81 nsec += tc->nsec;
82 tc->nsec = nsec;
83
84 return nsec;
85}
86EXPORT_SYMBOL(timecounter_read);
87
88u64 timecounter_cyc2time(struct timecounter *tc,
89 cycle_t cycle_tstamp)
90{
91 u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
92 u64 nsec;
93
94 /*
95 * Instead of always treating cycle_tstamp as more recent
96 * than tc->cycle_last, detect when it is too far in the
97 * future and treat it as old time stamp instead.
98 */
99 if (cycle_delta > tc->cc->mask / 2) {
100 cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
101 nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta);
102 } else {
103 nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec;
104 }
105
106 return nsec;
107}
108EXPORT_SYMBOL(timecounter_cyc2time);
109
34/* XXX - Would like a better way for initializing curr_clocksource */ 110/* XXX - Would like a better way for initializing curr_clocksource */
35extern struct clocksource clocksource_jiffies; 111extern struct clocksource clocksource_jiffies;
36 112
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index f5f793d92415..7fc64375ff43 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -1,71 +1,129 @@
1/* 1/*
2 * linux/kernel/time/ntp.c
3 *
4 * NTP state machine interfaces and logic. 2 * NTP state machine interfaces and logic.
5 * 3 *
6 * This code was mainly moved from kernel/timer.c and kernel/time.c 4 * This code was mainly moved from kernel/timer.c and kernel/time.c
7 * Please see those files for relevant copyright info and historical 5 * Please see those files for relevant copyright info and historical
8 * changelogs. 6 * changelogs.
9 */ 7 */
10
11#include <linux/mm.h>
12#include <linux/time.h>
13#include <linux/timex.h>
14#include <linux/jiffies.h>
15#include <linux/hrtimer.h>
16#include <linux/capability.h> 8#include <linux/capability.h>
17#include <linux/math64.h>
18#include <linux/clocksource.h> 9#include <linux/clocksource.h>
19#include <linux/workqueue.h> 10#include <linux/workqueue.h>
20#include <asm/timex.h> 11#include <linux/hrtimer.h>
12#include <linux/jiffies.h>
13#include <linux/math64.h>
14#include <linux/timex.h>
15#include <linux/time.h>
16#include <linux/mm.h>
21 17
22/* 18/*
23 * Timekeeping variables 19 * NTP timekeeping variables:
24 */ 20 */
25unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
26unsigned long tick_nsec; /* ACTHZ period (nsec) */
27u64 tick_length;
28static u64 tick_length_base;
29 21
30static struct hrtimer leap_timer; 22/* USER_HZ period (usecs): */
23unsigned long tick_usec = TICK_USEC;
31 24
32#define MAX_TICKADJ 500 /* microsecs */ 25/* ACTHZ period (nsecs): */
33#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ 26unsigned long tick_nsec;
34 NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) 27
28u64 tick_length;
29static u64 tick_length_base;
30
31static struct hrtimer leap_timer;
32
33#define MAX_TICKADJ 500LL /* usecs */
34#define MAX_TICKADJ_SCALED \
35 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
35 36
36/* 37/*
37 * phase-lock loop variables 38 * phase-lock loop variables
38 */ 39 */
39/* TIME_ERROR prevents overwriting the CMOS clock */
40static int time_state = TIME_OK; /* clock synchronization status */
41int time_status = STA_UNSYNC; /* clock status bits */
42static long time_tai; /* TAI offset (s) */
43static s64 time_offset; /* time adjustment (ns) */
44static long time_constant = 2; /* pll time constant */
45long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
46long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
47static s64 time_freq; /* frequency offset (scaled ns/s)*/
48static long time_reftime; /* time at last adjustment (s) */
49long time_adjust;
50static long ntp_tick_adj;
51 40
41/*
42 * clock synchronization status
43 *
44 * (TIME_ERROR prevents overwriting the CMOS clock)
45 */
46static int time_state = TIME_OK;
47
48/* clock status bits: */
49int time_status = STA_UNSYNC;
50
51/* TAI offset (secs): */
52static long time_tai;
53
54/* time adjustment (nsecs): */
55static s64 time_offset;
56
57/* pll time constant: */
58static long time_constant = 2;
59
60/* maximum error (usecs): */
61long time_maxerror = NTP_PHASE_LIMIT;
62
63/* estimated error (usecs): */
64long time_esterror = NTP_PHASE_LIMIT;
65
66/* frequency offset (scaled nsecs/secs): */
67static s64 time_freq;
68
69/* time at last adjustment (secs): */
70static long time_reftime;
71
72long time_adjust;
73
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj;
76
77/*
78 * NTP methods:
79 */
80
81/*
82 * Update (tick_length, tick_length_base, tick_nsec), based
83 * on (tick_usec, ntp_tick_adj, time_freq):
84 */
52static void ntp_update_frequency(void) 85static void ntp_update_frequency(void)
53{ 86{
54 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) 87 u64 second_length;
55 << NTP_SCALE_SHIFT; 88 u64 new_base;
56 second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; 89
57 second_length += time_freq; 90 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
91 << NTP_SCALE_SHIFT;
92
93 second_length += ntp_tick_adj;
94 second_length += time_freq;
58 95
59 tick_length_base = second_length; 96 tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
97 new_base = div_u64(second_length, NTP_INTERVAL_FREQ);
60 98
61 tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; 99 /*
62 tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ); 100 * Don't wait for the next second_overflow, apply
101 * the change to the tick length immediately:
102 */
103 tick_length += new_base - tick_length_base;
104 tick_length_base = new_base;
105}
106
107static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
108{
109 time_status &= ~STA_MODE;
110
111 if (secs < MINSEC)
112 return 0;
113
114 if (!(time_status & STA_FLL) && (secs <= MAXSEC))
115 return 0;
116
117 time_status |= STA_MODE;
118
119 return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
63} 120}
64 121
65static void ntp_update_offset(long offset) 122static void ntp_update_offset(long offset)
66{ 123{
67 long mtemp;
68 s64 freq_adj; 124 s64 freq_adj;
125 s64 offset64;
126 long secs;
69 127
70 if (!(time_status & STA_PLL)) 128 if (!(time_status & STA_PLL))
71 return; 129 return;
@@ -84,24 +142,23 @@ static void ntp_update_offset(long offset)
84 * Select how the frequency is to be controlled 142 * Select how the frequency is to be controlled
85 * and in which mode (PLL or FLL). 143 * and in which mode (PLL or FLL).
86 */ 144 */
87 if (time_status & STA_FREQHOLD || time_reftime == 0) 145 secs = xtime.tv_sec - time_reftime;
88 time_reftime = xtime.tv_sec; 146 if (unlikely(time_status & STA_FREQHOLD))
89 mtemp = xtime.tv_sec - time_reftime; 147 secs = 0;
148
90 time_reftime = xtime.tv_sec; 149 time_reftime = xtime.tv_sec;
91 150
92 freq_adj = (s64)offset * mtemp; 151 offset64 = offset;
93 freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant); 152 freq_adj = (offset64 * secs) <<
94 time_status &= ~STA_MODE; 153 (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
95 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
96 freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL),
97 mtemp);
98 time_status |= STA_MODE;
99 }
100 freq_adj += time_freq;
101 freq_adj = min(freq_adj, MAXFREQ_SCALED);
102 time_freq = max(freq_adj, -MAXFREQ_SCALED);
103 154
104 time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); 155 freq_adj += ntp_update_offset_fll(offset64, secs);
156
157 freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED);
158
159 time_freq = max(freq_adj, -MAXFREQ_SCALED);
160
161 time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
105} 162}
106 163
107/** 164/**
@@ -111,15 +168,15 @@ static void ntp_update_offset(long offset)
111 */ 168 */
112void ntp_clear(void) 169void ntp_clear(void)
113{ 170{
114 time_adjust = 0; /* stop active adjtime() */ 171 time_adjust = 0; /* stop active adjtime() */
115 time_status |= STA_UNSYNC; 172 time_status |= STA_UNSYNC;
116 time_maxerror = NTP_PHASE_LIMIT; 173 time_maxerror = NTP_PHASE_LIMIT;
117 time_esterror = NTP_PHASE_LIMIT; 174 time_esterror = NTP_PHASE_LIMIT;
118 175
119 ntp_update_frequency(); 176 ntp_update_frequency();
120 177
121 tick_length = tick_length_base; 178 tick_length = tick_length_base;
122 time_offset = 0; 179 time_offset = 0;
123} 180}
124 181
125/* 182/*
@@ -140,8 +197,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
140 xtime.tv_sec--; 197 xtime.tv_sec--;
141 wall_to_monotonic.tv_sec++; 198 wall_to_monotonic.tv_sec++;
142 time_state = TIME_OOP; 199 time_state = TIME_OOP;
143 printk(KERN_NOTICE "Clock: " 200 printk(KERN_NOTICE
144 "inserting leap second 23:59:60 UTC\n"); 201 "Clock: inserting leap second 23:59:60 UTC\n");
145 hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); 202 hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
146 res = HRTIMER_RESTART; 203 res = HRTIMER_RESTART;
147 break; 204 break;
@@ -150,8 +207,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
150 time_tai--; 207 time_tai--;
151 wall_to_monotonic.tv_sec--; 208 wall_to_monotonic.tv_sec--;
152 time_state = TIME_WAIT; 209 time_state = TIME_WAIT;
153 printk(KERN_NOTICE "Clock: " 210 printk(KERN_NOTICE
154 "deleting leap second 23:59:59 UTC\n"); 211 "Clock: deleting leap second 23:59:59 UTC\n");
155 break; 212 break;
156 case TIME_OOP: 213 case TIME_OOP:
157 time_tai++; 214 time_tai++;
@@ -179,7 +236,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
179 */ 236 */
180void second_overflow(void) 237void second_overflow(void)
181{ 238{
182 s64 time_adj; 239 s64 delta;
183 240
184 /* Bump the maxerror field */ 241 /* Bump the maxerror field */
185 time_maxerror += MAXFREQ / NSEC_PER_USEC; 242 time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -192,24 +249,30 @@ void second_overflow(void)
192 * Compute the phase adjustment for the next second. The offset is 249 * Compute the phase adjustment for the next second. The offset is
193 * reduced by a fixed factor times the time constant. 250 * reduced by a fixed factor times the time constant.
194 */ 251 */
195 tick_length = tick_length_base; 252 tick_length = tick_length_base;
196 time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); 253
197 time_offset -= time_adj; 254 delta = shift_right(time_offset, SHIFT_PLL + time_constant);
198 tick_length += time_adj; 255 time_offset -= delta;
199 256 tick_length += delta;
200 if (unlikely(time_adjust)) { 257
201 if (time_adjust > MAX_TICKADJ) { 258 if (!time_adjust)
202 time_adjust -= MAX_TICKADJ; 259 return;
203 tick_length += MAX_TICKADJ_SCALED; 260
204 } else if (time_adjust < -MAX_TICKADJ) { 261 if (time_adjust > MAX_TICKADJ) {
205 time_adjust += MAX_TICKADJ; 262 time_adjust -= MAX_TICKADJ;
206 tick_length -= MAX_TICKADJ_SCALED; 263 tick_length += MAX_TICKADJ_SCALED;
207 } else { 264 return;
208 tick_length += (s64)(time_adjust * NSEC_PER_USEC /
209 NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT;
210 time_adjust = 0;
211 }
212 } 265 }
266
267 if (time_adjust < -MAX_TICKADJ) {
268 time_adjust += MAX_TICKADJ;
269 tick_length -= MAX_TICKADJ_SCALED;
270 return;
271 }
272
273 tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
274 << NTP_SCALE_SHIFT;
275 time_adjust = 0;
213} 276}
214 277
215#ifdef CONFIG_GENERIC_CMOS_UPDATE 278#ifdef CONFIG_GENERIC_CMOS_UPDATE
@@ -233,12 +296,13 @@ static void sync_cmos_clock(struct work_struct *work)
233 * This code is run on a timer. If the clock is set, that timer 296 * This code is run on a timer. If the clock is set, that timer
234 * may not expire at the correct time. Thus, we adjust... 297 * may not expire at the correct time. Thus, we adjust...
235 */ 298 */
236 if (!ntp_synced()) 299 if (!ntp_synced()) {
237 /* 300 /*
238 * Not synced, exit, do not restart a timer (if one is 301 * Not synced, exit, do not restart a timer (if one is
239 * running, let it run out). 302 * running, let it run out).
240 */ 303 */
241 return; 304 return;
305 }
242 306
243 getnstimeofday(&now); 307 getnstimeofday(&now);
244 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) 308 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
@@ -270,7 +334,116 @@ static void notify_cmos_timer(void)
270static inline void notify_cmos_timer(void) { } 334static inline void notify_cmos_timer(void) { }
271#endif 335#endif
272 336
273/* adjtimex mainly allows reading (and writing, if superuser) of 337/*
338 * Start the leap seconds timer:
339 */
340static inline void ntp_start_leap_timer(struct timespec *ts)
341{
342 long now = ts->tv_sec;
343
344 if (time_status & STA_INS) {
345 time_state = TIME_INS;
346 now += 86400 - now % 86400;
347 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
348
349 return;
350 }
351
352 if (time_status & STA_DEL) {
353 time_state = TIME_DEL;
354 now += 86400 - (now + 1) % 86400;
355 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
356 }
357}
358
359/*
360 * Propagate a new txc->status value into the NTP state:
361 */
362static inline void process_adj_status(struct timex *txc, struct timespec *ts)
363{
364 if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
365 time_state = TIME_OK;
366 time_status = STA_UNSYNC;
367 }
368
369 /*
370 * If we turn on PLL adjustments then reset the
371 * reference time to current time.
372 */
373 if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
374 time_reftime = xtime.tv_sec;
375
376 /* only set allowed bits */
377 time_status &= STA_RONLY;
378 time_status |= txc->status & ~STA_RONLY;
379
380 switch (time_state) {
381 case TIME_OK:
382 ntp_start_leap_timer(ts);
383 break;
384 case TIME_INS:
385 case TIME_DEL:
386 time_state = TIME_OK;
387 ntp_start_leap_timer(ts);
388 case TIME_WAIT:
389 if (!(time_status & (STA_INS | STA_DEL)))
390 time_state = TIME_OK;
391 break;
392 case TIME_OOP:
393 hrtimer_restart(&leap_timer);
394 break;
395 }
396}
397/*
398 * Called with the xtime lock held, so we can access and modify
399 * all the global NTP state:
400 */
401static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
402{
403 if (txc->modes & ADJ_STATUS)
404 process_adj_status(txc, ts);
405
406 if (txc->modes & ADJ_NANO)
407 time_status |= STA_NANO;
408
409 if (txc->modes & ADJ_MICRO)
410 time_status &= ~STA_NANO;
411
412 if (txc->modes & ADJ_FREQUENCY) {
413 time_freq = txc->freq * PPM_SCALE;
414 time_freq = min(time_freq, MAXFREQ_SCALED);
415 time_freq = max(time_freq, -MAXFREQ_SCALED);
416 }
417
418 if (txc->modes & ADJ_MAXERROR)
419 time_maxerror = txc->maxerror;
420
421 if (txc->modes & ADJ_ESTERROR)
422 time_esterror = txc->esterror;
423
424 if (txc->modes & ADJ_TIMECONST) {
425 time_constant = txc->constant;
426 if (!(time_status & STA_NANO))
427 time_constant += 4;
428 time_constant = min(time_constant, (long)MAXTC);
429 time_constant = max(time_constant, 0l);
430 }
431
432 if (txc->modes & ADJ_TAI && txc->constant > 0)
433 time_tai = txc->constant;
434
435 if (txc->modes & ADJ_OFFSET)
436 ntp_update_offset(txc->offset);
437
438 if (txc->modes & ADJ_TICK)
439 tick_usec = txc->tick;
440
441 if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
442 ntp_update_frequency();
443}
444
445/*
446 * adjtimex mainly allows reading (and writing, if superuser) of
274 * kernel time-keeping variables. used by xntpd. 447 * kernel time-keeping variables. used by xntpd.
275 */ 448 */
276int do_adjtimex(struct timex *txc) 449int do_adjtimex(struct timex *txc)
@@ -291,11 +464,14 @@ int do_adjtimex(struct timex *txc)
291 if (txc->modes && !capable(CAP_SYS_TIME)) 464 if (txc->modes && !capable(CAP_SYS_TIME))
292 return -EPERM; 465 return -EPERM;
293 466
294 /* if the quartz is off by more than 10% something is VERY wrong! */ 467 /*
468 * if the quartz is off by more than 10% then
469 * something is VERY wrong!
470 */
295 if (txc->modes & ADJ_TICK && 471 if (txc->modes & ADJ_TICK &&
296 (txc->tick < 900000/USER_HZ || 472 (txc->tick < 900000/USER_HZ ||
297 txc->tick > 1100000/USER_HZ)) 473 txc->tick > 1100000/USER_HZ))
298 return -EINVAL; 474 return -EINVAL;
299 475
300 if (txc->modes & ADJ_STATUS && time_state != TIME_OK) 476 if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
301 hrtimer_cancel(&leap_timer); 477 hrtimer_cancel(&leap_timer);
@@ -305,7 +481,6 @@ int do_adjtimex(struct timex *txc)
305 481
306 write_seqlock_irq(&xtime_lock); 482 write_seqlock_irq(&xtime_lock);
307 483
308 /* If there are input parameters, then process them */
309 if (txc->modes & ADJ_ADJTIME) { 484 if (txc->modes & ADJ_ADJTIME) {
310 long save_adjust = time_adjust; 485 long save_adjust = time_adjust;
311 486
@@ -315,98 +490,24 @@ int do_adjtimex(struct timex *txc)
315 ntp_update_frequency(); 490 ntp_update_frequency();
316 } 491 }
317 txc->offset = save_adjust; 492 txc->offset = save_adjust;
318 goto adj_done; 493 } else {
319 }
320 if (txc->modes) {
321 long sec;
322
323 if (txc->modes & ADJ_STATUS) {
324 if ((time_status & STA_PLL) &&
325 !(txc->status & STA_PLL)) {
326 time_state = TIME_OK;
327 time_status = STA_UNSYNC;
328 }
329 /* only set allowed bits */
330 time_status &= STA_RONLY;
331 time_status |= txc->status & ~STA_RONLY;
332
333 switch (time_state) {
334 case TIME_OK:
335 start_timer:
336 sec = ts.tv_sec;
337 if (time_status & STA_INS) {
338 time_state = TIME_INS;
339 sec += 86400 - sec % 86400;
340 hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
341 } else if (time_status & STA_DEL) {
342 time_state = TIME_DEL;
343 sec += 86400 - (sec + 1) % 86400;
344 hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
345 }
346 break;
347 case TIME_INS:
348 case TIME_DEL:
349 time_state = TIME_OK;
350 goto start_timer;
351 break;
352 case TIME_WAIT:
353 if (!(time_status & (STA_INS | STA_DEL)))
354 time_state = TIME_OK;
355 break;
356 case TIME_OOP:
357 hrtimer_restart(&leap_timer);
358 break;
359 }
360 }
361
362 if (txc->modes & ADJ_NANO)
363 time_status |= STA_NANO;
364 if (txc->modes & ADJ_MICRO)
365 time_status &= ~STA_NANO;
366
367 if (txc->modes & ADJ_FREQUENCY) {
368 time_freq = (s64)txc->freq * PPM_SCALE;
369 time_freq = min(time_freq, MAXFREQ_SCALED);
370 time_freq = max(time_freq, -MAXFREQ_SCALED);
371 }
372
373 if (txc->modes & ADJ_MAXERROR)
374 time_maxerror = txc->maxerror;
375 if (txc->modes & ADJ_ESTERROR)
376 time_esterror = txc->esterror;
377
378 if (txc->modes & ADJ_TIMECONST) {
379 time_constant = txc->constant;
380 if (!(time_status & STA_NANO))
381 time_constant += 4;
382 time_constant = min(time_constant, (long)MAXTC);
383 time_constant = max(time_constant, 0l);
384 }
385
386 if (txc->modes & ADJ_TAI && txc->constant > 0)
387 time_tai = txc->constant;
388
389 if (txc->modes & ADJ_OFFSET)
390 ntp_update_offset(txc->offset);
391 if (txc->modes & ADJ_TICK)
392 tick_usec = txc->tick;
393 494
394 if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) 495 /* If there are input parameters, then process them: */
395 ntp_update_frequency(); 496 if (txc->modes)
396 } 497 process_adjtimex_modes(txc, &ts);
397 498
398 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, 499 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
399 NTP_SCALE_SHIFT); 500 NTP_SCALE_SHIFT);
400 if (!(time_status & STA_NANO)) 501 if (!(time_status & STA_NANO))
401 txc->offset /= NSEC_PER_USEC; 502 txc->offset /= NSEC_PER_USEC;
503 }
402 504
403adj_done:
404 result = time_state; /* mostly `TIME_OK' */ 505 result = time_state; /* mostly `TIME_OK' */
405 if (time_status & (STA_UNSYNC|STA_CLOCKERR)) 506 if (time_status & (STA_UNSYNC|STA_CLOCKERR))
406 result = TIME_ERROR; 507 result = TIME_ERROR;
407 508
408 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * 509 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
409 (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); 510 PPM_SCALE_INV, NTP_SCALE_SHIFT);
410 txc->maxerror = time_maxerror; 511 txc->maxerror = time_maxerror;
411 txc->esterror = time_esterror; 512 txc->esterror = time_esterror;
412 txc->status = time_status; 513 txc->status = time_status;
@@ -425,6 +526,7 @@ adj_done:
425 txc->calcnt = 0; 526 txc->calcnt = 0;
426 txc->errcnt = 0; 527 txc->errcnt = 0;
427 txc->stbcnt = 0; 528 txc->stbcnt = 0;
529
428 write_sequnlock_irq(&xtime_lock); 530 write_sequnlock_irq(&xtime_lock);
429 531
430 txc->time.tv_sec = ts.tv_sec; 532 txc->time.tv_sec = ts.tv_sec;
@@ -440,6 +542,8 @@ adj_done:
440static int __init ntp_tick_adj_setup(char *str) 542static int __init ntp_tick_adj_setup(char *str)
441{ 543{
442 ntp_tick_adj = simple_strtol(str, NULL, 0); 544 ntp_tick_adj = simple_strtol(str, NULL, 0);
545 ntp_tick_adj <<= NTP_SCALE_SHIFT;
546
443 return 1; 547 return 1;
444} 548}
445 549
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 63e05d423a09..21a5ca849514 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -274,6 +274,21 @@ out_bc:
274} 274}
275 275
276/* 276/*
277 * Transfer the do_timer job away from a dying cpu.
278 *
279 * Called with interrupts disabled.
280 */
281static void tick_handover_do_timer(int *cpup)
282{
283 if (*cpup == tick_do_timer_cpu) {
284 int cpu = cpumask_first(cpu_online_mask);
285
286 tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
287 TICK_DO_TIMER_NONE;
288 }
289}
290
291/*
277 * Shutdown an event device on a given cpu: 292 * Shutdown an event device on a given cpu:
278 * 293 *
279 * This is called on a life CPU, when a CPU is dead. So we cannot 294 * This is called on a life CPU, when a CPU is dead. So we cannot
@@ -297,13 +312,6 @@ static void tick_shutdown(unsigned int *cpup)
297 clockevents_exchange_device(dev, NULL); 312 clockevents_exchange_device(dev, NULL);
298 td->evtdev = NULL; 313 td->evtdev = NULL;
299 } 314 }
300 /* Transfer the do_timer job away from this cpu */
301 if (*cpup == tick_do_timer_cpu) {
302 int cpu = cpumask_first(cpu_online_mask);
303
304 tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
305 TICK_DO_TIMER_NONE;
306 }
307 spin_unlock_irqrestore(&tick_device_lock, flags); 315 spin_unlock_irqrestore(&tick_device_lock, flags);
308} 316}
309 317
@@ -357,6 +365,10 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
357 tick_broadcast_oneshot_control(reason); 365 tick_broadcast_oneshot_control(reason);
358 break; 366 break;
359 367
368 case CLOCK_EVT_NOTIFY_CPU_DYING:
369 tick_handover_do_timer(dev);
370 break;
371
360 case CLOCK_EVT_NOTIFY_CPU_DEAD: 372 case CLOCK_EVT_NOTIFY_CPU_DEAD:
361 tick_shutdown_broadcast_oneshot(dev); 373 tick_shutdown_broadcast_oneshot(dev);
362 tick_shutdown_broadcast(dev); 374 tick_shutdown_broadcast(dev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1b6c05bd0d0a..d3f1ef4d5cbe 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,7 +134,7 @@ __setup("nohz=", setup_tick_nohz);
134 * value. We do this unconditionally on any cpu, as we don't know whether the 134 * value. We do this unconditionally on any cpu, as we don't know whether the
135 * cpu, which has the update task assigned is in a long sleep. 135 * cpu, which has the update task assigned is in a long sleep.
136 */ 136 */
137void tick_nohz_update_jiffies(void) 137static void tick_nohz_update_jiffies(void)
138{ 138{
139 int cpu = smp_processor_id(); 139 int cpu = smp_processor_id();
140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
new file mode 100644
index 000000000000..71e7f1a19156
--- /dev/null
+++ b/kernel/time/timecompare.c
@@ -0,0 +1,191 @@
1/*
2 * Copyright (C) 2009 Intel Corporation.
3 * Author: Patrick Ohly <patrick.ohly@intel.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20#include <linux/timecompare.h>
21#include <linux/module.h>
22#include <linux/math64.h>
23
24/*
25 * fixed point arithmetic scale factor for skew
26 *
27 * Usually one would measure skew in ppb (parts per billion, 1e9), but
28 * using a power of 2 (2^30) as the scale simplifies the math.
29 */
30#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30)
31
32ktime_t timecompare_transform(struct timecompare *sync,
33 u64 source_tstamp)
34{
35 u64 nsec;
36
37 nsec = source_tstamp + sync->offset;
38 nsec += (s64)(source_tstamp - sync->last_update) * sync->skew /
39 TIMECOMPARE_SKEW_RESOLUTION;
40
41 return ns_to_ktime(nsec);
42}
43EXPORT_SYMBOL(timecompare_transform);
44
45int timecompare_offset(struct timecompare *sync,
46 s64 *offset,
47 u64 *source_tstamp)
48{
49 u64 start_source = 0, end_source = 0;
50 struct {
51 s64 offset;
52 s64 duration_target;
53 } buffer[10], sample, *samples;
54 int counter = 0, i;
55 int used;
56 int index;
57 int num_samples = sync->num_samples;
58
59 if (num_samples > sizeof(buffer)/sizeof(buffer[0])) {
60 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
61 if (!samples) {
62 samples = buffer;
63 num_samples = sizeof(buffer)/sizeof(buffer[0]);
64 }
65 } else {
66 samples = buffer;
67 }
68
69 /* run until we have enough valid samples, but do not try forever */
70 i = 0;
71 counter = 0;
72 while (1) {
73 u64 ts;
74 ktime_t start, end;
75
76 start = sync->target();
77 ts = timecounter_read(sync->source);
78 end = sync->target();
79
80 if (!i)
81 start_source = ts;
82
83 /* ignore negative durations */
84 sample.duration_target = ktime_to_ns(ktime_sub(end, start));
85 if (sample.duration_target >= 0) {
86 /*
87 * assume symmetric delay to and from source:
88 * average target time corresponds to measured
89 * source time
90 */
91 sample.offset =
92 ktime_to_ns(ktime_add(end, start)) / 2 -
93 ts;
94
95 /* simple insertion sort based on duration */
96 index = counter - 1;
97 while (index >= 0) {
98 if (samples[index].duration_target <
99 sample.duration_target)
100 break;
101 samples[index + 1] = samples[index];
102 index--;
103 }
104 samples[index + 1] = sample;
105 counter++;
106 }
107
108 i++;
109 if (counter >= num_samples || i >= 100000) {
110 end_source = ts;
111 break;
112 }
113 }
114
115 *source_tstamp = (end_source + start_source) / 2;
116
117 /* remove outliers by only using 75% of the samples */
118 used = counter * 3 / 4;
119 if (!used)
120 used = counter;
121 if (used) {
122 /* calculate average */
123 s64 off = 0;
124 for (index = 0; index < used; index++)
125 off += samples[index].offset;
126 *offset = div_s64(off, used);
127 }
128
129 if (samples && samples != buffer)
130 kfree(samples);
131
132 return used;
133}
134EXPORT_SYMBOL(timecompare_offset);
135
136void __timecompare_update(struct timecompare *sync,
137 u64 source_tstamp)
138{
139 s64 offset;
140 u64 average_time;
141
142 if (!timecompare_offset(sync, &offset, &average_time))
143 return;
144
145 if (!sync->last_update) {
146 sync->last_update = average_time;
147 sync->offset = offset;
148 sync->skew = 0;
149 } else {
150 s64 delta_nsec = average_time - sync->last_update;
151
152 /* avoid division by negative or small deltas */
153 if (delta_nsec >= 10000) {
154 s64 delta_offset_nsec = offset - sync->offset;
155 s64 skew; /* delta_offset_nsec *
156 TIMECOMPARE_SKEW_RESOLUTION /
157 delta_nsec */
158 u64 divisor;
159
160 /* div_s64() is limited to 32 bit divisor */
161 skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION;
162 divisor = delta_nsec;
163 while (unlikely(divisor >= ((s64)1) << 32)) {
164 /* divide both by 2; beware, right shift
165 of a negative value is implementation-
166 defined and can only be used for
167 the positive divisor */
168 skew = div_s64(skew, 2);
169 divisor >>= 1;
170 }
171 skew = div_s64(skew, divisor);
172
173 /*
174 * Calculate new overall skew as 4/16 the
175 * old value and 12/16 the new one. This is
176 * a rather arbitrary tradeoff between
177 * only using the latest measurement (0/16 and
178 * 16/16) and even more weight on past measurements.
179 */
180#define TIMECOMPARE_NEW_SKEW_PER_16 12
181 sync->skew =
182 div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) *
183 sync->skew +
184 TIMECOMPARE_NEW_SKEW_PER_16 * skew,
185 16);
186 sync->last_update = average_time;
187 sync->offset = offset;
188 }
189 }
190}
191EXPORT_SYMBOL(__timecompare_update);
diff --git a/kernel/timer.c b/kernel/timer.c
index dee3f641a7a7..b4555568b4e4 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -491,14 +491,18 @@ static inline void debug_timer_free(struct timer_list *timer)
491 debug_object_free(timer, &timer_debug_descr); 491 debug_object_free(timer, &timer_debug_descr);
492} 492}
493 493
494static void __init_timer(struct timer_list *timer); 494static void __init_timer(struct timer_list *timer,
495 const char *name,
496 struct lock_class_key *key);
495 497
496void init_timer_on_stack(struct timer_list *timer) 498void init_timer_on_stack_key(struct timer_list *timer,
499 const char *name,
500 struct lock_class_key *key)
497{ 501{
498 debug_object_init_on_stack(timer, &timer_debug_descr); 502 debug_object_init_on_stack(timer, &timer_debug_descr);
499 __init_timer(timer); 503 __init_timer(timer, name, key);
500} 504}
501EXPORT_SYMBOL_GPL(init_timer_on_stack); 505EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
502 506
503void destroy_timer_on_stack(struct timer_list *timer) 507void destroy_timer_on_stack(struct timer_list *timer)
504{ 508{
@@ -512,7 +516,9 @@ static inline void debug_timer_activate(struct timer_list *timer) { }
512static inline void debug_timer_deactivate(struct timer_list *timer) { } 516static inline void debug_timer_deactivate(struct timer_list *timer) { }
513#endif 517#endif
514 518
515static void __init_timer(struct timer_list *timer) 519static void __init_timer(struct timer_list *timer,
520 const char *name,
521 struct lock_class_key *key)
516{ 522{
517 timer->entry.next = NULL; 523 timer->entry.next = NULL;
518 timer->base = __raw_get_cpu_var(tvec_bases); 524 timer->base = __raw_get_cpu_var(tvec_bases);
@@ -521,6 +527,7 @@ static void __init_timer(struct timer_list *timer)
521 timer->start_pid = -1; 527 timer->start_pid = -1;
522 memset(timer->start_comm, 0, TASK_COMM_LEN); 528 memset(timer->start_comm, 0, TASK_COMM_LEN);
523#endif 529#endif
530 lockdep_init_map(&timer->lockdep_map, name, key, 0);
524} 531}
525 532
526/** 533/**
@@ -530,19 +537,23 @@ static void __init_timer(struct timer_list *timer)
530 * init_timer() must be done to a timer prior calling *any* of the 537 * init_timer() must be done to a timer prior calling *any* of the
531 * other timer functions. 538 * other timer functions.
532 */ 539 */
533void init_timer(struct timer_list *timer) 540void init_timer_key(struct timer_list *timer,
541 const char *name,
542 struct lock_class_key *key)
534{ 543{
535 debug_timer_init(timer); 544 debug_timer_init(timer);
536 __init_timer(timer); 545 __init_timer(timer, name, key);
537} 546}
538EXPORT_SYMBOL(init_timer); 547EXPORT_SYMBOL(init_timer_key);
539 548
540void init_timer_deferrable(struct timer_list *timer) 549void init_timer_deferrable_key(struct timer_list *timer,
550 const char *name,
551 struct lock_class_key *key)
541{ 552{
542 init_timer(timer); 553 init_timer_key(timer, name, key);
543 timer_set_deferrable(timer); 554 timer_set_deferrable(timer);
544} 555}
545EXPORT_SYMBOL(init_timer_deferrable); 556EXPORT_SYMBOL(init_timer_deferrable_key);
546 557
547static inline void detach_timer(struct timer_list *timer, 558static inline void detach_timer(struct timer_list *timer,
548 int clear_pending) 559 int clear_pending)
@@ -589,11 +600,14 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
589 } 600 }
590} 601}
591 602
592int __mod_timer(struct timer_list *timer, unsigned long expires) 603static inline int
604__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
593{ 605{
594 struct tvec_base *base, *new_base; 606 struct tvec_base *base, *new_base;
595 unsigned long flags; 607 unsigned long flags;
596 int ret = 0; 608 int ret;
609
610 ret = 0;
597 611
598 timer_stats_timer_set_start_info(timer); 612 timer_stats_timer_set_start_info(timer);
599 BUG_ON(!timer->function); 613 BUG_ON(!timer->function);
@@ -603,6 +617,9 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
603 if (timer_pending(timer)) { 617 if (timer_pending(timer)) {
604 detach_timer(timer, 0); 618 detach_timer(timer, 0);
605 ret = 1; 619 ret = 1;
620 } else {
621 if (pending_only)
622 goto out_unlock;
606 } 623 }
607 624
608 debug_timer_activate(timer); 625 debug_timer_activate(timer);
@@ -629,42 +646,28 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
629 646
630 timer->expires = expires; 647 timer->expires = expires;
631 internal_add_timer(base, timer); 648 internal_add_timer(base, timer);
649
650out_unlock:
632 spin_unlock_irqrestore(&base->lock, flags); 651 spin_unlock_irqrestore(&base->lock, flags);
633 652
634 return ret; 653 return ret;
635} 654}
636 655
637EXPORT_SYMBOL(__mod_timer);
638
639/** 656/**
640 * add_timer_on - start a timer on a particular CPU 657 * mod_timer_pending - modify a pending timer's timeout
641 * @timer: the timer to be added 658 * @timer: the pending timer to be modified
642 * @cpu: the CPU to start it on 659 * @expires: new timeout in jiffies
643 * 660 *
644 * This is not very scalable on SMP. Double adds are not possible. 661 * mod_timer_pending() is the same for pending timers as mod_timer(),
662 * but will not re-activate and modify already deleted timers.
663 *
664 * It is useful for unserialized use of timers.
645 */ 665 */
646void add_timer_on(struct timer_list *timer, int cpu) 666int mod_timer_pending(struct timer_list *timer, unsigned long expires)
647{ 667{
648 struct tvec_base *base = per_cpu(tvec_bases, cpu); 668 return __mod_timer(timer, expires, true);
649 unsigned long flags;
650
651 timer_stats_timer_set_start_info(timer);
652 BUG_ON(timer_pending(timer) || !timer->function);
653 spin_lock_irqsave(&base->lock, flags);
654 timer_set_base(timer, base);
655 debug_timer_activate(timer);
656 internal_add_timer(base, timer);
657 /*
658 * Check whether the other CPU is idle and needs to be
659 * triggered to reevaluate the timer wheel when nohz is
660 * active. We are protected against the other CPU fiddling
661 * with the timer by holding the timer base lock. This also
662 * makes sure that a CPU on the way to idle can not evaluate
663 * the timer wheel.
664 */
665 wake_up_idle_cpu(cpu);
666 spin_unlock_irqrestore(&base->lock, flags);
667} 669}
670EXPORT_SYMBOL(mod_timer_pending);
668 671
669/** 672/**
670 * mod_timer - modify a timer's timeout 673 * mod_timer - modify a timer's timeout
@@ -688,9 +691,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
688 */ 691 */
689int mod_timer(struct timer_list *timer, unsigned long expires) 692int mod_timer(struct timer_list *timer, unsigned long expires)
690{ 693{
691 BUG_ON(!timer->function);
692
693 timer_stats_timer_set_start_info(timer);
694 /* 694 /*
695 * This is a common optimization triggered by the 695 * This is a common optimization triggered by the
696 * networking code - if the timer is re-modified 696 * networking code - if the timer is re-modified
@@ -699,12 +699,62 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
699 if (timer->expires == expires && timer_pending(timer)) 699 if (timer->expires == expires && timer_pending(timer))
700 return 1; 700 return 1;
701 701
702 return __mod_timer(timer, expires); 702 return __mod_timer(timer, expires, false);
703} 703}
704
705EXPORT_SYMBOL(mod_timer); 704EXPORT_SYMBOL(mod_timer);
706 705
707/** 706/**
707 * add_timer - start a timer
708 * @timer: the timer to be added
709 *
710 * The kernel will do a ->function(->data) callback from the
711 * timer interrupt at the ->expires point in the future. The
712 * current time is 'jiffies'.
713 *
714 * The timer's ->expires, ->function (and if the handler uses it, ->data)
715 * fields must be set prior to calling this function.
716 *
717 * Timers with an ->expires field in the past will be executed in the next
718 * timer tick.
719 */
720void add_timer(struct timer_list *timer)
721{
722 BUG_ON(timer_pending(timer));
723 mod_timer(timer, timer->expires);
724}
725EXPORT_SYMBOL(add_timer);
726
727/**
728 * add_timer_on - start a timer on a particular CPU
729 * @timer: the timer to be added
730 * @cpu: the CPU to start it on
731 *
732 * This is not very scalable on SMP. Double adds are not possible.
733 */
734void add_timer_on(struct timer_list *timer, int cpu)
735{
736 struct tvec_base *base = per_cpu(tvec_bases, cpu);
737 unsigned long flags;
738
739 timer_stats_timer_set_start_info(timer);
740 BUG_ON(timer_pending(timer) || !timer->function);
741 spin_lock_irqsave(&base->lock, flags);
742 timer_set_base(timer, base);
743 debug_timer_activate(timer);
744 internal_add_timer(base, timer);
745 /*
746 * Check whether the other CPU is idle and needs to be
747 * triggered to reevaluate the timer wheel when nohz is
748 * active. We are protected against the other CPU fiddling
749 * with the timer by holding the timer base lock. This also
750 * makes sure that a CPU on the way to idle can not evaluate
751 * the timer wheel.
752 */
753 wake_up_idle_cpu(cpu);
754 spin_unlock_irqrestore(&base->lock, flags);
755}
756
757/**
708 * del_timer - deactivate a timer. 758 * del_timer - deactivate a timer.
709 * @timer: the timer to be deactivated 759 * @timer: the timer to be deactivated
710 * 760 *
@@ -733,7 +783,6 @@ int del_timer(struct timer_list *timer)
733 783
734 return ret; 784 return ret;
735} 785}
736
737EXPORT_SYMBOL(del_timer); 786EXPORT_SYMBOL(del_timer);
738 787
739#ifdef CONFIG_SMP 788#ifdef CONFIG_SMP
@@ -767,7 +816,6 @@ out:
767 816
768 return ret; 817 return ret;
769} 818}
770
771EXPORT_SYMBOL(try_to_del_timer_sync); 819EXPORT_SYMBOL(try_to_del_timer_sync);
772 820
773/** 821/**
@@ -789,6 +837,15 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
789 */ 837 */
790int del_timer_sync(struct timer_list *timer) 838int del_timer_sync(struct timer_list *timer)
791{ 839{
840#ifdef CONFIG_LOCKDEP
841 unsigned long flags;
842
843 local_irq_save(flags);
844 lock_map_acquire(&timer->lockdep_map);
845 lock_map_release(&timer->lockdep_map);
846 local_irq_restore(flags);
847#endif
848
792 for (;;) { 849 for (;;) {
793 int ret = try_to_del_timer_sync(timer); 850 int ret = try_to_del_timer_sync(timer);
794 if (ret >= 0) 851 if (ret >= 0)
@@ -796,7 +853,6 @@ int del_timer_sync(struct timer_list *timer)
796 cpu_relax(); 853 cpu_relax();
797 } 854 }
798} 855}
799
800EXPORT_SYMBOL(del_timer_sync); 856EXPORT_SYMBOL(del_timer_sync);
801#endif 857#endif
802 858
@@ -861,10 +917,36 @@ static inline void __run_timers(struct tvec_base *base)
861 917
862 set_running_timer(base, timer); 918 set_running_timer(base, timer);
863 detach_timer(timer, 1); 919 detach_timer(timer, 1);
920
864 spin_unlock_irq(&base->lock); 921 spin_unlock_irq(&base->lock);
865 { 922 {
866 int preempt_count = preempt_count(); 923 int preempt_count = preempt_count();
924
925#ifdef CONFIG_LOCKDEP
926 /*
927 * It is permissible to free the timer from
928 * inside the function that is called from
929 * it; we need to take this into account for
930 * lockdep too. To avoid bogus "held lock
931 * freed" warnings as well as problems when
932 * looking into timer->lockdep_map, make a
933 * copy and use that here.
934 */
935 struct lockdep_map lockdep_map =
936 timer->lockdep_map;
937#endif
938 /*
939 * Couple the lock chain with the lock chain at
940 * del_timer_sync() by acquiring the lock_map
941 * around the fn() call here and in
942 * del_timer_sync().
943 */
944 lock_map_acquire(&lockdep_map);
945
867 fn(data); 946 fn(data);
947
948 lock_map_release(&lockdep_map);
949
868 if (preempt_count != preempt_count()) { 950 if (preempt_count != preempt_count()) {
869 printk(KERN_ERR "huh, entered %p " 951 printk(KERN_ERR "huh, entered %p "
870 "with preempt_count %08x, exited" 952 "with preempt_count %08x, exited"
@@ -1129,7 +1211,7 @@ void do_timer(unsigned long ticks)
1129 * For backwards compatibility? This can be done in libc so Alpha 1211 * For backwards compatibility? This can be done in libc so Alpha
1130 * and all newer ports shouldn't need it. 1212 * and all newer ports shouldn't need it.
1131 */ 1213 */
1132asmlinkage unsigned long sys_alarm(unsigned int seconds) 1214SYSCALL_DEFINE1(alarm, unsigned int, seconds)
1133{ 1215{
1134 return alarm_setitimer(seconds); 1216 return alarm_setitimer(seconds);
1135} 1217}
@@ -1152,7 +1234,7 @@ asmlinkage unsigned long sys_alarm(unsigned int seconds)
1152 * 1234 *
1153 * This is SMP safe as current->tgid does not change. 1235 * This is SMP safe as current->tgid does not change.
1154 */ 1236 */
1155asmlinkage long sys_getpid(void) 1237SYSCALL_DEFINE0(getpid)
1156{ 1238{
1157 return task_tgid_vnr(current); 1239 return task_tgid_vnr(current);
1158} 1240}
@@ -1163,7 +1245,7 @@ asmlinkage long sys_getpid(void)
1163 * value of ->real_parent under rcu_read_lock(), see 1245 * value of ->real_parent under rcu_read_lock(), see
1164 * release_task()->call_rcu(delayed_put_task_struct). 1246 * release_task()->call_rcu(delayed_put_task_struct).
1165 */ 1247 */
1166asmlinkage long sys_getppid(void) 1248SYSCALL_DEFINE0(getppid)
1167{ 1249{
1168 int pid; 1250 int pid;
1169 1251
@@ -1174,25 +1256,25 @@ asmlinkage long sys_getppid(void)
1174 return pid; 1256 return pid;
1175} 1257}
1176 1258
1177asmlinkage long sys_getuid(void) 1259SYSCALL_DEFINE0(getuid)
1178{ 1260{
1179 /* Only we change this so SMP safe */ 1261 /* Only we change this so SMP safe */
1180 return current_uid(); 1262 return current_uid();
1181} 1263}
1182 1264
1183asmlinkage long sys_geteuid(void) 1265SYSCALL_DEFINE0(geteuid)
1184{ 1266{
1185 /* Only we change this so SMP safe */ 1267 /* Only we change this so SMP safe */
1186 return current_euid(); 1268 return current_euid();
1187} 1269}
1188 1270
1189asmlinkage long sys_getgid(void) 1271SYSCALL_DEFINE0(getgid)
1190{ 1272{
1191 /* Only we change this so SMP safe */ 1273 /* Only we change this so SMP safe */
1192 return current_gid(); 1274 return current_gid();
1193} 1275}
1194 1276
1195asmlinkage long sys_getegid(void) 1277SYSCALL_DEFINE0(getegid)
1196{ 1278{
1197 /* Only we change this so SMP safe */ 1279 /* Only we change this so SMP safe */
1198 return current_egid(); 1280 return current_egid();
@@ -1268,7 +1350,7 @@ signed long __sched schedule_timeout(signed long timeout)
1268 expire = timeout + jiffies; 1350 expire = timeout + jiffies;
1269 1351
1270 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); 1352 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1271 __mod_timer(&timer, expire); 1353 __mod_timer(&timer, expire, false);
1272 schedule(); 1354 schedule();
1273 del_singleshot_timer_sync(&timer); 1355 del_singleshot_timer_sync(&timer);
1274 1356
@@ -1308,7 +1390,7 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1308EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1390EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1309 1391
1310/* Thread ID - the internal kernel "pid" */ 1392/* Thread ID - the internal kernel "pid" */
1311asmlinkage long sys_gettid(void) 1393SYSCALL_DEFINE0(gettid)
1312{ 1394{
1313 return task_pid_vnr(current); 1395 return task_pid_vnr(current);
1314} 1396}
@@ -1400,7 +1482,7 @@ out:
1400 return 0; 1482 return 0;
1401} 1483}
1402 1484
1403asmlinkage long sys_sysinfo(struct sysinfo __user *info) 1485SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
1404{ 1486{
1405 struct sysinfo val; 1487 struct sysinfo val;
1406 1488
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e2a4ff6fc3a6..2246141bda4d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT
9config NOP_TRACER 9config NOP_TRACER
10 bool 10 bool
11 11
12config HAVE_FTRACE_NMI_ENTER
13 bool
14
12config HAVE_FUNCTION_TRACER 15config HAVE_FUNCTION_TRACER
13 bool 16 bool
14 17
@@ -31,12 +34,20 @@ config HAVE_FTRACE_MCOUNT_RECORD
31config HAVE_HW_BRANCH_TRACER 34config HAVE_HW_BRANCH_TRACER
32 bool 35 bool
33 36
37config HAVE_FTRACE_SYSCALLS
38 bool
39
34config TRACER_MAX_TRACE 40config TRACER_MAX_TRACE
35 bool 41 bool
36 42
37config RING_BUFFER 43config RING_BUFFER
38 bool 44 bool
39 45
46config FTRACE_NMI_ENTER
47 bool
48 depends on HAVE_FTRACE_NMI_ENTER
49 default y
50
40config TRACING 51config TRACING
41 bool 52 bool
42 select DEBUG_FS 53 select DEBUG_FS
@@ -44,14 +55,31 @@ config TRACING
44 select STACKTRACE if STACKTRACE_SUPPORT 55 select STACKTRACE if STACKTRACE_SUPPORT
45 select TRACEPOINTS 56 select TRACEPOINTS
46 select NOP_TRACER 57 select NOP_TRACER
58 select BINARY_PRINTF
59
60#
61# Minimum requirements an architecture has to meet for us to
62# be able to offer generic tracing facilities:
63#
64config TRACING_SUPPORT
65 bool
66 # PPC32 has no irqflags tracing support, but it can use most of the
67 # tracers anyway, they were tested to build and work. Note that new
68 # exceptions to this list aren't welcomed, better implement the
69 # irqflags tracing for your architecture.
70 depends on TRACE_IRQFLAGS_SUPPORT || PPC32
71 depends on STACKTRACE_SUPPORT
72 default y
73
74if TRACING_SUPPORT
47 75
48menu "Tracers" 76menu "Tracers"
49 77
50config FUNCTION_TRACER 78config FUNCTION_TRACER
51 bool "Kernel Function Tracer" 79 bool "Kernel Function Tracer"
52 depends on HAVE_FUNCTION_TRACER 80 depends on HAVE_FUNCTION_TRACER
53 depends on DEBUG_KERNEL
54 select FRAME_POINTER 81 select FRAME_POINTER
82 select KALLSYMS
55 select TRACING 83 select TRACING
56 select CONTEXT_SWITCH_TRACER 84 select CONTEXT_SWITCH_TRACER
57 help 85 help
@@ -71,18 +99,16 @@ config FUNCTION_GRAPH_TRACER
71 help 99 help
72 Enable the kernel to trace a function at both its return 100 Enable the kernel to trace a function at both its return
73 and its entry. 101 and its entry.
74 It's first purpose is to trace the duration of functions and 102 Its first purpose is to trace the duration of functions and
75 draw a call graph for each thread with some informations like 103 draw a call graph for each thread with some information like
76 the return value. 104 the return value. This is done by setting the current return
77 This is done by setting the current return address on the current 105 address on the current task structure into a stack of calls.
78 task structure into a stack of calls.
79 106
80config IRQSOFF_TRACER 107config IRQSOFF_TRACER
81 bool "Interrupts-off Latency Tracer" 108 bool "Interrupts-off Latency Tracer"
82 default n 109 default n
83 depends on TRACE_IRQFLAGS_SUPPORT 110 depends on TRACE_IRQFLAGS_SUPPORT
84 depends on GENERIC_TIME 111 depends on GENERIC_TIME
85 depends on DEBUG_KERNEL
86 select TRACE_IRQFLAGS 112 select TRACE_IRQFLAGS
87 select TRACING 113 select TRACING
88 select TRACER_MAX_TRACE 114 select TRACER_MAX_TRACE
@@ -105,7 +131,6 @@ config PREEMPT_TRACER
105 default n 131 default n
106 depends on GENERIC_TIME 132 depends on GENERIC_TIME
107 depends on PREEMPT 133 depends on PREEMPT
108 depends on DEBUG_KERNEL
109 select TRACING 134 select TRACING
110 select TRACER_MAX_TRACE 135 select TRACER_MAX_TRACE
111 help 136 help
@@ -126,13 +151,13 @@ config SYSPROF_TRACER
126 bool "Sysprof Tracer" 151 bool "Sysprof Tracer"
127 depends on X86 152 depends on X86
128 select TRACING 153 select TRACING
154 select CONTEXT_SWITCH_TRACER
129 help 155 help
130 This tracer provides the trace needed by the 'Sysprof' userspace 156 This tracer provides the trace needed by the 'Sysprof' userspace
131 tool. 157 tool.
132 158
133config SCHED_TRACER 159config SCHED_TRACER
134 bool "Scheduling Latency Tracer" 160 bool "Scheduling Latency Tracer"
135 depends on DEBUG_KERNEL
136 select TRACING 161 select TRACING
137 select CONTEXT_SWITCH_TRACER 162 select CONTEXT_SWITCH_TRACER
138 select TRACER_MAX_TRACE 163 select TRACER_MAX_TRACE
@@ -142,16 +167,30 @@ config SCHED_TRACER
142 167
143config CONTEXT_SWITCH_TRACER 168config CONTEXT_SWITCH_TRACER
144 bool "Trace process context switches" 169 bool "Trace process context switches"
145 depends on DEBUG_KERNEL
146 select TRACING 170 select TRACING
147 select MARKERS 171 select MARKERS
148 help 172 help
149 This tracer gets called from the context switch and records 173 This tracer gets called from the context switch and records
150 all switching of tasks. 174 all switching of tasks.
151 175
176config EVENT_TRACER
177 bool "Trace various events in the kernel"
178 select TRACING
179 help
180 This tracer hooks to various trace points in the kernel
181 allowing the user to pick and choose which trace point they
182 want to trace.
183
184config FTRACE_SYSCALLS
185 bool "Trace syscalls"
186 depends on HAVE_FTRACE_SYSCALLS
187 select TRACING
188 select KALLSYMS
189 help
190 Basic tracer to catch the syscall entry and exit events.
191
152config BOOT_TRACER 192config BOOT_TRACER
153 bool "Trace boot initcalls" 193 bool "Trace boot initcalls"
154 depends on DEBUG_KERNEL
155 select TRACING 194 select TRACING
156 select CONTEXT_SWITCH_TRACER 195 select CONTEXT_SWITCH_TRACER
157 help 196 help
@@ -164,13 +203,11 @@ config BOOT_TRACER
164 representation of the delays during initcalls - but the raw 203 representation of the delays during initcalls - but the raw
165 /debug/tracing/trace text output is readable too. 204 /debug/tracing/trace text output is readable too.
166 205
167 ( Note that tracing self tests can't be enabled if this tracer is 206 You must pass in ftrace=initcall to the kernel command line
168 selected, because the self-tests are an initcall as well and that 207 to enable this on bootup.
169 would invalidate the boot trace. )
170 208
171config TRACE_BRANCH_PROFILING 209config TRACE_BRANCH_PROFILING
172 bool "Trace likely/unlikely profiler" 210 bool "Trace likely/unlikely profiler"
173 depends on DEBUG_KERNEL
174 select TRACING 211 select TRACING
175 help 212 help
176 This tracer profiles all the likely and unlikely macros 213 This tracer profiles all the likely and unlikely macros
@@ -223,7 +260,6 @@ config BRANCH_TRACER
223 260
224config POWER_TRACER 261config POWER_TRACER
225 bool "Trace power consumption behavior" 262 bool "Trace power consumption behavior"
226 depends on DEBUG_KERNEL
227 depends on X86 263 depends on X86
228 select TRACING 264 select TRACING
229 help 265 help
@@ -235,9 +271,9 @@ config POWER_TRACER
235config STACK_TRACER 271config STACK_TRACER
236 bool "Trace max stack" 272 bool "Trace max stack"
237 depends on HAVE_FUNCTION_TRACER 273 depends on HAVE_FUNCTION_TRACER
238 depends on DEBUG_KERNEL
239 select FUNCTION_TRACER 274 select FUNCTION_TRACER
240 select STACKTRACE 275 select STACKTRACE
276 select KALLSYMS
241 help 277 help
242 This special tracer records the maximum stack footprint of the 278 This special tracer records the maximum stack footprint of the
243 kernel and displays it in debugfs/tracing/stack_trace. 279 kernel and displays it in debugfs/tracing/stack_trace.
@@ -264,11 +300,66 @@ config HW_BRANCH_TRACER
264 This tracer records all branches on the system in a circular 300 This tracer records all branches on the system in a circular
265 buffer giving access to the last N branches for each cpu. 301 buffer giving access to the last N branches for each cpu.
266 302
303config KMEMTRACE
304 bool "Trace SLAB allocations"
305 select TRACING
306 help
307 kmemtrace provides tracing for slab allocator functions, such as
308 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc. Collected
309 data is then fed to the userspace application in order to analyse
310 allocation hotspots, internal fragmentation and so on, making it
311 possible to see how well an allocator performs, as well as debug
312 and profile kernel code.
313
314 This requires a userspace application to use. See
315 Documentation/vm/kmemtrace.txt for more information.
316
317 Saying Y will make the kernel somewhat larger and slower. However,
318 if you disable kmemtrace at run-time or boot-time, the performance
319 impact is minimal (depending on the arch the kernel is built for).
320
321 If unsure, say N.
322
323config WORKQUEUE_TRACER
324 bool "Trace workqueues"
325 select TRACING
326 help
327 The workqueue tracer provides some statistical information
328 about each cpu workqueue thread such as the number of the
329 works inserted and executed since their creation. It can help
330 to evaluate the amount of work each of them has to perform.
331 For example it can help a developer to decide whether he should
332 choose a per cpu workqueue instead of a singlethreaded one.
333
334config BLK_DEV_IO_TRACE
335 bool "Support for tracing block io actions"
336 depends on SYSFS
337 depends on BLOCK
338 select RELAY
339 select DEBUG_FS
340 select TRACEPOINTS
341 select TRACING
342 select STACKTRACE
343 help
344 Say Y here if you want to be able to trace the block layer actions
345 on a given queue. Tracing allows you to see any traffic happening
346 on a block device queue. For more information (and the userspace
347 support tools needed), fetch the blktrace tools from:
348
349 git://git.kernel.dk/blktrace.git
350
351 Tracing also is possible using the ftrace interface, e.g.:
352
353 echo 1 > /sys/block/sda/sda1/trace/enable
354 echo blk > /sys/kernel/debug/tracing/current_tracer
355 cat /sys/kernel/debug/tracing/trace_pipe
356
357 If unsure, say N.
358
267config DYNAMIC_FTRACE 359config DYNAMIC_FTRACE
268 bool "enable/disable ftrace tracepoints dynamically" 360 bool "enable/disable ftrace tracepoints dynamically"
269 depends on FUNCTION_TRACER 361 depends on FUNCTION_TRACER
270 depends on HAVE_DYNAMIC_FTRACE 362 depends on HAVE_DYNAMIC_FTRACE
271 depends on DEBUG_KERNEL
272 default y 363 default y
273 help 364 help
274 This option will modify all the calls to ftrace dynamically 365 This option will modify all the calls to ftrace dynamically
@@ -294,7 +385,7 @@ config FTRACE_SELFTEST
294 385
295config FTRACE_STARTUP_TEST 386config FTRACE_STARTUP_TEST
296 bool "Perform a startup test on ftrace" 387 bool "Perform a startup test on ftrace"
297 depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER 388 depends on TRACING
298 select FTRACE_SELFTEST 389 select FTRACE_SELFTEST
299 help 390 help
300 This option performs a series of startup tests on ftrace. On bootup 391 This option performs a series of startup tests on ftrace. On bootup
@@ -302,4 +393,30 @@ config FTRACE_STARTUP_TEST
302 functioning properly. It will do tests on all the configured 393 functioning properly. It will do tests on all the configured
303 tracers of ftrace. 394 tracers of ftrace.
304 395
396config MMIOTRACE
397 bool "Memory mapped IO tracing"
398 depends on HAVE_MMIOTRACE_SUPPORT && PCI
399 select TRACING
400 help
401 Mmiotrace traces Memory Mapped I/O access and is meant for
402 debugging and reverse engineering. It is called from the ioremap
403 implementation and works via page faults. Tracing is disabled by
404 default and can be enabled at run-time.
405
406 See Documentation/tracers/mmiotrace.txt.
407 If you are not helping to develop drivers, say N.
408
409config MMIOTRACE_TEST
410 tristate "Test module for mmiotrace"
411 depends on MMIOTRACE && m
412 help
413 This is a dumb module for testing mmiotrace. It is very dangerous
414 as it will write garbage to IO memory starting at a given address.
415 However, it should be safe to use on e.g. an unused portion of VRAM.
416
417 Say N, unless you absolutely know what you are doing.
418
305endmenu 419endmenu
420
421endif # TRACING_SUPPORT
422
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 349d5a93653f..2630f5121ec1 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -19,6 +19,10 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
19obj-$(CONFIG_RING_BUFFER) += ring_buffer.o 19obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
20 20
21obj-$(CONFIG_TRACING) += trace.o 21obj-$(CONFIG_TRACING) += trace.o
22obj-$(CONFIG_TRACING) += trace_clock.o
23obj-$(CONFIG_TRACING) += trace_output.o
24obj-$(CONFIG_TRACING) += trace_stat.o
25obj-$(CONFIG_TRACING) += trace_printk.o
22obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o 26obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
23obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o 27obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
24obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o 28obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
@@ -33,5 +37,14 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
33obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 37obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
34obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o 38obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
35obj-$(CONFIG_POWER_TRACER) += trace_power.o 39obj-$(CONFIG_POWER_TRACER) += trace_power.o
40obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
41obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
42obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
43obj-$(CONFIG_EVENT_TRACER) += trace_events.o
44obj-$(CONFIG_EVENT_TRACER) += events.o
45obj-$(CONFIG_EVENT_TRACER) += trace_export.o
46obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
47obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
48obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o
36 49
37libftrace-y := ftrace.o 50libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
new file mode 100644
index 000000000000..b32ff446c3fb
--- /dev/null
+++ b/kernel/trace/blktrace.c
@@ -0,0 +1,1550 @@
1/*
2 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 *
17 */
18#include <linux/kernel.h>
19#include <linux/blkdev.h>
20#include <linux/blktrace_api.h>
21#include <linux/percpu.h>
22#include <linux/init.h>
23#include <linux/mutex.h>
24#include <linux/debugfs.h>
25#include <linux/time.h>
26#include <trace/block.h>
27#include <linux/uaccess.h>
28#include "trace_output.h"
29
30static unsigned int blktrace_seq __read_mostly = 1;
31
32static struct trace_array *blk_tr;
33static bool blk_tracer_enabled __read_mostly;
34
35/* Select an alternative, minimalistic output instead of the original one */
36#define TRACE_BLK_OPT_CLASSIC 0x1
37
38static struct tracer_opt blk_tracer_opts[] = {
39 /* Default disable the minimalistic output */
40 { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
41 { }
42};
43
44static struct tracer_flags blk_tracer_flags = {
45 .val = 0,
46 .opts = blk_tracer_opts,
47};
48
49/* Global reference count of probes */
50static atomic_t blk_probes_ref = ATOMIC_INIT(0);
51
52static void blk_register_tracepoints(void);
53static void blk_unregister_tracepoints(void);
54
55/*
56 * Send out a notify message.
57 */
58static void trace_note(struct blk_trace *bt, pid_t pid, int action,
59 const void *data, size_t len)
60{
61 struct blk_io_trace *t;
62 struct ring_buffer_event *event = NULL;
63 int pc = 0;
64 int cpu = smp_processor_id();
65 bool blk_tracer = blk_tracer_enabled;
66
67 if (blk_tracer) {
68 pc = preempt_count();
69 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
70 sizeof(*t) + len,
71 0, pc);
72 if (!event)
73 return;
74 t = ring_buffer_event_data(event);
75 goto record_it;
76 }
77
78 if (!bt->rchan)
79 return;
80
81 t = relay_reserve(bt->rchan, sizeof(*t) + len);
82 if (t) {
83 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
84 t->time = ktime_to_ns(ktime_get());
85record_it:
86 t->device = bt->dev;
87 t->action = action;
88 t->pid = pid;
89 t->cpu = cpu;
90 t->pdu_len = len;
91 memcpy((void *) t + sizeof(*t), data, len);
92
93 if (blk_tracer)
94 trace_buffer_unlock_commit(blk_tr, event, 0, pc);
95 }
96}
97
98/*
99 * Send out a notify for this process, if we haven't done so since a trace
100 * started
101 */
102static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
103{
104 tsk->btrace_seq = blktrace_seq;
105 trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
106}
107
108static void trace_note_time(struct blk_trace *bt)
109{
110 struct timespec now;
111 unsigned long flags;
112 u32 words[2];
113
114 getnstimeofday(&now);
115 words[0] = now.tv_sec;
116 words[1] = now.tv_nsec;
117
118 local_irq_save(flags);
119 trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
120 local_irq_restore(flags);
121}
122
123void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
124{
125 int n;
126 va_list args;
127 unsigned long flags;
128 char *buf;
129
130 if (unlikely(bt->trace_state != Blktrace_running &&
131 !blk_tracer_enabled))
132 return;
133
134 local_irq_save(flags);
135 buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
136 va_start(args, fmt);
137 n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
138 va_end(args);
139
140 trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
141 local_irq_restore(flags);
142}
143EXPORT_SYMBOL_GPL(__trace_note_message);
144
145static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
146 pid_t pid)
147{
148 if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
149 return 1;
150 if (sector < bt->start_lba || sector > bt->end_lba)
151 return 1;
152 if (bt->pid && pid != bt->pid)
153 return 1;
154
155 return 0;
156}
157
158/*
159 * Data direction bit lookup
160 */
161static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
162 BLK_TC_ACT(BLK_TC_WRITE) };
163
/*
 * Take the BIO_RW_<name> bit of @rw and shift it to the position of the
 * BLK_TC_<name> category bit above BLK_TC_SHIFT.  @rw is parenthesized so
 * that an expression argument keeps its meaning.
 * The ilog2() calls fall out because they're constant.
 */
#define MASK_TC_BIT(rw, __name) (((rw) & (1 << BIO_RW_ ## __name)) << \
	 (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name))
167
168/*
169 * The worker for the various blk_add_trace*() types. Fills out a
170 * blk_io_trace structure and places it in a per-cpu subbuffer.
171 */
172static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
173 int rw, u32 what, int error, int pdu_len, void *pdu_data)
174{
175 struct task_struct *tsk = current;
176 struct ring_buffer_event *event = NULL;
177 struct blk_io_trace *t;
178 unsigned long flags = 0;
179 unsigned long *sequence;
180 pid_t pid;
181 int cpu, pc = 0;
182 bool blk_tracer = blk_tracer_enabled;
183
184 if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
185 return;
186
187 what |= ddir_act[rw & WRITE];
188 what |= MASK_TC_BIT(rw, BARRIER);
189 what |= MASK_TC_BIT(rw, SYNCIO);
190 what |= MASK_TC_BIT(rw, AHEAD);
191 what |= MASK_TC_BIT(rw, META);
192 what |= MASK_TC_BIT(rw, DISCARD);
193
194 pid = tsk->pid;
195 if (unlikely(act_log_check(bt, what, sector, pid)))
196 return;
197 cpu = raw_smp_processor_id();
198
199 if (blk_tracer) {
200 tracing_record_cmdline(current);
201
202 pc = preempt_count();
203 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
204 sizeof(*t) + pdu_len,
205 0, pc);
206 if (!event)
207 return;
208 t = ring_buffer_event_data(event);
209 goto record_it;
210 }
211
212 /*
213 * A word about the locking here - we disable interrupts to reserve
214 * some space in the relay per-cpu buffer, to prevent an irq
215 * from coming in and stepping on our toes.
216 */
217 local_irq_save(flags);
218
219 if (unlikely(tsk->btrace_seq != blktrace_seq))
220 trace_note_tsk(bt, tsk);
221
222 t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
223 if (t) {
224 sequence = per_cpu_ptr(bt->sequence, cpu);
225
226 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
227 t->sequence = ++(*sequence);
228 t->time = ktime_to_ns(ktime_get());
229record_it:
230 /*
231 * These two are not needed in ftrace as they are in the
232 * generic trace_entry, filled by tracing_generic_entry_update,
233 * but for the trace_event->bin() synthesizer benefit we do it
234 * here too.
235 */
236 t->cpu = cpu;
237 t->pid = pid;
238
239 t->sector = sector;
240 t->bytes = bytes;
241 t->action = what;
242 t->device = bt->dev;
243 t->error = error;
244 t->pdu_len = pdu_len;
245
246 if (pdu_len)
247 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
248
249 if (blk_tracer) {
250 trace_buffer_unlock_commit(blk_tr, event, 0, pc);
251 return;
252 }
253 }
254
255 local_irq_restore(flags);
256}
257
258static struct dentry *blk_tree_root;
259static DEFINE_MUTEX(blk_tree_mutex);
260
261static void blk_trace_free(struct blk_trace *bt)
262{
263 debugfs_remove(bt->msg_file);
264 debugfs_remove(bt->dropped_file);
265 relay_close(bt->rchan);
266 free_percpu(bt->sequence);
267 free_percpu(bt->msg_data);
268 kfree(bt);
269}
270
271static void blk_trace_cleanup(struct blk_trace *bt)
272{
273 blk_trace_free(bt);
274 if (atomic_dec_and_test(&blk_probes_ref))
275 blk_unregister_tracepoints();
276}
277
278int blk_trace_remove(struct request_queue *q)
279{
280 struct blk_trace *bt;
281
282 bt = xchg(&q->blk_trace, NULL);
283 if (!bt)
284 return -EINVAL;
285
286 if (bt->trace_state != Blktrace_running)
287 blk_trace_cleanup(bt);
288
289 return 0;
290}
291EXPORT_SYMBOL_GPL(blk_trace_remove);
292
293static int blk_dropped_open(struct inode *inode, struct file *filp)
294{
295 filp->private_data = inode->i_private;
296
297 return 0;
298}
299
300static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
301 size_t count, loff_t *ppos)
302{
303 struct blk_trace *bt = filp->private_data;
304 char buf[16];
305
306 snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
307
308 return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
309}
310
311static const struct file_operations blk_dropped_fops = {
312 .owner = THIS_MODULE,
313 .open = blk_dropped_open,
314 .read = blk_dropped_read,
315};
316
317static int blk_msg_open(struct inode *inode, struct file *filp)
318{
319 filp->private_data = inode->i_private;
320
321 return 0;
322}
323
324static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
325 size_t count, loff_t *ppos)
326{
327 char *msg;
328 struct blk_trace *bt;
329
330 if (count >= BLK_TN_MAX_MSG)
331 return -EINVAL;
332
333 msg = kmalloc(count + 1, GFP_KERNEL);
334 if (msg == NULL)
335 return -ENOMEM;
336
337 if (copy_from_user(msg, buffer, count)) {
338 kfree(msg);
339 return -EFAULT;
340 }
341
342 msg[count] = '\0';
343 bt = filp->private_data;
344 __trace_note_message(bt, "%s", msg);
345 kfree(msg);
346
347 return count;
348}
349
350static const struct file_operations blk_msg_fops = {
351 .owner = THIS_MODULE,
352 .open = blk_msg_open,
353 .write = blk_msg_write,
354};
355
356/*
357 * Keep track of how many times we encountered a full subbuffer, to aid
358 * the user space app in telling how many lost events there were.
359 */
360static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
361 void *prev_subbuf, size_t prev_padding)
362{
363 struct blk_trace *bt;
364
365 if (!relay_buf_full(buf))
366 return 1;
367
368 bt = buf->chan->private_data;
369 atomic_inc(&bt->dropped);
370 return 0;
371}
372
373static int blk_remove_buf_file_callback(struct dentry *dentry)
374{
375 struct dentry *parent = dentry->d_parent;
376 debugfs_remove(dentry);
377
378 /*
379 * this will fail for all but the last file, but that is ok. what we
380 * care about is the top level buts->name directory going away, when
381 * the last trace file is gone. Then we don't have to rmdir() that
382 * manually on trace stop, so it nicely solves the issue with
383 * force killing of running traces.
384 */
385
386 debugfs_remove(parent);
387 return 0;
388}
389
390static struct dentry *blk_create_buf_file_callback(const char *filename,
391 struct dentry *parent,
392 int mode,
393 struct rchan_buf *buf,
394 int *is_global)
395{
396 return debugfs_create_file(filename, mode, parent, buf,
397 &relay_file_operations);
398}
399
400static struct rchan_callbacks blk_relay_callbacks = {
401 .subbuf_start = blk_subbuf_start_callback,
402 .create_buf_file = blk_create_buf_file_callback,
403 .remove_buf_file = blk_remove_buf_file_callback,
404};
405
406/*
407 * Setup everything required to start tracing
408 */
409int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
410 struct blk_user_trace_setup *buts)
411{
412 struct blk_trace *old_bt, *bt = NULL;
413 struct dentry *dir = NULL;
414 int ret, i;
415
416 if (!buts->buf_size || !buts->buf_nr)
417 return -EINVAL;
418
419 strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
420 buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
421
422 /*
423 * some device names have larger paths - convert the slashes
424 * to underscores for this to work as expected
425 */
426 for (i = 0; i < strlen(buts->name); i++)
427 if (buts->name[i] == '/')
428 buts->name[i] = '_';
429
430 bt = kzalloc(sizeof(*bt), GFP_KERNEL);
431 if (!bt)
432 return -ENOMEM;
433
434 ret = -ENOMEM;
435 bt->sequence = alloc_percpu(unsigned long);
436 if (!bt->sequence)
437 goto err;
438
439 bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
440 if (!bt->msg_data)
441 goto err;
442
443 ret = -ENOENT;
444
445 mutex_lock(&blk_tree_mutex);
446 if (!blk_tree_root) {
447 blk_tree_root = debugfs_create_dir("block", NULL);
448 if (!blk_tree_root) {
449 mutex_unlock(&blk_tree_mutex);
450 goto err;
451 }
452 }
453 mutex_unlock(&blk_tree_mutex);
454
455 dir = debugfs_create_dir(buts->name, blk_tree_root);
456
457 if (!dir)
458 goto err;
459
460 bt->dir = dir;
461 bt->dev = dev;
462 atomic_set(&bt->dropped, 0);
463
464 ret = -EIO;
465 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
466 &blk_dropped_fops);
467 if (!bt->dropped_file)
468 goto err;
469
470 bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
471 if (!bt->msg_file)
472 goto err;
473
474 bt->rchan = relay_open("trace", dir, buts->buf_size,
475 buts->buf_nr, &blk_relay_callbacks, bt);
476 if (!bt->rchan)
477 goto err;
478
479 bt->act_mask = buts->act_mask;
480 if (!bt->act_mask)
481 bt->act_mask = (u16) -1;
482
483 bt->start_lba = buts->start_lba;
484 bt->end_lba = buts->end_lba;
485 if (!bt->end_lba)
486 bt->end_lba = -1ULL;
487
488 bt->pid = buts->pid;
489 bt->trace_state = Blktrace_setup;
490
491 ret = -EBUSY;
492 old_bt = xchg(&q->blk_trace, bt);
493 if (old_bt) {
494 (void) xchg(&q->blk_trace, old_bt);
495 goto err;
496 }
497
498 if (atomic_inc_return(&blk_probes_ref) == 1)
499 blk_register_tracepoints();
500
501 return 0;
502err:
503 blk_trace_free(bt);
504 return ret;
505}
506
507int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
508 char __user *arg)
509{
510 struct blk_user_trace_setup buts;
511 int ret;
512
513 ret = copy_from_user(&buts, arg, sizeof(buts));
514 if (ret)
515 return -EFAULT;
516
517 ret = do_blk_trace_setup(q, name, dev, &buts);
518 if (ret)
519 return ret;
520
521 if (copy_to_user(arg, &buts, sizeof(buts)))
522 return -EFAULT;
523
524 return 0;
525}
526EXPORT_SYMBOL_GPL(blk_trace_setup);
527
528int blk_trace_startstop(struct request_queue *q, int start)
529{
530 int ret;
531 struct blk_trace *bt = q->blk_trace;
532
533 if (bt == NULL)
534 return -EINVAL;
535
536 /*
537 * For starting a trace, we can transition from a setup or stopped
538 * trace. For stopping a trace, the state must be running
539 */
540 ret = -EINVAL;
541 if (start) {
542 if (bt->trace_state == Blktrace_setup ||
543 bt->trace_state == Blktrace_stopped) {
544 blktrace_seq++;
545 smp_mb();
546 bt->trace_state = Blktrace_running;
547
548 trace_note_time(bt);
549 ret = 0;
550 }
551 } else {
552 if (bt->trace_state == Blktrace_running) {
553 bt->trace_state = Blktrace_stopped;
554 relay_flush(bt->rchan);
555 ret = 0;
556 }
557 }
558
559 return ret;
560}
561EXPORT_SYMBOL_GPL(blk_trace_startstop);
562
563/**
564 * blk_trace_ioctl: - handle the ioctls associated with tracing
565 * @bdev: the block device
566 * @cmd: the ioctl cmd
567 * @arg: the argument data, if any
568 *
569 **/
570int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
571{
572 struct request_queue *q;
573 int ret, start = 0;
574 char b[BDEVNAME_SIZE];
575
576 q = bdev_get_queue(bdev);
577 if (!q)
578 return -ENXIO;
579
580 mutex_lock(&bdev->bd_mutex);
581
582 switch (cmd) {
583 case BLKTRACESETUP:
584 bdevname(bdev, b);
585 ret = blk_trace_setup(q, b, bdev->bd_dev, arg);
586 break;
587 case BLKTRACESTART:
588 start = 1;
589 case BLKTRACESTOP:
590 ret = blk_trace_startstop(q, start);
591 break;
592 case BLKTRACETEARDOWN:
593 ret = blk_trace_remove(q);
594 break;
595 default:
596 ret = -ENOTTY;
597 break;
598 }
599
600 mutex_unlock(&bdev->bd_mutex);
601 return ret;
602}
603
604/**
605 * blk_trace_shutdown: - stop and cleanup trace structures
606 * @q: the request queue associated with the device
607 *
608 **/
609void blk_trace_shutdown(struct request_queue *q)
610{
611 if (q->blk_trace) {
612 blk_trace_startstop(q, 0);
613 blk_trace_remove(q);
614 }
615}
616
617/*
618 * blktrace probes
619 */
620
621/**
622 * blk_add_trace_rq - Add a trace for a request oriented action
623 * @q: queue the io is for
624 * @rq: the source request
625 * @what: the action
626 *
627 * Description:
628 * Records an action against a request. Will log the bio offset + size.
629 *
630 **/
631static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
632 u32 what)
633{
634 struct blk_trace *bt = q->blk_trace;
635 int rw = rq->cmd_flags & 0x03;
636
637 if (likely(!bt))
638 return;
639
640 if (blk_discard_rq(rq))
641 rw |= (1 << BIO_RW_DISCARD);
642
643 if (blk_pc_request(rq)) {
644 what |= BLK_TC_ACT(BLK_TC_PC);
645 __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
646 rq->cmd_len, rq->cmd);
647 } else {
648 what |= BLK_TC_ACT(BLK_TC_FS);
649 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
650 rw, what, rq->errors, 0, NULL);
651 }
652}
653
654static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
655{
656 blk_add_trace_rq(q, rq, BLK_TA_ABORT);
657}
658
659static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
660{
661 blk_add_trace_rq(q, rq, BLK_TA_INSERT);
662}
663
664static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
665{
666 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
667}
668
669static void blk_add_trace_rq_requeue(struct request_queue *q,
670 struct request *rq)
671{
672 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
673}
674
675static void blk_add_trace_rq_complete(struct request_queue *q,
676 struct request *rq)
677{
678 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
679}
680
681/**
682 * blk_add_trace_bio - Add a trace for a bio oriented action
683 * @q: queue the io is for
684 * @bio: the source bio
685 * @what: the action
686 *
687 * Description:
688 * Records an action against a bio. Will log the bio offset + size.
689 *
690 **/
691static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
692 u32 what)
693{
694 struct blk_trace *bt = q->blk_trace;
695
696 if (likely(!bt))
697 return;
698
699 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
700 !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
701}
702
703static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
704{
705 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
706}
707
708static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
709{
710 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
711}
712
713static void blk_add_trace_bio_backmerge(struct request_queue *q,
714 struct bio *bio)
715{
716 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
717}
718
719static void blk_add_trace_bio_frontmerge(struct request_queue *q,
720 struct bio *bio)
721{
722 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
723}
724
725static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
726{
727 blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
728}
729
730static void blk_add_trace_getrq(struct request_queue *q,
731 struct bio *bio, int rw)
732{
733 if (bio)
734 blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
735 else {
736 struct blk_trace *bt = q->blk_trace;
737
738 if (bt)
739 __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
740 }
741}
742
743
744static void blk_add_trace_sleeprq(struct request_queue *q,
745 struct bio *bio, int rw)
746{
747 if (bio)
748 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
749 else {
750 struct blk_trace *bt = q->blk_trace;
751
752 if (bt)
753 __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
754 0, 0, NULL);
755 }
756}
757
758static void blk_add_trace_plug(struct request_queue *q)
759{
760 struct blk_trace *bt = q->blk_trace;
761
762 if (bt)
763 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
764}
765
766static void blk_add_trace_unplug_io(struct request_queue *q)
767{
768 struct blk_trace *bt = q->blk_trace;
769
770 if (bt) {
771 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
772 __be64 rpdu = cpu_to_be64(pdu);
773
774 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
775 sizeof(rpdu), &rpdu);
776 }
777}
778
779static void blk_add_trace_unplug_timer(struct request_queue *q)
780{
781 struct blk_trace *bt = q->blk_trace;
782
783 if (bt) {
784 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
785 __be64 rpdu = cpu_to_be64(pdu);
786
787 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
788 sizeof(rpdu), &rpdu);
789 }
790}
791
792static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
793 unsigned int pdu)
794{
795 struct blk_trace *bt = q->blk_trace;
796
797 if (bt) {
798 __be64 rpdu = cpu_to_be64(pdu);
799
800 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
801 BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
802 sizeof(rpdu), &rpdu);
803 }
804}
805
806/**
807 * blk_add_trace_remap - Add a trace for a remap operation
808 * @q: queue the io is for
809 * @bio: the source bio
810 * @dev: target device
811 * @from: source sector
812 * @to: target sector
813 *
814 * Description:
815 * Device mapper or raid target sometimes need to split a bio because
816 * it spans a stripe (or similar). Add a trace for that action.
817 *
818 **/
819static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
820 dev_t dev, sector_t from, sector_t to)
821{
822 struct blk_trace *bt = q->blk_trace;
823 struct blk_io_trace_remap r;
824
825 if (likely(!bt))
826 return;
827
828 r.device = cpu_to_be32(dev);
829 r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
830 r.sector = cpu_to_be64(to);
831
832 __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
833 !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
834}
835
836/**
837 * blk_add_driver_data - Add binary message with driver-specific data
838 * @q: queue the io is for
839 * @rq: io request
840 * @data: driver-specific data
841 * @len: length of driver-specific data
842 *
843 * Description:
844 * Some drivers might want to write driver-specific data per request.
845 *
846 **/
847void blk_add_driver_data(struct request_queue *q,
848 struct request *rq,
849 void *data, size_t len)
850{
851 struct blk_trace *bt = q->blk_trace;
852
853 if (likely(!bt))
854 return;
855
856 if (blk_pc_request(rq))
857 __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
858 rq->errors, len, data);
859 else
860 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
861 0, BLK_TA_DRV_DATA, rq->errors, len, data);
862}
863EXPORT_SYMBOL_GPL(blk_add_driver_data);
864
865static void blk_register_tracepoints(void)
866{
867 int ret;
868
869 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
870 WARN_ON(ret);
871 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
872 WARN_ON(ret);
873 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
874 WARN_ON(ret);
875 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
876 WARN_ON(ret);
877 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
878 WARN_ON(ret);
879 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
880 WARN_ON(ret);
881 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
882 WARN_ON(ret);
883 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
884 WARN_ON(ret);
885 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
886 WARN_ON(ret);
887 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
888 WARN_ON(ret);
889 ret = register_trace_block_getrq(blk_add_trace_getrq);
890 WARN_ON(ret);
891 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
892 WARN_ON(ret);
893 ret = register_trace_block_plug(blk_add_trace_plug);
894 WARN_ON(ret);
895 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
896 WARN_ON(ret);
897 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
898 WARN_ON(ret);
899 ret = register_trace_block_split(blk_add_trace_split);
900 WARN_ON(ret);
901 ret = register_trace_block_remap(blk_add_trace_remap);
902 WARN_ON(ret);
903}
904
905static void blk_unregister_tracepoints(void)
906{
907 unregister_trace_block_remap(blk_add_trace_remap);
908 unregister_trace_block_split(blk_add_trace_split);
909 unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
910 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
911 unregister_trace_block_plug(blk_add_trace_plug);
912 unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
913 unregister_trace_block_getrq(blk_add_trace_getrq);
914 unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
915 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
916 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
917 unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
918 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
919 unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
920 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
921 unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
922 unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
923 unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
924
925 tracepoint_synchronize_unregister();
926}
927
928/*
929 * struct blk_io_tracer formatting routines
930 */
931
932static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
933{
934 int i = 0;
935 int tc = t->action >> BLK_TC_SHIFT;
936
937 if (t->action == BLK_TN_MESSAGE) {
938 rwbs[i++] = 'N';
939 goto out;
940 }
941
942 if (tc & BLK_TC_DISCARD)
943 rwbs[i++] = 'D';
944 else if (tc & BLK_TC_WRITE)
945 rwbs[i++] = 'W';
946 else if (t->bytes)
947 rwbs[i++] = 'R';
948 else
949 rwbs[i++] = 'N';
950
951 if (tc & BLK_TC_AHEAD)
952 rwbs[i++] = 'A';
953 if (tc & BLK_TC_BARRIER)
954 rwbs[i++] = 'B';
955 if (tc & BLK_TC_SYNC)
956 rwbs[i++] = 'S';
957 if (tc & BLK_TC_META)
958 rwbs[i++] = 'M';
959out:
960 rwbs[i] = '\0';
961}
962
/* Reinterpret a trace entry pointer as a pointer to a blk_io_trace. */
static inline
const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
{
	const struct blk_io_trace *t = (const struct blk_io_trace *)ent;

	return t;
}
968
969static inline const void *pdu_start(const struct trace_entry *ent)
970{
971 return te_blk_io_trace(ent) + 1;
972}
973
974static inline u32 t_sec(const struct trace_entry *ent)
975{
976 return te_blk_io_trace(ent)->bytes >> 9;
977}
978
979static inline unsigned long long t_sector(const struct trace_entry *ent)
980{
981 return te_blk_io_trace(ent)->sector;
982}
983
984static inline __u16 t_error(const struct trace_entry *ent)
985{
986 return te_blk_io_trace(ent)->error;
987}
988
989static __u64 get_pdu_int(const struct trace_entry *ent)
990{
991 const __u64 *val = pdu_start(ent);
992 return be64_to_cpu(*val);
993}
994
995static void get_pdu_remap(const struct trace_entry *ent,
996 struct blk_io_trace_remap *r)
997{
998 const struct blk_io_trace_remap *__r = pdu_start(ent);
999 __u64 sector = __r->sector;
1000
1001 r->device = be32_to_cpu(__r->device);
1002 r->device_from = be32_to_cpu(__r->device_from);
1003 r->sector = be64_to_cpu(sector);
1004}
1005
1006typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
1007
1008static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1009{
1010 char rwbs[6];
1011 unsigned long long ts = iter->ts;
1012 unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
1013 unsigned secs = (unsigned long)ts;
1014 const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1015
1016 fill_rwbs(rwbs, t);
1017
1018 return trace_seq_printf(&iter->seq,
1019 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
1020 MAJOR(t->device), MINOR(t->device), iter->cpu,
1021 secs, nsec_rem, iter->ent->pid, act, rwbs);
1022}
1023
1024static int blk_log_action(struct trace_iterator *iter, const char *act)
1025{
1026 char rwbs[6];
1027 const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1028
1029 fill_rwbs(rwbs, t);
1030 return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
1031 MAJOR(t->device), MINOR(t->device), act, rwbs);
1032}
1033
1034static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
1035{
1036 char cmd[TASK_COMM_LEN];
1037
1038 trace_find_cmdline(ent->pid, cmd);
1039
1040 if (t_sec(ent))
1041 return trace_seq_printf(s, "%llu + %u [%s]\n",
1042 t_sector(ent), t_sec(ent), cmd);
1043 return trace_seq_printf(s, "[%s]\n", cmd);
1044}
1045
1046static int blk_log_with_error(struct trace_seq *s,
1047 const struct trace_entry *ent)
1048{
1049 if (t_sec(ent))
1050 return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent),
1051 t_sec(ent), t_error(ent));
1052 return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent));
1053}
1054
1055static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
1056{
1057 struct blk_io_trace_remap r = { .device = 0, };
1058
1059 get_pdu_remap(ent, &r);
1060 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1061 t_sector(ent),
1062 t_sec(ent), MAJOR(r.device), MINOR(r.device),
1063 (unsigned long long)r.sector);
1064}
1065
1066static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
1067{
1068 char cmd[TASK_COMM_LEN];
1069
1070 trace_find_cmdline(ent->pid, cmd);
1071
1072 return trace_seq_printf(s, "[%s]\n", cmd);
1073}
1074
1075static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
1076{
1077 char cmd[TASK_COMM_LEN];
1078
1079 trace_find_cmdline(ent->pid, cmd);
1080
1081 return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
1082}
1083
1084static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
1085{
1086 char cmd[TASK_COMM_LEN];
1087
1088 trace_find_cmdline(ent->pid, cmd);
1089
1090 return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
1091 get_pdu_int(ent), cmd);
1092}
1093
1094static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
1095{
1096 int ret;
1097 const struct blk_io_trace *t = te_blk_io_trace(ent);
1098
1099 ret = trace_seq_putmem(s, t + 1, t->pdu_len);
1100 if (ret)
1101 return trace_seq_putc(s, '\n');
1102 return ret;
1103}
1104
1105/*
1106 * struct tracer operations
1107 */
1108
1109static void blk_tracer_print_header(struct seq_file *m)
1110{
1111 if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1112 return;
1113 seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n"
1114 "# | | | | | |\n");
1115}
1116
1117static void blk_tracer_start(struct trace_array *tr)
1118{
1119 blk_tracer_enabled = true;
1120 trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
1121}
1122
1123static int blk_tracer_init(struct trace_array *tr)
1124{
1125 blk_tr = tr;
1126 blk_tracer_start(tr);
1127 return 0;
1128}
1129
1130static void blk_tracer_stop(struct trace_array *tr)
1131{
1132 blk_tracer_enabled = false;
1133 trace_flags |= TRACE_ITER_CONTEXT_INFO;
1134}
1135
/* Tracer ->reset hook: resetting is the same as stopping. */
static void blk_tracer_reset(struct trace_array *tr)
{
	blk_tracer_stop(tr);
}
1140
/*
 * Per-action table indexed by the __BLK_TA_* action number.
 * act[0] is the short name and act[1] the long name of the action
 * (print_one_line() picks one based on TRACE_ITER_VERBOSE); print is the
 * routine that formats the remainder of the line for that action.
 */
1141static const struct {
1142 const char *act[2];
1143 int (*print)(struct trace_seq *s, const struct trace_entry *ent);
1144} what2act[] = {
1145 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
1146 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
1147 [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic },
1148 [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic },
1149 [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic },
1150 [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error },
1151 [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic },
1152 [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error },
1153 [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug },
1154 [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug },
1155 [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug },
1156 [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic },
1157 [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split },
1158 [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic },
1159 [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap },
1160};
1161
1162static enum print_line_t print_one_line(struct trace_iterator *iter,
1163 bool classic)
1164{
1165 struct trace_seq *s = &iter->seq;
1166 const struct blk_io_trace *t;
1167 u16 what;
1168 int ret;
1169 bool long_act;
1170 blk_log_action_t *log_action;
1171
1172 t = te_blk_io_trace(iter->ent);
1173 what = t->action & ((1 << BLK_TC_SHIFT) - 1);
1174 long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
1175 log_action = classic ? &blk_log_action_classic : &blk_log_action;
1176
1177 if (t->action == BLK_TN_MESSAGE) {
1178 ret = log_action(iter, long_act ? "message" : "m");
1179 if (ret)
1180 ret = blk_log_msg(s, iter->ent);
1181 goto out;
1182 }
1183
1184 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
1185 ret = trace_seq_printf(s, "Bad pc action %x\n", what);
1186 else {
1187 ret = log_action(iter, what2act[what].act[long_act]);
1188 if (ret)
1189 ret = what2act[what].print(s, iter->ent);
1190 }
1191out:
1192 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
1193}
1194
1195static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1196 int flags)
1197{
1198 if (!trace_print_context(iter))
1199 return TRACE_TYPE_PARTIAL_LINE;
1200
1201 return print_one_line(iter, false);
1202}
1203
1204static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1205{
1206 struct trace_seq *s = &iter->seq;
1207 struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
1208 const int offset = offsetof(struct blk_io_trace, sector);
1209 struct blk_io_trace old = {
1210 .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
1211 .time = iter->ts,
1212 };
1213
1214 if (!trace_seq_putmem(s, &old, offset))
1215 return 0;
1216 return trace_seq_putmem(s, &t->sector,
1217 sizeof(old) - offset + t->pdu_len);
1218}
1219
1220static enum print_line_t
1221blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
1222{
1223 return blk_trace_synthesize_old_trace(iter) ?
1224 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
1225}
1226
1227static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
1228{
1229 if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1230 return TRACE_TYPE_UNHANDLED;
1231
1232 return print_one_line(iter, true);
1233}
1234
/* The "blk" tracer: wires the blk_tracer_* hooks and option flags together. */
1235static struct tracer blk_tracer __read_mostly = {
1236 .name = "blk",
1237 .init = blk_tracer_init,
1238 .reset = blk_tracer_reset,
1239 .start = blk_tracer_start,
1240 .stop = blk_tracer_stop,
1241 .print_header = blk_tracer_print_header,
1242 .print_line = blk_tracer_print_line,
1243 .flags = &blk_tracer_flags,
1244};
1245
/* Output callbacks for TRACE_BLK entries: text (.trace) and binary (.binary). */
1246static struct trace_event trace_blk_event = {
1247 .type = TRACE_BLK,
1248 .trace = blk_trace_event_print,
1249 .binary = blk_trace_event_print_binary,
1250};
1251
1252static int __init init_blk_tracer(void)
1253{
1254 if (!register_ftrace_event(&trace_blk_event)) {
1255 pr_warning("Warning: could not register block events\n");
1256 return 1;
1257 }
1258
1259 if (register_tracer(&blk_tracer) != 0) {
1260 pr_warning("Warning: could not register the block tracer\n");
1261 unregister_ftrace_event(&trace_blk_event);
1262 return 1;
1263 }
1264
1265 return 0;
1266}
1267
1268device_initcall(init_blk_tracer);
1269
1270static int blk_trace_remove_queue(struct request_queue *q)
1271{
1272 struct blk_trace *bt;
1273
1274 bt = xchg(&q->blk_trace, NULL);
1275 if (bt == NULL)
1276 return -EINVAL;
1277
1278 if (atomic_dec_and_test(&blk_probes_ref))
1279 blk_unregister_tracepoints();
1280
1281 blk_trace_free(bt);
1282 return 0;
1283}
1284
1285/*
1286 * Setup everything required to start tracing
1287 */
1288static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
1289{
1290 struct blk_trace *old_bt, *bt = NULL;
1291 int ret = -ENOMEM;
1292
1293 bt = kzalloc(sizeof(*bt), GFP_KERNEL);
1294 if (!bt)
1295 return -ENOMEM;
1296
1297 bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
1298 if (!bt->msg_data)
1299 goto free_bt;
1300
1301 bt->dev = dev;
1302 bt->act_mask = (u16)-1;
1303 bt->end_lba = -1ULL;
1304
1305 old_bt = xchg(&q->blk_trace, bt);
1306 if (old_bt != NULL) {
1307 (void)xchg(&q->blk_trace, old_bt);
1308 ret = -EBUSY;
1309 goto free_bt;
1310 }
1311
1312 if (atomic_inc_return(&blk_probes_ref) == 1)
1313 blk_register_tracepoints();
1314 return 0;
1315
1316free_bt:
1317 blk_trace_free(bt);
1318 return ret;
1319}
1320
1321/*
1322 * sysfs interface to enable and configure tracing
1323 */
1324
/* Forward declarations: the attribute macro below references both handlers. */
1325static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1326 struct device_attribute *attr,
1327 char *buf);
1328static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1329 struct device_attribute *attr,
1330 const char *buf, size_t count);
/*
 * Declare a device attribute named _name with S_IRUGO | S_IWUSR
 * permissions, served by the common show/store handlers.
 */
1331#define BLK_TRACE_DEVICE_ATTR(_name) \
1332 DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \
1333 sysfs_blk_trace_attr_show, \
1334 sysfs_blk_trace_attr_store)
1335
1336static BLK_TRACE_DEVICE_ATTR(enable);
1337static BLK_TRACE_DEVICE_ATTR(act_mask);
1338static BLK_TRACE_DEVICE_ATTR(pid);
1339static BLK_TRACE_DEVICE_ATTR(start_lba);
1340static BLK_TRACE_DEVICE_ATTR(end_lba);
1341
/* NULL-terminated list of the attributes declared above. */
1342static struct attribute *blk_trace_attrs[] = {
1343 &dev_attr_enable.attr,
1344 &dev_attr_act_mask.attr,
1345 &dev_attr_pid.attr,
1346 &dev_attr_start_lba.attr,
1347 &dev_attr_end_lba.attr,
1348 NULL
1349};
1350
/* Attribute group named "trace" holding the attributes above. */
1351struct attribute_group blk_trace_attr_group = {
1352 .name = "trace",
1353 .attrs = blk_trace_attrs,
1354};
1355
/*
 * Mapping between BLK_TC_* category bits and their textual names, used by
 * blk_trace_str2mask() and blk_trace_mask2str().
 */
1356static const struct {
1357 int mask;
1358 const char *str;
1359} mask_maps[] = {
1360 { BLK_TC_READ, "read" },
1361 { BLK_TC_WRITE, "write" },
1362 { BLK_TC_BARRIER, "barrier" },
1363 { BLK_TC_SYNC, "sync" },
1364 { BLK_TC_QUEUE, "queue" },
1365 { BLK_TC_REQUEUE, "requeue" },
1366 { BLK_TC_ISSUE, "issue" },
1367 { BLK_TC_COMPLETE, "complete" },
1368 { BLK_TC_FS, "fs" },
1369 { BLK_TC_PC, "pc" },
1370 { BLK_TC_AHEAD, "ahead" },
1371 { BLK_TC_META, "meta" },
1372 { BLK_TC_DISCARD, "discard" },
1373 { BLK_TC_DRV_DATA, "drv_data" },
1374};
1375
1376static int blk_trace_str2mask(const char *str)
1377{
1378 int i;
1379 int mask = 0;
1380 char *s, *token;
1381
1382 s = kstrdup(str, GFP_KERNEL);
1383 if (s == NULL)
1384 return -ENOMEM;
1385 s = strstrip(s);
1386
1387 while (1) {
1388 token = strsep(&s, ",");
1389 if (token == NULL)
1390 break;
1391
1392 if (*token == '\0')
1393 continue;
1394
1395 for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
1396 if (strcasecmp(token, mask_maps[i].str) == 0) {
1397 mask |= mask_maps[i].mask;
1398 break;
1399 }
1400 }
1401 if (i == ARRAY_SIZE(mask_maps)) {
1402 mask = -EINVAL;
1403 break;
1404 }
1405 }
1406 kfree(s);
1407
1408 return mask;
1409}
1410
1411static ssize_t blk_trace_mask2str(char *buf, int mask)
1412{
1413 int i;
1414 char *p = buf;
1415
1416 for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
1417 if (mask & mask_maps[i].mask) {
1418 p += sprintf(p, "%s%s",
1419 (p == buf) ? "" : ",", mask_maps[i].str);
1420 }
1421 }
1422 *p++ = '\n';
1423
1424 return p - buf;
1425}
1426
1427static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
1428{
1429 if (bdev->bd_disk == NULL)
1430 return NULL;
1431
1432 return bdev_get_queue(bdev);
1433}
1434
1435static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1436 struct device_attribute *attr,
1437 char *buf)
1438{
1439 struct hd_struct *p = dev_to_part(dev);
1440 struct request_queue *q;
1441 struct block_device *bdev;
1442 ssize_t ret = -ENXIO;
1443
1444 lock_kernel();
1445 bdev = bdget(part_devt(p));
1446 if (bdev == NULL)
1447 goto out_unlock_kernel;
1448
1449 q = blk_trace_get_queue(bdev);
1450 if (q == NULL)
1451 goto out_bdput;
1452
1453 mutex_lock(&bdev->bd_mutex);
1454
1455 if (attr == &dev_attr_enable) {
1456 ret = sprintf(buf, "%u\n", !!q->blk_trace);
1457 goto out_unlock_bdev;
1458 }
1459
1460 if (q->blk_trace == NULL)
1461 ret = sprintf(buf, "disabled\n");
1462 else if (attr == &dev_attr_act_mask)
1463 ret = blk_trace_mask2str(buf, q->blk_trace->act_mask);
1464 else if (attr == &dev_attr_pid)
1465 ret = sprintf(buf, "%u\n", q->blk_trace->pid);
1466 else if (attr == &dev_attr_start_lba)
1467 ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
1468 else if (attr == &dev_attr_end_lba)
1469 ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
1470
1471out_unlock_bdev:
1472 mutex_unlock(&bdev->bd_mutex);
1473out_bdput:
1474 bdput(bdev);
1475out_unlock_kernel:
1476 unlock_kernel();
1477 return ret;
1478}
1479
1480static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1481 struct device_attribute *attr,
1482 const char *buf, size_t count)
1483{
1484 struct block_device *bdev;
1485 struct request_queue *q;
1486 struct hd_struct *p;
1487 u64 value;
1488 ssize_t ret = -EINVAL;
1489
1490 if (count == 0)
1491 goto out;
1492
1493 if (attr == &dev_attr_act_mask) {
1494 if (sscanf(buf, "%llx", &value) != 1) {
1495 /* Assume it is a list of trace category names */
1496 ret = blk_trace_str2mask(buf);
1497 if (ret < 0)
1498 goto out;
1499 value = ret;
1500 }
1501 } else if (sscanf(buf, "%llu", &value) != 1)
1502 goto out;
1503
1504 ret = -ENXIO;
1505
1506 lock_kernel();
1507 p = dev_to_part(dev);
1508 bdev = bdget(part_devt(p));
1509 if (bdev == NULL)
1510 goto out_unlock_kernel;
1511
1512 q = blk_trace_get_queue(bdev);
1513 if (q == NULL)
1514 goto out_bdput;
1515
1516 mutex_lock(&bdev->bd_mutex);
1517
1518 if (attr == &dev_attr_enable) {
1519 if (value)
1520 ret = blk_trace_setup_queue(q, bdev->bd_dev);
1521 else
1522 ret = blk_trace_remove_queue(q);
1523 goto out_unlock_bdev;
1524 }
1525
1526 ret = 0;
1527 if (q->blk_trace == NULL)
1528 ret = blk_trace_setup_queue(q, bdev->bd_dev);
1529
1530 if (ret == 0) {
1531 if (attr == &dev_attr_act_mask)
1532 q->blk_trace->act_mask = value;
1533 else if (attr == &dev_attr_pid)
1534 q->blk_trace->pid = value;
1535 else if (attr == &dev_attr_start_lba)
1536 q->blk_trace->start_lba = value;
1537 else if (attr == &dev_attr_end_lba)
1538 q->blk_trace->end_lba = value;
1539 }
1540
1541out_unlock_bdev:
1542 mutex_unlock(&bdev->bd_mutex);
1543out_bdput:
1544 bdput(bdev);
1545out_unlock_kernel:
1546 unlock_kernel();
1547out:
1548 return ret ? ret : count;
1549}
1550
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
new file mode 100644
index 000000000000..246f2aa6dc46
--- /dev/null
+++ b/kernel/trace/events.c
@@ -0,0 +1,14 @@
1/*
2 * This is the place to register all trace points as events.
3 */
4
5#include <linux/stringify.h>
6
7#include <trace/trace_events.h>
8
9#include "trace_output.h"
10
11#include "trace_events_stage_1.h"
12#include "trace_events_stage_2.h"
13#include "trace_events_stage_3.h"
14
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2f32969c09df..f1ed080406c3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -17,6 +17,7 @@
17#include <linux/clocksource.h> 17#include <linux/clocksource.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/suspend.h>
20#include <linux/debugfs.h> 21#include <linux/debugfs.h>
21#include <linux/hardirq.h> 22#include <linux/hardirq.h>
22#include <linux/kthread.h> 23#include <linux/kthread.h>
@@ -26,6 +27,9 @@
26#include <linux/sysctl.h> 27#include <linux/sysctl.h>
27#include <linux/ctype.h> 28#include <linux/ctype.h>
28#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/hash.h>
31
32#include <trace/sched.h>
29 33
30#include <asm/ftrace.h> 34#include <asm/ftrace.h>
31 35
@@ -43,14 +47,14 @@
43 ftrace_kill(); \ 47 ftrace_kill(); \
44 } while (0) 48 } while (0)
45 49
50/* hash bits for specific function selection */
51#define FTRACE_HASH_BITS 7
52#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
53
46/* ftrace_enabled is a method to turn ftrace on or off */ 54/* ftrace_enabled is a method to turn ftrace on or off */
47int ftrace_enabled __read_mostly; 55int ftrace_enabled __read_mostly;
48static int last_ftrace_enabled; 56static int last_ftrace_enabled;
49 57
50/* set when tracing only a pid */
51struct pid *ftrace_pid_trace;
52static struct pid * const ftrace_swapper_pid = &init_struct_pid;
53
54/* Quick disabling of function tracer. */ 58/* Quick disabling of function tracer. */
55int function_trace_stop; 59int function_trace_stop;
56 60
@@ -60,9 +64,7 @@ int function_trace_stop;
60 */ 64 */
61static int ftrace_disabled __read_mostly; 65static int ftrace_disabled __read_mostly;
62 66
63static DEFINE_SPINLOCK(ftrace_lock); 67static DEFINE_MUTEX(ftrace_lock);
64static DEFINE_MUTEX(ftrace_sysctl_lock);
65static DEFINE_MUTEX(ftrace_start_lock);
66 68
67static struct ftrace_ops ftrace_list_end __read_mostly = 69static struct ftrace_ops ftrace_list_end __read_mostly =
68{ 70{
@@ -133,9 +135,6 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
133 135
134static int __register_ftrace_function(struct ftrace_ops *ops) 136static int __register_ftrace_function(struct ftrace_ops *ops)
135{ 137{
136 /* should not be called from interrupt context */
137 spin_lock(&ftrace_lock);
138
139 ops->next = ftrace_list; 138 ops->next = ftrace_list;
140 /* 139 /*
141 * We are entering ops into the ftrace_list but another 140 * We are entering ops into the ftrace_list but another
@@ -171,18 +170,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
171#endif 170#endif
172 } 171 }
173 172
174 spin_unlock(&ftrace_lock);
175
176 return 0; 173 return 0;
177} 174}
178 175
179static int __unregister_ftrace_function(struct ftrace_ops *ops) 176static int __unregister_ftrace_function(struct ftrace_ops *ops)
180{ 177{
181 struct ftrace_ops **p; 178 struct ftrace_ops **p;
182 int ret = 0;
183
184 /* should not be called from interrupt context */
185 spin_lock(&ftrace_lock);
186 179
187 /* 180 /*
188 * If we are removing the last function, then simply point 181 * If we are removing the last function, then simply point
@@ -191,17 +184,15 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
191 if (ftrace_list == ops && ops->next == &ftrace_list_end) { 184 if (ftrace_list == ops && ops->next == &ftrace_list_end) {
192 ftrace_trace_function = ftrace_stub; 185 ftrace_trace_function = ftrace_stub;
193 ftrace_list = &ftrace_list_end; 186 ftrace_list = &ftrace_list_end;
194 goto out; 187 return 0;
195 } 188 }
196 189
197 for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) 190 for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
198 if (*p == ops) 191 if (*p == ops)
199 break; 192 break;
200 193
201 if (*p != ops) { 194 if (*p != ops)
202 ret = -1; 195 return -1;
203 goto out;
204 }
205 196
206 *p = (*p)->next; 197 *p = (*p)->next;
207 198
@@ -222,21 +213,15 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
222 } 213 }
223 } 214 }
224 215
225 out: 216 return 0;
226 spin_unlock(&ftrace_lock);
227
228 return ret;
229} 217}
230 218
231static void ftrace_update_pid_func(void) 219static void ftrace_update_pid_func(void)
232{ 220{
233 ftrace_func_t func; 221 ftrace_func_t func;
234 222
235 /* should not be called from interrupt context */
236 spin_lock(&ftrace_lock);
237
238 if (ftrace_trace_function == ftrace_stub) 223 if (ftrace_trace_function == ftrace_stub)
239 goto out; 224 return;
240 225
241 func = ftrace_trace_function; 226 func = ftrace_trace_function;
242 227
@@ -253,23 +238,29 @@ static void ftrace_update_pid_func(void)
253#else 238#else
254 __ftrace_trace_function = func; 239 __ftrace_trace_function = func;
255#endif 240#endif
256
257 out:
258 spin_unlock(&ftrace_lock);
259} 241}
260 242
243/* set when tracing only a pid */
244struct pid *ftrace_pid_trace;
245static struct pid * const ftrace_swapper_pid = &init_struct_pid;
246
261#ifdef CONFIG_DYNAMIC_FTRACE 247#ifdef CONFIG_DYNAMIC_FTRACE
248
262#ifndef CONFIG_FTRACE_MCOUNT_RECORD 249#ifndef CONFIG_FTRACE_MCOUNT_RECORD
263# error Dynamic ftrace depends on MCOUNT_RECORD 250# error Dynamic ftrace depends on MCOUNT_RECORD
264#endif 251#endif
265 252
266/* 253static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
267 * Since MCOUNT_ADDR may point to mcount itself, we do not want 254
268 * to get it confused by reading a reference in the code as we 255struct ftrace_func_probe {
269 * are parsing on objcopy output of text. Use a variable for 256 struct hlist_node node;
270 * it instead. 257 struct ftrace_probe_ops *ops;
271 */ 258 unsigned long flags;
272static unsigned long mcount_addr = MCOUNT_ADDR; 259 unsigned long ip;
260 void *data;
261 struct rcu_head rcu;
262};
263
273 264
274enum { 265enum {
275 FTRACE_ENABLE_CALLS = (1 << 0), 266 FTRACE_ENABLE_CALLS = (1 << 0),
@@ -283,13 +274,13 @@ enum {
283 274
284static int ftrace_filtered; 275static int ftrace_filtered;
285 276
286static LIST_HEAD(ftrace_new_addrs); 277static struct dyn_ftrace *ftrace_new_addrs;
287 278
288static DEFINE_MUTEX(ftrace_regex_lock); 279static DEFINE_MUTEX(ftrace_regex_lock);
289 280
290struct ftrace_page { 281struct ftrace_page {
291 struct ftrace_page *next; 282 struct ftrace_page *next;
292 unsigned long index; 283 int index;
293 struct dyn_ftrace records[]; 284 struct dyn_ftrace records[];
294}; 285};
295 286
@@ -304,6 +295,19 @@ static struct ftrace_page *ftrace_pages;
304 295
305static struct dyn_ftrace *ftrace_free_records; 296static struct dyn_ftrace *ftrace_free_records;
306 297
298/*
299 * This is a double for. Do not use 'break' to break out of the loop,
300 * you must use a goto.
301 */
302#define do_for_each_ftrace_rec(pg, rec) \
303 for (pg = ftrace_pages_start; pg; pg = pg->next) { \
304 int _____i; \
305 for (_____i = 0; _____i < pg->index; _____i++) { \
306 rec = &pg->records[_____i];
307
308#define while_for_each_ftrace_rec() \
309 } \
310 }
307 311
308#ifdef CONFIG_KPROBES 312#ifdef CONFIG_KPROBES
309 313
@@ -337,7 +341,7 @@ static inline int record_frozen(struct dyn_ftrace *rec)
337 341
338static void ftrace_free_rec(struct dyn_ftrace *rec) 342static void ftrace_free_rec(struct dyn_ftrace *rec)
339{ 343{
340 rec->ip = (unsigned long)ftrace_free_records; 344 rec->freelist = ftrace_free_records;
341 ftrace_free_records = rec; 345 ftrace_free_records = rec;
342 rec->flags |= FTRACE_FL_FREE; 346 rec->flags |= FTRACE_FL_FREE;
343} 347}
@@ -348,23 +352,22 @@ void ftrace_release(void *start, unsigned long size)
348 struct ftrace_page *pg; 352 struct ftrace_page *pg;
349 unsigned long s = (unsigned long)start; 353 unsigned long s = (unsigned long)start;
350 unsigned long e = s + size; 354 unsigned long e = s + size;
351 int i;
352 355
353 if (ftrace_disabled || !start) 356 if (ftrace_disabled || !start)
354 return; 357 return;
355 358
356 /* should not be called from interrupt context */ 359 mutex_lock(&ftrace_lock);
357 spin_lock(&ftrace_lock); 360 do_for_each_ftrace_rec(pg, rec) {
358 361 if ((rec->ip >= s) && (rec->ip < e)) {
359 for (pg = ftrace_pages_start; pg; pg = pg->next) { 362 /*
360 for (i = 0; i < pg->index; i++) { 363 * rec->ip is changed in ftrace_free_rec()
361 rec = &pg->records[i]; 364 * It should not between s and e if record was freed.
362 365 */
363 if ((rec->ip >= s) && (rec->ip < e)) 366 FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
364 ftrace_free_rec(rec); 367 ftrace_free_rec(rec);
365 } 368 }
366 } 369 } while_for_each_ftrace_rec();
367 spin_unlock(&ftrace_lock); 370 mutex_unlock(&ftrace_lock);
368} 371}
369 372
370static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) 373static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
@@ -381,7 +384,7 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
381 return NULL; 384 return NULL;
382 } 385 }
383 386
384 ftrace_free_records = (void *)rec->ip; 387 ftrace_free_records = rec->freelist;
385 memset(rec, 0, sizeof(*rec)); 388 memset(rec, 0, sizeof(*rec));
386 return rec; 389 return rec;
387 } 390 }
@@ -413,8 +416,8 @@ ftrace_record_ip(unsigned long ip)
413 return NULL; 416 return NULL;
414 417
415 rec->ip = ip; 418 rec->ip = ip;
416 419 rec->newlist = ftrace_new_addrs;
417 list_add(&rec->list, &ftrace_new_addrs); 420 ftrace_new_addrs = rec;
418 421
419 return rec; 422 return rec;
420} 423}
@@ -460,10 +463,10 @@ static void ftrace_bug(int failed, unsigned long ip)
460static int 463static int
461__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 464__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
462{ 465{
463 unsigned long ip, fl;
464 unsigned long ftrace_addr; 466 unsigned long ftrace_addr;
467 unsigned long ip, fl;
465 468
466 ftrace_addr = (unsigned long)ftrace_caller; 469 ftrace_addr = (unsigned long)FTRACE_ADDR;
467 470
468 ip = rec->ip; 471 ip = rec->ip;
469 472
@@ -472,7 +475,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
472 * it is not enabled then do nothing. 475 * it is not enabled then do nothing.
473 * 476 *
474 * If this record is not to be traced and 477 * If this record is not to be traced and
475 * it is enabled then disabled it. 478 * it is enabled then disable it.
476 * 479 *
477 */ 480 */
478 if (rec->flags & FTRACE_FL_NOTRACE) { 481 if (rec->flags & FTRACE_FL_NOTRACE) {
@@ -492,7 +495,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
492 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) 495 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
493 return 0; 496 return 0;
494 497
495 /* Record is not filtered and is not enabled do nothing */ 498 /* Record is not filtered or enabled, do nothing */
496 if (!fl) 499 if (!fl)
497 return 0; 500 return 0;
498 501
@@ -514,7 +517,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
514 517
515 } else { 518 } else {
516 519
517 /* if record is not enabled do nothing */ 520 /* if record is not enabled, do nothing */
518 if (!(rec->flags & FTRACE_FL_ENABLED)) 521 if (!(rec->flags & FTRACE_FL_ENABLED))
519 return 0; 522 return 0;
520 523
@@ -530,41 +533,41 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
530 533
531static void ftrace_replace_code(int enable) 534static void ftrace_replace_code(int enable)
532{ 535{
533 int i, failed;
534 struct dyn_ftrace *rec; 536 struct dyn_ftrace *rec;
535 struct ftrace_page *pg; 537 struct ftrace_page *pg;
538 int failed;
536 539
537 for (pg = ftrace_pages_start; pg; pg = pg->next) { 540 do_for_each_ftrace_rec(pg, rec) {
538 for (i = 0; i < pg->index; i++) { 541 /*
539 rec = &pg->records[i]; 542 * Skip over free records, records that have
540 543 * failed and not converted.
541 /* 544 */
542 * Skip over free records and records that have 545 if (rec->flags & FTRACE_FL_FREE ||
543 * failed. 546 rec->flags & FTRACE_FL_FAILED ||
544 */ 547 !(rec->flags & FTRACE_FL_CONVERTED))
545 if (rec->flags & FTRACE_FL_FREE || 548 continue;
546 rec->flags & FTRACE_FL_FAILED)
547 continue;
548 549
549 /* ignore updates to this record's mcount site */ 550 /* ignore updates to this record's mcount site */
550 if (get_kprobe((void *)rec->ip)) { 551 if (get_kprobe((void *)rec->ip)) {
551 freeze_record(rec); 552 freeze_record(rec);
552 continue; 553 continue;
553 } else { 554 } else {
554 unfreeze_record(rec); 555 unfreeze_record(rec);
555 } 556 }
556 557
557 failed = __ftrace_replace_code(rec, enable); 558 failed = __ftrace_replace_code(rec, enable);
558 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { 559 if (failed) {
559 rec->flags |= FTRACE_FL_FAILED; 560 rec->flags |= FTRACE_FL_FAILED;
560 if ((system_state == SYSTEM_BOOTING) || 561 if ((system_state == SYSTEM_BOOTING) ||
561 !core_kernel_text(rec->ip)) { 562 !core_kernel_text(rec->ip)) {
562 ftrace_free_rec(rec); 563 ftrace_free_rec(rec);
563 } else 564 } else {
564 ftrace_bug(failed, rec->ip); 565 ftrace_bug(failed, rec->ip);
565 } 566 /* Stop processing */
567 return;
568 }
566 } 569 }
567 } 570 } while_for_each_ftrace_rec();
568} 571}
569 572
570static int 573static int
@@ -575,7 +578,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
575 578
576 ip = rec->ip; 579 ip = rec->ip;
577 580
578 ret = ftrace_make_nop(mod, rec, mcount_addr); 581 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
579 if (ret) { 582 if (ret) {
580 ftrace_bug(ret, ip); 583 ftrace_bug(ret, ip);
581 rec->flags |= FTRACE_FL_FAILED; 584 rec->flags |= FTRACE_FL_FAILED;
@@ -584,6 +587,24 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
584 return 1; 587 return 1;
585} 588}
586 589
590/*
591 * archs can override this function if they must do something
592 * before the modifying code is performed.
593 */
594int __weak ftrace_arch_code_modify_prepare(void)
595{
596 return 0;
597}
598
599/*
600 * archs can override this function if they must do something
601 * after the modifying code is performed.
602 */
603int __weak ftrace_arch_code_modify_post_process(void)
604{
605 return 0;
606}
607
587static int __ftrace_modify_code(void *data) 608static int __ftrace_modify_code(void *data)
588{ 609{
589 int *command = data; 610 int *command = data;
@@ -606,7 +627,17 @@ static int __ftrace_modify_code(void *data)
606 627
607static void ftrace_run_update_code(int command) 628static void ftrace_run_update_code(int command)
608{ 629{
630 int ret;
631
632 ret = ftrace_arch_code_modify_prepare();
633 FTRACE_WARN_ON(ret);
634 if (ret)
635 return;
636
609 stop_machine(__ftrace_modify_code, &command, NULL); 637 stop_machine(__ftrace_modify_code, &command, NULL);
638
639 ret = ftrace_arch_code_modify_post_process();
640 FTRACE_WARN_ON(ret);
610} 641}
611 642
612static ftrace_func_t saved_ftrace_func; 643static ftrace_func_t saved_ftrace_func;
@@ -630,13 +661,10 @@ static void ftrace_startup(int command)
630 if (unlikely(ftrace_disabled)) 661 if (unlikely(ftrace_disabled))
631 return; 662 return;
632 663
633 mutex_lock(&ftrace_start_lock);
634 ftrace_start_up++; 664 ftrace_start_up++;
635 command |= FTRACE_ENABLE_CALLS; 665 command |= FTRACE_ENABLE_CALLS;
636 666
637 ftrace_startup_enable(command); 667 ftrace_startup_enable(command);
638
639 mutex_unlock(&ftrace_start_lock);
640} 668}
641 669
642static void ftrace_shutdown(int command) 670static void ftrace_shutdown(int command)
@@ -644,7 +672,6 @@ static void ftrace_shutdown(int command)
644 if (unlikely(ftrace_disabled)) 672 if (unlikely(ftrace_disabled))
645 return; 673 return;
646 674
647 mutex_lock(&ftrace_start_lock);
648 ftrace_start_up--; 675 ftrace_start_up--;
649 if (!ftrace_start_up) 676 if (!ftrace_start_up)
650 command |= FTRACE_DISABLE_CALLS; 677 command |= FTRACE_DISABLE_CALLS;
@@ -655,11 +682,9 @@ static void ftrace_shutdown(int command)
655 } 682 }
656 683
657 if (!command || !ftrace_enabled) 684 if (!command || !ftrace_enabled)
658 goto out; 685 return;
659 686
660 ftrace_run_update_code(command); 687 ftrace_run_update_code(command);
661 out:
662 mutex_unlock(&ftrace_start_lock);
663} 688}
664 689
665static void ftrace_startup_sysctl(void) 690static void ftrace_startup_sysctl(void)
@@ -669,7 +694,6 @@ static void ftrace_startup_sysctl(void)
669 if (unlikely(ftrace_disabled)) 694 if (unlikely(ftrace_disabled))
670 return; 695 return;
671 696
672 mutex_lock(&ftrace_start_lock);
673 /* Force update next time */ 697 /* Force update next time */
674 saved_ftrace_func = NULL; 698 saved_ftrace_func = NULL;
675 /* ftrace_start_up is true if we want ftrace running */ 699 /* ftrace_start_up is true if we want ftrace running */
@@ -677,7 +701,6 @@ static void ftrace_startup_sysctl(void)
677 command |= FTRACE_ENABLE_CALLS; 701 command |= FTRACE_ENABLE_CALLS;
678 702
679 ftrace_run_update_code(command); 703 ftrace_run_update_code(command);
680 mutex_unlock(&ftrace_start_lock);
681} 704}
682 705
683static void ftrace_shutdown_sysctl(void) 706static void ftrace_shutdown_sysctl(void)
@@ -687,13 +710,11 @@ static void ftrace_shutdown_sysctl(void)
687 if (unlikely(ftrace_disabled)) 710 if (unlikely(ftrace_disabled))
688 return; 711 return;
689 712
690 mutex_lock(&ftrace_start_lock);
691 /* ftrace_start_up is true if ftrace is running */ 713 /* ftrace_start_up is true if ftrace is running */
692 if (ftrace_start_up) 714 if (ftrace_start_up)
693 command |= FTRACE_DISABLE_CALLS; 715 command |= FTRACE_DISABLE_CALLS;
694 716
695 ftrace_run_update_code(command); 717 ftrace_run_update_code(command);
696 mutex_unlock(&ftrace_start_lock);
697} 718}
698 719
699static cycle_t ftrace_update_time; 720static cycle_t ftrace_update_time;
@@ -702,19 +723,21 @@ unsigned long ftrace_update_tot_cnt;
702 723
703static int ftrace_update_code(struct module *mod) 724static int ftrace_update_code(struct module *mod)
704{ 725{
705 struct dyn_ftrace *p, *t; 726 struct dyn_ftrace *p;
706 cycle_t start, stop; 727 cycle_t start, stop;
707 728
708 start = ftrace_now(raw_smp_processor_id()); 729 start = ftrace_now(raw_smp_processor_id());
709 ftrace_update_cnt = 0; 730 ftrace_update_cnt = 0;
710 731
711 list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) { 732 while (ftrace_new_addrs) {
712 733
713 /* If something went wrong, bail without enabling anything */ 734 /* If something went wrong, bail without enabling anything */
714 if (unlikely(ftrace_disabled)) 735 if (unlikely(ftrace_disabled))
715 return -1; 736 return -1;
716 737
717 list_del_init(&p->list); 738 p = ftrace_new_addrs;
739 ftrace_new_addrs = p->newlist;
740 p->flags = 0L;
718 741
719 /* convert record (i.e, patch mcount-call with NOP) */ 742 /* convert record (i.e, patch mcount-call with NOP) */
720 if (ftrace_code_disable(mod, p)) { 743 if (ftrace_code_disable(mod, p)) {
@@ -780,13 +803,16 @@ enum {
780 FTRACE_ITER_CONT = (1 << 1), 803 FTRACE_ITER_CONT = (1 << 1),
781 FTRACE_ITER_NOTRACE = (1 << 2), 804 FTRACE_ITER_NOTRACE = (1 << 2),
782 FTRACE_ITER_FAILURES = (1 << 3), 805 FTRACE_ITER_FAILURES = (1 << 3),
806 FTRACE_ITER_PRINTALL = (1 << 4),
807 FTRACE_ITER_HASH = (1 << 5),
783}; 808};
784 809
785#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 810#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
786 811
787struct ftrace_iterator { 812struct ftrace_iterator {
788 struct ftrace_page *pg; 813 struct ftrace_page *pg;
789 unsigned idx; 814 int hidx;
815 int idx;
790 unsigned flags; 816 unsigned flags;
791 unsigned char buffer[FTRACE_BUFF_MAX+1]; 817 unsigned char buffer[FTRACE_BUFF_MAX+1];
792 unsigned buffer_idx; 818 unsigned buffer_idx;
@@ -794,15 +820,89 @@ struct ftrace_iterator {
794}; 820};
795 821
796static void * 822static void *
823t_hash_next(struct seq_file *m, void *v, loff_t *pos)
824{
825 struct ftrace_iterator *iter = m->private;
826 struct hlist_node *hnd = v;
827 struct hlist_head *hhd;
828
829 WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
830
831 (*pos)++;
832
833 retry:
834 if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
835 return NULL;
836
837 hhd = &ftrace_func_hash[iter->hidx];
838
839 if (hlist_empty(hhd)) {
840 iter->hidx++;
841 hnd = NULL;
842 goto retry;
843 }
844
845 if (!hnd)
846 hnd = hhd->first;
847 else {
848 hnd = hnd->next;
849 if (!hnd) {
850 iter->hidx++;
851 goto retry;
852 }
853 }
854
855 return hnd;
856}
857
858static void *t_hash_start(struct seq_file *m, loff_t *pos)
859{
860 struct ftrace_iterator *iter = m->private;
861 void *p = NULL;
862
863 iter->flags |= FTRACE_ITER_HASH;
864
865 return t_hash_next(m, p, pos);
866}
867
868static int t_hash_show(struct seq_file *m, void *v)
869{
870 struct ftrace_func_probe *rec;
871 struct hlist_node *hnd = v;
872 char str[KSYM_SYMBOL_LEN];
873
874 rec = hlist_entry(hnd, struct ftrace_func_probe, node);
875
876 if (rec->ops->print)
877 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
878
879 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
880 seq_printf(m, "%s:", str);
881
882 kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
883 seq_printf(m, "%s", str);
884
885 if (rec->data)
886 seq_printf(m, ":%p", rec->data);
887 seq_putc(m, '\n');
888
889 return 0;
890}
891
892static void *
797t_next(struct seq_file *m, void *v, loff_t *pos) 893t_next(struct seq_file *m, void *v, loff_t *pos)
798{ 894{
799 struct ftrace_iterator *iter = m->private; 895 struct ftrace_iterator *iter = m->private;
800 struct dyn_ftrace *rec = NULL; 896 struct dyn_ftrace *rec = NULL;
801 897
898 if (iter->flags & FTRACE_ITER_HASH)
899 return t_hash_next(m, v, pos);
900
802 (*pos)++; 901 (*pos)++;
803 902
804 /* should not be called from interrupt context */ 903 if (iter->flags & FTRACE_ITER_PRINTALL)
805 spin_lock(&ftrace_lock); 904 return NULL;
905
806 retry: 906 retry:
807 if (iter->idx >= iter->pg->index) { 907 if (iter->idx >= iter->pg->index) {
808 if (iter->pg->next) { 908 if (iter->pg->next) {
@@ -831,7 +931,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
831 goto retry; 931 goto retry;
832 } 932 }
833 } 933 }
834 spin_unlock(&ftrace_lock);
835 934
836 return rec; 935 return rec;
837} 936}
@@ -841,6 +940,23 @@ static void *t_start(struct seq_file *m, loff_t *pos)
841 struct ftrace_iterator *iter = m->private; 940 struct ftrace_iterator *iter = m->private;
842 void *p = NULL; 941 void *p = NULL;
843 942
943 mutex_lock(&ftrace_lock);
944 /*
945 * For set_ftrace_filter reading, if we have the filter
946 * off, we can short cut and just print out that all
947 * functions are enabled.
948 */
949 if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) {
950 if (*pos > 0)
951 return t_hash_start(m, pos);
952 iter->flags |= FTRACE_ITER_PRINTALL;
953 (*pos)++;
954 return iter;
955 }
956
957 if (iter->flags & FTRACE_ITER_HASH)
958 return t_hash_start(m, pos);
959
844 if (*pos > 0) { 960 if (*pos > 0) {
845 if (iter->idx < 0) 961 if (iter->idx < 0)
846 return p; 962 return p;
@@ -850,18 +966,31 @@ static void *t_start(struct seq_file *m, loff_t *pos)
850 966
851 p = t_next(m, p, pos); 967 p = t_next(m, p, pos);
852 968
969 if (!p)
970 return t_hash_start(m, pos);
971
853 return p; 972 return p;
854} 973}
855 974
856static void t_stop(struct seq_file *m, void *p) 975static void t_stop(struct seq_file *m, void *p)
857{ 976{
977 mutex_unlock(&ftrace_lock);
858} 978}
859 979
860static int t_show(struct seq_file *m, void *v) 980static int t_show(struct seq_file *m, void *v)
861{ 981{
982 struct ftrace_iterator *iter = m->private;
862 struct dyn_ftrace *rec = v; 983 struct dyn_ftrace *rec = v;
863 char str[KSYM_SYMBOL_LEN]; 984 char str[KSYM_SYMBOL_LEN];
864 985
986 if (iter->flags & FTRACE_ITER_HASH)
987 return t_hash_show(m, v);
988
989 if (iter->flags & FTRACE_ITER_PRINTALL) {
990 seq_printf(m, "#### all functions enabled ####\n");
991 return 0;
992 }
993
865 if (!rec) 994 if (!rec)
866 return 0; 995 return 0;
867 996
@@ -940,23 +1069,16 @@ static void ftrace_filter_reset(int enable)
940 struct ftrace_page *pg; 1069 struct ftrace_page *pg;
941 struct dyn_ftrace *rec; 1070 struct dyn_ftrace *rec;
942 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1071 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
943 unsigned i;
944 1072
945 /* should not be called from interrupt context */ 1073 mutex_lock(&ftrace_lock);
946 spin_lock(&ftrace_lock);
947 if (enable) 1074 if (enable)
948 ftrace_filtered = 0; 1075 ftrace_filtered = 0;
949 pg = ftrace_pages_start; 1076 do_for_each_ftrace_rec(pg, rec) {
950 while (pg) { 1077 if (rec->flags & FTRACE_FL_FAILED)
951 for (i = 0; i < pg->index; i++) { 1078 continue;
952 rec = &pg->records[i]; 1079 rec->flags &= ~type;
953 if (rec->flags & FTRACE_FL_FAILED) 1080 } while_for_each_ftrace_rec();
954 continue; 1081 mutex_unlock(&ftrace_lock);
955 rec->flags &= ~type;
956 }
957 pg = pg->next;
958 }
959 spin_unlock(&ftrace_lock);
960} 1082}
961 1083
962static int 1084static int
@@ -1007,16 +1129,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
1007 return ftrace_regex_open(inode, file, 0); 1129 return ftrace_regex_open(inode, file, 0);
1008} 1130}
1009 1131
1010static ssize_t
1011ftrace_regex_read(struct file *file, char __user *ubuf,
1012 size_t cnt, loff_t *ppos)
1013{
1014 if (file->f_mode & FMODE_READ)
1015 return seq_read(file, ubuf, cnt, ppos);
1016 else
1017 return -EPERM;
1018}
1019
1020static loff_t 1132static loff_t
1021ftrace_regex_lseek(struct file *file, loff_t offset, int origin) 1133ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
1022{ 1134{
@@ -1037,86 +1149,536 @@ enum {
1037 MATCH_END_ONLY, 1149 MATCH_END_ONLY,
1038}; 1150};
1039 1151
1040static void 1152/*
1041ftrace_match(unsigned char *buff, int len, int enable) 1153 * (static function - no need for kernel doc)
1154 *
1155 * Pass in a buffer containing a glob and this function will
1156 * set search to point to the search part of the buffer and
1157 * return the type of search it is (see enum above).
1158 * This does modify buff.
1159 *
1160 * Returns enum type.
1161 * search returns the pointer to use for comparison.
1162 * not returns 1 if buff started with a '!'
1163 * 0 otherwise.
1164 */
1165static int
1166ftrace_setup_glob(char *buff, int len, char **search, int *not)
1042{ 1167{
1043 char str[KSYM_SYMBOL_LEN];
1044 char *search = NULL;
1045 struct ftrace_page *pg;
1046 struct dyn_ftrace *rec;
1047 int type = MATCH_FULL; 1168 int type = MATCH_FULL;
1048 unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1169 int i;
1049 unsigned i, match = 0, search_len = 0;
1050 int not = 0;
1051 1170
1052 if (buff[0] == '!') { 1171 if (buff[0] == '!') {
1053 not = 1; 1172 *not = 1;
1054 buff++; 1173 buff++;
1055 len--; 1174 len--;
1056 } 1175 } else
1176 *not = 0;
1177
1178 *search = buff;
1057 1179
1058 for (i = 0; i < len; i++) { 1180 for (i = 0; i < len; i++) {
1059 if (buff[i] == '*') { 1181 if (buff[i] == '*') {
1060 if (!i) { 1182 if (!i) {
1061 search = buff + i + 1; 1183 *search = buff + 1;
1062 type = MATCH_END_ONLY; 1184 type = MATCH_END_ONLY;
1063 search_len = len - (i + 1);
1064 } else { 1185 } else {
1065 if (type == MATCH_END_ONLY) { 1186 if (type == MATCH_END_ONLY)
1066 type = MATCH_MIDDLE_ONLY; 1187 type = MATCH_MIDDLE_ONLY;
1067 } else { 1188 else
1068 match = i;
1069 type = MATCH_FRONT_ONLY; 1189 type = MATCH_FRONT_ONLY;
1070 }
1071 buff[i] = 0; 1190 buff[i] = 0;
1072 break; 1191 break;
1073 } 1192 }
1074 } 1193 }
1075 } 1194 }
1076 1195
1077 /* should not be called from interrupt context */ 1196 return type;
1078 spin_lock(&ftrace_lock); 1197}
1079 if (enable) 1198
1080 ftrace_filtered = 1; 1199static int ftrace_match(char *str, char *regex, int len, int type)
1081 pg = ftrace_pages_start; 1200{
1082 while (pg) { 1201 int matched = 0;
1083 for (i = 0; i < pg->index; i++) { 1202 char *ptr;
1084 int matched = 0; 1203
1085 char *ptr; 1204 switch (type) {
1086 1205 case MATCH_FULL:
1087 rec = &pg->records[i]; 1206 if (strcmp(str, regex) == 0)
1088 if (rec->flags & FTRACE_FL_FAILED) 1207 matched = 1;
1208 break;
1209 case MATCH_FRONT_ONLY:
1210 if (strncmp(str, regex, len) == 0)
1211 matched = 1;
1212 break;
1213 case MATCH_MIDDLE_ONLY:
1214 if (strstr(str, regex))
1215 matched = 1;
1216 break;
1217 case MATCH_END_ONLY:
1218 ptr = strstr(str, regex);
1219 if (ptr && (ptr[len] == 0))
1220 matched = 1;
1221 break;
1222 }
1223
1224 return matched;
1225}
1226
1227static int
1228ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
1229{
1230 char str[KSYM_SYMBOL_LEN];
1231
1232 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1233 return ftrace_match(str, regex, len, type);
1234}
1235
1236static void ftrace_match_records(char *buff, int len, int enable)
1237{
1238 unsigned int search_len;
1239 struct ftrace_page *pg;
1240 struct dyn_ftrace *rec;
1241 unsigned long flag;
1242 char *search;
1243 int type;
1244 int not;
1245
1246 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1247 type = ftrace_setup_glob(buff, len, &search, &not);
1248
1249 search_len = strlen(search);
1250
1251 mutex_lock(&ftrace_lock);
1252 do_for_each_ftrace_rec(pg, rec) {
1253
1254 if (rec->flags & FTRACE_FL_FAILED)
1255 continue;
1256
1257 if (ftrace_match_record(rec, search, search_len, type)) {
1258 if (not)
1259 rec->flags &= ~flag;
1260 else
1261 rec->flags |= flag;
1262 }
1263 /*
1264 * Only enable filtering if we have a function that
1265 * is filtered on.
1266 */
1267 if (enable && (rec->flags & FTRACE_FL_FILTER))
1268 ftrace_filtered = 1;
1269 } while_for_each_ftrace_rec();
1270 mutex_unlock(&ftrace_lock);
1271}
1272
1273static int
1274ftrace_match_module_record(struct dyn_ftrace *rec, char *mod,
1275 char *regex, int len, int type)
1276{
1277 char str[KSYM_SYMBOL_LEN];
1278 char *modname;
1279
1280 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
1281
1282 if (!modname || strcmp(modname, mod))
1283 return 0;
1284
1285 /* blank search means to match all funcs in the mod */
1286 if (len)
1287 return ftrace_match(str, regex, len, type);
1288 else
1289 return 1;
1290}
1291
1292static void ftrace_match_module_records(char *buff, char *mod, int enable)
1293{
1294 unsigned search_len = 0;
1295 struct ftrace_page *pg;
1296 struct dyn_ftrace *rec;
1297 int type = MATCH_FULL;
1298 char *search = buff;
1299 unsigned long flag;
1300 int not = 0;
1301
1302 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1303
1304 /* blank or '*' mean the same */
1305 if (strcmp(buff, "*") == 0)
1306 buff[0] = 0;
1307
1308 /* handle the case of 'dont filter this module' */
1309 if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) {
1310 buff[0] = 0;
1311 not = 1;
1312 }
1313
1314 if (strlen(buff)) {
1315 type = ftrace_setup_glob(buff, strlen(buff), &search, &not);
1316 search_len = strlen(search);
1317 }
1318
1319 mutex_lock(&ftrace_lock);
1320 do_for_each_ftrace_rec(pg, rec) {
1321
1322 if (rec->flags & FTRACE_FL_FAILED)
1323 continue;
1324
1325 if (ftrace_match_module_record(rec, mod,
1326 search, search_len, type)) {
1327 if (not)
1328 rec->flags &= ~flag;
1329 else
1330 rec->flags |= flag;
1331 }
1332 if (enable && (rec->flags & FTRACE_FL_FILTER))
1333 ftrace_filtered = 1;
1334
1335 } while_for_each_ftrace_rec();
1336 mutex_unlock(&ftrace_lock);
1337}
1338
1339/*
1340 * We register the module command as a template to show others how
1341 * to register the a command as well.
1342 */
1343
1344static int
1345ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1346{
1347 char *mod;
1348
1349 /*
1350 * cmd == 'mod' because we only registered this func
1351 * for the 'mod' ftrace_func_command.
1352 * But if you register one func with multiple commands,
1353 * you can tell which command was used by the cmd
1354 * parameter.
1355 */
1356
1357 /* we must have a module name */
1358 if (!param)
1359 return -EINVAL;
1360
1361 mod = strsep(&param, ":");
1362 if (!strlen(mod))
1363 return -EINVAL;
1364
1365 ftrace_match_module_records(func, mod, enable);
1366 return 0;
1367}
1368
1369static struct ftrace_func_command ftrace_mod_cmd = {
1370 .name = "mod",
1371 .func = ftrace_mod_callback,
1372};
1373
1374static int __init ftrace_mod_cmd_init(void)
1375{
1376 return register_ftrace_command(&ftrace_mod_cmd);
1377}
1378device_initcall(ftrace_mod_cmd_init);
1379
1380static void
1381function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1382{
1383 struct ftrace_func_probe *entry;
1384 struct hlist_head *hhd;
1385 struct hlist_node *n;
1386 unsigned long key;
1387 int resched;
1388
1389 key = hash_long(ip, FTRACE_HASH_BITS);
1390
1391 hhd = &ftrace_func_hash[key];
1392
1393 if (hlist_empty(hhd))
1394 return;
1395
1396 /*
1397 * Disable preemption for these calls to prevent a RCU grace
1398 * period. This syncs the hash iteration and freeing of items
1399 * on the hash. rcu_read_lock is too dangerous here.
1400 */
1401 resched = ftrace_preempt_disable();
1402 hlist_for_each_entry_rcu(entry, n, hhd, node) {
1403 if (entry->ip == ip)
1404 entry->ops->func(ip, parent_ip, &entry->data);
1405 }
1406 ftrace_preempt_enable(resched);
1407}
1408
1409static struct ftrace_ops trace_probe_ops __read_mostly =
1410{
1411 .func = function_trace_probe_call,
1412};
1413
1414static int ftrace_probe_registered;
1415
1416static void __enable_ftrace_function_probe(void)
1417{
1418 int i;
1419
1420 if (ftrace_probe_registered)
1421 return;
1422
1423 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
1424 struct hlist_head *hhd = &ftrace_func_hash[i];
1425 if (hhd->first)
1426 break;
1427 }
1428 /* Nothing registered? */
1429 if (i == FTRACE_FUNC_HASHSIZE)
1430 return;
1431
1432 __register_ftrace_function(&trace_probe_ops);
1433 ftrace_startup(0);
1434 ftrace_probe_registered = 1;
1435}
1436
1437static void __disable_ftrace_function_probe(void)
1438{
1439 int i;
1440
1441 if (!ftrace_probe_registered)
1442 return;
1443
1444 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
1445 struct hlist_head *hhd = &ftrace_func_hash[i];
1446 if (hhd->first)
1447 return;
1448 }
1449
1450 /* no more funcs left */
1451 __unregister_ftrace_function(&trace_probe_ops);
1452 ftrace_shutdown(0);
1453 ftrace_probe_registered = 0;
1454}
1455
1456
1457static void ftrace_free_entry_rcu(struct rcu_head *rhp)
1458{
1459 struct ftrace_func_probe *entry =
1460 container_of(rhp, struct ftrace_func_probe, rcu);
1461
1462 if (entry->ops->free)
1463 entry->ops->free(&entry->data);
1464 kfree(entry);
1465}
1466
1467
1468int
1469register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1470 void *data)
1471{
1472 struct ftrace_func_probe *entry;
1473 struct ftrace_page *pg;
1474 struct dyn_ftrace *rec;
1475 int type, len, not;
1476 unsigned long key;
1477 int count = 0;
1478 char *search;
1479
1480 type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
1481 len = strlen(search);
1482
1483 /* we do not support '!' for function probes */
1484 if (WARN_ON(not))
1485 return -EINVAL;
1486
1487 mutex_lock(&ftrace_lock);
1488 do_for_each_ftrace_rec(pg, rec) {
1489
1490 if (rec->flags & FTRACE_FL_FAILED)
1491 continue;
1492
1493 if (!ftrace_match_record(rec, search, len, type))
1494 continue;
1495
1496 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
1497 if (!entry) {
1498 /* If we did not process any, then return error */
1499 if (!count)
1500 count = -ENOMEM;
1501 goto out_unlock;
1502 }
1503
1504 count++;
1505
1506 entry->data = data;
1507
1508 /*
1509 * The caller might want to do something special
1510 * for each function we find. We call the callback
1511 * to give the caller an opportunity to do so.
1512 */
1513 if (ops->callback) {
1514 if (ops->callback(rec->ip, &entry->data) < 0) {
1515 /* caller does not like this func */
1516 kfree(entry);
1089 continue; 1517 continue;
1090 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1091 switch (type) {
1092 case MATCH_FULL:
1093 if (strcmp(str, buff) == 0)
1094 matched = 1;
1095 break;
1096 case MATCH_FRONT_ONLY:
1097 if (memcmp(str, buff, match) == 0)
1098 matched = 1;
1099 break;
1100 case MATCH_MIDDLE_ONLY:
1101 if (strstr(str, search))
1102 matched = 1;
1103 break;
1104 case MATCH_END_ONLY:
1105 ptr = strstr(str, search);
1106 if (ptr && (ptr[search_len] == 0))
1107 matched = 1;
1108 break;
1109 } 1518 }
1110 if (matched) { 1519 }
1111 if (not) 1520
1112 rec->flags &= ~flag; 1521 entry->ops = ops;
1113 else 1522 entry->ip = rec->ip;
1114 rec->flags |= flag; 1523
1524 key = hash_long(entry->ip, FTRACE_HASH_BITS);
1525 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
1526
1527 } while_for_each_ftrace_rec();
1528 __enable_ftrace_function_probe();
1529
1530 out_unlock:
1531 mutex_unlock(&ftrace_lock);
1532
1533 return count;
1534}
1535
1536enum {
1537 PROBE_TEST_FUNC = 1,
1538 PROBE_TEST_DATA = 2
1539};
1540
1541static void
1542__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1543 void *data, int flags)
1544{
1545 struct ftrace_func_probe *entry;
1546 struct hlist_node *n, *tmp;
1547 char str[KSYM_SYMBOL_LEN];
1548 int type = MATCH_FULL;
1549 int i, len = 0;
1550 char *search;
1551
1552 if (glob && (strcmp(glob, "*") || !strlen(glob)))
1553 glob = NULL;
1554 else {
1555 int not;
1556
1557 type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
1558 len = strlen(search);
1559
1560 /* we do not support '!' for function probes */
1561 if (WARN_ON(not))
1562 return;
1563 }
1564
1565 mutex_lock(&ftrace_lock);
1566 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
1567 struct hlist_head *hhd = &ftrace_func_hash[i];
1568
1569 hlist_for_each_entry_safe(entry, n, tmp, hhd, node) {
1570
1571 /* break up if statements for readability */
1572 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
1573 continue;
1574
1575 if ((flags & PROBE_TEST_DATA) && entry->data != data)
1576 continue;
1577
1578 /* do this last, since it is the most expensive */
1579 if (glob) {
1580 kallsyms_lookup(entry->ip, NULL, NULL,
1581 NULL, str);
1582 if (!ftrace_match(str, glob, len, type))
1583 continue;
1115 } 1584 }
1585
1586 hlist_del(&entry->node);
1587 call_rcu(&entry->rcu, ftrace_free_entry_rcu);
1116 } 1588 }
1117 pg = pg->next;
1118 } 1589 }
1119 spin_unlock(&ftrace_lock); 1590 __disable_ftrace_function_probe();
1591 mutex_unlock(&ftrace_lock);
1592}
1593
1594void
1595unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1596 void *data)
1597{
1598 __unregister_ftrace_function_probe(glob, ops, data,
1599 PROBE_TEST_FUNC | PROBE_TEST_DATA);
1600}
1601
1602void
1603unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops)
1604{
1605 __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC);
1606}
1607
1608void unregister_ftrace_function_probe_all(char *glob)
1609{
1610 __unregister_ftrace_function_probe(glob, NULL, NULL, 0);
1611}
1612
1613static LIST_HEAD(ftrace_commands);
1614static DEFINE_MUTEX(ftrace_cmd_mutex);
1615
1616int register_ftrace_command(struct ftrace_func_command *cmd)
1617{
1618 struct ftrace_func_command *p;
1619 int ret = 0;
1620
1621 mutex_lock(&ftrace_cmd_mutex);
1622 list_for_each_entry(p, &ftrace_commands, list) {
1623 if (strcmp(cmd->name, p->name) == 0) {
1624 ret = -EBUSY;
1625 goto out_unlock;
1626 }
1627 }
1628 list_add(&cmd->list, &ftrace_commands);
1629 out_unlock:
1630 mutex_unlock(&ftrace_cmd_mutex);
1631
1632 return ret;
1633}
1634
1635int unregister_ftrace_command(struct ftrace_func_command *cmd)
1636{
1637 struct ftrace_func_command *p, *n;
1638 int ret = -ENODEV;
1639
1640 mutex_lock(&ftrace_cmd_mutex);
1641 list_for_each_entry_safe(p, n, &ftrace_commands, list) {
1642 if (strcmp(cmd->name, p->name) == 0) {
1643 ret = 0;
1644 list_del_init(&p->list);
1645 goto out_unlock;
1646 }
1647 }
1648 out_unlock:
1649 mutex_unlock(&ftrace_cmd_mutex);
1650
1651 return ret;
1652}
1653
1654static int ftrace_process_regex(char *buff, int len, int enable)
1655{
1656 char *func, *command, *next = buff;
1657 struct ftrace_func_command *p;
1658 int ret = -EINVAL;
1659
1660 func = strsep(&next, ":");
1661
1662 if (!next) {
1663 ftrace_match_records(func, len, enable);
1664 return 0;
1665 }
1666
1667 /* command found */
1668
1669 command = strsep(&next, ":");
1670
1671 mutex_lock(&ftrace_cmd_mutex);
1672 list_for_each_entry(p, &ftrace_commands, list) {
1673 if (strcmp(p->name, command) == 0) {
1674 ret = p->func(func, command, next, enable);
1675 goto out_unlock;
1676 }
1677 }
1678 out_unlock:
1679 mutex_unlock(&ftrace_cmd_mutex);
1680
1681 return ret;
1120} 1682}
1121 1683
1122static ssize_t 1684static ssize_t
@@ -1186,7 +1748,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
1186 if (isspace(ch)) { 1748 if (isspace(ch)) {
1187 iter->filtered++; 1749 iter->filtered++;
1188 iter->buffer[iter->buffer_idx] = 0; 1750 iter->buffer[iter->buffer_idx] = 0;
1189 ftrace_match(iter->buffer, iter->buffer_idx, enable); 1751 ret = ftrace_process_regex(iter->buffer,
1752 iter->buffer_idx, enable);
1753 if (ret)
1754 goto out;
1190 iter->buffer_idx = 0; 1755 iter->buffer_idx = 0;
1191 } else 1756 } else
1192 iter->flags |= FTRACE_ITER_CONT; 1757 iter->flags |= FTRACE_ITER_CONT;
@@ -1225,7 +1790,7 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
1225 if (reset) 1790 if (reset)
1226 ftrace_filter_reset(enable); 1791 ftrace_filter_reset(enable);
1227 if (buf) 1792 if (buf)
1228 ftrace_match(buf, len, enable); 1793 ftrace_match_records(buf, len, enable);
1229 mutex_unlock(&ftrace_regex_lock); 1794 mutex_unlock(&ftrace_regex_lock);
1230} 1795}
1231 1796
@@ -1275,15 +1840,13 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
1275 if (iter->buffer_idx) { 1840 if (iter->buffer_idx) {
1276 iter->filtered++; 1841 iter->filtered++;
1277 iter->buffer[iter->buffer_idx] = 0; 1842 iter->buffer[iter->buffer_idx] = 0;
1278 ftrace_match(iter->buffer, iter->buffer_idx, enable); 1843 ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
1279 } 1844 }
1280 1845
1281 mutex_lock(&ftrace_sysctl_lock); 1846 mutex_lock(&ftrace_lock);
1282 mutex_lock(&ftrace_start_lock);
1283 if (ftrace_start_up && ftrace_enabled) 1847 if (ftrace_start_up && ftrace_enabled)
1284 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 1848 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1285 mutex_unlock(&ftrace_start_lock); 1849 mutex_unlock(&ftrace_lock);
1286 mutex_unlock(&ftrace_sysctl_lock);
1287 1850
1288 kfree(iter); 1851 kfree(iter);
1289 mutex_unlock(&ftrace_regex_lock); 1852 mutex_unlock(&ftrace_regex_lock);
@@ -1302,31 +1865,31 @@ ftrace_notrace_release(struct inode *inode, struct file *file)
1302 return ftrace_regex_release(inode, file, 0); 1865 return ftrace_regex_release(inode, file, 0);
1303} 1866}
1304 1867
1305static struct file_operations ftrace_avail_fops = { 1868static const struct file_operations ftrace_avail_fops = {
1306 .open = ftrace_avail_open, 1869 .open = ftrace_avail_open,
1307 .read = seq_read, 1870 .read = seq_read,
1308 .llseek = seq_lseek, 1871 .llseek = seq_lseek,
1309 .release = ftrace_avail_release, 1872 .release = ftrace_avail_release,
1310}; 1873};
1311 1874
1312static struct file_operations ftrace_failures_fops = { 1875static const struct file_operations ftrace_failures_fops = {
1313 .open = ftrace_failures_open, 1876 .open = ftrace_failures_open,
1314 .read = seq_read, 1877 .read = seq_read,
1315 .llseek = seq_lseek, 1878 .llseek = seq_lseek,
1316 .release = ftrace_avail_release, 1879 .release = ftrace_avail_release,
1317}; 1880};
1318 1881
1319static struct file_operations ftrace_filter_fops = { 1882static const struct file_operations ftrace_filter_fops = {
1320 .open = ftrace_filter_open, 1883 .open = ftrace_filter_open,
1321 .read = ftrace_regex_read, 1884 .read = seq_read,
1322 .write = ftrace_filter_write, 1885 .write = ftrace_filter_write,
1323 .llseek = ftrace_regex_lseek, 1886 .llseek = ftrace_regex_lseek,
1324 .release = ftrace_filter_release, 1887 .release = ftrace_filter_release,
1325}; 1888};
1326 1889
1327static struct file_operations ftrace_notrace_fops = { 1890static const struct file_operations ftrace_notrace_fops = {
1328 .open = ftrace_notrace_open, 1891 .open = ftrace_notrace_open,
1329 .read = ftrace_regex_read, 1892 .read = seq_read,
1330 .write = ftrace_notrace_write, 1893 .write = ftrace_notrace_write,
1331 .llseek = ftrace_regex_lseek, 1894 .llseek = ftrace_regex_lseek,
1332 .release = ftrace_notrace_release, 1895 .release = ftrace_notrace_release,
@@ -1359,6 +1922,10 @@ static void *g_start(struct seq_file *m, loff_t *pos)
1359 1922
1360 mutex_lock(&graph_lock); 1923 mutex_lock(&graph_lock);
1361 1924
1925 /* Nothing, tell g_show to print all functions are enabled */
1926 if (!ftrace_graph_count && !*pos)
1927 return (void *)1;
1928
1362 p = g_next(m, p, pos); 1929 p = g_next(m, p, pos);
1363 1930
1364 return p; 1931 return p;
@@ -1377,6 +1944,11 @@ static int g_show(struct seq_file *m, void *v)
1377 if (!ptr) 1944 if (!ptr)
1378 return 0; 1945 return 0;
1379 1946
1947 if (ptr == (unsigned long *)1) {
1948 seq_printf(m, "#### all functions enabled ####\n");
1949 return 0;
1950 }
1951
1380 kallsyms_lookup(*ptr, NULL, NULL, NULL, str); 1952 kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
1381 1953
1382 seq_printf(m, "%s\n", str); 1954 seq_printf(m, "%s\n", str);
@@ -1419,53 +1991,53 @@ ftrace_graph_open(struct inode *inode, struct file *file)
1419 return ret; 1991 return ret;
1420} 1992}
1421 1993
1422static ssize_t
1423ftrace_graph_read(struct file *file, char __user *ubuf,
1424 size_t cnt, loff_t *ppos)
1425{
1426 if (file->f_mode & FMODE_READ)
1427 return seq_read(file, ubuf, cnt, ppos);
1428 else
1429 return -EPERM;
1430}
1431
1432static int 1994static int
1433ftrace_set_func(unsigned long *array, int idx, char *buffer) 1995ftrace_set_func(unsigned long *array, int *idx, char *buffer)
1434{ 1996{
1435 char str[KSYM_SYMBOL_LEN];
1436 struct dyn_ftrace *rec; 1997 struct dyn_ftrace *rec;
1437 struct ftrace_page *pg; 1998 struct ftrace_page *pg;
1999 int search_len;
1438 int found = 0; 2000 int found = 0;
1439 int i, j; 2001 int type, not;
2002 char *search;
2003 bool exists;
2004 int i;
1440 2005
1441 if (ftrace_disabled) 2006 if (ftrace_disabled)
1442 return -ENODEV; 2007 return -ENODEV;
1443 2008
1444 /* should not be called from interrupt context */ 2009 /* decode regex */
1445 spin_lock(&ftrace_lock); 2010 type = ftrace_setup_glob(buffer, strlen(buffer), &search, &not);
2011 if (not)
2012 return -EINVAL;
2013
2014 search_len = strlen(search);
1446 2015
1447 for (pg = ftrace_pages_start; pg; pg = pg->next) { 2016 mutex_lock(&ftrace_lock);
1448 for (i = 0; i < pg->index; i++) { 2017 do_for_each_ftrace_rec(pg, rec) {
1449 rec = &pg->records[i];
1450 2018
1451 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 2019 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
1452 continue; 2020 break;
1453 2021
1454 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 2022 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
1455 if (strcmp(str, buffer) == 0) { 2023 continue;
2024
2025 if (ftrace_match_record(rec, search, search_len, type)) {
2026 /* ensure it is not already in the array */
2027 exists = false;
2028 for (i = 0; i < *idx; i++)
2029 if (array[i] == rec->ip) {
2030 exists = true;
2031 break;
2032 }
2033 if (!exists) {
2034 array[(*idx)++] = rec->ip;
1456 found = 1; 2035 found = 1;
1457 for (j = 0; j < idx; j++)
1458 if (array[j] == rec->ip) {
1459 found = 0;
1460 break;
1461 }
1462 if (found)
1463 array[idx] = rec->ip;
1464 break;
1465 } 2036 }
1466 } 2037 }
1467 } 2038 } while_for_each_ftrace_rec();
1468 spin_unlock(&ftrace_lock); 2039
2040 mutex_unlock(&ftrace_lock);
1469 2041
1470 return found ? 0 : -EINVAL; 2042 return found ? 0 : -EINVAL;
1471} 2043}
@@ -1533,13 +2105,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
1533 } 2105 }
1534 buffer[index] = 0; 2106 buffer[index] = 0;
1535 2107
1536 /* we allow only one at a time */ 2108 /* we allow only one expression at a time */
1537 ret = ftrace_set_func(array, ftrace_graph_count, buffer); 2109 ret = ftrace_set_func(array, &ftrace_graph_count, buffer);
1538 if (ret) 2110 if (ret)
1539 goto out; 2111 goto out;
1540 2112
1541 ftrace_graph_count++;
1542
1543 file->f_pos += read; 2113 file->f_pos += read;
1544 2114
1545 ret = read; 2115 ret = read;
@@ -1551,7 +2121,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
1551 2121
1552static const struct file_operations ftrace_graph_fops = { 2122static const struct file_operations ftrace_graph_fops = {
1553 .open = ftrace_graph_open, 2123 .open = ftrace_graph_open,
1554 .read = ftrace_graph_read, 2124 .read = seq_read,
1555 .write = ftrace_graph_write, 2125 .write = ftrace_graph_write,
1556}; 2126};
1557#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2127#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -1603,7 +2173,7 @@ static int ftrace_convert_nops(struct module *mod,
1603 unsigned long addr; 2173 unsigned long addr;
1604 unsigned long flags; 2174 unsigned long flags;
1605 2175
1606 mutex_lock(&ftrace_start_lock); 2176 mutex_lock(&ftrace_lock);
1607 p = start; 2177 p = start;
1608 while (p < end) { 2178 while (p < end) {
1609 addr = ftrace_call_adjust(*p++); 2179 addr = ftrace_call_adjust(*p++);
@@ -1622,7 +2192,7 @@ static int ftrace_convert_nops(struct module *mod,
1622 local_irq_save(flags); 2192 local_irq_save(flags);
1623 ftrace_update_code(mod); 2193 ftrace_update_code(mod);
1624 local_irq_restore(flags); 2194 local_irq_restore(flags);
1625 mutex_unlock(&ftrace_start_lock); 2195 mutex_unlock(&ftrace_lock);
1626 2196
1627 return 0; 2197 return 0;
1628} 2198}
@@ -1699,7 +2269,7 @@ ftrace_pid_read(struct file *file, char __user *ubuf,
1699 if (ftrace_pid_trace == ftrace_swapper_pid) 2269 if (ftrace_pid_trace == ftrace_swapper_pid)
1700 r = sprintf(buf, "swapper tasks\n"); 2270 r = sprintf(buf, "swapper tasks\n");
1701 else if (ftrace_pid_trace) 2271 else if (ftrace_pid_trace)
1702 r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace)); 2272 r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace));
1703 else 2273 else
1704 r = sprintf(buf, "no pid\n"); 2274 r = sprintf(buf, "no pid\n");
1705 2275
@@ -1736,9 +2306,12 @@ static void clear_ftrace_pid(struct pid *pid)
1736{ 2306{
1737 struct task_struct *p; 2307 struct task_struct *p;
1738 2308
2309 rcu_read_lock();
1739 do_each_pid_task(pid, PIDTYPE_PID, p) { 2310 do_each_pid_task(pid, PIDTYPE_PID, p) {
1740 clear_tsk_trace_trace(p); 2311 clear_tsk_trace_trace(p);
1741 } while_each_pid_task(pid, PIDTYPE_PID, p); 2312 } while_each_pid_task(pid, PIDTYPE_PID, p);
2313 rcu_read_unlock();
2314
1742 put_pid(pid); 2315 put_pid(pid);
1743} 2316}
1744 2317
@@ -1746,9 +2319,11 @@ static void set_ftrace_pid(struct pid *pid)
1746{ 2319{
1747 struct task_struct *p; 2320 struct task_struct *p;
1748 2321
2322 rcu_read_lock();
1749 do_each_pid_task(pid, PIDTYPE_PID, p) { 2323 do_each_pid_task(pid, PIDTYPE_PID, p) {
1750 set_tsk_trace_trace(p); 2324 set_tsk_trace_trace(p);
1751 } while_each_pid_task(pid, PIDTYPE_PID, p); 2325 } while_each_pid_task(pid, PIDTYPE_PID, p);
2326 rcu_read_unlock();
1752} 2327}
1753 2328
1754static void clear_ftrace_pid_task(struct pid **pid) 2329static void clear_ftrace_pid_task(struct pid **pid)
@@ -1790,7 +2365,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
1790 if (ret < 0) 2365 if (ret < 0)
1791 return ret; 2366 return ret;
1792 2367
1793 mutex_lock(&ftrace_start_lock); 2368 mutex_lock(&ftrace_lock);
1794 if (val < 0) { 2369 if (val < 0) {
1795 /* disable pid tracing */ 2370 /* disable pid tracing */
1796 if (!ftrace_pid_trace) 2371 if (!ftrace_pid_trace)
@@ -1829,12 +2404,12 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
1829 ftrace_startup_enable(0); 2404 ftrace_startup_enable(0);
1830 2405
1831 out: 2406 out:
1832 mutex_unlock(&ftrace_start_lock); 2407 mutex_unlock(&ftrace_lock);
1833 2408
1834 return cnt; 2409 return cnt;
1835} 2410}
1836 2411
1837static struct file_operations ftrace_pid_fops = { 2412static const struct file_operations ftrace_pid_fops = {
1838 .read = ftrace_pid_read, 2413 .read = ftrace_pid_read,
1839 .write = ftrace_pid_write, 2414 .write = ftrace_pid_write,
1840}; 2415};
@@ -1857,7 +2432,6 @@ static __init int ftrace_init_debugfs(void)
1857 "'set_ftrace_pid' entry\n"); 2432 "'set_ftrace_pid' entry\n");
1858 return 0; 2433 return 0;
1859} 2434}
1860
1861fs_initcall(ftrace_init_debugfs); 2435fs_initcall(ftrace_init_debugfs);
1862 2436
1863/** 2437/**
@@ -1892,17 +2466,17 @@ int register_ftrace_function(struct ftrace_ops *ops)
1892 if (unlikely(ftrace_disabled)) 2466 if (unlikely(ftrace_disabled))
1893 return -1; 2467 return -1;
1894 2468
1895 mutex_lock(&ftrace_sysctl_lock); 2469 mutex_lock(&ftrace_lock);
1896 2470
1897 ret = __register_ftrace_function(ops); 2471 ret = __register_ftrace_function(ops);
1898 ftrace_startup(0); 2472 ftrace_startup(0);
1899 2473
1900 mutex_unlock(&ftrace_sysctl_lock); 2474 mutex_unlock(&ftrace_lock);
1901 return ret; 2475 return ret;
1902} 2476}
1903 2477
1904/** 2478/**
1905 * unregister_ftrace_function - unresgister a function for profiling. 2479 * unregister_ftrace_function - unregister a function for profiling.
1906 * @ops - ops structure that holds the function to unregister 2480 * @ops - ops structure that holds the function to unregister
1907 * 2481 *
1908 * Unregister a function that was added to be called by ftrace profiling. 2482 * Unregister a function that was added to be called by ftrace profiling.
@@ -1911,10 +2485,10 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
1911{ 2485{
1912 int ret; 2486 int ret;
1913 2487
1914 mutex_lock(&ftrace_sysctl_lock); 2488 mutex_lock(&ftrace_lock);
1915 ret = __unregister_ftrace_function(ops); 2489 ret = __unregister_ftrace_function(ops);
1916 ftrace_shutdown(0); 2490 ftrace_shutdown(0);
1917 mutex_unlock(&ftrace_sysctl_lock); 2491 mutex_unlock(&ftrace_lock);
1918 2492
1919 return ret; 2493 return ret;
1920} 2494}
@@ -1929,7 +2503,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
1929 if (unlikely(ftrace_disabled)) 2503 if (unlikely(ftrace_disabled))
1930 return -ENODEV; 2504 return -ENODEV;
1931 2505
1932 mutex_lock(&ftrace_sysctl_lock); 2506 mutex_lock(&ftrace_lock);
1933 2507
1934 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 2508 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
1935 2509
@@ -1958,13 +2532,14 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
1958 } 2532 }
1959 2533
1960 out: 2534 out:
1961 mutex_unlock(&ftrace_sysctl_lock); 2535 mutex_unlock(&ftrace_lock);
1962 return ret; 2536 return ret;
1963} 2537}
1964 2538
1965#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2539#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1966 2540
1967static atomic_t ftrace_graph_active; 2541static atomic_t ftrace_graph_active;
2542static struct notifier_block ftrace_suspend_notifier;
1968 2543
1969int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) 2544int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
1970{ 2545{
@@ -2022,11 +2597,43 @@ free:
2022 return ret; 2597 return ret;
2023} 2598}
2024 2599
2600static void
2601ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev,
2602 struct task_struct *next)
2603{
2604 unsigned long long timestamp;
2605 int index;
2606
2607 /*
2608 * Does the user want to count the time a function was asleep?
2609 * If so, do not update the time stamps.
2610 */
2611 if (trace_flags & TRACE_ITER_SLEEP_TIME)
2612 return;
2613
2614 timestamp = trace_clock_local();
2615
2616 prev->ftrace_timestamp = timestamp;
2617
2618 /* only process tasks that we timestamped */
2619 if (!next->ftrace_timestamp)
2620 return;
2621
2622 /*
2623 * Update all the counters in next to make up for the
2624 * time next was sleeping.
2625 */
2626 timestamp -= next->ftrace_timestamp;
2627
2628 for (index = next->curr_ret_stack; index >= 0; index--)
2629 next->ret_stack[index].calltime += timestamp;
2630}
2631
2025/* Allocate a return stack for each task */ 2632/* Allocate a return stack for each task */
2026static int start_graph_tracing(void) 2633static int start_graph_tracing(void)
2027{ 2634{
2028 struct ftrace_ret_stack **ret_stack_list; 2635 struct ftrace_ret_stack **ret_stack_list;
2029 int ret; 2636 int ret, cpu;
2030 2637
2031 ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE * 2638 ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
2032 sizeof(struct ftrace_ret_stack *), 2639 sizeof(struct ftrace_ret_stack *),
@@ -2035,20 +2642,61 @@ static int start_graph_tracing(void)
2035 if (!ret_stack_list) 2642 if (!ret_stack_list)
2036 return -ENOMEM; 2643 return -ENOMEM;
2037 2644
2645 /* The cpu_boot init_task->ret_stack will never be freed */
2646 for_each_online_cpu(cpu)
2647 ftrace_graph_init_task(idle_task(cpu));
2648
2038 do { 2649 do {
2039 ret = alloc_retstack_tasklist(ret_stack_list); 2650 ret = alloc_retstack_tasklist(ret_stack_list);
2040 } while (ret == -EAGAIN); 2651 } while (ret == -EAGAIN);
2041 2652
2653 if (!ret) {
2654 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch);
2655 if (ret)
2656 pr_info("ftrace_graph: Couldn't activate tracepoint"
2657 " probe to kernel_sched_switch\n");
2658 }
2659
2042 kfree(ret_stack_list); 2660 kfree(ret_stack_list);
2043 return ret; 2661 return ret;
2044} 2662}
2045 2663
2664/*
2665 * Hibernation protection.
2666 * The state of the current task is too unstable during
2667 * suspend/restore to disk. We want to protect against that.
2668 */
2669static int
2670ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
2671 void *unused)
2672{
2673 switch (state) {
2674 case PM_HIBERNATION_PREPARE:
2675 pause_graph_tracing();
2676 break;
2677
2678 case PM_POST_HIBERNATION:
2679 unpause_graph_tracing();
2680 break;
2681 }
2682 return NOTIFY_DONE;
2683}
2684
2046int register_ftrace_graph(trace_func_graph_ret_t retfunc, 2685int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2047 trace_func_graph_ent_t entryfunc) 2686 trace_func_graph_ent_t entryfunc)
2048{ 2687{
2049 int ret = 0; 2688 int ret = 0;
2050 2689
2051 mutex_lock(&ftrace_sysctl_lock); 2690 mutex_lock(&ftrace_lock);
2691
2692 /* we currently allow only one tracer registered at a time */
2693 if (atomic_read(&ftrace_graph_active)) {
2694 ret = -EBUSY;
2695 goto out;
2696 }
2697
2698 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
2699 register_pm_notifier(&ftrace_suspend_notifier);
2052 2700
2053 atomic_inc(&ftrace_graph_active); 2701 atomic_inc(&ftrace_graph_active);
2054 ret = start_graph_tracing(); 2702 ret = start_graph_tracing();
@@ -2063,20 +2711,26 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2063 ftrace_startup(FTRACE_START_FUNC_RET); 2711 ftrace_startup(FTRACE_START_FUNC_RET);
2064 2712
2065out: 2713out:
2066 mutex_unlock(&ftrace_sysctl_lock); 2714 mutex_unlock(&ftrace_lock);
2067 return ret; 2715 return ret;
2068} 2716}
2069 2717
2070void unregister_ftrace_graph(void) 2718void unregister_ftrace_graph(void)
2071{ 2719{
2072 mutex_lock(&ftrace_sysctl_lock); 2720 mutex_lock(&ftrace_lock);
2721
2722 if (!unlikely(atomic_read(&ftrace_graph_active)))
2723 goto out;
2073 2724
2074 atomic_dec(&ftrace_graph_active); 2725 atomic_dec(&ftrace_graph_active);
2726 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
2075 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 2727 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
2076 ftrace_graph_entry = ftrace_graph_entry_stub; 2728 ftrace_graph_entry = ftrace_graph_entry_stub;
2077 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 2729 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
2730 unregister_pm_notifier(&ftrace_suspend_notifier);
2078 2731
2079 mutex_unlock(&ftrace_sysctl_lock); 2732 out:
2733 mutex_unlock(&ftrace_lock);
2080} 2734}
2081 2735
2082/* Allocate a return stack for newly created task */ 2736/* Allocate a return stack for newly created task */
@@ -2091,6 +2745,7 @@ void ftrace_graph_init_task(struct task_struct *t)
2091 t->curr_ret_stack = -1; 2745 t->curr_ret_stack = -1;
2092 atomic_set(&t->tracing_graph_pause, 0); 2746 atomic_set(&t->tracing_graph_pause, 0);
2093 atomic_set(&t->trace_overrun, 0); 2747 atomic_set(&t->trace_overrun, 0);
2748 t->ftrace_timestamp = 0;
2094 } else 2749 } else
2095 t->ret_stack = NULL; 2750 t->ret_stack = NULL;
2096} 2751}
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
new file mode 100644
index 000000000000..5011f4d91e37
--- /dev/null
+++ b/kernel/trace/kmemtrace.c
@@ -0,0 +1,464 @@
1/*
2 * Memory allocator tracing
3 *
4 * Copyright (C) 2008 Eduard - Gabriel Munteanu
5 * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
6 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
7 */
8
9#include <linux/tracepoint.h>
10#include <linux/seq_file.h>
11#include <linux/debugfs.h>
12#include <linux/dcache.h>
13#include <linux/fs.h>
14
15#include <trace/kmemtrace.h>
16
17#include "trace_output.h"
18#include "trace.h"
19
20/* Select an alternative, more minimalistic output than the original one */
21#define TRACE_KMEM_OPT_MINIMAL 0x1
22
23static struct tracer_opt kmem_opts[] = {
24 /* Default disable the minimalistic output */
25 { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
26 { }
27};
28
29static struct tracer_flags kmem_tracer_flags = {
30 .val = 0,
31 .opts = kmem_opts
32};
33
34static struct trace_array *kmemtrace_array;
35
36/* Trace allocations */
37static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
38 unsigned long call_site,
39 const void *ptr,
40 size_t bytes_req,
41 size_t bytes_alloc,
42 gfp_t gfp_flags,
43 int node)
44{
45 struct trace_array *tr = kmemtrace_array;
46 struct kmemtrace_alloc_entry *entry;
47 struct ring_buffer_event *event;
48
49 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
50 if (!event)
51 return;
52
53 entry = ring_buffer_event_data(event);
54 tracing_generic_entry_update(&entry->ent, 0, 0);
55
56 entry->ent.type = TRACE_KMEM_ALLOC;
57 entry->type_id = type_id;
58 entry->call_site = call_site;
59 entry->ptr = ptr;
60 entry->bytes_req = bytes_req;
61 entry->bytes_alloc = bytes_alloc;
62 entry->gfp_flags = gfp_flags;
63 entry->node = node;
64
65 ring_buffer_unlock_commit(tr->buffer, event);
66
67 trace_wake_up();
68}
69
70static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
71 unsigned long call_site,
72 const void *ptr)
73{
74 struct trace_array *tr = kmemtrace_array;
75 struct kmemtrace_free_entry *entry;
76 struct ring_buffer_event *event;
77
78 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
79 if (!event)
80 return;
81 entry = ring_buffer_event_data(event);
82 tracing_generic_entry_update(&entry->ent, 0, 0);
83
84 entry->ent.type = TRACE_KMEM_FREE;
85 entry->type_id = type_id;
86 entry->call_site = call_site;
87 entry->ptr = ptr;
88
89 ring_buffer_unlock_commit(tr->buffer, event);
90
91 trace_wake_up();
92}
93
94static void kmemtrace_kmalloc(unsigned long call_site,
95 const void *ptr,
96 size_t bytes_req,
97 size_t bytes_alloc,
98 gfp_t gfp_flags)
99{
100 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
101 bytes_req, bytes_alloc, gfp_flags, -1);
102}
103
104static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
105 const void *ptr,
106 size_t bytes_req,
107 size_t bytes_alloc,
108 gfp_t gfp_flags)
109{
110 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
111 bytes_req, bytes_alloc, gfp_flags, -1);
112}
113
114static void kmemtrace_kmalloc_node(unsigned long call_site,
115 const void *ptr,
116 size_t bytes_req,
117 size_t bytes_alloc,
118 gfp_t gfp_flags,
119 int node)
120{
121 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
122 bytes_req, bytes_alloc, gfp_flags, node);
123}
124
125static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
126 const void *ptr,
127 size_t bytes_req,
128 size_t bytes_alloc,
129 gfp_t gfp_flags,
130 int node)
131{
132 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
133 bytes_req, bytes_alloc, gfp_flags, node);
134}
135
136static void kmemtrace_kfree(unsigned long call_site, const void *ptr)
137{
138 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
139}
140
141static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr)
142{
143 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
144}
145
146static int kmemtrace_start_probes(void)
147{
148 int err;
149
150 err = register_trace_kmalloc(kmemtrace_kmalloc);
151 if (err)
152 return err;
153 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
154 if (err)
155 return err;
156 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node);
157 if (err)
158 return err;
159 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
160 if (err)
161 return err;
162 err = register_trace_kfree(kmemtrace_kfree);
163 if (err)
164 return err;
165 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
166
167 return err;
168}
169
170static void kmemtrace_stop_probes(void)
171{
172 unregister_trace_kmalloc(kmemtrace_kmalloc);
173 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
174 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node);
175 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
176 unregister_trace_kfree(kmemtrace_kfree);
177 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
178}
179
180static int kmem_trace_init(struct trace_array *tr)
181{
182 int cpu;
183 kmemtrace_array = tr;
184
185 for_each_cpu_mask(cpu, cpu_possible_map)
186 tracing_reset(tr, cpu);
187
188 kmemtrace_start_probes();
189
190 return 0;
191}
192
193static void kmem_trace_reset(struct trace_array *tr)
194{
195 kmemtrace_stop_probes();
196}
197
198static void kmemtrace_headers(struct seq_file *s)
199{
200 /* Don't need headers for the original kmemtrace output */
201 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
202 return;
203
204 seq_printf(s, "#\n");
205 seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS "
206 " POINTER NODE CALLER\n");
207 seq_printf(s, "# FREE | | | | "
208 " | | | |\n");
209 seq_printf(s, "# |\n\n");
210}
211
212/*
213 * The following functions give the original output from kmemtrace,
214 * plus the origin CPU, since reordering occurs in-kernel now.
215 */
216
217#define KMEMTRACE_USER_ALLOC 0
218#define KMEMTRACE_USER_FREE 1
219
220struct kmemtrace_user_event {
221 u8 event_id;
222 u8 type_id;
223 u16 event_size;
224 u32 cpu;
225 u64 timestamp;
226 unsigned long call_site;
227 unsigned long ptr;
228};
229
230struct kmemtrace_user_event_alloc {
231 size_t bytes_req;
232 size_t bytes_alloc;
233 unsigned gfp_flags;
234 int node;
235};
236
237static enum print_line_t
238kmemtrace_print_alloc_user(struct trace_iterator *iter,
239 struct kmemtrace_alloc_entry *entry)
240{
241 struct kmemtrace_user_event_alloc *ev_alloc;
242 struct trace_seq *s = &iter->seq;
243 struct kmemtrace_user_event *ev;
244
245 ev = trace_seq_reserve(s, sizeof(*ev));
246 if (!ev)
247 return TRACE_TYPE_PARTIAL_LINE;
248
249 ev->event_id = KMEMTRACE_USER_ALLOC;
250 ev->type_id = entry->type_id;
251 ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
252 ev->cpu = iter->cpu;
253 ev->timestamp = iter->ts;
254 ev->call_site = entry->call_site;
255 ev->ptr = (unsigned long)entry->ptr;
256
257 ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
258 if (!ev_alloc)
259 return TRACE_TYPE_PARTIAL_LINE;
260
261 ev_alloc->bytes_req = entry->bytes_req;
262 ev_alloc->bytes_alloc = entry->bytes_alloc;
263 ev_alloc->gfp_flags = entry->gfp_flags;
264 ev_alloc->node = entry->node;
265
266 return TRACE_TYPE_HANDLED;
267}
268
269static enum print_line_t
270kmemtrace_print_free_user(struct trace_iterator *iter,
271 struct kmemtrace_free_entry *entry)
272{
273 struct trace_seq *s = &iter->seq;
274 struct kmemtrace_user_event *ev;
275
276 ev = trace_seq_reserve(s, sizeof(*ev));
277 if (!ev)
278 return TRACE_TYPE_PARTIAL_LINE;
279
280 ev->event_id = KMEMTRACE_USER_FREE;
281 ev->type_id = entry->type_id;
282 ev->event_size = sizeof(*ev);
283 ev->cpu = iter->cpu;
284 ev->timestamp = iter->ts;
285 ev->call_site = entry->call_site;
286 ev->ptr = (unsigned long)entry->ptr;
287
288 return TRACE_TYPE_HANDLED;
289}
290
291/* The following two functions provide a more minimalistic output */
292static enum print_line_t
293kmemtrace_print_alloc_compress(struct trace_iterator *iter,
294 struct kmemtrace_alloc_entry *entry)
295{
296 struct trace_seq *s = &iter->seq;
297 int ret;
298
299 /* Alloc entry */
300 ret = trace_seq_printf(s, " + ");
301 if (!ret)
302 return TRACE_TYPE_PARTIAL_LINE;
303
304 /* Type */
305 switch (entry->type_id) {
306 case KMEMTRACE_TYPE_KMALLOC:
307 ret = trace_seq_printf(s, "K ");
308 break;
309 case KMEMTRACE_TYPE_CACHE:
310 ret = trace_seq_printf(s, "C ");
311 break;
312 case KMEMTRACE_TYPE_PAGES:
313 ret = trace_seq_printf(s, "P ");
314 break;
315 default:
316 ret = trace_seq_printf(s, "? ");
317 }
318
319 if (!ret)
320 return TRACE_TYPE_PARTIAL_LINE;
321
322 /* Requested */
323 ret = trace_seq_printf(s, "%4zu ", entry->bytes_req);
324 if (!ret)
325 return TRACE_TYPE_PARTIAL_LINE;
326
327 /* Allocated */
328 ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc);
329 if (!ret)
330 return TRACE_TYPE_PARTIAL_LINE;
331
332 /* Flags
333 * TODO: it would be better to show the names of the GFP flags
334 */
335 ret = trace_seq_printf(s, "%08x ", entry->gfp_flags);
336 if (!ret)
337 return TRACE_TYPE_PARTIAL_LINE;
338
339 /* Pointer to allocated */
340 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
341 if (!ret)
342 return TRACE_TYPE_PARTIAL_LINE;
343
344 /* Node */
345 ret = trace_seq_printf(s, "%4d ", entry->node);
346 if (!ret)
347 return TRACE_TYPE_PARTIAL_LINE;
348
349 /* Call site */
350 ret = seq_print_ip_sym(s, entry->call_site, 0);
351 if (!ret)
352 return TRACE_TYPE_PARTIAL_LINE;
353
354 if (!trace_seq_printf(s, "\n"))
355 return TRACE_TYPE_PARTIAL_LINE;
356
357 return TRACE_TYPE_HANDLED;
358}
359
360static enum print_line_t
361kmemtrace_print_free_compress(struct trace_iterator *iter,
362 struct kmemtrace_free_entry *entry)
363{
364 struct trace_seq *s = &iter->seq;
365 int ret;
366
367 /* Free entry */
368 ret = trace_seq_printf(s, " - ");
369 if (!ret)
370 return TRACE_TYPE_PARTIAL_LINE;
371
372 /* Type */
373 switch (entry->type_id) {
374 case KMEMTRACE_TYPE_KMALLOC:
375 ret = trace_seq_printf(s, "K ");
376 break;
377 case KMEMTRACE_TYPE_CACHE:
378 ret = trace_seq_printf(s, "C ");
379 break;
380 case KMEMTRACE_TYPE_PAGES:
381 ret = trace_seq_printf(s, "P ");
382 break;
383 default:
384 ret = trace_seq_printf(s, "? ");
385 }
386
387 if (!ret)
388 return TRACE_TYPE_PARTIAL_LINE;
389
390 /* Skip requested/allocated/flags */
391 ret = trace_seq_printf(s, " ");
392 if (!ret)
393 return TRACE_TYPE_PARTIAL_LINE;
394
395 /* Pointer to allocated */
396 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
397 if (!ret)
398 return TRACE_TYPE_PARTIAL_LINE;
399
400 /* Skip node */
401 ret = trace_seq_printf(s, " ");
402 if (!ret)
403 return TRACE_TYPE_PARTIAL_LINE;
404
405 /* Call site */
406 ret = seq_print_ip_sym(s, entry->call_site, 0);
407 if (!ret)
408 return TRACE_TYPE_PARTIAL_LINE;
409
410 if (!trace_seq_printf(s, "\n"))
411 return TRACE_TYPE_PARTIAL_LINE;
412
413 return TRACE_TYPE_HANDLED;
414}
415
416static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
417{
418 struct trace_entry *entry = iter->ent;
419
420 switch (entry->type) {
421 case TRACE_KMEM_ALLOC: {
422 struct kmemtrace_alloc_entry *field;
423
424 trace_assign_type(field, entry);
425 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
426 return kmemtrace_print_alloc_compress(iter, field);
427 else
428 return kmemtrace_print_alloc_user(iter, field);
429 }
430
431 case TRACE_KMEM_FREE: {
432 struct kmemtrace_free_entry *field;
433
434 trace_assign_type(field, entry);
435 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
436 return kmemtrace_print_free_compress(iter, field);
437 else
438 return kmemtrace_print_free_user(iter, field);
439 }
440
441 default:
442 return TRACE_TYPE_UNHANDLED;
443 }
444}
445
446static struct tracer kmem_tracer __read_mostly = {
447 .name = "kmemtrace",
448 .init = kmem_trace_init,
449 .reset = kmem_trace_reset,
450 .print_line = kmemtrace_print_line,
451 .print_header = kmemtrace_headers,
452 .flags = &kmem_tracer_flags
453};
454
455void kmemtrace_init(void)
456{
457 /* earliest opportunity to start kmem tracing */
458}
459
460static int __init init_kmem_tracer(void)
461{
462 return register_tracer(&kmem_tracer);
463}
464device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8b0daf0662ef..960cbf44c844 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,21 +4,92 @@
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */ 5 */
6#include <linux/ring_buffer.h> 6#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h>
8#include <linux/ftrace_irq.h>
7#include <linux/spinlock.h> 9#include <linux/spinlock.h>
8#include <linux/debugfs.h> 10#include <linux/debugfs.h>
9#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/hardirq.h>
10#include <linux/module.h> 13#include <linux/module.h>
11#include <linux/percpu.h> 14#include <linux/percpu.h>
12#include <linux/mutex.h> 15#include <linux/mutex.h>
13#include <linux/sched.h> /* used for sched_clock() (for now) */
14#include <linux/init.h> 16#include <linux/init.h>
15#include <linux/hash.h> 17#include <linux/hash.h>
16#include <linux/list.h> 18#include <linux/list.h>
19#include <linux/cpu.h>
17#include <linux/fs.h> 20#include <linux/fs.h>
18 21
19#include "trace.h" 22#include "trace.h"
20 23
21/* 24/*
25 * The ring buffer is made up of a list of pages. A separate list of pages is
26 * allocated for each CPU. A writer may only write to a buffer that is
27 * associated with the CPU it is currently executing on. A reader may read
28 * from any per cpu buffer.
29 *
30 * The reader is special. For each per cpu buffer, the reader has its own
31 * reader page. When a reader has read the entire reader page, this reader
32 * page is swapped with another page in the ring buffer.
33 *
34 * Now, as long as the writer is off the reader page, the reader can do what
35 * ever it wants with that page. The writer will never write to that page
36 * again (as long as it is out of the ring buffer).
37 *
38 * Here's some silly ASCII art.
39 *
40 * +------+
41 * |reader| RING BUFFER
42 * |page |
43 * +------+ +---+ +---+ +---+
44 * | |-->| |-->| |
45 * +---+ +---+ +---+
46 * ^ |
47 * | |
48 * +---------------+
49 *
50 *
51 * +------+
52 * |reader| RING BUFFER
53 * |page |------------------v
54 * +------+ +---+ +---+ +---+
55 * | |-->| |-->| |
56 * +---+ +---+ +---+
57 * ^ |
58 * | |
59 * +---------------+
60 *
61 *
62 * +------+
63 * |reader| RING BUFFER
64 * |page |------------------v
65 * +------+ +---+ +---+ +---+
66 * ^ | |-->| |-->| |
67 * | +---+ +---+ +---+
68 * | |
69 * | |
70 * +------------------------------+
71 *
72 *
73 * +------+
74 * |buffer| RING BUFFER
75 * |page |------------------v
76 * +------+ +---+ +---+ +---+
77 * ^ | | | |-->| |
78 * | New +---+ +---+ +---+
79 * | Reader------^ |
80 * | page |
81 * +------------------------------+
82 *
83 *
84 * After we make this swap, the reader can hand this page off to the splice
85 * code and be done with it. It can even allocate a new page if it needs to
86 * and swap that into the ring buffer.
87 *
88 * We will be using cmpxchg soon to make all this lockless.
89 *
90 */
91
92/*
22 * A fast way to enable or disable all ring buffers is to 93 * A fast way to enable or disable all ring buffers is to
23 * call tracing_on or tracing_off. Turning off the ring buffers 94 * call tracing_on or tracing_off. Turning off the ring buffers
24 * prevents all ring buffers from being recorded to. 95 * prevents all ring buffers from being recorded to.
@@ -57,7 +128,9 @@ enum {
57 RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT, 128 RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT,
58}; 129};
59 130
60static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; 131static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
132
133#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
61 134
62/** 135/**
63 * tracing_on - enable all tracing buffers 136 * tracing_on - enable all tracing buffers
@@ -89,59 +162,92 @@ EXPORT_SYMBOL_GPL(tracing_off);
89 * tracing_off_permanent - permanently disable ring buffers 162 * tracing_off_permanent - permanently disable ring buffers
90 * 163 *
91 * This function, once called, will disable all ring buffers 164 * This function, once called, will disable all ring buffers
92 * permanenty. 165 * permanently.
93 */ 166 */
94void tracing_off_permanent(void) 167void tracing_off_permanent(void)
95{ 168{
96 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); 169 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
97} 170}
98 171
172/**
173 * tracing_is_on - show state of ring buffers enabled
174 */
175int tracing_is_on(void)
176{
177 return ring_buffer_flags == RB_BUFFERS_ON;
178}
179EXPORT_SYMBOL_GPL(tracing_is_on);
180
99#include "trace.h" 181#include "trace.h"
100 182
101/* Up this if you want to test the TIME_EXTENTS and normalization */ 183#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
102#define DEBUG_SHIFT 0 184#define RB_ALIGNMENT 4U
185#define RB_MAX_SMALL_DATA 28
103 186
104/* FIXME!!! */ 187enum {
105u64 ring_buffer_time_stamp(int cpu) 188 RB_LEN_TIME_EXTEND = 8,
189 RB_LEN_TIME_STAMP = 16,
190};
191
192static inline int rb_null_event(struct ring_buffer_event *event)
106{ 193{
107 u64 time; 194 return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0;
195}
108 196
109 preempt_disable_notrace(); 197static inline int rb_discarded_event(struct ring_buffer_event *event)
110 /* shift to debug/test normalization and TIME_EXTENTS */ 198{
111 time = sched_clock() << DEBUG_SHIFT; 199 return event->type == RINGBUF_TYPE_PADDING && event->time_delta;
112 preempt_enable_no_resched_notrace(); 200}
113 201
114 return time; 202static void rb_event_set_padding(struct ring_buffer_event *event)
203{
204 event->type = RINGBUF_TYPE_PADDING;
205 event->time_delta = 0;
115} 206}
116EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
117 207
118void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) 208/**
209 * ring_buffer_event_discard - discard an event in the ring buffer
210 * @buffer: the ring buffer
211 * @event: the event to discard
212 *
213 * Sometimes an event that is in the ring buffer needs to be ignored.
214 * This function lets the user discard an event in the ring buffer
215 * and then that event will not be read later.
216 *
217 * Note, it is up to the user to be careful with this, and protect
218 * against races. If the user discards an event that has been consumed
219 * it is possible that it could corrupt the ring buffer.
220 */
221void ring_buffer_event_discard(struct ring_buffer_event *event)
119{ 222{
120 /* Just stupid testing the normalize function and deltas */ 223 event->type = RINGBUF_TYPE_PADDING;
121 *ts >>= DEBUG_SHIFT; 224 /* time delta must be non zero */
225 if (!event->time_delta)
226 event->time_delta = 1;
122} 227}
123EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
124 228
125#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) 229static unsigned
126#define RB_ALIGNMENT_SHIFT 2 230rb_event_data_length(struct ring_buffer_event *event)
127#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT) 231{
128#define RB_MAX_SMALL_DATA 28 232 unsigned length;
129 233
130enum { 234 if (event->len)
131 RB_LEN_TIME_EXTEND = 8, 235 length = event->len * RB_ALIGNMENT;
132 RB_LEN_TIME_STAMP = 16, 236 else
133}; 237 length = event->array[0];
238 return length + RB_EVNT_HDR_SIZE;
239}
134 240
135/* inline for ring buffer fast paths */ 241/* inline for ring buffer fast paths */
136static inline unsigned 242static unsigned
137rb_event_length(struct ring_buffer_event *event) 243rb_event_length(struct ring_buffer_event *event)
138{ 244{
139 unsigned length;
140
141 switch (event->type) { 245 switch (event->type) {
142 case RINGBUF_TYPE_PADDING: 246 case RINGBUF_TYPE_PADDING:
143 /* undefined */ 247 if (rb_null_event(event))
144 return -1; 248 /* undefined */
249 return -1;
250 return rb_event_data_length(event);
145 251
146 case RINGBUF_TYPE_TIME_EXTEND: 252 case RINGBUF_TYPE_TIME_EXTEND:
147 return RB_LEN_TIME_EXTEND; 253 return RB_LEN_TIME_EXTEND;
@@ -150,11 +256,7 @@ rb_event_length(struct ring_buffer_event *event)
150 return RB_LEN_TIME_STAMP; 256 return RB_LEN_TIME_STAMP;
151 257
152 case RINGBUF_TYPE_DATA: 258 case RINGBUF_TYPE_DATA:
153 if (event->len) 259 return rb_event_data_length(event);
154 length = event->len << RB_ALIGNMENT_SHIFT;
155 else
156 length = event->array[0];
157 return length + RB_EVNT_HDR_SIZE;
158 default: 260 default:
159 BUG(); 261 BUG();
160 } 262 }
@@ -179,7 +281,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
179EXPORT_SYMBOL_GPL(ring_buffer_event_length); 281EXPORT_SYMBOL_GPL(ring_buffer_event_length);
180 282
181/* inline for ring buffer fast paths */ 283/* inline for ring buffer fast paths */
182static inline void * 284static void *
183rb_event_data(struct ring_buffer_event *event) 285rb_event_data(struct ring_buffer_event *event)
184{ 286{
185 BUG_ON(event->type != RINGBUF_TYPE_DATA); 287 BUG_ON(event->type != RINGBUF_TYPE_DATA);
@@ -209,7 +311,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
209 311
210struct buffer_data_page { 312struct buffer_data_page {
211 u64 time_stamp; /* page time stamp */ 313 u64 time_stamp; /* page time stamp */
212 local_t commit; /* write commited index */ 314 local_t commit; /* write committed index */
213 unsigned char data[]; /* data of buffer page */ 315 unsigned char data[]; /* data of buffer page */
214}; 316};
215 317
@@ -225,14 +327,25 @@ static void rb_init_page(struct buffer_data_page *bpage)
225 local_set(&bpage->commit, 0); 327 local_set(&bpage->commit, 0);
226} 328}
227 329
330/**
331 * ring_buffer_page_len - the size of data on the page.
332 * @page: The page to read
333 *
334 * Returns the amount of data on the page, including buffer page header.
335 */
336size_t ring_buffer_page_len(void *page)
337{
338 return local_read(&((struct buffer_data_page *)page)->commit)
339 + BUF_PAGE_HDR_SIZE;
340}
341
228/* 342/*
229 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing 343 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
230 * this issue out. 344 * this issue out.
231 */ 345 */
232static inline void free_buffer_page(struct buffer_page *bpage) 346static void free_buffer_page(struct buffer_page *bpage)
233{ 347{
234 if (bpage->page) 348 free_page((unsigned long)bpage->page);
235 free_page((unsigned long)bpage->page);
236 kfree(bpage); 349 kfree(bpage);
237} 350}
238 351
@@ -246,7 +359,7 @@ static inline int test_time_stamp(u64 delta)
246 return 0; 359 return 0;
247} 360}
248 361
249#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page)) 362#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
250 363
251/* 364/*
252 * head_page == tail_page && head == tail then buffer is empty. 365 * head_page == tail_page && head == tail then buffer is empty.
@@ -260,7 +373,7 @@ struct ring_buffer_per_cpu {
260 struct list_head pages; 373 struct list_head pages;
261 struct buffer_page *head_page; /* read from head */ 374 struct buffer_page *head_page; /* read from head */
262 struct buffer_page *tail_page; /* write to tail */ 375 struct buffer_page *tail_page; /* write to tail */
263 struct buffer_page *commit_page; /* commited pages */ 376 struct buffer_page *commit_page; /* committed pages */
264 struct buffer_page *reader_page; 377 struct buffer_page *reader_page;
265 unsigned long overrun; 378 unsigned long overrun;
266 unsigned long entries; 379 unsigned long entries;
@@ -273,12 +386,17 @@ struct ring_buffer {
273 unsigned pages; 386 unsigned pages;
274 unsigned flags; 387 unsigned flags;
275 int cpus; 388 int cpus;
276 cpumask_var_t cpumask;
277 atomic_t record_disabled; 389 atomic_t record_disabled;
390 cpumask_var_t cpumask;
278 391
279 struct mutex mutex; 392 struct mutex mutex;
280 393
281 struct ring_buffer_per_cpu **buffers; 394 struct ring_buffer_per_cpu **buffers;
395
396#ifdef CONFIG_HOTPLUG_CPU
397 struct notifier_block cpu_notify;
398#endif
399 u64 (*clock)(void);
282}; 400};
283 401
284struct ring_buffer_iter { 402struct ring_buffer_iter {
@@ -299,11 +417,35 @@ struct ring_buffer_iter {
299 _____ret; \ 417 _____ret; \
300 }) 418 })
301 419
420/* Up this if you want to test the TIME_EXTENTS and normalization */
421#define DEBUG_SHIFT 0
422
423u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
424{
425 u64 time;
426
427 preempt_disable_notrace();
428 /* shift to debug/test normalization and TIME_EXTENTS */
429 time = buffer->clock() << DEBUG_SHIFT;
430 preempt_enable_no_resched_notrace();
431
432 return time;
433}
434EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
435
436void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
437 int cpu, u64 *ts)
438{
439 /* Just stupid testing the normalize function and deltas */
440 *ts >>= DEBUG_SHIFT;
441}
442EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
443
302/** 444/**
303 * check_pages - integrity check of buffer pages 445 * check_pages - integrity check of buffer pages
304 * @cpu_buffer: CPU buffer with pages to test 446 * @cpu_buffer: CPU buffer with pages to test
305 * 447 *
306 * As a safty measure we check to make sure the data pages have not 448 * As a safety measure we check to make sure the data pages have not
307 * been corrupted. 449 * been corrupted.
308 */ 450 */
309static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) 451static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
@@ -421,7 +563,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
421 struct list_head *head = &cpu_buffer->pages; 563 struct list_head *head = &cpu_buffer->pages;
422 struct buffer_page *bpage, *tmp; 564 struct buffer_page *bpage, *tmp;
423 565
424 list_del_init(&cpu_buffer->reader_page->list);
425 free_buffer_page(cpu_buffer->reader_page); 566 free_buffer_page(cpu_buffer->reader_page);
426 567
427 list_for_each_entry_safe(bpage, tmp, head, list) { 568 list_for_each_entry_safe(bpage, tmp, head, list) {
@@ -437,6 +578,11 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
437 */ 578 */
438extern int ring_buffer_page_too_big(void); 579extern int ring_buffer_page_too_big(void);
439 580
581#ifdef CONFIG_HOTPLUG_CPU
582static int rb_cpu_notify(struct notifier_block *self,
583 unsigned long action, void *hcpu);
584#endif
585
440/** 586/**
441 * ring_buffer_alloc - allocate a new ring_buffer 587 * ring_buffer_alloc - allocate a new ring_buffer
442 * @size: the size in bytes per cpu that is needed. 588 * @size: the size in bytes per cpu that is needed.
@@ -469,12 +615,23 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
469 615
470 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 616 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
471 buffer->flags = flags; 617 buffer->flags = flags;
618 buffer->clock = trace_clock_local;
472 619
473 /* need at least two pages */ 620 /* need at least two pages */
474 if (buffer->pages == 1) 621 if (buffer->pages == 1)
475 buffer->pages++; 622 buffer->pages++;
476 623
624 /*
625 * In case of non-hotplug cpu, if the ring-buffer is allocated
626 * in early initcall, it will not be notified of secondary cpus.
627 * In that off case, we need to allocate for all possible cpus.
628 */
629#ifdef CONFIG_HOTPLUG_CPU
630 get_online_cpus();
631 cpumask_copy(buffer->cpumask, cpu_online_mask);
632#else
477 cpumask_copy(buffer->cpumask, cpu_possible_mask); 633 cpumask_copy(buffer->cpumask, cpu_possible_mask);
634#endif
478 buffer->cpus = nr_cpu_ids; 635 buffer->cpus = nr_cpu_ids;
479 636
480 bsize = sizeof(void *) * nr_cpu_ids; 637 bsize = sizeof(void *) * nr_cpu_ids;
@@ -490,6 +647,13 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
490 goto fail_free_buffers; 647 goto fail_free_buffers;
491 } 648 }
492 649
650#ifdef CONFIG_HOTPLUG_CPU
651 buffer->cpu_notify.notifier_call = rb_cpu_notify;
652 buffer->cpu_notify.priority = 0;
653 register_cpu_notifier(&buffer->cpu_notify);
654#endif
655
656 put_online_cpus();
493 mutex_init(&buffer->mutex); 657 mutex_init(&buffer->mutex);
494 658
495 return buffer; 659 return buffer;
@@ -503,6 +667,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
503 667
504 fail_free_cpumask: 668 fail_free_cpumask:
505 free_cpumask_var(buffer->cpumask); 669 free_cpumask_var(buffer->cpumask);
670 put_online_cpus();
506 671
507 fail_free_buffer: 672 fail_free_buffer:
508 kfree(buffer); 673 kfree(buffer);
@@ -519,15 +684,29 @@ ring_buffer_free(struct ring_buffer *buffer)
519{ 684{
520 int cpu; 685 int cpu;
521 686
687 get_online_cpus();
688
689#ifdef CONFIG_HOTPLUG_CPU
690 unregister_cpu_notifier(&buffer->cpu_notify);
691#endif
692
522 for_each_buffer_cpu(buffer, cpu) 693 for_each_buffer_cpu(buffer, cpu)
523 rb_free_cpu_buffer(buffer->buffers[cpu]); 694 rb_free_cpu_buffer(buffer->buffers[cpu]);
524 695
696 put_online_cpus();
697
525 free_cpumask_var(buffer->cpumask); 698 free_cpumask_var(buffer->cpumask);
526 699
527 kfree(buffer); 700 kfree(buffer);
528} 701}
529EXPORT_SYMBOL_GPL(ring_buffer_free); 702EXPORT_SYMBOL_GPL(ring_buffer_free);
530 703
704void ring_buffer_set_clock(struct ring_buffer *buffer,
705 u64 (*clock)(void))
706{
707 buffer->clock = clock;
708}
709
531static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); 710static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
532 711
533static void 712static void
@@ -627,16 +806,15 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
627 return size; 806 return size;
628 807
629 mutex_lock(&buffer->mutex); 808 mutex_lock(&buffer->mutex);
809 get_online_cpus();
630 810
631 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 811 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
632 812
633 if (size < buffer_size) { 813 if (size < buffer_size) {
634 814
635 /* easy case, just free pages */ 815 /* easy case, just free pages */
636 if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) { 816 if (RB_WARN_ON(buffer, nr_pages >= buffer->pages))
637 mutex_unlock(&buffer->mutex); 817 goto out_fail;
638 return -1;
639 }
640 818
641 rm_pages = buffer->pages - nr_pages; 819 rm_pages = buffer->pages - nr_pages;
642 820
@@ -655,10 +833,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
655 * add these pages to the cpu_buffers. Otherwise we just free 833 * add these pages to the cpu_buffers. Otherwise we just free
656 * them all and return -ENOMEM; 834 * them all and return -ENOMEM;
657 */ 835 */
658 if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) { 836 if (RB_WARN_ON(buffer, nr_pages <= buffer->pages))
659 mutex_unlock(&buffer->mutex); 837 goto out_fail;
660 return -1;
661 }
662 838
663 new_pages = nr_pages - buffer->pages; 839 new_pages = nr_pages - buffer->pages;
664 840
@@ -683,13 +859,12 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
683 rb_insert_pages(cpu_buffer, &pages, new_pages); 859 rb_insert_pages(cpu_buffer, &pages, new_pages);
684 } 860 }
685 861
686 if (RB_WARN_ON(buffer, !list_empty(&pages))) { 862 if (RB_WARN_ON(buffer, !list_empty(&pages)))
687 mutex_unlock(&buffer->mutex); 863 goto out_fail;
688 return -1;
689 }
690 864
691 out: 865 out:
692 buffer->pages = nr_pages; 866 buffer->pages = nr_pages;
867 put_online_cpus();
693 mutex_unlock(&buffer->mutex); 868 mutex_unlock(&buffer->mutex);
694 869
695 return size; 870 return size;
@@ -699,15 +874,20 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
699 list_del_init(&bpage->list); 874 list_del_init(&bpage->list);
700 free_buffer_page(bpage); 875 free_buffer_page(bpage);
701 } 876 }
877 put_online_cpus();
702 mutex_unlock(&buffer->mutex); 878 mutex_unlock(&buffer->mutex);
703 return -ENOMEM; 879 return -ENOMEM;
704}
705EXPORT_SYMBOL_GPL(ring_buffer_resize);
706 880
707static inline int rb_null_event(struct ring_buffer_event *event) 881 /*
708{ 882 * Something went totally wrong, and we are too paranoid
709 return event->type == RINGBUF_TYPE_PADDING; 883 * to even clean up the mess.
884 */
885 out_fail:
886 put_online_cpus();
887 mutex_unlock(&buffer->mutex);
888 return -1;
710} 889}
890EXPORT_SYMBOL_GPL(ring_buffer_resize);
711 891
712static inline void * 892static inline void *
713__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) 893__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
@@ -811,7 +991,7 @@ rb_event_index(struct ring_buffer_event *event)
811 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); 991 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
812} 992}
813 993
814static inline int 994static int
815rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, 995rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
816 struct ring_buffer_event *event) 996 struct ring_buffer_event *event)
817{ 997{
@@ -825,7 +1005,7 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
825 rb_commit_index(cpu_buffer) == index; 1005 rb_commit_index(cpu_buffer) == index;
826} 1006}
827 1007
828static inline void 1008static void
829rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, 1009rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
830 struct ring_buffer_event *event) 1010 struct ring_buffer_event *event)
831{ 1011{
@@ -850,7 +1030,7 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
850 local_set(&cpu_buffer->commit_page->page->commit, index); 1030 local_set(&cpu_buffer->commit_page->page->commit, index);
851} 1031}
852 1032
853static inline void 1033static void
854rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1034rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
855{ 1035{
856 /* 1036 /*
@@ -896,7 +1076,7 @@ static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
896 cpu_buffer->reader_page->read = 0; 1076 cpu_buffer->reader_page->read = 0;
897} 1077}
898 1078
899static inline void rb_inc_iter(struct ring_buffer_iter *iter) 1079static void rb_inc_iter(struct ring_buffer_iter *iter)
900{ 1080{
901 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 1081 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
902 1082
@@ -926,7 +1106,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter)
926 * and with this, we can determine what to place into the 1106 * and with this, we can determine what to place into the
927 * data field. 1107 * data field.
928 */ 1108 */
929static inline void 1109static void
930rb_update_event(struct ring_buffer_event *event, 1110rb_update_event(struct ring_buffer_event *event,
931 unsigned type, unsigned length) 1111 unsigned type, unsigned length)
932{ 1112{
@@ -938,15 +1118,11 @@ rb_update_event(struct ring_buffer_event *event,
938 break; 1118 break;
939 1119
940 case RINGBUF_TYPE_TIME_EXTEND: 1120 case RINGBUF_TYPE_TIME_EXTEND:
941 event->len = 1121 event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
942 (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1))
943 >> RB_ALIGNMENT_SHIFT;
944 break; 1122 break;
945 1123
946 case RINGBUF_TYPE_TIME_STAMP: 1124 case RINGBUF_TYPE_TIME_STAMP:
947 event->len = 1125 event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
948 (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
949 >> RB_ALIGNMENT_SHIFT;
950 break; 1126 break;
951 1127
952 case RINGBUF_TYPE_DATA: 1128 case RINGBUF_TYPE_DATA:
@@ -955,16 +1131,14 @@ rb_update_event(struct ring_buffer_event *event,
955 event->len = 0; 1131 event->len = 0;
956 event->array[0] = length; 1132 event->array[0] = length;
957 } else 1133 } else
958 event->len = 1134 event->len = DIV_ROUND_UP(length, RB_ALIGNMENT);
959 (length + (RB_ALIGNMENT-1))
960 >> RB_ALIGNMENT_SHIFT;
961 break; 1135 break;
962 default: 1136 default:
963 BUG(); 1137 BUG();
964 } 1138 }
965} 1139}
966 1140
967static inline unsigned rb_calculate_event_length(unsigned length) 1141static unsigned rb_calculate_event_length(unsigned length)
968{ 1142{
969 struct ring_buffer_event event; /* Used only for sizeof array */ 1143 struct ring_buffer_event event; /* Used only for sizeof array */
970 1144
@@ -990,6 +1164,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
990 struct ring_buffer *buffer = cpu_buffer->buffer; 1164 struct ring_buffer *buffer = cpu_buffer->buffer;
991 struct ring_buffer_event *event; 1165 struct ring_buffer_event *event;
992 unsigned long flags; 1166 unsigned long flags;
1167 bool lock_taken = false;
993 1168
994 commit_page = cpu_buffer->commit_page; 1169 commit_page = cpu_buffer->commit_page;
995 /* we just need to protect against interrupts */ 1170 /* we just need to protect against interrupts */
@@ -1003,7 +1178,30 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1003 struct buffer_page *next_page = tail_page; 1178 struct buffer_page *next_page = tail_page;
1004 1179
1005 local_irq_save(flags); 1180 local_irq_save(flags);
1006 __raw_spin_lock(&cpu_buffer->lock); 1181 /*
1182 * Since the write to the buffer is still not
1183 * fully lockless, we must be careful with NMIs.
1184 * The locks in the writers are taken when a write
1185 * crosses to a new page. The locks protect against
1186 * races with the readers (this will soon be fixed
1187 * with a lockless solution).
1188 *
1189 * Because we can not protect against NMIs, and we
1190 * want to keep traces reentrant, we need to manage
1191 * what happens when we are in an NMI.
1192 *
1193 * NMIs can happen after we take the lock.
1194 * If we are in an NMI, only take the lock
1195 * if it is not already taken. Otherwise
1196 * simply fail.
1197 */
1198 if (unlikely(in_nmi())) {
1199 if (!__raw_spin_trylock(&cpu_buffer->lock))
1200 goto out_reset;
1201 } else
1202 __raw_spin_lock(&cpu_buffer->lock);
1203
1204 lock_taken = true;
1007 1205
1008 rb_inc_page(cpu_buffer, &next_page); 1206 rb_inc_page(cpu_buffer, &next_page);
1009 1207
@@ -1012,7 +1210,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1012 1210
1013 /* we grabbed the lock before incrementing */ 1211 /* we grabbed the lock before incrementing */
1014 if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) 1212 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1015 goto out_unlock; 1213 goto out_reset;
1016 1214
1017 /* 1215 /*
1018 * If for some reason, we had an interrupt storm that made 1216 * If for some reason, we had an interrupt storm that made
@@ -1021,16 +1219,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1021 */ 1219 */
1022 if (unlikely(next_page == commit_page)) { 1220 if (unlikely(next_page == commit_page)) {
1023 WARN_ON_ONCE(1); 1221 WARN_ON_ONCE(1);
1024 goto out_unlock; 1222 goto out_reset;
1025 } 1223 }
1026 1224
1027 if (next_page == head_page) { 1225 if (next_page == head_page) {
1028 if (!(buffer->flags & RB_FL_OVERWRITE)) { 1226 if (!(buffer->flags & RB_FL_OVERWRITE))
1029 /* reset write */ 1227 goto out_reset;
1030 if (tail <= BUF_PAGE_SIZE)
1031 local_set(&tail_page->write, tail);
1032 goto out_unlock;
1033 }
1034 1228
1035 /* tail_page has not moved yet? */ 1229 /* tail_page has not moved yet? */
1036 if (tail_page == cpu_buffer->tail_page) { 1230 if (tail_page == cpu_buffer->tail_page) {
@@ -1054,7 +1248,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1054 cpu_buffer->tail_page = next_page; 1248 cpu_buffer->tail_page = next_page;
1055 1249
1056 /* reread the time stamp */ 1250 /* reread the time stamp */
1057 *ts = ring_buffer_time_stamp(cpu_buffer->cpu); 1251 *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
1058 cpu_buffer->tail_page->page->time_stamp = *ts; 1252 cpu_buffer->tail_page->page->time_stamp = *ts;
1059 } 1253 }
1060 1254
@@ -1064,7 +1258,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1064 if (tail < BUF_PAGE_SIZE) { 1258 if (tail < BUF_PAGE_SIZE) {
1065 /* Mark the rest of the page with padding */ 1259 /* Mark the rest of the page with padding */
1066 event = __rb_page_index(tail_page, tail); 1260 event = __rb_page_index(tail_page, tail);
1067 event->type = RINGBUF_TYPE_PADDING; 1261 rb_event_set_padding(event);
1068 } 1262 }
1069 1263
1070 if (tail <= BUF_PAGE_SIZE) 1264 if (tail <= BUF_PAGE_SIZE)
@@ -1104,8 +1298,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1104 1298
1105 return event; 1299 return event;
1106 1300
1107 out_unlock: 1301 out_reset:
1108 __raw_spin_unlock(&cpu_buffer->lock); 1302 /* reset write */
1303 if (tail <= BUF_PAGE_SIZE)
1304 local_set(&tail_page->write, tail);
1305
1306 if (likely(lock_taken))
1307 __raw_spin_unlock(&cpu_buffer->lock);
1109 local_irq_restore(flags); 1308 local_irq_restore(flags);
1110 return NULL; 1309 return NULL;
1111} 1310}
@@ -1192,7 +1391,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1192 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 1391 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1193 return NULL; 1392 return NULL;
1194 1393
1195 ts = ring_buffer_time_stamp(cpu_buffer->cpu); 1394 ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
1196 1395
1197 /* 1396 /*
1198 * Only the first commit can update the timestamp. 1397 * Only the first commit can update the timestamp.
@@ -1265,7 +1464,6 @@ static DEFINE_PER_CPU(int, rb_need_resched);
1265 * ring_buffer_lock_reserve - reserve a part of the buffer 1464 * ring_buffer_lock_reserve - reserve a part of the buffer
1266 * @buffer: the ring buffer to reserve from 1465 * @buffer: the ring buffer to reserve from
1267 * @length: the length of the data to reserve (excluding event header) 1466 * @length: the length of the data to reserve (excluding event header)
1268 * @flags: a pointer to save the interrupt flags
1269 * 1467 *
1270 * Returns a reseverd event on the ring buffer to copy directly to. 1468 * Returns a reseverd event on the ring buffer to copy directly to.
1271 * The user of this interface will need to get the body to write into 1469 * The user of this interface will need to get the body to write into
@@ -1278,9 +1476,7 @@ static DEFINE_PER_CPU(int, rb_need_resched);
1278 * If NULL is returned, then nothing has been allocated or locked. 1476 * If NULL is returned, then nothing has been allocated or locked.
1279 */ 1477 */
1280struct ring_buffer_event * 1478struct ring_buffer_event *
1281ring_buffer_lock_reserve(struct ring_buffer *buffer, 1479ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1282 unsigned long length,
1283 unsigned long *flags)
1284{ 1480{
1285 struct ring_buffer_per_cpu *cpu_buffer; 1481 struct ring_buffer_per_cpu *cpu_buffer;
1286 struct ring_buffer_event *event; 1482 struct ring_buffer_event *event;
@@ -1347,15 +1543,13 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1347 * ring_buffer_unlock_commit - commit a reserved 1543 * ring_buffer_unlock_commit - commit a reserved
1348 * @buffer: The buffer to commit to 1544 * @buffer: The buffer to commit to
1349 * @event: The event pointer to commit. 1545 * @event: The event pointer to commit.
1350 * @flags: the interrupt flags received from ring_buffer_lock_reserve.
1351 * 1546 *
1352 * This commits the data to the ring buffer, and releases any locks held. 1547 * This commits the data to the ring buffer, and releases any locks held.
1353 * 1548 *
1354 * Must be paired with ring_buffer_lock_reserve. 1549 * Must be paired with ring_buffer_lock_reserve.
1355 */ 1550 */
1356int ring_buffer_unlock_commit(struct ring_buffer *buffer, 1551int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1357 struct ring_buffer_event *event, 1552 struct ring_buffer_event *event)
1358 unsigned long flags)
1359{ 1553{
1360 struct ring_buffer_per_cpu *cpu_buffer; 1554 struct ring_buffer_per_cpu *cpu_buffer;
1361 int cpu = raw_smp_processor_id(); 1555 int cpu = raw_smp_processor_id();
@@ -1438,7 +1632,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1438} 1632}
1439EXPORT_SYMBOL_GPL(ring_buffer_write); 1633EXPORT_SYMBOL_GPL(ring_buffer_write);
1440 1634
1441static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) 1635static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1442{ 1636{
1443 struct buffer_page *reader = cpu_buffer->reader_page; 1637 struct buffer_page *reader = cpu_buffer->reader_page;
1444 struct buffer_page *head = cpu_buffer->head_page; 1638 struct buffer_page *head = cpu_buffer->head_page;
@@ -1528,12 +1722,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
1528unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) 1722unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1529{ 1723{
1530 struct ring_buffer_per_cpu *cpu_buffer; 1724 struct ring_buffer_per_cpu *cpu_buffer;
1725 unsigned long ret;
1531 1726
1532 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 1727 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1533 return 0; 1728 return 0;
1534 1729
1535 cpu_buffer = buffer->buffers[cpu]; 1730 cpu_buffer = buffer->buffers[cpu];
1536 return cpu_buffer->entries; 1731 ret = cpu_buffer->entries;
1732
1733 return ret;
1537} 1734}
1538EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 1735EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
1539 1736
@@ -1545,12 +1742,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
1545unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) 1742unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1546{ 1743{
1547 struct ring_buffer_per_cpu *cpu_buffer; 1744 struct ring_buffer_per_cpu *cpu_buffer;
1745 unsigned long ret;
1548 1746
1549 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 1747 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1550 return 0; 1748 return 0;
1551 1749
1552 cpu_buffer = buffer->buffers[cpu]; 1750 cpu_buffer = buffer->buffers[cpu];
1553 return cpu_buffer->overrun; 1751 ret = cpu_buffer->overrun;
1752
1753 return ret;
1554} 1754}
1555EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 1755EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1556 1756
@@ -1627,9 +1827,14 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
1627 */ 1827 */
1628void ring_buffer_iter_reset(struct ring_buffer_iter *iter) 1828void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1629{ 1829{
1630 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 1830 struct ring_buffer_per_cpu *cpu_buffer;
1631 unsigned long flags; 1831 unsigned long flags;
1632 1832
1833 if (!iter)
1834 return;
1835
1836 cpu_buffer = iter->cpu_buffer;
1837
1633 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 1838 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1634 rb_iter_reset(iter); 1839 rb_iter_reset(iter);
1635 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 1840 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -1803,7 +2008,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
1803 2008
1804 event = rb_reader_event(cpu_buffer); 2009 event = rb_reader_event(cpu_buffer);
1805 2010
1806 if (event->type == RINGBUF_TYPE_DATA) 2011 if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event))
1807 cpu_buffer->entries--; 2012 cpu_buffer->entries--;
1808 2013
1809 rb_update_read_stamp(cpu_buffer, event); 2014 rb_update_read_stamp(cpu_buffer, event);
@@ -1864,9 +2069,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1864 struct buffer_page *reader; 2069 struct buffer_page *reader;
1865 int nr_loops = 0; 2070 int nr_loops = 0;
1866 2071
1867 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1868 return NULL;
1869
1870 cpu_buffer = buffer->buffers[cpu]; 2072 cpu_buffer = buffer->buffers[cpu];
1871 2073
1872 again: 2074 again:
@@ -1889,9 +2091,18 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1889 2091
1890 switch (event->type) { 2092 switch (event->type) {
1891 case RINGBUF_TYPE_PADDING: 2093 case RINGBUF_TYPE_PADDING:
1892 RB_WARN_ON(cpu_buffer, 1); 2094 if (rb_null_event(event))
2095 RB_WARN_ON(cpu_buffer, 1);
2096 /*
2097 * Because the writer could be discarding every
2098 * event it creates (which would probably be bad)
2099 * if we were to go back to "again" then we may never
2100 * catch up, and will trigger the warn on, or lock
2101 * the box. Return the padding, and we will release
2102 * the current locks, and try again.
2103 */
1893 rb_advance_reader(cpu_buffer); 2104 rb_advance_reader(cpu_buffer);
1894 return NULL; 2105 return event;
1895 2106
1896 case RINGBUF_TYPE_TIME_EXTEND: 2107 case RINGBUF_TYPE_TIME_EXTEND:
1897 /* Internal data, OK to advance */ 2108 /* Internal data, OK to advance */
@@ -1906,7 +2117,8 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1906 case RINGBUF_TYPE_DATA: 2117 case RINGBUF_TYPE_DATA:
1907 if (ts) { 2118 if (ts) {
1908 *ts = cpu_buffer->read_stamp + event->time_delta; 2119 *ts = cpu_buffer->read_stamp + event->time_delta;
1909 ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); 2120 ring_buffer_normalize_time_stamp(buffer,
2121 cpu_buffer->cpu, ts);
1910 } 2122 }
1911 return event; 2123 return event;
1912 2124
@@ -1951,8 +2163,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1951 2163
1952 switch (event->type) { 2164 switch (event->type) {
1953 case RINGBUF_TYPE_PADDING: 2165 case RINGBUF_TYPE_PADDING:
1954 rb_inc_iter(iter); 2166 if (rb_null_event(event)) {
1955 goto again; 2167 rb_inc_iter(iter);
2168 goto again;
2169 }
2170 rb_advance_iter(iter);
2171 return event;
1956 2172
1957 case RINGBUF_TYPE_TIME_EXTEND: 2173 case RINGBUF_TYPE_TIME_EXTEND:
1958 /* Internal data, OK to advance */ 2174 /* Internal data, OK to advance */
@@ -1967,7 +2183,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1967 case RINGBUF_TYPE_DATA: 2183 case RINGBUF_TYPE_DATA:
1968 if (ts) { 2184 if (ts) {
1969 *ts = iter->read_stamp + event->time_delta; 2185 *ts = iter->read_stamp + event->time_delta;
1970 ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); 2186 ring_buffer_normalize_time_stamp(buffer,
2187 cpu_buffer->cpu, ts);
1971 } 2188 }
1972 return event; 2189 return event;
1973 2190
@@ -1995,10 +2212,19 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1995 struct ring_buffer_event *event; 2212 struct ring_buffer_event *event;
1996 unsigned long flags; 2213 unsigned long flags;
1997 2214
2215 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2216 return NULL;
2217
2218 again:
1998 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2219 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1999 event = rb_buffer_peek(buffer, cpu, ts); 2220 event = rb_buffer_peek(buffer, cpu, ts);
2000 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2221 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2001 2222
2223 if (event && event->type == RINGBUF_TYPE_PADDING) {
2224 cpu_relax();
2225 goto again;
2226 }
2227
2002 return event; 2228 return event;
2003} 2229}
2004 2230
@@ -2017,10 +2243,16 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2017 struct ring_buffer_event *event; 2243 struct ring_buffer_event *event;
2018 unsigned long flags; 2244 unsigned long flags;
2019 2245
2246 again:
2020 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2247 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2021 event = rb_iter_peek(iter, ts); 2248 event = rb_iter_peek(iter, ts);
2022 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2249 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2023 2250
2251 if (event && event->type == RINGBUF_TYPE_PADDING) {
2252 cpu_relax();
2253 goto again;
2254 }
2255
2024 return event; 2256 return event;
2025} 2257}
2026 2258
@@ -2035,24 +2267,37 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2035struct ring_buffer_event * 2267struct ring_buffer_event *
2036ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 2268ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2037{ 2269{
2038 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2270 struct ring_buffer_per_cpu *cpu_buffer;
2039 struct ring_buffer_event *event; 2271 struct ring_buffer_event *event = NULL;
2040 unsigned long flags; 2272 unsigned long flags;
2041 2273
2274 again:
2275 /* might be called in atomic */
2276 preempt_disable();
2277
2042 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2278 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2043 return NULL; 2279 goto out;
2044 2280
2281 cpu_buffer = buffer->buffers[cpu];
2045 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2282 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2046 2283
2047 event = rb_buffer_peek(buffer, cpu, ts); 2284 event = rb_buffer_peek(buffer, cpu, ts);
2048 if (!event) 2285 if (!event)
2049 goto out; 2286 goto out_unlock;
2050 2287
2051 rb_advance_reader(cpu_buffer); 2288 rb_advance_reader(cpu_buffer);
2052 2289
2053 out: 2290 out_unlock:
2054 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2291 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2055 2292
2293 out:
2294 preempt_enable();
2295
2296 if (event && event->type == RINGBUF_TYPE_PADDING) {
2297 cpu_relax();
2298 goto again;
2299 }
2300
2056 return event; 2301 return event;
2057} 2302}
2058EXPORT_SYMBOL_GPL(ring_buffer_consume); 2303EXPORT_SYMBOL_GPL(ring_buffer_consume);
@@ -2131,6 +2376,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2131 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 2376 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2132 unsigned long flags; 2377 unsigned long flags;
2133 2378
2379 again:
2134 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2380 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2135 event = rb_iter_peek(iter, ts); 2381 event = rb_iter_peek(iter, ts);
2136 if (!event) 2382 if (!event)
@@ -2140,6 +2386,11 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2140 out: 2386 out:
2141 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2387 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2142 2388
2389 if (event && event->type == RINGBUF_TYPE_PADDING) {
2390 cpu_relax();
2391 goto again;
2392 }
2393
2143 return event; 2394 return event;
2144} 2395}
2145EXPORT_SYMBOL_GPL(ring_buffer_read); 2396EXPORT_SYMBOL_GPL(ring_buffer_read);
@@ -2174,6 +2425,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2174 2425
2175 cpu_buffer->overrun = 0; 2426 cpu_buffer->overrun = 0;
2176 cpu_buffer->entries = 0; 2427 cpu_buffer->entries = 0;
2428
2429 cpu_buffer->write_stamp = 0;
2430 cpu_buffer->read_stamp = 0;
2177} 2431}
2178 2432
2179/** 2433/**
@@ -2229,6 +2483,7 @@ int ring_buffer_empty(struct ring_buffer *buffer)
2229 if (!rb_per_cpu_empty(cpu_buffer)) 2483 if (!rb_per_cpu_empty(cpu_buffer))
2230 return 0; 2484 return 0;
2231 } 2485 }
2486
2232 return 1; 2487 return 1;
2233} 2488}
2234EXPORT_SYMBOL_GPL(ring_buffer_empty); 2489EXPORT_SYMBOL_GPL(ring_buffer_empty);
@@ -2241,12 +2496,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
2241int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) 2496int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2242{ 2497{
2243 struct ring_buffer_per_cpu *cpu_buffer; 2498 struct ring_buffer_per_cpu *cpu_buffer;
2499 int ret;
2244 2500
2245 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2501 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2246 return 1; 2502 return 1;
2247 2503
2248 cpu_buffer = buffer->buffers[cpu]; 2504 cpu_buffer = buffer->buffers[cpu];
2249 return rb_per_cpu_empty(cpu_buffer); 2505 ret = rb_per_cpu_empty(cpu_buffer);
2506
2507
2508 return ret;
2250} 2509}
2251EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); 2510EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
2252 2511
@@ -2265,18 +2524,36 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2265{ 2524{
2266 struct ring_buffer_per_cpu *cpu_buffer_a; 2525 struct ring_buffer_per_cpu *cpu_buffer_a;
2267 struct ring_buffer_per_cpu *cpu_buffer_b; 2526 struct ring_buffer_per_cpu *cpu_buffer_b;
2527 int ret = -EINVAL;
2268 2528
2269 if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || 2529 if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
2270 !cpumask_test_cpu(cpu, buffer_b->cpumask)) 2530 !cpumask_test_cpu(cpu, buffer_b->cpumask))
2271 return -EINVAL; 2531 goto out;
2272 2532
2273 /* At least make sure the two buffers are somewhat the same */ 2533 /* At least make sure the two buffers are somewhat the same */
2274 if (buffer_a->pages != buffer_b->pages) 2534 if (buffer_a->pages != buffer_b->pages)
2275 return -EINVAL; 2535 goto out;
2536
2537 ret = -EAGAIN;
2538
2539 if (ring_buffer_flags != RB_BUFFERS_ON)
2540 goto out;
2541
2542 if (atomic_read(&buffer_a->record_disabled))
2543 goto out;
2544
2545 if (atomic_read(&buffer_b->record_disabled))
2546 goto out;
2276 2547
2277 cpu_buffer_a = buffer_a->buffers[cpu]; 2548 cpu_buffer_a = buffer_a->buffers[cpu];
2278 cpu_buffer_b = buffer_b->buffers[cpu]; 2549 cpu_buffer_b = buffer_b->buffers[cpu];
2279 2550
2551 if (atomic_read(&cpu_buffer_a->record_disabled))
2552 goto out;
2553
2554 if (atomic_read(&cpu_buffer_b->record_disabled))
2555 goto out;
2556
2280 /* 2557 /*
2281 * We can't do a synchronize_sched here because this 2558 * We can't do a synchronize_sched here because this
2282 * function can be called in atomic context. 2559 * function can be called in atomic context.
@@ -2295,18 +2572,21 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2295 atomic_dec(&cpu_buffer_a->record_disabled); 2572 atomic_dec(&cpu_buffer_a->record_disabled);
2296 atomic_dec(&cpu_buffer_b->record_disabled); 2573 atomic_dec(&cpu_buffer_b->record_disabled);
2297 2574
2298 return 0; 2575 ret = 0;
2576out:
2577 return ret;
2299} 2578}
2300EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); 2579EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
2301 2580
2302static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, 2581static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
2303 struct buffer_data_page *bpage) 2582 struct buffer_data_page *bpage,
2583 unsigned int offset)
2304{ 2584{
2305 struct ring_buffer_event *event; 2585 struct ring_buffer_event *event;
2306 unsigned long head; 2586 unsigned long head;
2307 2587
2308 __raw_spin_lock(&cpu_buffer->lock); 2588 __raw_spin_lock(&cpu_buffer->lock);
2309 for (head = 0; head < local_read(&bpage->commit); 2589 for (head = offset; head < local_read(&bpage->commit);
2310 head += rb_event_length(event)) { 2590 head += rb_event_length(event)) {
2311 2591
2312 event = __rb_data_page_index(bpage, head); 2592 event = __rb_data_page_index(bpage, head);
@@ -2337,8 +2617,8 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
2337 */ 2617 */
2338void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) 2618void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
2339{ 2619{
2340 unsigned long addr;
2341 struct buffer_data_page *bpage; 2620 struct buffer_data_page *bpage;
2621 unsigned long addr;
2342 2622
2343 addr = __get_free_page(GFP_KERNEL); 2623 addr = __get_free_page(GFP_KERNEL);
2344 if (!addr) 2624 if (!addr)
@@ -2346,6 +2626,8 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
2346 2626
2347 bpage = (void *)addr; 2627 bpage = (void *)addr;
2348 2628
2629 rb_init_page(bpage);
2630
2349 return bpage; 2631 return bpage;
2350} 2632}
2351 2633
@@ -2365,6 +2647,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2365 * ring_buffer_read_page - extract a page from the ring buffer 2647 * ring_buffer_read_page - extract a page from the ring buffer
2366 * @buffer: buffer to extract from 2648 * @buffer: buffer to extract from
2367 * @data_page: the page to use allocated from ring_buffer_alloc_read_page 2649 * @data_page: the page to use allocated from ring_buffer_alloc_read_page
2650 * @len: amount to extract
2368 * @cpu: the cpu of the buffer to extract 2651 * @cpu: the cpu of the buffer to extract
2369 * @full: should the extraction only happen when the page is full. 2652 * @full: should the extraction only happen when the page is full.
2370 * 2653 *
@@ -2374,12 +2657,12 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2374 * to swap with a page in the ring buffer. 2657 * to swap with a page in the ring buffer.
2375 * 2658 *
2376 * for example: 2659 * for example:
2377 * rpage = ring_buffer_alloc_page(buffer); 2660 * rpage = ring_buffer_alloc_read_page(buffer);
2378 * if (!rpage) 2661 * if (!rpage)
2379 * return error; 2662 * return error;
2380 * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); 2663 * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
2381 * if (ret) 2664 * if (ret >= 0)
2382 * process_page(rpage); 2665 * process_page(rpage, ret);
2383 * 2666 *
2384 * When @full is set, the function will not return true unless 2667 * When @full is set, the function will not return true unless
2385 * the writer is off the reader page. 2668 * the writer is off the reader page.
@@ -2390,72 +2673,118 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2390 * responsible for that. 2673 * responsible for that.
2391 * 2674 *
2392 * Returns: 2675 * Returns:
2393 * 1 if data has been transferred 2676 * >=0 if data has been transferred, returns the offset of consumed data.
2394 * 0 if no data has been transferred. 2677 * <0 if no data has been transferred.
2395 */ 2678 */
2396int ring_buffer_read_page(struct ring_buffer *buffer, 2679int ring_buffer_read_page(struct ring_buffer *buffer,
2397 void **data_page, int cpu, int full) 2680 void **data_page, size_t len, int cpu, int full)
2398{ 2681{
2399 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2682 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2400 struct ring_buffer_event *event; 2683 struct ring_buffer_event *event;
2401 struct buffer_data_page *bpage; 2684 struct buffer_data_page *bpage;
2685 struct buffer_page *reader;
2402 unsigned long flags; 2686 unsigned long flags;
2403 int ret = 0; 2687 unsigned int commit;
2688 unsigned int read;
2689 u64 save_timestamp;
2690 int ret = -1;
2691
2692 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2693 goto out;
2694
2695 /*
2696 * If len is not big enough to hold the page header, then
2697 * we can not copy anything.
2698 */
2699 if (len <= BUF_PAGE_HDR_SIZE)
2700 goto out;
2701
2702 len -= BUF_PAGE_HDR_SIZE;
2404 2703
2405 if (!data_page) 2704 if (!data_page)
2406 return 0; 2705 goto out;
2407 2706
2408 bpage = *data_page; 2707 bpage = *data_page;
2409 if (!bpage) 2708 if (!bpage)
2410 return 0; 2709 goto out;
2411 2710
2412 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2711 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2413 2712
2414 /* 2713 reader = rb_get_reader_page(cpu_buffer);
2415 * rb_buffer_peek will get the next ring buffer if 2714 if (!reader)
2416 * the current reader page is empty. 2715 goto out_unlock;
2417 */ 2716
2418 event = rb_buffer_peek(buffer, cpu, NULL); 2717 event = rb_reader_event(cpu_buffer);
2419 if (!event) 2718
2420 goto out; 2719 read = reader->read;
2720 commit = rb_page_commit(reader);
2421 2721
2422 /* check for data */
2423 if (!local_read(&cpu_buffer->reader_page->page->commit))
2424 goto out;
2425 /* 2722 /*
2426 * If the writer is already off of the read page, then simply 2723 * If this page has been partially read or
2427 * switch the read page with the given page. Otherwise 2724 * if len is not big enough to read the rest of the page or
2428 * we need to copy the data from the reader to the writer. 2725 * a writer is still on the page, then
2726 * we must copy the data from the page to the buffer.
2727 * Otherwise, we can simply swap the page with the one passed in.
2429 */ 2728 */
2430 if (cpu_buffer->reader_page == cpu_buffer->commit_page) { 2729 if (read || (len < (commit - read)) ||
2431 unsigned int read = cpu_buffer->reader_page->read; 2730 cpu_buffer->reader_page == cpu_buffer->commit_page) {
2731 struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
2732 unsigned int rpos = read;
2733 unsigned int pos = 0;
2734 unsigned int size;
2432 2735
2433 if (full) 2736 if (full)
2434 goto out; 2737 goto out_unlock;
2435 /* The writer is still on the reader page, we must copy */ 2738
2436 bpage = cpu_buffer->reader_page->page; 2739 if (len > (commit - read))
2437 memcpy(bpage->data, 2740 len = (commit - read);
2438 cpu_buffer->reader_page->page->data + read, 2741
2439 local_read(&bpage->commit) - read); 2742 size = rb_event_length(event);
2743
2744 if (len < size)
2745 goto out_unlock;
2440 2746
2441 /* consume what was read */ 2747 /* save the current timestamp, since the user will need it */
2442 cpu_buffer->reader_page += read; 2748 save_timestamp = cpu_buffer->read_stamp;
2443 2749
2750 /* Need to copy one event at a time */
2751 do {
2752 memcpy(bpage->data + pos, rpage->data + rpos, size);
2753
2754 len -= size;
2755
2756 rb_advance_reader(cpu_buffer);
2757 rpos = reader->read;
2758 pos += size;
2759
2760 event = rb_reader_event(cpu_buffer);
2761 size = rb_event_length(event);
2762 } while (len > size);
2763
2764 /* update bpage */
2765 local_set(&bpage->commit, pos);
2766 bpage->time_stamp = save_timestamp;
2767
2768 /* we copied everything to the beginning */
2769 read = 0;
2444 } else { 2770 } else {
2445 /* swap the pages */ 2771 /* swap the pages */
2446 rb_init_page(bpage); 2772 rb_init_page(bpage);
2447 bpage = cpu_buffer->reader_page->page; 2773 bpage = reader->page;
2448 cpu_buffer->reader_page->page = *data_page; 2774 reader->page = *data_page;
2449 cpu_buffer->reader_page->read = 0; 2775 local_set(&reader->write, 0);
2776 reader->read = 0;
2450 *data_page = bpage; 2777 *data_page = bpage;
2778
2779 /* update the entry counter */
2780 rb_remove_entries(cpu_buffer, bpage, read);
2451 } 2781 }
2452 ret = 1; 2782 ret = read;
2453 2783
2454 /* update the entry counter */ 2784 out_unlock:
2455 rb_remove_entries(cpu_buffer, bpage);
2456 out:
2457 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2785 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2458 2786
2787 out:
2459 return ret; 2788 return ret;
2460} 2789}
2461 2790
@@ -2463,7 +2792,7 @@ static ssize_t
2463rb_simple_read(struct file *filp, char __user *ubuf, 2792rb_simple_read(struct file *filp, char __user *ubuf,
2464 size_t cnt, loff_t *ppos) 2793 size_t cnt, loff_t *ppos)
2465{ 2794{
2466 long *p = filp->private_data; 2795 unsigned long *p = filp->private_data;
2467 char buf[64]; 2796 char buf[64];
2468 int r; 2797 int r;
2469 2798
@@ -2479,9 +2808,9 @@ static ssize_t
2479rb_simple_write(struct file *filp, const char __user *ubuf, 2808rb_simple_write(struct file *filp, const char __user *ubuf,
2480 size_t cnt, loff_t *ppos) 2809 size_t cnt, loff_t *ppos)
2481{ 2810{
2482 long *p = filp->private_data; 2811 unsigned long *p = filp->private_data;
2483 char buf[64]; 2812 char buf[64];
2484 long val; 2813 unsigned long val;
2485 int ret; 2814 int ret;
2486 2815
2487 if (cnt >= sizeof(buf)) 2816 if (cnt >= sizeof(buf))
@@ -2506,7 +2835,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
2506 return cnt; 2835 return cnt;
2507} 2836}
2508 2837
2509static struct file_operations rb_simple_fops = { 2838static const struct file_operations rb_simple_fops = {
2510 .open = tracing_open_generic, 2839 .open = tracing_open_generic,
2511 .read = rb_simple_read, 2840 .read = rb_simple_read,
2512 .write = rb_simple_write, 2841 .write = rb_simple_write,
@@ -2529,3 +2858,42 @@ static __init int rb_init_debugfs(void)
2529} 2858}
2530 2859
2531fs_initcall(rb_init_debugfs); 2860fs_initcall(rb_init_debugfs);
2861
2862#ifdef CONFIG_HOTPLUG_CPU
2863static int rb_cpu_notify(struct notifier_block *self,
2864 unsigned long action, void *hcpu)
2865{
2866 struct ring_buffer *buffer =
2867 container_of(self, struct ring_buffer, cpu_notify);
2868 long cpu = (long)hcpu;
2869
2870 switch (action) {
2871 case CPU_UP_PREPARE:
2872 case CPU_UP_PREPARE_FROZEN:
2873 if (cpu_isset(cpu, *buffer->cpumask))
2874 return NOTIFY_OK;
2875
2876 buffer->buffers[cpu] =
2877 rb_allocate_cpu_buffer(buffer, cpu);
2878 if (!buffer->buffers[cpu]) {
2879 WARN(1, "failed to allocate ring buffer on CPU %ld\n",
2880 cpu);
2881 return NOTIFY_OK;
2882 }
2883 smp_wmb();
2884 cpu_set(cpu, *buffer->cpumask);
2885 break;
2886 case CPU_DOWN_PREPARE:
2887 case CPU_DOWN_PREPARE_FROZEN:
2888 /*
2889 * Do nothing.
2890 * If we were to free the buffer, then the user would
2891 * lose any trace that was in the buffer.
2892 */
2893 break;
2894 default:
2895 break;
2896 }
2897 return NOTIFY_OK;
2898}
2899#endif
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c580233add95..9d28476a9851 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -11,47 +11,60 @@
11 * Copyright (C) 2004-2006 Ingo Molnar 11 * Copyright (C) 2004-2006 Ingo Molnar
12 * Copyright (C) 2004 William Lee Irwin III 12 * Copyright (C) 2004 William Lee Irwin III
13 */ 13 */
14#include <linux/ring_buffer.h>
14#include <linux/utsrelease.h> 15#include <linux/utsrelease.h>
16#include <linux/stacktrace.h>
17#include <linux/writeback.h>
15#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
16#include <linux/seq_file.h> 19#include <linux/seq_file.h>
17#include <linux/notifier.h> 20#include <linux/notifier.h>
21#include <linux/irqflags.h>
18#include <linux/debugfs.h> 22#include <linux/debugfs.h>
19#include <linux/pagemap.h> 23#include <linux/pagemap.h>
20#include <linux/hardirq.h> 24#include <linux/hardirq.h>
21#include <linux/linkage.h> 25#include <linux/linkage.h>
22#include <linux/uaccess.h> 26#include <linux/uaccess.h>
27#include <linux/kprobes.h>
23#include <linux/ftrace.h> 28#include <linux/ftrace.h>
24#include <linux/module.h> 29#include <linux/module.h>
25#include <linux/percpu.h> 30#include <linux/percpu.h>
31#include <linux/splice.h>
26#include <linux/kdebug.h> 32#include <linux/kdebug.h>
33#include <linux/string.h>
27#include <linux/ctype.h> 34#include <linux/ctype.h>
28#include <linux/init.h> 35#include <linux/init.h>
29#include <linux/poll.h> 36#include <linux/poll.h>
30#include <linux/gfp.h> 37#include <linux/gfp.h>
31#include <linux/fs.h> 38#include <linux/fs.h>
32#include <linux/kprobes.h>
33#include <linux/writeback.h>
34
35#include <linux/stacktrace.h>
36#include <linux/ring_buffer.h>
37#include <linux/irqflags.h>
38 39
39#include "trace.h" 40#include "trace.h"
41#include "trace_output.h"
40 42
41#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) 43#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
42 44
43unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; 45unsigned long __read_mostly tracing_max_latency;
44unsigned long __read_mostly tracing_thresh; 46unsigned long __read_mostly tracing_thresh;
45 47
46/* 48/*
49 * On boot up, the ring buffer is set to the minimum size, so that
50 * we do not waste memory on systems that are not using tracing.
51 */
52static int ring_buffer_expanded;
53
54/*
47 * We need to change this state when a selftest is running. 55 * We need to change this state when a selftest is running.
48 * A selftest will lurk into the ring-buffer to count the 56 * A selftest will lurk into the ring-buffer to count the
49 * entries inserted during the selftest although some concurrent 57 * entries inserted during the selftest although some concurrent
50 * insertions into the ring-buffer such as ftrace_printk could occurred 58 * insertions into the ring-buffer such as trace_printk could occurred
51 * at the same time, giving false positive or negative results. 59 * at the same time, giving false positive or negative results.
52 */ 60 */
53static bool __read_mostly tracing_selftest_running; 61static bool __read_mostly tracing_selftest_running;
54 62
63/*
64 * If a tracer is running, we do not want to run SELFTEST.
65 */
66static bool __read_mostly tracing_selftest_disabled;
67
55/* For tracers that don't implement custom flags */ 68/* For tracers that don't implement custom flags */
56static struct tracer_opt dummy_tracer_opt[] = { 69static struct tracer_opt dummy_tracer_opt[] = {
57 { } 70 { }
@@ -73,7 +86,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
73 * of the tracer is successful. But that is the only place that sets 86 * of the tracer is successful. But that is the only place that sets
74 * this back to zero. 87 * this back to zero.
75 */ 88 */
76int tracing_disabled = 1; 89static int tracing_disabled = 1;
77 90
78static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 91static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
79 92
@@ -91,6 +104,9 @@ static inline void ftrace_enable_cpu(void)
91 104
92static cpumask_var_t __read_mostly tracing_buffer_mask; 105static cpumask_var_t __read_mostly tracing_buffer_mask;
93 106
107/* Define which cpu buffers are currently read in trace_pipe */
108static cpumask_var_t tracing_reader_cpumask;
109
94#define for_each_tracing_cpu(cpu) \ 110#define for_each_tracing_cpu(cpu) \
95 for_each_cpu(cpu, tracing_buffer_mask) 111 for_each_cpu(cpu, tracing_buffer_mask)
96 112
@@ -109,14 +125,21 @@ static cpumask_var_t __read_mostly tracing_buffer_mask;
109 */ 125 */
110int ftrace_dump_on_oops; 126int ftrace_dump_on_oops;
111 127
112static int tracing_set_tracer(char *buf); 128static int tracing_set_tracer(const char *buf);
129
130#define BOOTUP_TRACER_SIZE 100
131static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata;
132static char *default_bootup_tracer;
113 133
114static int __init set_ftrace(char *str) 134static int __init set_ftrace(char *str)
115{ 135{
116 tracing_set_tracer(str); 136 strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE);
137 default_bootup_tracer = bootup_tracer_buf;
138 /* We are using ftrace early, expand it */
139 ring_buffer_expanded = 1;
117 return 1; 140 return 1;
118} 141}
119__setup("ftrace", set_ftrace); 142__setup("ftrace=", set_ftrace);
120 143
121static int __init set_ftrace_dump_on_oops(char *str) 144static int __init set_ftrace_dump_on_oops(char *str)
122{ 145{
@@ -125,21 +148,13 @@ static int __init set_ftrace_dump_on_oops(char *str)
125} 148}
126__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 149__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
127 150
128long 151unsigned long long ns2usecs(cycle_t nsec)
129ns2usecs(cycle_t nsec)
130{ 152{
131 nsec += 500; 153 nsec += 500;
132 do_div(nsec, 1000); 154 do_div(nsec, 1000);
133 return nsec; 155 return nsec;
134} 156}
135 157
136cycle_t ftrace_now(int cpu)
137{
138 u64 ts = ring_buffer_time_stamp(cpu);
139 ring_buffer_normalize_time_stamp(cpu, &ts);
140 return ts;
141}
142
143/* 158/*
144 * The global_trace is the descriptor that holds the tracing 159 * The global_trace is the descriptor that holds the tracing
145 * buffers for the live tracing. For each CPU, it contains 160 * buffers for the live tracing. For each CPU, it contains
@@ -156,6 +171,20 @@ static struct trace_array global_trace;
156 171
157static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 172static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
158 173
174cycle_t ftrace_now(int cpu)
175{
176 u64 ts;
177
178 /* Early boot up does not have a buffer yet */
179 if (!global_trace.buffer)
180 return trace_clock_local();
181
182 ts = ring_buffer_time_stamp(global_trace.buffer, cpu);
183 ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts);
184
185 return ts;
186}
187
159/* 188/*
160 * The max_tr is used to snapshot the global_trace when a maximum 189 * The max_tr is used to snapshot the global_trace when a maximum
161 * latency is reached. Some tracers will use this to store a maximum 190 * latency is reached. Some tracers will use this to store a maximum
@@ -186,9 +215,6 @@ int tracing_is_enabled(void)
186 return tracer_enabled; 215 return tracer_enabled;
187} 216}
188 217
189/* function tracing enabled */
190int ftrace_function_enabled;
191
192/* 218/*
193 * trace_buf_size is the size in bytes that is allocated 219 * trace_buf_size is the size in bytes that is allocated
194 * for a buffer. Note, the number of bytes is always rounded 220 * for a buffer. Note, the number of bytes is always rounded
@@ -229,7 +255,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
229 255
230/* trace_flags holds trace_options default values */ 256/* trace_flags holds trace_options default values */
231unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 257unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
232 TRACE_ITER_ANNOTATE; 258 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME;
233 259
234/** 260/**
235 * trace_wake_up - wake up tasks waiting for trace input 261 * trace_wake_up - wake up tasks waiting for trace input
@@ -280,13 +306,17 @@ static const char *trace_options[] = {
280 "block", 306 "block",
281 "stacktrace", 307 "stacktrace",
282 "sched-tree", 308 "sched-tree",
283 "ftrace_printk", 309 "trace_printk",
284 "ftrace_preempt", 310 "ftrace_preempt",
285 "branch", 311 "branch",
286 "annotate", 312 "annotate",
287 "userstacktrace", 313 "userstacktrace",
288 "sym-userobj", 314 "sym-userobj",
289 "printk-msg-only", 315 "printk-msg-only",
316 "context-info",
317 "latency-format",
318 "global-clock",
319 "sleep-time",
290 NULL 320 NULL
291}; 321};
292 322
@@ -326,146 +356,37 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
326 data->rt_priority = tsk->rt_priority; 356 data->rt_priority = tsk->rt_priority;
327 357
328 /* record this tasks comm */ 358 /* record this tasks comm */
329 tracing_record_cmdline(current); 359 tracing_record_cmdline(tsk);
330} 360}
331 361
332/** 362ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
333 * trace_seq_printf - sequence printing of trace information
334 * @s: trace sequence descriptor
335 * @fmt: printf format string
336 *
337 * The tracer may use either sequence operations or its own
338 * copy to user routines. To simplify formating of a trace
339 * trace_seq_printf is used to store strings into a special
340 * buffer (@s). Then the output may be either used by
341 * the sequencer or pulled into another buffer.
342 */
343int
344trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
345{ 363{
346 int len = (PAGE_SIZE - 1) - s->len; 364 int len;
347 va_list ap;
348 int ret; 365 int ret;
349 366
350 if (!len) 367 if (!cnt)
351 return 0;
352
353 va_start(ap, fmt);
354 ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
355 va_end(ap);
356
357 /* If we can't write it all, don't bother writing anything */
358 if (ret >= len)
359 return 0;
360
361 s->len += ret;
362
363 return len;
364}
365
366/**
367 * trace_seq_puts - trace sequence printing of simple string
368 * @s: trace sequence descriptor
369 * @str: simple string to record
370 *
371 * The tracer may use either the sequence operations or its own
372 * copy to user routines. This function records a simple string
373 * into a special buffer (@s) for later retrieval by a sequencer
374 * or other mechanism.
375 */
376static int
377trace_seq_puts(struct trace_seq *s, const char *str)
378{
379 int len = strlen(str);
380
381 if (len > ((PAGE_SIZE - 1) - s->len))
382 return 0;
383
384 memcpy(s->buffer + s->len, str, len);
385 s->len += len;
386
387 return len;
388}
389
390static int
391trace_seq_putc(struct trace_seq *s, unsigned char c)
392{
393 if (s->len >= (PAGE_SIZE - 1))
394 return 0;
395
396 s->buffer[s->len++] = c;
397
398 return 1;
399}
400
401static int
402trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
403{
404 if (len > ((PAGE_SIZE - 1) - s->len))
405 return 0; 368 return 0;
406 369
407 memcpy(s->buffer + s->len, mem, len); 370 if (s->len <= s->readpos)
408 s->len += len; 371 return -EBUSY;
409
410 return len;
411}
412
413#define MAX_MEMHEX_BYTES 8
414#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
415
416static int
417trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
418{
419 unsigned char hex[HEX_CHARS];
420 unsigned char *data = mem;
421 int i, j;
422
423#ifdef __BIG_ENDIAN
424 for (i = 0, j = 0; i < len; i++) {
425#else
426 for (i = len-1, j = 0; i >= 0; i--) {
427#endif
428 hex[j++] = hex_asc_hi(data[i]);
429 hex[j++] = hex_asc_lo(data[i]);
430 }
431 hex[j++] = ' ';
432
433 return trace_seq_putmem(s, hex, j);
434}
435
436static int
437trace_seq_path(struct trace_seq *s, struct path *path)
438{
439 unsigned char *p;
440 372
441 if (s->len >= (PAGE_SIZE - 1)) 373 len = s->len - s->readpos;
442 return 0; 374 if (cnt > len)
443 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); 375 cnt = len;
444 if (!IS_ERR(p)) { 376 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
445 p = mangle_path(s->buffer + s->len, p, "\n"); 377 if (ret == cnt)
446 if (p) { 378 return -EFAULT;
447 s->len = p - s->buffer;
448 return 1;
449 }
450 } else {
451 s->buffer[s->len++] = '?';
452 return 1;
453 }
454 379
455 return 0; 380 cnt -= ret;
456}
457 381
458static void 382 s->readpos += cnt;
459trace_seq_reset(struct trace_seq *s) 383 return cnt;
460{
461 s->len = 0;
462 s->readpos = 0;
463} 384}
464 385
465ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) 386static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
466{ 387{
467 int len; 388 int len;
468 int ret; 389 void *ret;
469 390
470 if (s->len <= s->readpos) 391 if (s->len <= s->readpos)
471 return -EBUSY; 392 return -EBUSY;
@@ -473,11 +394,11 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
473 len = s->len - s->readpos; 394 len = s->len - s->readpos;
474 if (cnt > len) 395 if (cnt > len)
475 cnt = len; 396 cnt = len;
476 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); 397 ret = memcpy(buf, s->buffer + s->readpos, cnt);
477 if (ret) 398 if (!ret)
478 return -EFAULT; 399 return -EFAULT;
479 400
480 s->readpos += len; 401 s->readpos += cnt;
481 return cnt; 402 return cnt;
482} 403}
483 404
@@ -489,7 +410,7 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s)
489 s->buffer[len] = 0; 410 s->buffer[len] = 0;
490 seq_puts(m, s->buffer); 411 seq_puts(m, s->buffer);
491 412
492 trace_seq_reset(s); 413 trace_seq_init(s);
493} 414}
494 415
495/** 416/**
@@ -543,7 +464,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
543 464
544 ftrace_enable_cpu(); 465 ftrace_enable_cpu();
545 466
546 WARN_ON_ONCE(ret); 467 WARN_ON_ONCE(ret && ret != -EAGAIN);
547 468
548 __update_max_tr(tr, tsk, cpu); 469 __update_max_tr(tr, tsk, cpu);
549 __raw_spin_unlock(&ftrace_max_lock); 470 __raw_spin_unlock(&ftrace_max_lock);
@@ -556,6 +477,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
556 * Register a new plugin tracer. 477 * Register a new plugin tracer.
557 */ 478 */
558int register_tracer(struct tracer *type) 479int register_tracer(struct tracer *type)
480__releases(kernel_lock)
481__acquires(kernel_lock)
559{ 482{
560 struct tracer *t; 483 struct tracer *t;
561 int len; 484 int len;
@@ -594,9 +517,12 @@ int register_tracer(struct tracer *type)
594 else 517 else
595 if (!type->flags->opts) 518 if (!type->flags->opts)
596 type->flags->opts = dummy_tracer_opt; 519 type->flags->opts = dummy_tracer_opt;
520 if (!type->wait_pipe)
521 type->wait_pipe = default_wait_pipe;
522
597 523
598#ifdef CONFIG_FTRACE_STARTUP_TEST 524#ifdef CONFIG_FTRACE_STARTUP_TEST
599 if (type->selftest) { 525 if (type->selftest && !tracing_selftest_disabled) {
600 struct tracer *saved_tracer = current_trace; 526 struct tracer *saved_tracer = current_trace;
601 struct trace_array *tr = &global_trace; 527 struct trace_array *tr = &global_trace;
602 int i; 528 int i;
@@ -638,8 +564,26 @@ int register_tracer(struct tracer *type)
638 out: 564 out:
639 tracing_selftest_running = false; 565 tracing_selftest_running = false;
640 mutex_unlock(&trace_types_lock); 566 mutex_unlock(&trace_types_lock);
641 lock_kernel();
642 567
568 if (ret || !default_bootup_tracer)
569 goto out_unlock;
570
571 if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE))
572 goto out_unlock;
573
574 printk(KERN_INFO "Starting tracer '%s'\n", type->name);
575 /* Do we want this tracer to start on bootup? */
576 tracing_set_tracer(type->name);
577 default_bootup_tracer = NULL;
578 /* disable other selftests, since this will break it. */
579 tracing_selftest_disabled = 1;
580#ifdef CONFIG_FTRACE_STARTUP_TEST
581 printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
582 type->name);
583#endif
584
585 out_unlock:
586 lock_kernel();
643 return ret; 587 return ret;
644} 588}
645 589
@@ -658,6 +602,15 @@ void unregister_tracer(struct tracer *type)
658 602
659 found: 603 found:
660 *t = (*t)->next; 604 *t = (*t)->next;
605
606 if (type == current_trace && tracer_enabled) {
607 tracer_enabled = 0;
608 tracing_stop();
609 if (current_trace->stop)
610 current_trace->stop(&global_trace);
611 current_trace = &nop_trace;
612 }
613
661 if (strlen(type->name) != max_tracer_type_len) 614 if (strlen(type->name) != max_tracer_type_len)
662 goto out; 615 goto out;
663 616
@@ -689,19 +642,20 @@ void tracing_reset_online_cpus(struct trace_array *tr)
689} 642}
690 643
691#define SAVED_CMDLINES 128 644#define SAVED_CMDLINES 128
645#define NO_CMDLINE_MAP UINT_MAX
692static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; 646static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
693static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; 647static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
694static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; 648static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
695static int cmdline_idx; 649static int cmdline_idx;
696static DEFINE_SPINLOCK(trace_cmdline_lock); 650static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED;
697 651
698/* temporary disable recording */ 652/* temporary disable recording */
699atomic_t trace_record_cmdline_disabled __read_mostly; 653static atomic_t trace_record_cmdline_disabled __read_mostly;
700 654
701static void trace_init_cmdlines(void) 655static void trace_init_cmdlines(void)
702{ 656{
703 memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline)); 657 memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline));
704 memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid)); 658 memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid));
705 cmdline_idx = 0; 659 cmdline_idx = 0;
706} 660}
707 661
@@ -738,13 +692,12 @@ void tracing_start(void)
738 return; 692 return;
739 693
740 spin_lock_irqsave(&tracing_start_lock, flags); 694 spin_lock_irqsave(&tracing_start_lock, flags);
741 if (--trace_stop_count) 695 if (--trace_stop_count) {
742 goto out; 696 if (trace_stop_count < 0) {
743 697 /* Someone screwed up their debugging */
744 if (trace_stop_count < 0) { 698 WARN_ON_ONCE(1);
745 /* Someone screwed up their debugging */ 699 trace_stop_count = 0;
746 WARN_ON_ONCE(1); 700 }
747 trace_stop_count = 0;
748 goto out; 701 goto out;
749 } 702 }
750 703
@@ -794,8 +747,7 @@ void trace_stop_cmdline_recording(void);
794 747
795static void trace_save_cmdline(struct task_struct *tsk) 748static void trace_save_cmdline(struct task_struct *tsk)
796{ 749{
797 unsigned map; 750 unsigned pid, idx;
798 unsigned idx;
799 751
800 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) 752 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
801 return; 753 return;
@@ -806,17 +758,24 @@ static void trace_save_cmdline(struct task_struct *tsk)
806 * nor do we want to disable interrupts, 758 * nor do we want to disable interrupts,
807 * so if we miss here, then better luck next time. 759 * so if we miss here, then better luck next time.
808 */ 760 */
809 if (!spin_trylock(&trace_cmdline_lock)) 761 if (!__raw_spin_trylock(&trace_cmdline_lock))
810 return; 762 return;
811 763
812 idx = map_pid_to_cmdline[tsk->pid]; 764 idx = map_pid_to_cmdline[tsk->pid];
813 if (idx >= SAVED_CMDLINES) { 765 if (idx == NO_CMDLINE_MAP) {
814 idx = (cmdline_idx + 1) % SAVED_CMDLINES; 766 idx = (cmdline_idx + 1) % SAVED_CMDLINES;
815 767
816 map = map_cmdline_to_pid[idx]; 768 /*
817 if (map <= PID_MAX_DEFAULT) 769 * Check whether the cmdline buffer at idx has a pid
818 map_pid_to_cmdline[map] = (unsigned)-1; 770 * mapped. We are going to overwrite that entry so we
771 * need to clear the map_pid_to_cmdline. Otherwise we
772 * would read the new comm for the old pid.
773 */
774 pid = map_cmdline_to_pid[idx];
775 if (pid != NO_CMDLINE_MAP)
776 map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
819 777
778 map_cmdline_to_pid[idx] = tsk->pid;
820 map_pid_to_cmdline[tsk->pid] = idx; 779 map_pid_to_cmdline[tsk->pid] = idx;
821 780
822 cmdline_idx = idx; 781 cmdline_idx = idx;
@@ -824,33 +783,37 @@ static void trace_save_cmdline(struct task_struct *tsk)
824 783
825 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); 784 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
826 785
827 spin_unlock(&trace_cmdline_lock); 786 __raw_spin_unlock(&trace_cmdline_lock);
828} 787}
829 788
830char *trace_find_cmdline(int pid) 789void trace_find_cmdline(int pid, char comm[])
831{ 790{
832 char *cmdline = "<...>";
833 unsigned map; 791 unsigned map;
834 792
835 if (!pid) 793 if (!pid) {
836 return "<idle>"; 794 strcpy(comm, "<idle>");
795 return;
796 }
837 797
838 if (pid > PID_MAX_DEFAULT) 798 if (pid > PID_MAX_DEFAULT) {
839 goto out; 799 strcpy(comm, "<...>");
800 return;
801 }
840 802
803 __raw_spin_lock(&trace_cmdline_lock);
841 map = map_pid_to_cmdline[pid]; 804 map = map_pid_to_cmdline[pid];
842 if (map >= SAVED_CMDLINES) 805 if (map != NO_CMDLINE_MAP)
843 goto out; 806 strcpy(comm, saved_cmdlines[map]);
844 807 else
845 cmdline = saved_cmdlines[map]; 808 strcpy(comm, "<...>");
846 809
847 out: 810 __raw_spin_unlock(&trace_cmdline_lock);
848 return cmdline;
849} 811}
850 812
851void tracing_record_cmdline(struct task_struct *tsk) 813void tracing_record_cmdline(struct task_struct *tsk)
852{ 814{
853 if (atomic_read(&trace_record_cmdline_disabled)) 815 if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled ||
816 !tracing_is_on())
854 return; 817 return;
855 818
856 trace_save_cmdline(tsk); 819 trace_save_cmdline(tsk);
@@ -864,7 +827,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
864 827
865 entry->preempt_count = pc & 0xff; 828 entry->preempt_count = pc & 0xff;
866 entry->pid = (tsk) ? tsk->pid : 0; 829 entry->pid = (tsk) ? tsk->pid : 0;
867 entry->tgid = (tsk) ? tsk->tgid : 0; 830 entry->tgid = (tsk) ? tsk->tgid : 0;
868 entry->flags = 831 entry->flags =
869#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 832#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
870 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 833 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -876,78 +839,132 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
876 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 839 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
877} 840}
878 841
842struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
843 unsigned char type,
844 unsigned long len,
845 unsigned long flags, int pc)
846{
847 struct ring_buffer_event *event;
848
849 event = ring_buffer_lock_reserve(tr->buffer, len);
850 if (event != NULL) {
851 struct trace_entry *ent = ring_buffer_event_data(event);
852
853 tracing_generic_entry_update(ent, flags, pc);
854 ent->type = type;
855 }
856
857 return event;
858}
859static void ftrace_trace_stack(struct trace_array *tr,
860 unsigned long flags, int skip, int pc);
861static void ftrace_trace_userstack(struct trace_array *tr,
862 unsigned long flags, int pc);
863
864static inline void __trace_buffer_unlock_commit(struct trace_array *tr,
865 struct ring_buffer_event *event,
866 unsigned long flags, int pc,
867 int wake)
868{
869 ring_buffer_unlock_commit(tr->buffer, event);
870
871 ftrace_trace_stack(tr, flags, 6, pc);
872 ftrace_trace_userstack(tr, flags, pc);
873
874 if (wake)
875 trace_wake_up();
876}
877
878void trace_buffer_unlock_commit(struct trace_array *tr,
879 struct ring_buffer_event *event,
880 unsigned long flags, int pc)
881{
882 __trace_buffer_unlock_commit(tr, event, flags, pc, 1);
883}
884
885struct ring_buffer_event *
886trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
887 unsigned long flags, int pc)
888{
889 return trace_buffer_lock_reserve(&global_trace,
890 type, len, flags, pc);
891}
892
893void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
894 unsigned long flags, int pc)
895{
896 return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
897}
898
899void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
900 unsigned long flags, int pc)
901{
902 return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
903}
904
879void 905void
880trace_function(struct trace_array *tr, struct trace_array_cpu *data, 906trace_function(struct trace_array *tr,
881 unsigned long ip, unsigned long parent_ip, unsigned long flags, 907 unsigned long ip, unsigned long parent_ip, unsigned long flags,
882 int pc) 908 int pc)
883{ 909{
884 struct ring_buffer_event *event; 910 struct ring_buffer_event *event;
885 struct ftrace_entry *entry; 911 struct ftrace_entry *entry;
886 unsigned long irq_flags;
887 912
888 /* If we are reading the ring buffer, don't trace */ 913 /* If we are reading the ring buffer, don't trace */
889 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 914 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
890 return; 915 return;
891 916
892 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 917 event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry),
893 &irq_flags); 918 flags, pc);
894 if (!event) 919 if (!event)
895 return; 920 return;
896 entry = ring_buffer_event_data(event); 921 entry = ring_buffer_event_data(event);
897 tracing_generic_entry_update(&entry->ent, flags, pc);
898 entry->ent.type = TRACE_FN;
899 entry->ip = ip; 922 entry->ip = ip;
900 entry->parent_ip = parent_ip; 923 entry->parent_ip = parent_ip;
901 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 924 ring_buffer_unlock_commit(tr->buffer, event);
902} 925}
903 926
904#ifdef CONFIG_FUNCTION_GRAPH_TRACER 927#ifdef CONFIG_FUNCTION_GRAPH_TRACER
905static void __trace_graph_entry(struct trace_array *tr, 928static int __trace_graph_entry(struct trace_array *tr,
906 struct trace_array_cpu *data,
907 struct ftrace_graph_ent *trace, 929 struct ftrace_graph_ent *trace,
908 unsigned long flags, 930 unsigned long flags,
909 int pc) 931 int pc)
910{ 932{
911 struct ring_buffer_event *event; 933 struct ring_buffer_event *event;
912 struct ftrace_graph_ent_entry *entry; 934 struct ftrace_graph_ent_entry *entry;
913 unsigned long irq_flags;
914 935
915 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 936 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
916 return; 937 return 0;
917 938
918 event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), 939 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
919 &irq_flags); 940 sizeof(*entry), flags, pc);
920 if (!event) 941 if (!event)
921 return; 942 return 0;
922 entry = ring_buffer_event_data(event); 943 entry = ring_buffer_event_data(event);
923 tracing_generic_entry_update(&entry->ent, flags, pc);
924 entry->ent.type = TRACE_GRAPH_ENT;
925 entry->graph_ent = *trace; 944 entry->graph_ent = *trace;
926 ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); 945 ring_buffer_unlock_commit(global_trace.buffer, event);
946
947 return 1;
927} 948}
928 949
929static void __trace_graph_return(struct trace_array *tr, 950static void __trace_graph_return(struct trace_array *tr,
930 struct trace_array_cpu *data,
931 struct ftrace_graph_ret *trace, 951 struct ftrace_graph_ret *trace,
932 unsigned long flags, 952 unsigned long flags,
933 int pc) 953 int pc)
934{ 954{
935 struct ring_buffer_event *event; 955 struct ring_buffer_event *event;
936 struct ftrace_graph_ret_entry *entry; 956 struct ftrace_graph_ret_entry *entry;
937 unsigned long irq_flags;
938 957
939 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 958 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
940 return; 959 return;
941 960
942 event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), 961 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET,
943 &irq_flags); 962 sizeof(*entry), flags, pc);
944 if (!event) 963 if (!event)
945 return; 964 return;
946 entry = ring_buffer_event_data(event); 965 entry = ring_buffer_event_data(event);
947 tracing_generic_entry_update(&entry->ent, flags, pc);
948 entry->ent.type = TRACE_GRAPH_RET;
949 entry->ret = *trace; 966 entry->ret = *trace;
950 ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); 967 ring_buffer_unlock_commit(global_trace.buffer, event);
951} 968}
952#endif 969#endif
953 970
@@ -957,31 +974,23 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
957 int pc) 974 int pc)
958{ 975{
959 if (likely(!atomic_read(&data->disabled))) 976 if (likely(!atomic_read(&data->disabled)))
960 trace_function(tr, data, ip, parent_ip, flags, pc); 977 trace_function(tr, ip, parent_ip, flags, pc);
961} 978}
962 979
963static void ftrace_trace_stack(struct trace_array *tr, 980static void __ftrace_trace_stack(struct trace_array *tr,
964 struct trace_array_cpu *data, 981 unsigned long flags,
965 unsigned long flags, 982 int skip, int pc)
966 int skip, int pc)
967{ 983{
968#ifdef CONFIG_STACKTRACE 984#ifdef CONFIG_STACKTRACE
969 struct ring_buffer_event *event; 985 struct ring_buffer_event *event;
970 struct stack_entry *entry; 986 struct stack_entry *entry;
971 struct stack_trace trace; 987 struct stack_trace trace;
972 unsigned long irq_flags;
973 988
974 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 989 event = trace_buffer_lock_reserve(tr, TRACE_STACK,
975 return; 990 sizeof(*entry), flags, pc);
976
977 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
978 &irq_flags);
979 if (!event) 991 if (!event)
980 return; 992 return;
981 entry = ring_buffer_event_data(event); 993 entry = ring_buffer_event_data(event);
982 tracing_generic_entry_update(&entry->ent, flags, pc);
983 entry->ent.type = TRACE_STACK;
984
985 memset(&entry->caller, 0, sizeof(entry->caller)); 994 memset(&entry->caller, 0, sizeof(entry->caller));
986 995
987 trace.nr_entries = 0; 996 trace.nr_entries = 0;
@@ -990,38 +999,43 @@ static void ftrace_trace_stack(struct trace_array *tr,
990 trace.entries = entry->caller; 999 trace.entries = entry->caller;
991 1000
992 save_stack_trace(&trace); 1001 save_stack_trace(&trace);
993 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1002 ring_buffer_unlock_commit(tr->buffer, event);
994#endif 1003#endif
995} 1004}
996 1005
1006static void ftrace_trace_stack(struct trace_array *tr,
1007 unsigned long flags,
1008 int skip, int pc)
1009{
1010 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1011 return;
1012
1013 __ftrace_trace_stack(tr, flags, skip, pc);
1014}
1015
997void __trace_stack(struct trace_array *tr, 1016void __trace_stack(struct trace_array *tr,
998 struct trace_array_cpu *data,
999 unsigned long flags, 1017 unsigned long flags,
1000 int skip) 1018 int skip, int pc)
1001{ 1019{
1002 ftrace_trace_stack(tr, data, flags, skip, preempt_count()); 1020 __ftrace_trace_stack(tr, flags, skip, pc);
1003} 1021}
1004 1022
1005static void ftrace_trace_userstack(struct trace_array *tr, 1023static void ftrace_trace_userstack(struct trace_array *tr,
1006 struct trace_array_cpu *data, 1024 unsigned long flags, int pc)
1007 unsigned long flags, int pc)
1008{ 1025{
1009#ifdef CONFIG_STACKTRACE 1026#ifdef CONFIG_STACKTRACE
1010 struct ring_buffer_event *event; 1027 struct ring_buffer_event *event;
1011 struct userstack_entry *entry; 1028 struct userstack_entry *entry;
1012 struct stack_trace trace; 1029 struct stack_trace trace;
1013 unsigned long irq_flags;
1014 1030
1015 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1031 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1016 return; 1032 return;
1017 1033
1018 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 1034 event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK,
1019 &irq_flags); 1035 sizeof(*entry), flags, pc);
1020 if (!event) 1036 if (!event)
1021 return; 1037 return;
1022 entry = ring_buffer_event_data(event); 1038 entry = ring_buffer_event_data(event);
1023 tracing_generic_entry_update(&entry->ent, flags, pc);
1024 entry->ent.type = TRACE_USER_STACK;
1025 1039
1026 memset(&entry->caller, 0, sizeof(entry->caller)); 1040 memset(&entry->caller, 0, sizeof(entry->caller));
1027 1041
@@ -1031,70 +1045,58 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1031 trace.entries = entry->caller; 1045 trace.entries = entry->caller;
1032 1046
1033 save_stack_trace_user(&trace); 1047 save_stack_trace_user(&trace);
1034 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1048 ring_buffer_unlock_commit(tr->buffer, event);
1035#endif 1049#endif
1036} 1050}
1037 1051
1038void __trace_userstack(struct trace_array *tr, 1052#ifdef UNUSED
1039 struct trace_array_cpu *data, 1053static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1040 unsigned long flags)
1041{ 1054{
1042 ftrace_trace_userstack(tr, data, flags, preempt_count()); 1055 ftrace_trace_userstack(tr, flags, preempt_count());
1043} 1056}
1057#endif /* UNUSED */
1044 1058
1045static void 1059static void
1046ftrace_trace_special(void *__tr, void *__data, 1060ftrace_trace_special(void *__tr,
1047 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1061 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1048 int pc) 1062 int pc)
1049{ 1063{
1050 struct ring_buffer_event *event; 1064 struct ring_buffer_event *event;
1051 struct trace_array_cpu *data = __data;
1052 struct trace_array *tr = __tr; 1065 struct trace_array *tr = __tr;
1053 struct special_entry *entry; 1066 struct special_entry *entry;
1054 unsigned long irq_flags;
1055 1067
1056 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 1068 event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL,
1057 &irq_flags); 1069 sizeof(*entry), 0, pc);
1058 if (!event) 1070 if (!event)
1059 return; 1071 return;
1060 entry = ring_buffer_event_data(event); 1072 entry = ring_buffer_event_data(event);
1061 tracing_generic_entry_update(&entry->ent, 0, pc);
1062 entry->ent.type = TRACE_SPECIAL;
1063 entry->arg1 = arg1; 1073 entry->arg1 = arg1;
1064 entry->arg2 = arg2; 1074 entry->arg2 = arg2;
1065 entry->arg3 = arg3; 1075 entry->arg3 = arg3;
1066 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1076 trace_buffer_unlock_commit(tr, event, 0, pc);
1067 ftrace_trace_stack(tr, data, irq_flags, 4, pc);
1068 ftrace_trace_userstack(tr, data, irq_flags, pc);
1069
1070 trace_wake_up();
1071} 1077}
1072 1078
1073void 1079void
1074__trace_special(void *__tr, void *__data, 1080__trace_special(void *__tr, void *__data,
1075 unsigned long arg1, unsigned long arg2, unsigned long arg3) 1081 unsigned long arg1, unsigned long arg2, unsigned long arg3)
1076{ 1082{
1077 ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count()); 1083 ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
1078} 1084}
1079 1085
1080void 1086void
1081tracing_sched_switch_trace(struct trace_array *tr, 1087tracing_sched_switch_trace(struct trace_array *tr,
1082 struct trace_array_cpu *data,
1083 struct task_struct *prev, 1088 struct task_struct *prev,
1084 struct task_struct *next, 1089 struct task_struct *next,
1085 unsigned long flags, int pc) 1090 unsigned long flags, int pc)
1086{ 1091{
1087 struct ring_buffer_event *event; 1092 struct ring_buffer_event *event;
1088 struct ctx_switch_entry *entry; 1093 struct ctx_switch_entry *entry;
1089 unsigned long irq_flags;
1090 1094
1091 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 1095 event = trace_buffer_lock_reserve(tr, TRACE_CTX,
1092 &irq_flags); 1096 sizeof(*entry), flags, pc);
1093 if (!event) 1097 if (!event)
1094 return; 1098 return;
1095 entry = ring_buffer_event_data(event); 1099 entry = ring_buffer_event_data(event);
1096 tracing_generic_entry_update(&entry->ent, flags, pc);
1097 entry->ent.type = TRACE_CTX;
1098 entry->prev_pid = prev->pid; 1100 entry->prev_pid = prev->pid;
1099 entry->prev_prio = prev->prio; 1101 entry->prev_prio = prev->prio;
1100 entry->prev_state = prev->state; 1102 entry->prev_state = prev->state;
@@ -1102,29 +1104,23 @@ tracing_sched_switch_trace(struct trace_array *tr,
1102 entry->next_prio = next->prio; 1104 entry->next_prio = next->prio;
1103 entry->next_state = next->state; 1105 entry->next_state = next->state;
1104 entry->next_cpu = task_cpu(next); 1106 entry->next_cpu = task_cpu(next);
1105 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1107 trace_buffer_unlock_commit(tr, event, flags, pc);
1106 ftrace_trace_stack(tr, data, flags, 5, pc);
1107 ftrace_trace_userstack(tr, data, flags, pc);
1108} 1108}
1109 1109
1110void 1110void
1111tracing_sched_wakeup_trace(struct trace_array *tr, 1111tracing_sched_wakeup_trace(struct trace_array *tr,
1112 struct trace_array_cpu *data,
1113 struct task_struct *wakee, 1112 struct task_struct *wakee,
1114 struct task_struct *curr, 1113 struct task_struct *curr,
1115 unsigned long flags, int pc) 1114 unsigned long flags, int pc)
1116{ 1115{
1117 struct ring_buffer_event *event; 1116 struct ring_buffer_event *event;
1118 struct ctx_switch_entry *entry; 1117 struct ctx_switch_entry *entry;
1119 unsigned long irq_flags;
1120 1118
1121 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 1119 event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
1122 &irq_flags); 1120 sizeof(*entry), flags, pc);
1123 if (!event) 1121 if (!event)
1124 return; 1122 return;
1125 entry = ring_buffer_event_data(event); 1123 entry = ring_buffer_event_data(event);
1126 tracing_generic_entry_update(&entry->ent, flags, pc);
1127 entry->ent.type = TRACE_WAKE;
1128 entry->prev_pid = curr->pid; 1124 entry->prev_pid = curr->pid;
1129 entry->prev_prio = curr->prio; 1125 entry->prev_prio = curr->prio;
1130 entry->prev_state = curr->state; 1126 entry->prev_state = curr->state;
@@ -1132,11 +1128,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
1132 entry->next_prio = wakee->prio; 1128 entry->next_prio = wakee->prio;
1133 entry->next_state = wakee->state; 1129 entry->next_state = wakee->state;
1134 entry->next_cpu = task_cpu(wakee); 1130 entry->next_cpu = task_cpu(wakee);
1135 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
1136 ftrace_trace_stack(tr, data, flags, 6, pc);
1137 ftrace_trace_userstack(tr, data, flags, pc);
1138 1131
1139 trace_wake_up(); 1132 ring_buffer_unlock_commit(tr->buffer, event);
1133 ftrace_trace_stack(tr, flags, 6, pc);
1134 ftrace_trace_userstack(tr, flags, pc);
1140} 1135}
1141 1136
1142void 1137void
@@ -1157,66 +1152,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1157 data = tr->data[cpu]; 1152 data = tr->data[cpu];
1158 1153
1159 if (likely(atomic_inc_return(&data->disabled) == 1)) 1154 if (likely(atomic_inc_return(&data->disabled) == 1))
1160 ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); 1155 ftrace_trace_special(tr, arg1, arg2, arg3, pc);
1161
1162 atomic_dec(&data->disabled);
1163 local_irq_restore(flags);
1164}
1165
1166#ifdef CONFIG_FUNCTION_TRACER
1167static void
1168function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
1169{
1170 struct trace_array *tr = &global_trace;
1171 struct trace_array_cpu *data;
1172 unsigned long flags;
1173 long disabled;
1174 int cpu, resched;
1175 int pc;
1176
1177 if (unlikely(!ftrace_function_enabled))
1178 return;
1179
1180 pc = preempt_count();
1181 resched = ftrace_preempt_disable();
1182 local_save_flags(flags);
1183 cpu = raw_smp_processor_id();
1184 data = tr->data[cpu];
1185 disabled = atomic_inc_return(&data->disabled);
1186
1187 if (likely(disabled == 1))
1188 trace_function(tr, data, ip, parent_ip, flags, pc);
1189
1190 atomic_dec(&data->disabled);
1191 ftrace_preempt_enable(resched);
1192}
1193
1194static void
1195function_trace_call(unsigned long ip, unsigned long parent_ip)
1196{
1197 struct trace_array *tr = &global_trace;
1198 struct trace_array_cpu *data;
1199 unsigned long flags;
1200 long disabled;
1201 int cpu;
1202 int pc;
1203
1204 if (unlikely(!ftrace_function_enabled))
1205 return;
1206
1207 /*
1208 * Need to use raw, since this must be called before the
1209 * recursive protection is performed.
1210 */
1211 local_irq_save(flags);
1212 cpu = raw_smp_processor_id();
1213 data = tr->data[cpu];
1214 disabled = atomic_inc_return(&data->disabled);
1215
1216 if (likely(disabled == 1)) {
1217 pc = preempt_count();
1218 trace_function(tr, data, ip, parent_ip, flags, pc);
1219 }
1220 1156
1221 atomic_dec(&data->disabled); 1157 atomic_dec(&data->disabled);
1222 local_irq_restore(flags); 1158 local_irq_restore(flags);
@@ -1229,6 +1165,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
1229 struct trace_array_cpu *data; 1165 struct trace_array_cpu *data;
1230 unsigned long flags; 1166 unsigned long flags;
1231 long disabled; 1167 long disabled;
1168 int ret;
1232 int cpu; 1169 int cpu;
1233 int pc; 1170 int pc;
1234 1171
@@ -1244,15 +1181,18 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
1244 disabled = atomic_inc_return(&data->disabled); 1181 disabled = atomic_inc_return(&data->disabled);
1245 if (likely(disabled == 1)) { 1182 if (likely(disabled == 1)) {
1246 pc = preempt_count(); 1183 pc = preempt_count();
1247 __trace_graph_entry(tr, data, trace, flags, pc); 1184 ret = __trace_graph_entry(tr, trace, flags, pc);
1185 } else {
1186 ret = 0;
1248 } 1187 }
1249 /* Only do the atomic if it is not already set */ 1188 /* Only do the atomic if it is not already set */
1250 if (!test_tsk_trace_graph(current)) 1189 if (!test_tsk_trace_graph(current))
1251 set_tsk_trace_graph(current); 1190 set_tsk_trace_graph(current);
1191
1252 atomic_dec(&data->disabled); 1192 atomic_dec(&data->disabled);
1253 local_irq_restore(flags); 1193 local_irq_restore(flags);
1254 1194
1255 return 1; 1195 return ret;
1256} 1196}
1257 1197
1258void trace_graph_return(struct ftrace_graph_ret *trace) 1198void trace_graph_return(struct ftrace_graph_ret *trace)
@@ -1270,7 +1210,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
1270 disabled = atomic_inc_return(&data->disabled); 1210 disabled = atomic_inc_return(&data->disabled);
1271 if (likely(disabled == 1)) { 1211 if (likely(disabled == 1)) {
1272 pc = preempt_count(); 1212 pc = preempt_count();
1273 __trace_graph_return(tr, data, trace, flags, pc); 1213 __trace_graph_return(tr, trace, flags, pc);
1274 } 1214 }
1275 if (!trace->depth) 1215 if (!trace->depth)
1276 clear_tsk_trace_graph(current); 1216 clear_tsk_trace_graph(current);
@@ -1279,30 +1219,122 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
1279} 1219}
1280#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 1220#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1281 1221
1282static struct ftrace_ops trace_ops __read_mostly =
1283{
1284 .func = function_trace_call,
1285};
1286 1222
1287void tracing_start_function_trace(void) 1223/**
1224 * trace_vbprintk - write binary msg to tracing buffer
1225 *
1226 */
1227int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1288{ 1228{
1289 ftrace_function_enabled = 0; 1229 static raw_spinlock_t trace_buf_lock =
1230 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
1231 static u32 trace_buf[TRACE_BUF_SIZE];
1290 1232
1291 if (trace_flags & TRACE_ITER_PREEMPTONLY) 1233 struct ring_buffer_event *event;
1292 trace_ops.func = function_trace_call_preempt_only; 1234 struct trace_array *tr = &global_trace;
1293 else 1235 struct trace_array_cpu *data;
1294 trace_ops.func = function_trace_call; 1236 struct bprint_entry *entry;
1237 unsigned long flags;
1238 int resched;
1239 int cpu, len = 0, size, pc;
1240
1241 if (unlikely(tracing_selftest_running || tracing_disabled))
1242 return 0;
1243
1244 /* Don't pollute graph traces with trace_vprintk internals */
1245 pause_graph_tracing();
1246
1247 pc = preempt_count();
1248 resched = ftrace_preempt_disable();
1249 cpu = raw_smp_processor_id();
1250 data = tr->data[cpu];
1251
1252 if (unlikely(atomic_read(&data->disabled)))
1253 goto out;
1254
1255 /* Lockdep uses trace_printk for lock tracing */
1256 local_irq_save(flags);
1257 __raw_spin_lock(&trace_buf_lock);
1258 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1259
1260 if (len > TRACE_BUF_SIZE || len < 0)
1261 goto out_unlock;
1262
1263 size = sizeof(*entry) + sizeof(u32) * len;
1264 event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc);
1265 if (!event)
1266 goto out_unlock;
1267 entry = ring_buffer_event_data(event);
1268 entry->ip = ip;
1269 entry->fmt = fmt;
1270
1271 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1272 ring_buffer_unlock_commit(tr->buffer, event);
1273
1274out_unlock:
1275 __raw_spin_unlock(&trace_buf_lock);
1276 local_irq_restore(flags);
1277
1278out:
1279 ftrace_preempt_enable(resched);
1280 unpause_graph_tracing();
1295 1281
1296 register_ftrace_function(&trace_ops); 1282 return len;
1297 ftrace_function_enabled = 1;
1298} 1283}
1284EXPORT_SYMBOL_GPL(trace_vbprintk);
1299 1285
1300void tracing_stop_function_trace(void) 1286int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1301{ 1287{
1302 ftrace_function_enabled = 0; 1288 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
1303 unregister_ftrace_function(&trace_ops); 1289 static char trace_buf[TRACE_BUF_SIZE];
1290
1291 struct ring_buffer_event *event;
1292 struct trace_array *tr = &global_trace;
1293 struct trace_array_cpu *data;
1294 int cpu, len = 0, size, pc;
1295 struct print_entry *entry;
1296 unsigned long irq_flags;
1297
1298 if (tracing_disabled || tracing_selftest_running)
1299 return 0;
1300
1301 pc = preempt_count();
1302 preempt_disable_notrace();
1303 cpu = raw_smp_processor_id();
1304 data = tr->data[cpu];
1305
1306 if (unlikely(atomic_read(&data->disabled)))
1307 goto out;
1308
1309 pause_graph_tracing();
1310 raw_local_irq_save(irq_flags);
1311 __raw_spin_lock(&trace_buf_lock);
1312 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1313
1314 len = min(len, TRACE_BUF_SIZE-1);
1315 trace_buf[len] = 0;
1316
1317 size = sizeof(*entry) + len + 1;
1318 event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc);
1319 if (!event)
1320 goto out_unlock;
1321 entry = ring_buffer_event_data(event);
1322 entry->ip = ip;
1323
1324 memcpy(&entry->buf, trace_buf, len);
1325 entry->buf[len] = 0;
1326 ring_buffer_unlock_commit(tr->buffer, event);
1327
1328 out_unlock:
1329 __raw_spin_unlock(&trace_buf_lock);
1330 raw_local_irq_restore(irq_flags);
1331 unpause_graph_tracing();
1332 out:
1333 preempt_enable_notrace();
1334
1335 return len;
1304} 1336}
1305#endif 1337EXPORT_SYMBOL_GPL(trace_vprintk);
1306 1338
1307enum trace_file_type { 1339enum trace_file_type {
1308 TRACE_FILE_LAT_FMT = 1, 1340 TRACE_FILE_LAT_FMT = 1,
@@ -1345,10 +1377,25 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1345{ 1377{
1346 struct ring_buffer *buffer = iter->tr->buffer; 1378 struct ring_buffer *buffer = iter->tr->buffer;
1347 struct trace_entry *ent, *next = NULL; 1379 struct trace_entry *ent, *next = NULL;
1380 int cpu_file = iter->cpu_file;
1348 u64 next_ts = 0, ts; 1381 u64 next_ts = 0, ts;
1349 int next_cpu = -1; 1382 int next_cpu = -1;
1350 int cpu; 1383 int cpu;
1351 1384
1385 /*
1386 * If we are in a per_cpu trace file, don't bother by iterating over
1387 * all cpu and peek directly.
1388 */
1389 if (cpu_file > TRACE_PIPE_ALL_CPU) {
1390 if (ring_buffer_empty_cpu(buffer, cpu_file))
1391 return NULL;
1392 ent = peek_next_entry(iter, cpu_file, ent_ts);
1393 if (ent_cpu)
1394 *ent_cpu = cpu_file;
1395
1396 return ent;
1397 }
1398
1352 for_each_tracing_cpu(cpu) { 1399 for_each_tracing_cpu(cpu) {
1353 1400
1354 if (ring_buffer_empty_cpu(buffer, cpu)) 1401 if (ring_buffer_empty_cpu(buffer, cpu))
@@ -1376,8 +1423,8 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1376} 1423}
1377 1424
1378/* Find the next real entry, without updating the iterator itself */ 1425/* Find the next real entry, without updating the iterator itself */
1379static struct trace_entry * 1426struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1380find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) 1427 int *ent_cpu, u64 *ent_ts)
1381{ 1428{
1382 return __find_next_entry(iter, ent_cpu, ent_ts); 1429 return __find_next_entry(iter, ent_cpu, ent_ts);
1383} 1430}
@@ -1426,19 +1473,32 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1426 return ent; 1473 return ent;
1427} 1474}
1428 1475
1476/*
1477 * No necessary locking here. The worst thing which can
1478 * happen is loosing events consumed at the same time
1479 * by a trace_pipe reader.
1480 * Other than that, we don't risk to crash the ring buffer
1481 * because it serializes the readers.
1482 *
1483 * The current tracer is copied to avoid a global locking
1484 * all around.
1485 */
1429static void *s_start(struct seq_file *m, loff_t *pos) 1486static void *s_start(struct seq_file *m, loff_t *pos)
1430{ 1487{
1431 struct trace_iterator *iter = m->private; 1488 struct trace_iterator *iter = m->private;
1489 static struct tracer *old_tracer;
1490 int cpu_file = iter->cpu_file;
1432 void *p = NULL; 1491 void *p = NULL;
1433 loff_t l = 0; 1492 loff_t l = 0;
1434 int cpu; 1493 int cpu;
1435 1494
1495 /* copy the tracer to avoid using a global lock all around */
1436 mutex_lock(&trace_types_lock); 1496 mutex_lock(&trace_types_lock);
1437 1497 if (unlikely(old_tracer != current_trace && current_trace)) {
1438 if (!current_trace || current_trace != iter->trace) { 1498 old_tracer = current_trace;
1439 mutex_unlock(&trace_types_lock); 1499 *iter->trace = *current_trace;
1440 return NULL;
1441 } 1500 }
1501 mutex_unlock(&trace_types_lock);
1442 1502
1443 atomic_inc(&trace_record_cmdline_disabled); 1503 atomic_inc(&trace_record_cmdline_disabled);
1444 1504
@@ -1449,9 +1509,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1449 1509
1450 ftrace_disable_cpu(); 1510 ftrace_disable_cpu();
1451 1511
1452 for_each_tracing_cpu(cpu) { 1512 if (cpu_file == TRACE_PIPE_ALL_CPU) {
1453 ring_buffer_iter_reset(iter->buffer_iter[cpu]); 1513 for_each_tracing_cpu(cpu)
1454 } 1514 ring_buffer_iter_reset(iter->buffer_iter[cpu]);
1515 } else
1516 ring_buffer_iter_reset(iter->buffer_iter[cpu_file]);
1517
1455 1518
1456 ftrace_enable_cpu(); 1519 ftrace_enable_cpu();
1457 1520
@@ -1469,155 +1532,6 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1469static void s_stop(struct seq_file *m, void *p) 1532static void s_stop(struct seq_file *m, void *p)
1470{ 1533{
1471 atomic_dec(&trace_record_cmdline_disabled); 1534 atomic_dec(&trace_record_cmdline_disabled);
1472 mutex_unlock(&trace_types_lock);
1473}
1474
1475#ifdef CONFIG_KRETPROBES
1476static inline const char *kretprobed(const char *name)
1477{
1478 static const char tramp_name[] = "kretprobe_trampoline";
1479 int size = sizeof(tramp_name);
1480
1481 if (strncmp(tramp_name, name, size) == 0)
1482 return "[unknown/kretprobe'd]";
1483 return name;
1484}
1485#else
1486static inline const char *kretprobed(const char *name)
1487{
1488 return name;
1489}
1490#endif /* CONFIG_KRETPROBES */
1491
1492static int
1493seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
1494{
1495#ifdef CONFIG_KALLSYMS
1496 char str[KSYM_SYMBOL_LEN];
1497 const char *name;
1498
1499 kallsyms_lookup(address, NULL, NULL, NULL, str);
1500
1501 name = kretprobed(str);
1502
1503 return trace_seq_printf(s, fmt, name);
1504#endif
1505 return 1;
1506}
1507
1508static int
1509seq_print_sym_offset(struct trace_seq *s, const char *fmt,
1510 unsigned long address)
1511{
1512#ifdef CONFIG_KALLSYMS
1513 char str[KSYM_SYMBOL_LEN];
1514 const char *name;
1515
1516 sprint_symbol(str, address);
1517 name = kretprobed(str);
1518
1519 return trace_seq_printf(s, fmt, name);
1520#endif
1521 return 1;
1522}
1523
1524#ifndef CONFIG_64BIT
1525# define IP_FMT "%08lx"
1526#else
1527# define IP_FMT "%016lx"
1528#endif
1529
1530int
1531seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1532{
1533 int ret;
1534
1535 if (!ip)
1536 return trace_seq_printf(s, "0");
1537
1538 if (sym_flags & TRACE_ITER_SYM_OFFSET)
1539 ret = seq_print_sym_offset(s, "%s", ip);
1540 else
1541 ret = seq_print_sym_short(s, "%s", ip);
1542
1543 if (!ret)
1544 return 0;
1545
1546 if (sym_flags & TRACE_ITER_SYM_ADDR)
1547 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
1548 return ret;
1549}
1550
1551static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
1552 unsigned long ip, unsigned long sym_flags)
1553{
1554 struct file *file = NULL;
1555 unsigned long vmstart = 0;
1556 int ret = 1;
1557
1558 if (mm) {
1559 const struct vm_area_struct *vma;
1560
1561 down_read(&mm->mmap_sem);
1562 vma = find_vma(mm, ip);
1563 if (vma) {
1564 file = vma->vm_file;
1565 vmstart = vma->vm_start;
1566 }
1567 if (file) {
1568 ret = trace_seq_path(s, &file->f_path);
1569 if (ret)
1570 ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
1571 }
1572 up_read(&mm->mmap_sem);
1573 }
1574 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
1575 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
1576 return ret;
1577}
1578
1579static int
1580seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
1581 unsigned long sym_flags)
1582{
1583 struct mm_struct *mm = NULL;
1584 int ret = 1;
1585 unsigned int i;
1586
1587 if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
1588 struct task_struct *task;
1589 /*
1590 * we do the lookup on the thread group leader,
1591 * since individual threads might have already quit!
1592 */
1593 rcu_read_lock();
1594 task = find_task_by_vpid(entry->ent.tgid);
1595 if (task)
1596 mm = get_task_mm(task);
1597 rcu_read_unlock();
1598 }
1599
1600 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1601 unsigned long ip = entry->caller[i];
1602
1603 if (ip == ULONG_MAX || !ret)
1604 break;
1605 if (i && ret)
1606 ret = trace_seq_puts(s, " <- ");
1607 if (!ip) {
1608 if (ret)
1609 ret = trace_seq_puts(s, "??");
1610 continue;
1611 }
1612 if (!ret)
1613 break;
1614 if (ret)
1615 ret = seq_print_user_ip(s, mm, ip, sym_flags);
1616 }
1617
1618 if (mm)
1619 mmput(mm);
1620 return ret;
1621} 1535}
1622 1536
1623static void print_lat_help_header(struct seq_file *m) 1537static void print_lat_help_header(struct seq_file *m)
@@ -1658,11 +1572,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1658 total = entries + 1572 total = entries +
1659 ring_buffer_overruns(iter->tr->buffer); 1573 ring_buffer_overruns(iter->tr->buffer);
1660 1574
1661 seq_printf(m, "%s latency trace v1.1.5 on %s\n", 1575 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
1662 name, UTS_RELEASE); 1576 name, UTS_RELEASE);
1663 seq_puts(m, "-----------------------------------" 1577 seq_puts(m, "# -----------------------------------"
1664 "---------------------------------\n"); 1578 "---------------------------------\n");
1665 seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |" 1579 seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |"
1666 " (M:%s VP:%d, KP:%d, SP:%d HP:%d", 1580 " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
1667 nsecs_to_usecs(data->saved_latency), 1581 nsecs_to_usecs(data->saved_latency),
1668 entries, 1582 entries,
@@ -1684,121 +1598,24 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1684#else 1598#else
1685 seq_puts(m, ")\n"); 1599 seq_puts(m, ")\n");
1686#endif 1600#endif
1687 seq_puts(m, " -----------------\n"); 1601 seq_puts(m, "# -----------------\n");
1688 seq_printf(m, " | task: %.16s-%d " 1602 seq_printf(m, "# | task: %.16s-%d "
1689 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", 1603 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
1690 data->comm, data->pid, data->uid, data->nice, 1604 data->comm, data->pid, data->uid, data->nice,
1691 data->policy, data->rt_priority); 1605 data->policy, data->rt_priority);
1692 seq_puts(m, " -----------------\n"); 1606 seq_puts(m, "# -----------------\n");
1693 1607
1694 if (data->critical_start) { 1608 if (data->critical_start) {
1695 seq_puts(m, " => started at: "); 1609 seq_puts(m, "# => started at: ");
1696 seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); 1610 seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
1697 trace_print_seq(m, &iter->seq); 1611 trace_print_seq(m, &iter->seq);
1698 seq_puts(m, "\n => ended at: "); 1612 seq_puts(m, "\n# => ended at: ");
1699 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); 1613 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
1700 trace_print_seq(m, &iter->seq); 1614 trace_print_seq(m, &iter->seq);
1701 seq_puts(m, "\n"); 1615 seq_puts(m, "#\n");
1702 }
1703
1704 seq_puts(m, "\n");
1705}
1706
1707static void
1708lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
1709{
1710 int hardirq, softirq;
1711 char *comm;
1712
1713 comm = trace_find_cmdline(entry->pid);
1714
1715 trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
1716 trace_seq_printf(s, "%3d", cpu);
1717 trace_seq_printf(s, "%c%c",
1718 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
1719 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.',
1720 ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
1721
1722 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
1723 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
1724 if (hardirq && softirq) {
1725 trace_seq_putc(s, 'H');
1726 } else {
1727 if (hardirq) {
1728 trace_seq_putc(s, 'h');
1729 } else {
1730 if (softirq)
1731 trace_seq_putc(s, 's');
1732 else
1733 trace_seq_putc(s, '.');
1734 }
1735 }
1736
1737 if (entry->preempt_count)
1738 trace_seq_printf(s, "%x", entry->preempt_count);
1739 else
1740 trace_seq_puts(s, ".");
1741}
1742
1743unsigned long preempt_mark_thresh = 100;
1744
1745static void
1746lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
1747 unsigned long rel_usecs)
1748{
1749 trace_seq_printf(s, " %4lldus", abs_usecs);
1750 if (rel_usecs > preempt_mark_thresh)
1751 trace_seq_puts(s, "!: ");
1752 else if (rel_usecs > 1)
1753 trace_seq_puts(s, "+: ");
1754 else
1755 trace_seq_puts(s, " : ");
1756}
1757
1758static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
1759
1760static int task_state_char(unsigned long state)
1761{
1762 int bit = state ? __ffs(state) + 1 : 0;
1763
1764 return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
1765}
1766
1767/*
1768 * The message is supposed to contain an ending newline.
1769 * If the printing stops prematurely, try to add a newline of our own.
1770 */
1771void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
1772{
1773 struct trace_entry *ent;
1774 struct trace_field_cont *cont;
1775 bool ok = true;
1776
1777 ent = peek_next_entry(iter, iter->cpu, NULL);
1778 if (!ent || ent->type != TRACE_CONT) {
1779 trace_seq_putc(s, '\n');
1780 return;
1781 } 1616 }
1782 1617
1783 do { 1618 seq_puts(m, "#\n");
1784 cont = (struct trace_field_cont *)ent;
1785 if (ok)
1786 ok = (trace_seq_printf(s, "%s", cont->buf) > 0);
1787
1788 ftrace_disable_cpu();
1789
1790 if (iter->buffer_iter[iter->cpu])
1791 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
1792 else
1793 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
1794
1795 ftrace_enable_cpu();
1796
1797 ent = peek_next_entry(iter, iter->cpu, NULL);
1798 } while (ent && ent->type == TRACE_CONT);
1799
1800 if (!ok)
1801 trace_seq_putc(s, '\n');
1802} 1619}
1803 1620
1804static void test_cpu_buff_start(struct trace_iterator *iter) 1621static void test_cpu_buff_start(struct trace_iterator *iter)
@@ -1815,141 +1632,11 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
1815 return; 1632 return;
1816 1633
1817 cpumask_set_cpu(iter->cpu, iter->started); 1634 cpumask_set_cpu(iter->cpu, iter->started);
1818 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
1819}
1820
1821static enum print_line_t
1822print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1823{
1824 struct trace_seq *s = &iter->seq;
1825 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1826 struct trace_entry *next_entry;
1827 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
1828 struct trace_entry *entry = iter->ent;
1829 unsigned long abs_usecs;
1830 unsigned long rel_usecs;
1831 u64 next_ts;
1832 char *comm;
1833 int S, T;
1834 int i;
1835
1836 if (entry->type == TRACE_CONT)
1837 return TRACE_TYPE_HANDLED;
1838
1839 test_cpu_buff_start(iter);
1840
1841 next_entry = find_next_entry(iter, NULL, &next_ts);
1842 if (!next_entry)
1843 next_ts = iter->ts;
1844 rel_usecs = ns2usecs(next_ts - iter->ts);
1845 abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
1846
1847 if (verbose) {
1848 comm = trace_find_cmdline(entry->pid);
1849 trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]"
1850 " %ld.%03ldms (+%ld.%03ldms): ",
1851 comm,
1852 entry->pid, cpu, entry->flags,
1853 entry->preempt_count, trace_idx,
1854 ns2usecs(iter->ts),
1855 abs_usecs/1000,
1856 abs_usecs % 1000, rel_usecs/1000,
1857 rel_usecs % 1000);
1858 } else {
1859 lat_print_generic(s, entry, cpu);
1860 lat_print_timestamp(s, abs_usecs, rel_usecs);
1861 }
1862 switch (entry->type) {
1863 case TRACE_FN: {
1864 struct ftrace_entry *field;
1865
1866 trace_assign_type(field, entry);
1867
1868 seq_print_ip_sym(s, field->ip, sym_flags);
1869 trace_seq_puts(s, " (");
1870 seq_print_ip_sym(s, field->parent_ip, sym_flags);
1871 trace_seq_puts(s, ")\n");
1872 break;
1873 }
1874 case TRACE_CTX:
1875 case TRACE_WAKE: {
1876 struct ctx_switch_entry *field;
1877
1878 trace_assign_type(field, entry);
1879
1880 T = task_state_char(field->next_state);
1881 S = task_state_char(field->prev_state);
1882 comm = trace_find_cmdline(field->next_pid);
1883 trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
1884 field->prev_pid,
1885 field->prev_prio,
1886 S, entry->type == TRACE_CTX ? "==>" : " +",
1887 field->next_cpu,
1888 field->next_pid,
1889 field->next_prio,
1890 T, comm);
1891 break;
1892 }
1893 case TRACE_SPECIAL: {
1894 struct special_entry *field;
1895
1896 trace_assign_type(field, entry);
1897
1898 trace_seq_printf(s, "# %ld %ld %ld\n",
1899 field->arg1,
1900 field->arg2,
1901 field->arg3);
1902 break;
1903 }
1904 case TRACE_STACK: {
1905 struct stack_entry *field;
1906
1907 trace_assign_type(field, entry);
1908
1909 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1910 if (i)
1911 trace_seq_puts(s, " <= ");
1912 seq_print_ip_sym(s, field->caller[i], sym_flags);
1913 }
1914 trace_seq_puts(s, "\n");
1915 break;
1916 }
1917 case TRACE_PRINT: {
1918 struct print_entry *field;
1919 1635
1920 trace_assign_type(field, entry); 1636 /* Don't print started cpu buffer for the first entry of the trace */
1921 1637 if (iter->idx > 1)
1922 seq_print_ip_sym(s, field->ip, sym_flags); 1638 trace_seq_printf(s, "##### CPU %u buffer started ####\n",
1923 trace_seq_printf(s, ": %s", field->buf); 1639 iter->cpu);
1924 if (entry->flags & TRACE_FLAG_CONT)
1925 trace_seq_print_cont(s, iter);
1926 break;
1927 }
1928 case TRACE_BRANCH: {
1929 struct trace_branch *field;
1930
1931 trace_assign_type(field, entry);
1932
1933 trace_seq_printf(s, "[%s] %s:%s:%d\n",
1934 field->correct ? " ok " : " MISS ",
1935 field->func,
1936 field->file,
1937 field->line);
1938 break;
1939 }
1940 case TRACE_USER_STACK: {
1941 struct userstack_entry *field;
1942
1943 trace_assign_type(field, entry);
1944
1945 seq_print_userip_objs(field, s, sym_flags);
1946 trace_seq_putc(s, '\n');
1947 break;
1948 }
1949 default:
1950 trace_seq_printf(s, "Unknown type %d\n", entry->type);
1951 }
1952 return TRACE_TYPE_HANDLED;
1953} 1640}
1954 1641
1955static enum print_line_t print_trace_fmt(struct trace_iterator *iter) 1642static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
@@ -1957,333 +1644,84 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1957 struct trace_seq *s = &iter->seq; 1644 struct trace_seq *s = &iter->seq;
1958 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1645 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1959 struct trace_entry *entry; 1646 struct trace_entry *entry;
1960 unsigned long usec_rem; 1647 struct trace_event *event;
1961 unsigned long long t;
1962 unsigned long secs;
1963 char *comm;
1964 int ret;
1965 int S, T;
1966 int i;
1967 1648
1968 entry = iter->ent; 1649 entry = iter->ent;
1969 1650
1970 if (entry->type == TRACE_CONT)
1971 return TRACE_TYPE_HANDLED;
1972
1973 test_cpu_buff_start(iter); 1651 test_cpu_buff_start(iter);
1974 1652
1975 comm = trace_find_cmdline(iter->ent->pid); 1653 event = ftrace_find_event(entry->type);
1976
1977 t = ns2usecs(iter->ts);
1978 usec_rem = do_div(t, 1000000ULL);
1979 secs = (unsigned long)t;
1980
1981 ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
1982 if (!ret)
1983 return TRACE_TYPE_PARTIAL_LINE;
1984 ret = trace_seq_printf(s, "[%03d] ", iter->cpu);
1985 if (!ret)
1986 return TRACE_TYPE_PARTIAL_LINE;
1987 ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
1988 if (!ret)
1989 return TRACE_TYPE_PARTIAL_LINE;
1990
1991 switch (entry->type) {
1992 case TRACE_FN: {
1993 struct ftrace_entry *field;
1994
1995 trace_assign_type(field, entry);
1996
1997 ret = seq_print_ip_sym(s, field->ip, sym_flags);
1998 if (!ret)
1999 return TRACE_TYPE_PARTIAL_LINE;
2000 if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
2001 field->parent_ip) {
2002 ret = trace_seq_printf(s, " <-");
2003 if (!ret)
2004 return TRACE_TYPE_PARTIAL_LINE;
2005 ret = seq_print_ip_sym(s,
2006 field->parent_ip,
2007 sym_flags);
2008 if (!ret)
2009 return TRACE_TYPE_PARTIAL_LINE;
2010 }
2011 ret = trace_seq_printf(s, "\n");
2012 if (!ret)
2013 return TRACE_TYPE_PARTIAL_LINE;
2014 break;
2015 }
2016 case TRACE_CTX:
2017 case TRACE_WAKE: {
2018 struct ctx_switch_entry *field;
2019
2020 trace_assign_type(field, entry);
2021
2022 T = task_state_char(field->next_state);
2023 S = task_state_char(field->prev_state);
2024 ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
2025 field->prev_pid,
2026 field->prev_prio,
2027 S,
2028 entry->type == TRACE_CTX ? "==>" : " +",
2029 field->next_cpu,
2030 field->next_pid,
2031 field->next_prio,
2032 T);
2033 if (!ret)
2034 return TRACE_TYPE_PARTIAL_LINE;
2035 break;
2036 }
2037 case TRACE_SPECIAL: {
2038 struct special_entry *field;
2039
2040 trace_assign_type(field, entry);
2041 1654
2042 ret = trace_seq_printf(s, "# %ld %ld %ld\n", 1655 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2043 field->arg1, 1656 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2044 field->arg2, 1657 if (!trace_print_lat_context(iter))
2045 field->arg3); 1658 goto partial;
2046 if (!ret) 1659 } else {
2047 return TRACE_TYPE_PARTIAL_LINE; 1660 if (!trace_print_context(iter))
2048 break; 1661 goto partial;
2049 }
2050 case TRACE_STACK: {
2051 struct stack_entry *field;
2052
2053 trace_assign_type(field, entry);
2054
2055 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
2056 if (i) {
2057 ret = trace_seq_puts(s, " <= ");
2058 if (!ret)
2059 return TRACE_TYPE_PARTIAL_LINE;
2060 }
2061 ret = seq_print_ip_sym(s, field->caller[i],
2062 sym_flags);
2063 if (!ret)
2064 return TRACE_TYPE_PARTIAL_LINE;
2065 } 1662 }
2066 ret = trace_seq_puts(s, "\n");
2067 if (!ret)
2068 return TRACE_TYPE_PARTIAL_LINE;
2069 break;
2070 }
2071 case TRACE_PRINT: {
2072 struct print_entry *field;
2073
2074 trace_assign_type(field, entry);
2075
2076 seq_print_ip_sym(s, field->ip, sym_flags);
2077 trace_seq_printf(s, ": %s", field->buf);
2078 if (entry->flags & TRACE_FLAG_CONT)
2079 trace_seq_print_cont(s, iter);
2080 break;
2081 }
2082 case TRACE_GRAPH_RET: {
2083 return print_graph_function(iter);
2084 }
2085 case TRACE_GRAPH_ENT: {
2086 return print_graph_function(iter);
2087 } 1663 }
2088 case TRACE_BRANCH: {
2089 struct trace_branch *field;
2090 1664
2091 trace_assign_type(field, entry); 1665 if (event)
2092 1666 return event->trace(iter, sym_flags);
2093 trace_seq_printf(s, "[%s] %s:%s:%d\n",
2094 field->correct ? " ok " : " MISS ",
2095 field->func,
2096 field->file,
2097 field->line);
2098 break;
2099 }
2100 case TRACE_USER_STACK: {
2101 struct userstack_entry *field;
2102 1667
2103 trace_assign_type(field, entry); 1668 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
1669 goto partial;
2104 1670
2105 ret = seq_print_userip_objs(field, s, sym_flags);
2106 if (!ret)
2107 return TRACE_TYPE_PARTIAL_LINE;
2108 ret = trace_seq_putc(s, '\n');
2109 if (!ret)
2110 return TRACE_TYPE_PARTIAL_LINE;
2111 break;
2112 }
2113 }
2114 return TRACE_TYPE_HANDLED; 1671 return TRACE_TYPE_HANDLED;
1672partial:
1673 return TRACE_TYPE_PARTIAL_LINE;
2115} 1674}
2116 1675
2117static enum print_line_t print_raw_fmt(struct trace_iterator *iter) 1676static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
2118{ 1677{
2119 struct trace_seq *s = &iter->seq; 1678 struct trace_seq *s = &iter->seq;
2120 struct trace_entry *entry; 1679 struct trace_entry *entry;
2121 int ret; 1680 struct trace_event *event;
2122 int S, T;
2123 1681
2124 entry = iter->ent; 1682 entry = iter->ent;
2125 1683
2126 if (entry->type == TRACE_CONT) 1684 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2127 return TRACE_TYPE_HANDLED; 1685 if (!trace_seq_printf(s, "%d %d %llu ",
2128 1686 entry->pid, iter->cpu, iter->ts))
2129 ret = trace_seq_printf(s, "%d %d %llu ", 1687 goto partial;
2130 entry->pid, iter->cpu, iter->ts);
2131 if (!ret)
2132 return TRACE_TYPE_PARTIAL_LINE;
2133
2134 switch (entry->type) {
2135 case TRACE_FN: {
2136 struct ftrace_entry *field;
2137
2138 trace_assign_type(field, entry);
2139
2140 ret = trace_seq_printf(s, "%x %x\n",
2141 field->ip,
2142 field->parent_ip);
2143 if (!ret)
2144 return TRACE_TYPE_PARTIAL_LINE;
2145 break;
2146 }
2147 case TRACE_CTX:
2148 case TRACE_WAKE: {
2149 struct ctx_switch_entry *field;
2150
2151 trace_assign_type(field, entry);
2152
2153 T = task_state_char(field->next_state);
2154 S = entry->type == TRACE_WAKE ? '+' :
2155 task_state_char(field->prev_state);
2156 ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
2157 field->prev_pid,
2158 field->prev_prio,
2159 S,
2160 field->next_cpu,
2161 field->next_pid,
2162 field->next_prio,
2163 T);
2164 if (!ret)
2165 return TRACE_TYPE_PARTIAL_LINE;
2166 break;
2167 } 1688 }
2168 case TRACE_SPECIAL:
2169 case TRACE_USER_STACK:
2170 case TRACE_STACK: {
2171 struct special_entry *field;
2172 1689
2173 trace_assign_type(field, entry); 1690 event = ftrace_find_event(entry->type);
2174 1691 if (event)
2175 ret = trace_seq_printf(s, "# %ld %ld %ld\n", 1692 return event->raw(iter, 0);
2176 field->arg1,
2177 field->arg2,
2178 field->arg3);
2179 if (!ret)
2180 return TRACE_TYPE_PARTIAL_LINE;
2181 break;
2182 }
2183 case TRACE_PRINT: {
2184 struct print_entry *field;
2185 1693
2186 trace_assign_type(field, entry); 1694 if (!trace_seq_printf(s, "%d ?\n", entry->type))
1695 goto partial;
2187 1696
2188 trace_seq_printf(s, "# %lx %s", field->ip, field->buf);
2189 if (entry->flags & TRACE_FLAG_CONT)
2190 trace_seq_print_cont(s, iter);
2191 break;
2192 }
2193 }
2194 return TRACE_TYPE_HANDLED; 1697 return TRACE_TYPE_HANDLED;
1698partial:
1699 return TRACE_TYPE_PARTIAL_LINE;
2195} 1700}
2196 1701
2197#define SEQ_PUT_FIELD_RET(s, x) \
2198do { \
2199 if (!trace_seq_putmem(s, &(x), sizeof(x))) \
2200 return 0; \
2201} while (0)
2202
2203#define SEQ_PUT_HEX_FIELD_RET(s, x) \
2204do { \
2205 BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
2206 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
2207 return 0; \
2208} while (0)
2209
2210static enum print_line_t print_hex_fmt(struct trace_iterator *iter) 1702static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
2211{ 1703{
2212 struct trace_seq *s = &iter->seq; 1704 struct trace_seq *s = &iter->seq;
2213 unsigned char newline = '\n'; 1705 unsigned char newline = '\n';
2214 struct trace_entry *entry; 1706 struct trace_entry *entry;
2215 int S, T; 1707 struct trace_event *event;
2216 1708
2217 entry = iter->ent; 1709 entry = iter->ent;
2218 1710
2219 if (entry->type == TRACE_CONT) 1711 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2220 return TRACE_TYPE_HANDLED; 1712 SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
2221 1713 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
2222 SEQ_PUT_HEX_FIELD_RET(s, entry->pid); 1714 SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
2223 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
2224 SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
2225
2226 switch (entry->type) {
2227 case TRACE_FN: {
2228 struct ftrace_entry *field;
2229
2230 trace_assign_type(field, entry);
2231
2232 SEQ_PUT_HEX_FIELD_RET(s, field->ip);
2233 SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
2234 break;
2235 }
2236 case TRACE_CTX:
2237 case TRACE_WAKE: {
2238 struct ctx_switch_entry *field;
2239
2240 trace_assign_type(field, entry);
2241
2242 T = task_state_char(field->next_state);
2243 S = entry->type == TRACE_WAKE ? '+' :
2244 task_state_char(field->prev_state);
2245 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
2246 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
2247 SEQ_PUT_HEX_FIELD_RET(s, S);
2248 SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
2249 SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
2250 SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
2251 SEQ_PUT_HEX_FIELD_RET(s, T);
2252 break;
2253 } 1715 }
2254 case TRACE_SPECIAL:
2255 case TRACE_USER_STACK:
2256 case TRACE_STACK: {
2257 struct special_entry *field;
2258 1716
2259 trace_assign_type(field, entry); 1717 event = ftrace_find_event(entry->type);
2260 1718 if (event) {
2261 SEQ_PUT_HEX_FIELD_RET(s, field->arg1); 1719 enum print_line_t ret = event->hex(iter, 0);
2262 SEQ_PUT_HEX_FIELD_RET(s, field->arg2); 1720 if (ret != TRACE_TYPE_HANDLED)
2263 SEQ_PUT_HEX_FIELD_RET(s, field->arg3); 1721 return ret;
2264 break;
2265 }
2266 } 1722 }
2267 SEQ_PUT_FIELD_RET(s, newline);
2268
2269 return TRACE_TYPE_HANDLED;
2270}
2271
2272static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
2273{
2274 struct trace_seq *s = &iter->seq;
2275 struct trace_entry *entry = iter->ent;
2276 struct print_entry *field;
2277 int ret;
2278 1723
2279 trace_assign_type(field, entry); 1724 SEQ_PUT_FIELD_RET(s, newline);
2280
2281 ret = trace_seq_printf(s, field->buf);
2282 if (!ret)
2283 return TRACE_TYPE_PARTIAL_LINE;
2284
2285 if (entry->flags & TRACE_FLAG_CONT)
2286 trace_seq_print_cont(s, iter);
2287 1725
2288 return TRACE_TYPE_HANDLED; 1726 return TRACE_TYPE_HANDLED;
2289} 1727}
@@ -2292,59 +1730,37 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
2292{ 1730{
2293 struct trace_seq *s = &iter->seq; 1731 struct trace_seq *s = &iter->seq;
2294 struct trace_entry *entry; 1732 struct trace_entry *entry;
1733 struct trace_event *event;
2295 1734
2296 entry = iter->ent; 1735 entry = iter->ent;
2297 1736
2298 if (entry->type == TRACE_CONT) 1737 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2299 return TRACE_TYPE_HANDLED; 1738 SEQ_PUT_FIELD_RET(s, entry->pid);
2300 1739 SEQ_PUT_FIELD_RET(s, iter->cpu);
2301 SEQ_PUT_FIELD_RET(s, entry->pid); 1740 SEQ_PUT_FIELD_RET(s, iter->ts);
2302 SEQ_PUT_FIELD_RET(s, entry->cpu);
2303 SEQ_PUT_FIELD_RET(s, iter->ts);
2304
2305 switch (entry->type) {
2306 case TRACE_FN: {
2307 struct ftrace_entry *field;
2308
2309 trace_assign_type(field, entry);
2310
2311 SEQ_PUT_FIELD_RET(s, field->ip);
2312 SEQ_PUT_FIELD_RET(s, field->parent_ip);
2313 break;
2314 }
2315 case TRACE_CTX: {
2316 struct ctx_switch_entry *field;
2317
2318 trace_assign_type(field, entry);
2319
2320 SEQ_PUT_FIELD_RET(s, field->prev_pid);
2321 SEQ_PUT_FIELD_RET(s, field->prev_prio);
2322 SEQ_PUT_FIELD_RET(s, field->prev_state);
2323 SEQ_PUT_FIELD_RET(s, field->next_pid);
2324 SEQ_PUT_FIELD_RET(s, field->next_prio);
2325 SEQ_PUT_FIELD_RET(s, field->next_state);
2326 break;
2327 } 1741 }
2328 case TRACE_SPECIAL:
2329 case TRACE_USER_STACK:
2330 case TRACE_STACK: {
2331 struct special_entry *field;
2332 1742
2333 trace_assign_type(field, entry); 1743 event = ftrace_find_event(entry->type);
2334 1744 return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED;
2335 SEQ_PUT_FIELD_RET(s, field->arg1);
2336 SEQ_PUT_FIELD_RET(s, field->arg2);
2337 SEQ_PUT_FIELD_RET(s, field->arg3);
2338 break;
2339 }
2340 }
2341 return 1;
2342} 1745}
2343 1746
2344static int trace_empty(struct trace_iterator *iter) 1747static int trace_empty(struct trace_iterator *iter)
2345{ 1748{
2346 int cpu; 1749 int cpu;
2347 1750
1751 /* If we are looking at one CPU buffer, only check that one */
1752 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) {
1753 cpu = iter->cpu_file;
1754 if (iter->buffer_iter[cpu]) {
1755 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
1756 return 0;
1757 } else {
1758 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
1759 return 0;
1760 }
1761 return 1;
1762 }
1763
2348 for_each_tracing_cpu(cpu) { 1764 for_each_tracing_cpu(cpu) {
2349 if (iter->buffer_iter[cpu]) { 1765 if (iter->buffer_iter[cpu]) {
2350 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) 1766 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
@@ -2368,10 +1784,15 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2368 return ret; 1784 return ret;
2369 } 1785 }
2370 1786
1787 if (iter->ent->type == TRACE_BPRINT &&
1788 trace_flags & TRACE_ITER_PRINTK &&
1789 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
1790 return trace_print_bprintk_msg_only(iter);
1791
2371 if (iter->ent->type == TRACE_PRINT && 1792 if (iter->ent->type == TRACE_PRINT &&
2372 trace_flags & TRACE_ITER_PRINTK && 1793 trace_flags & TRACE_ITER_PRINTK &&
2373 trace_flags & TRACE_ITER_PRINTK_MSGONLY) 1794 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
2374 return print_printk_msg_only(iter); 1795 return trace_print_printk_msg_only(iter);
2375 1796
2376 if (trace_flags & TRACE_ITER_BIN) 1797 if (trace_flags & TRACE_ITER_BIN)
2377 return print_bin_fmt(iter); 1798 return print_bin_fmt(iter);
@@ -2382,9 +1803,6 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2382 if (trace_flags & TRACE_ITER_RAW) 1803 if (trace_flags & TRACE_ITER_RAW)
2383 return print_raw_fmt(iter); 1804 return print_raw_fmt(iter);
2384 1805
2385 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
2386 return print_lat_fmt(iter, iter->idx, iter->cpu);
2387
2388 return print_trace_fmt(iter); 1806 return print_trace_fmt(iter);
2389} 1807}
2390 1808
@@ -2426,30 +1844,45 @@ static struct seq_operations tracer_seq_ops = {
2426}; 1844};
2427 1845
2428static struct trace_iterator * 1846static struct trace_iterator *
2429__tracing_open(struct inode *inode, struct file *file, int *ret) 1847__tracing_open(struct inode *inode, struct file *file)
2430{ 1848{
1849 long cpu_file = (long) inode->i_private;
1850 void *fail_ret = ERR_PTR(-ENOMEM);
2431 struct trace_iterator *iter; 1851 struct trace_iterator *iter;
2432 struct seq_file *m; 1852 struct seq_file *m;
2433 int cpu; 1853 int cpu, ret;
2434 1854
2435 if (tracing_disabled) { 1855 if (tracing_disabled)
2436 *ret = -ENODEV; 1856 return ERR_PTR(-ENODEV);
2437 return NULL;
2438 }
2439 1857
2440 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 1858 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2441 if (!iter) { 1859 if (!iter)
2442 *ret = -ENOMEM; 1860 return ERR_PTR(-ENOMEM);
2443 goto out;
2444 }
2445 1861
1862 /*
1863 * We make a copy of the current tracer to avoid concurrent
1864 * changes on it while we are reading.
1865 */
2446 mutex_lock(&trace_types_lock); 1866 mutex_lock(&trace_types_lock);
1867 iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL);
1868 if (!iter->trace)
1869 goto fail;
1870
1871 if (current_trace)
1872 *iter->trace = *current_trace;
1873
1874 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL))
1875 goto fail;
1876
1877 cpumask_clear(iter->started);
1878
2447 if (current_trace && current_trace->print_max) 1879 if (current_trace && current_trace->print_max)
2448 iter->tr = &max_tr; 1880 iter->tr = &max_tr;
2449 else 1881 else
2450 iter->tr = inode->i_private; 1882 iter->tr = &global_trace;
2451 iter->trace = current_trace;
2452 iter->pos = -1; 1883 iter->pos = -1;
1884 mutex_init(&iter->mutex);
1885 iter->cpu_file = cpu_file;
2453 1886
2454 /* Notify the tracer early; before we stop tracing. */ 1887 /* Notify the tracer early; before we stop tracing. */
2455 if (iter->trace && iter->trace->open) 1888 if (iter->trace && iter->trace->open)
@@ -2459,20 +1892,24 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
2459 if (ring_buffer_overruns(iter->tr->buffer)) 1892 if (ring_buffer_overruns(iter->tr->buffer))
2460 iter->iter_flags |= TRACE_FILE_ANNOTATE; 1893 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2461 1894
1895 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
1896 for_each_tracing_cpu(cpu) {
2462 1897
2463 for_each_tracing_cpu(cpu) { 1898 iter->buffer_iter[cpu] =
2464 1899 ring_buffer_read_start(iter->tr->buffer, cpu);
1900 }
1901 } else {
1902 cpu = iter->cpu_file;
2465 iter->buffer_iter[cpu] = 1903 iter->buffer_iter[cpu] =
2466 ring_buffer_read_start(iter->tr->buffer, cpu); 1904 ring_buffer_read_start(iter->tr->buffer, cpu);
2467
2468 if (!iter->buffer_iter[cpu])
2469 goto fail_buffer;
2470 } 1905 }
2471 1906
2472 /* TODO stop tracer */ 1907 /* TODO stop tracer */
2473 *ret = seq_open(file, &tracer_seq_ops); 1908 ret = seq_open(file, &tracer_seq_ops);
2474 if (*ret) 1909 if (ret < 0) {
1910 fail_ret = ERR_PTR(ret);
2475 goto fail_buffer; 1911 goto fail_buffer;
1912 }
2476 1913
2477 m = file->private_data; 1914 m = file->private_data;
2478 m->private = iter; 1915 m->private = iter;
@@ -2482,7 +1919,6 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
2482 1919
2483 mutex_unlock(&trace_types_lock); 1920 mutex_unlock(&trace_types_lock);
2484 1921
2485 out:
2486 return iter; 1922 return iter;
2487 1923
2488 fail_buffer: 1924 fail_buffer:
@@ -2490,10 +1926,13 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
2490 if (iter->buffer_iter[cpu]) 1926 if (iter->buffer_iter[cpu])
2491 ring_buffer_read_finish(iter->buffer_iter[cpu]); 1927 ring_buffer_read_finish(iter->buffer_iter[cpu]);
2492 } 1928 }
1929 free_cpumask_var(iter->started);
1930 fail:
2493 mutex_unlock(&trace_types_lock); 1931 mutex_unlock(&trace_types_lock);
1932 kfree(iter->trace);
2494 kfree(iter); 1933 kfree(iter);
2495 1934
2496 return ERR_PTR(-ENOMEM); 1935 return fail_ret;
2497} 1936}
2498 1937
2499int tracing_open_generic(struct inode *inode, struct file *filp) 1938int tracing_open_generic(struct inode *inode, struct file *filp)
@@ -2505,12 +1944,17 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2505 return 0; 1944 return 0;
2506} 1945}
2507 1946
2508int tracing_release(struct inode *inode, struct file *file) 1947static int tracing_release(struct inode *inode, struct file *file)
2509{ 1948{
2510 struct seq_file *m = (struct seq_file *)file->private_data; 1949 struct seq_file *m = (struct seq_file *)file->private_data;
2511 struct trace_iterator *iter = m->private; 1950 struct trace_iterator *iter;
2512 int cpu; 1951 int cpu;
2513 1952
1953 if (!(file->f_mode & FMODE_READ))
1954 return 0;
1955
1956 iter = m->private;
1957
2514 mutex_lock(&trace_types_lock); 1958 mutex_lock(&trace_types_lock);
2515 for_each_tracing_cpu(cpu) { 1959 for_each_tracing_cpu(cpu) {
2516 if (iter->buffer_iter[cpu]) 1960 if (iter->buffer_iter[cpu])
@@ -2525,33 +1969,39 @@ int tracing_release(struct inode *inode, struct file *file)
2525 mutex_unlock(&trace_types_lock); 1969 mutex_unlock(&trace_types_lock);
2526 1970
2527 seq_release(inode, file); 1971 seq_release(inode, file);
1972 mutex_destroy(&iter->mutex);
1973 free_cpumask_var(iter->started);
1974 kfree(iter->trace);
2528 kfree(iter); 1975 kfree(iter);
2529 return 0; 1976 return 0;
2530} 1977}
2531 1978
2532static int tracing_open(struct inode *inode, struct file *file) 1979static int tracing_open(struct inode *inode, struct file *file)
2533{ 1980{
2534 int ret;
2535
2536 __tracing_open(inode, file, &ret);
2537
2538 return ret;
2539}
2540
2541static int tracing_lt_open(struct inode *inode, struct file *file)
2542{
2543 struct trace_iterator *iter; 1981 struct trace_iterator *iter;
2544 int ret; 1982 int ret = 0;
2545 1983
2546 iter = __tracing_open(inode, file, &ret); 1984 /* If this file was open for write, then erase contents */
1985 if ((file->f_mode & FMODE_WRITE) &&
1986 !(file->f_flags & O_APPEND)) {
1987 long cpu = (long) inode->i_private;
2547 1988
2548 if (!ret) 1989 if (cpu == TRACE_PIPE_ALL_CPU)
2549 iter->iter_flags |= TRACE_FILE_LAT_FMT; 1990 tracing_reset_online_cpus(&global_trace);
1991 else
1992 tracing_reset(&global_trace, cpu);
1993 }
2550 1994
1995 if (file->f_mode & FMODE_READ) {
1996 iter = __tracing_open(inode, file);
1997 if (IS_ERR(iter))
1998 ret = PTR_ERR(iter);
1999 else if (trace_flags & TRACE_ITER_LATENCY_FMT)
2000 iter->iter_flags |= TRACE_FILE_LAT_FMT;
2001 }
2551 return ret; 2002 return ret;
2552} 2003}
2553 2004
2554
2555static void * 2005static void *
2556t_next(struct seq_file *m, void *v, loff_t *pos) 2006t_next(struct seq_file *m, void *v, loff_t *pos)
2557{ 2007{
@@ -2623,21 +2073,22 @@ static int show_traces_open(struct inode *inode, struct file *file)
2623 return ret; 2073 return ret;
2624} 2074}
2625 2075
2626static struct file_operations tracing_fops = { 2076static ssize_t
2627 .open = tracing_open, 2077tracing_write_stub(struct file *filp, const char __user *ubuf,
2628 .read = seq_read, 2078 size_t count, loff_t *ppos)
2629 .llseek = seq_lseek, 2079{
2630 .release = tracing_release, 2080 return count;
2631}; 2081}
2632 2082
2633static struct file_operations tracing_lt_fops = { 2083static const struct file_operations tracing_fops = {
2634 .open = tracing_lt_open, 2084 .open = tracing_open,
2635 .read = seq_read, 2085 .read = seq_read,
2086 .write = tracing_write_stub,
2636 .llseek = seq_lseek, 2087 .llseek = seq_lseek,
2637 .release = tracing_release, 2088 .release = tracing_release,
2638}; 2089};
2639 2090
2640static struct file_operations show_traces_fops = { 2091static const struct file_operations show_traces_fops = {
2641 .open = show_traces_open, 2092 .open = show_traces_open,
2642 .read = seq_read, 2093 .read = seq_read,
2643 .release = seq_release, 2094 .release = seq_release,
@@ -2730,7 +2181,7 @@ err_unlock:
2730 return err; 2181 return err;
2731} 2182}
2732 2183
2733static struct file_operations tracing_cpumask_fops = { 2184static const struct file_operations tracing_cpumask_fops = {
2734 .open = tracing_open_generic, 2185 .open = tracing_open_generic,
2735 .read = tracing_cpumask_read, 2186 .read = tracing_cpumask_read,
2736 .write = tracing_cpumask_write, 2187 .write = tracing_cpumask_write,
@@ -2740,57 +2191,62 @@ static ssize_t
2740tracing_trace_options_read(struct file *filp, char __user *ubuf, 2191tracing_trace_options_read(struct file *filp, char __user *ubuf,
2741 size_t cnt, loff_t *ppos) 2192 size_t cnt, loff_t *ppos)
2742{ 2193{
2743 int i; 2194 struct tracer_opt *trace_opts;
2195 u32 tracer_flags;
2196 int len = 0;
2744 char *buf; 2197 char *buf;
2745 int r = 0; 2198 int r = 0;
2746 int len = 0; 2199 int i;
2747 u32 tracer_flags = current_trace->flags->val;
2748 struct tracer_opt *trace_opts = current_trace->flags->opts;
2749 2200
2750 2201
2751 /* calulate max size */ 2202 /* calculate max size */
2752 for (i = 0; trace_options[i]; i++) { 2203 for (i = 0; trace_options[i]; i++) {
2753 len += strlen(trace_options[i]); 2204 len += strlen(trace_options[i]);
2754 len += 3; /* "no" and space */ 2205 len += 3; /* "no" and newline */
2755 } 2206 }
2756 2207
2208 mutex_lock(&trace_types_lock);
2209 tracer_flags = current_trace->flags->val;
2210 trace_opts = current_trace->flags->opts;
2211
2757 /* 2212 /*
2758 * Increase the size with names of options specific 2213 * Increase the size with names of options specific
2759 * of the current tracer. 2214 * of the current tracer.
2760 */ 2215 */
2761 for (i = 0; trace_opts[i].name; i++) { 2216 for (i = 0; trace_opts[i].name; i++) {
2762 len += strlen(trace_opts[i].name); 2217 len += strlen(trace_opts[i].name);
2763 len += 3; /* "no" and space */ 2218 len += 3; /* "no" and newline */
2764 } 2219 }
2765 2220
2766 /* +2 for \n and \0 */ 2221 /* +2 for \n and \0 */
2767 buf = kmalloc(len + 2, GFP_KERNEL); 2222 buf = kmalloc(len + 2, GFP_KERNEL);
2768 if (!buf) 2223 if (!buf) {
2224 mutex_unlock(&trace_types_lock);
2769 return -ENOMEM; 2225 return -ENOMEM;
2226 }
2770 2227
2771 for (i = 0; trace_options[i]; i++) { 2228 for (i = 0; trace_options[i]; i++) {
2772 if (trace_flags & (1 << i)) 2229 if (trace_flags & (1 << i))
2773 r += sprintf(buf + r, "%s ", trace_options[i]); 2230 r += sprintf(buf + r, "%s\n", trace_options[i]);
2774 else 2231 else
2775 r += sprintf(buf + r, "no%s ", trace_options[i]); 2232 r += sprintf(buf + r, "no%s\n", trace_options[i]);
2776 } 2233 }
2777 2234
2778 for (i = 0; trace_opts[i].name; i++) { 2235 for (i = 0; trace_opts[i].name; i++) {
2779 if (tracer_flags & trace_opts[i].bit) 2236 if (tracer_flags & trace_opts[i].bit)
2780 r += sprintf(buf + r, "%s ", 2237 r += sprintf(buf + r, "%s\n",
2781 trace_opts[i].name); 2238 trace_opts[i].name);
2782 else 2239 else
2783 r += sprintf(buf + r, "no%s ", 2240 r += sprintf(buf + r, "no%s\n",
2784 trace_opts[i].name); 2241 trace_opts[i].name);
2785 } 2242 }
2243 mutex_unlock(&trace_types_lock);
2786 2244
2787 r += sprintf(buf + r, "\n");
2788 WARN_ON(r >= len + 2); 2245 WARN_ON(r >= len + 2);
2789 2246
2790 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2247 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2791 2248
2792 kfree(buf); 2249 kfree(buf);
2793
2794 return r; 2250 return r;
2795} 2251}
2796 2252
@@ -2828,6 +2284,34 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2828 return 0; 2284 return 0;
2829} 2285}
2830 2286
2287static void set_tracer_flags(unsigned int mask, int enabled)
2288{
2289 /* do nothing if flag is already set */
2290 if (!!(trace_flags & mask) == !!enabled)
2291 return;
2292
2293 if (enabled)
2294 trace_flags |= mask;
2295 else
2296 trace_flags &= ~mask;
2297
2298 if (mask == TRACE_ITER_GLOBAL_CLK) {
2299 u64 (*func)(void);
2300
2301 if (enabled)
2302 func = trace_clock_global;
2303 else
2304 func = trace_clock_local;
2305
2306 mutex_lock(&trace_types_lock);
2307 ring_buffer_set_clock(global_trace.buffer, func);
2308
2309 if (max_tr.buffer)
2310 ring_buffer_set_clock(max_tr.buffer, func);
2311 mutex_unlock(&trace_types_lock);
2312 }
2313}
2314
2831static ssize_t 2315static ssize_t
2832tracing_trace_options_write(struct file *filp, const char __user *ubuf, 2316tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2833 size_t cnt, loff_t *ppos) 2317 size_t cnt, loff_t *ppos)
@@ -2855,17 +2339,16 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2855 int len = strlen(trace_options[i]); 2339 int len = strlen(trace_options[i]);
2856 2340
2857 if (strncmp(cmp, trace_options[i], len) == 0) { 2341 if (strncmp(cmp, trace_options[i], len) == 0) {
2858 if (neg) 2342 set_tracer_flags(1 << i, !neg);
2859 trace_flags &= ~(1 << i);
2860 else
2861 trace_flags |= (1 << i);
2862 break; 2343 break;
2863 } 2344 }
2864 } 2345 }
2865 2346
2866 /* If no option could be set, test the specific tracer options */ 2347 /* If no option could be set, test the specific tracer options */
2867 if (!trace_options[i]) { 2348 if (!trace_options[i]) {
2349 mutex_lock(&trace_types_lock);
2868 ret = set_tracer_option(current_trace, cmp, neg); 2350 ret = set_tracer_option(current_trace, cmp, neg);
2351 mutex_unlock(&trace_types_lock);
2869 if (ret) 2352 if (ret)
2870 return ret; 2353 return ret;
2871 } 2354 }
@@ -2875,7 +2358,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2875 return cnt; 2358 return cnt;
2876} 2359}
2877 2360
2878static struct file_operations tracing_iter_fops = { 2361static const struct file_operations tracing_iter_fops = {
2879 .open = tracing_open_generic, 2362 .open = tracing_open_generic,
2880 .read = tracing_trace_options_read, 2363 .read = tracing_trace_options_read,
2881 .write = tracing_trace_options_write, 2364 .write = tracing_trace_options_write,
@@ -2886,9 +2369,9 @@ static const char readme_msg[] =
2886 "# mkdir /debug\n" 2369 "# mkdir /debug\n"
2887 "# mount -t debugfs nodev /debug\n\n" 2370 "# mount -t debugfs nodev /debug\n\n"
2888 "# cat /debug/tracing/available_tracers\n" 2371 "# cat /debug/tracing/available_tracers\n"
2889 "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n" 2372 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
2890 "# cat /debug/tracing/current_tracer\n" 2373 "# cat /debug/tracing/current_tracer\n"
2891 "none\n" 2374 "nop\n"
2892 "# echo sched_switch > /debug/tracing/current_tracer\n" 2375 "# echo sched_switch > /debug/tracing/current_tracer\n"
2893 "# cat /debug/tracing/current_tracer\n" 2376 "# cat /debug/tracing/current_tracer\n"
2894 "sched_switch\n" 2377 "sched_switch\n"
@@ -2908,7 +2391,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf,
2908 readme_msg, strlen(readme_msg)); 2391 readme_msg, strlen(readme_msg));
2909} 2392}
2910 2393
2911static struct file_operations tracing_readme_fops = { 2394static const struct file_operations tracing_readme_fops = {
2912 .open = tracing_open_generic, 2395 .open = tracing_open_generic,
2913 .read = tracing_readme_read, 2396 .read = tracing_readme_read,
2914}; 2397};
@@ -2930,7 +2413,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2930{ 2413{
2931 struct trace_array *tr = filp->private_data; 2414 struct trace_array *tr = filp->private_data;
2932 char buf[64]; 2415 char buf[64];
2933 long val; 2416 unsigned long val;
2934 int ret; 2417 int ret;
2935 2418
2936 if (cnt >= sizeof(buf)) 2419 if (cnt >= sizeof(buf))
@@ -2985,13 +2468,105 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
2985 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2468 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2986} 2469}
2987 2470
2988static int tracing_set_tracer(char *buf) 2471int tracer_init(struct tracer *t, struct trace_array *tr)
2472{
2473 tracing_reset_online_cpus(tr);
2474 return t->init(tr);
2475}
2476
2477static int tracing_resize_ring_buffer(unsigned long size)
2989{ 2478{
2479 int ret;
2480
2481 /*
2482 * If kernel or user changes the size of the ring buffer
2483 * we use the size that was given, and we can forget about
2484 * expanding it later.
2485 */
2486 ring_buffer_expanded = 1;
2487
2488 ret = ring_buffer_resize(global_trace.buffer, size);
2489 if (ret < 0)
2490 return ret;
2491
2492 ret = ring_buffer_resize(max_tr.buffer, size);
2493 if (ret < 0) {
2494 int r;
2495
2496 r = ring_buffer_resize(global_trace.buffer,
2497 global_trace.entries);
2498 if (r < 0) {
2499 /*
2500 * AARGH! We are left with different
2501 * size max buffer!!!!
2502 * The max buffer is our "snapshot" buffer.
2503 * When a tracer needs a snapshot (one of the
2504 * latency tracers), it swaps the max buffer
2505 * with the saved snap shot. We succeeded to
2506 * update the size of the main buffer, but failed to
2507 * update the size of the max buffer. But when we tried
2508 * to reset the main buffer to the original size, we
2509 * failed there too. This is very unlikely to
2510 * happen, but if it does, warn and kill all
2511 * tracing.
2512 */
2513 WARN_ON(1);
2514 tracing_disabled = 1;
2515 }
2516 return ret;
2517 }
2518
2519 global_trace.entries = size;
2520
2521 return ret;
2522}
2523
2524/**
2525 * tracing_update_buffers - used by tracing facility to expand ring buffers
2526 *
2527 * To save on memory when the tracing is never used on a system with it
2528 * configured in. The ring buffers are set to a minimum size. But once
2529 * a user starts to use the tracing facility, then they need to grow
2530 * to their default size.
2531 *
2532 * This function is to be called when a tracer is about to be used.
2533 */
2534int tracing_update_buffers(void)
2535{
2536 int ret = 0;
2537
2538 mutex_lock(&trace_types_lock);
2539 if (!ring_buffer_expanded)
2540 ret = tracing_resize_ring_buffer(trace_buf_size);
2541 mutex_unlock(&trace_types_lock);
2542
2543 return ret;
2544}
2545
2546struct trace_option_dentry;
2547
2548static struct trace_option_dentry *
2549create_trace_option_files(struct tracer *tracer);
2550
2551static void
2552destroy_trace_option_files(struct trace_option_dentry *topts);
2553
2554static int tracing_set_tracer(const char *buf)
2555{
2556 static struct trace_option_dentry *topts;
2990 struct trace_array *tr = &global_trace; 2557 struct trace_array *tr = &global_trace;
2991 struct tracer *t; 2558 struct tracer *t;
2992 int ret = 0; 2559 int ret = 0;
2993 2560
2994 mutex_lock(&trace_types_lock); 2561 mutex_lock(&trace_types_lock);
2562
2563 if (!ring_buffer_expanded) {
2564 ret = tracing_resize_ring_buffer(trace_buf_size);
2565 if (ret < 0)
2566 goto out;
2567 ret = 0;
2568 }
2569
2995 for (t = trace_types; t; t = t->next) { 2570 for (t = trace_types; t; t = t->next) {
2996 if (strcmp(t->name, buf) == 0) 2571 if (strcmp(t->name, buf) == 0)
2997 break; 2572 break;
@@ -3007,9 +2582,14 @@ static int tracing_set_tracer(char *buf)
3007 if (current_trace && current_trace->reset) 2582 if (current_trace && current_trace->reset)
3008 current_trace->reset(tr); 2583 current_trace->reset(tr);
3009 2584
2585 destroy_trace_option_files(topts);
2586
3010 current_trace = t; 2587 current_trace = t;
2588
2589 topts = create_trace_option_files(current_trace);
2590
3011 if (t->init) { 2591 if (t->init) {
3012 ret = t->init(tr); 2592 ret = tracer_init(t, tr);
3013 if (ret) 2593 if (ret)
3014 goto out; 2594 goto out;
3015 } 2595 }
@@ -3072,9 +2652,9 @@ static ssize_t
3072tracing_max_lat_write(struct file *filp, const char __user *ubuf, 2652tracing_max_lat_write(struct file *filp, const char __user *ubuf,
3073 size_t cnt, loff_t *ppos) 2653 size_t cnt, loff_t *ppos)
3074{ 2654{
3075 long *ptr = filp->private_data; 2655 unsigned long *ptr = filp->private_data;
3076 char buf[64]; 2656 char buf[64];
3077 long val; 2657 unsigned long val;
3078 int ret; 2658 int ret;
3079 2659
3080 if (cnt >= sizeof(buf)) 2660 if (cnt >= sizeof(buf))
@@ -3094,54 +2674,96 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
3094 return cnt; 2674 return cnt;
3095} 2675}
3096 2676
3097static atomic_t tracing_reader;
3098
3099static int tracing_open_pipe(struct inode *inode, struct file *filp) 2677static int tracing_open_pipe(struct inode *inode, struct file *filp)
3100{ 2678{
2679 long cpu_file = (long) inode->i_private;
3101 struct trace_iterator *iter; 2680 struct trace_iterator *iter;
2681 int ret = 0;
3102 2682
3103 if (tracing_disabled) 2683 if (tracing_disabled)
3104 return -ENODEV; 2684 return -ENODEV;
3105 2685
3106 /* We only allow for reader of the pipe */ 2686 mutex_lock(&trace_types_lock);
3107 if (atomic_inc_return(&tracing_reader) != 1) { 2687
3108 atomic_dec(&tracing_reader); 2688 /* We only allow one reader per cpu */
3109 return -EBUSY; 2689 if (cpu_file == TRACE_PIPE_ALL_CPU) {
2690 if (!cpumask_empty(tracing_reader_cpumask)) {
2691 ret = -EBUSY;
2692 goto out;
2693 }
2694 cpumask_setall(tracing_reader_cpumask);
2695 } else {
2696 if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
2697 cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
2698 else {
2699 ret = -EBUSY;
2700 goto out;
2701 }
3110 } 2702 }
3111 2703
3112 /* create a buffer to store the information to pass to userspace */ 2704 /* create a buffer to store the information to pass to userspace */
3113 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2705 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
3114 if (!iter) 2706 if (!iter) {
3115 return -ENOMEM; 2707 ret = -ENOMEM;
2708 goto out;
2709 }
3116 2710
3117 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 2711 /*
3118 kfree(iter); 2712 * We make a copy of the current tracer to avoid concurrent
3119 return -ENOMEM; 2713 * changes on it while we are reading.
2714 */
2715 iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL);
2716 if (!iter->trace) {
2717 ret = -ENOMEM;
2718 goto fail;
3120 } 2719 }
2720 if (current_trace)
2721 *iter->trace = *current_trace;
3121 2722
3122 mutex_lock(&trace_types_lock); 2723 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
2724 ret = -ENOMEM;
2725 goto fail;
2726 }
3123 2727
3124 /* trace pipe does not show start of buffer */ 2728 /* trace pipe does not show start of buffer */
3125 cpumask_setall(iter->started); 2729 cpumask_setall(iter->started);
3126 2730
2731 iter->cpu_file = cpu_file;
3127 iter->tr = &global_trace; 2732 iter->tr = &global_trace;
3128 iter->trace = current_trace; 2733 mutex_init(&iter->mutex);
3129 filp->private_data = iter; 2734 filp->private_data = iter;
3130 2735
3131 if (iter->trace->pipe_open) 2736 if (iter->trace->pipe_open)
3132 iter->trace->pipe_open(iter); 2737 iter->trace->pipe_open(iter);
2738
2739out:
3133 mutex_unlock(&trace_types_lock); 2740 mutex_unlock(&trace_types_lock);
2741 return ret;
3134 2742
3135 return 0; 2743fail:
2744 kfree(iter->trace);
2745 kfree(iter);
2746 mutex_unlock(&trace_types_lock);
2747 return ret;
3136} 2748}
3137 2749
3138static int tracing_release_pipe(struct inode *inode, struct file *file) 2750static int tracing_release_pipe(struct inode *inode, struct file *file)
3139{ 2751{
3140 struct trace_iterator *iter = file->private_data; 2752 struct trace_iterator *iter = file->private_data;
3141 2753
2754 mutex_lock(&trace_types_lock);
2755
2756 if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
2757 cpumask_clear(tracing_reader_cpumask);
2758 else
2759 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2760
2761 mutex_unlock(&trace_types_lock);
2762
3142 free_cpumask_var(iter->started); 2763 free_cpumask_var(iter->started);
2764 mutex_destroy(&iter->mutex);
2765 kfree(iter->trace);
3143 kfree(iter); 2766 kfree(iter);
3144 atomic_dec(&tracing_reader);
3145 2767
3146 return 0; 2768 return 0;
3147} 2769}
@@ -3167,67 +2789,57 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
3167 } 2789 }
3168} 2790}
3169 2791
3170/* 2792
3171 * Consumer reader. 2793void default_wait_pipe(struct trace_iterator *iter)
3172 */
3173static ssize_t
3174tracing_read_pipe(struct file *filp, char __user *ubuf,
3175 size_t cnt, loff_t *ppos)
3176{ 2794{
3177 struct trace_iterator *iter = filp->private_data; 2795 DEFINE_WAIT(wait);
3178 ssize_t sret;
3179 2796
3180 /* return any leftover data */ 2797 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
3181 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
3182 if (sret != -EBUSY)
3183 return sret;
3184 2798
3185 trace_seq_reset(&iter->seq); 2799 if (trace_empty(iter))
2800 schedule();
3186 2801
3187 mutex_lock(&trace_types_lock); 2802 finish_wait(&trace_wait, &wait);
3188 if (iter->trace->read) { 2803}
3189 sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); 2804
3190 if (sret) 2805/*
3191 goto out; 2806 * This is a make-shift waitqueue.
3192 } 2807 * A tracer might use this callback on some rare cases:
2808 *
2809 * 1) the current tracer might hold the runqueue lock when it wakes up
2810 * a reader, hence a deadlock (sched, function, and function graph tracers)
2811 * 2) the function tracers, trace all functions, we don't want
2812 * the overhead of calling wake_up and friends
2813 * (and tracing them too)
2814 *
2815 * Anyway, this is really very primitive wakeup.
2816 */
2817void poll_wait_pipe(struct trace_iterator *iter)
2818{
2819 set_current_state(TASK_INTERRUPTIBLE);
2820 /* sleep for 100 msecs, and try again. */
2821 schedule_timeout(HZ / 10);
2822}
2823
2824/* Must be called with trace_types_lock mutex held. */
2825static int tracing_wait_pipe(struct file *filp)
2826{
2827 struct trace_iterator *iter = filp->private_data;
3193 2828
3194waitagain:
3195 sret = 0;
3196 while (trace_empty(iter)) { 2829 while (trace_empty(iter)) {
3197 2830
3198 if ((filp->f_flags & O_NONBLOCK)) { 2831 if ((filp->f_flags & O_NONBLOCK)) {
3199 sret = -EAGAIN; 2832 return -EAGAIN;
3200 goto out;
3201 } 2833 }
3202 2834
3203 /* 2835 mutex_unlock(&iter->mutex);
3204 * This is a make-shift waitqueue. The reason we don't use
3205 * an actual wait queue is because:
3206 * 1) we only ever have one waiter
3207 * 2) the tracing, traces all functions, we don't want
3208 * the overhead of calling wake_up and friends
3209 * (and tracing them too)
3210 * Anyway, this is really very primitive wakeup.
3211 */
3212 set_current_state(TASK_INTERRUPTIBLE);
3213 iter->tr->waiter = current;
3214
3215 mutex_unlock(&trace_types_lock);
3216
3217 /* sleep for 100 msecs, and try again. */
3218 schedule_timeout(HZ/10);
3219 2836
3220 mutex_lock(&trace_types_lock); 2837 iter->trace->wait_pipe(iter);
3221 2838
3222 iter->tr->waiter = NULL; 2839 mutex_lock(&iter->mutex);
3223
3224 if (signal_pending(current)) {
3225 sret = -EINTR;
3226 goto out;
3227 }
3228 2840
3229 if (iter->trace != current_trace) 2841 if (signal_pending(current))
3230 goto out; 2842 return -EINTR;
3231 2843
3232 /* 2844 /*
3233 * We block until we read something and tracing is disabled. 2845 * We block until we read something and tracing is disabled.
@@ -3240,13 +2852,59 @@ waitagain:
3240 */ 2852 */
3241 if (!tracer_enabled && iter->pos) 2853 if (!tracer_enabled && iter->pos)
3242 break; 2854 break;
2855 }
2856
2857 return 1;
2858}
3243 2859
3244 continue; 2860/*
2861 * Consumer reader.
2862 */
2863static ssize_t
2864tracing_read_pipe(struct file *filp, char __user *ubuf,
2865 size_t cnt, loff_t *ppos)
2866{
2867 struct trace_iterator *iter = filp->private_data;
2868 static struct tracer *old_tracer;
2869 ssize_t sret;
2870
2871 /* return any leftover data */
2872 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
2873 if (sret != -EBUSY)
2874 return sret;
2875
2876 trace_seq_init(&iter->seq);
2877
2878 /* copy the tracer to avoid using a global lock all around */
2879 mutex_lock(&trace_types_lock);
2880 if (unlikely(old_tracer != current_trace && current_trace)) {
2881 old_tracer = current_trace;
2882 *iter->trace = *current_trace;
2883 }
2884 mutex_unlock(&trace_types_lock);
2885
2886 /*
2887 * Avoid more than one consumer on a single file descriptor
2888 * This is just a matter of traces coherency, the ring buffer itself
2889 * is protected.
2890 */
2891 mutex_lock(&iter->mutex);
2892 if (iter->trace->read) {
2893 sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
2894 if (sret)
2895 goto out;
3245 } 2896 }
3246 2897
2898waitagain:
2899 sret = tracing_wait_pipe(filp);
2900 if (sret <= 0)
2901 goto out;
2902
3247 /* stop when tracing is finished */ 2903 /* stop when tracing is finished */
3248 if (trace_empty(iter)) 2904 if (trace_empty(iter)) {
2905 sret = 0;
3249 goto out; 2906 goto out;
2907 }
3250 2908
3251 if (cnt >= PAGE_SIZE) 2909 if (cnt >= PAGE_SIZE)
3252 cnt = PAGE_SIZE - 1; 2910 cnt = PAGE_SIZE - 1;
@@ -3267,8 +2925,8 @@ waitagain:
3267 iter->seq.len = len; 2925 iter->seq.len = len;
3268 break; 2926 break;
3269 } 2927 }
3270 2928 if (ret != TRACE_TYPE_NO_CONSUME)
3271 trace_consume(iter); 2929 trace_consume(iter);
3272 2930
3273 if (iter->seq.len >= cnt) 2931 if (iter->seq.len >= cnt)
3274 break; 2932 break;
@@ -3277,7 +2935,7 @@ waitagain:
3277 /* Now copy what we have to the user */ 2935 /* Now copy what we have to the user */
3278 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 2936 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
3279 if (iter->seq.readpos >= iter->seq.len) 2937 if (iter->seq.readpos >= iter->seq.len)
3280 trace_seq_reset(&iter->seq); 2938 trace_seq_init(&iter->seq);
3281 2939
3282 /* 2940 /*
3283 * If there was nothing to send to user, inspite of consuming trace 2941 * If there was nothing to send to user, inspite of consuming trace
@@ -3287,20 +2945,165 @@ waitagain:
3287 goto waitagain; 2945 goto waitagain;
3288 2946
3289out: 2947out:
3290 mutex_unlock(&trace_types_lock); 2948 mutex_unlock(&iter->mutex);
3291 2949
3292 return sret; 2950 return sret;
3293} 2951}
3294 2952
2953static void tracing_pipe_buf_release(struct pipe_inode_info *pipe,
2954 struct pipe_buffer *buf)
2955{
2956 __free_page(buf->page);
2957}
2958
2959static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
2960 unsigned int idx)
2961{
2962 __free_page(spd->pages[idx]);
2963}
2964
2965static struct pipe_buf_operations tracing_pipe_buf_ops = {
2966 .can_merge = 0,
2967 .map = generic_pipe_buf_map,
2968 .unmap = generic_pipe_buf_unmap,
2969 .confirm = generic_pipe_buf_confirm,
2970 .release = tracing_pipe_buf_release,
2971 .steal = generic_pipe_buf_steal,
2972 .get = generic_pipe_buf_get,
2973};
2974
2975static size_t
2976tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
2977{
2978 size_t count;
2979 int ret;
2980
2981 /* Seq buffer is page-sized, exactly what we need. */
2982 for (;;) {
2983 count = iter->seq.len;
2984 ret = print_trace_line(iter);
2985 count = iter->seq.len - count;
2986 if (rem < count) {
2987 rem = 0;
2988 iter->seq.len -= count;
2989 break;
2990 }
2991 if (ret == TRACE_TYPE_PARTIAL_LINE) {
2992 iter->seq.len -= count;
2993 break;
2994 }
2995
2996 trace_consume(iter);
2997 rem -= count;
2998 if (!find_next_entry_inc(iter)) {
2999 rem = 0;
3000 iter->ent = NULL;
3001 break;
3002 }
3003 }
3004
3005 return rem;
3006}
3007
3008static ssize_t tracing_splice_read_pipe(struct file *filp,
3009 loff_t *ppos,
3010 struct pipe_inode_info *pipe,
3011 size_t len,
3012 unsigned int flags)
3013{
3014 struct page *pages[PIPE_BUFFERS];
3015 struct partial_page partial[PIPE_BUFFERS];
3016 struct trace_iterator *iter = filp->private_data;
3017 struct splice_pipe_desc spd = {
3018 .pages = pages,
3019 .partial = partial,
3020 .nr_pages = 0, /* This gets updated below. */
3021 .flags = flags,
3022 .ops = &tracing_pipe_buf_ops,
3023 .spd_release = tracing_spd_release_pipe,
3024 };
3025 static struct tracer *old_tracer;
3026 ssize_t ret;
3027 size_t rem;
3028 unsigned int i;
3029
3030 /* copy the tracer to avoid using a global lock all around */
3031 mutex_lock(&trace_types_lock);
3032 if (unlikely(old_tracer != current_trace && current_trace)) {
3033 old_tracer = current_trace;
3034 *iter->trace = *current_trace;
3035 }
3036 mutex_unlock(&trace_types_lock);
3037
3038 mutex_lock(&iter->mutex);
3039
3040 if (iter->trace->splice_read) {
3041 ret = iter->trace->splice_read(iter, filp,
3042 ppos, pipe, len, flags);
3043 if (ret)
3044 goto out_err;
3045 }
3046
3047 ret = tracing_wait_pipe(filp);
3048 if (ret <= 0)
3049 goto out_err;
3050
3051 if (!iter->ent && !find_next_entry_inc(iter)) {
3052 ret = -EFAULT;
3053 goto out_err;
3054 }
3055
3056 /* Fill as many pages as possible. */
3057 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
3058 pages[i] = alloc_page(GFP_KERNEL);
3059 if (!pages[i])
3060 break;
3061
3062 rem = tracing_fill_pipe_page(rem, iter);
3063
3064 /* Copy the data into the page, so we can start over. */
3065 ret = trace_seq_to_buffer(&iter->seq,
3066 page_address(pages[i]),
3067 iter->seq.len);
3068 if (ret < 0) {
3069 __free_page(pages[i]);
3070 break;
3071 }
3072 partial[i].offset = 0;
3073 partial[i].len = iter->seq.len;
3074
3075 trace_seq_init(&iter->seq);
3076 }
3077
3078 mutex_unlock(&iter->mutex);
3079
3080 spd.nr_pages = i;
3081
3082 return splice_to_pipe(pipe, &spd);
3083
3084out_err:
3085 mutex_unlock(&iter->mutex);
3086
3087 return ret;
3088}
3089
3295static ssize_t 3090static ssize_t
3296tracing_entries_read(struct file *filp, char __user *ubuf, 3091tracing_entries_read(struct file *filp, char __user *ubuf,
3297 size_t cnt, loff_t *ppos) 3092 size_t cnt, loff_t *ppos)
3298{ 3093{
3299 struct trace_array *tr = filp->private_data; 3094 struct trace_array *tr = filp->private_data;
3300 char buf[64]; 3095 char buf[96];
3301 int r; 3096 int r;
3302 3097
3303 r = sprintf(buf, "%lu\n", tr->entries >> 10); 3098 mutex_lock(&trace_types_lock);
3099 if (!ring_buffer_expanded)
3100 r = sprintf(buf, "%lu (expanded: %lu)\n",
3101 tr->entries >> 10,
3102 trace_buf_size >> 10);
3103 else
3104 r = sprintf(buf, "%lu\n", tr->entries >> 10);
3105 mutex_unlock(&trace_types_lock);
3106
3304 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3107 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3305} 3108}
3306 3109
@@ -3344,28 +3147,11 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3344 val <<= 10; 3147 val <<= 10;
3345 3148
3346 if (val != global_trace.entries) { 3149 if (val != global_trace.entries) {
3347 ret = ring_buffer_resize(global_trace.buffer, val); 3150 ret = tracing_resize_ring_buffer(val);
3348 if (ret < 0) {
3349 cnt = ret;
3350 goto out;
3351 }
3352
3353 ret = ring_buffer_resize(max_tr.buffer, val);
3354 if (ret < 0) { 3151 if (ret < 0) {
3355 int r;
3356 cnt = ret; 3152 cnt = ret;
3357 r = ring_buffer_resize(global_trace.buffer,
3358 global_trace.entries);
3359 if (r < 0) {
3360 /* AARGH! We are left with different
3361 * size max buffer!!!! */
3362 WARN_ON(1);
3363 tracing_disabled = 1;
3364 }
3365 goto out; 3153 goto out;
3366 } 3154 }
3367
3368 global_trace.entries = val;
3369 } 3155 }
3370 3156
3371 filp->f_pos += cnt; 3157 filp->f_pos += cnt;
@@ -3393,7 +3179,7 @@ static int mark_printk(const char *fmt, ...)
3393 int ret; 3179 int ret;
3394 va_list args; 3180 va_list args;
3395 va_start(args, fmt); 3181 va_start(args, fmt);
3396 ret = trace_vprintk(0, -1, fmt, args); 3182 ret = trace_vprintk(0, fmt, args);
3397 va_end(args); 3183 va_end(args);
3398 return ret; 3184 return ret;
3399} 3185}
@@ -3433,42 +3219,288 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3433 return cnt; 3219 return cnt;
3434} 3220}
3435 3221
3436static struct file_operations tracing_max_lat_fops = { 3222static const struct file_operations tracing_max_lat_fops = {
3437 .open = tracing_open_generic, 3223 .open = tracing_open_generic,
3438 .read = tracing_max_lat_read, 3224 .read = tracing_max_lat_read,
3439 .write = tracing_max_lat_write, 3225 .write = tracing_max_lat_write,
3440}; 3226};
3441 3227
3442static struct file_operations tracing_ctrl_fops = { 3228static const struct file_operations tracing_ctrl_fops = {
3443 .open = tracing_open_generic, 3229 .open = tracing_open_generic,
3444 .read = tracing_ctrl_read, 3230 .read = tracing_ctrl_read,
3445 .write = tracing_ctrl_write, 3231 .write = tracing_ctrl_write,
3446}; 3232};
3447 3233
3448static struct file_operations set_tracer_fops = { 3234static const struct file_operations set_tracer_fops = {
3449 .open = tracing_open_generic, 3235 .open = tracing_open_generic,
3450 .read = tracing_set_trace_read, 3236 .read = tracing_set_trace_read,
3451 .write = tracing_set_trace_write, 3237 .write = tracing_set_trace_write,
3452}; 3238};
3453 3239
3454static struct file_operations tracing_pipe_fops = { 3240static const struct file_operations tracing_pipe_fops = {
3455 .open = tracing_open_pipe, 3241 .open = tracing_open_pipe,
3456 .poll = tracing_poll_pipe, 3242 .poll = tracing_poll_pipe,
3457 .read = tracing_read_pipe, 3243 .read = tracing_read_pipe,
3244 .splice_read = tracing_splice_read_pipe,
3458 .release = tracing_release_pipe, 3245 .release = tracing_release_pipe,
3459}; 3246};
3460 3247
3461static struct file_operations tracing_entries_fops = { 3248static const struct file_operations tracing_entries_fops = {
3462 .open = tracing_open_generic, 3249 .open = tracing_open_generic,
3463 .read = tracing_entries_read, 3250 .read = tracing_entries_read,
3464 .write = tracing_entries_write, 3251 .write = tracing_entries_write,
3465}; 3252};
3466 3253
3467static struct file_operations tracing_mark_fops = { 3254static const struct file_operations tracing_mark_fops = {
3468 .open = tracing_open_generic, 3255 .open = tracing_open_generic,
3469 .write = tracing_mark_write, 3256 .write = tracing_mark_write,
3470}; 3257};
3471 3258
3259struct ftrace_buffer_info {
3260 struct trace_array *tr;
3261 void *spare;
3262 int cpu;
3263 unsigned int read;
3264};
3265
3266static int tracing_buffers_open(struct inode *inode, struct file *filp)
3267{
3268 int cpu = (int)(long)inode->i_private;
3269 struct ftrace_buffer_info *info;
3270
3271 if (tracing_disabled)
3272 return -ENODEV;
3273
3274 info = kzalloc(sizeof(*info), GFP_KERNEL);
3275 if (!info)
3276 return -ENOMEM;
3277
3278 info->tr = &global_trace;
3279 info->cpu = cpu;
3280 info->spare = ring_buffer_alloc_read_page(info->tr->buffer);
3281 /* Force reading ring buffer for first read */
3282 info->read = (unsigned int)-1;
3283 if (!info->spare)
3284 goto out;
3285
3286 filp->private_data = info;
3287
3288 return 0;
3289
3290 out:
3291 kfree(info);
3292 return -ENOMEM;
3293}
3294
3295static ssize_t
3296tracing_buffers_read(struct file *filp, char __user *ubuf,
3297 size_t count, loff_t *ppos)
3298{
3299 struct ftrace_buffer_info *info = filp->private_data;
3300 unsigned int pos;
3301 ssize_t ret;
3302 size_t size;
3303
3304 if (!count)
3305 return 0;
3306
3307 /* Do we have previous read data to read? */
3308 if (info->read < PAGE_SIZE)
3309 goto read;
3310
3311 info->read = 0;
3312
3313 ret = ring_buffer_read_page(info->tr->buffer,
3314 &info->spare,
3315 count,
3316 info->cpu, 0);
3317 if (ret < 0)
3318 return 0;
3319
3320 pos = ring_buffer_page_len(info->spare);
3321
3322 if (pos < PAGE_SIZE)
3323 memset(info->spare + pos, 0, PAGE_SIZE - pos);
3324
3325read:
3326 size = PAGE_SIZE - info->read;
3327 if (size > count)
3328 size = count;
3329
3330 ret = copy_to_user(ubuf, info->spare + info->read, size);
3331 if (ret == size)
3332 return -EFAULT;
3333 size -= ret;
3334
3335 *ppos += size;
3336 info->read += size;
3337
3338 return size;
3339}
3340
3341static int tracing_buffers_release(struct inode *inode, struct file *file)
3342{
3343 struct ftrace_buffer_info *info = file->private_data;
3344
3345 ring_buffer_free_read_page(info->tr->buffer, info->spare);
3346 kfree(info);
3347
3348 return 0;
3349}
3350
3351struct buffer_ref {
3352 struct ring_buffer *buffer;
3353 void *page;
3354 int ref;
3355};
3356
3357static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
3358 struct pipe_buffer *buf)
3359{
3360 struct buffer_ref *ref = (struct buffer_ref *)buf->private;
3361
3362 if (--ref->ref)
3363 return;
3364
3365 ring_buffer_free_read_page(ref->buffer, ref->page);
3366 kfree(ref);
3367 buf->private = 0;
3368}
3369
3370static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe,
3371 struct pipe_buffer *buf)
3372{
3373 return 1;
3374}
3375
3376static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
3377 struct pipe_buffer *buf)
3378{
3379 struct buffer_ref *ref = (struct buffer_ref *)buf->private;
3380
3381 ref->ref++;
3382}
3383
3384/* Pipe buffer operations for a buffer. */
3385static struct pipe_buf_operations buffer_pipe_buf_ops = {
3386 .can_merge = 0,
3387 .map = generic_pipe_buf_map,
3388 .unmap = generic_pipe_buf_unmap,
3389 .confirm = generic_pipe_buf_confirm,
3390 .release = buffer_pipe_buf_release,
3391 .steal = buffer_pipe_buf_steal,
3392 .get = buffer_pipe_buf_get,
3393};
3394
3395/*
3396 * Callback from splice_to_pipe(), if we need to release some pages
3397 * at the end of the spd in case we error'ed out in filling the pipe.
3398 */
3399static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
3400{
3401 struct buffer_ref *ref =
3402 (struct buffer_ref *)spd->partial[i].private;
3403
3404 if (--ref->ref)
3405 return;
3406
3407 ring_buffer_free_read_page(ref->buffer, ref->page);
3408 kfree(ref);
3409 spd->partial[i].private = 0;
3410}
3411
3412static ssize_t
3413tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3414 struct pipe_inode_info *pipe, size_t len,
3415 unsigned int flags)
3416{
3417 struct ftrace_buffer_info *info = file->private_data;
3418 struct partial_page partial[PIPE_BUFFERS];
3419 struct page *pages[PIPE_BUFFERS];
3420 struct splice_pipe_desc spd = {
3421 .pages = pages,
3422 .partial = partial,
3423 .flags = flags,
3424 .ops = &buffer_pipe_buf_ops,
3425 .spd_release = buffer_spd_release,
3426 };
3427 struct buffer_ref *ref;
3428 int size, i;
3429 size_t ret;
3430
3431 /*
3432 * We can't seek on a buffer input
3433 */
3434 if (unlikely(*ppos))
3435 return -ESPIPE;
3436
3437
3438 for (i = 0; i < PIPE_BUFFERS && len; i++, len -= size) {
3439 struct page *page;
3440 int r;
3441
3442 ref = kzalloc(sizeof(*ref), GFP_KERNEL);
3443 if (!ref)
3444 break;
3445
3446 ref->buffer = info->tr->buffer;
3447 ref->page = ring_buffer_alloc_read_page(ref->buffer);
3448 if (!ref->page) {
3449 kfree(ref);
3450 break;
3451 }
3452
3453 r = ring_buffer_read_page(ref->buffer, &ref->page,
3454 len, info->cpu, 0);
3455 if (r < 0) {
3456 ring_buffer_free_read_page(ref->buffer,
3457 ref->page);
3458 kfree(ref);
3459 break;
3460 }
3461
3462 /*
3463 * zero out any left over data, this is going to
3464 * user land.
3465 */
3466 size = ring_buffer_page_len(ref->page);
3467 if (size < PAGE_SIZE)
3468 memset(ref->page + size, 0, PAGE_SIZE - size);
3469
3470 page = virt_to_page(ref->page);
3471
3472 spd.pages[i] = page;
3473 spd.partial[i].len = PAGE_SIZE;
3474 spd.partial[i].offset = 0;
3475 spd.partial[i].private = (unsigned long)ref;
3476 spd.nr_pages++;
3477 }
3478
3479 spd.nr_pages = i;
3480
3481 /* did we read anything? */
3482 if (!spd.nr_pages) {
3483 if (flags & SPLICE_F_NONBLOCK)
3484 ret = -EAGAIN;
3485 else
3486 ret = 0;
3487 /* TODO: block */
3488 return ret;
3489 }
3490
3491 ret = splice_to_pipe(pipe, &spd);
3492
3493 return ret;
3494}
3495
3496static const struct file_operations tracing_buffers_fops = {
3497 .open = tracing_buffers_open,
3498 .read = tracing_buffers_read,
3499 .release = tracing_buffers_release,
3500 .splice_read = tracing_buffers_splice_read,
3501 .llseek = no_llseek,
3502};
3503
3472#ifdef CONFIG_DYNAMIC_FTRACE 3504#ifdef CONFIG_DYNAMIC_FTRACE
3473 3505
3474int __weak ftrace_arch_read_dyn_info(char *buf, int size) 3506int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3500,7 +3532,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
3500 return r; 3532 return r;
3501} 3533}
3502 3534
3503static struct file_operations tracing_dyn_info_fops = { 3535static const struct file_operations tracing_dyn_info_fops = {
3504 .open = tracing_open_generic, 3536 .open = tracing_open_generic,
3505 .read = tracing_read_dyn_info, 3537 .read = tracing_read_dyn_info,
3506}; 3538};
@@ -3515,6 +3547,9 @@ struct dentry *tracing_init_dentry(void)
3515 if (d_tracer) 3547 if (d_tracer)
3516 return d_tracer; 3548 return d_tracer;
3517 3549
3550 if (!debugfs_initialized())
3551 return NULL;
3552
3518 d_tracer = debugfs_create_dir("tracing", NULL); 3553 d_tracer = debugfs_create_dir("tracing", NULL);
3519 3554
3520 if (!d_tracer && !once) { 3555 if (!d_tracer && !once) {
@@ -3526,15 +3561,350 @@ struct dentry *tracing_init_dentry(void)
3526 return d_tracer; 3561 return d_tracer;
3527} 3562}
3528 3563
3564static struct dentry *d_percpu;
3565
3566struct dentry *tracing_dentry_percpu(void)
3567{
3568 static int once;
3569 struct dentry *d_tracer;
3570
3571 if (d_percpu)
3572 return d_percpu;
3573
3574 d_tracer = tracing_init_dentry();
3575
3576 if (!d_tracer)
3577 return NULL;
3578
3579 d_percpu = debugfs_create_dir("per_cpu", d_tracer);
3580
3581 if (!d_percpu && !once) {
3582 once = 1;
3583 pr_warning("Could not create debugfs directory 'per_cpu'\n");
3584 return NULL;
3585 }
3586
3587 return d_percpu;
3588}
3589
3590static void tracing_init_debugfs_percpu(long cpu)
3591{
3592 struct dentry *d_percpu = tracing_dentry_percpu();
3593 struct dentry *entry, *d_cpu;
3594 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */
3595 char cpu_dir[7];
3596
3597 if (cpu > 999 || cpu < 0)
3598 return;
3599
3600 sprintf(cpu_dir, "cpu%ld", cpu);
3601 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
3602 if (!d_cpu) {
3603 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
3604 return;
3605 }
3606
3607 /* per cpu trace_pipe */
3608 entry = debugfs_create_file("trace_pipe", 0444, d_cpu,
3609 (void *) cpu, &tracing_pipe_fops);
3610 if (!entry)
3611 pr_warning("Could not create debugfs 'trace_pipe' entry\n");
3612
3613 /* per cpu trace */
3614 entry = debugfs_create_file("trace", 0644, d_cpu,
3615 (void *) cpu, &tracing_fops);
3616 if (!entry)
3617 pr_warning("Could not create debugfs 'trace' entry\n");
3618
3619 entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu,
3620 (void *) cpu, &tracing_buffers_fops);
3621 if (!entry)
3622 pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n");
3623}
3624
3529#ifdef CONFIG_FTRACE_SELFTEST 3625#ifdef CONFIG_FTRACE_SELFTEST
3530/* Let selftest have access to static functions in this file */ 3626/* Let selftest have access to static functions in this file */
3531#include "trace_selftest.c" 3627#include "trace_selftest.c"
3532#endif 3628#endif
3533 3629
3630struct trace_option_dentry {
3631 struct tracer_opt *opt;
3632 struct tracer_flags *flags;
3633 struct dentry *entry;
3634};
3635
3636static ssize_t
3637trace_options_read(struct file *filp, char __user *ubuf, size_t cnt,
3638 loff_t *ppos)
3639{
3640 struct trace_option_dentry *topt = filp->private_data;
3641 char *buf;
3642
3643 if (topt->flags->val & topt->opt->bit)
3644 buf = "1\n";
3645 else
3646 buf = "0\n";
3647
3648 return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
3649}
3650
3651static ssize_t
3652trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
3653 loff_t *ppos)
3654{
3655 struct trace_option_dentry *topt = filp->private_data;
3656 unsigned long val;
3657 char buf[64];
3658 int ret;
3659
3660 if (cnt >= sizeof(buf))
3661 return -EINVAL;
3662
3663 if (copy_from_user(&buf, ubuf, cnt))
3664 return -EFAULT;
3665
3666 buf[cnt] = 0;
3667
3668 ret = strict_strtoul(buf, 10, &val);
3669 if (ret < 0)
3670 return ret;
3671
3672 ret = 0;
3673 switch (val) {
3674 case 0:
3675 /* do nothing if already cleared */
3676 if (!(topt->flags->val & topt->opt->bit))
3677 break;
3678
3679 mutex_lock(&trace_types_lock);
3680 if (current_trace->set_flag)
3681 ret = current_trace->set_flag(topt->flags->val,
3682 topt->opt->bit, 0);
3683 mutex_unlock(&trace_types_lock);
3684 if (ret)
3685 return ret;
3686 topt->flags->val &= ~topt->opt->bit;
3687 break;
3688 case 1:
3689 /* do nothing if already set */
3690 if (topt->flags->val & topt->opt->bit)
3691 break;
3692
3693 mutex_lock(&trace_types_lock);
3694 if (current_trace->set_flag)
3695 ret = current_trace->set_flag(topt->flags->val,
3696 topt->opt->bit, 1);
3697 mutex_unlock(&trace_types_lock);
3698 if (ret)
3699 return ret;
3700 topt->flags->val |= topt->opt->bit;
3701 break;
3702
3703 default:
3704 return -EINVAL;
3705 }
3706
3707 *ppos += cnt;
3708
3709 return cnt;
3710}
3711
3712
3713static const struct file_operations trace_options_fops = {
3714 .open = tracing_open_generic,
3715 .read = trace_options_read,
3716 .write = trace_options_write,
3717};
3718
3719static ssize_t
3720trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt,
3721 loff_t *ppos)
3722{
3723 long index = (long)filp->private_data;
3724 char *buf;
3725
3726 if (trace_flags & (1 << index))
3727 buf = "1\n";
3728 else
3729 buf = "0\n";
3730
3731 return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
3732}
3733
3734static ssize_t
3735trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
3736 loff_t *ppos)
3737{
3738 long index = (long)filp->private_data;
3739 char buf[64];
3740 unsigned long val;
3741 int ret;
3742
3743 if (cnt >= sizeof(buf))
3744 return -EINVAL;
3745
3746 if (copy_from_user(&buf, ubuf, cnt))
3747 return -EFAULT;
3748
3749 buf[cnt] = 0;
3750
3751 ret = strict_strtoul(buf, 10, &val);
3752 if (ret < 0)
3753 return ret;
3754
3755 switch (val) {
3756 case 0:
3757 trace_flags &= ~(1 << index);
3758 break;
3759 case 1:
3760 trace_flags |= 1 << index;
3761 break;
3762
3763 default:
3764 return -EINVAL;
3765 }
3766
3767 *ppos += cnt;
3768
3769 return cnt;
3770}
3771
3772static const struct file_operations trace_options_core_fops = {
3773 .open = tracing_open_generic,
3774 .read = trace_options_core_read,
3775 .write = trace_options_core_write,
3776};
3777
3778static struct dentry *trace_options_init_dentry(void)
3779{
3780 struct dentry *d_tracer;
3781 static struct dentry *t_options;
3782
3783 if (t_options)
3784 return t_options;
3785
3786 d_tracer = tracing_init_dentry();
3787 if (!d_tracer)
3788 return NULL;
3789
3790 t_options = debugfs_create_dir("options", d_tracer);
3791 if (!t_options) {
3792 pr_warning("Could not create debugfs directory 'options'\n");
3793 return NULL;
3794 }
3795
3796 return t_options;
3797}
3798
3799static void
3800create_trace_option_file(struct trace_option_dentry *topt,
3801 struct tracer_flags *flags,
3802 struct tracer_opt *opt)
3803{
3804 struct dentry *t_options;
3805 struct dentry *entry;
3806
3807 t_options = trace_options_init_dentry();
3808 if (!t_options)
3809 return;
3810
3811 topt->flags = flags;
3812 topt->opt = opt;
3813
3814 entry = debugfs_create_file(opt->name, 0644, t_options, topt,
3815 &trace_options_fops);
3816
3817 topt->entry = entry;
3818
3819}
3820
3821static struct trace_option_dentry *
3822create_trace_option_files(struct tracer *tracer)
3823{
3824 struct trace_option_dentry *topts;
3825 struct tracer_flags *flags;
3826 struct tracer_opt *opts;
3827 int cnt;
3828
3829 if (!tracer)
3830 return NULL;
3831
3832 flags = tracer->flags;
3833
3834 if (!flags || !flags->opts)
3835 return NULL;
3836
3837 opts = flags->opts;
3838
3839 for (cnt = 0; opts[cnt].name; cnt++)
3840 ;
3841
3842 topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL);
3843 if (!topts)
3844 return NULL;
3845
3846 for (cnt = 0; opts[cnt].name; cnt++)
3847 create_trace_option_file(&topts[cnt], flags,
3848 &opts[cnt]);
3849
3850 return topts;
3851}
3852
3853static void
3854destroy_trace_option_files(struct trace_option_dentry *topts)
3855{
3856 int cnt;
3857
3858 if (!topts)
3859 return;
3860
3861 for (cnt = 0; topts[cnt].opt; cnt++) {
3862 if (topts[cnt].entry)
3863 debugfs_remove(topts[cnt].entry);
3864 }
3865
3866 kfree(topts);
3867}
3868
3869static struct dentry *
3870create_trace_option_core_file(const char *option, long index)
3871{
3872 struct dentry *t_options;
3873 struct dentry *entry;
3874
3875 t_options = trace_options_init_dentry();
3876 if (!t_options)
3877 return NULL;
3878
3879 entry = debugfs_create_file(option, 0644, t_options, (void *)index,
3880 &trace_options_core_fops);
3881
3882 return entry;
3883}
3884
3885static __init void create_trace_options_dir(void)
3886{
3887 struct dentry *t_options;
3888 struct dentry *entry;
3889 int i;
3890
3891 t_options = trace_options_init_dentry();
3892 if (!t_options)
3893 return;
3894
3895 for (i = 0; trace_options[i]; i++) {
3896 entry = create_trace_option_core_file(trace_options[i], i);
3897 if (!entry)
3898 pr_warning("Could not create debugfs %s entry\n",
3899 trace_options[i]);
3900 }
3901}
3902
3534static __init int tracer_init_debugfs(void) 3903static __init int tracer_init_debugfs(void)
3535{ 3904{
3536 struct dentry *d_tracer; 3905 struct dentry *d_tracer;
3537 struct dentry *entry; 3906 struct dentry *entry;
3907 int cpu;
3538 3908
3539 d_tracer = tracing_init_dentry(); 3909 d_tracer = tracing_init_dentry();
3540 3910
@@ -3548,18 +3918,15 @@ static __init int tracer_init_debugfs(void)
3548 if (!entry) 3918 if (!entry)
3549 pr_warning("Could not create debugfs 'trace_options' entry\n"); 3919 pr_warning("Could not create debugfs 'trace_options' entry\n");
3550 3920
3921 create_trace_options_dir();
3922
3551 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, 3923 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
3552 NULL, &tracing_cpumask_fops); 3924 NULL, &tracing_cpumask_fops);
3553 if (!entry) 3925 if (!entry)
3554 pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); 3926 pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
3555 3927
3556 entry = debugfs_create_file("latency_trace", 0444, d_tracer, 3928 entry = debugfs_create_file("trace", 0644, d_tracer,
3557 &global_trace, &tracing_lt_fops); 3929 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
3558 if (!entry)
3559 pr_warning("Could not create debugfs 'latency_trace' entry\n");
3560
3561 entry = debugfs_create_file("trace", 0444, d_tracer,
3562 &global_trace, &tracing_fops);
3563 if (!entry) 3930 if (!entry)
3564 pr_warning("Could not create debugfs 'trace' entry\n"); 3931 pr_warning("Could not create debugfs 'trace' entry\n");
3565 3932
@@ -3590,8 +3957,8 @@ static __init int tracer_init_debugfs(void)
3590 if (!entry) 3957 if (!entry)
3591 pr_warning("Could not create debugfs 'README' entry\n"); 3958 pr_warning("Could not create debugfs 'README' entry\n");
3592 3959
3593 entry = debugfs_create_file("trace_pipe", 0644, d_tracer, 3960 entry = debugfs_create_file("trace_pipe", 0444, d_tracer,
3594 NULL, &tracing_pipe_fops); 3961 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
3595 if (!entry) 3962 if (!entry)
3596 pr_warning("Could not create debugfs " 3963 pr_warning("Could not create debugfs "
3597 "'trace_pipe' entry\n"); 3964 "'trace_pipe' entry\n");
@@ -3619,77 +3986,12 @@ static __init int tracer_init_debugfs(void)
3619#ifdef CONFIG_SYSPROF_TRACER 3986#ifdef CONFIG_SYSPROF_TRACER
3620 init_tracer_sysprof_debugfs(d_tracer); 3987 init_tracer_sysprof_debugfs(d_tracer);
3621#endif 3988#endif
3622 return 0;
3623}
3624
3625int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
3626{
3627 static DEFINE_SPINLOCK(trace_buf_lock);
3628 static char trace_buf[TRACE_BUF_SIZE];
3629
3630 struct ring_buffer_event *event;
3631 struct trace_array *tr = &global_trace;
3632 struct trace_array_cpu *data;
3633 int cpu, len = 0, size, pc;
3634 struct print_entry *entry;
3635 unsigned long irq_flags;
3636
3637 if (tracing_disabled || tracing_selftest_running)
3638 return 0;
3639
3640 pc = preempt_count();
3641 preempt_disable_notrace();
3642 cpu = raw_smp_processor_id();
3643 data = tr->data[cpu];
3644
3645 if (unlikely(atomic_read(&data->disabled)))
3646 goto out;
3647
3648 pause_graph_tracing();
3649 spin_lock_irqsave(&trace_buf_lock, irq_flags);
3650 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
3651
3652 len = min(len, TRACE_BUF_SIZE-1);
3653 trace_buf[len] = 0;
3654
3655 size = sizeof(*entry) + len + 1;
3656 event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags);
3657 if (!event)
3658 goto out_unlock;
3659 entry = ring_buffer_event_data(event);
3660 tracing_generic_entry_update(&entry->ent, irq_flags, pc);
3661 entry->ent.type = TRACE_PRINT;
3662 entry->ip = ip;
3663 entry->depth = depth;
3664
3665 memcpy(&entry->buf, trace_buf, len);
3666 entry->buf[len] = 0;
3667 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
3668
3669 out_unlock:
3670 spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
3671 unpause_graph_tracing();
3672 out:
3673 preempt_enable_notrace();
3674
3675 return len;
3676}
3677EXPORT_SYMBOL_GPL(trace_vprintk);
3678 3989
3679int __ftrace_printk(unsigned long ip, const char *fmt, ...) 3990 for_each_tracing_cpu(cpu)
3680{ 3991 tracing_init_debugfs_percpu(cpu);
3681 int ret;
3682 va_list ap;
3683
3684 if (!(trace_flags & TRACE_ITER_PRINTK))
3685 return 0;
3686 3992
3687 va_start(ap, fmt); 3993 return 0;
3688 ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
3689 va_end(ap);
3690 return ret;
3691} 3994}
3692EXPORT_SYMBOL_GPL(__ftrace_printk);
3693 3995
3694static int trace_panic_handler(struct notifier_block *this, 3996static int trace_panic_handler(struct notifier_block *this,
3695 unsigned long event, void *unused) 3997 unsigned long event, void *unused)
@@ -3736,7 +4038,7 @@ static struct notifier_block trace_die_notifier = {
3736 * it if we decide to change what log level the ftrace dump 4038 * it if we decide to change what log level the ftrace dump
3737 * should be at. 4039 * should be at.
3738 */ 4040 */
3739#define KERN_TRACE KERN_INFO 4041#define KERN_TRACE KERN_EMERG
3740 4042
3741static void 4043static void
3742trace_printk_seq(struct trace_seq *s) 4044trace_printk_seq(struct trace_seq *s)
@@ -3750,14 +4052,15 @@ trace_printk_seq(struct trace_seq *s)
3750 4052
3751 printk(KERN_TRACE "%s", s->buffer); 4053 printk(KERN_TRACE "%s", s->buffer);
3752 4054
3753 trace_seq_reset(s); 4055 trace_seq_init(s);
3754} 4056}
3755 4057
3756void ftrace_dump(void) 4058static void __ftrace_dump(bool disable_tracing)
3757{ 4059{
3758 static DEFINE_SPINLOCK(ftrace_dump_lock); 4060 static DEFINE_SPINLOCK(ftrace_dump_lock);
3759 /* use static because iter can be a bit big for the stack */ 4061 /* use static because iter can be a bit big for the stack */
3760 static struct trace_iterator iter; 4062 static struct trace_iterator iter;
4063 unsigned int old_userobj;
3761 static int dump_ran; 4064 static int dump_ran;
3762 unsigned long flags; 4065 unsigned long flags;
3763 int cnt = 0, cpu; 4066 int cnt = 0, cpu;
@@ -3769,20 +4072,26 @@ void ftrace_dump(void)
3769 4072
3770 dump_ran = 1; 4073 dump_ran = 1;
3771 4074
3772 /* No turning back! */ 4075 tracing_off();
3773 ftrace_kill(); 4076
4077 if (disable_tracing)
4078 ftrace_kill();
3774 4079
3775 for_each_tracing_cpu(cpu) { 4080 for_each_tracing_cpu(cpu) {
3776 atomic_inc(&global_trace.data[cpu]->disabled); 4081 atomic_inc(&global_trace.data[cpu]->disabled);
3777 } 4082 }
3778 4083
4084 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
4085
3779 /* don't look at user memory in panic mode */ 4086 /* don't look at user memory in panic mode */
3780 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 4087 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
3781 4088
3782 printk(KERN_TRACE "Dumping ftrace buffer:\n"); 4089 printk(KERN_TRACE "Dumping ftrace buffer:\n");
3783 4090
4091 /* Simulate the iterator */
3784 iter.tr = &global_trace; 4092 iter.tr = &global_trace;
3785 iter.trace = current_trace; 4093 iter.trace = current_trace;
4094 iter.cpu_file = TRACE_PIPE_ALL_CPU;
3786 4095
3787 /* 4096 /*
3788 * We need to stop all tracing on all CPUS to read the 4097 * We need to stop all tracing on all CPUS to read the
@@ -3818,13 +4127,30 @@ void ftrace_dump(void)
3818 else 4127 else
3819 printk(KERN_TRACE "---------------------------------\n"); 4128 printk(KERN_TRACE "---------------------------------\n");
3820 4129
4130 /* Re-enable tracing if requested */
4131 if (!disable_tracing) {
4132 trace_flags |= old_userobj;
4133
4134 for_each_tracing_cpu(cpu) {
4135 atomic_dec(&global_trace.data[cpu]->disabled);
4136 }
4137 tracing_on();
4138 }
4139
3821 out: 4140 out:
3822 spin_unlock_irqrestore(&ftrace_dump_lock, flags); 4141 spin_unlock_irqrestore(&ftrace_dump_lock, flags);
3823} 4142}
3824 4143
4144/* By default: disable tracing after the dump */
4145void ftrace_dump(void)
4146{
4147 __ftrace_dump(true);
4148}
4149
3825__init static int tracer_alloc_buffers(void) 4150__init static int tracer_alloc_buffers(void)
3826{ 4151{
3827 struct trace_array_cpu *data; 4152 struct trace_array_cpu *data;
4153 int ring_buf_size;
3828 int i; 4154 int i;
3829 int ret = -ENOMEM; 4155 int ret = -ENOMEM;
3830 4156
@@ -3834,11 +4160,21 @@ __init static int tracer_alloc_buffers(void)
3834 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4160 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
3835 goto out_free_buffer_mask; 4161 goto out_free_buffer_mask;
3836 4162
4163 if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4164 goto out_free_tracing_cpumask;
4165
4166 /* To save memory, keep the ring buffer size to its minimum */
4167 if (ring_buffer_expanded)
4168 ring_buf_size = trace_buf_size;
4169 else
4170 ring_buf_size = 1;
4171
3837 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4172 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
3838 cpumask_copy(tracing_cpumask, cpu_all_mask); 4173 cpumask_copy(tracing_cpumask, cpu_all_mask);
4174 cpumask_clear(tracing_reader_cpumask);
3839 4175
3840 /* TODO: make the number of buffers hot pluggable with CPUS */ 4176 /* TODO: make the number of buffers hot pluggable with CPUS */
3841 global_trace.buffer = ring_buffer_alloc(trace_buf_size, 4177 global_trace.buffer = ring_buffer_alloc(ring_buf_size,
3842 TRACE_BUFFER_FLAGS); 4178 TRACE_BUFFER_FLAGS);
3843 if (!global_trace.buffer) { 4179 if (!global_trace.buffer) {
3844 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 4180 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
@@ -3849,7 +4185,7 @@ __init static int tracer_alloc_buffers(void)
3849 4185
3850 4186
3851#ifdef CONFIG_TRACER_MAX_TRACE 4187#ifdef CONFIG_TRACER_MAX_TRACE
3852 max_tr.buffer = ring_buffer_alloc(trace_buf_size, 4188 max_tr.buffer = ring_buffer_alloc(ring_buf_size,
3853 TRACE_BUFFER_FLAGS); 4189 TRACE_BUFFER_FLAGS);
3854 if (!max_tr.buffer) { 4190 if (!max_tr.buffer) {
3855 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4191 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
@@ -3870,14 +4206,10 @@ __init static int tracer_alloc_buffers(void)
3870 trace_init_cmdlines(); 4206 trace_init_cmdlines();
3871 4207
3872 register_tracer(&nop_trace); 4208 register_tracer(&nop_trace);
4209 current_trace = &nop_trace;
3873#ifdef CONFIG_BOOT_TRACER 4210#ifdef CONFIG_BOOT_TRACER
3874 register_tracer(&boot_tracer); 4211 register_tracer(&boot_tracer);
3875 current_trace = &boot_tracer;
3876 current_trace->init(&global_trace);
3877#else
3878 current_trace = &nop_trace;
3879#endif 4212#endif
3880
3881 /* All seems OK, enable tracing */ 4213 /* All seems OK, enable tracing */
3882 tracing_disabled = 0; 4214 tracing_disabled = 0;
3883 4215
@@ -3885,14 +4217,38 @@ __init static int tracer_alloc_buffers(void)
3885 &trace_panic_notifier); 4217 &trace_panic_notifier);
3886 4218
3887 register_die_notifier(&trace_die_notifier); 4219 register_die_notifier(&trace_die_notifier);
3888 ret = 0; 4220
4221 return 0;
3889 4222
3890out_free_cpumask: 4223out_free_cpumask:
4224 free_cpumask_var(tracing_reader_cpumask);
4225out_free_tracing_cpumask:
3891 free_cpumask_var(tracing_cpumask); 4226 free_cpumask_var(tracing_cpumask);
3892out_free_buffer_mask: 4227out_free_buffer_mask:
3893 free_cpumask_var(tracing_buffer_mask); 4228 free_cpumask_var(tracing_buffer_mask);
3894out: 4229out:
3895 return ret; 4230 return ret;
3896} 4231}
4232
4233__init static int clear_boot_tracer(void)
4234{
4235 /*
4236 * The default tracer at boot buffer is an init section.
4237 * This function is called in lateinit. If we did not
4238 * find the boot tracer, then clear it out, to prevent
4239 * later registration from accessing the buffer that is
4240 * about to be freed.
4241 */
4242 if (!default_bootup_tracer)
4243 return 0;
4244
4245 printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n",
4246 default_bootup_tracer);
4247 default_bootup_tracer = NULL;
4248
4249 return 0;
4250}
4251
3897early_initcall(tracer_alloc_buffers); 4252early_initcall(tracer_alloc_buffers);
3898fs_initcall(tracer_init_debugfs); 4253fs_initcall(tracer_init_debugfs);
4254late_initcall(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4d3d381bfd95..e685ac2b2ba1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,6 +9,8 @@
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <trace/boot.h> 11#include <trace/boot.h>
12#include <trace/kmemtrace.h>
13#include <trace/power.h>
12 14
13enum trace_type { 15enum trace_type {
14 __TRACE_FIRST_TYPE = 0, 16 __TRACE_FIRST_TYPE = 0,
@@ -16,9 +18,9 @@ enum trace_type {
16 TRACE_FN, 18 TRACE_FN,
17 TRACE_CTX, 19 TRACE_CTX,
18 TRACE_WAKE, 20 TRACE_WAKE,
19 TRACE_CONT,
20 TRACE_STACK, 21 TRACE_STACK,
21 TRACE_PRINT, 22 TRACE_PRINT,
23 TRACE_BPRINT,
22 TRACE_SPECIAL, 24 TRACE_SPECIAL,
23 TRACE_MMIO_RW, 25 TRACE_MMIO_RW,
24 TRACE_MMIO_MAP, 26 TRACE_MMIO_MAP,
@@ -29,9 +31,14 @@ enum trace_type {
29 TRACE_GRAPH_ENT, 31 TRACE_GRAPH_ENT,
30 TRACE_USER_STACK, 32 TRACE_USER_STACK,
31 TRACE_HW_BRANCHES, 33 TRACE_HW_BRANCHES,
34 TRACE_SYSCALL_ENTER,
35 TRACE_SYSCALL_EXIT,
36 TRACE_KMEM_ALLOC,
37 TRACE_KMEM_FREE,
32 TRACE_POWER, 38 TRACE_POWER,
39 TRACE_BLK,
33 40
34 __TRACE_LAST_TYPE 41 __TRACE_LAST_TYPE,
35}; 42};
36 43
37/* 44/*
@@ -42,7 +49,6 @@ enum trace_type {
42 */ 49 */
43struct trace_entry { 50struct trace_entry {
44 unsigned char type; 51 unsigned char type;
45 unsigned char cpu;
46 unsigned char flags; 52 unsigned char flags;
47 unsigned char preempt_count; 53 unsigned char preempt_count;
48 int pid; 54 int pid;
@@ -60,13 +66,13 @@ struct ftrace_entry {
60 66
61/* Function call entry */ 67/* Function call entry */
62struct ftrace_graph_ent_entry { 68struct ftrace_graph_ent_entry {
63 struct trace_entry ent; 69 struct trace_entry ent;
64 struct ftrace_graph_ent graph_ent; 70 struct ftrace_graph_ent graph_ent;
65}; 71};
66 72
67/* Function return entry */ 73/* Function return entry */
68struct ftrace_graph_ret_entry { 74struct ftrace_graph_ret_entry {
69 struct trace_entry ent; 75 struct trace_entry ent;
70 struct ftrace_graph_ret ret; 76 struct ftrace_graph_ret ret;
71}; 77};
72extern struct tracer boot_tracer; 78extern struct tracer boot_tracer;
@@ -112,12 +118,18 @@ struct userstack_entry {
112}; 118};
113 119
114/* 120/*
115 * ftrace_printk entry: 121 * trace_printk entry:
116 */ 122 */
123struct bprint_entry {
124 struct trace_entry ent;
125 unsigned long ip;
126 const char *fmt;
127 u32 buf[];
128};
129
117struct print_entry { 130struct print_entry {
118 struct trace_entry ent; 131 struct trace_entry ent;
119 unsigned long ip; 132 unsigned long ip;
120 int depth;
121 char buf[]; 133 char buf[];
122}; 134};
123 135
@@ -170,15 +182,51 @@ struct trace_power {
170 struct power_trace state_data; 182 struct power_trace state_data;
171}; 183};
172 184
185enum kmemtrace_type_id {
186 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
187 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
188 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
189};
190
191struct kmemtrace_alloc_entry {
192 struct trace_entry ent;
193 enum kmemtrace_type_id type_id;
194 unsigned long call_site;
195 const void *ptr;
196 size_t bytes_req;
197 size_t bytes_alloc;
198 gfp_t gfp_flags;
199 int node;
200};
201
202struct kmemtrace_free_entry {
203 struct trace_entry ent;
204 enum kmemtrace_type_id type_id;
205 unsigned long call_site;
206 const void *ptr;
207};
208
209struct syscall_trace_enter {
210 struct trace_entry ent;
211 int nr;
212 unsigned long args[];
213};
214
215struct syscall_trace_exit {
216 struct trace_entry ent;
217 int nr;
218 unsigned long ret;
219};
220
221
173/* 222/*
174 * trace_flag_type is an enumeration that holds different 223 * trace_flag_type is an enumeration that holds different
175 * states when a trace occurs. These are: 224 * states when a trace occurs. These are:
176 * IRQS_OFF - interrupts were disabled 225 * IRQS_OFF - interrupts were disabled
177 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags 226 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
178 * NEED_RESCED - reschedule is requested 227 * NEED_RESCED - reschedule is requested
179 * HARDIRQ - inside an interrupt handler 228 * HARDIRQ - inside an interrupt handler
180 * SOFTIRQ - inside a softirq handler 229 * SOFTIRQ - inside a softirq handler
181 * CONT - multiple entries hold the trace item
182 */ 230 */
183enum trace_flag_type { 231enum trace_flag_type {
184 TRACE_FLAG_IRQS_OFF = 0x01, 232 TRACE_FLAG_IRQS_OFF = 0x01,
@@ -186,7 +234,6 @@ enum trace_flag_type {
186 TRACE_FLAG_NEED_RESCHED = 0x04, 234 TRACE_FLAG_NEED_RESCHED = 0x04,
187 TRACE_FLAG_HARDIRQ = 0x08, 235 TRACE_FLAG_HARDIRQ = 0x08,
188 TRACE_FLAG_SOFTIRQ = 0x10, 236 TRACE_FLAG_SOFTIRQ = 0x10,
189 TRACE_FLAG_CONT = 0x20,
190}; 237};
191 238
192#define TRACE_BUF_SIZE 1024 239#define TRACE_BUF_SIZE 1024
@@ -198,6 +245,7 @@ enum trace_flag_type {
198 */ 245 */
199struct trace_array_cpu { 246struct trace_array_cpu {
200 atomic_t disabled; 247 atomic_t disabled;
248 void *buffer_page; /* ring buffer spare */
201 249
202 /* these fields get copied into max-trace: */ 250 /* these fields get copied into max-trace: */
203 unsigned long trace_idx; 251 unsigned long trace_idx;
@@ -262,10 +310,10 @@ extern void __ftrace_bad_type(void);
262 do { \ 310 do { \
263 IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \ 311 IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \
264 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ 312 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
265 IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
266 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ 313 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
267 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ 314 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
268 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 315 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
316 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
269 IF_ASSIGN(var, ent, struct special_entry, 0); \ 317 IF_ASSIGN(var, ent, struct special_entry, 0); \
270 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 318 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
271 TRACE_MMIO_RW); \ 319 TRACE_MMIO_RW); \
@@ -279,7 +327,15 @@ extern void __ftrace_bad_type(void);
279 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 327 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
280 TRACE_GRAPH_RET); \ 328 TRACE_GRAPH_RET); \
281 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ 329 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
282 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ 330 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
331 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
332 TRACE_KMEM_ALLOC); \
333 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
334 TRACE_KMEM_FREE); \
335 IF_ASSIGN(var, ent, struct syscall_trace_enter, \
336 TRACE_SYSCALL_ENTER); \
337 IF_ASSIGN(var, ent, struct syscall_trace_exit, \
338 TRACE_SYSCALL_EXIT); \
283 __ftrace_bad_type(); \ 339 __ftrace_bad_type(); \
284 } while (0) 340 } while (0)
285 341
@@ -287,7 +343,8 @@ extern void __ftrace_bad_type(void);
287enum print_line_t { 343enum print_line_t {
288 TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ 344 TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */
289 TRACE_TYPE_HANDLED = 1, 345 TRACE_TYPE_HANDLED = 1,
290 TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ 346 TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */
347 TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */
291}; 348};
292 349
293 350
@@ -297,8 +354,8 @@ enum print_line_t {
297 * flags value in struct tracer_flags. 354 * flags value in struct tracer_flags.
298 */ 355 */
299struct tracer_opt { 356struct tracer_opt {
300 const char *name; /* Will appear on the trace_options file */ 357 const char *name; /* Will appear on the trace_options file */
301 u32 bit; /* Mask assigned in val field in tracer_flags */ 358 u32 bit; /* Mask assigned in val field in tracer_flags */
302}; 359};
303 360
304/* 361/*
@@ -307,28 +364,51 @@ struct tracer_opt {
307 */ 364 */
308struct tracer_flags { 365struct tracer_flags {
309 u32 val; 366 u32 val;
310 struct tracer_opt *opts; 367 struct tracer_opt *opts;
311}; 368};
312 369
313/* Makes more easy to define a tracer opt */ 370/* Makes more easy to define a tracer opt */
314#define TRACER_OPT(s, b) .name = #s, .bit = b 371#define TRACER_OPT(s, b) .name = #s, .bit = b
315 372
316/* 373
317 * A specific tracer, represented by methods that operate on a trace array: 374/**
375 * struct tracer - a specific tracer and its callbacks to interact with debugfs
376 * @name: the name chosen to select it on the available_tracers file
377 * @init: called when one switches to this tracer (echo name > current_tracer)
378 * @reset: called when one switches to another tracer
379 * @start: called when tracing is unpaused (echo 1 > tracing_enabled)
380 * @stop: called when tracing is paused (echo 0 > tracing_enabled)
381 * @open: called when the trace file is opened
382 * @pipe_open: called when the trace_pipe file is opened
383 * @wait_pipe: override how the user waits for traces on trace_pipe
384 * @close: called when the trace file is released
385 * @read: override the default read callback on trace_pipe
386 * @splice_read: override the default splice_read callback on trace_pipe
387 * @selftest: selftest to run on boot (see trace_selftest.c)
388 * @print_headers: override the first lines that describe your columns
389 * @print_line: callback that prints a trace
390 * @set_flag: signals one of your private flags changed (trace_options file)
391 * @flags: your private flags
318 */ 392 */
319struct tracer { 393struct tracer {
320 const char *name; 394 const char *name;
321 /* Your tracer should raise a warning if init fails */
322 int (*init)(struct trace_array *tr); 395 int (*init)(struct trace_array *tr);
323 void (*reset)(struct trace_array *tr); 396 void (*reset)(struct trace_array *tr);
324 void (*start)(struct trace_array *tr); 397 void (*start)(struct trace_array *tr);
325 void (*stop)(struct trace_array *tr); 398 void (*stop)(struct trace_array *tr);
326 void (*open)(struct trace_iterator *iter); 399 void (*open)(struct trace_iterator *iter);
327 void (*pipe_open)(struct trace_iterator *iter); 400 void (*pipe_open)(struct trace_iterator *iter);
401 void (*wait_pipe)(struct trace_iterator *iter);
328 void (*close)(struct trace_iterator *iter); 402 void (*close)(struct trace_iterator *iter);
329 ssize_t (*read)(struct trace_iterator *iter, 403 ssize_t (*read)(struct trace_iterator *iter,
330 struct file *filp, char __user *ubuf, 404 struct file *filp, char __user *ubuf,
331 size_t cnt, loff_t *ppos); 405 size_t cnt, loff_t *ppos);
406 ssize_t (*splice_read)(struct trace_iterator *iter,
407 struct file *filp,
408 loff_t *ppos,
409 struct pipe_inode_info *pipe,
410 size_t len,
411 unsigned int flags);
332#ifdef CONFIG_FTRACE_STARTUP_TEST 412#ifdef CONFIG_FTRACE_STARTUP_TEST
333 int (*selftest)(struct tracer *trace, 413 int (*selftest)(struct tracer *trace,
334 struct trace_array *tr); 414 struct trace_array *tr);
@@ -339,7 +419,8 @@ struct tracer {
339 int (*set_flag)(u32 old_flags, u32 bit, int set); 419 int (*set_flag)(u32 old_flags, u32 bit, int set);
340 struct tracer *next; 420 struct tracer *next;
341 int print_max; 421 int print_max;
342 struct tracer_flags *flags; 422 struct tracer_flags *flags;
423 struct tracer_stat *stats;
343}; 424};
344 425
345struct trace_seq { 426struct trace_seq {
@@ -348,6 +429,16 @@ struct trace_seq {
348 unsigned int readpos; 429 unsigned int readpos;
349}; 430};
350 431
432static inline void
433trace_seq_init(struct trace_seq *s)
434{
435 s->len = 0;
436 s->readpos = 0;
437}
438
439
440#define TRACE_PIPE_ALL_CPU -1
441
351/* 442/*
352 * Trace iterator - used by printout routines who present trace 443 * Trace iterator - used by printout routines who present trace
353 * results to users and which routines might sleep, etc: 444 * results to users and which routines might sleep, etc:
@@ -356,6 +447,8 @@ struct trace_iterator {
356 struct trace_array *tr; 447 struct trace_array *tr;
357 struct tracer *trace; 448 struct tracer *trace;
358 void *private; 449 void *private;
450 int cpu_file;
451 struct mutex mutex;
359 struct ring_buffer_iter *buffer_iter[NR_CPUS]; 452 struct ring_buffer_iter *buffer_iter[NR_CPUS];
360 453
361 /* The below is zeroed out in pipe_read */ 454 /* The below is zeroed out in pipe_read */
@@ -371,6 +464,7 @@ struct trace_iterator {
371 cpumask_var_t started; 464 cpumask_var_t started;
372}; 465};
373 466
467int tracer_init(struct tracer *t, struct trace_array *tr);
374int tracing_is_enabled(void); 468int tracing_is_enabled(void);
375void trace_wake_up(void); 469void trace_wake_up(void);
376void tracing_reset(struct trace_array *tr, int cpu); 470void tracing_reset(struct trace_array *tr, int cpu);
@@ -379,26 +473,50 @@ int tracing_open_generic(struct inode *inode, struct file *filp);
379struct dentry *tracing_init_dentry(void); 473struct dentry *tracing_init_dentry(void);
380void init_tracer_sysprof_debugfs(struct dentry *d_tracer); 474void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
381 475
476struct ring_buffer_event;
477
478struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
479 unsigned char type,
480 unsigned long len,
481 unsigned long flags,
482 int pc);
483void trace_buffer_unlock_commit(struct trace_array *tr,
484 struct ring_buffer_event *event,
485 unsigned long flags, int pc);
486
487struct ring_buffer_event *
488trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
489 unsigned long flags, int pc);
490void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
491 unsigned long flags, int pc);
492void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
493 unsigned long flags, int pc);
494
382struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, 495struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
383 struct trace_array_cpu *data); 496 struct trace_array_cpu *data);
497
498struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
499 int *ent_cpu, u64 *ent_ts);
500
384void tracing_generic_entry_update(struct trace_entry *entry, 501void tracing_generic_entry_update(struct trace_entry *entry,
385 unsigned long flags, 502 unsigned long flags,
386 int pc); 503 int pc);
387 504
505void default_wait_pipe(struct trace_iterator *iter);
506void poll_wait_pipe(struct trace_iterator *iter);
507
388void ftrace(struct trace_array *tr, 508void ftrace(struct trace_array *tr,
389 struct trace_array_cpu *data, 509 struct trace_array_cpu *data,
390 unsigned long ip, 510 unsigned long ip,
391 unsigned long parent_ip, 511 unsigned long parent_ip,
392 unsigned long flags, int pc); 512 unsigned long flags, int pc);
393void tracing_sched_switch_trace(struct trace_array *tr, 513void tracing_sched_switch_trace(struct trace_array *tr,
394 struct trace_array_cpu *data,
395 struct task_struct *prev, 514 struct task_struct *prev,
396 struct task_struct *next, 515 struct task_struct *next,
397 unsigned long flags, int pc); 516 unsigned long flags, int pc);
398void tracing_record_cmdline(struct task_struct *tsk); 517void tracing_record_cmdline(struct task_struct *tsk);
399 518
400void tracing_sched_wakeup_trace(struct trace_array *tr, 519void tracing_sched_wakeup_trace(struct trace_array *tr,
401 struct trace_array_cpu *data,
402 struct task_struct *wakee, 520 struct task_struct *wakee,
403 struct task_struct *cur, 521 struct task_struct *cur,
404 unsigned long flags, int pc); 522 unsigned long flags, int pc);
@@ -408,14 +526,12 @@ void trace_special(struct trace_array *tr,
408 unsigned long arg2, 526 unsigned long arg2,
409 unsigned long arg3, int pc); 527 unsigned long arg3, int pc);
410void trace_function(struct trace_array *tr, 528void trace_function(struct trace_array *tr,
411 struct trace_array_cpu *data,
412 unsigned long ip, 529 unsigned long ip,
413 unsigned long parent_ip, 530 unsigned long parent_ip,
414 unsigned long flags, int pc); 531 unsigned long flags, int pc);
415 532
416void trace_graph_return(struct ftrace_graph_ret *trace); 533void trace_graph_return(struct ftrace_graph_ret *trace);
417int trace_graph_entry(struct ftrace_graph_ent *trace); 534int trace_graph_entry(struct ftrace_graph_ent *trace);
418void trace_hw_branch(struct trace_array *tr, u64 from, u64 to);
419 535
420void tracing_start_cmdline_record(void); 536void tracing_start_cmdline_record(void);
421void tracing_stop_cmdline_record(void); 537void tracing_stop_cmdline_record(void);
@@ -434,15 +550,11 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
434void update_max_tr_single(struct trace_array *tr, 550void update_max_tr_single(struct trace_array *tr,
435 struct task_struct *tsk, int cpu); 551 struct task_struct *tsk, int cpu);
436 552
437extern cycle_t ftrace_now(int cpu); 553void __trace_stack(struct trace_array *tr,
554 unsigned long flags,
555 int skip, int pc);
438 556
439#ifdef CONFIG_FUNCTION_TRACER 557extern cycle_t ftrace_now(int cpu);
440void tracing_start_function_trace(void);
441void tracing_stop_function_trace(void);
442#else
443# define tracing_start_function_trace() do { } while (0)
444# define tracing_stop_function_trace() do { } while (0)
445#endif
446 558
447#ifdef CONFIG_CONTEXT_SWITCH_TRACER 559#ifdef CONFIG_CONTEXT_SWITCH_TRACER
448typedef void 560typedef void
@@ -456,10 +568,10 @@ struct tracer_switch_ops {
456 void *private; 568 void *private;
457 struct tracer_switch_ops *next; 569 struct tracer_switch_ops *next;
458}; 570};
459
460char *trace_find_cmdline(int pid);
461#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 571#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
462 572
573extern void trace_find_cmdline(int pid, char comm[]);
574
463#ifdef CONFIG_DYNAMIC_FTRACE 575#ifdef CONFIG_DYNAMIC_FTRACE
464extern unsigned long ftrace_update_tot_cnt; 576extern unsigned long ftrace_update_tot_cnt;
465#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func 577#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
@@ -469,6 +581,8 @@ extern int DYN_FTRACE_TEST_NAME(void);
469#ifdef CONFIG_FTRACE_STARTUP_TEST 581#ifdef CONFIG_FTRACE_STARTUP_TEST
470extern int trace_selftest_startup_function(struct tracer *trace, 582extern int trace_selftest_startup_function(struct tracer *trace,
471 struct trace_array *tr); 583 struct trace_array *tr);
584extern int trace_selftest_startup_function_graph(struct tracer *trace,
585 struct trace_array *tr);
472extern int trace_selftest_startup_irqsoff(struct tracer *trace, 586extern int trace_selftest_startup_irqsoff(struct tracer *trace,
473 struct trace_array *tr); 587 struct trace_array *tr);
474extern int trace_selftest_startup_preemptoff(struct tracer *trace, 588extern int trace_selftest_startup_preemptoff(struct tracer *trace,
@@ -488,18 +602,11 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
488#endif /* CONFIG_FTRACE_STARTUP_TEST */ 602#endif /* CONFIG_FTRACE_STARTUP_TEST */
489 603
490extern void *head_page(struct trace_array_cpu *data); 604extern void *head_page(struct trace_array_cpu *data);
491extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); 605extern unsigned long long ns2usecs(cycle_t nsec);
492extern void trace_seq_print_cont(struct trace_seq *s,
493 struct trace_iterator *iter);
494
495extern int 606extern int
496seq_print_ip_sym(struct trace_seq *s, unsigned long ip, 607trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
497 unsigned long sym_flags);
498extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
499 size_t cnt);
500extern long ns2usecs(cycle_t nsec);
501extern int 608extern int
502trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); 609trace_vprintk(unsigned long ip, const char *fmt, va_list args);
503 610
504extern unsigned long trace_flags; 611extern unsigned long trace_flags;
505 612
@@ -580,7 +687,11 @@ enum trace_iterator_flags {
580 TRACE_ITER_ANNOTATE = 0x2000, 687 TRACE_ITER_ANNOTATE = 0x2000,
581 TRACE_ITER_USERSTACKTRACE = 0x4000, 688 TRACE_ITER_USERSTACKTRACE = 0x4000,
582 TRACE_ITER_SYM_USEROBJ = 0x8000, 689 TRACE_ITER_SYM_USEROBJ = 0x8000,
583 TRACE_ITER_PRINTK_MSGONLY = 0x10000 690 TRACE_ITER_PRINTK_MSGONLY = 0x10000,
691 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */
692 TRACE_ITER_LATENCY_FMT = 0x40000,
693 TRACE_ITER_GLOBAL_CLK = 0x80000,
694 TRACE_ITER_SLEEP_TIME = 0x100000,
584}; 695};
585 696
586/* 697/*
@@ -601,12 +712,12 @@ extern struct tracer nop_trace;
601 * preempt_enable (after a disable), a schedule might take place 712 * preempt_enable (after a disable), a schedule might take place
602 * causing an infinite recursion. 713 * causing an infinite recursion.
603 * 714 *
604 * To prevent this, we read the need_recshed flag before 715 * To prevent this, we read the need_resched flag before
605 * disabling preemption. When we want to enable preemption we 716 * disabling preemption. When we want to enable preemption we
606 * check the flag, if it is set, then we call preempt_enable_no_resched. 717 * check the flag, if it is set, then we call preempt_enable_no_resched.
607 * Otherwise, we call preempt_enable. 718 * Otherwise, we call preempt_enable.
608 * 719 *
609 * The rational for doing the above is that if need resched is set 720 * The rational for doing the above is that if need_resched is set
610 * and we have yet to reschedule, we are either in an atomic location 721 * and we have yet to reschedule, we are either in an atomic location
611 * (where we do not need to check for scheduling) or we are inside 722 * (where we do not need to check for scheduling) or we are inside
612 * the scheduler and do not want to resched. 723 * the scheduler and do not want to resched.
@@ -627,7 +738,7 @@ static inline int ftrace_preempt_disable(void)
627 * 738 *
628 * This is a scheduler safe way to enable preemption and not miss 739 * This is a scheduler safe way to enable preemption and not miss
629 * any preemption checks. The disabled saved the state of preemption. 740 * any preemption checks. The disabled saved the state of preemption.
630 * If resched is set, then we were either inside an atomic or 741 * If resched is set, then we are either inside an atomic or
631 * are inside the scheduler (we would have already scheduled 742 * are inside the scheduler (we would have already scheduled
632 * otherwise). In this case, we do not want to call normal 743 * otherwise). In this case, we do not want to call normal
633 * preempt_enable, but preempt_enable_no_resched instead. 744 * preempt_enable, but preempt_enable_no_resched instead.
@@ -664,4 +775,118 @@ static inline void trace_branch_disable(void)
664} 775}
665#endif /* CONFIG_BRANCH_TRACER */ 776#endif /* CONFIG_BRANCH_TRACER */
666 777
778/* set ring buffers to default size if not already done so */
779int tracing_update_buffers(void);
780
781/* trace event type bit fields, not numeric */
782enum {
783 TRACE_EVENT_TYPE_PRINTF = 1,
784 TRACE_EVENT_TYPE_RAW = 2,
785};
786
787struct ftrace_event_field {
788 struct list_head link;
789 char *name;
790 char *type;
791 int offset;
792 int size;
793};
794
795struct ftrace_event_call {
796 char *name;
797 char *system;
798 struct dentry *dir;
799 int enabled;
800 int (*regfunc)(void);
801 void (*unregfunc)(void);
802 int id;
803 int (*raw_init)(void);
804 int (*show_format)(struct trace_seq *s);
805 int (*define_fields)(void);
806 struct list_head fields;
807 struct filter_pred **preds;
808
809#ifdef CONFIG_EVENT_PROFILE
810 atomic_t profile_count;
811 int (*profile_enable)(struct ftrace_event_call *);
812 void (*profile_disable)(struct ftrace_event_call *);
813#endif
814};
815
816struct event_subsystem {
817 struct list_head list;
818 const char *name;
819 struct dentry *entry;
820 struct filter_pred **preds;
821};
822
823#define events_for_each(event) \
824 for (event = __start_ftrace_events; \
825 (unsigned long)event < (unsigned long)__stop_ftrace_events; \
826 event++)
827
828#define MAX_FILTER_PRED 8
829
830struct filter_pred;
831
832typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
833
834struct filter_pred {
835 filter_pred_fn_t fn;
836 u64 val;
837 char *str_val;
838 int str_len;
839 char *field_name;
840 int offset;
841 int not;
842 int or;
843 int compound;
844 int clear;
845};
846
847int trace_define_field(struct ftrace_event_call *call, char *type,
848 char *name, int offset, int size);
849extern void filter_free_pred(struct filter_pred *pred);
850extern void filter_print_preds(struct filter_pred **preds,
851 struct trace_seq *s);
852extern int filter_parse(char **pbuf, struct filter_pred *pred);
853extern int filter_add_pred(struct ftrace_event_call *call,
854 struct filter_pred *pred);
855extern void filter_free_preds(struct ftrace_event_call *call);
856extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
857extern void filter_free_subsystem_preds(struct event_subsystem *system);
858extern int filter_add_subsystem_pred(struct event_subsystem *system,
859 struct filter_pred *pred);
860
861void event_trace_printk(unsigned long ip, const char *fmt, ...);
862extern struct ftrace_event_call __start_ftrace_events[];
863extern struct ftrace_event_call __stop_ftrace_events[];
864
865#define for_each_event(event) \
866 for (event = __start_ftrace_events; \
867 (unsigned long)event < (unsigned long)__stop_ftrace_events; \
868 event++)
869
870extern const char *__start___trace_bprintk_fmt[];
871extern const char *__stop___trace_bprintk_fmt[];
872
873/*
874 * The double __builtin_constant_p is because gcc will give us an error
875 * if we try to allocate the static variable to fmt if it is not a
876 * constant. Even with the outer if statement optimizing out.
877 */
878#define event_trace_printk(ip, fmt, args...) \
879do { \
880 __trace_printk_check_format(fmt, ##args); \
881 tracing_record_cmdline(current); \
882 if (__builtin_constant_p(fmt)) { \
883 static const char *trace_printk_fmt \
884 __attribute__((section("__trace_printk_fmt"))) = \
885 __builtin_constant_p(fmt) ? fmt : NULL; \
886 \
887 __trace_bprintk(ip, trace_printk_fmt, ##args); \
888 } else \
889 __trace_printk(ip, fmt, ##args); \
890} while (0)
891
667#endif /* _LINUX_KERNEL_TRACE_H */ 892#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 366c8c333e13..7a30fc4c3642 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -11,6 +11,7 @@
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12 12
13#include "trace.h" 13#include "trace.h"
14#include "trace_output.h"
14 15
15static struct trace_array *boot_trace; 16static struct trace_array *boot_trace;
16static bool pre_initcalls_finished; 17static bool pre_initcalls_finished;
@@ -27,13 +28,13 @@ void start_boot_trace(void)
27 28
28void enable_boot_trace(void) 29void enable_boot_trace(void)
29{ 30{
30 if (pre_initcalls_finished) 31 if (boot_trace && pre_initcalls_finished)
31 tracing_start_sched_switch_record(); 32 tracing_start_sched_switch_record();
32} 33}
33 34
34void disable_boot_trace(void) 35void disable_boot_trace(void)
35{ 36{
36 if (pre_initcalls_finished) 37 if (boot_trace && pre_initcalls_finished)
37 tracing_stop_sched_switch_record(); 38 tracing_stop_sched_switch_record();
38} 39}
39 40
@@ -42,6 +43,9 @@ static int boot_trace_init(struct trace_array *tr)
42 int cpu; 43 int cpu;
43 boot_trace = tr; 44 boot_trace = tr;
44 45
46 if (!tr)
47 return 0;
48
45 for_each_cpu(cpu, cpu_possible_mask) 49 for_each_cpu(cpu, cpu_possible_mask)
46 tracing_reset(tr, cpu); 50 tracing_reset(tr, cpu);
47 51
@@ -128,10 +132,9 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
128{ 132{
129 struct ring_buffer_event *event; 133 struct ring_buffer_event *event;
130 struct trace_boot_call *entry; 134 struct trace_boot_call *entry;
131 unsigned long irq_flags;
132 struct trace_array *tr = boot_trace; 135 struct trace_array *tr = boot_trace;
133 136
134 if (!pre_initcalls_finished) 137 if (!tr || !pre_initcalls_finished)
135 return; 138 return;
136 139
137 /* Get its name now since this function could 140 /* Get its name now since this function could
@@ -140,18 +143,13 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
140 sprint_symbol(bt->func, (unsigned long)fn); 143 sprint_symbol(bt->func, (unsigned long)fn);
141 preempt_disable(); 144 preempt_disable();
142 145
143 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 146 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL,
144 &irq_flags); 147 sizeof(*entry), 0, 0);
145 if (!event) 148 if (!event)
146 goto out; 149 goto out;
147 entry = ring_buffer_event_data(event); 150 entry = ring_buffer_event_data(event);
148 tracing_generic_entry_update(&entry->ent, 0, 0);
149 entry->ent.type = TRACE_BOOT_CALL;
150 entry->boot_call = *bt; 151 entry->boot_call = *bt;
151 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 152 trace_buffer_unlock_commit(tr, event, 0, 0);
152
153 trace_wake_up();
154
155 out: 153 out:
156 preempt_enable(); 154 preempt_enable();
157} 155}
@@ -160,27 +158,21 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
160{ 158{
161 struct ring_buffer_event *event; 159 struct ring_buffer_event *event;
162 struct trace_boot_ret *entry; 160 struct trace_boot_ret *entry;
163 unsigned long irq_flags;
164 struct trace_array *tr = boot_trace; 161 struct trace_array *tr = boot_trace;
165 162
166 if (!pre_initcalls_finished) 163 if (!tr || !pre_initcalls_finished)
167 return; 164 return;
168 165
169 sprint_symbol(bt->func, (unsigned long)fn); 166 sprint_symbol(bt->func, (unsigned long)fn);
170 preempt_disable(); 167 preempt_disable();
171 168
172 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 169 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET,
173 &irq_flags); 170 sizeof(*entry), 0, 0);
174 if (!event) 171 if (!event)
175 goto out; 172 goto out;
176 entry = ring_buffer_event_data(event); 173 entry = ring_buffer_event_data(event);
177 tracing_generic_entry_update(&entry->ent, 0, 0);
178 entry->ent.type = TRACE_BOOT_RET;
179 entry->boot_ret = *bt; 174 entry->boot_ret = *bt;
180 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 175 trace_buffer_unlock_commit(tr, event, 0, 0);
181
182 trace_wake_up();
183
184 out: 176 out:
185 preempt_enable(); 177 preempt_enable();
186} 178}
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 6c00feb3bac7..ad8c22efff41 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -14,12 +14,17 @@
14#include <linux/hash.h> 14#include <linux/hash.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <asm/local.h> 16#include <asm/local.h>
17
17#include "trace.h" 18#include "trace.h"
19#include "trace_stat.h"
20#include "trace_output.h"
18 21
19#ifdef CONFIG_BRANCH_TRACER 22#ifdef CONFIG_BRANCH_TRACER
20 23
24static struct tracer branch_trace;
21static int branch_tracing_enabled __read_mostly; 25static int branch_tracing_enabled __read_mostly;
22static DEFINE_MUTEX(branch_tracing_mutex); 26static DEFINE_MUTEX(branch_tracing_mutex);
27
23static struct trace_array *branch_tracer; 28static struct trace_array *branch_tracer;
24 29
25static void 30static void
@@ -28,7 +33,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
28 struct trace_array *tr = branch_tracer; 33 struct trace_array *tr = branch_tracer;
29 struct ring_buffer_event *event; 34 struct ring_buffer_event *event;
30 struct trace_branch *entry; 35 struct trace_branch *entry;
31 unsigned long flags, irq_flags; 36 unsigned long flags;
32 int cpu, pc; 37 int cpu, pc;
33 const char *p; 38 const char *p;
34 39
@@ -47,15 +52,13 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
47 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) 52 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
48 goto out; 53 goto out;
49 54
50 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 55 pc = preempt_count();
51 &irq_flags); 56 event = trace_buffer_lock_reserve(tr, TRACE_BRANCH,
57 sizeof(*entry), flags, pc);
52 if (!event) 58 if (!event)
53 goto out; 59 goto out;
54 60
55 pc = preempt_count();
56 entry = ring_buffer_event_data(event); 61 entry = ring_buffer_event_data(event);
57 tracing_generic_entry_update(&entry->ent, flags, pc);
58 entry->ent.type = TRACE_BRANCH;
59 62
60 /* Strip off the path, only save the file */ 63 /* Strip off the path, only save the file */
61 p = f->file + strlen(f->file); 64 p = f->file + strlen(f->file);
@@ -70,7 +73,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
70 entry->line = f->line; 73 entry->line = f->line;
71 entry->correct = val == expect; 74 entry->correct = val == expect;
72 75
73 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 76 ring_buffer_unlock_commit(tr->buffer, event);
74 77
75 out: 78 out:
76 atomic_dec(&tr->data[cpu]->disabled); 79 atomic_dec(&tr->data[cpu]->disabled);
@@ -88,8 +91,6 @@ void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
88 91
89int enable_branch_tracing(struct trace_array *tr) 92int enable_branch_tracing(struct trace_array *tr)
90{ 93{
91 int ret = 0;
92
93 mutex_lock(&branch_tracing_mutex); 94 mutex_lock(&branch_tracing_mutex);
94 branch_tracer = tr; 95 branch_tracer = tr;
95 /* 96 /*
@@ -100,7 +101,7 @@ int enable_branch_tracing(struct trace_array *tr)
100 branch_tracing_enabled++; 101 branch_tracing_enabled++;
101 mutex_unlock(&branch_tracing_mutex); 102 mutex_unlock(&branch_tracing_mutex);
102 103
103 return ret; 104 return 0;
104} 105}
105 106
106void disable_branch_tracing(void) 107void disable_branch_tracing(void)
@@ -128,11 +129,6 @@ static void stop_branch_trace(struct trace_array *tr)
128 129
129static int branch_trace_init(struct trace_array *tr) 130static int branch_trace_init(struct trace_array *tr)
130{ 131{
131 int cpu;
132
133 for_each_online_cpu(cpu)
134 tracing_reset(tr, cpu);
135
136 start_branch_trace(tr); 132 start_branch_trace(tr);
137 return 0; 133 return 0;
138} 134}
@@ -142,22 +138,53 @@ static void branch_trace_reset(struct trace_array *tr)
142 stop_branch_trace(tr); 138 stop_branch_trace(tr);
143} 139}
144 140
145struct tracer branch_trace __read_mostly = 141static enum print_line_t trace_branch_print(struct trace_iterator *iter,
142 int flags)
143{
144 struct trace_branch *field;
145
146 trace_assign_type(field, iter->ent);
147
148 if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
149 field->correct ? " ok " : " MISS ",
150 field->func,
151 field->file,
152 field->line))
153 return TRACE_TYPE_PARTIAL_LINE;
154
155 return TRACE_TYPE_HANDLED;
156}
157
158
159static struct trace_event trace_branch_event = {
160 .type = TRACE_BRANCH,
161 .trace = trace_branch_print,
162};
163
164static struct tracer branch_trace __read_mostly =
146{ 165{
147 .name = "branch", 166 .name = "branch",
148 .init = branch_trace_init, 167 .init = branch_trace_init,
149 .reset = branch_trace_reset, 168 .reset = branch_trace_reset,
150#ifdef CONFIG_FTRACE_SELFTEST 169#ifdef CONFIG_FTRACE_SELFTEST
151 .selftest = trace_selftest_startup_branch, 170 .selftest = trace_selftest_startup_branch,
152#endif 171#endif /* CONFIG_FTRACE_SELFTEST */
153}; 172};
154 173
155__init static int init_branch_trace(void) 174__init static int init_branch_tracer(void)
156{ 175{
176 int ret;
177
178 ret = register_ftrace_event(&trace_branch_event);
179 if (!ret) {
180 printk(KERN_WARNING "Warning: could not register "
181 "branch events\n");
182 return 1;
183 }
157 return register_tracer(&branch_trace); 184 return register_tracer(&branch_trace);
158} 185}
186device_initcall(init_branch_tracer);
159 187
160device_initcall(init_branch_trace);
161#else 188#else
162static inline 189static inline
163void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) 190void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
@@ -183,66 +210,39 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
183} 210}
184EXPORT_SYMBOL(ftrace_likely_update); 211EXPORT_SYMBOL(ftrace_likely_update);
185 212
186struct ftrace_pointer { 213extern unsigned long __start_annotated_branch_profile[];
187 void *start; 214extern unsigned long __stop_annotated_branch_profile[];
188 void *stop;
189 int hit;
190};
191 215
/* Emit the two header lines of the annotated-branch statistics output. */
static int annotated_branch_stat_headers(struct seq_file *m)
{
	/* counter columns first; the line is finished by the next call */
	seq_printf(m, " correct incorrect %% ");
	seq_printf(m,
		   " Function " " File Line\n"
		   " ------- --------- - " " -------- " " ---- ----\n");

	return 0;
}
210 226
211static void *t_start(struct seq_file *m, loff_t *pos) 227static inline long get_incorrect_percent(struct ftrace_branch_data *p)
212{ 228{
213 void *t = (void *)1; 229 long percent;
214 loff_t l = 0;
215
216 for (; t && l < *pos; t = t_next(m, t, &l))
217 ;
218 230
219 return t; 231 if (p->correct) {
220} 232 percent = p->incorrect * 100;
233 percent /= p->correct + p->incorrect;
234 } else
235 percent = p->incorrect ? 100 : -1;
221 236
222static void t_stop(struct seq_file *m, void *p) 237 return percent;
223{
224} 238}
225 239
226static int t_show(struct seq_file *m, void *v) 240static int branch_stat_show(struct seq_file *m, void *v)
227{ 241{
228 const struct ftrace_pointer *fp = m->private;
229 struct ftrace_branch_data *p = v; 242 struct ftrace_branch_data *p = v;
230 const char *f; 243 const char *f;
231 long percent; 244 long percent;
232 245
233 if (v == (void *)1) {
234 if (fp->hit)
235 seq_printf(m, " miss hit %% ");
236 else
237 seq_printf(m, " correct incorrect %% ");
238 seq_printf(m, " Function "
239 " File Line\n"
240 " ------- --------- - "
241 " -------- "
242 " ---- ----\n");
243 return 0;
244 }
245
246 /* Only print the file, not the path */ 246 /* Only print the file, not the path */
247 f = p->file + strlen(p->file); 247 f = p->file + strlen(p->file);
248 while (f >= p->file && *f != '/') 248 while (f >= p->file && *f != '/')
@@ -252,11 +252,7 @@ static int t_show(struct seq_file *m, void *v)
252 /* 252 /*
253 * The miss is overlayed on correct, and hit on incorrect. 253 * The miss is overlayed on correct, and hit on incorrect.
254 */ 254 */
255 if (p->correct) { 255 percent = get_incorrect_percent(p);
256 percent = p->incorrect * 100;
257 percent /= p->correct + p->incorrect;
258 } else
259 percent = p->incorrect ? 100 : -1;
260 256
261 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); 257 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
262 if (percent < 0) 258 if (percent < 0)
@@ -267,76 +263,118 @@ static int t_show(struct seq_file *m, void *v)
267 return 0; 263 return 0;
268} 264}
269 265
270static struct seq_operations tracing_likely_seq_ops = { 266static void *annotated_branch_stat_start(void)
271 .start = t_start, 267{
272 .next = t_next, 268 return __start_annotated_branch_profile;
273 .stop = t_stop, 269}
274 .show = t_show, 270
271static void *
272annotated_branch_stat_next(void *v, int idx)
273{
274 struct ftrace_branch_data *p = v;
275
276 ++p;
277
278 if ((void *)p >= (void *)__stop_annotated_branch_profile)
279 return NULL;
280
281 return p;
282}
283
/*
 * ->stat_cmp: order two branch records by their misprediction
 * percentage.  Returns -1, 0 or 1 as @p1 is lower than, equal to or
 * higher than @p2.
 */
static int annotated_branch_stat_cmp(void *p1, void *p2)
{
	long pct1 = get_incorrect_percent(p1);
	long pct2 = get_incorrect_percent(p2);

	if (pct1 == pct2)
		return 0;

	return pct1 < pct2 ? -1 : 1;
}
301
302static struct tracer_stat annotated_branch_stats = {
303 .name = "branch_annotated",
304 .stat_start = annotated_branch_stat_start,
305 .stat_next = annotated_branch_stat_next,
306 .stat_cmp = annotated_branch_stat_cmp,
307 .stat_headers = annotated_branch_stat_headers,
308 .stat_show = branch_stat_show
275}; 309};
276 310
277static int tracing_branch_open(struct inode *inode, struct file *file) 311__init static int init_annotated_branch_stats(void)
278{ 312{
279 int ret; 313 int ret;
280 314
281 ret = seq_open(file, &tracing_likely_seq_ops); 315 ret = register_stat_tracer(&annotated_branch_stats);
282 if (!ret) { 316 if (!ret) {
283 struct seq_file *m = file->private_data; 317 printk(KERN_WARNING "Warning: could not register "
284 m->private = (void *)inode->i_private; 318 "annotated branches stats\n");
319 return 1;
285 } 320 }
286 321 return 0;
287 return ret;
288} 322}
289 323fs_initcall(init_annotated_branch_stats);
290static const struct file_operations tracing_branch_fops = {
291 .open = tracing_branch_open,
292 .read = seq_read,
293 .llseek = seq_lseek,
294};
295 324
296#ifdef CONFIG_PROFILE_ALL_BRANCHES 325#ifdef CONFIG_PROFILE_ALL_BRANCHES
326
297extern unsigned long __start_branch_profile[]; 327extern unsigned long __start_branch_profile[];
298extern unsigned long __stop_branch_profile[]; 328extern unsigned long __stop_branch_profile[];
299 329
/* Emit the two header lines of the all-branches statistics output. */
static int all_branch_stat_headers(struct seq_file *m)
{
	/* counter columns first; the line is finished by the next call */
	seq_printf(m, " miss hit %% ");
	seq_printf(m,
		   " Function " " File Line\n"
		   " ------- --------- - " " -------- " " ---- ----\n");

	return 0;
}
305 340
306#endif /* CONFIG_PROFILE_ALL_BRANCHES */ 341static void *all_branch_stat_start(void)
342{
343 return __start_branch_profile;
344}
307 345
308extern unsigned long __start_annotated_branch_profile[]; 346static void *
309extern unsigned long __stop_annotated_branch_profile[]; 347all_branch_stat_next(void *v, int idx)
348{
349 struct ftrace_branch_data *p = v;
310 350
311static const struct ftrace_pointer ftrace_annotated_branch_pos = { 351 ++p;
312 .start = __start_annotated_branch_profile,
313 .stop = __stop_annotated_branch_profile,
314};
315 352
316static __init int ftrace_branch_init(void) 353 if ((void *)p >= (void *)__stop_branch_profile)
317{ 354 return NULL;
318 struct dentry *d_tracer;
319 struct dentry *entry;
320 355
321 d_tracer = tracing_init_dentry(); 356 return p;
357}
322 358
323 entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer, 359static struct tracer_stat all_branch_stats = {
324 (void *)&ftrace_annotated_branch_pos, 360 .name = "branch_all",
325 &tracing_branch_fops); 361 .stat_start = all_branch_stat_start,
326 if (!entry) 362 .stat_next = all_branch_stat_next,
327 pr_warning("Could not create debugfs " 363 .stat_headers = all_branch_stat_headers,
328 "'profile_annotatet_branch' entry\n"); 364 .stat_show = branch_stat_show
365};
329 366
330#ifdef CONFIG_PROFILE_ALL_BRANCHES 367__init static int all_annotated_branch_stats(void)
331 entry = debugfs_create_file("profile_branch", 0444, d_tracer, 368{
332 (void *)&ftrace_branch_pos, 369 int ret;
333 &tracing_branch_fops);
334 if (!entry)
335 pr_warning("Could not create debugfs"
336 " 'profile_branch' entry\n");
337#endif
338 370
371 ret = register_stat_tracer(&all_branch_stats);
372 if (!ret) {
373 printk(KERN_WARNING "Warning: could not register "
374 "all branches stats\n");
375 return 1;
376 }
339 return 0; 377 return 0;
340} 378}
341 379fs_initcall(all_annotated_branch_stats);
342device_initcall(ftrace_branch_init); 380#endif /* CONFIG_PROFILE_ALL_BRANCHES */
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
new file mode 100644
index 000000000000..b588fd81f7f9
--- /dev/null
+++ b/kernel/trace/trace_clock.c
@@ -0,0 +1,109 @@
1/*
2 * tracing clocks
3 *
4 * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Implements 3 trace clock variants, with differing scalability/precision
7 * tradeoffs:
8 *
9 * - local: CPU-local trace clock
10 * - medium: scalable global clock with some jitter
11 * - global: globally monotonic, serialized clock
12 *
13 * Tracer plugins will choose a default from these clocks.
14 */
15#include <linux/spinlock.h>
16#include <linux/hardirq.h>
17#include <linux/module.h>
18#include <linux/percpu.h>
19#include <linux/sched.h>
20#include <linux/ktime.h>
21#include <linux/trace_clock.h>
22
23/*
24 * trace_clock_local(): the simplest and least coherent tracing clock.
25 *
26 * Useful for tracing that does not cross to other CPUs nor
27 * does it go through idle events.
28 */
29u64 notrace trace_clock_local(void)
30{
31 unsigned long flags;
32 u64 clock;
33
34 /*
35 * sched_clock() is an architecture implemented, fast, scalable,
36 * lockless clock. It is not guaranteed to be coherent across
37 * CPUs, nor across CPU idle events.
38 */
39 raw_local_irq_save(flags);
40 clock = sched_clock();
41 raw_local_irq_restore(flags);
42
43 return clock;
44}
45
46/*
47 * trace_clock(): 'inbetween' trace clock. Not completely serialized,
48 * but not completely incorrect when crossing CPUs either.
49 *
50 * This is based on cpu_clock(), which will allow at most ~1 jiffy of
51 * jitter between CPUs. So it's a pretty scalable clock, but there
52 * can be offsets in the trace data.
53 */
54u64 notrace trace_clock(void)
55{
56 return cpu_clock(raw_smp_processor_id());
57}
58
59
60/*
61 * trace_clock_global(): special globally coherent trace clock
62 *
63 * It has higher overhead than the other trace clocks but is still
64 * an order of magnitude faster than GTOD derived hardware clocks.
65 *
66 * Used by plugins that need globally coherent timestamps.
67 */
68
69static u64 prev_trace_clock_time;
70
71static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp =
72 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
73
74u64 notrace trace_clock_global(void)
75{
76 unsigned long flags;
77 int this_cpu;
78 u64 now;
79
80 raw_local_irq_save(flags);
81
82 this_cpu = raw_smp_processor_id();
83 now = cpu_clock(this_cpu);
84 /*
85 * If in an NMI context then dont risk lockups and return the
86 * cpu_clock() time:
87 */
88 if (unlikely(in_nmi()))
89 goto out;
90
91 __raw_spin_lock(&trace_clock_lock);
92
93 /*
94 * TODO: if this happens often then maybe we should reset
95 * my_scd->clock to prev_trace_clock_time+1, to make sure
96 * we start ticking with the local clock from now on?
97 */
98 if ((s64)(now - prev_trace_clock_time) < 0)
99 now = prev_trace_clock_time + 1;
100
101 prev_trace_clock_time = now;
102
103 __raw_spin_unlock(&trace_clock_lock);
104
105 out:
106 raw_local_irq_restore(flags);
107
108 return now;
109}
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
new file mode 100644
index 000000000000..22cba9970776
--- /dev/null
+++ b/kernel/trace/trace_event_profile.c
@@ -0,0 +1,31 @@
1/*
2 * trace event based perf counter profiling
3 *
4 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
5 *
6 */
7
8#include "trace.h"
9
10int ftrace_profile_enable(int event_id)
11{
12 struct ftrace_event_call *event;
13
14 for_each_event(event) {
15 if (event->id == event_id)
16 return event->profile_enable(event);
17 }
18
19 return -EINVAL;
20}
21
22void ftrace_profile_disable(int event_id)
23{
24 struct ftrace_event_call *event;
25
26 for_each_event(event) {
27 if (event->id == event_id)
28 return event->profile_disable(event);
29 }
30}
31
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
new file mode 100644
index 000000000000..fd78bee71dd7
--- /dev/null
+++ b/kernel/trace/trace_event_types.h
@@ -0,0 +1,173 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM ftrace
3
4/*
5 * We cheat and use the prototype field as the ID
6 * and args as the entry type (minus 'struct')
7 */
8TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
9 TRACE_STRUCT(
10 TRACE_FIELD(unsigned long, ip, ip)
11 TRACE_FIELD(unsigned long, parent_ip, parent_ip)
12 ),
13 TP_RAW_FMT(" %lx <-- %lx")
14);
15
16TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
17 ftrace_graph_ent_entry, ignore,
18 TRACE_STRUCT(
19 TRACE_FIELD(unsigned long, graph_ent.func, func)
20 TRACE_FIELD(int, graph_ent.depth, depth)
21 ),
22 TP_RAW_FMT("--> %lx (%d)")
23);
24
25TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(int, ret.depth, depth)
30 ),
31 TP_RAW_FMT("<-- %lx (%d)")
32);
33
34TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
35 TRACE_STRUCT(
36 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
37 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
38 TRACE_FIELD(unsigned char, prev_state, prev_state)
39 TRACE_FIELD(unsigned int, next_pid, next_pid)
40 TRACE_FIELD(unsigned char, next_prio, next_prio)
41 TRACE_FIELD(unsigned char, next_state, next_state)
42 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
43 ),
44 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
45);
46
47TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
48 TRACE_STRUCT(
49 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
50 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
51 TRACE_FIELD(unsigned char, prev_state, prev_state)
52 TRACE_FIELD(unsigned int, next_pid, next_pid)
53 TRACE_FIELD(unsigned char, next_prio, next_prio)
54 TRACE_FIELD(unsigned char, next_state, next_state)
55 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
56 ),
57 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
58);
59
60TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
61 TRACE_STRUCT(
62 TRACE_FIELD(unsigned long, arg1, arg1)
63 TRACE_FIELD(unsigned long, arg2, arg2)
64 TRACE_FIELD(unsigned long, arg3, arg3)
65 ),
66 TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
67);
68
69/*
70 * Stack-trace entry:
71 */
72
73/* #define FTRACE_STACK_ENTRIES 8 */
74
75TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
76 TRACE_STRUCT(
77 TRACE_FIELD(unsigned long, caller[0], stack0)
78 TRACE_FIELD(unsigned long, caller[1], stack1)
79 TRACE_FIELD(unsigned long, caller[2], stack2)
80 TRACE_FIELD(unsigned long, caller[3], stack3)
81 TRACE_FIELD(unsigned long, caller[4], stack4)
82 TRACE_FIELD(unsigned long, caller[5], stack5)
83 TRACE_FIELD(unsigned long, caller[6], stack6)
84 TRACE_FIELD(unsigned long, caller[7], stack7)
85 ),
86 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
87 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
88);
89
90TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
91 TRACE_STRUCT(
92 TRACE_FIELD(unsigned long, caller[0], stack0)
93 TRACE_FIELD(unsigned long, caller[1], stack1)
94 TRACE_FIELD(unsigned long, caller[2], stack2)
95 TRACE_FIELD(unsigned long, caller[3], stack3)
96 TRACE_FIELD(unsigned long, caller[4], stack4)
97 TRACE_FIELD(unsigned long, caller[5], stack5)
98 TRACE_FIELD(unsigned long, caller[6], stack6)
99 TRACE_FIELD(unsigned long, caller[7], stack7)
100 ),
101 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
102 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
103);
104
105TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
106 TRACE_STRUCT(
107 TRACE_FIELD(unsigned long, ip, ip)
108 TRACE_FIELD(char *, fmt, fmt)
109 TRACE_FIELD_ZERO_CHAR(buf)
110 ),
111 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
112);
113
114TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
115 TRACE_STRUCT(
116 TRACE_FIELD(unsigned long, ip, ip)
117 TRACE_FIELD_ZERO_CHAR(buf)
118 ),
119 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
120);
121
122TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
123 TRACE_STRUCT(
124 TRACE_FIELD(unsigned int, line, line)
125 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func)
126 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file)
127 TRACE_FIELD(char, correct, correct)
128 ),
129 TP_RAW_FMT("%u:%s:%s (%u)")
130);
131
132TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
133 TRACE_STRUCT(
134 TRACE_FIELD(u64, from, from)
135 TRACE_FIELD(u64, to, to)
136 ),
137 TP_RAW_FMT("from: %llx to: %llx")
138);
139
140TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
141 TRACE_STRUCT(
142 TRACE_FIELD(ktime_t, state_data.stamp, stamp)
143 TRACE_FIELD(ktime_t, state_data.end, end)
144 TRACE_FIELD(int, state_data.type, type)
145 TRACE_FIELD(int, state_data.state, state)
146 ),
147 TP_RAW_FMT("%llx->%llx type:%u state:%u")
148);
149
150TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
151 TRACE_STRUCT(
152 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
153 TRACE_FIELD(unsigned long, call_site, call_site)
154 TRACE_FIELD(const void *, ptr, ptr)
155 TRACE_FIELD(size_t, bytes_req, bytes_req)
156 TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
157 TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
158 TRACE_FIELD(int, node, node)
159 ),
160 TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
161 " flags:%x node:%d")
162);
163
164TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
165 TRACE_STRUCT(
166 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
167 TRACE_FIELD(unsigned long, call_site, call_site)
168 TRACE_FIELD(const void *, ptr, ptr)
169 ),
170 TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
171);
172
173#undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
new file mode 100644
index 000000000000..64ec4d278ffb
--- /dev/null
+++ b/kernel/trace/trace_events.c
@@ -0,0 +1,824 @@
1/*
2 * event tracer
3 *
4 * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
5 *
6 * - Added format output of fields of the trace point.
7 * This was based off of work by Tom Zanussi <tzanussi@gmail.com>.
8 *
9 */
10
11#include <linux/debugfs.h>
12#include <linux/uaccess.h>
13#include <linux/module.h>
14#include <linux/ctype.h>
15
16#include "trace_output.h"
17
18#define TRACE_SYSTEM "TRACE_SYSTEM"
19
20static DEFINE_MUTEX(event_mutex);
21
22int trace_define_field(struct ftrace_event_call *call, char *type,
23 char *name, int offset, int size)
24{
25 struct ftrace_event_field *field;
26
27 field = kzalloc(sizeof(*field), GFP_KERNEL);
28 if (!field)
29 goto err;
30
31 field->name = kstrdup(name, GFP_KERNEL);
32 if (!field->name)
33 goto err;
34
35 field->type = kstrdup(type, GFP_KERNEL);
36 if (!field->type)
37 goto err;
38
39 field->offset = offset;
40 field->size = size;
41 list_add(&field->link, &call->fields);
42
43 return 0;
44
45err:
46 if (field) {
47 kfree(field->name);
48 kfree(field->type);
49 }
50 kfree(field);
51
52 return -ENOMEM;
53}
54
55static void ftrace_clear_events(void)
56{
57 struct ftrace_event_call *call = (void *)__start_ftrace_events;
58
59
60 while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
61
62 if (call->enabled) {
63 call->enabled = 0;
64 call->unregfunc();
65 }
66 call++;
67 }
68}
69
70static void ftrace_event_enable_disable(struct ftrace_event_call *call,
71 int enable)
72{
73
74 switch (enable) {
75 case 0:
76 if (call->enabled) {
77 call->enabled = 0;
78 call->unregfunc();
79 }
80 break;
81 case 1:
82 if (!call->enabled) {
83 call->enabled = 1;
84 call->regfunc();
85 }
86 break;
87 }
88}
89
90static int ftrace_set_clr_event(char *buf, int set)
91{
92 struct ftrace_event_call *call = __start_ftrace_events;
93 char *event = NULL, *sub = NULL, *match;
94 int ret = -EINVAL;
95
96 /*
97 * The buf format can be <subsystem>:<event-name>
98 * *:<event-name> means any event by that name.
99 * :<event-name> is the same.
100 *
101 * <subsystem>:* means all events in that subsystem
102 * <subsystem>: means the same.
103 *
104 * <name> (no ':') means all events in a subsystem with
105 * the name <name> or any event that matches <name>
106 */
107
108 match = strsep(&buf, ":");
109 if (buf) {
110 sub = match;
111 event = buf;
112 match = NULL;
113
114 if (!strlen(sub) || strcmp(sub, "*") == 0)
115 sub = NULL;
116 if (!strlen(event) || strcmp(event, "*") == 0)
117 event = NULL;
118 }
119
120 mutex_lock(&event_mutex);
121 for_each_event(call) {
122
123 if (!call->name || !call->regfunc)
124 continue;
125
126 if (match &&
127 strcmp(match, call->name) != 0 &&
128 strcmp(match, call->system) != 0)
129 continue;
130
131 if (sub && strcmp(sub, call->system) != 0)
132 continue;
133
134 if (event && strcmp(event, call->name) != 0)
135 continue;
136
137 ftrace_event_enable_disable(call, set);
138
139 ret = 0;
140 }
141 mutex_unlock(&event_mutex);
142
143 return ret;
144}
145
146/* 128 should be much more than enough */
147#define EVENT_BUF_SIZE 127
148
149static ssize_t
150ftrace_event_write(struct file *file, const char __user *ubuf,
151 size_t cnt, loff_t *ppos)
152{
153 size_t read = 0;
154 int i, set = 1;
155 ssize_t ret;
156 char *buf;
157 char ch;
158
159 if (!cnt || cnt < 0)
160 return 0;
161
162 ret = tracing_update_buffers();
163 if (ret < 0)
164 return ret;
165
166 ret = get_user(ch, ubuf++);
167 if (ret)
168 return ret;
169 read++;
170 cnt--;
171
172 /* skip white space */
173 while (cnt && isspace(ch)) {
174 ret = get_user(ch, ubuf++);
175 if (ret)
176 return ret;
177 read++;
178 cnt--;
179 }
180
181 /* Only white space found? */
182 if (isspace(ch)) {
183 file->f_pos += read;
184 ret = read;
185 return ret;
186 }
187
188 buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
189 if (!buf)
190 return -ENOMEM;
191
192 if (cnt > EVENT_BUF_SIZE)
193 cnt = EVENT_BUF_SIZE;
194
195 i = 0;
196 while (cnt && !isspace(ch)) {
197 if (!i && ch == '!')
198 set = 0;
199 else
200 buf[i++] = ch;
201
202 ret = get_user(ch, ubuf++);
203 if (ret)
204 goto out_free;
205 read++;
206 cnt--;
207 }
208 buf[i] = 0;
209
210 file->f_pos += read;
211
212 ret = ftrace_set_clr_event(buf, set);
213 if (ret)
214 goto out_free;
215
216 ret = read;
217
218 out_free:
219 kfree(buf);
220
221 return ret;
222}
223
224static void *
225t_next(struct seq_file *m, void *v, loff_t *pos)
226{
227 struct ftrace_event_call *call = m->private;
228 struct ftrace_event_call *next = call;
229
230 (*pos)++;
231
232 for (;;) {
233 if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
234 return NULL;
235
236 /*
237 * The ftrace subsystem is for showing formats only.
238 * They can not be enabled or disabled via the event files.
239 */
240 if (call->regfunc)
241 break;
242
243 call++;
244 next = call;
245 }
246
247 m->private = ++next;
248
249 return call;
250}
251
252static void *t_start(struct seq_file *m, loff_t *pos)
253{
254 return t_next(m, NULL, pos);
255}
256
257static void *
258s_next(struct seq_file *m, void *v, loff_t *pos)
259{
260 struct ftrace_event_call *call = m->private;
261 struct ftrace_event_call *next;
262
263 (*pos)++;
264
265 retry:
266 if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
267 return NULL;
268
269 if (!call->enabled) {
270 call++;
271 goto retry;
272 }
273
274 next = call;
275 m->private = ++next;
276
277 return call;
278}
279
280static void *s_start(struct seq_file *m, loff_t *pos)
281{
282 return s_next(m, NULL, pos);
283}
284
285static int t_show(struct seq_file *m, void *v)
286{
287 struct ftrace_event_call *call = v;
288
289 if (strcmp(call->system, TRACE_SYSTEM) != 0)
290 seq_printf(m, "%s:", call->system);
291 seq_printf(m, "%s\n", call->name);
292
293 return 0;
294}
295
/* seq_file ->stop: deliberately empty. */
static void t_stop(struct seq_file *m, void *p)
{
}
299
300static int
301ftrace_event_seq_open(struct inode *inode, struct file *file)
302{
303 int ret;
304 const struct seq_operations *seq_ops;
305
306 if ((file->f_mode & FMODE_WRITE) &&
307 !(file->f_flags & O_APPEND))
308 ftrace_clear_events();
309
310 seq_ops = inode->i_private;
311 ret = seq_open(file, seq_ops);
312 if (!ret) {
313 struct seq_file *m = file->private_data;
314
315 m->private = __start_ftrace_events;
316 }
317 return ret;
318}
319
320static ssize_t
321event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
322 loff_t *ppos)
323{
324 struct ftrace_event_call *call = filp->private_data;
325 char *buf;
326
327 if (call->enabled)
328 buf = "1\n";
329 else
330 buf = "0\n";
331
332 return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
333}
334
335static ssize_t
336event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
337 loff_t *ppos)
338{
339 struct ftrace_event_call *call = filp->private_data;
340 char buf[64];
341 unsigned long val;
342 int ret;
343
344 if (cnt >= sizeof(buf))
345 return -EINVAL;
346
347 if (copy_from_user(&buf, ubuf, cnt))
348 return -EFAULT;
349
350 buf[cnt] = 0;
351
352 ret = strict_strtoul(buf, 10, &val);
353 if (ret < 0)
354 return ret;
355
356 ret = tracing_update_buffers();
357 if (ret < 0)
358 return ret;
359
360 switch (val) {
361 case 0:
362 case 1:
363 mutex_lock(&event_mutex);
364 ftrace_event_enable_disable(call, val);
365 mutex_unlock(&event_mutex);
366 break;
367
368 default:
369 return -EINVAL;
370 }
371
372 *ppos += cnt;
373
374 return cnt;
375}
376
377#undef FIELD
378#define FIELD(type, name) \
379 #type, "common_" #name, offsetof(typeof(field), name), \
380 sizeof(field.name)
381
382static int trace_write_header(struct trace_seq *s)
383{
384 struct trace_entry field;
385
386 /* struct trace_entry */
387 return trace_seq_printf(s,
388 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
389 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
390 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
391 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
392 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
393 "\n",
394 FIELD(unsigned char, type),
395 FIELD(unsigned char, flags),
396 FIELD(unsigned char, preempt_count),
397 FIELD(int, pid),
398 FIELD(int, tgid));
399}
400
401static ssize_t
402event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
403 loff_t *ppos)
404{
405 struct ftrace_event_call *call = filp->private_data;
406 struct trace_seq *s;
407 char *buf;
408 int r;
409
410 if (*ppos)
411 return 0;
412
413 s = kmalloc(sizeof(*s), GFP_KERNEL);
414 if (!s)
415 return -ENOMEM;
416
417 trace_seq_init(s);
418
419 /* If any of the first writes fail, so will the show_format. */
420
421 trace_seq_printf(s, "name: %s\n", call->name);
422 trace_seq_printf(s, "ID: %d\n", call->id);
423 trace_seq_printf(s, "format:\n");
424 trace_write_header(s);
425
426 r = call->show_format(s);
427 if (!r) {
428 /*
429 * ug! The format output is bigger than a PAGE!!
430 */
431 buf = "FORMAT TOO BIG\n";
432 r = simple_read_from_buffer(ubuf, cnt, ppos,
433 buf, strlen(buf));
434 goto out;
435 }
436
437 r = simple_read_from_buffer(ubuf, cnt, ppos,
438 s->buffer, s->len);
439 out:
440 kfree(s);
441 return r;
442}
443
444static ssize_t
445event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
446{
447 struct ftrace_event_call *call = filp->private_data;
448 struct trace_seq *s;
449 int r;
450
451 if (*ppos)
452 return 0;
453
454 s = kmalloc(sizeof(*s), GFP_KERNEL);
455 if (!s)
456 return -ENOMEM;
457
458 trace_seq_init(s);
459 trace_seq_printf(s, "%d\n", call->id);
460
461 r = simple_read_from_buffer(ubuf, cnt, ppos,
462 s->buffer, s->len);
463 kfree(s);
464 return r;
465}
466
467static ssize_t
468event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
469 loff_t *ppos)
470{
471 struct ftrace_event_call *call = filp->private_data;
472 struct trace_seq *s;
473 int r;
474
475 if (*ppos)
476 return 0;
477
478 s = kmalloc(sizeof(*s), GFP_KERNEL);
479 if (!s)
480 return -ENOMEM;
481
482 trace_seq_init(s);
483
484 filter_print_preds(call->preds, s);
485 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
486
487 kfree(s);
488
489 return r;
490}
491
492static ssize_t
493event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
494 loff_t *ppos)
495{
496 struct ftrace_event_call *call = filp->private_data;
497 char buf[64], *pbuf = buf;
498 struct filter_pred *pred;
499 int err;
500
501 if (cnt >= sizeof(buf))
502 return -EINVAL;
503
504 if (copy_from_user(&buf, ubuf, cnt))
505 return -EFAULT;
506
507 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
508 if (!pred)
509 return -ENOMEM;
510
511 err = filter_parse(&pbuf, pred);
512 if (err < 0) {
513 filter_free_pred(pred);
514 return err;
515 }
516
517 if (pred->clear) {
518 filter_free_preds(call);
519 filter_free_pred(pred);
520 return cnt;
521 }
522
523 if (filter_add_pred(call, pred)) {
524 filter_free_pred(pred);
525 return -EINVAL;
526 }
527
528 *ppos += cnt;
529
530 return cnt;
531}
532
533static ssize_t
534subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
535 loff_t *ppos)
536{
537 struct event_subsystem *system = filp->private_data;
538 struct trace_seq *s;
539 int r;
540
541 if (*ppos)
542 return 0;
543
544 s = kmalloc(sizeof(*s), GFP_KERNEL);
545 if (!s)
546 return -ENOMEM;
547
548 trace_seq_init(s);
549
550 filter_print_preds(system->preds, s);
551 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
552
553 kfree(s);
554
555 return r;
556}
557
558static ssize_t
559subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
560 loff_t *ppos)
561{
562 struct event_subsystem *system = filp->private_data;
563 char buf[64], *pbuf = buf;
564 struct filter_pred *pred;
565 int err;
566
567 if (cnt >= sizeof(buf))
568 return -EINVAL;
569
570 if (copy_from_user(&buf, ubuf, cnt))
571 return -EFAULT;
572
573 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
574 if (!pred)
575 return -ENOMEM;
576
577 err = filter_parse(&pbuf, pred);
578 if (err < 0) {
579 filter_free_pred(pred);
580 return err;
581 }
582
583 if (pred->clear) {
584 filter_free_subsystem_preds(system);
585 filter_free_pred(pred);
586 return cnt;
587 }
588
589 if (filter_add_subsystem_pred(system, pred)) {
590 filter_free_subsystem_preds(system);
591 filter_free_pred(pred);
592 return -EINVAL;
593 }
594
595 *ppos += cnt;
596
597 return cnt;
598}
599
/* seq_file iterator passed as private data of the "available_events" file. */
600static const struct seq_operations show_event_seq_ops = {
601 .start = t_start,
602 .next = t_next,
603 .show = t_show,
604 .stop = t_stop,
605};
606
/*
 * seq_file iterator passed as private data of the "set_event" file; it
 * differs from show_event_seq_ops only in its start/next callbacks.
 */
607static const struct seq_operations show_set_event_seq_ops = {
608 .start = s_start,
609 .next = s_next,
610 .show = t_show,
611 .stop = t_stop,
612};
613
/* File operations of the read-only "available_events" file. */
614static const struct file_operations ftrace_avail_fops = {
615 .open = ftrace_event_seq_open,
616 .read = seq_read,
617 .llseek = seq_lseek,
618 .release = seq_release,
619};
620
/* File operations of "set_event": seq_file reads plus ftrace_event_write. */
621static const struct file_operations ftrace_set_event_fops = {
622 .open = ftrace_event_seq_open,
623 .read = seq_read,
624 .write = ftrace_event_write,
625 .llseek = seq_lseek,
626 .release = seq_release,
627};
628
/* File operations of the per-event "enable" file. */
629static const struct file_operations ftrace_enable_fops = {
630 .open = tracing_open_generic,
631 .read = event_enable_read,
632 .write = event_enable_write,
633};
634
/* File operations of the per-event, read-only "format" file. */
635static const struct file_operations ftrace_event_format_fops = {
636 .open = tracing_open_generic,
637 .read = event_format_read,
638};
639
/* File operations of the per-event, read-only "id" file. */
640static const struct file_operations ftrace_event_id_fops = {
641 .open = tracing_open_generic,
642 .read = event_id_read,
643};
644
/* File operations of the per-event "filter" file. */
645static const struct file_operations ftrace_event_filter_fops = {
646 .open = tracing_open_generic,
647 .read = event_filter_read,
648 .write = event_filter_write,
649};
650
/* File operations wiring subsystem_filter_read/subsystem_filter_write. */
651static const struct file_operations ftrace_subsystem_filter_fops = {
652 .open = tracing_open_generic,
653 .read = subsystem_filter_read,
654 .write = subsystem_filter_write,
655};
656
657static struct dentry *event_trace_events_dir(void)
658{
659 static struct dentry *d_tracer;
660 static struct dentry *d_events;
661
662 if (d_events)
663 return d_events;
664
665 d_tracer = tracing_init_dentry();
666 if (!d_tracer)
667 return NULL;
668
669 d_events = debugfs_create_dir("events", d_tracer);
670 if (!d_events)
671 pr_warning("Could not create debugfs "
672 "'events' directory\n");
673
674 return d_events;
675}
676
677static LIST_HEAD(event_subsystems);
678
679static struct dentry *
680event_subsystem_dir(const char *name, struct dentry *d_events)
681{
682 struct event_subsystem *system;
683
684 /* First see if we did not already create this dir */
685 list_for_each_entry(system, &event_subsystems, list) {
686 if (strcmp(system->name, name) == 0)
687 return system->entry;
688 }
689
690 /* need to create new entry */
691 system = kmalloc(sizeof(*system), GFP_KERNEL);
692 if (!system) {
693 pr_warning("No memory to create event subsystem %s\n",
694 name);
695 return d_events;
696 }
697
698 system->entry = debugfs_create_dir(name, d_events);
699 if (!system->entry) {
700 pr_warning("Could not create event subsystem %s\n",
701 name);
702 kfree(system);
703 return d_events;
704 }
705
706 system->name = name;
707 list_add(&system->list, &event_subsystems);
708
709 system->preds = NULL;
710
711 return system->entry;
712}
713
714static int
715event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
716{
717 struct dentry *entry;
718 int ret;
719
720 /*
721 * If the trace point header did not define TRACE_SYSTEM
722 * then the system would be called "TRACE_SYSTEM".
723 */
724 if (strcmp(call->system, "TRACE_SYSTEM") != 0)
725 d_events = event_subsystem_dir(call->system, d_events);
726
727 if (call->raw_init) {
728 ret = call->raw_init();
729 if (ret < 0) {
730 pr_warning("Could not initialize trace point"
731 " events/%s\n", call->name);
732 return ret;
733 }
734 }
735
736 call->dir = debugfs_create_dir(call->name, d_events);
737 if (!call->dir) {
738 pr_warning("Could not create debugfs "
739 "'%s' directory\n", call->name);
740 return -1;
741 }
742
743 if (call->regfunc) {
744 entry = debugfs_create_file("enable", 0644, call->dir, call,
745 &ftrace_enable_fops);
746 if (!entry)
747 pr_warning("Could not create debugfs "
748 "'%s/enable' entry\n", call->name);
749 }
750
751 if (call->id) {
752 entry = debugfs_create_file("id", 0444, call->dir, call,
753 &ftrace_event_id_fops);
754 if (!entry)
755 pr_warning("Could not create debugfs '%s/id' entry\n",
756 call->name);
757 }
758
759 if (call->define_fields) {
760 ret = call->define_fields();
761 if (ret < 0) {
762 pr_warning("Could not initialize trace point"
763 " events/%s\n", call->name);
764 return ret;
765 }
766 entry = debugfs_create_file("filter", 0644, call->dir, call,
767 &ftrace_event_filter_fops);
768 if (!entry)
769 pr_warning("Could not create debugfs "
770 "'%s/filter' entry\n", call->name);
771 }
772
773 /* A trace may not want to export its format */
774 if (!call->show_format)
775 return 0;
776
777 entry = debugfs_create_file("format", 0444, call->dir, call,
778 &ftrace_event_format_fops);
779 if (!entry)
780 pr_warning("Could not create debugfs "
781 "'%s/format' entry\n", call->name);
782
783 return 0;
784}
785
786static __init int event_trace_init(void)
787{
788 struct ftrace_event_call *call = __start_ftrace_events;
789 struct dentry *d_tracer;
790 struct dentry *entry;
791 struct dentry *d_events;
792
793 d_tracer = tracing_init_dentry();
794 if (!d_tracer)
795 return 0;
796
797 entry = debugfs_create_file("available_events", 0444, d_tracer,
798 (void *)&show_event_seq_ops,
799 &ftrace_avail_fops);
800 if (!entry)
801 pr_warning("Could not create debugfs "
802 "'available_events' entry\n");
803
804 entry = debugfs_create_file("set_event", 0644, d_tracer,
805 (void *)&show_set_event_seq_ops,
806 &ftrace_set_event_fops);
807 if (!entry)
808 pr_warning("Could not create debugfs "
809 "'set_event' entry\n");
810
811 d_events = event_trace_events_dir();
812 if (!d_events)
813 return 0;
814
815 for_each_event(call) {
816 /* The linker may leave blanks */
817 if (!call->name)
818 continue;
819 event_create_dir(call, d_events);
820 }
821
822 return 0;
823}
824fs_initcall(event_trace_init);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
new file mode 100644
index 000000000000..026be412f356
--- /dev/null
+++ b/kernel/trace/trace_events_filter.c
@@ -0,0 +1,427 @@
1/*
2 * trace_events_filter - generic event filtering
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
19 */
20
21#include <linux/debugfs.h>
22#include <linux/uaccess.h>
23#include <linux/module.h>
24#include <linux/ctype.h>
25
26#include "trace.h"
27#include "trace_output.h"
28
29static int filter_pred_64(struct filter_pred *pred, void *event)
30{
31 u64 *addr = (u64 *)(event + pred->offset);
32 u64 val = (u64)pred->val;
33 int match;
34
35 match = (val == *addr) ^ pred->not;
36
37 return match;
38}
39
40static int filter_pred_32(struct filter_pred *pred, void *event)
41{
42 u32 *addr = (u32 *)(event + pred->offset);
43 u32 val = (u32)pred->val;
44 int match;
45
46 match = (val == *addr) ^ pred->not;
47
48 return match;
49}
50
51static int filter_pred_16(struct filter_pred *pred, void *event)
52{
53 u16 *addr = (u16 *)(event + pred->offset);
54 u16 val = (u16)pred->val;
55 int match;
56
57 match = (val == *addr) ^ pred->not;
58
59 return match;
60}
61
62static int filter_pred_8(struct filter_pred *pred, void *event)
63{
64 u8 *addr = (u8 *)(event + pred->offset);
65 u8 val = (u8)pred->val;
66 int match;
67
68 match = (val == *addr) ^ pred->not;
69
70 return match;
71}
72
73static int filter_pred_string(struct filter_pred *pred, void *event)
74{
75 char *addr = (char *)(event + pred->offset);
76 int cmp, match;
77
78 cmp = strncmp(addr, pred->str_val, pred->str_len);
79
80 match = (!cmp) ^ pred->not;
81
82 return match;
83}
84
85/* return 1 if event matches, 0 otherwise (discard) */
86int filter_match_preds(struct ftrace_event_call *call, void *rec)
87{
88 int i, matched, and_failed = 0;
89 struct filter_pred *pred;
90
91 for (i = 0; i < MAX_FILTER_PRED; i++) {
92 if (call->preds[i]) {
93 pred = call->preds[i];
94 if (and_failed && !pred->or)
95 continue;
96 matched = pred->fn(pred, rec);
97 if (!matched && !pred->or) {
98 and_failed = 1;
99 continue;
100 } else if (matched && pred->or)
101 return 1;
102 } else
103 break;
104 }
105
106 if (and_failed)
107 return 0;
108
109 return 1;
110}
111
112void filter_print_preds(struct filter_pred **preds, struct trace_seq *s)
113{
114 char *field_name;
115 struct filter_pred *pred;
116 int i;
117
118 if (!preds) {
119 trace_seq_printf(s, "none\n");
120 return;
121 }
122
123 for (i = 0; i < MAX_FILTER_PRED; i++) {
124 if (preds[i]) {
125 pred = preds[i];
126 field_name = pred->field_name;
127 if (i)
128 trace_seq_printf(s, pred->or ? "|| " : "&& ");
129 trace_seq_printf(s, "%s ", field_name);
130 trace_seq_printf(s, pred->not ? "!= " : "== ");
131 if (pred->str_val)
132 trace_seq_printf(s, "%s\n", pred->str_val);
133 else
134 trace_seq_printf(s, "%llu\n", pred->val);
135 } else
136 break;
137 }
138}
139
140static struct ftrace_event_field *
141find_event_field(struct ftrace_event_call *call, char *name)
142{
143 struct ftrace_event_field *field;
144
145 list_for_each_entry(field, &call->fields, link) {
146 if (!strcmp(field->name, name))
147 return field;
148 }
149
150 return NULL;
151}
152
153void filter_free_pred(struct filter_pred *pred)
154{
155 if (!pred)
156 return;
157
158 kfree(pred->field_name);
159 kfree(pred->str_val);
160 kfree(pred);
161}
162
163void filter_free_preds(struct ftrace_event_call *call)
164{
165 int i;
166
167 if (call->preds) {
168 for (i = 0; i < MAX_FILTER_PRED; i++)
169 filter_free_pred(call->preds[i]);
170 kfree(call->preds);
171 call->preds = NULL;
172 }
173}
174
175void filter_free_subsystem_preds(struct event_subsystem *system)
176{
177 struct ftrace_event_call *call = __start_ftrace_events;
178 int i;
179
180 if (system->preds) {
181 for (i = 0; i < MAX_FILTER_PRED; i++)
182 filter_free_pred(system->preds[i]);
183 kfree(system->preds);
184 system->preds = NULL;
185 }
186
187 events_for_each(call) {
188 if (!call->name || !call->regfunc)
189 continue;
190
191 if (!strcmp(call->system, system->name))
192 filter_free_preds(call);
193 }
194}
195
196static int __filter_add_pred(struct ftrace_event_call *call,
197 struct filter_pred *pred)
198{
199 int i;
200
201 if (call->preds && !pred->compound)
202 filter_free_preds(call);
203
204 if (!call->preds) {
205 call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
206 GFP_KERNEL);
207 if (!call->preds)
208 return -ENOMEM;
209 }
210
211 for (i = 0; i < MAX_FILTER_PRED; i++) {
212 if (!call->preds[i]) {
213 call->preds[i] = pred;
214 return 0;
215 }
216 }
217
218 return -ENOMEM;
219}
220
/*
 * Decide from a field's type string whether it is a character array:
 * returns 1 if @type contains both a '[' and the substring "char",
 * 0 otherwise.
 */
static int is_string_field(const char *type)
{
	return strchr(type, '[') != NULL && strstr(type, "char") != NULL;
}
228
229int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
230{
231 struct ftrace_event_field *field;
232
233 field = find_event_field(call, pred->field_name);
234 if (!field)
235 return -EINVAL;
236
237 pred->offset = field->offset;
238
239 if (is_string_field(field->type)) {
240 if (!pred->str_val)
241 return -EINVAL;
242 pred->fn = filter_pred_string;
243 pred->str_len = field->size;
244 return __filter_add_pred(call, pred);
245 } else {
246 if (pred->str_val)
247 return -EINVAL;
248 }
249
250 switch (field->size) {
251 case 8:
252 pred->fn = filter_pred_64;
253 break;
254 case 4:
255 pred->fn = filter_pred_32;
256 break;
257 case 2:
258 pred->fn = filter_pred_16;
259 break;
260 case 1:
261 pred->fn = filter_pred_8;
262 break;
263 default:
264 return -EINVAL;
265 }
266
267 return __filter_add_pred(call, pred);
268}
269
270static struct filter_pred *copy_pred(struct filter_pred *pred)
271{
272 struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL);
273 if (!new_pred)
274 return NULL;
275
276 memcpy(new_pred, pred, sizeof(*pred));
277
278 if (pred->field_name) {
279 new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
280 if (!new_pred->field_name) {
281 kfree(new_pred);
282 return NULL;
283 }
284 }
285
286 if (pred->str_val) {
287 new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL);
288 if (!new_pred->str_val) {
289 filter_free_pred(new_pred);
290 return NULL;
291 }
292 }
293
294 return new_pred;
295}
296
297int filter_add_subsystem_pred(struct event_subsystem *system,
298 struct filter_pred *pred)
299{
300 struct ftrace_event_call *call = __start_ftrace_events;
301 struct filter_pred *event_pred;
302 int i;
303
304 if (system->preds && !pred->compound)
305 filter_free_subsystem_preds(system);
306
307 if (!system->preds) {
308 system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
309 GFP_KERNEL);
310 if (!system->preds)
311 return -ENOMEM;
312 }
313
314 for (i = 0; i < MAX_FILTER_PRED; i++) {
315 if (!system->preds[i]) {
316 system->preds[i] = pred;
317 break;
318 }
319 }
320
321 if (i == MAX_FILTER_PRED)
322 return -EINVAL;
323
324 events_for_each(call) {
325 int err;
326
327 if (!call->name || !call->regfunc)
328 continue;
329
330 if (strcmp(call->system, system->name))
331 continue;
332
333 if (!find_event_field(call, pred->field_name))
334 continue;
335
336 event_pred = copy_pred(pred);
337 if (!event_pred)
338 goto oom;
339
340 err = filter_add_pred(call, event_pred);
341 if (err)
342 filter_free_pred(event_pred);
343 if (err == -ENOMEM)
344 goto oom;
345 }
346
347 return 0;
348
349oom:
350 system->preds[i] = NULL;
351 return -ENOMEM;
352}
353
354int filter_parse(char **pbuf, struct filter_pred *pred)
355{
356 char *tmp, *tok, *val_str = NULL;
357 int tok_n = 0;
358
359 /* field ==/!= number, or/and field ==/!= number, number */
360 while ((tok = strsep(pbuf, " \n"))) {
361 if (tok_n == 0) {
362 if (!strcmp(tok, "0")) {
363 pred->clear = 1;
364 return 0;
365 } else if (!strcmp(tok, "&&")) {
366 pred->or = 0;
367 pred->compound = 1;
368 } else if (!strcmp(tok, "||")) {
369 pred->or = 1;
370 pred->compound = 1;
371 } else
372 pred->field_name = tok;
373 tok_n = 1;
374 continue;
375 }
376 if (tok_n == 1) {
377 if (!pred->field_name)
378 pred->field_name = tok;
379 else if (!strcmp(tok, "!="))
380 pred->not = 1;
381 else if (!strcmp(tok, "=="))
382 pred->not = 0;
383 else {
384 pred->field_name = NULL;
385 return -EINVAL;
386 }
387 tok_n = 2;
388 continue;
389 }
390 if (tok_n == 2) {
391 if (pred->compound) {
392 if (!strcmp(tok, "!="))
393 pred->not = 1;
394 else if (!strcmp(tok, "=="))
395 pred->not = 0;
396 else {
397 pred->field_name = NULL;
398 return -EINVAL;
399 }
400 } else {
401 val_str = tok;
402 break; /* done */
403 }
404 tok_n = 3;
405 continue;
406 }
407 if (tok_n == 3) {
408 val_str = tok;
409 break; /* done */
410 }
411 }
412
413 pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
414 if (!pred->field_name)
415 return -ENOMEM;
416
417 pred->val = simple_strtoull(val_str, &tmp, 10);
418 if (tmp == val_str) {
419 pred->str_val = kstrdup(val_str, GFP_KERNEL);
420 if (!pred->str_val)
421 return -ENOMEM;
422 }
423
424 return 0;
425}
426
427
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
new file mode 100644
index 000000000000..38985f9b379c
--- /dev/null
+++ b/kernel/trace/trace_events_stage_1.h
@@ -0,0 +1,39 @@
1/*
2 * Stage 1 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * struct ftrace_raw_<call> {
7 * struct trace_entry ent;
8 * <type> <item>;
9 * <type2> <item2>[<len>];
10 * [...]
11 * };
12 *
13 * The <type> <item> is created by the __field(type, item) macro or
14 * the __array(type2, item2, len) macro.
15 * We simply do "type item;", and that will create the fields
16 * in the structure.
17 */
18
/* TRACE_FORMAT events contribute nothing in this stage. */
19#undef TRACE_FORMAT
20#define TRACE_FORMAT(call, proto, args, fmt)
21
/* An __array() entry becomes an array member of the raw struct. */
22#undef __array
23#define __array(type, item, len) type item[len];
24
/* A __field() entry becomes a plain member of the raw struct. */
25#undef __field
26#define __field(type, item) type item;
27
/* The struct-entry wrapper just passes its member list through. */
28#undef TP_STRUCT__entry
29#define TP_STRUCT__entry(args...) args
30
/*
 * Each TRACE_EVENT declares struct ftrace_raw_<name> (a trace_entry header
 * followed by the event's members) and forward-declares event_<name>.
 */
31#undef TRACE_EVENT
32#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
33 struct ftrace_raw_##name { \
34 struct trace_entry ent; \
35 tstruct \
36 }; \
37 static struct ftrace_event_call event_##name
38
39#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
new file mode 100644
index 000000000000..30743f7d4110
--- /dev/null
+++ b/kernel/trace/trace_events_stage_2.h
@@ -0,0 +1,176 @@
1/*
2 * Stage 2 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * enum print_line_t
7 * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
8 * {
9 * struct trace_seq *s = &iter->seq;
10 * struct ftrace_raw_<call> *field; <-- defined in stage 1
11 * struct trace_entry *entry;
12 * int ret;
13 *
14 * entry = iter->ent;
15 *
16 * if (entry->type != event_<call>.id) {
17 * WARN_ON_ONCE(1);
18 * return TRACE_TYPE_UNHANDLED;
19 * }
20 *
21 * field = (typeof(field))entry;
22 *
23 * ret = trace_seq_printf(s, <TP_printk> "\n");
24 * if (!ret)
25 * return TRACE_TYPE_PARTIAL_LINE;
26 *
27 * return TRACE_TYPE_HANDLED;
28 * }
29 *
30 * This is the method used to print the raw event to the trace
31 * output format. Note, this is not needed if the data is read
32 * in binary.
33 */
34
/* In the print arguments, __entry refers to the local 'field' pointer. */
35#undef __entry
36#define __entry field
37
/* TP_printk expands to the format string plus a newline, then its args. */
38#undef TP_printk
39#define TP_printk(fmt, args...) fmt "\n", args
40
/*
 * Generate ftrace_raw_output_<call>(): checks that the entry's type is
 * this event's id (warning once and returning TRACE_TYPE_UNHANDLED if
 * not), then prints "<call>: " followed by the event's TP_printk output.
 * Returns TRACE_TYPE_PARTIAL_LINE when trace_seq_printf() returns 0.
 */
41#undef TRACE_EVENT
42#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
43enum print_line_t \
44ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
45{ \
46 struct trace_seq *s = &iter->seq; \
47 struct ftrace_raw_##call *field; \
48 struct trace_entry *entry; \
49 int ret; \
50 \
51 entry = iter->ent; \
52 \
53 if (entry->type != event_##call.id) { \
54 WARN_ON_ONCE(1); \
55 return TRACE_TYPE_UNHANDLED; \
56 } \
57 \
58 field = (typeof(field))entry; \
59 \
60 ret = trace_seq_printf(s, #call ": " print); \
61 if (!ret) \
62 return TRACE_TYPE_PARTIAL_LINE; \
63 \
64 return TRACE_TYPE_HANDLED; \
65}
66
67#include <trace/trace_event_types.h>
68
69/*
70 * Setup the showing format of trace point.
71 *
72 * int
73 * ftrace_format_##call(struct trace_seq *s)
74 * {
75 * struct ftrace_raw_##call field;
76 * int ret;
77 *
78 * ret = trace_seq_printf(s, #type " " #item ";"
79 * " offset:%u; size:%u;\n",
80 * offsetof(struct ftrace_raw_##call, item),
81 * sizeof(field.type));
82 *
83 * }
84 */
85
86#undef TP_STRUCT__entry
87#define TP_STRUCT__entry(args...) args
88
/*
 * Print one "field:<type> <item>; offset; size" line for a scalar member;
 * makes the enclosing function return 0 if trace_seq_printf() returns 0.
 */
89#undef __field
90#define __field(type, item) \
91 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
92 "offset:%u;\tsize:%u;\n", \
93 (unsigned int)offsetof(typeof(field), item), \
94 (unsigned int)sizeof(field.item)); \
95 if (!ret) \
96 return 0;
97
/* Same as __field, but the printed name carries the "[len]" suffix. */
98#undef __array
99#define __array(type, item, len) \
100 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
101 "offset:%u;\tsize:%u;\n", \
102 (unsigned int)offsetof(typeof(field), item), \
103 (unsigned int)sizeof(field.item)); \
104 if (!ret) \
105 return 0;
106
/* In the exported print format, __entry is rendered as the text "REC". */
107#undef __entry
108#define __entry "REC"
109
/* The format and its arguments are stringified rather than evaluated. */
110#undef TP_printk
111#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args
112
113#undef TP_fast_assign
114#define TP_fast_assign(args...) args
115
/*
 * Generate ftrace_format_<call>(): prints one line per member (via the
 * __field/__array macros above) followed by the "print fmt:" line.
 * NOTE(review): 'ret' is only assigned by the member macros, so it looks
 * uninitialised if an event declares no members, and the result of the
 * final trace_seq_printf() is not checked - confirm against callers.
 */
116#undef TRACE_EVENT
117#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
118static int \
119ftrace_format_##call(struct trace_seq *s) \
120{ \
121 struct ftrace_raw_##call field; \
122 int ret; \
123 \
124 tstruct; \
125 \
126 trace_seq_printf(s, "\nprint fmt: " print); \
127 \
128 return ret; \
129}
130
131#include <trace/trace_event_types.h>
132
/*
 * Register a scalar member with trace_define_field(); a non-zero result
 * is returned from the enclosing function immediately.
 */
133#undef __field
134#define __field(type, item) \
135 ret = trace_define_field(event_call, #type, #item, \
136 offsetof(typeof(field), item), \
137 sizeof(field.item)); \
138 if (ret) \
139 return ret;
140
/* Array member: the "[len]" suffix is appended to the type string. */
141#undef __array
142#define __array(type, item, len) \
143 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
144 offsetof(typeof(field), item), \
145 sizeof(field.item)); \
146 if (ret) \
147 return ret;
148
/*
 * Member of the embedded trace_entry header ('ent'); registered under the
 * name "common_<item>" with an offset relative to the header.
 */
149#define __common_field(type, item) \
150 ret = trace_define_field(event_call, #type, "common_" #item, \
151 offsetof(typeof(field.ent), item), \
152 sizeof(field.ent.item)); \
153 if (ret) \
154 return ret;
155
/*
 * Generate ftrace_define_fields_<call>(): registers the five common
 * header fields and then the event's own members. Returns the first
 * non-zero trace_define_field() result, otherwise the last result.
 */
156#undef TRACE_EVENT
157#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
158int \
159ftrace_define_fields_##call(void) \
160{ \
161 struct ftrace_raw_##call field; \
162 struct ftrace_event_call *event_call = &event_##call; \
163 int ret; \
164 \
165 __common_field(unsigned char, type); \
166 __common_field(unsigned char, flags); \
167 __common_field(unsigned char, preempt_count); \
168 __common_field(int, pid); \
169 __common_field(int, tgid); \
170 \
171 tstruct; \
172 \
173 return ret; \
174}
175
176#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
new file mode 100644
index 000000000000..9d2fa78cecca
--- /dev/null
+++ b/kernel/trace/trace_events_stage_3.h
@@ -0,0 +1,281 @@
1/*
2 * Stage 3 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * static void ftrace_event_<call>(proto)
7 * {
8 * event_trace_printk(_RET_IP_, "<call>: " <fmt>);
9 * }
10 *
11 * static int ftrace_reg_event_<call>(void)
12 * {
13 * int ret;
14 *
15 * ret = register_trace_<call>(ftrace_event_<call>);
16 * if (ret)
17 * pr_info("event trace: Could not activate trace point "
18 * "probe to <call>");
19 * return ret;
20 * }
21 *
22 * static void ftrace_unreg_event_<call>(void)
23 * {
24 * unregister_trace_<call>(ftrace_event_<call>);
25 * }
26 *
27 * For those macros defined with TRACE_FORMAT:
28 *
29 * static struct ftrace_event_call __used
30 * __attribute__((__aligned__(4)))
31 * __attribute__((section("_ftrace_events"))) event_<call> = {
32 * .name = "<call>",
33 * .regfunc = ftrace_reg_event_<call>,
34 * .unregfunc = ftrace_unreg_event_<call>,
35 * }
36 *
37 *
38 * For those macros defined with TRACE_EVENT:
39 *
40 * static struct ftrace_event_call event_<call>;
41 *
42 * static void ftrace_raw_event_<call>(proto)
43 * {
44 * struct ring_buffer_event *event;
45 * struct ftrace_raw_<call> *entry; <-- defined in stage 1
46 * unsigned long irq_flags;
47 * int pc;
48 *
49 * local_save_flags(irq_flags);
50 * pc = preempt_count();
51 *
52 * event = trace_current_buffer_lock_reserve(event_<call>.id,
53 * sizeof(struct ftrace_raw_<call>),
54 * irq_flags, pc);
55 * if (!event)
56 * return;
57 * entry = ring_buffer_event_data(event);
58 *
59 * <assign>; <-- Here we assign the entries by the __field and
60 * __array macros.
61 *
62 * trace_nowake_buffer_unlock_commit(event, irq_flags, pc);
63 * }
64 *
65 * static int ftrace_raw_reg_event_<call>(void)
66 * {
67 * int ret;
68 *
69 * ret = register_trace_<call>(ftrace_raw_event_<call>);
70 * if (ret)
71 * pr_info("event trace: Could not activate trace point "
72 * "probe to <call>");
73 * return ret;
74 * }
75 *
76 * static void ftrace_raw_unreg_event_<call>(void)
77 * {
78 * unregister_trace_<call>(ftrace_raw_event_<call>);
79 * }
80 *
81 * static struct trace_event ftrace_event_type_<call> = {
82 * .trace = ftrace_raw_output_<call>, <-- stage 2
83 * };
84 *
85 * static int ftrace_raw_init_event_<call>(void)
86 * {
87 * int id;
88 *
89 * id = register_ftrace_event(&ftrace_event_type_<call>);
90 * if (!id)
91 * return -ENODEV;
92 * event_<call>.id = id;
93 * return 0;
94 * }
95 *
96 * static struct ftrace_event_call __used
97 * __attribute__((__aligned__(4)))
98 * __attribute__((section("_ftrace_events"))) event_<call> = {
99 * .name = "<call>",
100 * .system = "<system>",
101 * .raw_init = ftrace_raw_init_event_<call>,
102 * .regfunc = ftrace_raw_reg_event_<call>,
103 * .unregfunc = ftrace_raw_unreg_event_<call>,
104 * .show_format = ftrace_format_<call>,
105 * }
106 *
107 */
108
/* TP_FMT appends a newline to the format and forwards any arguments. */
109#undef TP_FMT
110#define TP_FMT(fmt, args...) fmt "\n", ##args
111
/*
 * With CONFIG_EVENT_PROFILE, each event gets a probe that reports the
 * event id to perf_tpcounter_event(), plus enable/disable helpers that
 * reference-count the probe through call->profile_count. The counter
 * starts at -1 (see _TRACE_PROFILE_INIT), so the probe is registered when
 * the increment yields 0 and unregistered when the decrement goes
 * negative again.
 */
112#ifdef CONFIG_EVENT_PROFILE
113#define _TRACE_PROFILE(call, proto, args) \
114static void ftrace_profile_##call(proto) \
115{ \
116 extern void perf_tpcounter_event(int); \
117 perf_tpcounter_event(event_##call.id); \
118} \
119 \
120static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
121{ \
122 int ret = 0; \
123 \
124 if (!atomic_inc_return(&call->profile_count)) \
125 ret = register_trace_##call(ftrace_profile_##call); \
126 \
127 return ret; \
128} \
129 \
130static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
131{ \
132 if (atomic_add_negative(-1, &call->profile_count)) \
133 unregister_trace_##call(ftrace_profile_##call); \
134}
135
/* Initialisers for the profiling members of struct ftrace_event_call. */
136#define _TRACE_PROFILE_INIT(call) \
137 .profile_count = ATOMIC_INIT(-1), \
138 .profile_enable = ftrace_profile_enable_##call, \
139 .profile_disable = ftrace_profile_disable_##call,
140
/* Without CONFIG_EVENT_PROFILE both helpers expand to nothing. */
141#else
142#define _TRACE_PROFILE(call, proto, args)
143#define _TRACE_PROFILE_INIT(call)
144#endif
145
/*
 * Helpers generated for a TRACE_FORMAT event: a probe that prints
 * "<call>: <fmt>" through event_trace_printk(), register/unregister
 * wrappers for that probe (registration failure is logged with pr_info),
 * and an init function that obtains an id from register_ftrace_event()
 * and stores it in event_<call>.id, returning -ENODEV if the id is 0.
 */
146#define _TRACE_FORMAT(call, proto, args, fmt) \
147static void ftrace_event_##call(proto) \
148{ \
149 event_trace_printk(_RET_IP_, #call ": " fmt); \
150} \
151 \
152static int ftrace_reg_event_##call(void) \
153{ \
154 int ret; \
155 \
156 ret = register_trace_##call(ftrace_event_##call); \
157 if (ret) \
158 pr_info("event trace: Could not activate trace point " \
159 "probe to " #call "\n"); \
160 return ret; \
161} \
162 \
163static void ftrace_unreg_event_##call(void) \
164{ \
165 unregister_trace_##call(ftrace_event_##call); \
166} \
167 \
168static struct ftrace_event_call event_##call; \
169 \
170static int ftrace_init_event_##call(void) \
171{ \
172 int id; \
173 \
174 id = register_ftrace_event(NULL); \
175 if (!id) \
176 return -ENODEV; \
177 event_##call.id = id; \
178 return 0; \
179}
180
/*
 * TRACE_FORMAT: emit the helpers above plus the event_<call> descriptor,
 * which is placed in the "_ftrace_events" section.
 */
181#undef TRACE_FORMAT
182#define TRACE_FORMAT(call, proto, args, fmt) \
183_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \
184_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
185static struct ftrace_event_call __used \
186__attribute__((__aligned__(4))) \
187__attribute__((section("_ftrace_events"))) event_##call = { \
188 .name = #call, \
189 .system = __stringify(TRACE_SYSTEM), \
190 .raw_init = ftrace_init_event_##call, \
191 .regfunc = ftrace_reg_event_##call, \
192 .unregfunc = ftrace_unreg_event_##call, \
193 _TRACE_PROFILE_INIT(call) \
194}
195
/* In the assign block, __entry refers to the local 'entry' pointer. */
196#undef __entry
197#define __entry entry
198
/*
 * TRACE_EVENT: generate, per event,
 *   - ftrace_raw_event_<call>(): reserves a ring-buffer event sized for
 *     struct ftrace_raw_<call>, runs the assign block, calls
 *     ring_buffer_event_discard() when the event has predicates and
 *     filter_match_preds() rejects the record, then commits;
 *   - register/unregister wrappers for that probe;
 *   - ftrace_raw_init_event_<call>(): registers the output handler,
 *     stores the returned id and initialises the field list;
 *   - the event_<call> descriptor in the "_ftrace_events" section.
 * NOTE(review): the commit is issued even after a discard - presumably
 * intended by the ring-buffer API of this version; confirm.
 */
199#undef TRACE_EVENT
200#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
201_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
202 \
203static struct ftrace_event_call event_##call; \
204 \
205static void ftrace_raw_event_##call(proto) \
206{ \
207 struct ftrace_event_call *call = &event_##call; \
208 struct ring_buffer_event *event; \
209 struct ftrace_raw_##call *entry; \
210 unsigned long irq_flags; \
211 int pc; \
212 \
213 local_save_flags(irq_flags); \
214 pc = preempt_count(); \
215 \
216 event = trace_current_buffer_lock_reserve(event_##call.id, \
217 sizeof(struct ftrace_raw_##call), \
218 irq_flags, pc); \
219 if (!event) \
220 return; \
221 entry = ring_buffer_event_data(event); \
222 \
223 assign; \
224 \
225 if (call->preds && !filter_match_preds(call, entry)) \
226 ring_buffer_event_discard(event); \
227 \
228 trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
229 \
230} \
231 \
232static int ftrace_raw_reg_event_##call(void) \
233{ \
234 int ret; \
235 \
236 ret = register_trace_##call(ftrace_raw_event_##call); \
237 if (ret) \
238 pr_info("event trace: Could not activate trace point " \
239 "probe to " #call "\n"); \
240 return ret; \
241} \
242 \
243static void ftrace_raw_unreg_event_##call(void) \
244{ \
245 unregister_trace_##call(ftrace_raw_event_##call); \
246} \
247 \
248static struct trace_event ftrace_event_type_##call = { \
249 .trace = ftrace_raw_output_##call, \
250}; \
251 \
252static int ftrace_raw_init_event_##call(void) \
253{ \
254 int id; \
255 \
256 id = register_ftrace_event(&ftrace_event_type_##call); \
257 if (!id) \
258 return -ENODEV; \
259 event_##call.id = id; \
260 INIT_LIST_HEAD(&event_##call.fields); \
261 return 0; \
262} \
263 \
264static struct ftrace_event_call __used \
265__attribute__((__aligned__(4))) \
266__attribute__((section("_ftrace_events"))) event_##call = { \
267 .name = #call, \
268 .system = __stringify(TRACE_SYSTEM), \
269 .raw_init = ftrace_raw_init_event_##call, \
270 .regfunc = ftrace_raw_reg_event_##call, \
271 .unregfunc = ftrace_raw_unreg_event_##call, \
272 .show_format = ftrace_format_##call, \
273 .define_fields = ftrace_define_fields_##call, \
274 _TRACE_PROFILE_INIT(call) \
275}
276
277#include <trace/trace_event_types.h>
278
279#undef _TRACE_PROFILE
280#undef _TRACE_PROFILE_INIT
281
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
new file mode 100644
index 000000000000..07a22c33ebf3
--- /dev/null
+++ b/kernel/trace/trace_export.c
@@ -0,0 +1,102 @@
1/*
2 * trace_export.c - export basic ftrace utilities to user space
3 *
4 * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
5 */
6#include <linux/stringify.h>
7#include <linux/kallsyms.h>
8#include <linux/seq_file.h>
9#include <linux/debugfs.h>
10#include <linux/uaccess.h>
11#include <linux/ftrace.h>
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/fs.h>
15
16#include "trace_output.h"
17
18
/* First pass: the field macros print a description of each member. */
19#undef TRACE_STRUCT
20#define TRACE_STRUCT(args...) args
21
/*
 * Print "field:<type> <item>" with the member's offset and size; makes
 * the enclosing function return 0 if trace_seq_printf() returns 0.
 */
22#undef TRACE_FIELD
23#define TRACE_FIELD(type, item, assign) \
24 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
25 "offset:%u;\tsize:%u;\n", \
26 (unsigned int)offsetof(typeof(field), item), \
27 (unsigned int)sizeof(field.item)); \
28 if (!ret) \
29 return 0;
30
31
/* As TRACE_FIELD, but the whole declaration text is given by type_item. */
32#undef TRACE_FIELD_SPECIAL
33#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
34 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \
35 "offset:%u;\tsize:%u;\n", \
36 (unsigned int)offsetof(typeof(field), item), \
37 (unsigned int)sizeof(field.item)); \
38 if (!ret) \
39 return 0;
40
/* A char member reported with a fixed size of 0. */
41#undef TRACE_FIELD_ZERO_CHAR
42#define TRACE_FIELD_ZERO_CHAR(item) \
43 ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \
44 "offset:%u;\tsize:0;\n", \
45 (unsigned int)offsetof(typeof(field), item)); \
46 if (!ret) \
47 return 0;
48
49
50#undef TP_RAW_FMT
51#define TP_RAW_FMT(args...) args
52
/*
 * Generate ftrace_format_<call>(): prints the member descriptions of
 * 'struct args' followed by the quoted print format.
 * NOTE(review): 'ret' is only assigned by the field macros, so it looks
 * uninitialised if tstruct is empty - confirm.
 */
53#undef TRACE_EVENT_FORMAT
54#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
55static int \
56ftrace_format_##call(struct trace_seq *s) \
57{ \
58 struct args field; \
59 int ret; \
60 \
61 tstruct; \
62 \
63 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
64 \
65 return ret; \
66}
67
68#include "trace_event_types.h"
69
/*
 * Second pass over trace_event_types.h: the field macros now expand to
 * assignments into 'entry' instead of format output.
 */
#undef TRACE_ZERO_CHAR
#define TRACE_ZERO_CHAR(arg)

/* Assign one member of the entry (this pair was previously defined twice). */
#undef TRACE_FIELD
#define TRACE_FIELD(type, item, assign)\
	entry->item = assign;

#undef TP_CMD
#define TP_CMD(cmd...)	cmd

#undef TRACE_ENTRY
#define TRACE_ENTRY	entry

/* Special members are filled in by running the supplied statement. */
#undef TRACE_FIELD_SPECIAL
#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
	cmd;
90
/*
 * Emit the event_<call> descriptor into the "_ftrace_events" section.
 * The 'proto' argument is used directly as the event id.
 */
91#undef TRACE_EVENT_FORMAT
92#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
93 \
94static struct ftrace_event_call __used \
95__attribute__((__aligned__(4))) \
96__attribute__((section("_ftrace_events"))) event_##call = { \
97 .name = #call, \
98 .id = proto, \
99 .system = __stringify(TRACE_SYSTEM), \
100 .show_format = ftrace_format_##call, \
101}
102#include "trace_event_types.h"
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 9236d7e25a16..c9a0b7df44ff 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -9,6 +9,7 @@
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 William Lee Irwin III
11 */ 11 */
12#include <linux/ring_buffer.h>
12#include <linux/debugfs.h> 13#include <linux/debugfs.h>
13#include <linux/uaccess.h> 14#include <linux/uaccess.h>
14#include <linux/ftrace.h> 15#include <linux/ftrace.h>
@@ -16,52 +17,388 @@
16 17
17#include "trace.h" 18#include "trace.h"
18 19
19static void start_function_trace(struct trace_array *tr) 20/* function tracing enabled */
21static int ftrace_function_enabled;
22
23static struct trace_array *func_trace;
24
25static void tracing_start_function_trace(void);
26static void tracing_stop_function_trace(void);
27
28static int function_trace_init(struct trace_array *tr)
20{ 29{
30 func_trace = tr;
21 tr->cpu = get_cpu(); 31 tr->cpu = get_cpu();
22 tracing_reset_online_cpus(tr);
23 put_cpu(); 32 put_cpu();
24 33
25 tracing_start_cmdline_record(); 34 tracing_start_cmdline_record();
26 tracing_start_function_trace(); 35 tracing_start_function_trace();
36 return 0;
27} 37}
28 38
29static void stop_function_trace(struct trace_array *tr) 39static void function_trace_reset(struct trace_array *tr)
30{ 40{
31 tracing_stop_function_trace(); 41 tracing_stop_function_trace();
32 tracing_stop_cmdline_record(); 42 tracing_stop_cmdline_record();
33} 43}
34 44
35static int function_trace_init(struct trace_array *tr) 45static void function_trace_start(struct trace_array *tr)
36{ 46{
37 start_function_trace(tr); 47 tracing_reset_online_cpus(tr);
38 return 0;
39} 48}
40 49
41static void function_trace_reset(struct trace_array *tr) 50static void
51function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
52{
53 struct trace_array *tr = func_trace;
54 struct trace_array_cpu *data;
55 unsigned long flags;
56 long disabled;
57 int cpu, resched;
58 int pc;
59
60 if (unlikely(!ftrace_function_enabled))
61 return;
62
63 pc = preempt_count();
64 resched = ftrace_preempt_disable();
65 local_save_flags(flags);
66 cpu = raw_smp_processor_id();
67 data = tr->data[cpu];
68 disabled = atomic_inc_return(&data->disabled);
69
70 if (likely(disabled == 1))
71 trace_function(tr, ip, parent_ip, flags, pc);
72
73 atomic_dec(&data->disabled);
74 ftrace_preempt_enable(resched);
75}
76
77static void
78function_trace_call(unsigned long ip, unsigned long parent_ip)
42{ 79{
43 stop_function_trace(tr); 80 struct trace_array *tr = func_trace;
81 struct trace_array_cpu *data;
82 unsigned long flags;
83 long disabled;
84 int cpu;
85 int pc;
86
87 if (unlikely(!ftrace_function_enabled))
88 return;
89
90 /*
91 * Need to use raw, since this must be called before the
92 * recursive protection is performed.
93 */
94 local_irq_save(flags);
95 cpu = raw_smp_processor_id();
96 data = tr->data[cpu];
97 disabled = atomic_inc_return(&data->disabled);
98
99 if (likely(disabled == 1)) {
100 pc = preempt_count();
101 trace_function(tr, ip, parent_ip, flags, pc);
102 }
103
104 atomic_dec(&data->disabled);
105 local_irq_restore(flags);
44} 106}
45 107
46static void function_trace_start(struct trace_array *tr) 108static void
109function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
47{ 110{
48 tracing_reset_online_cpus(tr); 111 struct trace_array *tr = func_trace;
112 struct trace_array_cpu *data;
113 unsigned long flags;
114 long disabled;
115 int cpu;
116 int pc;
117
118 if (unlikely(!ftrace_function_enabled))
119 return;
120
121 /*
122 * Need to use raw, since this must be called before the
123 * recursive protection is performed.
124 */
125 local_irq_save(flags);
126 cpu = raw_smp_processor_id();
127 data = tr->data[cpu];
128 disabled = atomic_inc_return(&data->disabled);
129
130 if (likely(disabled == 1)) {
131 pc = preempt_count();
132 trace_function(tr, ip, parent_ip, flags, pc);
133 /*
134 * skip over 5 funcs:
135 * __ftrace_trace_stack,
136 * __trace_stack,
137 * function_stack_trace_call
138 * ftrace_list_func
139 * ftrace_call
140 */
141 __trace_stack(tr, flags, 5, pc);
142 }
143
144 atomic_dec(&data->disabled);
145 local_irq_restore(flags);
146}
147
148
149static struct ftrace_ops trace_ops __read_mostly =
150{
151 .func = function_trace_call,
152};
153
154static struct ftrace_ops trace_stack_ops __read_mostly =
155{
156 .func = function_stack_trace_call,
157};
158
159/* Our two options */
160enum {
161 TRACE_FUNC_OPT_STACK = 0x1,
162};
163
164static struct tracer_opt func_opts[] = {
165#ifdef CONFIG_STACKTRACE
166 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
167#endif
168 { } /* Always set a last empty entry */
169};
170
171static struct tracer_flags func_flags = {
172 .val = 0, /* By default: all flags disabled */
173 .opts = func_opts
174};
175
176static void tracing_start_function_trace(void)
177{
178 ftrace_function_enabled = 0;
179
180 if (trace_flags & TRACE_ITER_PREEMPTONLY)
181 trace_ops.func = function_trace_call_preempt_only;
182 else
183 trace_ops.func = function_trace_call;
184
185 if (func_flags.val & TRACE_FUNC_OPT_STACK)
186 register_ftrace_function(&trace_stack_ops);
187 else
188 register_ftrace_function(&trace_ops);
189
190 ftrace_function_enabled = 1;
191}
192
193static void tracing_stop_function_trace(void)
194{
195 ftrace_function_enabled = 0;
196 /* OK if they are not registered */
197 unregister_ftrace_function(&trace_stack_ops);
198 unregister_ftrace_function(&trace_ops);
199}
200
201static int func_set_flag(u32 old_flags, u32 bit, int set)
202{
203 if (bit == TRACE_FUNC_OPT_STACK) {
204 /* do nothing if already set */
205 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
206 return 0;
207
208 if (set) {
209 unregister_ftrace_function(&trace_ops);
210 register_ftrace_function(&trace_stack_ops);
211 } else {
212 unregister_ftrace_function(&trace_stack_ops);
213 register_ftrace_function(&trace_ops);
214 }
215
216 return 0;
217 }
218
219 return -EINVAL;
49} 220}
50 221
51static struct tracer function_trace __read_mostly = 222static struct tracer function_trace __read_mostly =
52{ 223{
53 .name = "function", 224 .name = "function",
54 .init = function_trace_init, 225 .init = function_trace_init,
55 .reset = function_trace_reset, 226 .reset = function_trace_reset,
56 .start = function_trace_start, 227 .start = function_trace_start,
228 .wait_pipe = poll_wait_pipe,
229 .flags = &func_flags,
230 .set_flag = func_set_flag,
57#ifdef CONFIG_FTRACE_SELFTEST 231#ifdef CONFIG_FTRACE_SELFTEST
58 .selftest = trace_selftest_startup_function, 232 .selftest = trace_selftest_startup_function,
59#endif 233#endif
60}; 234};
61 235
236#ifdef CONFIG_DYNAMIC_FTRACE
237static void
238ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
239{
240 long *count = (long *)data;
241
242 if (tracing_is_on())
243 return;
244
245 if (!*count)
246 return;
247
248 if (*count != -1)
249 (*count)--;
250
251 tracing_on();
252}
253
254static void
255ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
256{
257 long *count = (long *)data;
258
259 if (!tracing_is_on())
260 return;
261
262 if (!*count)
263 return;
264
265 if (*count != -1)
266 (*count)--;
267
268 tracing_off();
269}
270
271static int
272ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
273 struct ftrace_probe_ops *ops, void *data);
274
275static struct ftrace_probe_ops traceon_probe_ops = {
276 .func = ftrace_traceon,
277 .print = ftrace_trace_onoff_print,
278};
279
280static struct ftrace_probe_ops traceoff_probe_ops = {
281 .func = ftrace_traceoff,
282 .print = ftrace_trace_onoff_print,
283};
284
285static int
286ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
287 struct ftrace_probe_ops *ops, void *data)
288{
289 char str[KSYM_SYMBOL_LEN];
290 long count = (long)data;
291
292 kallsyms_lookup(ip, NULL, NULL, NULL, str);
293 seq_printf(m, "%s:", str);
294
295 if (ops == &traceon_probe_ops)
296 seq_printf(m, "traceon");
297 else
298 seq_printf(m, "traceoff");
299
300 if (count == -1)
301 seq_printf(m, ":unlimited\n");
302 else
303 seq_printf(m, ":count=%ld", count);
304 seq_putc(m, '\n');
305
306 return 0;
307}
308
309static int
310ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
311{
312 struct ftrace_probe_ops *ops;
313
314 /* we register both traceon and traceoff to this callback */
315 if (strcmp(cmd, "traceon") == 0)
316 ops = &traceon_probe_ops;
317 else
318 ops = &traceoff_probe_ops;
319
320 unregister_ftrace_function_probe_func(glob, ops);
321
322 return 0;
323}
324
325static int
326ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
327{
328 struct ftrace_probe_ops *ops;
329 void *count = (void *)-1;
330 char *number;
331 int ret;
332
333 /* hash funcs only work with set_ftrace_filter */
334 if (!enable)
335 return -EINVAL;
336
337 if (glob[0] == '!')
338 return ftrace_trace_onoff_unreg(glob+1, cmd, param);
339
340 /* we register both traceon and traceoff to this callback */
341 if (strcmp(cmd, "traceon") == 0)
342 ops = &traceon_probe_ops;
343 else
344 ops = &traceoff_probe_ops;
345
346 if (!param)
347 goto out_reg;
348
349 number = strsep(&param, ":");
350
351 if (!strlen(number))
352 goto out_reg;
353
354 /*
355 * We use the callback data field (which is a pointer)
356 * as our counter.
357 */
358 ret = strict_strtoul(number, 0, (unsigned long *)&count);
359 if (ret)
360 return ret;
361
362 out_reg:
363 ret = register_ftrace_function_probe(glob, ops, count);
364
365 return ret;
366}
367
368static struct ftrace_func_command ftrace_traceon_cmd = {
369 .name = "traceon",
370 .func = ftrace_trace_onoff_callback,
371};
372
373static struct ftrace_func_command ftrace_traceoff_cmd = {
374 .name = "traceoff",
375 .func = ftrace_trace_onoff_callback,
376};
377
378static int __init init_func_cmd_traceon(void)
379{
380 int ret;
381
382 ret = register_ftrace_command(&ftrace_traceoff_cmd);
383 if (ret)
384 return ret;
385
386 ret = register_ftrace_command(&ftrace_traceon_cmd);
387 if (ret)
388 unregister_ftrace_command(&ftrace_traceoff_cmd);
389 return ret;
390}
391#else
392static inline int init_func_cmd_traceon(void)
393{
394 return 0;
395}
396#endif /* CONFIG_DYNAMIC_FTRACE */
397
62static __init int init_function_trace(void) 398static __init int init_function_trace(void)
63{ 399{
400 init_func_cmd_traceon();
64 return register_tracer(&function_trace); 401 return register_tracer(&function_trace);
65} 402}
66
67device_initcall(init_function_trace); 403device_initcall(init_function_trace);
404
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 930c08e5b38e..d28687e7b3a7 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * 2 *
3 * Function graph tracer. 3 * Function graph tracer.
4 * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com> 4 * Copyright (c) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
5 * Mostly borrowed from function tracer which 5 * Mostly borrowed from function tracer which
6 * is Copyright (c) Steven Rostedt <srostedt@redhat.com> 6 * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
7 * 7 *
@@ -12,6 +12,12 @@
12#include <linux/fs.h> 12#include <linux/fs.h>
13 13
14#include "trace.h" 14#include "trace.h"
15#include "trace_output.h"
16
17struct fgraph_data {
18 pid_t last_pid;
19 int depth;
20};
15 21
16#define TRACE_GRAPH_INDENT 2 22#define TRACE_GRAPH_INDENT 2
17 23
@@ -20,9 +26,11 @@
20#define TRACE_GRAPH_PRINT_CPU 0x2 26#define TRACE_GRAPH_PRINT_CPU 0x2
21#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 27#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
22#define TRACE_GRAPH_PRINT_PROC 0x8 28#define TRACE_GRAPH_PRINT_PROC 0x8
29#define TRACE_GRAPH_PRINT_DURATION 0x10
30#define TRACE_GRAPH_PRINT_ABS_TIME 0X20
23 31
24static struct tracer_opt trace_opts[] = { 32static struct tracer_opt trace_opts[] = {
25 /* Display overruns ? */ 33 /* Display overruns? (for self-debug purpose) */
26 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, 34 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
27 /* Display CPU ? */ 35 /* Display CPU ? */
28 { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, 36 { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
@@ -30,26 +38,103 @@ static struct tracer_opt trace_opts[] = {
30 { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, 38 { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
31 /* Display proc name/pid */ 39 /* Display proc name/pid */
32 { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) }, 40 { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) },
41 /* Display duration of execution */
42 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
43 /* Display absolute time of an entry */
44 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
33 { } /* Empty entry */ 45 { } /* Empty entry */
34}; 46};
35 47
36static struct tracer_flags tracer_flags = { 48static struct tracer_flags tracer_flags = {
37 /* Don't display overruns and proc by default */ 49 /* Don't display overruns and proc by default */
38 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD, 50 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
51 TRACE_GRAPH_PRINT_DURATION,
39 .opts = trace_opts 52 .opts = trace_opts
40}; 53};
41 54
42/* pid on the last trace processed */ 55/* pid on the last trace processed */
43static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
44 56
45static int graph_trace_init(struct trace_array *tr) 57
58/* Add a function return address to the trace stack on thread info.*/
59int
60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
46{ 61{
47 int cpu, ret; 62 unsigned long long calltime;
63 int index;
64
65 if (!current->ret_stack)
66 return -EBUSY;
67
68 /* The return trace stack is full */
69 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
70 atomic_inc(&current->trace_overrun);
71 return -EBUSY;
72 }
73
74 calltime = trace_clock_local();
75
76 index = ++current->curr_ret_stack;
77 barrier();
78 current->ret_stack[index].ret = ret;
79 current->ret_stack[index].func = func;
80 current->ret_stack[index].calltime = calltime;
81 *depth = index;
82
83 return 0;
84}
85
86/* Retrieve a function return address to the trace stack on thread info.*/
87void
88ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
89{
90 int index;
91
92 index = current->curr_ret_stack;
93
94 if (unlikely(index < 0)) {
95 ftrace_graph_stop();
96 WARN_ON(1);
97 /* Might as well panic, otherwise we have no where to go */
98 *ret = (unsigned long)panic;
99 return;
100 }
48 101
49 for_each_online_cpu(cpu) 102 *ret = current->ret_stack[index].ret;
50 tracing_reset(tr, cpu); 103 trace->func = current->ret_stack[index].func;
104 trace->calltime = current->ret_stack[index].calltime;
105 trace->overrun = atomic_read(&current->trace_overrun);
106 trace->depth = index;
107 barrier();
108 current->curr_ret_stack--;
51 109
52 ret = register_ftrace_graph(&trace_graph_return, 110}
111
112/*
113 * Send the trace to the ring-buffer.
114 * @return the original return address.
115 */
116unsigned long ftrace_return_to_handler(void)
117{
118 struct ftrace_graph_ret trace;
119 unsigned long ret;
120
121 ftrace_pop_return_trace(&trace, &ret);
122 trace.rettime = trace_clock_local();
123 ftrace_graph_return(&trace);
124
125 if (unlikely(!ret)) {
126 ftrace_graph_stop();
127 WARN_ON(1);
128 /* Might as well panic. What else to do? */
129 ret = (unsigned long)panic;
130 }
131
132 return ret;
133}
134
135static int graph_trace_init(struct trace_array *tr)
136{
137 int ret = register_ftrace_graph(&trace_graph_return,
53 &trace_graph_entry); 138 &trace_graph_entry);
54 if (ret) 139 if (ret)
55 return ret; 140 return ret;
@@ -112,15 +197,15 @@ print_graph_cpu(struct trace_seq *s, int cpu)
112static enum print_line_t 197static enum print_line_t
113print_graph_proc(struct trace_seq *s, pid_t pid) 198print_graph_proc(struct trace_seq *s, pid_t pid)
114{ 199{
115 int i; 200 char comm[TASK_COMM_LEN];
116 int ret;
117 int len;
118 char comm[8];
119 int spaces = 0;
120 /* sign + log10(MAX_INT) + '\0' */ 201 /* sign + log10(MAX_INT) + '\0' */
121 char pid_str[11]; 202 char pid_str[11];
203 int spaces = 0;
204 int ret;
205 int len;
206 int i;
122 207
123 strncpy(comm, trace_find_cmdline(pid), 7); 208 trace_find_cmdline(pid, comm);
124 comm[7] = '\0'; 209 comm[7] = '\0';
125 sprintf(pid_str, "%d", pid); 210 sprintf(pid_str, "%d", pid);
126 211
@@ -153,17 +238,25 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
153 238
154/* If the pid changed since the last trace, output this event */ 239/* If the pid changed since the last trace, output this event */
155static enum print_line_t 240static enum print_line_t
156verif_pid(struct trace_seq *s, pid_t pid, int cpu) 241verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
157{ 242{
158 pid_t prev_pid; 243 pid_t prev_pid;
244 pid_t *last_pid;
159 int ret; 245 int ret;
160 246
161 if (last_pid[cpu] != -1 && last_pid[cpu] == pid) 247 if (!data)
162 return TRACE_TYPE_HANDLED; 248 return TRACE_TYPE_HANDLED;
163 249
164 prev_pid = last_pid[cpu]; 250 last_pid = &(per_cpu_ptr(data, cpu)->last_pid);
165 last_pid[cpu] = pid;
166 251
252 if (*last_pid == pid)
253 return TRACE_TYPE_HANDLED;
254
255 prev_pid = *last_pid;
256 *last_pid = pid;
257
258 if (prev_pid == -1)
259 return TRACE_TYPE_HANDLED;
167/* 260/*
168 * Context-switch trace line: 261 * Context-switch trace line:
169 262
@@ -175,34 +268,34 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu)
175 ret = trace_seq_printf(s, 268 ret = trace_seq_printf(s,
176 " ------------------------------------------\n"); 269 " ------------------------------------------\n");
177 if (!ret) 270 if (!ret)
178 TRACE_TYPE_PARTIAL_LINE; 271 return TRACE_TYPE_PARTIAL_LINE;
179 272
180 ret = print_graph_cpu(s, cpu); 273 ret = print_graph_cpu(s, cpu);
181 if (ret == TRACE_TYPE_PARTIAL_LINE) 274 if (ret == TRACE_TYPE_PARTIAL_LINE)
182 TRACE_TYPE_PARTIAL_LINE; 275 return TRACE_TYPE_PARTIAL_LINE;
183 276
184 ret = print_graph_proc(s, prev_pid); 277 ret = print_graph_proc(s, prev_pid);
185 if (ret == TRACE_TYPE_PARTIAL_LINE) 278 if (ret == TRACE_TYPE_PARTIAL_LINE)
186 TRACE_TYPE_PARTIAL_LINE; 279 return TRACE_TYPE_PARTIAL_LINE;
187 280
188 ret = trace_seq_printf(s, " => "); 281 ret = trace_seq_printf(s, " => ");
189 if (!ret) 282 if (!ret)
190 TRACE_TYPE_PARTIAL_LINE; 283 return TRACE_TYPE_PARTIAL_LINE;
191 284
192 ret = print_graph_proc(s, pid); 285 ret = print_graph_proc(s, pid);
193 if (ret == TRACE_TYPE_PARTIAL_LINE) 286 if (ret == TRACE_TYPE_PARTIAL_LINE)
194 TRACE_TYPE_PARTIAL_LINE; 287 return TRACE_TYPE_PARTIAL_LINE;
195 288
196 ret = trace_seq_printf(s, 289 ret = trace_seq_printf(s,
197 "\n ------------------------------------------\n\n"); 290 "\n ------------------------------------------\n\n");
198 if (!ret) 291 if (!ret)
199 TRACE_TYPE_PARTIAL_LINE; 292 return TRACE_TYPE_PARTIAL_LINE;
200 293
201 return ret; 294 return TRACE_TYPE_HANDLED;
202} 295}
203 296
204static bool 297static struct ftrace_graph_ret_entry *
205trace_branch_is_leaf(struct trace_iterator *iter, 298get_return_for_leaf(struct trace_iterator *iter,
206 struct ftrace_graph_ent_entry *curr) 299 struct ftrace_graph_ent_entry *curr)
207{ 300{
208 struct ring_buffer_iter *ring_iter; 301 struct ring_buffer_iter *ring_iter;
@@ -211,65 +304,123 @@ trace_branch_is_leaf(struct trace_iterator *iter,
211 304
212 ring_iter = iter->buffer_iter[iter->cpu]; 305 ring_iter = iter->buffer_iter[iter->cpu];
213 306
214 if (!ring_iter) 307 /* First peek to compare current entry and the next one */
215 return false; 308 if (ring_iter)
216 309 event = ring_buffer_iter_peek(ring_iter, NULL);
217 event = ring_buffer_iter_peek(ring_iter, NULL); 310 else {
311 /* We need to consume the current entry to see the next one */
312 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
313 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
314 NULL);
315 }
218 316
219 if (!event) 317 if (!event)
220 return false; 318 return NULL;
221 319
222 next = ring_buffer_event_data(event); 320 next = ring_buffer_event_data(event);
223 321
224 if (next->ent.type != TRACE_GRAPH_RET) 322 if (next->ent.type != TRACE_GRAPH_RET)
225 return false; 323 return NULL;
226 324
227 if (curr->ent.pid != next->ent.pid || 325 if (curr->ent.pid != next->ent.pid ||
228 curr->graph_ent.func != next->ret.func) 326 curr->graph_ent.func != next->ret.func)
229 return false; 327 return NULL;
328
329 /* this is a leaf, now advance the iterator */
330 if (ring_iter)
331 ring_buffer_read(ring_iter, NULL);
332
333 return next;
334}
335
336/* Signal a overhead of time execution to the output */
337static int
338print_graph_overhead(unsigned long long duration, struct trace_seq *s)
339{
340 /* If duration disappear, we don't need anything */
341 if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION))
342 return 1;
343
344 /* Non nested entry or return */
345 if (duration == -1)
346 return trace_seq_printf(s, " ");
347
348 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
349 /* Duration exceeded 100 msecs */
350 if (duration > 100000ULL)
351 return trace_seq_printf(s, "! ");
230 352
231 return true; 353 /* Duration exceeded 10 msecs */
354 if (duration > 10000ULL)
355 return trace_seq_printf(s, "+ ");
356 }
357
358 return trace_seq_printf(s, " ");
359}
360
361static int print_graph_abs_time(u64 t, struct trace_seq *s)
362{
363 unsigned long usecs_rem;
364
365 usecs_rem = do_div(t, NSEC_PER_SEC);
366 usecs_rem /= 1000;
367
368 return trace_seq_printf(s, "%5lu.%06lu | ",
369 (unsigned long)t, usecs_rem);
232} 370}
233 371
234static enum print_line_t 372static enum print_line_t
235print_graph_irq(struct trace_seq *s, unsigned long addr, 373print_graph_irq(struct trace_iterator *iter, unsigned long addr,
236 enum trace_type type, int cpu, pid_t pid) 374 enum trace_type type, int cpu, pid_t pid)
237{ 375{
238 int ret; 376 int ret;
377 struct trace_seq *s = &iter->seq;
239 378
240 if (addr < (unsigned long)__irqentry_text_start || 379 if (addr < (unsigned long)__irqentry_text_start ||
241 addr >= (unsigned long)__irqentry_text_end) 380 addr >= (unsigned long)__irqentry_text_end)
242 return TRACE_TYPE_UNHANDLED; 381 return TRACE_TYPE_UNHANDLED;
243 382
244 if (type == TRACE_GRAPH_ENT) { 383 /* Absolute time */
245 ret = trace_seq_printf(s, "==========> | "); 384 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
246 } else { 385 ret = print_graph_abs_time(iter->ts, s);
247 /* Cpu */ 386 if (!ret)
248 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 387 return TRACE_TYPE_PARTIAL_LINE;
249 ret = print_graph_cpu(s, cpu); 388 }
250 if (ret == TRACE_TYPE_PARTIAL_LINE)
251 return TRACE_TYPE_PARTIAL_LINE;
252 }
253 /* Proc */
254 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
255 ret = print_graph_proc(s, pid);
256 if (ret == TRACE_TYPE_PARTIAL_LINE)
257 return TRACE_TYPE_PARTIAL_LINE;
258 389
259 ret = trace_seq_printf(s, " | "); 390 /* Cpu */
260 if (!ret) 391 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
261 return TRACE_TYPE_PARTIAL_LINE; 392 ret = print_graph_cpu(s, cpu);
262 } 393 if (ret == TRACE_TYPE_PARTIAL_LINE)
394 return TRACE_TYPE_PARTIAL_LINE;
395 }
396 /* Proc */
397 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
398 ret = print_graph_proc(s, pid);
399 if (ret == TRACE_TYPE_PARTIAL_LINE)
400 return TRACE_TYPE_PARTIAL_LINE;
401 ret = trace_seq_printf(s, " | ");
402 if (!ret)
403 return TRACE_TYPE_PARTIAL_LINE;
404 }
263 405
264 /* No overhead */ 406 /* No overhead */
265 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 407 ret = print_graph_overhead(-1, s);
266 ret = trace_seq_printf(s, " "); 408 if (!ret)
267 if (!ret) 409 return TRACE_TYPE_PARTIAL_LINE;
268 return TRACE_TYPE_PARTIAL_LINE; 410
269 } 411 if (type == TRACE_GRAPH_ENT)
412 ret = trace_seq_printf(s, "==========>");
413 else
414 ret = trace_seq_printf(s, "<==========");
415
416 if (!ret)
417 return TRACE_TYPE_PARTIAL_LINE;
418
419 /* Don't close the duration column if haven't one */
420 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
421 trace_seq_printf(s, " |");
422 ret = trace_seq_printf(s, "\n");
270 423
271 ret = trace_seq_printf(s, "<========== |\n");
272 }
273 if (!ret) 424 if (!ret)
274 return TRACE_TYPE_PARTIAL_LINE; 425 return TRACE_TYPE_PARTIAL_LINE;
275 return TRACE_TYPE_HANDLED; 426 return TRACE_TYPE_HANDLED;
@@ -288,7 +439,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
288 sprintf(msecs_str, "%lu", (unsigned long) duration); 439 sprintf(msecs_str, "%lu", (unsigned long) duration);
289 440
290 /* Print msecs */ 441 /* Print msecs */
291 ret = trace_seq_printf(s, msecs_str); 442 ret = trace_seq_printf(s, "%s", msecs_str);
292 if (!ret) 443 if (!ret)
293 return TRACE_TYPE_PARTIAL_LINE; 444 return TRACE_TYPE_PARTIAL_LINE;
294 445
@@ -321,52 +472,47 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
321 472
322} 473}
323 474
324/* Signal a overhead of time execution to the output */
325static int
326print_graph_overhead(unsigned long long duration, struct trace_seq *s)
327{
328 /* Duration exceeded 100 msecs */
329 if (duration > 100000ULL)
330 return trace_seq_printf(s, "! ");
331
332 /* Duration exceeded 10 msecs */
333 if (duration > 10000ULL)
334 return trace_seq_printf(s, "+ ");
335
336 return trace_seq_printf(s, " ");
337}
338
339/* Case of a leaf function on its call entry */ 475/* Case of a leaf function on its call entry */
340static enum print_line_t 476static enum print_line_t
341print_graph_entry_leaf(struct trace_iterator *iter, 477print_graph_entry_leaf(struct trace_iterator *iter,
342 struct ftrace_graph_ent_entry *entry, struct trace_seq *s) 478 struct ftrace_graph_ent_entry *entry,
479 struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s)
343{ 480{
344 struct ftrace_graph_ret_entry *ret_entry; 481 struct fgraph_data *data = iter->private;
345 struct ftrace_graph_ret *graph_ret; 482 struct ftrace_graph_ret *graph_ret;
346 struct ring_buffer_event *event;
347 struct ftrace_graph_ent *call; 483 struct ftrace_graph_ent *call;
348 unsigned long long duration; 484 unsigned long long duration;
349 int ret; 485 int ret;
350 int i; 486 int i;
351 487
352 event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
353 ret_entry = ring_buffer_event_data(event);
354 graph_ret = &ret_entry->ret; 488 graph_ret = &ret_entry->ret;
355 call = &entry->graph_ent; 489 call = &entry->graph_ent;
356 duration = graph_ret->rettime - graph_ret->calltime; 490 duration = graph_ret->rettime - graph_ret->calltime;
357 491
358 /* Overhead */ 492 if (data) {
359 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 493 int cpu = iter->cpu;
360 ret = print_graph_overhead(duration, s); 494 int *depth = &(per_cpu_ptr(data, cpu)->depth);
361 if (!ret) 495
362 return TRACE_TYPE_PARTIAL_LINE; 496 /*
497 * Comments display at + 1 to depth. Since
498 * this is a leaf function, keep the comments
499 * equal to this depth.
500 */
501 *depth = call->depth - 1;
363 } 502 }
364 503
365 /* Duration */ 504 /* Overhead */
366 ret = print_graph_duration(duration, s); 505 ret = print_graph_overhead(duration, s);
367 if (ret == TRACE_TYPE_PARTIAL_LINE) 506 if (!ret)
368 return TRACE_TYPE_PARTIAL_LINE; 507 return TRACE_TYPE_PARTIAL_LINE;
369 508
509 /* Duration */
510 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
511 ret = print_graph_duration(duration, s);
512 if (ret == TRACE_TYPE_PARTIAL_LINE)
513 return TRACE_TYPE_PARTIAL_LINE;
514 }
515
370 /* Function */ 516 /* Function */
371 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 517 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
372 ret = trace_seq_printf(s, " "); 518 ret = trace_seq_printf(s, " ");
@@ -386,33 +532,34 @@ print_graph_entry_leaf(struct trace_iterator *iter,
386} 532}
387 533
388static enum print_line_t 534static enum print_line_t
389print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, 535print_graph_entry_nested(struct trace_iterator *iter,
390 struct trace_seq *s, pid_t pid, int cpu) 536 struct ftrace_graph_ent_entry *entry,
537 struct trace_seq *s, int cpu)
391{ 538{
392 int i;
393 int ret;
394 struct ftrace_graph_ent *call = &entry->graph_ent; 539 struct ftrace_graph_ent *call = &entry->graph_ent;
540 struct fgraph_data *data = iter->private;
541 int ret;
542 int i;
395 543
396 /* No overhead */ 544 if (data) {
397 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 545 int cpu = iter->cpu;
398 ret = trace_seq_printf(s, " "); 546 int *depth = &(per_cpu_ptr(data, cpu)->depth);
399 if (!ret) 547
400 return TRACE_TYPE_PARTIAL_LINE; 548 *depth = call->depth;
401 } 549 }
402 550
403 /* Interrupt */ 551 /* No overhead */
404 ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid); 552 ret = print_graph_overhead(-1, s);
405 if (ret == TRACE_TYPE_UNHANDLED) { 553 if (!ret)
406 /* No time */ 554 return TRACE_TYPE_PARTIAL_LINE;
555
556 /* No time */
557 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
407 ret = trace_seq_printf(s, " | "); 558 ret = trace_seq_printf(s, " | ");
408 if (!ret) 559 if (!ret)
409 return TRACE_TYPE_PARTIAL_LINE; 560 return TRACE_TYPE_PARTIAL_LINE;
410 } else {
411 if (ret == TRACE_TYPE_PARTIAL_LINE)
412 return TRACE_TYPE_PARTIAL_LINE;
413 } 561 }
414 562
415
416 /* Function */ 563 /* Function */
417 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 564 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
418 ret = trace_seq_printf(s, " "); 565 ret = trace_seq_printf(s, " ");
@@ -428,20 +575,40 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
428 if (!ret) 575 if (!ret)
429 return TRACE_TYPE_PARTIAL_LINE; 576 return TRACE_TYPE_PARTIAL_LINE;
430 577
431 return TRACE_TYPE_HANDLED; 578 /*
579 * we already consumed the current entry to check the next one
580 * and see if this is a leaf.
581 */
582 return TRACE_TYPE_NO_CONSUME;
432} 583}
433 584
434static enum print_line_t 585static enum print_line_t
435print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 586print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
436 struct trace_iterator *iter, int cpu) 587 int type, unsigned long addr)
437{ 588{
438 int ret; 589 struct fgraph_data *data = iter->private;
439 struct trace_entry *ent = iter->ent; 590 struct trace_entry *ent = iter->ent;
591 int cpu = iter->cpu;
592 int ret;
440 593
441 /* Pid */ 594 /* Pid */
442 if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) 595 if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE)
443 return TRACE_TYPE_PARTIAL_LINE; 596 return TRACE_TYPE_PARTIAL_LINE;
444 597
598 if (type) {
599 /* Interrupt */
600 ret = print_graph_irq(iter, addr, type, cpu, ent->pid);
601 if (ret == TRACE_TYPE_PARTIAL_LINE)
602 return TRACE_TYPE_PARTIAL_LINE;
603 }
604
605 /* Absolute time */
606 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
607 ret = print_graph_abs_time(iter->ts, s);
608 if (!ret)
609 return TRACE_TYPE_PARTIAL_LINE;
610 }
611
445 /* Cpu */ 612 /* Cpu */
446 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 613 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
447 ret = print_graph_cpu(s, cpu); 614 ret = print_graph_cpu(s, cpu);
@@ -460,54 +627,65 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
460 return TRACE_TYPE_PARTIAL_LINE; 627 return TRACE_TYPE_PARTIAL_LINE;
461 } 628 }
462 629
463 if (trace_branch_is_leaf(iter, field)) 630 return 0;
464 return print_graph_entry_leaf(iter, field, s); 631}
632
633static enum print_line_t
634print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
635 struct trace_iterator *iter)
636{
637 int cpu = iter->cpu;
638 struct ftrace_graph_ent *call = &field->graph_ent;
639 struct ftrace_graph_ret_entry *leaf_ret;
640
641 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
642 return TRACE_TYPE_PARTIAL_LINE;
643
644 leaf_ret = get_return_for_leaf(iter, field);
645 if (leaf_ret)
646 return print_graph_entry_leaf(iter, field, leaf_ret, s);
465 else 647 else
466 return print_graph_entry_nested(field, s, iter->ent->pid, cpu); 648 return print_graph_entry_nested(iter, field, s, cpu);
467 649
468} 650}
469 651
470static enum print_line_t 652static enum print_line_t
471print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, 653print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
472 struct trace_entry *ent, int cpu) 654 struct trace_entry *ent, struct trace_iterator *iter)
473{ 655{
474 int i;
475 int ret;
476 unsigned long long duration = trace->rettime - trace->calltime; 656 unsigned long long duration = trace->rettime - trace->calltime;
657 struct fgraph_data *data = iter->private;
658 pid_t pid = ent->pid;
659 int cpu = iter->cpu;
660 int ret;
661 int i;
477 662
478 /* Pid */ 663 if (data) {
479 if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) 664 int cpu = iter->cpu;
480 return TRACE_TYPE_PARTIAL_LINE; 665 int *depth = &(per_cpu_ptr(data, cpu)->depth);
481 666
482 /* Cpu */ 667 /*
483 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 668 * Comments display at + 1 to depth. This is the
484 ret = print_graph_cpu(s, cpu); 669 * return from a function, we now want the comments
485 if (ret == TRACE_TYPE_PARTIAL_LINE) 670 * to display at the same level of the bracket.
486 return TRACE_TYPE_PARTIAL_LINE; 671 */
672 *depth = trace->depth - 1;
487 } 673 }
488 674
489 /* Proc */ 675 if (print_graph_prologue(iter, s, 0, 0))
490 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 676 return TRACE_TYPE_PARTIAL_LINE;
491 ret = print_graph_proc(s, ent->pid);
492 if (ret == TRACE_TYPE_PARTIAL_LINE)
493 return TRACE_TYPE_PARTIAL_LINE;
494
495 ret = trace_seq_printf(s, " | ");
496 if (!ret)
497 return TRACE_TYPE_PARTIAL_LINE;
498 }
499 677
500 /* Overhead */ 678 /* Overhead */
501 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 679 ret = print_graph_overhead(duration, s);
502 ret = print_graph_overhead(duration, s); 680 if (!ret)
503 if (!ret) 681 return TRACE_TYPE_PARTIAL_LINE;
504 return TRACE_TYPE_PARTIAL_LINE;
505 }
506 682
507 /* Duration */ 683 /* Duration */
508 ret = print_graph_duration(duration, s); 684 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
509 if (ret == TRACE_TYPE_PARTIAL_LINE) 685 ret = print_graph_duration(duration, s);
510 return TRACE_TYPE_PARTIAL_LINE; 686 if (ret == TRACE_TYPE_PARTIAL_LINE)
687 return TRACE_TYPE_PARTIAL_LINE;
688 }
511 689
512 /* Closing brace */ 690 /* Closing brace */
513 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 691 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
@@ -528,7 +706,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
528 return TRACE_TYPE_PARTIAL_LINE; 706 return TRACE_TYPE_PARTIAL_LINE;
529 } 707 }
530 708
531 ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid); 709 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid);
532 if (ret == TRACE_TYPE_PARTIAL_LINE) 710 if (ret == TRACE_TYPE_PARTIAL_LINE)
533 return TRACE_TYPE_PARTIAL_LINE; 711 return TRACE_TYPE_PARTIAL_LINE;
534 712
@@ -536,61 +714,73 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
536} 714}
537 715
538static enum print_line_t 716static enum print_line_t
539print_graph_comment(struct print_entry *trace, struct trace_seq *s, 717print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
540 struct trace_entry *ent, struct trace_iterator *iter) 718 struct trace_iterator *iter)
541{ 719{
542 int i; 720 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
721 struct fgraph_data *data = iter->private;
722 struct trace_event *event;
723 int depth = 0;
543 int ret; 724 int ret;
725 int i;
544 726
545 /* Pid */ 727 if (data)
546 if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE) 728 depth = per_cpu_ptr(data, iter->cpu)->depth;
547 return TRACE_TYPE_PARTIAL_LINE;
548
549 /* Cpu */
550 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
551 ret = print_graph_cpu(s, iter->cpu);
552 if (ret == TRACE_TYPE_PARTIAL_LINE)
553 return TRACE_TYPE_PARTIAL_LINE;
554 }
555
556 /* Proc */
557 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
558 ret = print_graph_proc(s, ent->pid);
559 if (ret == TRACE_TYPE_PARTIAL_LINE)
560 return TRACE_TYPE_PARTIAL_LINE;
561 729
562 ret = trace_seq_printf(s, " | "); 730 if (print_graph_prologue(iter, s, 0, 0))
563 if (!ret) 731 return TRACE_TYPE_PARTIAL_LINE;
564 return TRACE_TYPE_PARTIAL_LINE;
565 }
566 732
567 /* No overhead */ 733 /* No overhead */
568 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 734 ret = print_graph_overhead(-1, s);
569 ret = trace_seq_printf(s, " "); 735 if (!ret)
736 return TRACE_TYPE_PARTIAL_LINE;
737
738 /* No time */
739 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
740 ret = trace_seq_printf(s, " | ");
570 if (!ret) 741 if (!ret)
571 return TRACE_TYPE_PARTIAL_LINE; 742 return TRACE_TYPE_PARTIAL_LINE;
572 } 743 }
573 744
574 /* No time */
575 ret = trace_seq_printf(s, " | ");
576 if (!ret)
577 return TRACE_TYPE_PARTIAL_LINE;
578
579 /* Indentation */ 745 /* Indentation */
580 if (trace->depth > 0) 746 if (depth > 0)
581 for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) { 747 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) {
582 ret = trace_seq_printf(s, " "); 748 ret = trace_seq_printf(s, " ");
583 if (!ret) 749 if (!ret)
584 return TRACE_TYPE_PARTIAL_LINE; 750 return TRACE_TYPE_PARTIAL_LINE;
585 } 751 }
586 752
587 /* The comment */ 753 /* The comment */
588 ret = trace_seq_printf(s, "/* %s", trace->buf); 754 ret = trace_seq_printf(s, "/* ");
589 if (!ret) 755 if (!ret)
590 return TRACE_TYPE_PARTIAL_LINE; 756 return TRACE_TYPE_PARTIAL_LINE;
591 757
592 if (ent->flags & TRACE_FLAG_CONT) 758 switch (iter->ent->type) {
593 trace_seq_print_cont(s, iter); 759 case TRACE_BPRINT:
760 ret = trace_print_bprintk_msg_only(iter);
761 if (ret != TRACE_TYPE_HANDLED)
762 return ret;
763 break;
764 case TRACE_PRINT:
765 ret = trace_print_printk_msg_only(iter);
766 if (ret != TRACE_TYPE_HANDLED)
767 return ret;
768 break;
769 default:
770 event = ftrace_find_event(ent->type);
771 if (!event)
772 return TRACE_TYPE_UNHANDLED;
773
774 ret = event->trace(iter, sym_flags);
775 if (ret != TRACE_TYPE_HANDLED)
776 return ret;
777 }
778
779 /* Strip ending newline */
780 if (s->buffer[s->len - 1] == '\n') {
781 s->buffer[s->len - 1] = '\0';
782 s->len--;
783 }
594 784
595 ret = trace_seq_printf(s, " */\n"); 785 ret = trace_seq_printf(s, " */\n");
596 if (!ret) 786 if (!ret)
@@ -603,62 +793,91 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
603enum print_line_t 793enum print_line_t
604print_graph_function(struct trace_iterator *iter) 794print_graph_function(struct trace_iterator *iter)
605{ 795{
606 struct trace_seq *s = &iter->seq;
607 struct trace_entry *entry = iter->ent; 796 struct trace_entry *entry = iter->ent;
797 struct trace_seq *s = &iter->seq;
608 798
609 switch (entry->type) { 799 switch (entry->type) {
610 case TRACE_GRAPH_ENT: { 800 case TRACE_GRAPH_ENT: {
611 struct ftrace_graph_ent_entry *field; 801 struct ftrace_graph_ent_entry *field;
612 trace_assign_type(field, entry); 802 trace_assign_type(field, entry);
613 return print_graph_entry(field, s, iter, 803 return print_graph_entry(field, s, iter);
614 iter->cpu);
615 } 804 }
616 case TRACE_GRAPH_RET: { 805 case TRACE_GRAPH_RET: {
617 struct ftrace_graph_ret_entry *field; 806 struct ftrace_graph_ret_entry *field;
618 trace_assign_type(field, entry); 807 trace_assign_type(field, entry);
619 return print_graph_return(&field->ret, s, entry, iter->cpu); 808 return print_graph_return(&field->ret, s, entry, iter);
620 }
621 case TRACE_PRINT: {
622 struct print_entry *field;
623 trace_assign_type(field, entry);
624 return print_graph_comment(field, s, entry, iter);
625 } 809 }
626 default: 810 default:
627 return TRACE_TYPE_UNHANDLED; 811 return print_graph_comment(s, entry, iter);
628 } 812 }
813
814 return TRACE_TYPE_HANDLED;
629} 815}
630 816
631static void print_graph_headers(struct seq_file *s) 817static void print_graph_headers(struct seq_file *s)
632{ 818{
633 /* 1st line */ 819 /* 1st line */
634 seq_printf(s, "# "); 820 seq_printf(s, "# ");
821 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
822 seq_printf(s, " TIME ");
635 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 823 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
636 seq_printf(s, "CPU "); 824 seq_printf(s, "CPU");
637 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 825 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
638 seq_printf(s, "TASK/PID "); 826 seq_printf(s, " TASK/PID ");
639 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) 827 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
640 seq_printf(s, "OVERHEAD/"); 828 seq_printf(s, " DURATION ");
641 seq_printf(s, "DURATION FUNCTION CALLS\n"); 829 seq_printf(s, " FUNCTION CALLS\n");
642 830
643 /* 2nd line */ 831 /* 2nd line */
644 seq_printf(s, "# "); 832 seq_printf(s, "# ");
833 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
834 seq_printf(s, " | ");
645 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 835 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
646 seq_printf(s, "| "); 836 seq_printf(s, "| ");
647 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 837 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
648 seq_printf(s, "| | "); 838 seq_printf(s, " | | ");
649 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 839 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
650 seq_printf(s, "| "); 840 seq_printf(s, " | | ");
651 seq_printf(s, "| | | | |\n"); 841 seq_printf(s, " | | | |\n");
652 } else 842}
653 seq_printf(s, " | | | | |\n"); 843
844static void graph_trace_open(struct trace_iterator *iter)
845{
846 /* pid and depth on the last trace processed */
847 struct fgraph_data *data = alloc_percpu(struct fgraph_data);
848 int cpu;
849
850 if (!data)
851 pr_warning("function graph tracer: not enough memory\n");
852 else
853 for_each_possible_cpu(cpu) {
854 pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid);
855 int *depth = &(per_cpu_ptr(data, cpu)->depth);
856 *pid = -1;
857 *depth = 0;
858 }
859
860 iter->private = data;
654} 861}
862
863static void graph_trace_close(struct trace_iterator *iter)
864{
865 free_percpu(iter->private);
866}
867
655static struct tracer graph_trace __read_mostly = { 868static struct tracer graph_trace __read_mostly = {
656 .name = "function_graph", 869 .name = "function_graph",
657 .init = graph_trace_init, 870 .open = graph_trace_open,
658 .reset = graph_trace_reset, 871 .close = graph_trace_close,
872 .wait_pipe = poll_wait_pipe,
873 .init = graph_trace_init,
874 .reset = graph_trace_reset,
659 .print_line = print_graph_function, 875 .print_line = print_graph_function,
660 .print_header = print_graph_headers, 876 .print_header = print_graph_headers,
661 .flags = &tracer_flags, 877 .flags = &tracer_flags,
878#ifdef CONFIG_FTRACE_SELFTEST
879 .selftest = trace_selftest_startup_function_graph,
880#endif
662}; 881};
663 882
664static __init int init_graph_trace(void) 883static __init int init_graph_trace(void)
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 649df22d435f..7bfdf4c2347f 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -1,30 +1,53 @@
1/* 1/*
2 * h/w branch tracer for x86 based on bts 2 * h/w branch tracer for x86 based on bts
3 * 3 *
4 * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com> 4 * Copyright (C) 2008-2009 Intel Corporation.
5 * 5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
6 */ 6 */
7 7#include <linux/spinlock.h>
8#include <linux/module.h> 8#include <linux/kallsyms.h>
9#include <linux/fs.h>
10#include <linux/debugfs.h> 9#include <linux/debugfs.h>
11#include <linux/ftrace.h> 10#include <linux/ftrace.h>
12#include <linux/kallsyms.h> 11#include <linux/module.h>
12#include <linux/cpu.h>
13#include <linux/smp.h>
14#include <linux/fs.h>
13 15
14#include <asm/ds.h> 16#include <asm/ds.h>
15 17
16#include "trace.h" 18#include "trace.h"
19#include "trace_output.h"
17 20
18 21
19#define SIZEOF_BTS (1 << 13) 22#define SIZEOF_BTS (1 << 13)
20 23
24/*
25 * The tracer lock protects the below per-cpu tracer array.
26 * It needs to be held to:
27 * - start tracing on all cpus
28 * - stop tracing on all cpus
29 * - start tracing on a single hotplug cpu
30 * - stop tracing on a single hotplug cpu
31 * - read the trace from all cpus
32 * - read the trace from a single cpu
33 */
34static DEFINE_SPINLOCK(bts_tracer_lock);
21static DEFINE_PER_CPU(struct bts_tracer *, tracer); 35static DEFINE_PER_CPU(struct bts_tracer *, tracer);
22static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); 36static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
23 37
24#define this_tracer per_cpu(tracer, smp_processor_id()) 38#define this_tracer per_cpu(tracer, smp_processor_id())
25#define this_buffer per_cpu(buffer, smp_processor_id()) 39#define this_buffer per_cpu(buffer, smp_processor_id())
26 40
41static int __read_mostly trace_hw_branches_enabled;
42static struct trace_array *hw_branch_trace __read_mostly;
43
27 44
45/*
46 * Start tracing on the current cpu.
47 * The argument is ignored.
48 *
49 * pre: bts_tracer_lock must be locked.
50 */
28static void bts_trace_start_cpu(void *arg) 51static void bts_trace_start_cpu(void *arg)
29{ 52{
30 if (this_tracer) 53 if (this_tracer)
@@ -42,14 +65,20 @@ static void bts_trace_start_cpu(void *arg)
42 65
43static void bts_trace_start(struct trace_array *tr) 66static void bts_trace_start(struct trace_array *tr)
44{ 67{
45 int cpu; 68 spin_lock(&bts_tracer_lock);
46 69
47 tracing_reset_online_cpus(tr); 70 on_each_cpu(bts_trace_start_cpu, NULL, 1);
71 trace_hw_branches_enabled = 1;
48 72
49 for_each_cpu(cpu, cpu_possible_mask) 73 spin_unlock(&bts_tracer_lock);
50 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
51} 74}
52 75
76/*
77 * Stop tracing on the current cpu.
78 * The argument is ignored.
79 *
80 * pre: bts_tracer_lock must be locked.
81 */
53static void bts_trace_stop_cpu(void *arg) 82static void bts_trace_stop_cpu(void *arg)
54{ 83{
55 if (this_tracer) { 84 if (this_tracer) {
@@ -60,26 +89,60 @@ static void bts_trace_stop_cpu(void *arg)
60 89
61static void bts_trace_stop(struct trace_array *tr) 90static void bts_trace_stop(struct trace_array *tr)
62{ 91{
63 int cpu; 92 spin_lock(&bts_tracer_lock);
93
94 trace_hw_branches_enabled = 0;
95 on_each_cpu(bts_trace_stop_cpu, NULL, 1);
96
97 spin_unlock(&bts_tracer_lock);
98}
99
100static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
101 unsigned long action, void *hcpu)
102{
103 unsigned int cpu = (unsigned long)hcpu;
64 104
65 for_each_cpu(cpu, cpu_possible_mask) 105 spin_lock(&bts_tracer_lock);
106
107 if (!trace_hw_branches_enabled)
108 goto out;
109
110 switch (action) {
111 case CPU_ONLINE:
112 case CPU_DOWN_FAILED:
113 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
114 break;
115 case CPU_DOWN_PREPARE:
66 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); 116 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
117 break;
118 }
119
120 out:
121 spin_unlock(&bts_tracer_lock);
122 return NOTIFY_DONE;
67} 123}
68 124
125static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
126 .notifier_call = bts_hotcpu_handler
127};
128
69static int bts_trace_init(struct trace_array *tr) 129static int bts_trace_init(struct trace_array *tr)
70{ 130{
71 tracing_reset_online_cpus(tr); 131 hw_branch_trace = tr;
132
72 bts_trace_start(tr); 133 bts_trace_start(tr);
73 134
74 return 0; 135 return 0;
75} 136}
76 137
138static void bts_trace_reset(struct trace_array *tr)
139{
140 bts_trace_stop(tr);
141}
142
77static void bts_trace_print_header(struct seq_file *m) 143static void bts_trace_print_header(struct seq_file *m)
78{ 144{
79 seq_puts(m, 145 seq_puts(m, "# CPU# TO <- FROM\n");
80 "# CPU# FROM TO FUNCTION\n");
81 seq_puts(m,
82 "# | | | |\n");
83} 146}
84 147
85static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) 148static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
@@ -87,15 +150,15 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
87 struct trace_entry *entry = iter->ent; 150 struct trace_entry *entry = iter->ent;
88 struct trace_seq *seq = &iter->seq; 151 struct trace_seq *seq = &iter->seq;
89 struct hw_branch_entry *it; 152 struct hw_branch_entry *it;
153 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
90 154
91 trace_assign_type(it, entry); 155 trace_assign_type(it, entry);
92 156
93 if (entry->type == TRACE_HW_BRANCHES) { 157 if (entry->type == TRACE_HW_BRANCHES) {
94 if (trace_seq_printf(seq, "%4d ", entry->cpu) && 158 if (trace_seq_printf(seq, "%4d ", iter->cpu) &&
95 trace_seq_printf(seq, "0x%016llx -> 0x%016llx ", 159 seq_print_ip_sym(seq, it->to, symflags) &&
96 it->from, it->to) && 160 trace_seq_printf(seq, "\t <- ") &&
97 (!it->from || 161 seq_print_ip_sym(seq, it->from, symflags) &&
98 seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) &&
99 trace_seq_printf(seq, "\n")) 162 trace_seq_printf(seq, "\n"))
100 return TRACE_TYPE_HANDLED; 163 return TRACE_TYPE_HANDLED;
101 return TRACE_TYPE_PARTIAL_LINE;; 164 return TRACE_TYPE_PARTIAL_LINE;;
@@ -103,26 +166,42 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
103 return TRACE_TYPE_UNHANDLED; 166 return TRACE_TYPE_UNHANDLED;
104} 167}
105 168
106void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) 169void trace_hw_branch(u64 from, u64 to)
107{ 170{
171 struct trace_array *tr = hw_branch_trace;
108 struct ring_buffer_event *event; 172 struct ring_buffer_event *event;
109 struct hw_branch_entry *entry; 173 struct hw_branch_entry *entry;
110 unsigned long irq; 174 unsigned long irq1;
175 int cpu;
111 176
112 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); 177 if (unlikely(!tr))
113 if (!event)
114 return; 178 return;
179
180 if (unlikely(!trace_hw_branches_enabled))
181 return;
182
183 local_irq_save(irq1);
184 cpu = raw_smp_processor_id();
185 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
186 goto out;
187
188 event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES,
189 sizeof(*entry), 0, 0);
190 if (!event)
191 goto out;
115 entry = ring_buffer_event_data(event); 192 entry = ring_buffer_event_data(event);
116 tracing_generic_entry_update(&entry->ent, 0, from); 193 tracing_generic_entry_update(&entry->ent, 0, from);
117 entry->ent.type = TRACE_HW_BRANCHES; 194 entry->ent.type = TRACE_HW_BRANCHES;
118 entry->ent.cpu = smp_processor_id();
119 entry->from = from; 195 entry->from = from;
120 entry->to = to; 196 entry->to = to;
121 ring_buffer_unlock_commit(tr->buffer, event, irq); 197 trace_buffer_unlock_commit(tr, event, 0, 0);
198
199 out:
200 atomic_dec(&tr->data[cpu]->disabled);
201 local_irq_restore(irq1);
122} 202}
123 203
124static void trace_bts_at(struct trace_array *tr, 204static void trace_bts_at(const struct bts_trace *trace, void *at)
125 const struct bts_trace *trace, void *at)
126{ 205{
127 struct bts_struct bts; 206 struct bts_struct bts;
128 int err = 0; 207 int err = 0;
@@ -137,18 +216,29 @@ static void trace_bts_at(struct trace_array *tr,
137 216
138 switch (bts.qualifier) { 217 switch (bts.qualifier) {
139 case BTS_BRANCH: 218 case BTS_BRANCH:
140 trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to); 219 trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
141 break; 220 break;
142 } 221 }
143} 222}
144 223
224/*
225 * Collect the trace on the current cpu and write it into the ftrace buffer.
226 *
227 * pre: bts_tracer_lock must be locked
228 */
145static void trace_bts_cpu(void *arg) 229static void trace_bts_cpu(void *arg)
146{ 230{
147 struct trace_array *tr = (struct trace_array *) arg; 231 struct trace_array *tr = (struct trace_array *) arg;
148 const struct bts_trace *trace; 232 const struct bts_trace *trace;
149 unsigned char *at; 233 unsigned char *at;
150 234
151 if (!this_tracer) 235 if (unlikely(!tr))
236 return;
237
238 if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
239 return;
240
241 if (unlikely(!this_tracer))
152 return; 242 return;
153 243
154 ds_suspend_bts(this_tracer); 244 ds_suspend_bts(this_tracer);
@@ -158,11 +248,11 @@ static void trace_bts_cpu(void *arg)
158 248
159 for (at = trace->ds.top; (void *)at < trace->ds.end; 249 for (at = trace->ds.top; (void *)at < trace->ds.end;
160 at += trace->ds.size) 250 at += trace->ds.size)
161 trace_bts_at(tr, trace, at); 251 trace_bts_at(trace, at);
162 252
163 for (at = trace->ds.begin; (void *)at < trace->ds.top; 253 for (at = trace->ds.begin; (void *)at < trace->ds.top;
164 at += trace->ds.size) 254 at += trace->ds.size)
165 trace_bts_at(tr, trace, at); 255 trace_bts_at(trace, at);
166 256
167out: 257out:
168 ds_resume_bts(this_tracer); 258 ds_resume_bts(this_tracer);
@@ -170,26 +260,43 @@ out:
170 260
171static void trace_bts_prepare(struct trace_iterator *iter) 261static void trace_bts_prepare(struct trace_iterator *iter)
172{ 262{
173 int cpu; 263 spin_lock(&bts_tracer_lock);
264
265 on_each_cpu(trace_bts_cpu, iter->tr, 1);
266
267 spin_unlock(&bts_tracer_lock);
268}
269
270static void trace_bts_close(struct trace_iterator *iter)
271{
272 tracing_reset_online_cpus(iter->tr);
273}
274
275void trace_hw_branch_oops(void)
276{
277 spin_lock(&bts_tracer_lock);
278
279 trace_bts_cpu(hw_branch_trace);
174 280
175 for_each_cpu(cpu, cpu_possible_mask) 281 spin_unlock(&bts_tracer_lock);
176 smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
177} 282}
178 283
179struct tracer bts_tracer __read_mostly = 284struct tracer bts_tracer __read_mostly =
180{ 285{
181 .name = "hw-branch-tracer", 286 .name = "hw-branch-tracer",
182 .init = bts_trace_init, 287 .init = bts_trace_init,
183 .reset = bts_trace_stop, 288 .reset = bts_trace_reset,
184 .print_header = bts_trace_print_header, 289 .print_header = bts_trace_print_header,
185 .print_line = bts_trace_print_line, 290 .print_line = bts_trace_print_line,
186 .start = bts_trace_start, 291 .start = bts_trace_start,
187 .stop = bts_trace_stop, 292 .stop = bts_trace_stop,
188 .open = trace_bts_prepare 293 .open = trace_bts_prepare,
294 .close = trace_bts_close
189}; 295};
190 296
191__init static int init_bts_trace(void) 297__init static int init_bts_trace(void)
192{ 298{
299 register_hotcpu_notifier(&bts_hotcpu_notifier);
193 return register_tracer(&bts_tracer); 300 return register_tracer(&bts_tracer);
194} 301}
195device_initcall(init_bts_trace); 302device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7c2e326bbc8b..b923d13e2fad 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * trace irqs off criticall timings 2 * trace irqs off critical timings
3 * 3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> 5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
@@ -32,6 +32,8 @@ enum {
32 32
33static int trace_type __read_mostly; 33static int trace_type __read_mostly;
34 34
35static int save_lat_flag;
36
35#ifdef CONFIG_PREEMPT_TRACER 37#ifdef CONFIG_PREEMPT_TRACER
36static inline int 38static inline int
37preempt_trace(void) 39preempt_trace(void)
@@ -95,7 +97,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
95 disabled = atomic_inc_return(&data->disabled); 97 disabled = atomic_inc_return(&data->disabled);
96 98
97 if (likely(disabled == 1)) 99 if (likely(disabled == 1))
98 trace_function(tr, data, ip, parent_ip, flags, preempt_count()); 100 trace_function(tr, ip, parent_ip, flags, preempt_count());
99 101
100 atomic_dec(&data->disabled); 102 atomic_dec(&data->disabled);
101} 103}
@@ -153,7 +155,7 @@ check_critical_timing(struct trace_array *tr,
153 if (!report_latency(delta)) 155 if (!report_latency(delta))
154 goto out_unlock; 156 goto out_unlock;
155 157
156 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); 158 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
157 159
158 latency = nsecs_to_usecs(delta); 160 latency = nsecs_to_usecs(delta);
159 161
@@ -177,7 +179,7 @@ out:
177 data->critical_sequence = max_sequence; 179 data->critical_sequence = max_sequence;
178 data->preempt_timestamp = ftrace_now(cpu); 180 data->preempt_timestamp = ftrace_now(cpu);
179 tracing_reset(tr, cpu); 181 tracing_reset(tr, cpu);
180 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); 182 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
181} 183}
182 184
183static inline void 185static inline void
@@ -210,7 +212,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
210 212
211 local_save_flags(flags); 213 local_save_flags(flags);
212 214
213 trace_function(tr, data, ip, parent_ip, flags, preempt_count()); 215 trace_function(tr, ip, parent_ip, flags, preempt_count());
214 216
215 per_cpu(tracing_cpu, cpu) = 1; 217 per_cpu(tracing_cpu, cpu) = 1;
216 218
@@ -244,7 +246,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
244 atomic_inc(&data->disabled); 246 atomic_inc(&data->disabled);
245 247
246 local_save_flags(flags); 248 local_save_flags(flags);
247 trace_function(tr, data, ip, parent_ip, flags, preempt_count()); 249 trace_function(tr, ip, parent_ip, flags, preempt_count());
248 check_critical_timing(tr, data, parent_ip ? : ip, cpu); 250 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
249 data->critical_start = 0; 251 data->critical_start = 0;
250 atomic_dec(&data->disabled); 252 atomic_dec(&data->disabled);
@@ -353,33 +355,27 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
353} 355}
354#endif /* CONFIG_PREEMPT_TRACER */ 356#endif /* CONFIG_PREEMPT_TRACER */
355 357
356/*
357 * save_tracer_enabled is used to save the state of the tracer_enabled
358 * variable when we disable it when we open a trace output file.
359 */
360static int save_tracer_enabled;
361
362static void start_irqsoff_tracer(struct trace_array *tr) 358static void start_irqsoff_tracer(struct trace_array *tr)
363{ 359{
364 register_ftrace_function(&trace_ops); 360 register_ftrace_function(&trace_ops);
365 if (tracing_is_enabled()) { 361 if (tracing_is_enabled())
366 tracer_enabled = 1; 362 tracer_enabled = 1;
367 save_tracer_enabled = 1; 363 else
368 } else {
369 tracer_enabled = 0; 364 tracer_enabled = 0;
370 save_tracer_enabled = 0;
371 }
372} 365}
373 366
374static void stop_irqsoff_tracer(struct trace_array *tr) 367static void stop_irqsoff_tracer(struct trace_array *tr)
375{ 368{
376 tracer_enabled = 0; 369 tracer_enabled = 0;
377 save_tracer_enabled = 0;
378 unregister_ftrace_function(&trace_ops); 370 unregister_ftrace_function(&trace_ops);
379} 371}
380 372
381static void __irqsoff_tracer_init(struct trace_array *tr) 373static void __irqsoff_tracer_init(struct trace_array *tr)
382{ 374{
375 save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
376 trace_flags |= TRACE_ITER_LATENCY_FMT;
377
378 tracing_max_latency = 0;
383 irqsoff_trace = tr; 379 irqsoff_trace = tr;
384 /* make sure that the tracer is visible */ 380 /* make sure that the tracer is visible */
385 smp_wmb(); 381 smp_wmb();
@@ -389,30 +385,19 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
389static void irqsoff_tracer_reset(struct trace_array *tr) 385static void irqsoff_tracer_reset(struct trace_array *tr)
390{ 386{
391 stop_irqsoff_tracer(tr); 387 stop_irqsoff_tracer(tr);
388
389 if (!save_lat_flag)
390 trace_flags &= ~TRACE_ITER_LATENCY_FMT;
392} 391}
393 392
394static void irqsoff_tracer_start(struct trace_array *tr) 393static void irqsoff_tracer_start(struct trace_array *tr)
395{ 394{
396 tracer_enabled = 1; 395 tracer_enabled = 1;
397 save_tracer_enabled = 1;
398} 396}
399 397
400static void irqsoff_tracer_stop(struct trace_array *tr) 398static void irqsoff_tracer_stop(struct trace_array *tr)
401{ 399{
402 tracer_enabled = 0; 400 tracer_enabled = 0;
403 save_tracer_enabled = 0;
404}
405
406static void irqsoff_tracer_open(struct trace_iterator *iter)
407{
408 /* stop the trace while dumping */
409 tracer_enabled = 0;
410}
411
412static void irqsoff_tracer_close(struct trace_iterator *iter)
413{
414 /* restart tracing */
415 tracer_enabled = save_tracer_enabled;
416} 401}
417 402
418#ifdef CONFIG_IRQSOFF_TRACER 403#ifdef CONFIG_IRQSOFF_TRACER
@@ -430,8 +415,6 @@ static struct tracer irqsoff_tracer __read_mostly =
430 .reset = irqsoff_tracer_reset, 415 .reset = irqsoff_tracer_reset,
431 .start = irqsoff_tracer_start, 416 .start = irqsoff_tracer_start,
432 .stop = irqsoff_tracer_stop, 417 .stop = irqsoff_tracer_stop,
433 .open = irqsoff_tracer_open,
434 .close = irqsoff_tracer_close,
435 .print_max = 1, 418 .print_max = 1,
436#ifdef CONFIG_FTRACE_SELFTEST 419#ifdef CONFIG_FTRACE_SELFTEST
437 .selftest = trace_selftest_startup_irqsoff, 420 .selftest = trace_selftest_startup_irqsoff,
@@ -458,8 +441,6 @@ static struct tracer preemptoff_tracer __read_mostly =
458 .reset = irqsoff_tracer_reset, 441 .reset = irqsoff_tracer_reset,
459 .start = irqsoff_tracer_start, 442 .start = irqsoff_tracer_start,
460 .stop = irqsoff_tracer_stop, 443 .stop = irqsoff_tracer_stop,
461 .open = irqsoff_tracer_open,
462 .close = irqsoff_tracer_close,
463 .print_max = 1, 444 .print_max = 1,
464#ifdef CONFIG_FTRACE_SELFTEST 445#ifdef CONFIG_FTRACE_SELFTEST
465 .selftest = trace_selftest_startup_preemptoff, 446 .selftest = trace_selftest_startup_preemptoff,
@@ -488,8 +469,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
488 .reset = irqsoff_tracer_reset, 469 .reset = irqsoff_tracer_reset,
489 .start = irqsoff_tracer_start, 470 .start = irqsoff_tracer_start,
490 .stop = irqsoff_tracer_stop, 471 .stop = irqsoff_tracer_stop,
491 .open = irqsoff_tracer_open,
492 .close = irqsoff_tracer_close,
493 .print_max = 1, 472 .print_max = 1,
494#ifdef CONFIG_FTRACE_SELFTEST 473#ifdef CONFIG_FTRACE_SELFTEST
495 .selftest = trace_selftest_startup_preemptirqsoff, 474 .selftest = trace_selftest_startup_preemptirqsoff,
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fffcb069f1dc..8e37fcddd8b4 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,8 +9,10 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/mmiotrace.h> 10#include <linux/mmiotrace.h>
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <asm/atomic.h>
12 13
13#include "trace.h" 14#include "trace.h"
15#include "trace_output.h"
14 16
15struct header_iter { 17struct header_iter {
16 struct pci_dev *dev; 18 struct pci_dev *dev;
@@ -19,6 +21,7 @@ struct header_iter {
19static struct trace_array *mmio_trace_array; 21static struct trace_array *mmio_trace_array;
20static bool overrun_detected; 22static bool overrun_detected;
21static unsigned long prev_overruns; 23static unsigned long prev_overruns;
24static atomic_t dropped_count;
22 25
23static void mmio_reset_data(struct trace_array *tr) 26static void mmio_reset_data(struct trace_array *tr)
24{ 27{
@@ -121,11 +124,11 @@ static void mmio_close(struct trace_iterator *iter)
121 124
122static unsigned long count_overruns(struct trace_iterator *iter) 125static unsigned long count_overruns(struct trace_iterator *iter)
123{ 126{
124 unsigned long cnt = 0; 127 unsigned long cnt = atomic_xchg(&dropped_count, 0);
125 unsigned long over = ring_buffer_overruns(iter->tr->buffer); 128 unsigned long over = ring_buffer_overruns(iter->tr->buffer);
126 129
127 if (over > prev_overruns) 130 if (over > prev_overruns)
128 cnt = over - prev_overruns; 131 cnt += over - prev_overruns;
129 prev_overruns = over; 132 prev_overruns = over;
130 return cnt; 133 return cnt;
131} 134}
@@ -181,21 +184,22 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
181 switch (rw->opcode) { 184 switch (rw->opcode) {
182 case MMIO_READ: 185 case MMIO_READ:
183 ret = trace_seq_printf(s, 186 ret = trace_seq_printf(s,
184 "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 187 "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
185 rw->width, secs, usec_rem, rw->map_id, 188 rw->width, secs, usec_rem, rw->map_id,
186 (unsigned long long)rw->phys, 189 (unsigned long long)rw->phys,
187 rw->value, rw->pc, 0); 190 rw->value, rw->pc, 0);
188 break; 191 break;
189 case MMIO_WRITE: 192 case MMIO_WRITE:
190 ret = trace_seq_printf(s, 193 ret = trace_seq_printf(s,
191 "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 194 "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
192 rw->width, secs, usec_rem, rw->map_id, 195 rw->width, secs, usec_rem, rw->map_id,
193 (unsigned long long)rw->phys, 196 (unsigned long long)rw->phys,
194 rw->value, rw->pc, 0); 197 rw->value, rw->pc, 0);
195 break; 198 break;
196 case MMIO_UNKNOWN_OP: 199 case MMIO_UNKNOWN_OP:
197 ret = trace_seq_printf(s, 200 ret = trace_seq_printf(s,
198 "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n", 201 "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
202 "%02lx 0x%lx %d\n",
199 secs, usec_rem, rw->map_id, 203 secs, usec_rem, rw->map_id,
200 (unsigned long long)rw->phys, 204 (unsigned long long)rw->phys,
201 (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, 205 (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
@@ -227,14 +231,14 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
227 switch (m->opcode) { 231 switch (m->opcode) {
228 case MMIO_PROBE: 232 case MMIO_PROBE:
229 ret = trace_seq_printf(s, 233 ret = trace_seq_printf(s,
230 "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", 234 "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
231 secs, usec_rem, m->map_id, 235 secs, usec_rem, m->map_id,
232 (unsigned long long)m->phys, m->virt, m->len, 236 (unsigned long long)m->phys, m->virt, m->len,
233 0UL, 0); 237 0UL, 0);
234 break; 238 break;
235 case MMIO_UNPROBE: 239 case MMIO_UNPROBE:
236 ret = trace_seq_printf(s, 240 ret = trace_seq_printf(s,
237 "UNMAP %lu.%06lu %d 0x%lx %d\n", 241 "UNMAP %u.%06lu %d 0x%lx %d\n",
238 secs, usec_rem, m->map_id, 0UL, 0); 242 secs, usec_rem, m->map_id, 0UL, 0);
239 break; 243 break;
240 default: 244 default:
@@ -253,18 +257,15 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
253 const char *msg = print->buf; 257 const char *msg = print->buf;
254 struct trace_seq *s = &iter->seq; 258 struct trace_seq *s = &iter->seq;
255 unsigned long long t = ns2usecs(iter->ts); 259 unsigned long long t = ns2usecs(iter->ts);
256 unsigned long usec_rem = do_div(t, 1000000ULL); 260 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
257 unsigned secs = (unsigned long)t; 261 unsigned secs = (unsigned long)t;
258 int ret; 262 int ret;
259 263
260 /* The trailing newline must be in the message. */ 264 /* The trailing newline must be in the message. */
261 ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg); 265 ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
262 if (!ret) 266 if (!ret)
263 return TRACE_TYPE_PARTIAL_LINE; 267 return TRACE_TYPE_PARTIAL_LINE;
264 268
265 if (entry->flags & TRACE_FLAG_CONT)
266 trace_seq_print_cont(s, iter);
267
268 return TRACE_TYPE_HANDLED; 269 return TRACE_TYPE_HANDLED;
269} 270}
270 271
@@ -306,19 +307,17 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
306{ 307{
307 struct ring_buffer_event *event; 308 struct ring_buffer_event *event;
308 struct trace_mmiotrace_rw *entry; 309 struct trace_mmiotrace_rw *entry;
309 unsigned long irq_flags; 310 int pc = preempt_count();
310 311
311 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 312 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW,
312 &irq_flags); 313 sizeof(*entry), 0, pc);
313 if (!event) 314 if (!event) {
315 atomic_inc(&dropped_count);
314 return; 316 return;
317 }
315 entry = ring_buffer_event_data(event); 318 entry = ring_buffer_event_data(event);
316 tracing_generic_entry_update(&entry->ent, 0, preempt_count());
317 entry->ent.type = TRACE_MMIO_RW;
318 entry->rw = *rw; 319 entry->rw = *rw;
319 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 320 trace_buffer_unlock_commit(tr, event, 0, pc);
320
321 trace_wake_up();
322} 321}
323 322
324void mmio_trace_rw(struct mmiotrace_rw *rw) 323void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -334,19 +333,17 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
334{ 333{
335 struct ring_buffer_event *event; 334 struct ring_buffer_event *event;
336 struct trace_mmiotrace_map *entry; 335 struct trace_mmiotrace_map *entry;
337 unsigned long irq_flags; 336 int pc = preempt_count();
338 337
339 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 338 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP,
340 &irq_flags); 339 sizeof(*entry), 0, pc);
341 if (!event) 340 if (!event) {
341 atomic_inc(&dropped_count);
342 return; 342 return;
343 }
343 entry = ring_buffer_event_data(event); 344 entry = ring_buffer_event_data(event);
344 tracing_generic_entry_update(&entry->ent, 0, preempt_count());
345 entry->ent.type = TRACE_MMIO_MAP;
346 entry->map = *map; 345 entry->map = *map;
347 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 346 trace_buffer_unlock_commit(tr, event, 0, pc);
348
349 trace_wake_up();
350} 347}
351 348
352void mmio_trace_mapping(struct mmiotrace_map *map) 349void mmio_trace_mapping(struct mmiotrace_map *map)
@@ -362,5 +359,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
362 359
363int mmio_trace_printk(const char *fmt, va_list args) 360int mmio_trace_printk(const char *fmt, va_list args)
364{ 361{
365 return trace_vprintk(0, -1, fmt, args); 362 return trace_vprintk(0, fmt, args);
366} 363}
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index b9767acd30ac..394f94417e2f 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -47,12 +47,7 @@ static void stop_nop_trace(struct trace_array *tr)
47 47
48static int nop_trace_init(struct trace_array *tr) 48static int nop_trace_init(struct trace_array *tr)
49{ 49{
50 int cpu;
51 ctx_trace = tr; 50 ctx_trace = tr;
52
53 for_each_online_cpu(cpu)
54 tracing_reset(tr, cpu);
55
56 start_nop_trace(tr); 51 start_nop_trace(tr);
57 return 0; 52 return 0;
58} 53}
@@ -96,6 +91,7 @@ struct tracer nop_trace __read_mostly =
96 .name = "nop", 91 .name = "nop",
97 .init = nop_trace_init, 92 .init = nop_trace_init,
98 .reset = nop_trace_reset, 93 .reset = nop_trace_reset,
94 .wait_pipe = poll_wait_pipe,
99#ifdef CONFIG_FTRACE_SELFTEST 95#ifdef CONFIG_FTRACE_SELFTEST
100 .selftest = trace_selftest_startup_nop, 96 .selftest = trace_selftest_startup_nop,
101#endif 97#endif
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
new file mode 100644
index 000000000000..64b54a59c55b
--- /dev/null
+++ b/kernel/trace/trace_output.c
@@ -0,0 +1,1017 @@
1/*
2 * trace_output.c
3 *
4 * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
5 *
6 */
7
8#include <linux/module.h>
9#include <linux/mutex.h>
10#include <linux/ftrace.h>
11
12#include "trace_output.h"
13
14/* must be a power of 2 */
15#define EVENT_HASHSIZE 128
16
17static DEFINE_MUTEX(trace_event_mutex);
18static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
19
20static int next_event_type = __TRACE_LAST_TYPE + 1;
21
22enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
23{
24 struct trace_seq *s = &iter->seq;
25 struct trace_entry *entry = iter->ent;
26 struct bprint_entry *field;
27 int ret;
28
29 trace_assign_type(field, entry);
30
31 ret = trace_seq_bprintf(s, field->fmt, field->buf);
32 if (!ret)
33 return TRACE_TYPE_PARTIAL_LINE;
34
35 return TRACE_TYPE_HANDLED;
36}
37
38enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
39{
40 struct trace_seq *s = &iter->seq;
41 struct trace_entry *entry = iter->ent;
42 struct print_entry *field;
43 int ret;
44
45 trace_assign_type(field, entry);
46
47 ret = trace_seq_printf(s, "%s", field->buf);
48 if (!ret)
49 return TRACE_TYPE_PARTIAL_LINE;
50
51 return TRACE_TYPE_HANDLED;
52}
53
54/**
55 * trace_seq_printf - sequence printing of trace information
56 * @s: trace sequence descriptor
57 * @fmt: printf format string
58 *
59 * The tracer may use either sequence operations or its own
60 * copy to user routines. To simplify formating of a trace
61 * trace_seq_printf is used to store strings into a special
62 * buffer (@s). Then the output may be either used by
63 * the sequencer or pulled into another buffer.
64 */
65int
66trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
67{
68 int len = (PAGE_SIZE - 1) - s->len;
69 va_list ap;
70 int ret;
71
72 if (!len)
73 return 0;
74
75 va_start(ap, fmt);
76 ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
77 va_end(ap);
78
79 /* If we can't write it all, don't bother writing anything */
80 if (ret >= len)
81 return 0;
82
83 s->len += ret;
84
85 return len;
86}
87
88int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
89{
90 int len = (PAGE_SIZE - 1) - s->len;
91 int ret;
92
93 if (!len)
94 return 0;
95
96 ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
97
98 /* If we can't write it all, don't bother writing anything */
99 if (ret >= len)
100 return 0;
101
102 s->len += ret;
103
104 return len;
105}
106
107/**
108 * trace_seq_puts - trace sequence printing of simple string
109 * @s: trace sequence descriptor
110 * @str: simple string to record
111 *
112 * The tracer may use either the sequence operations or its own
113 * copy to user routines. This function records a simple string
114 * into a special buffer (@s) for later retrieval by a sequencer
115 * or other mechanism.
116 */
117int trace_seq_puts(struct trace_seq *s, const char *str)
118{
119 int len = strlen(str);
120
121 if (len > ((PAGE_SIZE - 1) - s->len))
122 return 0;
123
124 memcpy(s->buffer + s->len, str, len);
125 s->len += len;
126
127 return len;
128}
129
130int trace_seq_putc(struct trace_seq *s, unsigned char c)
131{
132 if (s->len >= (PAGE_SIZE - 1))
133 return 0;
134
135 s->buffer[s->len++] = c;
136
137 return 1;
138}
139
140int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
141{
142 if (len > ((PAGE_SIZE - 1) - s->len))
143 return 0;
144
145 memcpy(s->buffer + s->len, mem, len);
146 s->len += len;
147
148 return len;
149}
150
151int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
152{
153 unsigned char hex[HEX_CHARS];
154 const unsigned char *data = mem;
155 int i, j;
156
157#ifdef __BIG_ENDIAN
158 for (i = 0, j = 0; i < len; i++) {
159#else
160 for (i = len-1, j = 0; i >= 0; i--) {
161#endif
162 hex[j++] = hex_asc_hi(data[i]);
163 hex[j++] = hex_asc_lo(data[i]);
164 }
165 hex[j++] = ' ';
166
167 return trace_seq_putmem(s, hex, j);
168}
169
170void *trace_seq_reserve(struct trace_seq *s, size_t len)
171{
172 void *ret;
173
174 if (len > ((PAGE_SIZE - 1) - s->len))
175 return NULL;
176
177 ret = s->buffer + s->len;
178 s->len += len;
179
180 return ret;
181}
182
183int trace_seq_path(struct trace_seq *s, struct path *path)
184{
185 unsigned char *p;
186
187 if (s->len >= (PAGE_SIZE - 1))
188 return 0;
189 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
190 if (!IS_ERR(p)) {
191 p = mangle_path(s->buffer + s->len, p, "\n");
192 if (p) {
193 s->len = p - s->buffer;
194 return 1;
195 }
196 } else {
197 s->buffer[s->len++] = '?';
198 return 1;
199 }
200
201 return 0;
202}
203
/*
 * With CONFIG_KRETPROBES, replace the kretprobe trampoline's symbol
 * name by a marker string; every other name (and every name when
 * kretprobes are not configured) is returned unchanged.
 */
static inline const char *kretprobed(const char *name)
{
#ifdef CONFIG_KRETPROBES
	static const char tramp_name[] = "kretprobe_trampoline";

	/* sizeof() covers the terminating NUL, so only an exact match hits */
	if (!strncmp(tramp_name, name, sizeof(tramp_name)))
		return "[unknown/kretprobe'd]";
#endif /* CONFIG_KRETPROBES */
	return name;
}
220
/*
 * Print the symbol name covering @address through @fmt (which is
 * expected to consume one string argument).  Without CONFIG_KALLSYMS
 * nothing is printed and success (1) is reported.
 */
static int
seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
	char sym[KSYM_SYMBOL_LEN];

	kallsyms_lookup(address, NULL, NULL, NULL, sym);

	return trace_seq_printf(s, fmt, kretprobed(sym));
#else
	return 1;
#endif
}
236
/*
 * Like seq_print_sym_short(), but the text comes from sprint_symbol().
 * Without CONFIG_KALLSYMS nothing is printed and success (1) is
 * reported.
 */
static int
seq_print_sym_offset(struct trace_seq *s, const char *fmt,
		     unsigned long address)
{
#ifdef CONFIG_KALLSYMS
	char sym[KSYM_SYMBOL_LEN];

	sprint_symbol(sym, address);

	return trace_seq_printf(s, fmt, kretprobed(sym));
#else
	return 1;
#endif
}
252
/* printf format for an instruction pointer, zero-padded to word width */
#ifdef CONFIG_64BIT
# define IP_FMT "%016lx"
#else
# define IP_FMT "%08lx"
#endif
258
259int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
260 unsigned long ip, unsigned long sym_flags)
261{
262 struct file *file = NULL;
263 unsigned long vmstart = 0;
264 int ret = 1;
265
266 if (mm) {
267 const struct vm_area_struct *vma;
268
269 down_read(&mm->mmap_sem);
270 vma = find_vma(mm, ip);
271 if (vma) {
272 file = vma->vm_file;
273 vmstart = vma->vm_start;
274 }
275 if (file) {
276 ret = trace_seq_path(s, &file->f_path);
277 if (ret)
278 ret = trace_seq_printf(s, "[+0x%lx]",
279 ip - vmstart);
280 }
281 up_read(&mm->mmap_sem);
282 }
283 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
284 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
285 return ret;
286}
287
288int
289seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
290 unsigned long sym_flags)
291{
292 struct mm_struct *mm = NULL;
293 int ret = 1;
294 unsigned int i;
295
296 if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
297 struct task_struct *task;
298 /*
299 * we do the lookup on the thread group leader,
300 * since individual threads might have already quit!
301 */
302 rcu_read_lock();
303 task = find_task_by_vpid(entry->ent.tgid);
304 if (task)
305 mm = get_task_mm(task);
306 rcu_read_unlock();
307 }
308
309 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
310 unsigned long ip = entry->caller[i];
311
312 if (ip == ULONG_MAX || !ret)
313 break;
314 if (i && ret)
315 ret = trace_seq_puts(s, " <- ");
316 if (!ip) {
317 if (ret)
318 ret = trace_seq_puts(s, "??");
319 continue;
320 }
321 if (!ret)
322 break;
323 if (ret)
324 ret = seq_print_user_ip(s, mm, ip, sym_flags);
325 }
326
327 if (mm)
328 mmput(mm);
329 return ret;
330}
331
332int
333seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
334{
335 int ret;
336
337 if (!ip)
338 return trace_seq_printf(s, "0");
339
340 if (sym_flags & TRACE_ITER_SYM_OFFSET)
341 ret = seq_print_sym_offset(s, "%s", ip);
342 else
343 ret = seq_print_sym_short(s, "%s", ip);
344
345 if (!ret)
346 return 0;
347
348 if (sym_flags & TRACE_ITER_SYM_ADDR)
349 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
350 return ret;
351}
352
353static int
354lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
355{
356 int hardirq, softirq;
357 char comm[TASK_COMM_LEN];
358
359 trace_find_cmdline(entry->pid, comm);
360 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
361 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
362
363 if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c",
364 comm, entry->pid, cpu,
365 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
366 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
367 'X' : '.',
368 (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
369 'N' : '.',
370 (hardirq && softirq) ? 'H' :
371 hardirq ? 'h' : softirq ? 's' : '.'))
372 return 0;
373
374 if (entry->preempt_count)
375 return trace_seq_printf(s, "%x", entry->preempt_count);
376 return trace_seq_puts(s, ".");
377}
378
379static unsigned long preempt_mark_thresh = 100;
380
381static int
382lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
383 unsigned long rel_usecs)
384{
385 return trace_seq_printf(s, " %4lldus%c: ", abs_usecs,
386 rel_usecs > preempt_mark_thresh ? '!' :
387 rel_usecs > 1 ? '+' : ' ');
388}
389
390int trace_print_context(struct trace_iterator *iter)
391{
392 struct trace_seq *s = &iter->seq;
393 struct trace_entry *entry = iter->ent;
394 unsigned long long t = ns2usecs(iter->ts);
395 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
396 unsigned long secs = (unsigned long)t;
397 char comm[TASK_COMM_LEN];
398
399 trace_find_cmdline(entry->pid, comm);
400
401 return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ",
402 comm, entry->pid, iter->cpu, secs, usec_rem);
403}
404
405int trace_print_lat_context(struct trace_iterator *iter)
406{
407 u64 next_ts;
408 int ret;
409 struct trace_seq *s = &iter->seq;
410 struct trace_entry *entry = iter->ent,
411 *next_entry = trace_find_next_entry(iter, NULL,
412 &next_ts);
413 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
414 unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
415 unsigned long rel_usecs;
416
417 if (!next_entry)
418 next_ts = iter->ts;
419 rel_usecs = ns2usecs(next_ts - iter->ts);
420
421 if (verbose) {
422 char comm[TASK_COMM_LEN];
423
424 trace_find_cmdline(entry->pid, comm);
425
426 ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]"
427 " %ld.%03ldms (+%ld.%03ldms): ", comm,
428 entry->pid, iter->cpu, entry->flags,
429 entry->preempt_count, iter->idx,
430 ns2usecs(iter->ts),
431 abs_usecs / USEC_PER_MSEC,
432 abs_usecs % USEC_PER_MSEC,
433 rel_usecs / USEC_PER_MSEC,
434 rel_usecs % USEC_PER_MSEC);
435 } else {
436 ret = lat_print_generic(s, entry, iter->cpu);
437 if (ret)
438 ret = lat_print_timestamp(s, abs_usecs, rel_usecs);
439 }
440
441 return ret;
442}
443
444static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
445
446static int task_state_char(unsigned long state)
447{
448 int bit = state ? __ffs(state) + 1 : 0;
449
450 return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
451}
452
453/**
454 * ftrace_find_event - find a registered event
455 * @type: the type of event to look for
456 *
457 * Returns an event of type @type otherwise NULL
458 */
459struct trace_event *ftrace_find_event(int type)
460{
461 struct trace_event *event;
462 struct hlist_node *n;
463 unsigned key;
464
465 key = type & (EVENT_HASHSIZE - 1);
466
467 hlist_for_each_entry_rcu(event, n, &event_hash[key], node) {
468 if (event->type == type)
469 return event;
470 }
471
472 return NULL;
473}
474
475/**
476 * register_ftrace_event - register output for an event type
477 * @event: the event type to register
478 *
479 * Event types are stored in a hash and this hash is used to
480 * find a way to print an event. If the @event->type is set
481 * then it will use that type, otherwise it will assign a
482 * type to use.
483 *
484 * If you assign your own type, please make sure it is added
485 * to the trace_type enum in trace.h, to avoid collisions
486 * with the dynamic types.
487 *
488 * Returns the event type number or zero on error.
489 */
490int register_ftrace_event(struct trace_event *event)
491{
492 unsigned key;
493 int ret = 0;
494
495 mutex_lock(&trace_event_mutex);
496
497 if (!event) {
498 ret = next_event_type++;
499 goto out;
500 }
501
502 if (!event->type)
503 event->type = next_event_type++;
504 else if (event->type > __TRACE_LAST_TYPE) {
505 printk(KERN_WARNING "Need to add type to trace.h\n");
506 WARN_ON(1);
507 }
508
509 if (ftrace_find_event(event->type))
510 goto out;
511
512 if (event->trace == NULL)
513 event->trace = trace_nop_print;
514 if (event->raw == NULL)
515 event->raw = trace_nop_print;
516 if (event->hex == NULL)
517 event->hex = trace_nop_print;
518 if (event->binary == NULL)
519 event->binary = trace_nop_print;
520
521 key = event->type & (EVENT_HASHSIZE - 1);
522
523 hlist_add_head_rcu(&event->node, &event_hash[key]);
524
525 ret = event->type;
526 out:
527 mutex_unlock(&trace_event_mutex);
528
529 return ret;
530}
531
532/**
533 * unregister_ftrace_event - remove a no longer used event
534 * @event: the event to remove
535 */
536int unregister_ftrace_event(struct trace_event *event)
537{
538 mutex_lock(&trace_event_mutex);
539 hlist_del(&event->node);
540 mutex_unlock(&trace_event_mutex);
541
542 return 0;
543}
544
545/*
546 * Standard events
547 */
548
549enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags)
550{
551 return TRACE_TYPE_HANDLED;
552}
553
554/* TRACE_FN */
555static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
556{
557 struct ftrace_entry *field;
558 struct trace_seq *s = &iter->seq;
559
560 trace_assign_type(field, iter->ent);
561
562 if (!seq_print_ip_sym(s, field->ip, flags))
563 goto partial;
564
565 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
566 if (!trace_seq_printf(s, " <-"))
567 goto partial;
568 if (!seq_print_ip_sym(s,
569 field->parent_ip,
570 flags))
571 goto partial;
572 }
573 if (!trace_seq_printf(s, "\n"))
574 goto partial;
575
576 return TRACE_TYPE_HANDLED;
577
578 partial:
579 return TRACE_TYPE_PARTIAL_LINE;
580}
581
582static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
583{
584 struct ftrace_entry *field;
585
586 trace_assign_type(field, iter->ent);
587
588 if (!trace_seq_printf(&iter->seq, "%lx %lx\n",
589 field->ip,
590 field->parent_ip))
591 return TRACE_TYPE_PARTIAL_LINE;
592
593 return TRACE_TYPE_HANDLED;
594}
595
596static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
597{
598 struct ftrace_entry *field;
599 struct trace_seq *s = &iter->seq;
600
601 trace_assign_type(field, iter->ent);
602
603 SEQ_PUT_HEX_FIELD_RET(s, field->ip);
604 SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
605
606 return TRACE_TYPE_HANDLED;
607}
608
609static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
610{
611 struct ftrace_entry *field;
612 struct trace_seq *s = &iter->seq;
613
614 trace_assign_type(field, iter->ent);
615
616 SEQ_PUT_FIELD_RET(s, field->ip);
617 SEQ_PUT_FIELD_RET(s, field->parent_ip);
618
619 return TRACE_TYPE_HANDLED;
620}
621
622static struct trace_event trace_fn_event = {
623 .type = TRACE_FN,
624 .trace = trace_fn_trace,
625 .raw = trace_fn_raw,
626 .hex = trace_fn_hex,
627 .binary = trace_fn_bin,
628};
629
630/* TRACE_CTX an TRACE_WAKE */
631static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
632 char *delim)
633{
634 struct ctx_switch_entry *field;
635 char comm[TASK_COMM_LEN];
636 int S, T;
637
638
639 trace_assign_type(field, iter->ent);
640
641 T = task_state_char(field->next_state);
642 S = task_state_char(field->prev_state);
643 trace_find_cmdline(field->next_pid, comm);
644 if (!trace_seq_printf(&iter->seq,
645 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
646 field->prev_pid,
647 field->prev_prio,
648 S, delim,
649 field->next_cpu,
650 field->next_pid,
651 field->next_prio,
652 T, comm))
653 return TRACE_TYPE_PARTIAL_LINE;
654
655 return TRACE_TYPE_HANDLED;
656}
657
658static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags)
659{
660 return trace_ctxwake_print(iter, "==>");
661}
662
663static enum print_line_t trace_wake_print(struct trace_iterator *iter,
664 int flags)
665{
666 return trace_ctxwake_print(iter, " +");
667}
668
669static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
670{
671 struct ctx_switch_entry *field;
672 int T;
673
674 trace_assign_type(field, iter->ent);
675
676 if (!S)
677 task_state_char(field->prev_state);
678 T = task_state_char(field->next_state);
679 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
680 field->prev_pid,
681 field->prev_prio,
682 S,
683 field->next_cpu,
684 field->next_pid,
685 field->next_prio,
686 T))
687 return TRACE_TYPE_PARTIAL_LINE;
688
689 return TRACE_TYPE_HANDLED;
690}
691
692static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags)
693{
694 return trace_ctxwake_raw(iter, 0);
695}
696
697static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags)
698{
699 return trace_ctxwake_raw(iter, '+');
700}
701
702
703static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
704{
705 struct ctx_switch_entry *field;
706 struct trace_seq *s = &iter->seq;
707 int T;
708
709 trace_assign_type(field, iter->ent);
710
711 if (!S)
712 task_state_char(field->prev_state);
713 T = task_state_char(field->next_state);
714
715 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
716 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
717 SEQ_PUT_HEX_FIELD_RET(s, S);
718 SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
719 SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
720 SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
721 SEQ_PUT_HEX_FIELD_RET(s, T);
722
723 return TRACE_TYPE_HANDLED;
724}
725
726static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags)
727{
728 return trace_ctxwake_hex(iter, 0);
729}
730
731static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags)
732{
733 return trace_ctxwake_hex(iter, '+');
734}
735
736static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
737 int flags)
738{
739 struct ctx_switch_entry *field;
740 struct trace_seq *s = &iter->seq;
741
742 trace_assign_type(field, iter->ent);
743
744 SEQ_PUT_FIELD_RET(s, field->prev_pid);
745 SEQ_PUT_FIELD_RET(s, field->prev_prio);
746 SEQ_PUT_FIELD_RET(s, field->prev_state);
747 SEQ_PUT_FIELD_RET(s, field->next_pid);
748 SEQ_PUT_FIELD_RET(s, field->next_prio);
749 SEQ_PUT_FIELD_RET(s, field->next_state);
750
751 return TRACE_TYPE_HANDLED;
752}
753
754static struct trace_event trace_ctx_event = {
755 .type = TRACE_CTX,
756 .trace = trace_ctx_print,
757 .raw = trace_ctx_raw,
758 .hex = trace_ctx_hex,
759 .binary = trace_ctxwake_bin,
760};
761
762static struct trace_event trace_wake_event = {
763 .type = TRACE_WAKE,
764 .trace = trace_wake_print,
765 .raw = trace_wake_raw,
766 .hex = trace_wake_hex,
767 .binary = trace_ctxwake_bin,
768};
769
770/* TRACE_SPECIAL */
771static enum print_line_t trace_special_print(struct trace_iterator *iter,
772 int flags)
773{
774 struct special_entry *field;
775
776 trace_assign_type(field, iter->ent);
777
778 if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n",
779 field->arg1,
780 field->arg2,
781 field->arg3))
782 return TRACE_TYPE_PARTIAL_LINE;
783
784 return TRACE_TYPE_HANDLED;
785}
786
787static enum print_line_t trace_special_hex(struct trace_iterator *iter,
788 int flags)
789{
790 struct special_entry *field;
791 struct trace_seq *s = &iter->seq;
792
793 trace_assign_type(field, iter->ent);
794
795 SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
796 SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
797 SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
798
799 return TRACE_TYPE_HANDLED;
800}
801
802static enum print_line_t trace_special_bin(struct trace_iterator *iter,
803 int flags)
804{
805 struct special_entry *field;
806 struct trace_seq *s = &iter->seq;
807
808 trace_assign_type(field, iter->ent);
809
810 SEQ_PUT_FIELD_RET(s, field->arg1);
811 SEQ_PUT_FIELD_RET(s, field->arg2);
812 SEQ_PUT_FIELD_RET(s, field->arg3);
813
814 return TRACE_TYPE_HANDLED;
815}
816
817static struct trace_event trace_special_event = {
818 .type = TRACE_SPECIAL,
819 .trace = trace_special_print,
820 .raw = trace_special_print,
821 .hex = trace_special_hex,
822 .binary = trace_special_bin,
823};
824
825/* TRACE_STACK */
826
827static enum print_line_t trace_stack_print(struct trace_iterator *iter,
828 int flags)
829{
830 struct stack_entry *field;
831 struct trace_seq *s = &iter->seq;
832 int i;
833
834 trace_assign_type(field, iter->ent);
835
836 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
837 if (i) {
838 if (!trace_seq_puts(s, " <= "))
839 goto partial;
840
841 if (!seq_print_ip_sym(s, field->caller[i], flags))
842 goto partial;
843 }
844 if (!trace_seq_puts(s, "\n"))
845 goto partial;
846 }
847
848 return TRACE_TYPE_HANDLED;
849
850 partial:
851 return TRACE_TYPE_PARTIAL_LINE;
852}
853
854static struct trace_event trace_stack_event = {
855 .type = TRACE_STACK,
856 .trace = trace_stack_print,
857 .raw = trace_special_print,
858 .hex = trace_special_hex,
859 .binary = trace_special_bin,
860};
861
862/* TRACE_USER_STACK */
863static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
864 int flags)
865{
866 struct userstack_entry *field;
867 struct trace_seq *s = &iter->seq;
868
869 trace_assign_type(field, iter->ent);
870
871 if (!seq_print_userip_objs(field, s, flags))
872 goto partial;
873
874 if (!trace_seq_putc(s, '\n'))
875 goto partial;
876
877 return TRACE_TYPE_HANDLED;
878
879 partial:
880 return TRACE_TYPE_PARTIAL_LINE;
881}
882
883static struct trace_event trace_user_stack_event = {
884 .type = TRACE_USER_STACK,
885 .trace = trace_user_stack_print,
886 .raw = trace_special_print,
887 .hex = trace_special_hex,
888 .binary = trace_special_bin,
889};
890
891/* TRACE_BPRINT */
892static enum print_line_t
893trace_bprint_print(struct trace_iterator *iter, int flags)
894{
895 struct trace_entry *entry = iter->ent;
896 struct trace_seq *s = &iter->seq;
897 struct bprint_entry *field;
898
899 trace_assign_type(field, entry);
900
901 if (!seq_print_ip_sym(s, field->ip, flags))
902 goto partial;
903
904 if (!trace_seq_puts(s, ": "))
905 goto partial;
906
907 if (!trace_seq_bprintf(s, field->fmt, field->buf))
908 goto partial;
909
910 return TRACE_TYPE_HANDLED;
911
912 partial:
913 return TRACE_TYPE_PARTIAL_LINE;
914}
915
916
917static enum print_line_t
918trace_bprint_raw(struct trace_iterator *iter, int flags)
919{
920 struct bprint_entry *field;
921 struct trace_seq *s = &iter->seq;
922
923 trace_assign_type(field, iter->ent);
924
925 if (!trace_seq_printf(s, ": %lx : ", field->ip))
926 goto partial;
927
928 if (!trace_seq_bprintf(s, field->fmt, field->buf))
929 goto partial;
930
931 return TRACE_TYPE_HANDLED;
932
933 partial:
934 return TRACE_TYPE_PARTIAL_LINE;
935}
936
937
938static struct trace_event trace_bprint_event = {
939 .type = TRACE_BPRINT,
940 .trace = trace_bprint_print,
941 .raw = trace_bprint_raw,
942};
943
944/* TRACE_PRINT */
945static enum print_line_t trace_print_print(struct trace_iterator *iter,
946 int flags)
947{
948 struct print_entry *field;
949 struct trace_seq *s = &iter->seq;
950
951 trace_assign_type(field, iter->ent);
952
953 if (!seq_print_ip_sym(s, field->ip, flags))
954 goto partial;
955
956 if (!trace_seq_printf(s, ": %s", field->buf))
957 goto partial;
958
959 return TRACE_TYPE_HANDLED;
960
961 partial:
962 return TRACE_TYPE_PARTIAL_LINE;
963}
964
965static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
966{
967 struct print_entry *field;
968
969 trace_assign_type(field, iter->ent);
970
971 if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf))
972 goto partial;
973
974 return TRACE_TYPE_HANDLED;
975
976 partial:
977 return TRACE_TYPE_PARTIAL_LINE;
978}
979
980static struct trace_event trace_print_event = {
981 .type = TRACE_PRINT,
982 .trace = trace_print_print,
983 .raw = trace_print_raw,
984};
985
986
987static struct trace_event *events[] __initdata = {
988 &trace_fn_event,
989 &trace_ctx_event,
990 &trace_wake_event,
991 &trace_special_event,
992 &trace_stack_event,
993 &trace_user_stack_event,
994 &trace_bprint_event,
995 &trace_print_event,
996 NULL
997};
998
999__init static int init_events(void)
1000{
1001 struct trace_event *event;
1002 int i, ret;
1003
1004 for (i = 0; events[i]; i++) {
1005 event = events[i];
1006
1007 ret = register_ftrace_event(event);
1008 if (!ret) {
1009 printk(KERN_WARNING "event %d failed to register\n",
1010 event->type);
1011 WARN_ON_ONCE(1);
1012 }
1013 }
1014
1015 return 0;
1016}
1017device_initcall(init_events);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
new file mode 100644
index 000000000000..e0bde39c2dd9
--- /dev/null
+++ b/kernel/trace/trace_output.h
@@ -0,0 +1,71 @@
1#ifndef __TRACE_EVENTS_H
2#define __TRACE_EVENTS_H
3
4#include "trace.h"
5
6typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
7 int flags);
8
9struct trace_event {
10 struct hlist_node node;
11 int type;
12 trace_print_func trace;
13 trace_print_func raw;
14 trace_print_func hex;
15 trace_print_func binary;
16};
17
18extern enum print_line_t
19trace_print_bprintk_msg_only(struct trace_iterator *iter);
20extern enum print_line_t
21trace_print_printk_msg_only(struct trace_iterator *iter);
22
23extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
24 __attribute__ ((format (printf, 2, 3)));
25extern int
26trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
27extern int
28seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
29 unsigned long sym_flags);
30extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
31 size_t cnt);
32extern int trace_seq_puts(struct trace_seq *s, const char *str);
33extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
34extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
35extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
36 size_t len);
37extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
38extern int trace_seq_path(struct trace_seq *s, struct path *path);
39extern int seq_print_userip_objs(const struct userstack_entry *entry,
40 struct trace_seq *s, unsigned long sym_flags);
41extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
42 unsigned long ip, unsigned long sym_flags);
43
44extern int trace_print_context(struct trace_iterator *iter);
45extern int trace_print_lat_context(struct trace_iterator *iter);
46
47extern struct trace_event *ftrace_find_event(int type);
48extern int register_ftrace_event(struct trace_event *event);
49extern int unregister_ftrace_event(struct trace_event *event);
50
51extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
52 int flags);
53
54#define MAX_MEMHEX_BYTES 8
55#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
56
57#define SEQ_PUT_FIELD_RET(s, x) \
58do { \
59 if (!trace_seq_putmem(s, &(x), sizeof(x))) \
60 return TRACE_TYPE_PARTIAL_LINE; \
61} while (0)
62
63#define SEQ_PUT_HEX_FIELD_RET(s, x) \
64do { \
65 BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
66 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
67 return TRACE_TYPE_PARTIAL_LINE; \
68} while (0)
69
70#endif
71
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 7bda248daf55..bae791ebcc51 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -11,15 +11,113 @@
11 11
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/ftrace.h> 14#include <trace/power.h>
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/module.h> 16#include <linux/module.h>
17 17
18#include "trace.h" 18#include "trace.h"
19#include "trace_output.h"
19 20
20static struct trace_array *power_trace; 21static struct trace_array *power_trace;
21static int __read_mostly trace_power_enabled; 22static int __read_mostly trace_power_enabled;
22 23
24static void probe_power_start(struct power_trace *it, unsigned int type,
25 unsigned int level)
26{
27 if (!trace_power_enabled)
28 return;
29
30 memset(it, 0, sizeof(struct power_trace));
31 it->state = level;
32 it->type = type;
33 it->stamp = ktime_get();
34}
35
36
37static void probe_power_end(struct power_trace *it)
38{
39 struct ring_buffer_event *event;
40 struct trace_power *entry;
41 struct trace_array_cpu *data;
42 struct trace_array *tr = power_trace;
43
44 if (!trace_power_enabled)
45 return;
46
47 preempt_disable();
48 it->end = ktime_get();
49 data = tr->data[smp_processor_id()];
50
51 event = trace_buffer_lock_reserve(tr, TRACE_POWER,
52 sizeof(*entry), 0, 0);
53 if (!event)
54 goto out;
55 entry = ring_buffer_event_data(event);
56 entry->state_data = *it;
57 trace_buffer_unlock_commit(tr, event, 0, 0);
58 out:
59 preempt_enable();
60}
61
62static void probe_power_mark(struct power_trace *it, unsigned int type,
63 unsigned int level)
64{
65 struct ring_buffer_event *event;
66 struct trace_power *entry;
67 struct trace_array_cpu *data;
68 struct trace_array *tr = power_trace;
69
70 if (!trace_power_enabled)
71 return;
72
73 memset(it, 0, sizeof(struct power_trace));
74 it->state = level;
75 it->type = type;
76 it->stamp = ktime_get();
77 preempt_disable();
78 it->end = it->stamp;
79 data = tr->data[smp_processor_id()];
80
81 event = trace_buffer_lock_reserve(tr, TRACE_POWER,
82 sizeof(*entry), 0, 0);
83 if (!event)
84 goto out;
85 entry = ring_buffer_event_data(event);
86 entry->state_data = *it;
87 trace_buffer_unlock_commit(tr, event, 0, 0);
88 out:
89 preempt_enable();
90}
91
92static int tracing_power_register(void)
93{
94 int ret;
95
96 ret = register_trace_power_start(probe_power_start);
97 if (ret) {
98 pr_info("power trace: Couldn't activate tracepoint"
99 " probe to trace_power_start\n");
100 return ret;
101 }
102 ret = register_trace_power_end(probe_power_end);
103 if (ret) {
104 pr_info("power trace: Couldn't activate tracepoint"
105 " probe to trace_power_end\n");
106 goto fail_start;
107 }
108 ret = register_trace_power_mark(probe_power_mark);
109 if (ret) {
110 pr_info("power trace: Couldn't activate tracepoint"
111 " probe to trace_power_mark\n");
112 goto fail_end;
113 }
114 return ret;
115fail_end:
116 unregister_trace_power_end(probe_power_end);
117fail_start:
118 unregister_trace_power_start(probe_power_start);
119 return ret;
120}
23 121
24static void start_power_trace(struct trace_array *tr) 122static void start_power_trace(struct trace_array *tr)
25{ 123{
@@ -31,6 +129,14 @@ static void stop_power_trace(struct trace_array *tr)
31 trace_power_enabled = 0; 129 trace_power_enabled = 0;
32} 130}
33 131
132static void power_trace_reset(struct trace_array *tr)
133{
134 trace_power_enabled = 0;
135 unregister_trace_power_start(probe_power_start);
136 unregister_trace_power_end(probe_power_end);
137 unregister_trace_power_mark(probe_power_mark);
138}
139
34 140
35static int power_trace_init(struct trace_array *tr) 141static int power_trace_init(struct trace_array *tr)
36{ 142{
@@ -38,6 +144,7 @@ static int power_trace_init(struct trace_array *tr)
38 power_trace = tr; 144 power_trace = tr;
39 145
40 trace_power_enabled = 1; 146 trace_power_enabled = 1;
147 tracing_power_register();
41 148
42 for_each_cpu(cpu, cpu_possible_mask) 149 for_each_cpu(cpu, cpu_possible_mask)
43 tracing_reset(tr, cpu); 150 tracing_reset(tr, cpu);
@@ -85,7 +192,7 @@ static struct tracer power_tracer __read_mostly =
85 .init = power_trace_init, 192 .init = power_trace_init,
86 .start = start_power_trace, 193 .start = start_power_trace,
87 .stop = stop_power_trace, 194 .stop = stop_power_trace,
88 .reset = stop_power_trace, 195 .reset = power_trace_reset,
89 .print_line = power_print_line, 196 .print_line = power_print_line,
90}; 197};
91 198
@@ -94,86 +201,3 @@ static int init_power_trace(void)
94 return register_tracer(&power_tracer); 201 return register_tracer(&power_tracer);
95} 202}
96device_initcall(init_power_trace); 203device_initcall(init_power_trace);
97
98void trace_power_start(struct power_trace *it, unsigned int type,
99 unsigned int level)
100{
101 if (!trace_power_enabled)
102 return;
103
104 memset(it, 0, sizeof(struct power_trace));
105 it->state = level;
106 it->type = type;
107 it->stamp = ktime_get();
108}
109EXPORT_SYMBOL_GPL(trace_power_start);
110
111
112void trace_power_end(struct power_trace *it)
113{
114 struct ring_buffer_event *event;
115 struct trace_power *entry;
116 struct trace_array_cpu *data;
117 unsigned long irq_flags;
118 struct trace_array *tr = power_trace;
119
120 if (!trace_power_enabled)
121 return;
122
123 preempt_disable();
124 it->end = ktime_get();
125 data = tr->data[smp_processor_id()];
126
127 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
128 &irq_flags);
129 if (!event)
130 goto out;
131 entry = ring_buffer_event_data(event);
132 tracing_generic_entry_update(&entry->ent, 0, 0);
133 entry->ent.type = TRACE_POWER;
134 entry->state_data = *it;
135 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
136
137 trace_wake_up();
138
139 out:
140 preempt_enable();
141}
142EXPORT_SYMBOL_GPL(trace_power_end);
143
144void trace_power_mark(struct power_trace *it, unsigned int type,
145 unsigned int level)
146{
147 struct ring_buffer_event *event;
148 struct trace_power *entry;
149 struct trace_array_cpu *data;
150 unsigned long irq_flags;
151 struct trace_array *tr = power_trace;
152
153 if (!trace_power_enabled)
154 return;
155
156 memset(it, 0, sizeof(struct power_trace));
157 it->state = level;
158 it->type = type;
159 it->stamp = ktime_get();
160 preempt_disable();
161 it->end = it->stamp;
162 data = tr->data[smp_processor_id()];
163
164 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
165 &irq_flags);
166 if (!event)
167 goto out;
168 entry = ring_buffer_event_data(event);
169 tracing_generic_entry_update(&entry->ent, 0, 0);
170 entry->ent.type = TRACE_POWER;
171 entry->state_data = *it;
172 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
173
174 trace_wake_up();
175
176 out:
177 preempt_enable();
178}
179EXPORT_SYMBOL_GPL(trace_power_mark);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
new file mode 100644
index 000000000000..eb81556107fe
--- /dev/null
+++ b/kernel/trace/trace_printk.c
@@ -0,0 +1,270 @@
1/*
2 * trace binary printk
3 *
4 * Copyright (C) 2008 Lai Jiangshan <laijs@cn.fujitsu.com>
5 *
6 */
7#include <linux/seq_file.h>
8#include <linux/debugfs.h>
9#include <linux/uaccess.h>
10#include <linux/kernel.h>
11#include <linux/ftrace.h>
12#include <linux/string.h>
13#include <linux/module.h>
14#include <linux/marker.h>
15#include <linux/mutex.h>
16#include <linux/ctype.h>
17#include <linux/list.h>
18#include <linux/slab.h>
19#include <linux/fs.h>
20
21#include "trace.h"
22
23#ifdef CONFIG_MODULES
24
25/*
26 * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt
27 * which are queued on trace_bprintk_fmt_list.
28 */
29static LIST_HEAD(trace_bprintk_fmt_list);
30
31/* serialize accesses to trace_bprintk_fmt_list */
32static DEFINE_MUTEX(btrace_mutex);
33
34struct trace_bprintk_fmt {
35 struct list_head list;
36 char fmt[0];
37};
38
39static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
40{
41 struct trace_bprintk_fmt *pos;
42 list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
43 if (!strcmp(pos->fmt, fmt))
44 return pos;
45 }
46 return NULL;
47}
48
49static
50void hold_module_trace_bprintk_format(const char **start, const char **end)
51{
52 const char **iter;
53
54 mutex_lock(&btrace_mutex);
55 for (iter = start; iter < end; iter++) {
56 struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
57 if (tb_fmt) {
58 *iter = tb_fmt->fmt;
59 continue;
60 }
61
62 tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt)
63 + strlen(*iter) + 1, GFP_KERNEL);
64 if (tb_fmt) {
65 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
66 strcpy(tb_fmt->fmt, *iter);
67 *iter = tb_fmt->fmt;
68 } else
69 *iter = NULL;
70 }
71 mutex_unlock(&btrace_mutex);
72}
73
74static int module_trace_bprintk_format_notify(struct notifier_block *self,
75 unsigned long val, void *data)
76{
77 struct module *mod = data;
78 if (mod->num_trace_bprintk_fmt) {
79 const char **start = mod->trace_bprintk_fmt_start;
80 const char **end = start + mod->num_trace_bprintk_fmt;
81
82 if (val == MODULE_STATE_COMING)
83 hold_module_trace_bprintk_format(start, end);
84 }
85 return 0;
86}
87
88#else /* !CONFIG_MODULES */
89__init static int
90module_trace_bprintk_format_notify(struct notifier_block *self,
91 unsigned long val, void *data)
92{
93 return 0;
94}
95#endif /* CONFIG_MODULES */
96
97
98__initdata_or_module static
99struct notifier_block module_trace_bprintk_format_nb = {
100 .notifier_call = module_trace_bprintk_format_notify,
101};
102
103int __trace_bprintk(unsigned long ip, const char *fmt, ...)
104 {
105 int ret;
106 va_list ap;
107
108 if (unlikely(!fmt))
109 return 0;
110
111 if (!(trace_flags & TRACE_ITER_PRINTK))
112 return 0;
113
114 va_start(ap, fmt);
115 ret = trace_vbprintk(ip, fmt, ap);
116 va_end(ap);
117 return ret;
118}
119EXPORT_SYMBOL_GPL(__trace_bprintk);
120
121int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap)
122 {
123 if (unlikely(!fmt))
124 return 0;
125
126 if (!(trace_flags & TRACE_ITER_PRINTK))
127 return 0;
128
129 return trace_vbprintk(ip, fmt, ap);
130}
131EXPORT_SYMBOL_GPL(__ftrace_vbprintk);
132
133int __trace_printk(unsigned long ip, const char *fmt, ...)
134{
135 int ret;
136 va_list ap;
137
138 if (!(trace_flags & TRACE_ITER_PRINTK))
139 return 0;
140
141 va_start(ap, fmt);
142 ret = trace_vprintk(ip, fmt, ap);
143 va_end(ap);
144 return ret;
145}
146EXPORT_SYMBOL_GPL(__trace_printk);
147
148int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
149{
150 if (!(trace_flags & TRACE_ITER_PRINTK))
151 return 0;
152
153 return trace_vprintk(ip, fmt, ap);
154}
155EXPORT_SYMBOL_GPL(__ftrace_vprintk);
156
157static void *
158t_next(struct seq_file *m, void *v, loff_t *pos)
159{
160 const char **fmt = m->private;
161 const char **next = fmt;
162
163 (*pos)++;
164
165 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
166 return NULL;
167
168 next = fmt;
169 m->private = ++next;
170
171 return fmt;
172}
173
174static void *t_start(struct seq_file *m, loff_t *pos)
175{
176 return t_next(m, NULL, pos);
177}
178
/*
 * seq_file ->show: print one record as
 *
 *	0x<address of the table slot> : "<format text>"
 *
 * with newline, tab and double-quote characters in the text written as
 * \n, \t and \".  Always returns 0.
 */
static int t_show(struct seq_file *m, void *v)
{
	const char **fmt = v;
	const char *p = *fmt;

	seq_printf(m, "0x%lx : \"", (unsigned long)fmt);

	/*
	 * Tabs and new lines need to be converted.
	 */
	for (; *p; p++) {
		switch (*p) {
		case '\n':
			seq_puts(m, "\\n");
			break;
		case '\t':
			seq_puts(m, "\\t");
			break;
		case '\\':
			/*
			 * NOTE(review): a backslash is emitted as a single
			 * backslash, i.e. it is not doubled.
			 */
			seq_puts(m, "\\");
			break;
		case '"':
			seq_puts(m, "\\\"");
			break;
		default:
			seq_putc(m, *p);
		}
	}
	seq_puts(m, "\"\n");

	return 0;
}
212
/* seq_file ->stop: intentionally empty. */
static void t_stop(struct seq_file *m, void *p)
{
}
216
217static const struct seq_operations show_format_seq_ops = {
218 .start = t_start,
219 .next = t_next,
220 .show = t_show,
221 .stop = t_stop,
222};
223
224static int
225ftrace_formats_open(struct inode *inode, struct file *file)
226{
227 int ret;
228
229 ret = seq_open(file, &show_format_seq_ops);
230 if (!ret) {
231 struct seq_file *m = file->private_data;
232
233 m->private = __start___trace_bprintk_fmt;
234 }
235 return ret;
236}
237
238static const struct file_operations ftrace_formats_fops = {
239 .open = ftrace_formats_open,
240 .read = seq_read,
241 .llseek = seq_lseek,
242 .release = seq_release,
243};
244
245static __init int init_trace_printk_function_export(void)
246{
247 struct dentry *d_tracer;
248 struct dentry *entry;
249
250 d_tracer = tracing_init_dentry();
251 if (!d_tracer)
252 return 0;
253
254 entry = debugfs_create_file("printk_formats", 0444, d_tracer,
255 NULL, &ftrace_formats_fops);
256 if (!entry)
257 pr_warning("Could not create debugfs "
258 "'printk_formats' entry\n");
259
260 return 0;
261}
262
263fs_initcall(init_trace_printk_function_export);
264
265static __init int init_trace_printk(void)
266{
267 return register_module_notifier(&module_trace_bprintk_format_nb);
268}
269
270early_initcall(init_trace_printk);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index df175cb4564f..9117cea6f1ae 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -18,6 +18,7 @@ static struct trace_array *ctx_trace;
18static int __read_mostly tracer_enabled; 18static int __read_mostly tracer_enabled;
19static int sched_ref; 19static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex); 20static DEFINE_MUTEX(sched_register_mutex);
21static int sched_stopped;
21 22
22static void 23static void
23probe_sched_switch(struct rq *__rq, struct task_struct *prev, 24probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -28,7 +29,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
28 int cpu; 29 int cpu;
29 int pc; 30 int pc;
30 31
31 if (!sched_ref) 32 if (!sched_ref || sched_stopped)
32 return; 33 return;
33 34
34 tracing_record_cmdline(prev); 35 tracing_record_cmdline(prev);
@@ -43,7 +44,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
43 data = ctx_trace->data[cpu]; 44 data = ctx_trace->data[cpu];
44 45
45 if (likely(!atomic_read(&data->disabled))) 46 if (likely(!atomic_read(&data->disabled)))
46 tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc); 47 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
47 48
48 local_irq_restore(flags); 49 local_irq_restore(flags);
49} 50}
@@ -61,12 +62,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
61 pc = preempt_count(); 62 pc = preempt_count();
62 tracing_record_cmdline(current); 63 tracing_record_cmdline(current);
63 64
65 if (sched_stopped)
66 return;
67
64 local_irq_save(flags); 68 local_irq_save(flags);
65 cpu = raw_smp_processor_id(); 69 cpu = raw_smp_processor_id();
66 data = ctx_trace->data[cpu]; 70 data = ctx_trace->data[cpu];
67 71
68 if (likely(!atomic_read(&data->disabled))) 72 if (likely(!atomic_read(&data->disabled)))
69 tracing_sched_wakeup_trace(ctx_trace, data, wakee, current, 73 tracing_sched_wakeup_trace(ctx_trace, wakee, current,
70 flags, pc); 74 flags, pc);
71 75
72 local_irq_restore(flags); 76 local_irq_restore(flags);
@@ -93,7 +97,7 @@ static int tracing_sched_register(void)
93 ret = register_trace_sched_switch(probe_sched_switch); 97 ret = register_trace_sched_switch(probe_sched_switch);
94 if (ret) { 98 if (ret) {
95 pr_info("sched trace: Couldn't activate tracepoint" 99 pr_info("sched trace: Couldn't activate tracepoint"
96 " probe to kernel_sched_schedule\n"); 100 " probe to kernel_sched_switch\n");
97 goto fail_deprobe_wake_new; 101 goto fail_deprobe_wake_new;
98 } 102 }
99 103
@@ -185,12 +189,6 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
185 ctx_trace = tr; 189 ctx_trace = tr;
186} 190}
187 191
188static void start_sched_trace(struct trace_array *tr)
189{
190 tracing_reset_online_cpus(tr);
191 tracing_start_sched_switch_record();
192}
193
194static void stop_sched_trace(struct trace_array *tr) 192static void stop_sched_trace(struct trace_array *tr)
195{ 193{
196 tracing_stop_sched_switch_record(); 194 tracing_stop_sched_switch_record();
@@ -199,7 +197,8 @@ static void stop_sched_trace(struct trace_array *tr)
199static int sched_switch_trace_init(struct trace_array *tr) 197static int sched_switch_trace_init(struct trace_array *tr)
200{ 198{
201 ctx_trace = tr; 199 ctx_trace = tr;
202 start_sched_trace(tr); 200 tracing_reset_online_cpus(tr);
201 tracing_start_sched_switch_record();
203 return 0; 202 return 0;
204} 203}
205 204
@@ -211,13 +210,12 @@ static void sched_switch_trace_reset(struct trace_array *tr)
211 210
212static void sched_switch_trace_start(struct trace_array *tr) 211static void sched_switch_trace_start(struct trace_array *tr)
213{ 212{
214 tracing_reset_online_cpus(tr); 213 sched_stopped = 0;
215 tracing_start_sched_switch();
216} 214}
217 215
218static void sched_switch_trace_stop(struct trace_array *tr) 216static void sched_switch_trace_stop(struct trace_array *tr)
219{ 217{
220 tracing_stop_sched_switch(); 218 sched_stopped = 1;
221} 219}
222 220
223static struct tracer sched_switch_trace __read_mostly = 221static struct tracer sched_switch_trace __read_mostly =
@@ -227,6 +225,7 @@ static struct tracer sched_switch_trace __read_mostly =
227 .reset = sched_switch_trace_reset, 225 .reset = sched_switch_trace_reset,
228 .start = sched_switch_trace_start, 226 .start = sched_switch_trace_start,
229 .stop = sched_switch_trace_stop, 227 .stop = sched_switch_trace_stop,
228 .wait_pipe = poll_wait_pipe,
230#ifdef CONFIG_FTRACE_SELFTEST 229#ifdef CONFIG_FTRACE_SELFTEST
231 .selftest = trace_selftest_startup_sched_switch, 230 .selftest = trace_selftest_startup_sched_switch,
232#endif 231#endif
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 43586b689e31..5bc00e8f153e 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -25,12 +25,15 @@ static int __read_mostly tracer_enabled;
25static struct task_struct *wakeup_task; 25static struct task_struct *wakeup_task;
26static int wakeup_cpu; 26static int wakeup_cpu;
27static unsigned wakeup_prio = -1; 27static unsigned wakeup_prio = -1;
28static int wakeup_rt;
28 29
29static raw_spinlock_t wakeup_lock = 30static raw_spinlock_t wakeup_lock =
30 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 31 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
31 32
32static void __wakeup_reset(struct trace_array *tr); 33static void __wakeup_reset(struct trace_array *tr);
33 34
35static int save_lat_flag;
36
34#ifdef CONFIG_FUNCTION_TRACER 37#ifdef CONFIG_FUNCTION_TRACER
35/* 38/*
36 * irqsoff uses its own tracer function to keep the overhead down: 39 * irqsoff uses its own tracer function to keep the overhead down:
@@ -71,7 +74,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
71 if (task_cpu(wakeup_task) != cpu) 74 if (task_cpu(wakeup_task) != cpu)
72 goto unlock; 75 goto unlock;
73 76
74 trace_function(tr, data, ip, parent_ip, flags, pc); 77 trace_function(tr, ip, parent_ip, flags, pc);
75 78
76 unlock: 79 unlock:
77 __raw_spin_unlock(&wakeup_lock); 80 __raw_spin_unlock(&wakeup_lock);
@@ -151,7 +154,8 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
151 if (unlikely(!tracer_enabled || next != wakeup_task)) 154 if (unlikely(!tracer_enabled || next != wakeup_task))
152 goto out_unlock; 155 goto out_unlock;
153 156
154 trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
155 159
156 /* 160 /*
157 * usecs conversion is slow so we try to delay the conversion 161 * usecs conversion is slow so we try to delay the conversion
@@ -182,13 +186,10 @@ out:
182 186
183static void __wakeup_reset(struct trace_array *tr) 187static void __wakeup_reset(struct trace_array *tr)
184{ 188{
185 struct trace_array_cpu *data;
186 int cpu; 189 int cpu;
187 190
188 for_each_possible_cpu(cpu) { 191 for_each_possible_cpu(cpu)
189 data = tr->data[cpu];
190 tracing_reset(tr, cpu); 192 tracing_reset(tr, cpu);
191 }
192 193
193 wakeup_cpu = -1; 194 wakeup_cpu = -1;
194 wakeup_prio = -1; 195 wakeup_prio = -1;
@@ -213,6 +214,7 @@ static void wakeup_reset(struct trace_array *tr)
213static void 214static void
214probe_wakeup(struct rq *rq, struct task_struct *p, int success) 215probe_wakeup(struct rq *rq, struct task_struct *p, int success)
215{ 216{
217 struct trace_array_cpu *data;
216 int cpu = smp_processor_id(); 218 int cpu = smp_processor_id();
217 unsigned long flags; 219 unsigned long flags;
218 long disabled; 220 long disabled;
@@ -224,7 +226,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
224 tracing_record_cmdline(p); 226 tracing_record_cmdline(p);
225 tracing_record_cmdline(current); 227 tracing_record_cmdline(current);
226 228
227 if (likely(!rt_task(p)) || 229 if ((wakeup_rt && !rt_task(p)) ||
228 p->prio >= wakeup_prio || 230 p->prio >= wakeup_prio ||
229 p->prio >= current->prio) 231 p->prio >= current->prio)
230 return; 232 return;
@@ -252,9 +254,16 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
252 254
253 local_save_flags(flags); 255 local_save_flags(flags);
254 256
255 wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); 257 data = wakeup_trace->data[wakeup_cpu];
256 trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu], 258 data->preempt_timestamp = ftrace_now(cpu);
257 CALLER_ADDR1, CALLER_ADDR2, flags, pc); 259 tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
260
261 /*
262 * We must be careful in using CALLER_ADDR2. But since wake_up
263 * is not called by an assembly function (whereas schedule is)
264 * it should be safe to use it here.
265 */
266 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
258 267
259out_locked: 268out_locked:
260 __raw_spin_unlock(&wakeup_lock); 269 __raw_spin_unlock(&wakeup_lock);
@@ -262,12 +271,6 @@ out:
262 atomic_dec(&wakeup_trace->data[cpu]->disabled); 271 atomic_dec(&wakeup_trace->data[cpu]->disabled);
263} 272}
264 273
265/*
266 * save_tracer_enabled is used to save the state of the tracer_enabled
267 * variable when we disable it when we open a trace output file.
268 */
269static int save_tracer_enabled;
270
271static void start_wakeup_tracer(struct trace_array *tr) 274static void start_wakeup_tracer(struct trace_array *tr)
272{ 275{
273 int ret; 276 int ret;
@@ -289,7 +292,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
289 ret = register_trace_sched_switch(probe_wakeup_sched_switch); 292 ret = register_trace_sched_switch(probe_wakeup_sched_switch);
290 if (ret) { 293 if (ret) {
291 pr_info("sched trace: Couldn't activate tracepoint" 294 pr_info("sched trace: Couldn't activate tracepoint"
292 " probe to kernel_sched_schedule\n"); 295 " probe to kernel_sched_switch\n");
293 goto fail_deprobe_wake_new; 296 goto fail_deprobe_wake_new;
294 } 297 }
295 298
@@ -306,13 +309,10 @@ static void start_wakeup_tracer(struct trace_array *tr)
306 309
307 register_ftrace_function(&trace_ops); 310 register_ftrace_function(&trace_ops);
308 311
309 if (tracing_is_enabled()) { 312 if (tracing_is_enabled())
310 tracer_enabled = 1; 313 tracer_enabled = 1;
311 save_tracer_enabled = 1; 314 else
312 } else {
313 tracer_enabled = 0; 315 tracer_enabled = 0;
314 save_tracer_enabled = 0;
315 }
316 316
317 return; 317 return;
318fail_deprobe_wake_new: 318fail_deprobe_wake_new:
@@ -324,53 +324,54 @@ fail_deprobe:
324static void stop_wakeup_tracer(struct trace_array *tr) 324static void stop_wakeup_tracer(struct trace_array *tr)
325{ 325{
326 tracer_enabled = 0; 326 tracer_enabled = 0;
327 save_tracer_enabled = 0;
328 unregister_ftrace_function(&trace_ops); 327 unregister_ftrace_function(&trace_ops);
329 unregister_trace_sched_switch(probe_wakeup_sched_switch); 328 unregister_trace_sched_switch(probe_wakeup_sched_switch);
330 unregister_trace_sched_wakeup_new(probe_wakeup); 329 unregister_trace_sched_wakeup_new(probe_wakeup);
331 unregister_trace_sched_wakeup(probe_wakeup); 330 unregister_trace_sched_wakeup(probe_wakeup);
332} 331}
333 332
334static int wakeup_tracer_init(struct trace_array *tr) 333static int __wakeup_tracer_init(struct trace_array *tr)
335{ 334{
335 save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
336 trace_flags |= TRACE_ITER_LATENCY_FMT;
337
338 tracing_max_latency = 0;
336 wakeup_trace = tr; 339 wakeup_trace = tr;
337 start_wakeup_tracer(tr); 340 start_wakeup_tracer(tr);
338 return 0; 341 return 0;
339} 342}
340 343
344static int wakeup_tracer_init(struct trace_array *tr)
345{
346 wakeup_rt = 0;
347 return __wakeup_tracer_init(tr);
348}
349
350static int wakeup_rt_tracer_init(struct trace_array *tr)
351{
352 wakeup_rt = 1;
353 return __wakeup_tracer_init(tr);
354}
355
341static void wakeup_tracer_reset(struct trace_array *tr) 356static void wakeup_tracer_reset(struct trace_array *tr)
342{ 357{
343 stop_wakeup_tracer(tr); 358 stop_wakeup_tracer(tr);
344 /* make sure we put back any tasks we are tracing */ 359 /* make sure we put back any tasks we are tracing */
345 wakeup_reset(tr); 360 wakeup_reset(tr);
361
362 if (!save_lat_flag)
363 trace_flags &= ~TRACE_ITER_LATENCY_FMT;
346} 364}
347 365
348static void wakeup_tracer_start(struct trace_array *tr) 366static void wakeup_tracer_start(struct trace_array *tr)
349{ 367{
350 wakeup_reset(tr); 368 wakeup_reset(tr);
351 tracer_enabled = 1; 369 tracer_enabled = 1;
352 save_tracer_enabled = 1;
353} 370}
354 371
355static void wakeup_tracer_stop(struct trace_array *tr) 372static void wakeup_tracer_stop(struct trace_array *tr)
356{ 373{
357 tracer_enabled = 0; 374 tracer_enabled = 0;
358 save_tracer_enabled = 0;
359}
360
361static void wakeup_tracer_open(struct trace_iterator *iter)
362{
363 /* stop the trace while dumping */
364 tracer_enabled = 0;
365}
366
367static void wakeup_tracer_close(struct trace_iterator *iter)
368{
369 /* forget about any processes we were recording */
370 if (save_tracer_enabled) {
371 wakeup_reset(iter->tr);
372 tracer_enabled = 1;
373 }
374} 375}
375 376
376static struct tracer wakeup_tracer __read_mostly = 377static struct tracer wakeup_tracer __read_mostly =
@@ -380,8 +381,20 @@ static struct tracer wakeup_tracer __read_mostly =
380 .reset = wakeup_tracer_reset, 381 .reset = wakeup_tracer_reset,
381 .start = wakeup_tracer_start, 382 .start = wakeup_tracer_start,
382 .stop = wakeup_tracer_stop, 383 .stop = wakeup_tracer_stop,
383 .open = wakeup_tracer_open, 384 .print_max = 1,
384 .close = wakeup_tracer_close, 385#ifdef CONFIG_FTRACE_SELFTEST
386 .selftest = trace_selftest_startup_wakeup,
387#endif
388};
389
390static struct tracer wakeup_rt_tracer __read_mostly =
391{
392 .name = "wakeup_rt",
393 .init = wakeup_rt_tracer_init,
394 .reset = wakeup_tracer_reset,
395 .start = wakeup_tracer_start,
396 .stop = wakeup_tracer_stop,
397 .wait_pipe = poll_wait_pipe,
385 .print_max = 1, 398 .print_max = 1,
386#ifdef CONFIG_FTRACE_SELFTEST 399#ifdef CONFIG_FTRACE_SELFTEST
387 .selftest = trace_selftest_startup_wakeup, 400 .selftest = trace_selftest_startup_wakeup,
@@ -396,6 +409,10 @@ __init static int init_wakeup_tracer(void)
396 if (ret) 409 if (ret)
397 return ret; 410 return ret;
398 411
412 ret = register_tracer(&wakeup_rt_tracer);
413 if (ret)
414 return ret;
415
399 return 0; 416 return 0;
400} 417}
401device_initcall(init_wakeup_tracer); 418device_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 88c8eb70f54a..08f4eb2763d1 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1,5 +1,6 @@
1/* Include in trace.c */ 1/* Include in trace.c */
2 2
3#include <linux/stringify.h>
3#include <linux/kthread.h> 4#include <linux/kthread.h>
4#include <linux/delay.h> 5#include <linux/delay.h>
5 6
@@ -9,11 +10,12 @@ static inline int trace_valid_entry(struct trace_entry *entry)
9 case TRACE_FN: 10 case TRACE_FN:
10 case TRACE_CTX: 11 case TRACE_CTX:
11 case TRACE_WAKE: 12 case TRACE_WAKE:
12 case TRACE_CONT:
13 case TRACE_STACK: 13 case TRACE_STACK:
14 case TRACE_PRINT: 14 case TRACE_PRINT:
15 case TRACE_SPECIAL: 15 case TRACE_SPECIAL:
16 case TRACE_BRANCH: 16 case TRACE_BRANCH:
17 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET:
17 return 1; 19 return 1;
18 } 20 }
19 return 0; 21 return 0;
@@ -23,10 +25,20 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
23{ 25{
24 struct ring_buffer_event *event; 26 struct ring_buffer_event *event;
25 struct trace_entry *entry; 27 struct trace_entry *entry;
28 unsigned int loops = 0;
26 29
27 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { 30 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) {
28 entry = ring_buffer_event_data(event); 31 entry = ring_buffer_event_data(event);
29 32
33 /*
34 * The ring buffer is a size of trace_buf_size, if
35 * we loop more than the size, there's something wrong
36 * with the ring buffer.
37 */
38 if (loops++ > trace_buf_size) {
39 printk(KERN_CONT ".. bad ring buffer ");
40 goto failed;
41 }
30 if (!trace_valid_entry(entry)) { 42 if (!trace_valid_entry(entry)) {
31 printk(KERN_CONT ".. invalid entry %d ", 43 printk(KERN_CONT ".. invalid entry %d ",
32 entry->type); 44 entry->type);
@@ -57,11 +69,20 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
57 69
58 cnt = ring_buffer_entries(tr->buffer); 70 cnt = ring_buffer_entries(tr->buffer);
59 71
72 /*
73 * The trace_test_buffer_cpu runs a while loop to consume all data.
74 * If the calling tracer is broken, and is constantly filling
75 * the buffer, this will run forever, and hard lock the box.
76 * We disable the ring buffer while we do this test to prevent
77 * a hard lock up.
78 */
79 tracing_off();
60 for_each_possible_cpu(cpu) { 80 for_each_possible_cpu(cpu) {
61 ret = trace_test_buffer_cpu(tr, cpu); 81 ret = trace_test_buffer_cpu(tr, cpu);
62 if (ret) 82 if (ret)
63 break; 83 break;
64 } 84 }
85 tracing_on();
65 __raw_spin_unlock(&ftrace_max_lock); 86 __raw_spin_unlock(&ftrace_max_lock);
66 local_irq_restore(flags); 87 local_irq_restore(flags);
67 88
@@ -80,9 +101,6 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
80 101
81#ifdef CONFIG_DYNAMIC_FTRACE 102#ifdef CONFIG_DYNAMIC_FTRACE
82 103
83#define __STR(x) #x
84#define STR(x) __STR(x)
85
86/* Test dynamic code modification and ftrace filters */ 104/* Test dynamic code modification and ftrace filters */
87int trace_selftest_startup_dynamic_tracing(struct tracer *trace, 105int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
88 struct trace_array *tr, 106 struct trace_array *tr,
@@ -106,17 +124,17 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
106 func(); 124 func();
107 125
108 /* 126 /*
109 * Some archs *cough*PowerPC*cough* add charachters to the 127 * Some archs *cough*PowerPC*cough* add characters to the
110 * start of the function names. We simply put a '*' to 128 * start of the function names. We simply put a '*' to
111 * accomodate them. 129 * accommodate them.
112 */ 130 */
113 func_name = "*" STR(DYN_FTRACE_TEST_NAME); 131 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
114 132
115 /* filter only on our function */ 133 /* filter only on our function */
116 ftrace_set_filter(func_name, strlen(func_name), 1); 134 ftrace_set_filter(func_name, strlen(func_name), 1);
117 135
118 /* enable tracing */ 136 /* enable tracing */
119 ret = trace->init(tr); 137 ret = tracer_init(trace, tr);
120 if (ret) { 138 if (ret) {
121 warn_failed_init_tracer(trace, ret); 139 warn_failed_init_tracer(trace, ret);
122 goto out; 140 goto out;
@@ -190,7 +208,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
190 ftrace_enabled = 1; 208 ftrace_enabled = 1;
191 tracer_enabled = 1; 209 tracer_enabled = 1;
192 210
193 ret = trace->init(tr); 211 ret = tracer_init(trace, tr);
194 if (ret) { 212 if (ret) {
195 warn_failed_init_tracer(trace, ret); 213 warn_failed_init_tracer(trace, ret);
196 goto out; 214 goto out;
@@ -228,6 +246,90 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
228} 246}
229#endif /* CONFIG_FUNCTION_TRACER */ 247#endif /* CONFIG_FUNCTION_TRACER */
230 248
249
250#ifdef CONFIG_FUNCTION_GRAPH_TRACER
251
252/* Maximum number of functions to trace before diagnosing a hang */
253#define GRAPH_MAX_FUNC_TEST 100000000
254
255static void __ftrace_dump(bool disable_tracing);
256static unsigned int graph_hang_thresh;
257
258/* Wrap the real function entry probe to avoid possible hanging */
259static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
260{
261 /* This is harmlessly racy, we want to approximately detect a hang */
262 if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
263 ftrace_graph_stop();
264 printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
265 if (ftrace_dump_on_oops)
266 __ftrace_dump(false);
267 return 0;
268 }
269
270 return trace_graph_entry(trace);
271}
272
273/*
274 * Pretty much the same than for the function tracer from which the selftest
275 * has been borrowed.
276 */
277int
278trace_selftest_startup_function_graph(struct tracer *trace,
279 struct trace_array *tr)
280{
281 int ret;
282 unsigned long count;
283
284 /*
285 * Simulate the init() callback but we attach a watchdog callback
286 * to detect and recover from possible hangs
287 */
288 tracing_reset_online_cpus(tr);
289 ret = register_ftrace_graph(&trace_graph_return,
290 &trace_graph_entry_watchdog);
291 if (ret) {
292 warn_failed_init_tracer(trace, ret);
293 goto out;
294 }
295 tracing_start_cmdline_record();
296
297 /* Sleep for a 1/10 of a second */
298 msleep(100);
299
300 /* Have we just recovered from a hang? */
301 if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) {
302 tracing_selftest_disabled = true;
303 ret = -1;
304 goto out;
305 }
306
307 tracing_stop();
308
309 /* check the trace buffer */
310 ret = trace_test_buffer(tr, &count);
311
312 trace->reset(tr);
313 tracing_start();
314
315 if (!ret && !count) {
316 printk(KERN_CONT ".. no entries found ..");
317 ret = -1;
318 goto out;
319 }
320
321 /* Don't test dynamic tracing, the function tracer already did */
322
323out:
324 /* Stop it if we failed */
325 if (ret)
326 ftrace_graph_stop();
327
328 return ret;
329}
330#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
331
332
231#ifdef CONFIG_IRQSOFF_TRACER 333#ifdef CONFIG_IRQSOFF_TRACER
232int 334int
233trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) 335trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
@@ -237,7 +339,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
237 int ret; 339 int ret;
238 340
239 /* start the tracing */ 341 /* start the tracing */
240 ret = trace->init(tr); 342 ret = tracer_init(trace, tr);
241 if (ret) { 343 if (ret) {
242 warn_failed_init_tracer(trace, ret); 344 warn_failed_init_tracer(trace, ret);
243 return ret; 345 return ret;
@@ -249,6 +351,14 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
249 local_irq_disable(); 351 local_irq_disable();
250 udelay(100); 352 udelay(100);
251 local_irq_enable(); 353 local_irq_enable();
354
355 /*
356 * Stop the tracer to avoid a warning subsequent
357 * to buffer flipping failure because tracing_stop()
358 * disables the tr and max buffers, making flipping impossible
359 * in case of parallels max irqs off latencies.
360 */
361 trace->stop(tr);
252 /* stop the tracing. */ 362 /* stop the tracing. */
253 tracing_stop(); 363 tracing_stop();
254 /* check both trace buffers */ 364 /* check both trace buffers */
@@ -291,7 +401,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
291 } 401 }
292 402
293 /* start the tracing */ 403 /* start the tracing */
294 ret = trace->init(tr); 404 ret = tracer_init(trace, tr);
295 if (ret) { 405 if (ret) {
296 warn_failed_init_tracer(trace, ret); 406 warn_failed_init_tracer(trace, ret);
297 return ret; 407 return ret;
@@ -303,6 +413,14 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
303 preempt_disable(); 413 preempt_disable();
304 udelay(100); 414 udelay(100);
305 preempt_enable(); 415 preempt_enable();
416
417 /*
418 * Stop the tracer to avoid a warning subsequent
419 * to buffer flipping failure because tracing_stop()
420 * disables the tr and max buffers, making flipping impossible
421 * in case of parallels max preempt off latencies.
422 */
423 trace->stop(tr);
306 /* stop the tracing. */ 424 /* stop the tracing. */
307 tracing_stop(); 425 tracing_stop();
308 /* check both trace buffers */ 426 /* check both trace buffers */
@@ -345,10 +463,10 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
345 } 463 }
346 464
347 /* start the tracing */ 465 /* start the tracing */
348 ret = trace->init(tr); 466 ret = tracer_init(trace, tr);
349 if (ret) { 467 if (ret) {
350 warn_failed_init_tracer(trace, ret); 468 warn_failed_init_tracer(trace, ret);
351 goto out; 469 goto out_no_start;
352 } 470 }
353 471
354 /* reset the max latency */ 472 /* reset the max latency */
@@ -362,31 +480,35 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
362 /* reverse the order of preempt vs irqs */ 480 /* reverse the order of preempt vs irqs */
363 local_irq_enable(); 481 local_irq_enable();
364 482
483 /*
484 * Stop the tracer to avoid a warning subsequent
485 * to buffer flipping failure because tracing_stop()
486 * disables the tr and max buffers, making flipping impossible
487 * in case of parallels max irqs/preempt off latencies.
488 */
489 trace->stop(tr);
365 /* stop the tracing. */ 490 /* stop the tracing. */
366 tracing_stop(); 491 tracing_stop();
367 /* check both trace buffers */ 492 /* check both trace buffers */
368 ret = trace_test_buffer(tr, NULL); 493 ret = trace_test_buffer(tr, NULL);
369 if (ret) { 494 if (ret)
370 tracing_start();
371 goto out; 495 goto out;
372 }
373 496
374 ret = trace_test_buffer(&max_tr, &count); 497 ret = trace_test_buffer(&max_tr, &count);
375 if (ret) { 498 if (ret)
376 tracing_start();
377 goto out; 499 goto out;
378 }
379 500
380 if (!ret && !count) { 501 if (!ret && !count) {
381 printk(KERN_CONT ".. no entries found .."); 502 printk(KERN_CONT ".. no entries found ..");
382 ret = -1; 503 ret = -1;
383 tracing_start();
384 goto out; 504 goto out;
385 } 505 }
386 506
387 /* do the test by disabling interrupts first this time */ 507 /* do the test by disabling interrupts first this time */
388 tracing_max_latency = 0; 508 tracing_max_latency = 0;
389 tracing_start(); 509 tracing_start();
510 trace->start(tr);
511
390 preempt_disable(); 512 preempt_disable();
391 local_irq_disable(); 513 local_irq_disable();
392 udelay(100); 514 udelay(100);
@@ -394,6 +516,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
394 /* reverse the order of preempt vs irqs */ 516 /* reverse the order of preempt vs irqs */
395 local_irq_enable(); 517 local_irq_enable();
396 518
519 trace->stop(tr);
397 /* stop the tracing. */ 520 /* stop the tracing. */
398 tracing_stop(); 521 tracing_stop();
399 /* check both trace buffers */ 522 /* check both trace buffers */
@@ -409,9 +532,10 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
409 goto out; 532 goto out;
410 } 533 }
411 534
412 out: 535out:
413 trace->reset(tr);
414 tracing_start(); 536 tracing_start();
537out_no_start:
538 trace->reset(tr);
415 tracing_max_latency = save_max; 539 tracing_max_latency = save_max;
416 540
417 return ret; 541 return ret;
@@ -477,7 +601,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
477 wait_for_completion(&isrt); 601 wait_for_completion(&isrt);
478 602
479 /* start the tracing */ 603 /* start the tracing */
480 ret = trace->init(tr); 604 ret = tracer_init(trace, tr);
481 if (ret) { 605 if (ret) {
482 warn_failed_init_tracer(trace, ret); 606 warn_failed_init_tracer(trace, ret);
483 return ret; 607 return ret;
@@ -538,7 +662,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
538 int ret; 662 int ret;
539 663
540 /* start the tracing */ 664 /* start the tracing */
541 ret = trace->init(tr); 665 ret = tracer_init(trace, tr);
542 if (ret) { 666 if (ret) {
543 warn_failed_init_tracer(trace, ret); 667 warn_failed_init_tracer(trace, ret);
544 return ret; 668 return ret;
@@ -570,10 +694,10 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
570 int ret; 694 int ret;
571 695
572 /* start the tracing */ 696 /* start the tracing */
573 ret = trace->init(tr); 697 ret = tracer_init(trace, tr);
574 if (ret) { 698 if (ret) {
575 warn_failed_init_tracer(trace, ret); 699 warn_failed_init_tracer(trace, ret);
576 return 0; 700 return ret;
577 } 701 }
578 702
579 /* Sleep for a 1/10 of a second */ 703 /* Sleep for a 1/10 of a second */
@@ -585,6 +709,11 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
585 trace->reset(tr); 709 trace->reset(tr);
586 tracing_start(); 710 tracing_start();
587 711
712 if (!ret && !count) {
713 printk(KERN_CONT ".. no entries found ..");
714 ret = -1;
715 }
716
588 return ret; 717 return ret;
589} 718}
590#endif /* CONFIG_SYSPROF_TRACER */ 719#endif /* CONFIG_SYSPROF_TRACER */
@@ -597,7 +726,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
597 int ret; 726 int ret;
598 727
599 /* start the tracing */ 728 /* start the tracing */
600 ret = trace->init(tr); 729 ret = tracer_init(trace, tr);
601 if (ret) { 730 if (ret) {
602 warn_failed_init_tracer(trace, ret); 731 warn_failed_init_tracer(trace, ret);
603 return ret; 732 return ret;
@@ -612,6 +741,11 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
612 trace->reset(tr); 741 trace->reset(tr);
613 tracing_start(); 742 tracing_start();
614 743
744 if (!ret && !count) {
745 printk(KERN_CONT ".. no entries found ..");
746 ret = -1;
747 }
748
615 return ret; 749 return ret;
616} 750}
617#endif /* CONFIG_BRANCH_TRACER */ 751#endif /* CONFIG_BRANCH_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index d0871bc0aca5..c750f65f9661 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -245,16 +245,31 @@ static int trace_lookup_stack(struct seq_file *m, long i)
245#endif 245#endif
246} 246}
247 247
/* Emit a hint explaining how to turn the stack tracer on. */
static void print_disabled(struct seq_file *m)
{
	static const char disabled_msg[] =
		"#\n"
		"# Stack tracer disabled\n"
		"#\n"
		"# To enable the stack tracer, either add 'stacktrace' to the\n"
		"# kernel command line\n"
		"# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n"
		"#\n";

	seq_puts(m, disabled_msg);
}
258
248static int t_show(struct seq_file *m, void *v) 259static int t_show(struct seq_file *m, void *v)
249{ 260{
250 long i; 261 long i;
251 int size; 262 int size;
252 263
253 if (v == SEQ_START_TOKEN) { 264 if (v == SEQ_START_TOKEN) {
254 seq_printf(m, " Depth Size Location" 265 seq_printf(m, " Depth Size Location"
255 " (%d entries)\n" 266 " (%d entries)\n"
256 " ----- ---- --------\n", 267 " ----- ---- --------\n",
257 max_stack_trace.nr_entries); 268 max_stack_trace.nr_entries);
269
270 if (!stack_tracer_enabled && !max_stack_size)
271 print_disabled(m);
272
258 return 0; 273 return 0;
259 } 274 }
260 275
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
new file mode 100644
index 000000000000..acdebd771a93
--- /dev/null
+++ b/kernel/trace/trace_stat.c
@@ -0,0 +1,326 @@
1/*
2 * Infrastructure for statistic tracing (histogram output).
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 * Based on the code from trace_branch.c which is
7 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
8 *
9 */
10
11
12#include <linux/list.h>
13#include <linux/debugfs.h>
14#include "trace_stat.h"
15#include "trace.h"
16
17
18/* List of stat entries from a tracer */
19struct trace_stat_list {
20 struct list_head list;
21 void *stat;
22};
23
24/* A stat session is the stats output in one file */
25struct tracer_stat_session {
26 struct list_head session_list;
27 struct tracer_stat *ts;
28 struct list_head stat_list;
29 struct mutex stat_mutex;
30 struct dentry *file;
31};
32
33/* All of the sessions currently in use. Each stat file embed one session */
34static LIST_HEAD(all_stat_sessions);
35static DEFINE_MUTEX(all_stat_sessions_mutex);
36
37/* The root directory for all stat files */
38static struct dentry *stat_dir;
39
40
41static void reset_stat_session(struct tracer_stat_session *session)
42{
43 struct trace_stat_list *node, *next;
44
45 list_for_each_entry_safe(node, next, &session->stat_list, list)
46 kfree(node);
47
48 INIT_LIST_HEAD(&session->stat_list);
49}
50
51static void destroy_session(struct tracer_stat_session *session)
52{
53 debugfs_remove(session->file);
54 reset_stat_session(session);
55 mutex_destroy(&session->stat_mutex);
56 kfree(session);
57}
58
/*
 * Fallback comparator for tracers without a stat_cmp callback.
 * Always reporting "greater" makes stat_seq_init() append each new
 * entry right at the tail of the list, i.e. no sorting takes place.
 */
static int dummy_cmp(void *p1, void *p2)
{
	return 1;
}
68
69/*
70 * Initialize the stat list at each trace_stat file opening.
71 * All of these copies and sorting are required on all opening
72 * since the stats could have changed between two file sessions.
73 */
74static int stat_seq_init(struct tracer_stat_session *session)
75{
76 struct trace_stat_list *iter_entry, *new_entry;
77 struct tracer_stat *ts = session->ts;
78 void *stat;
79 int ret = 0;
80 int i;
81
82 mutex_lock(&session->stat_mutex);
83 reset_stat_session(session);
84
85 if (!ts->stat_cmp)
86 ts->stat_cmp = dummy_cmp;
87
88 stat = ts->stat_start();
89 if (!stat)
90 goto exit;
91
92 /*
93 * The first entry. Actually this is the second, but the first
94 * one (the stat_list head) is pointless.
95 */
96 new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
97 if (!new_entry) {
98 ret = -ENOMEM;
99 goto exit;
100 }
101
102 INIT_LIST_HEAD(&new_entry->list);
103
104 list_add(&new_entry->list, &session->stat_list);
105
106 new_entry->stat = stat;
107
108 /*
109 * Iterate over the tracer stat entries and store them in a sorted
110 * list.
111 */
112 for (i = 1; ; i++) {
113 stat = ts->stat_next(stat, i);
114
115 /* End of insertion */
116 if (!stat)
117 break;
118
119 new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
120 if (!new_entry) {
121 ret = -ENOMEM;
122 goto exit_free_list;
123 }
124
125 INIT_LIST_HEAD(&new_entry->list);
126 new_entry->stat = stat;
127
128 list_for_each_entry_reverse(iter_entry, &session->stat_list,
129 list) {
130
131 /* Insertion with a descendent sorting */
132 if (ts->stat_cmp(iter_entry->stat,
133 new_entry->stat) >= 0) {
134
135 list_add(&new_entry->list, &iter_entry->list);
136 break;
137 }
138 }
139
140 /* The current larger value */
141 if (list_empty(&new_entry->list))
142 list_add(&new_entry->list, &session->stat_list);
143 }
144exit:
145 mutex_unlock(&session->stat_mutex);
146 return ret;
147
148exit_free_list:
149 reset_stat_session(session);
150 mutex_unlock(&session->stat_mutex);
151 return ret;
152}
153
154
155static void *stat_seq_start(struct seq_file *s, loff_t *pos)
156{
157 struct tracer_stat_session *session = s->private;
158
159 /* Prevent from tracer switch or stat_list modification */
160 mutex_lock(&session->stat_mutex);
161
162 /* If we are in the beginning of the file, print the headers */
163 if (!*pos && session->ts->stat_headers)
164 return SEQ_START_TOKEN;
165
166 return seq_list_start(&session->stat_list, *pos);
167}
168
169static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
170{
171 struct tracer_stat_session *session = s->private;
172
173 if (p == SEQ_START_TOKEN)
174 return seq_list_start(&session->stat_list, *pos);
175
176 return seq_list_next(p, &session->stat_list, pos);
177}
178
179static void stat_seq_stop(struct seq_file *s, void *p)
180{
181 struct tracer_stat_session *session = s->private;
182 mutex_unlock(&session->stat_mutex);
183}
184
185static int stat_seq_show(struct seq_file *s, void *v)
186{
187 struct tracer_stat_session *session = s->private;
188 struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
189
190 if (v == SEQ_START_TOKEN)
191 return session->ts->stat_headers(s);
192
193 return session->ts->stat_show(s, l->stat);
194}
195
196static const struct seq_operations trace_stat_seq_ops = {
197 .start = stat_seq_start,
198 .next = stat_seq_next,
199 .stop = stat_seq_stop,
200 .show = stat_seq_show
201};
202
203/* The session stat is refilled and resorted at each stat file opening */
204static int tracing_stat_open(struct inode *inode, struct file *file)
205{
206 int ret;
207
208 struct tracer_stat_session *session = inode->i_private;
209
210 ret = seq_open(file, &trace_stat_seq_ops);
211 if (!ret) {
212 struct seq_file *m = file->private_data;
213 m->private = session;
214 ret = stat_seq_init(session);
215 }
216
217 return ret;
218}
219
220/*
221 * Avoid consuming memory with our now useless list.
222 */
223static int tracing_stat_release(struct inode *i, struct file *f)
224{
225 struct tracer_stat_session *session = i->i_private;
226
227 mutex_lock(&session->stat_mutex);
228 reset_stat_session(session);
229 mutex_unlock(&session->stat_mutex);
230
231 return 0;
232}
233
234static const struct file_operations tracing_stat_fops = {
235 .open = tracing_stat_open,
236 .read = seq_read,
237 .llseek = seq_lseek,
238 .release = tracing_stat_release
239};
240
241static int tracing_stat_init(void)
242{
243 struct dentry *d_tracing;
244
245 d_tracing = tracing_init_dentry();
246
247 stat_dir = debugfs_create_dir("trace_stat", d_tracing);
248 if (!stat_dir)
249 pr_warning("Could not create debugfs "
250 "'trace_stat' entry\n");
251 return 0;
252}
253
254static int init_stat_file(struct tracer_stat_session *session)
255{
256 if (!stat_dir && tracing_stat_init())
257 return -ENODEV;
258
259 session->file = debugfs_create_file(session->ts->name, 0644,
260 stat_dir,
261 session, &tracing_stat_fops);
262 if (!session->file)
263 return -ENOMEM;
264 return 0;
265}
266
267int register_stat_tracer(struct tracer_stat *trace)
268{
269 struct tracer_stat_session *session, *node, *tmp;
270 int ret;
271
272 if (!trace)
273 return -EINVAL;
274
275 if (!trace->stat_start || !trace->stat_next || !trace->stat_show)
276 return -EINVAL;
277
278 /* Already registered? */
279 mutex_lock(&all_stat_sessions_mutex);
280 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
281 if (node->ts == trace) {
282 mutex_unlock(&all_stat_sessions_mutex);
283 return -EINVAL;
284 }
285 }
286 mutex_unlock(&all_stat_sessions_mutex);
287
288 /* Init the session */
289 session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL);
290 if (!session)
291 return -ENOMEM;
292
293 session->ts = trace;
294 INIT_LIST_HEAD(&session->session_list);
295 INIT_LIST_HEAD(&session->stat_list);
296 mutex_init(&session->stat_mutex);
297 session->file = NULL;
298
299 ret = init_stat_file(session);
300 if (ret) {
301 destroy_session(session);
302 return ret;
303 }
304
305 /* Register */
306 mutex_lock(&all_stat_sessions_mutex);
307 list_add_tail(&session->session_list, &all_stat_sessions);
308 mutex_unlock(&all_stat_sessions_mutex);
309
310 return 0;
311}
312
313void unregister_stat_tracer(struct tracer_stat *trace)
314{
315 struct tracer_stat_session *node, *tmp;
316
317 mutex_lock(&all_stat_sessions_mutex);
318 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
319 if (node->ts == trace) {
320 list_del(&node->session_list);
321 destroy_session(node);
322 break;
323 }
324 }
325 mutex_unlock(&all_stat_sessions_mutex);
326}
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
new file mode 100644
index 000000000000..202274cf7f3d
--- /dev/null
+++ b/kernel/trace/trace_stat.h
@@ -0,0 +1,31 @@
1#ifndef __TRACE_STAT_H
2#define __TRACE_STAT_H
3
4#include <linux/seq_file.h>
5
6/*
7 * If you want to provide a stat file (one-shot statistics), fill
8 * an iterator with stat_start/stat_next and a stat_show callbacks.
9 * The others callbacks are optional.
10 */
11struct tracer_stat {
12 /* The name of your stat file */
13 const char *name;
14 /* Iteration over statistic entries */
15 void *(*stat_start)(void);
16 void *(*stat_next)(void *prev, int idx);
17 /* Compare two entries for stats sorting */
18 int (*stat_cmp)(void *p1, void *p2);
19 /* Print a stat entry */
20 int (*stat_show)(struct seq_file *s, void *p);
21 /* Print the headers of your stat entries */
22 int (*stat_headers)(struct seq_file *s);
23};
24
25/*
26 * Destroy or create a stat file
27 */
28extern int register_stat_tracer(struct tracer_stat *trace);
29extern void unregister_stat_tracer(struct tracer_stat *trace);
30
31#endif /* __TRACE_STAT_H */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
new file mode 100644
index 000000000000..a2a3af29c943
--- /dev/null
+++ b/kernel/trace/trace_syscalls.c
@@ -0,0 +1,250 @@
1#include <linux/kernel.h>
2#include <linux/ftrace.h>
3#include <asm/syscall.h>
4
5#include "trace_output.h"
6#include "trace.h"
7
8/* Keep a counter of the syscall tracing users */
9static int refcount;
10
11/* Prevent from races on thread flags toggling */
12static DEFINE_MUTEX(syscall_trace_lock);
13
14/* Option to display the parameters types */
15enum {
16 TRACE_SYSCALLS_OPT_TYPES = 0x1,
17};
18
19static struct tracer_opt syscalls_opts[] = {
20 { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) },
21 { }
22};
23
24static struct tracer_flags syscalls_flags = {
25 .val = 0, /* By default: no parameters types */
26 .opts = syscalls_opts
27};
28
29enum print_line_t
30print_syscall_enter(struct trace_iterator *iter, int flags)
31{
32 struct trace_seq *s = &iter->seq;
33 struct trace_entry *ent = iter->ent;
34 struct syscall_trace_enter *trace;
35 struct syscall_metadata *entry;
36 int i, ret, syscall;
37
38 trace_assign_type(trace, ent);
39
40 syscall = trace->nr;
41
42 entry = syscall_nr_to_meta(syscall);
43 if (!entry)
44 goto end;
45
46 ret = trace_seq_printf(s, "%s(", entry->name);
47 if (!ret)
48 return TRACE_TYPE_PARTIAL_LINE;
49
50 for (i = 0; i < entry->nb_args; i++) {
51 /* parameter types */
52 if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) {
53 ret = trace_seq_printf(s, "%s ", entry->types[i]);
54 if (!ret)
55 return TRACE_TYPE_PARTIAL_LINE;
56 }
57 /* parameter values */
58 ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i],
59 trace->args[i],
60 i == entry->nb_args - 1 ? ")" : ",");
61 if (!ret)
62 return TRACE_TYPE_PARTIAL_LINE;
63 }
64
65end:
66 trace_seq_printf(s, "\n");
67 return TRACE_TYPE_HANDLED;
68}
69
70enum print_line_t
71print_syscall_exit(struct trace_iterator *iter, int flags)
72{
73 struct trace_seq *s = &iter->seq;
74 struct trace_entry *ent = iter->ent;
75 struct syscall_trace_exit *trace;
76 int syscall;
77 struct syscall_metadata *entry;
78 int ret;
79
80 trace_assign_type(trace, ent);
81
82 syscall = trace->nr;
83
84 entry = syscall_nr_to_meta(syscall);
85 if (!entry) {
86 trace_seq_printf(s, "\n");
87 return TRACE_TYPE_HANDLED;
88 }
89
90 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
91 trace->ret);
92 if (!ret)
93 return TRACE_TYPE_PARTIAL_LINE;
94
95 return TRACE_TYPE_HANDLED;
96}
97
98void start_ftrace_syscalls(void)
99{
100 unsigned long flags;
101 struct task_struct *g, *t;
102
103 mutex_lock(&syscall_trace_lock);
104
105 /* Don't enable the flag on the tasks twice */
106 if (++refcount != 1)
107 goto unlock;
108
109 arch_init_ftrace_syscalls();
110 read_lock_irqsave(&tasklist_lock, flags);
111
112 do_each_thread(g, t) {
113 set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
114 } while_each_thread(g, t);
115
116 read_unlock_irqrestore(&tasklist_lock, flags);
117
118unlock:
119 mutex_unlock(&syscall_trace_lock);
120}
121
122void stop_ftrace_syscalls(void)
123{
124 unsigned long flags;
125 struct task_struct *g, *t;
126
127 mutex_lock(&syscall_trace_lock);
128
129 /* There are perhaps still some users */
130 if (--refcount)
131 goto unlock;
132
133 read_lock_irqsave(&tasklist_lock, flags);
134
135 do_each_thread(g, t) {
136 clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
137 } while_each_thread(g, t);
138
139 read_unlock_irqrestore(&tasklist_lock, flags);
140
141unlock:
142 mutex_unlock(&syscall_trace_lock);
143}
144
145void ftrace_syscall_enter(struct pt_regs *regs)
146{
147 struct syscall_trace_enter *entry;
148 struct syscall_metadata *sys_data;
149 struct ring_buffer_event *event;
150 int size;
151 int syscall_nr;
152
153 syscall_nr = syscall_get_nr(current, regs);
154
155 sys_data = syscall_nr_to_meta(syscall_nr);
156 if (!sys_data)
157 return;
158
159 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
160
161 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size,
162 0, 0);
163 if (!event)
164 return;
165
166 entry = ring_buffer_event_data(event);
167 entry->nr = syscall_nr;
168 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
169
170 trace_current_buffer_unlock_commit(event, 0, 0);
171 trace_wake_up();
172}
173
174void ftrace_syscall_exit(struct pt_regs *regs)
175{
176 struct syscall_trace_exit *entry;
177 struct syscall_metadata *sys_data;
178 struct ring_buffer_event *event;
179 int syscall_nr;
180
181 syscall_nr = syscall_get_nr(current, regs);
182
183 sys_data = syscall_nr_to_meta(syscall_nr);
184 if (!sys_data)
185 return;
186
187 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT,
188 sizeof(*entry), 0, 0);
189 if (!event)
190 return;
191
192 entry = ring_buffer_event_data(event);
193 entry->nr = syscall_nr;
194 entry->ret = syscall_get_return_value(current, regs);
195
196 trace_current_buffer_unlock_commit(event, 0, 0);
197 trace_wake_up();
198}
199
200static int init_syscall_tracer(struct trace_array *tr)
201{
202 start_ftrace_syscalls();
203
204 return 0;
205}
206
207static void reset_syscall_tracer(struct trace_array *tr)
208{
209 stop_ftrace_syscalls();
210 tracing_reset_online_cpus(tr);
211}
212
213static struct trace_event syscall_enter_event = {
214 .type = TRACE_SYSCALL_ENTER,
215 .trace = print_syscall_enter,
216};
217
218static struct trace_event syscall_exit_event = {
219 .type = TRACE_SYSCALL_EXIT,
220 .trace = print_syscall_exit,
221};
222
223static struct tracer syscall_tracer __read_mostly = {
224 .name = "syscall",
225 .init = init_syscall_tracer,
226 .reset = reset_syscall_tracer,
227 .flags = &syscalls_flags,
228};
229
230__init int register_ftrace_syscalls(void)
231{
232 int ret;
233
234 ret = register_ftrace_event(&syscall_enter_event);
235 if (!ret) {
236 printk(KERN_WARNING "event %d failed to register\n",
237 syscall_enter_event.type);
238 WARN_ON_ONCE(1);
239 }
240
241 ret = register_ftrace_event(&syscall_exit_event);
242 if (!ret) {
243 printk(KERN_WARNING "event %d failed to register\n",
244 syscall_exit_event.type);
245 WARN_ON_ONCE(1);
246 }
247
248 return register_tracer(&syscall_tracer);
249}
250device_initcall(register_ftrace_syscalls);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index eaca5ad803ff..91fd19c2149f 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -88,7 +88,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
88 } 88 }
89} 89}
90 90
91const static struct stacktrace_ops backtrace_ops = { 91static const struct stacktrace_ops backtrace_ops = {
92 .warning = backtrace_warning, 92 .warning = backtrace_warning,
93 .warning_symbol = backtrace_warning_symbol, 93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack, 94 .stack = backtrace_stack,
@@ -226,15 +226,6 @@ static void stop_stack_timers(void)
226 stop_stack_timer(cpu); 226 stop_stack_timer(cpu);
227} 227}
228 228
229static void start_stack_trace(struct trace_array *tr)
230{
231 mutex_lock(&sample_timer_lock);
232 tracing_reset_online_cpus(tr);
233 start_stack_timers();
234 tracer_enabled = 1;
235 mutex_unlock(&sample_timer_lock);
236}
237
238static void stop_stack_trace(struct trace_array *tr) 229static void stop_stack_trace(struct trace_array *tr)
239{ 230{
240 mutex_lock(&sample_timer_lock); 231 mutex_lock(&sample_timer_lock);
@@ -247,12 +238,18 @@ static int stack_trace_init(struct trace_array *tr)
247{ 238{
248 sysprof_trace = tr; 239 sysprof_trace = tr;
249 240
250 start_stack_trace(tr); 241 tracing_start_cmdline_record();
242
243 mutex_lock(&sample_timer_lock);
244 start_stack_timers();
245 tracer_enabled = 1;
246 mutex_unlock(&sample_timer_lock);
251 return 0; 247 return 0;
252} 248}
253 249
254static void stack_trace_reset(struct trace_array *tr) 250static void stack_trace_reset(struct trace_array *tr)
255{ 251{
252 tracing_stop_cmdline_record();
256 stop_stack_trace(tr); 253 stop_stack_trace(tr);
257} 254}
258 255
@@ -317,7 +314,7 @@ sysprof_sample_write(struct file *filp, const char __user *ubuf,
317 return cnt; 314 return cnt;
318} 315}
319 316
320static struct file_operations sysprof_sample_fops = { 317static const struct file_operations sysprof_sample_fops = {
321 .read = sysprof_sample_read, 318 .read = sysprof_sample_read,
322 .write = sysprof_sample_write, 319 .write = sysprof_sample_write,
323}; 320};
@@ -330,5 +327,5 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
330 d_tracer, NULL, &sysprof_sample_fops); 327 d_tracer, NULL, &sysprof_sample_fops);
331 if (entry) 328 if (entry)
332 return; 329 return;
333 pr_warning("Could not create debugfs 'dyn_ftrace_total_info' entry\n"); 330 pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
334} 331}
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
new file mode 100644
index 000000000000..797201e4a137
--- /dev/null
+++ b/kernel/trace/trace_workqueue.c
@@ -0,0 +1,288 @@
1/*
2 * Workqueue statistical tracer.
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 */
7
8
9#include <trace/workqueue.h>
10#include <linux/list.h>
11#include <linux/percpu.h>
12#include "trace_stat.h"
13#include "trace.h"
14
15
16/* A cpu workqueue thread */
17struct cpu_workqueue_stats {
18 struct list_head list;
19/* Useful to know if we print the cpu headers */
20 bool first_entry;
21 int cpu;
22 pid_t pid;
23/* Can be inserted from interrupt or user context, need to be atomic */
24 atomic_t inserted;
25/*
26 * Don't need to be atomic, works are serialized in a single workqueue thread
27 * on a single CPU.
28 */
29 unsigned int executed;
30};
31
32/* List of workqueue threads on one cpu */
33struct workqueue_global_stats {
34 struct list_head list;
35 spinlock_t lock;
36};
37
38/* Don't need a global lock because allocated before the workqueues, and
39 * never freed.
40 */
41static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
42#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
43
44/* Insertion of a work */
45static void
46probe_workqueue_insertion(struct task_struct *wq_thread,
47 struct work_struct *work)
48{
49 int cpu = cpumask_first(&wq_thread->cpus_allowed);
50 struct cpu_workqueue_stats *node, *next;
51 unsigned long flags;
52
53 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
54 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
55 list) {
56 if (node->pid == wq_thread->pid) {
57 atomic_inc(&node->inserted);
58 goto found;
59 }
60 }
61 pr_debug("trace_workqueue: entry not found\n");
62found:
63 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
64}
65
66/* Execution of a work */
67static void
68probe_workqueue_execution(struct task_struct *wq_thread,
69 struct work_struct *work)
70{
71 int cpu = cpumask_first(&wq_thread->cpus_allowed);
72 struct cpu_workqueue_stats *node, *next;
73 unsigned long flags;
74
75 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
76 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
77 list) {
78 if (node->pid == wq_thread->pid) {
79 node->executed++;
80 goto found;
81 }
82 }
83 pr_debug("trace_workqueue: entry not found\n");
84found:
85 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
86}
87
88/* Creation of a cpu workqueue thread */
89static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
90{
91 struct cpu_workqueue_stats *cws;
92 unsigned long flags;
93
94 WARN_ON(cpu < 0);
95
96 /* Workqueues are sometimes created in atomic context */
97 cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
98 if (!cws) {
99 pr_warning("trace_workqueue: not enough memory\n");
100 return;
101 }
102 INIT_LIST_HEAD(&cws->list);
103 cws->cpu = cpu;
104
105 cws->pid = wq_thread->pid;
106
107 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
108 if (list_empty(&workqueue_cpu_stat(cpu)->list))
109 cws->first_entry = true;
110 list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
111 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
112}
113
114/* Destruction of a cpu workqueue thread */
115static void probe_workqueue_destruction(struct task_struct *wq_thread)
116{
117 /* Workqueue only execute on one cpu */
118 int cpu = cpumask_first(&wq_thread->cpus_allowed);
119 struct cpu_workqueue_stats *node, *next;
120 unsigned long flags;
121
122 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
123 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
124 list) {
125 if (node->pid == wq_thread->pid) {
126 list_del(&node->list);
127 kfree(node);
128 goto found;
129 }
130 }
131
132 pr_debug("trace_workqueue: don't find workqueue to destroy\n");
133found:
134 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
135
136}
137
138static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
139{
140 unsigned long flags;
141 struct cpu_workqueue_stats *ret = NULL;
142
143
144 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
145
146 if (!list_empty(&workqueue_cpu_stat(cpu)->list))
147 ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
148 struct cpu_workqueue_stats, list);
149
150 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
151
152 return ret;
153}
154
155static void *workqueue_stat_start(void)
156{
157 int cpu;
158 void *ret = NULL;
159
160 for_each_possible_cpu(cpu) {
161 ret = workqueue_stat_start_cpu(cpu);
162 if (ret)
163 return ret;
164 }
165 return NULL;
166}
167
168static void *workqueue_stat_next(void *prev, int idx)
169{
170 struct cpu_workqueue_stats *prev_cws = prev;
171 int cpu = prev_cws->cpu;
172 unsigned long flags;
173 void *ret = NULL;
174
175 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
176 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
177 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
178 do {
179 cpu = cpumask_next(cpu, cpu_possible_mask);
180 if (cpu >= nr_cpu_ids)
181 return NULL;
182 } while (!(ret = workqueue_stat_start_cpu(cpu)));
183 return ret;
184 }
185 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
186
187 return list_entry(prev_cws->list.next, struct cpu_workqueue_stats,
188 list);
189}
190
191static int workqueue_stat_show(struct seq_file *s, void *p)
192{
193 struct cpu_workqueue_stats *cws = p;
194 unsigned long flags;
195 int cpu = cws->cpu;
196 struct pid *pid;
197 struct task_struct *tsk;
198
199 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
200 if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
201 seq_printf(s, "\n");
202 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
203
204 pid = find_get_pid(cws->pid);
205 if (pid) {
206 tsk = get_pid_task(pid, PIDTYPE_PID);
207 if (tsk) {
208 seq_printf(s, "%3d %6d %6u %s\n", cws->cpu,
209 atomic_read(&cws->inserted), cws->executed,
210 tsk->comm);
211 put_task_struct(tsk);
212 }
213 put_pid(pid);
214 }
215
216 return 0;
217}
218
219static int workqueue_stat_headers(struct seq_file *s)
220{
221 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
222 seq_printf(s, "# | | | |\n");
223 return 0;
224}
225
226struct tracer_stat workqueue_stats __read_mostly = {
227 .name = "workqueues",
228 .stat_start = workqueue_stat_start,
229 .stat_next = workqueue_stat_next,
230 .stat_show = workqueue_stat_show,
231 .stat_headers = workqueue_stat_headers
232};
233
234
235int __init stat_workqueue_init(void)
236{
237 if (register_stat_tracer(&workqueue_stats)) {
238 pr_warning("Unable to register workqueue stat tracer\n");
239 return 1;
240 }
241
242 return 0;
243}
244fs_initcall(stat_workqueue_init);
245
246/*
247 * Workqueues are created very early, just after pre-smp initcalls.
248 * So we must register our tracepoints at this stage.
249 */
250int __init trace_workqueue_early_init(void)
251{
252 int ret, cpu;
253
254 ret = register_trace_workqueue_insertion(probe_workqueue_insertion);
255 if (ret)
256 goto out;
257
258 ret = register_trace_workqueue_execution(probe_workqueue_execution);
259 if (ret)
260 goto no_insertion;
261
262 ret = register_trace_workqueue_creation(probe_workqueue_creation);
263 if (ret)
264 goto no_execution;
265
266 ret = register_trace_workqueue_destruction(probe_workqueue_destruction);
267 if (ret)
268 goto no_creation;
269
270 for_each_possible_cpu(cpu) {
271 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
272 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
273 }
274
275 return 0;
276
277no_creation:
278 unregister_trace_workqueue_creation(probe_workqueue_creation);
279no_execution:
280 unregister_trace_workqueue_execution(probe_workqueue_execution);
281no_insertion:
282 unregister_trace_workqueue_insertion(probe_workqueue_insertion);
283out:
284 pr_warning("trace_workqueue: unable to trace workqueues\n");
285
286 return 1;
287}
288early_initcall(trace_workqueue_early_init);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 79602740bbb5..1ef5d3a601c7 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -272,12 +272,15 @@ static void disable_tracepoint(struct tracepoint *elem)
272 * 272 *
273 * Updates the probe callback corresponding to a range of tracepoints. 273 * Updates the probe callback corresponding to a range of tracepoints.
274 */ 274 */
275void tracepoint_update_probe_range(struct tracepoint *begin, 275void
276 struct tracepoint *end) 276tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
277{ 277{
278 struct tracepoint *iter; 278 struct tracepoint *iter;
279 struct tracepoint_entry *mark_entry; 279 struct tracepoint_entry *mark_entry;
280 280
281 if (!begin)
282 return;
283
281 mutex_lock(&tracepoints_mutex); 284 mutex_lock(&tracepoints_mutex);
282 for (iter = begin; iter < end; iter++) { 285 for (iter = begin; iter < end; iter++) {
283 mark_entry = get_tracepoint(iter->name); 286 mark_entry = get_tracepoint(iter->name);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 43f891b05a4b..00d59d048edf 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -122,8 +122,10 @@ void acct_update_integrals(struct task_struct *tsk)
122 if (likely(tsk->mm)) { 122 if (likely(tsk->mm)) {
123 cputime_t time, dtime; 123 cputime_t time, dtime;
124 struct timeval value; 124 struct timeval value;
125 unsigned long flags;
125 u64 delta; 126 u64 delta;
126 127
128 local_irq_save(flags);
127 time = tsk->stime + tsk->utime; 129 time = tsk->stime + tsk->utime;
128 dtime = cputime_sub(time, tsk->acct_timexpd); 130 dtime = cputime_sub(time, tsk->acct_timexpd);
129 jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
@@ -131,10 +133,12 @@ void acct_update_integrals(struct task_struct *tsk)
131 delta = delta * USEC_PER_SEC + value.tv_usec; 133 delta = delta * USEC_PER_SEC + value.tv_usec;
132 134
133 if (delta == 0) 135 if (delta == 0)
134 return; 136 goto out;
135 tsk->acct_timexpd = time; 137 tsk->acct_timexpd = time;
136 tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); 138 tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
137 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; 139 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
140 out:
141 local_irq_restore(flags);
138 } 142 }
139} 143}
140 144
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 2460c3199b5a..0314501688b9 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -17,7 +17,7 @@
17 17
18#include <asm/uaccess.h> 18#include <asm/uaccess.h>
19 19
20asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group) 20SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
21{ 21{
22 long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); 22 long ret = sys_chown(filename, low2highuid(user), low2highgid(group));
23 /* avoid REGPARM breakage on x86: */ 23 /* avoid REGPARM breakage on x86: */
@@ -25,7 +25,7 @@ asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gi
25 return ret; 25 return ret;
26} 26}
27 27
28asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group) 28SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
29{ 29{
30 long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); 30 long ret = sys_lchown(filename, low2highuid(user), low2highgid(group));
31 /* avoid REGPARM breakage on x86: */ 31 /* avoid REGPARM breakage on x86: */
@@ -33,7 +33,7 @@ asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_g
33 return ret; 33 return ret;
34} 34}
35 35
36asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) 36SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group)
37{ 37{
38 long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); 38 long ret = sys_fchown(fd, low2highuid(user), low2highgid(group));
39 /* avoid REGPARM breakage on x86: */ 39 /* avoid REGPARM breakage on x86: */
@@ -41,7 +41,7 @@ asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group)
41 return ret; 41 return ret;
42} 42}
43 43
44asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) 44SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid)
45{ 45{
46 long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); 46 long ret = sys_setregid(low2highgid(rgid), low2highgid(egid));
47 /* avoid REGPARM breakage on x86: */ 47 /* avoid REGPARM breakage on x86: */
@@ -49,7 +49,7 @@ asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
49 return ret; 49 return ret;
50} 50}
51 51
52asmlinkage long sys_setgid16(old_gid_t gid) 52SYSCALL_DEFINE1(setgid16, old_gid_t, gid)
53{ 53{
54 long ret = sys_setgid(low2highgid(gid)); 54 long ret = sys_setgid(low2highgid(gid));
55 /* avoid REGPARM breakage on x86: */ 55 /* avoid REGPARM breakage on x86: */
@@ -57,7 +57,7 @@ asmlinkage long sys_setgid16(old_gid_t gid)
57 return ret; 57 return ret;
58} 58}
59 59
60asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) 60SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid)
61{ 61{
62 long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); 62 long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid));
63 /* avoid REGPARM breakage on x86: */ 63 /* avoid REGPARM breakage on x86: */
@@ -65,7 +65,7 @@ asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
65 return ret; 65 return ret;
66} 66}
67 67
68asmlinkage long sys_setuid16(old_uid_t uid) 68SYSCALL_DEFINE1(setuid16, old_uid_t, uid)
69{ 69{
70 long ret = sys_setuid(low2highuid(uid)); 70 long ret = sys_setuid(low2highuid(uid));
71 /* avoid REGPARM breakage on x86: */ 71 /* avoid REGPARM breakage on x86: */
@@ -73,7 +73,7 @@ asmlinkage long sys_setuid16(old_uid_t uid)
73 return ret; 73 return ret;
74} 74}
75 75
76asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) 76SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
77{ 77{
78 long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), 78 long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid),
79 low2highuid(suid)); 79 low2highuid(suid));
@@ -82,7 +82,7 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
82 return ret; 82 return ret;
83} 83}
84 84
85asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) 85SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid)
86{ 86{
87 const struct cred *cred = current_cred(); 87 const struct cred *cred = current_cred();
88 int retval; 88 int retval;
@@ -94,7 +94,7 @@ asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid,
94 return retval; 94 return retval;
95} 95}
96 96
97asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) 97SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
98{ 98{
99 long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), 99 long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid),
100 low2highgid(sgid)); 100 low2highgid(sgid));
@@ -103,7 +103,8 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
103 return ret; 103 return ret;
104} 104}
105 105
106asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) 106
107SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid)
107{ 108{
108 const struct cred *cred = current_cred(); 109 const struct cred *cred = current_cred();
109 int retval; 110 int retval;
@@ -115,7 +116,7 @@ asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid,
115 return retval; 116 return retval;
116} 117}
117 118
118asmlinkage long sys_setfsuid16(old_uid_t uid) 119SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid)
119{ 120{
120 long ret = sys_setfsuid(low2highuid(uid)); 121 long ret = sys_setfsuid(low2highuid(uid));
121 /* avoid REGPARM breakage on x86: */ 122 /* avoid REGPARM breakage on x86: */
@@ -123,7 +124,7 @@ asmlinkage long sys_setfsuid16(old_uid_t uid)
123 return ret; 124 return ret;
124} 125}
125 126
126asmlinkage long sys_setfsgid16(old_gid_t gid) 127SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
127{ 128{
128 long ret = sys_setfsgid(low2highgid(gid)); 129 long ret = sys_setfsgid(low2highgid(gid));
129 /* avoid REGPARM breakage on x86: */ 130 /* avoid REGPARM breakage on x86: */
@@ -161,7 +162,7 @@ static int groups16_from_user(struct group_info *group_info,
161 return 0; 162 return 0;
162} 163}
163 164
164asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist) 165SYSCALL_DEFINE2(getgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
165{ 166{
166 const struct cred *cred = current_cred(); 167 const struct cred *cred = current_cred();
167 int i; 168 int i;
@@ -184,7 +185,7 @@ out:
184 return i; 185 return i;
185} 186}
186 187
187asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist) 188SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
188{ 189{
189 struct group_info *group_info; 190 struct group_info *group_info;
190 int retval; 191 int retval;
@@ -209,22 +210,22 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist)
209 return retval; 210 return retval;
210} 211}
211 212
212asmlinkage long sys_getuid16(void) 213SYSCALL_DEFINE0(getuid16)
213{ 214{
214 return high2lowuid(current_uid()); 215 return high2lowuid(current_uid());
215} 216}
216 217
217asmlinkage long sys_geteuid16(void) 218SYSCALL_DEFINE0(geteuid16)
218{ 219{
219 return high2lowuid(current_euid()); 220 return high2lowuid(current_euid());
220} 221}
221 222
222asmlinkage long sys_getgid16(void) 223SYSCALL_DEFINE0(getgid16)
223{ 224{
224 return high2lowgid(current_gid()); 225 return high2lowgid(current_gid());
225} 226}
226 227
227asmlinkage long sys_getegid16(void) 228SYSCALL_DEFINE0(getegid16)
228{ 229{
229 return high2lowgid(current_egid()); 230 return high2lowgid(current_egid());
230} 231}
diff --git a/kernel/up.c b/kernel/up.c
index c04b9dcfcebe..1ff27a28bb7d 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -2,6 +2,7 @@
2 * Uniprocessor-only support functions. The counterpart to kernel/smp.c 2 * Uniprocessor-only support functions. The counterpart to kernel/smp.c
3 */ 3 */
4 4
5#include <linux/interrupt.h>
5#include <linux/kernel.h> 6#include <linux/kernel.h>
6#include <linux/module.h> 7#include <linux/module.h>
7#include <linux/smp.h> 8#include <linux/smp.h>
diff --git a/kernel/user.c b/kernel/user.c
index 477b6660f447..850e0ba41c1e 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -20,7 +20,7 @@
20 20
21struct user_namespace init_user_ns = { 21struct user_namespace init_user_ns = {
22 .kref = { 22 .kref = {
23 .refcount = ATOMIC_INIT(1), 23 .refcount = ATOMIC_INIT(2),
24 }, 24 },
25 .creator = &root_user, 25 .creator = &root_user,
26}; 26};
@@ -72,6 +72,7 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
72static void uid_hash_remove(struct user_struct *up) 72static void uid_hash_remove(struct user_struct *up)
73{ 73{
74 hlist_del_init(&up->uidhash_node); 74 hlist_del_init(&up->uidhash_node);
75 put_user_ns(up->user_ns);
75} 76}
76 77
77static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 78static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
@@ -285,14 +286,12 @@ int __init uids_sysfs_init(void)
285/* work function to remove sysfs directory for a user and free up 286/* work function to remove sysfs directory for a user and free up
286 * corresponding structures. 287 * corresponding structures.
287 */ 288 */
288static void remove_user_sysfs_dir(struct work_struct *w) 289static void cleanup_user_struct(struct work_struct *w)
289{ 290{
290 struct user_struct *up = container_of(w, struct user_struct, work); 291 struct user_struct *up = container_of(w, struct user_struct, work);
291 unsigned long flags; 292 unsigned long flags;
292 int remove_user = 0; 293 int remove_user = 0;
293 294
294 if (up->user_ns != &init_user_ns)
295 return;
296 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() 295 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
297 * atomic. 296 * atomic.
298 */ 297 */
@@ -311,9 +310,11 @@ static void remove_user_sysfs_dir(struct work_struct *w)
311 if (!remove_user) 310 if (!remove_user)
312 goto done; 311 goto done;
313 312
314 kobject_uevent(&up->kobj, KOBJ_REMOVE); 313 if (up->user_ns == &init_user_ns) {
315 kobject_del(&up->kobj); 314 kobject_uevent(&up->kobj, KOBJ_REMOVE);
316 kobject_put(&up->kobj); 315 kobject_del(&up->kobj);
316 kobject_put(&up->kobj);
317 }
317 318
318 sched_destroy_user(up); 319 sched_destroy_user(up);
319 key_put(up->uid_keyring); 320 key_put(up->uid_keyring);
@@ -334,8 +335,7 @@ static void free_user(struct user_struct *up, unsigned long flags)
334 atomic_inc(&up->__count); 335 atomic_inc(&up->__count);
335 spin_unlock_irqrestore(&uidhash_lock, flags); 336 spin_unlock_irqrestore(&uidhash_lock, flags);
336 337
337 put_user_ns(up->user_ns); 338 INIT_WORK(&up->work, cleanup_user_struct);
338 INIT_WORK(&up->work, remove_user_sysfs_dir);
339 schedule_work(&up->work); 339 schedule_work(&up->work);
340} 340}
341 341
@@ -357,12 +357,29 @@ static void free_user(struct user_struct *up, unsigned long flags)
357 sched_destroy_user(up); 357 sched_destroy_user(up);
358 key_put(up->uid_keyring); 358 key_put(up->uid_keyring);
359 key_put(up->session_keyring); 359 key_put(up->session_keyring);
360 put_user_ns(up->user_ns);
361 kmem_cache_free(uid_cachep, up); 360 kmem_cache_free(uid_cachep, up);
362} 361}
363 362
364#endif 363#endif
365 364
365#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
366/*
367 * We need to check if a setuid can take place. This function should be called
368 * before successfully completing the setuid.
369 */
370int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
371{
372
373 return sched_rt_can_attach(up->tg, tsk);
374
375}
376#else
377int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
378{
379 return 1;
380}
381#endif
382
366/* 383/*
367 * Locate the user_struct for the passed UID. If found, take a ref on it. The 384 * Locate the user_struct for the passed UID. If found, take a ref on it. The
368 * caller must undo that ref with free_uid(). 385 * caller must undo that ref with free_uid().
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 79084311ee57..076c7c8215b0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -60,12 +60,25 @@ int create_user_ns(struct cred *new)
60 return 0; 60 return 0;
61} 61}
62 62
63void free_user_ns(struct kref *kref) 63/*
64 * Deferred destructor for a user namespace. This is required because
65 * free_user_ns() may be called with uidhash_lock held, but we need to call
66 * back to free_uid() which will want to take the lock again.
67 */
68static void free_user_ns_work(struct work_struct *work)
64{ 69{
65 struct user_namespace *ns; 70 struct user_namespace *ns =
66 71 container_of(work, struct user_namespace, destroyer);
67 ns = container_of(kref, struct user_namespace, kref);
68 free_uid(ns->creator); 72 free_uid(ns->creator);
69 kfree(ns); 73 kfree(ns);
70} 74}
75
76void free_user_ns(struct kref *kref)
77{
78 struct user_namespace *ns =
79 container_of(kref, struct user_namespace, kref);
80
81 INIT_WORK(&ns->destroyer, free_user_ns_work);
82 schedule_work(&ns->destroyer);
83}
71EXPORT_SYMBOL(free_user_ns); 84EXPORT_SYMBOL(free_user_ns);
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 3b34b3545936..92359cc747a7 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -37,7 +37,7 @@ static void put_uts(ctl_table *table, int write, void *which)
37 up_write(&uts_sem); 37 up_write(&uts_sem);
38} 38}
39 39
40#ifdef CONFIG_PROC_FS 40#ifdef CONFIG_PROC_SYSCTL
41/* 41/*
42 * Special case of dostring for the UTS structure. This has locks 42 * Special case of dostring for the UTS structure. This has locks
43 * to observe. Should this be in kernel/sys.c ???? 43 * to observe. Should this be in kernel/sys.c ????
diff --git a/kernel/wait.c b/kernel/wait.c
index cd87131f2fc2..42a2dbc181c8 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -91,6 +91,15 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
91} 91}
92EXPORT_SYMBOL(prepare_to_wait_exclusive); 92EXPORT_SYMBOL(prepare_to_wait_exclusive);
93 93
94/*
95 * finish_wait - clean up after waiting in a queue
96 * @q: waitqueue waited on
97 * @wait: wait descriptor
98 *
99 * Sets current thread back to running state and removes
100 * the wait descriptor from the given waitqueue if still
101 * queued.
102 */
94void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) 103void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
95{ 104{
96 unsigned long flags; 105 unsigned long flags;
@@ -117,6 +126,39 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
117} 126}
118EXPORT_SYMBOL(finish_wait); 127EXPORT_SYMBOL(finish_wait);
119 128
129/*
130 * abort_exclusive_wait - abort exclusive waiting in a queue
131 * @q: waitqueue waited on
132 * @wait: wait descriptor
133 * @state: runstate of the waiter to be woken
134 * @key: key to identify a wait bit queue or %NULL
135 *
136 * Sets current thread back to running state and removes
137 * the wait descriptor from the given waitqueue if still
138 * queued.
139 *
140 * Wakes up the next waiter if the caller is concurrently
141 * woken up through the queue.
142 *
143 * This prevents waiter starvation where an exclusive waiter
144 * aborts and is woken up concurrently and noone wakes up
145 * the next waiter.
146 */
147void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
148 unsigned int mode, void *key)
149{
150 unsigned long flags;
151
152 __set_current_state(TASK_RUNNING);
153 spin_lock_irqsave(&q->lock, flags);
154 if (!list_empty(&wait->task_list))
155 list_del_init(&wait->task_list);
156 else if (waitqueue_active(q))
157 __wake_up_common(q, mode, 1, 0, key);
158 spin_unlock_irqrestore(&q->lock, flags);
159}
160EXPORT_SYMBOL(abort_exclusive_wait);
161
120int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) 162int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
121{ 163{
122 int ret = default_wake_function(wait, mode, sync, key); 164 int ret = default_wake_function(wait, mode, sync, key);
@@ -177,17 +219,20 @@ int __sched
177__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 219__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
178 int (*action)(void *), unsigned mode) 220 int (*action)(void *), unsigned mode)
179{ 221{
180 int ret = 0;
181
182 do { 222 do {
223 int ret;
224
183 prepare_to_wait_exclusive(wq, &q->wait, mode); 225 prepare_to_wait_exclusive(wq, &q->wait, mode);
184 if (test_bit(q->key.bit_nr, q->key.flags)) { 226 if (!test_bit(q->key.bit_nr, q->key.flags))
185 if ((ret = (*action)(q->key.flags))) 227 continue;
186 break; 228 ret = action(q->key.flags);
187 } 229 if (!ret)
230 continue;
231 abort_exclusive_wait(wq, &q->wait, mode, &q->key);
232 return ret;
188 } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); 233 } while (test_and_set_bit(q->key.bit_nr, q->key.flags));
189 finish_wait(wq, &q->wait); 234 finish_wait(wq, &q->wait);
190 return ret; 235 return 0;
191} 236}
192EXPORT_SYMBOL(__wait_on_bit_lock); 237EXPORT_SYMBOL(__wait_on_bit_lock);
193 238
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2f445833ae37..b6b966ce1451 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,6 +33,7 @@
33#include <linux/kallsyms.h> 33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h> 34#include <linux/debug_locks.h>
35#include <linux/lockdep.h> 35#include <linux/lockdep.h>
36#include <trace/workqueue.h>
36 37
37/* 38/*
38 * The per-CPU workqueue (if single thread, we always use the first 39 * The per-CPU workqueue (if single thread, we always use the first
@@ -48,8 +49,6 @@ struct cpu_workqueue_struct {
48 49
49 struct workqueue_struct *wq; 50 struct workqueue_struct *wq;
50 struct task_struct *thread; 51 struct task_struct *thread;
51
52 int run_depth; /* Detect run_workqueue() recursion depth */
53} ____cacheline_aligned; 52} ____cacheline_aligned;
54 53
55/* 54/*
@@ -125,9 +124,13 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
125 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); 124 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
126} 125}
127 126
127DEFINE_TRACE(workqueue_insertion);
128
128static void insert_work(struct cpu_workqueue_struct *cwq, 129static void insert_work(struct cpu_workqueue_struct *cwq,
129 struct work_struct *work, struct list_head *head) 130 struct work_struct *work, struct list_head *head)
130{ 131{
132 trace_workqueue_insertion(cwq->thread, work);
133
131 set_wq_data(work, cwq); 134 set_wq_data(work, cwq);
132 /* 135 /*
133 * Ensure that we get the right work->data if we see the 136 * Ensure that we get the right work->data if we see the
@@ -259,16 +262,11 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
259} 262}
260EXPORT_SYMBOL_GPL(queue_delayed_work_on); 263EXPORT_SYMBOL_GPL(queue_delayed_work_on);
261 264
265DEFINE_TRACE(workqueue_execution);
266
262static void run_workqueue(struct cpu_workqueue_struct *cwq) 267static void run_workqueue(struct cpu_workqueue_struct *cwq)
263{ 268{
264 spin_lock_irq(&cwq->lock); 269 spin_lock_irq(&cwq->lock);
265 cwq->run_depth++;
266 if (cwq->run_depth > 3) {
267 /* morton gets to eat his hat */
268 printk("%s: recursion depth exceeded: %d\n",
269 __func__, cwq->run_depth);
270 dump_stack();
271 }
272 while (!list_empty(&cwq->worklist)) { 270 while (!list_empty(&cwq->worklist)) {
273 struct work_struct *work = list_entry(cwq->worklist.next, 271 struct work_struct *work = list_entry(cwq->worklist.next,
274 struct work_struct, entry); 272 struct work_struct, entry);
@@ -284,7 +282,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
284 */ 282 */
285 struct lockdep_map lockdep_map = work->lockdep_map; 283 struct lockdep_map lockdep_map = work->lockdep_map;
286#endif 284#endif
287 285 trace_workqueue_execution(cwq->thread, work);
288 cwq->current_work = work; 286 cwq->current_work = work;
289 list_del_init(cwq->worklist.next); 287 list_del_init(cwq->worklist.next);
290 spin_unlock_irq(&cwq->lock); 288 spin_unlock_irq(&cwq->lock);
@@ -311,7 +309,6 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
311 spin_lock_irq(&cwq->lock); 309 spin_lock_irq(&cwq->lock);
312 cwq->current_work = NULL; 310 cwq->current_work = NULL;
313 } 311 }
314 cwq->run_depth--;
315 spin_unlock_irq(&cwq->lock); 312 spin_unlock_irq(&cwq->lock);
316} 313}
317 314
@@ -368,29 +365,20 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
368 365
369static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) 366static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
370{ 367{
371 int active; 368 int active = 0;
372 369 struct wq_barrier barr;
373 if (cwq->thread == current) {
374 /*
375 * Probably keventd trying to flush its own queue. So simply run
376 * it by hand rather than deadlocking.
377 */
378 run_workqueue(cwq);
379 active = 1;
380 } else {
381 struct wq_barrier barr;
382 370
383 active = 0; 371 WARN_ON(cwq->thread == current);
384 spin_lock_irq(&cwq->lock);
385 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
386 insert_wq_barrier(cwq, &barr, &cwq->worklist);
387 active = 1;
388 }
389 spin_unlock_irq(&cwq->lock);
390 372
391 if (active) 373 spin_lock_irq(&cwq->lock);
392 wait_for_completion(&barr.done); 374 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
375 insert_wq_barrier(cwq, &barr, &cwq->worklist);
376 active = 1;
393 } 377 }
378 spin_unlock_irq(&cwq->lock);
379
380 if (active)
381 wait_for_completion(&barr.done);
394 382
395 return active; 383 return active;
396} 384}
@@ -416,7 +404,7 @@ void flush_workqueue(struct workqueue_struct *wq)
416 might_sleep(); 404 might_sleep();
417 lock_map_acquire(&wq->lockdep_map); 405 lock_map_acquire(&wq->lockdep_map);
418 lock_map_release(&wq->lockdep_map); 406 lock_map_release(&wq->lockdep_map);
419 for_each_cpu_mask_nr(cpu, *cpu_map) 407 for_each_cpu(cpu, cpu_map)
420 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 408 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
421} 409}
422EXPORT_SYMBOL_GPL(flush_workqueue); 410EXPORT_SYMBOL_GPL(flush_workqueue);
@@ -547,7 +535,7 @@ static void wait_on_work(struct work_struct *work)
547 wq = cwq->wq; 535 wq = cwq->wq;
548 cpu_map = wq_cpu_map(wq); 536 cpu_map = wq_cpu_map(wq);
549 537
550 for_each_cpu_mask_nr(cpu, *cpu_map) 538 for_each_cpu(cpu, cpu_map)
551 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 539 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
552} 540}
553 541
@@ -765,6 +753,8 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
765 return cwq; 753 return cwq;
766} 754}
767 755
756DEFINE_TRACE(workqueue_creation);
757
768static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 758static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
769{ 759{
770 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 760 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
@@ -787,6 +777,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
787 sched_setscheduler_nocheck(p, SCHED_FIFO, &param); 777 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
788 cwq->thread = p; 778 cwq->thread = p;
789 779
780 trace_workqueue_creation(cwq->thread, cpu);
781
790 return 0; 782 return 0;
791} 783}
792 784
@@ -868,6 +860,8 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
868} 860}
869EXPORT_SYMBOL_GPL(__create_workqueue_key); 861EXPORT_SYMBOL_GPL(__create_workqueue_key);
870 862
863DEFINE_TRACE(workqueue_destruction);
864
871static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) 865static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
872{ 866{
873 /* 867 /*
@@ -891,6 +885,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
891 * checks list_empty(), and a "normal" queue_work() can't use 885 * checks list_empty(), and a "normal" queue_work() can't use
892 * a dead CPU. 886 * a dead CPU.
893 */ 887 */
888 trace_workqueue_destruction(cwq->thread);
894 kthread_stop(cwq->thread); 889 kthread_stop(cwq->thread);
895 cwq->thread = NULL; 890 cwq->thread = NULL;
896} 891}
@@ -911,7 +906,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
911 list_del(&wq->list); 906 list_del(&wq->list);
912 spin_unlock(&workqueue_lock); 907 spin_unlock(&workqueue_lock);
913 908
914 for_each_cpu_mask_nr(cpu, *cpu_map) 909 for_each_cpu(cpu, cpu_map)
915 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); 910 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
916 cpu_maps_update_done(); 911 cpu_maps_update_done();
917 912
@@ -971,6 +966,8 @@ undo:
971} 966}
972 967
973#ifdef CONFIG_SMP 968#ifdef CONFIG_SMP
969static struct workqueue_struct *work_on_cpu_wq __read_mostly;
970
974struct work_for_cpu { 971struct work_for_cpu {
975 struct work_struct work; 972 struct work_struct work;
976 long (*fn)(void *); 973 long (*fn)(void *);
@@ -991,8 +988,8 @@ static void do_work_for_cpu(struct work_struct *w)
991 * @fn: the function to run 988 * @fn: the function to run
992 * @arg: the function arg 989 * @arg: the function arg
993 * 990 *
994 * This will return -EINVAL in the cpu is not online, or the return value 991 * This will return the value @fn returns.
995 * of @fn otherwise. 992 * It is up to the caller to ensure that the cpu doesn't go offline.
996 */ 993 */
997long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) 994long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
998{ 995{
@@ -1001,14 +998,8 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
1001 INIT_WORK(&wfc.work, do_work_for_cpu); 998 INIT_WORK(&wfc.work, do_work_for_cpu);
1002 wfc.fn = fn; 999 wfc.fn = fn;
1003 wfc.arg = arg; 1000 wfc.arg = arg;
1004 get_online_cpus(); 1001 queue_work_on(cpu, work_on_cpu_wq, &wfc.work);
1005 if (unlikely(!cpu_online(cpu))) 1002 flush_work(&wfc.work);
1006 wfc.ret = -EINVAL;
1007 else {
1008 schedule_work_on(cpu, &wfc.work);
1009 flush_work(&wfc.work);
1010 }
1011 put_online_cpus();
1012 1003
1013 return wfc.ret; 1004 return wfc.ret;
1014} 1005}
@@ -1025,4 +1016,8 @@ void __init init_workqueues(void)
1025 hotcpu_notifier(workqueue_cpu_callback, 0); 1016 hotcpu_notifier(workqueue_cpu_callback, 0);
1026 keventd_wq = create_workqueue("events"); 1017 keventd_wq = create_workqueue("events");
1027 BUG_ON(!keventd_wq); 1018 BUG_ON(!keventd_wq);
1019#ifdef CONFIG_SMP
1020 work_on_cpu_wq = create_workqueue("work_on_cpu");
1021 BUG_ON(!work_on_cpu_wq);
1022#endif
1028} 1023}