path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.preempt | 3
-rw-r--r--  kernel/Makefile | 8
-rw-r--r--  kernel/async.c | 12
-rw-r--r--  kernel/audit.c | 31
-rw-r--r--  kernel/audit_tree.c | 8
-rw-r--r--  kernel/auditsc.c | 2
-rw-r--r--  kernel/cgroup.c | 241
-rw-r--r--  kernel/cgroup_freezer.c | 11
-rw-r--r--  kernel/compat.c | 7
-rw-r--r--  kernel/configs.c | 4
-rw-r--r--  kernel/cpu.c | 94
-rw-r--r--  kernel/cpuset.c | 10
-rw-r--r--  kernel/cred.c | 6
-rw-r--r--  kernel/debug/debug_core.c | 2
-rw-r--r--  kernel/debug/gdbstub.c | 22
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 5
-rw-r--r--  kernel/debug/kdb/kdb_cmds | 4
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 21
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 36
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 3
-rw-r--r--  kernel/delayacct.c | 2
-rw-r--r--  kernel/events/Makefile | 2
-rw-r--r--  kernel/events/core.c | 1001
-rw-r--r--  kernel/events/hw_breakpoint.c | 10
-rw-r--r--  kernel/events/internal.h | 96
-rw-r--r--  kernel/events/ring_buffer.c | 380
-rw-r--r--  kernel/exit.c | 103
-rw-r--r--  kernel/fork.c | 141
-rw-r--r--  kernel/futex.c | 74
-rw-r--r--  kernel/gcov/Kconfig | 10
-rw-r--r--  kernel/gcov/gcc_3_4.c | 88
-rw-r--r--  kernel/gcov/gcov.h | 42
-rw-r--r--  kernel/hrtimer.c | 6
-rw-r--r--  kernel/hung_task.c | 14
-rw-r--r--  kernel/irq/Kconfig | 4
-rw-r--r--  kernel/irq/Makefile | 1
-rw-r--r--  kernel/irq/chip.c | 2
-rw-r--r--  kernel/irq/devres.c | 2
-rw-r--r--  kernel/irq/generic-chip.c | 4
-rw-r--r--  kernel/irq/irqdesc.c | 37
-rw-r--r--  kernel/irq/irqdomain.c | 184
-rw-r--r--  kernel/irq/manage.c | 22
-rw-r--r--  kernel/irq/pm.c | 55
-rw-r--r--  kernel/irq/resend.c | 19
-rw-r--r--  kernel/irq/spurious.c | 6
-rw-r--r--  kernel/jump_label.c | 3
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kmod.c | 6
-rw-r--r--  kernel/kprobes.c | 33
-rw-r--r--  kernel/lockdep.c | 74
-rw-r--r--  kernel/module.c | 82
-rw-r--r--  kernel/notifier.c | 31
-rw-r--r--  kernel/nsproxy.c | 4
-rw-r--r--  kernel/panic.c | 15
-rw-r--r--  kernel/params.c | 18
-rw-r--r--  kernel/pid.c | 1
-rw-r--r--  kernel/pm_qos_params.c | 79
-rw-r--r--  kernel/posix-cpu-timers.c | 12
-rw-r--r--  kernel/power/Kconfig | 90
-rw-r--r--  kernel/power/Makefile | 6
-rw-r--r--  kernel/power/consoleearlysuspend.c | 78
-rw-r--r--  kernel/power/earlysuspend.c | 187
-rw-r--r--  kernel/power/fbearlysuspend.c | 153
-rw-r--r--  kernel/power/main.c | 25
-rw-r--r--  kernel/power/power.h | 24
-rw-r--r--  kernel/power/process.c | 27
-rw-r--r--  kernel/power/suspend.c | 25
-rw-r--r--  kernel/power/suspend_time.c | 111
-rw-r--r--  kernel/power/userwakelock.c | 219
-rw-r--r--  kernel/power/wakelock.c | 634
-rw-r--r--  kernel/printk.c | 88
-rw-r--r--  kernel/ptrace.c | 207
-rw-r--r--  kernel/rcupdate.c | 2
-rw-r--r--  kernel/rcutorture.c | 4
-rw-r--r--  kernel/rcutree_trace.c | 2
-rw-r--r--  kernel/resource.c | 28
-rw-r--r--  kernel/rtmutex.c | 2
-rw-r--r--  kernel/rwsem.c | 18
-rw-r--r--  kernel/sched.c | 381
-rw-r--r--  kernel/sched_autogroup.h | 1
-rw-r--r--  kernel/sched_fair.c | 72
-rw-r--r--  kernel/sched_features.h | 4
-rw-r--r--  kernel/sched_rt.c | 30
-rw-r--r--  kernel/signal.c | 442
-rw-r--r--  kernel/stacktrace.c | 12
-rw-r--r--  kernel/stop_machine.c | 80
-rw-r--r--  kernel/sys.c | 85
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 19
-rw-r--r--  kernel/sysctl_binary.c | 4
-rw-r--r--  kernel/sysctl_check.c | 2
-rw-r--r--  kernel/taskstats.c | 21
-rw-r--r--  kernel/time.c | 2
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/alarmtimer.c | 20
-rw-r--r--  kernel/time/clocksource.c | 62
-rw-r--r--  kernel/time/tick-broadcast.c | 2
-rw-r--r--  kernel/time/timekeeping.c | 39
-rw-r--r--  kernel/trace/Kconfig | 35
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 21
-rw-r--r--  kernel/trace/ftrace.c | 179
-rw-r--r--  kernel/trace/ring_buffer.c | 66
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 2
-rw-r--r--  kernel/trace/trace.c | 330
-rw-r--r--  kernel/trace/trace.h | 63
-rw-r--r--  kernel/trace/trace_entries.h | 3
-rw-r--r--  kernel/trace/trace_events.c | 138
-rw-r--r--  kernel/trace/trace_events_filter.c | 12
-rw-r--r--  kernel/trace/trace_functions.c | 3
-rw-r--r--  kernel/trace/trace_functions_graph.c | 225
-rw-r--r--  kernel/trace/trace_irqsoff.c | 4
-rw-r--r--  kernel/trace/trace_kprobe.c | 378
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 2
-rw-r--r--  kernel/trace/trace_output.c | 11
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 4
-rw-r--r--  kernel/trace/trace_stack.c | 13
-rw-r--r--  kernel/trace/tracedump.c | 682
-rw-r--r--  kernel/trace/tracelevel.c | 142
-rw-r--r--  kernel/tsacct.c | 15
-rw-r--r--  kernel/watchdog.c | 8
-rw-r--r--  kernel/workqueue.c | 86
123 files changed, 6628 insertions, 2206 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf987b95b35..24e7cb0ba26 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY
35 35
36config PREEMPT 36config PREEMPT
37 bool "Preemptible Kernel (Low-Latency Desktop)" 37 bool "Preemptible Kernel (Low-Latency Desktop)"
38 select PREEMPT_COUNT
38 help 39 help
39 This option reduces the latency of the kernel by making 40 This option reduces the latency of the kernel by making
40 all kernel code (that is not executing in a critical section) 41 all kernel code (that is not executing in a critical section)
@@ -52,3 +53,5 @@ config PREEMPT
52 53
53endchoice 54endchoice
54 55
56config PREEMPT_COUNT
57	bool
\ No newline at end of file
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d64cfcc8b4..eca595e2fd5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o range.o jump_label.o 13 async.o range.o
14obj-y += groups.o 14obj-y += groups.o
15 15
16ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
@@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += events/
107obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 107obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
108obj-$(CONFIG_PADATA) += padata.o 108obj-$(CONFIG_PADATA) += padata.o
109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
110obj-$(CONFIG_JUMP_LABEL) += jump_label.o
110 111
111ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 112ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
112# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 113# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
@@ -125,11 +126,10 @@ targets += config_data.gz
125$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE 126$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
126 $(call if_changed,gzip) 127 $(call if_changed,gzip)
127 128
128quiet_cmd_ikconfiggz = IKCFG $@ 129 filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
129 cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
130targets += config_data.h 130targets += config_data.h
131$(obj)/config_data.h: $(obj)/config_data.gz FORCE 131$(obj)/config_data.h: $(obj)/config_data.gz FORCE
132 $(call if_changed,ikconfiggz) 132 $(call filechk,ikconfiggz)
133 133
134$(obj)/time.o: $(obj)/timeconst.h 134$(obj)/time.o: $(obj)/timeconst.h
135 135
diff --git a/kernel/async.c b/kernel/async.c
index cd9dbb913c7..d5fe7af0de2 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,12 +49,13 @@ asynchronous and synchronous parts of the kernel.
49*/ 49*/
50 50
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/atomic.h>
53#include <linux/ktime.h>
52#include <linux/module.h> 54#include <linux/module.h>
53#include <linux/wait.h> 55#include <linux/wait.h>
54#include <linux/sched.h> 56#include <linux/sched.h>
55#include <linux/slab.h> 57#include <linux/slab.h>
56#include <linux/workqueue.h> 58#include <linux/workqueue.h>
57#include <asm/atomic.h>
58 59
59static async_cookie_t next_cookie = 1; 60static async_cookie_t next_cookie = 1;
60 61
@@ -128,7 +129,8 @@ static void async_run_entry_fn(struct work_struct *work)
128 129
129 /* 2) run (and print duration) */ 130 /* 2) run (and print duration) */
130 if (initcall_debug && system_state == SYSTEM_BOOTING) { 131 if (initcall_debug && system_state == SYSTEM_BOOTING) {
131 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, 132 printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
133 (long long)entry->cookie,
132 entry->func, task_pid_nr(current)); 134 entry->func, task_pid_nr(current));
133 calltime = ktime_get(); 135 calltime = ktime_get();
134 } 136 }
@@ -136,7 +138,7 @@ static void async_run_entry_fn(struct work_struct *work)
136 if (initcall_debug && system_state == SYSTEM_BOOTING) { 138 if (initcall_debug && system_state == SYSTEM_BOOTING) {
137 rettime = ktime_get(); 139 rettime = ktime_get();
138 delta = ktime_sub(rettime, calltime); 140 delta = ktime_sub(rettime, calltime);
139 printk("initcall %lli_%pF returned 0 after %lld usecs\n", 141 printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n",
140 (long long)entry->cookie, 142 (long long)entry->cookie,
141 entry->func, 143 entry->func,
142 (long long)ktime_to_ns(delta) >> 10); 144 (long long)ktime_to_ns(delta) >> 10);
@@ -270,7 +272,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
270 ktime_t starttime, delta, endtime; 272 ktime_t starttime, delta, endtime;
271 273
272 if (initcall_debug && system_state == SYSTEM_BOOTING) { 274 if (initcall_debug && system_state == SYSTEM_BOOTING) {
273 printk("async_waiting @ %i\n", task_pid_nr(current)); 275 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
274 starttime = ktime_get(); 276 starttime = ktime_get();
275 } 277 }
276 278
@@ -280,7 +282,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
280 endtime = ktime_get(); 282 endtime = ktime_get();
281 delta = ktime_sub(endtime, starttime); 283 delta = ktime_sub(endtime, starttime);
282 284
283 printk("async_continuing @ %i after %lli usec\n", 285 printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n",
284 task_pid_nr(current), 286 task_pid_nr(current),
285 (long long)ktime_to_ns(delta) >> 10); 287 (long long)ktime_to_ns(delta) >> 10);
286 } 288 }
diff --git a/kernel/audit.c b/kernel/audit.c
index 93950031706..0a1355ca3d7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -43,7 +43,7 @@
43 43
44#include <linux/init.h> 44#include <linux/init.h>
45#include <asm/types.h> 45#include <asm/types.h>
46#include <asm/atomic.h> 46#include <linux/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/module.h> 48#include <linux/module.h>
49#include <linux/slab.h> 49#include <linux/slab.h>
@@ -55,6 +55,9 @@
55#include <net/sock.h> 55#include <net/sock.h>
56#include <net/netlink.h> 56#include <net/netlink.h>
57#include <linux/skbuff.h> 57#include <linux/skbuff.h>
58#ifdef CONFIG_SECURITY
59#include <linux/security.h>
60#endif
58#include <linux/netlink.h> 61#include <linux/netlink.h>
59#include <linux/freezer.h> 62#include <linux/freezer.h>
60#include <linux/tty.h> 63#include <linux/tty.h>
@@ -1502,6 +1505,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
1502 } 1505 }
1503} 1506}
1504 1507
1508#ifdef CONFIG_SECURITY
1509/**
1510 * audit_log_secctx - Converts and logs SELinux context
1511 * @ab: audit_buffer
1512 * @secid: security number
1513 *
1514 * This is a helper function that calls security_secid_to_secctx to convert
1515 * secid to secctx and then adds the (converted) SELinux context to the audit
1516 * log by calling audit_log_format, thus also preventing leak of internal secid
1517 * to userspace. If secid cannot be converted audit_panic is called.
1518 */
1519void audit_log_secctx(struct audit_buffer *ab, u32 secid)
1520{
1521 u32 len;
1522 char *secctx;
1523
1524 if (security_secid_to_secctx(secid, &secctx, &len)) {
1525 audit_panic("Cannot convert secid to context");
1526 } else {
1527 audit_log_format(ab, " obj=%s", secctx);
1528 security_release_secctx(secctx, len);
1529 }
1530}
1531EXPORT_SYMBOL(audit_log_secctx);
1532#endif
1533
1505EXPORT_SYMBOL(audit_log_start); 1534EXPORT_SYMBOL(audit_log_start);
1506EXPORT_SYMBOL(audit_log_end); 1535EXPORT_SYMBOL(audit_log_end);
1507EXPORT_SYMBOL(audit_log_format); 1536EXPORT_SYMBOL(audit_log_format);
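
For context, a minimal sketch of how a caller might use the new audit_log_secctx() helper. The record type (AUDIT_AVC), the format string and the way the secid is obtained are illustrative assumptions, not part of this diff.

	/* Hypothetical caller; the secid is assumed to come from the LSM. */
	struct audit_buffer *ab;
	u32 secid;

	security_task_getsecid(current, &secid);
	ab = audit_log_start(current->audit_context, GFP_KERNEL, AUDIT_AVC);
	if (ab) {
		audit_log_format(ab, "example pid=%d", task_pid_nr(current));
		audit_log_secctx(ab, secid);	/* appends " obj=<secctx>" or calls audit_panic() */
		audit_log_end(ab);
	}
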
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e99dda04b12..5bf0790497e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -93,16 +93,10 @@ static inline void get_tree(struct audit_tree *tree)
93 atomic_inc(&tree->count); 93 atomic_inc(&tree->count);
94} 94}
95 95
96static void __put_tree(struct rcu_head *rcu)
97{
98 struct audit_tree *tree = container_of(rcu, struct audit_tree, head);
99 kfree(tree);
100}
101
102static inline void put_tree(struct audit_tree *tree) 96static inline void put_tree(struct audit_tree *tree)
103{ 97{
104 if (atomic_dec_and_test(&tree->count)) 98 if (atomic_dec_and_test(&tree->count))
105 call_rcu(&tree->head, __put_tree); 99 kfree_rcu(tree, head);
106} 100}
107 101
108/* to avoid bringing the entire thing in audit.h */ 102/* to avoid bringing the entire thing in audit.h */
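
The hunk above replaces an open-coded call_rcu() callback with kfree_rcu(). A minimal sketch of that pattern, using a made-up structure purely for illustration:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct example {
		int payload;
		struct rcu_head rcu;	/* the second kfree_rcu() argument names this field */
	};

	static void example_release(struct example *e)
	{
		/* equivalent to call_rcu() with a callback that only kfree()s the object */
		kfree_rcu(e, rcu);
	}
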
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 00d79df03e7..ce4b054acee 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -44,7 +44,7 @@
44 44
45#include <linux/init.h> 45#include <linux/init.h>
46#include <asm/types.h> 46#include <asm/types.h>
47#include <asm/atomic.h> 47#include <linux/atomic.h>
48#include <linux/fs.h> 48#include <linux/fs.h>
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mm.h> 50#include <linux/mm.h>
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2731d115d72..54a36fe288f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -27,9 +27,11 @@
27 */ 27 */
28 28
29#include <linux/cgroup.h> 29#include <linux/cgroup.h>
30#include <linux/cred.h>
30#include <linux/ctype.h> 31#include <linux/ctype.h>
31#include <linux/errno.h> 32#include <linux/errno.h>
32#include <linux/fs.h> 33#include <linux/fs.h>
34#include <linux/init_task.h>
33#include <linux/kernel.h> 35#include <linux/kernel.h>
34#include <linux/list.h> 36#include <linux/list.h>
35#include <linux/mm.h> 37#include <linux/mm.h>
@@ -59,7 +61,7 @@
59#include <linux/poll.h> 61#include <linux/poll.h>
60#include <linux/flex_array.h> /* used in cgroup_attach_proc */ 62#include <linux/flex_array.h> /* used in cgroup_attach_proc */
61 63
62#include <asm/atomic.h> 64#include <linux/atomic.h>
63 65
64static DEFINE_MUTEX(cgroup_mutex); 66static DEFINE_MUTEX(cgroup_mutex);
65 67
@@ -268,6 +270,33 @@ static void cgroup_release_agent(struct work_struct *work);
268static DECLARE_WORK(release_agent_work, cgroup_release_agent); 270static DECLARE_WORK(release_agent_work, cgroup_release_agent);
269static void check_for_release(struct cgroup *cgrp); 271static void check_for_release(struct cgroup *cgrp);
270 272
273/*
274 * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
275 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
276 * reference to css->refcnt. In general, this refcnt is expected to goes down
277 * to zero, soon.
278 *
279 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
280 */
281DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
282
283static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
284{
285 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
286 wake_up_all(&cgroup_rmdir_waitq);
287}
288
289void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
290{
291 css_get(css);
292}
293
294void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
295{
296 cgroup_wakeup_rmdir_waiter(css->cgroup);
297 css_put(css);
298}
299
271/* Link structure for associating css_set objects with cgroups */ 300/* Link structure for associating css_set objects with cgroups */
272struct cg_cgroup_link { 301struct cg_cgroup_link {
273 /* 302 /*
@@ -327,52 +356,43 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
327 return &css_set_table[index]; 356 return &css_set_table[index];
328} 357}
329 358
330/* We don't maintain the lists running through each css_set to its 359static void free_css_set_work(struct work_struct *work)
331 * task until after the first call to cgroup_iter_start(). This
332 * reduces the fork()/exit() overhead for people who have cgroups
333 * compiled into their kernel but not actually in use */
334static int use_task_css_set_links __read_mostly;
335
336static void __put_css_set(struct css_set *cg, int taskexit)
337{ 360{
361 struct css_set *cg = container_of(work, struct css_set, work);
338 struct cg_cgroup_link *link; 362 struct cg_cgroup_link *link;
339 struct cg_cgroup_link *saved_link; 363 struct cg_cgroup_link *saved_link;
340 /*
341 * Ensure that the refcount doesn't hit zero while any readers
342 * can see it. Similar to atomic_dec_and_lock(), but for an
343 * rwlock
344 */
345 if (atomic_add_unless(&cg->refcount, -1, 1))
346 return;
347 write_lock(&css_set_lock);
348 if (!atomic_dec_and_test(&cg->refcount)) {
349 write_unlock(&css_set_lock);
350 return;
351 }
352
353 /* This css_set is dead. unlink it and release cgroup refcounts */
354 hlist_del(&cg->hlist);
355 css_set_count--;
356 364
365 write_lock(&css_set_lock);
357 list_for_each_entry_safe(link, saved_link, &cg->cg_links, 366 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
358 cg_link_list) { 367 cg_link_list) {
359 struct cgroup *cgrp = link->cgrp; 368 struct cgroup *cgrp = link->cgrp;
360 list_del(&link->cg_link_list); 369 list_del(&link->cg_link_list);
361 list_del(&link->cgrp_link_list); 370 list_del(&link->cgrp_link_list);
362 if (atomic_dec_and_test(&cgrp->count) && 371 if (atomic_dec_and_test(&cgrp->count)) {
363 notify_on_release(cgrp)) {
364 if (taskexit)
365 set_bit(CGRP_RELEASABLE, &cgrp->flags);
366 check_for_release(cgrp); 372 check_for_release(cgrp);
373 cgroup_wakeup_rmdir_waiter(cgrp);
367 } 374 }
368
369 kfree(link); 375 kfree(link);
370 } 376 }
371
372 write_unlock(&css_set_lock); 377 write_unlock(&css_set_lock);
373 kfree_rcu(cg, rcu_head); 378
379 kfree(cg);
380}
381
382static void free_css_set_rcu(struct rcu_head *obj)
383{
384 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
385
386 INIT_WORK(&cg->work, free_css_set_work);
387 schedule_work(&cg->work);
374} 388}
375 389
390/* We don't maintain the lists running through each css_set to its
391 * task until after the first call to cgroup_iter_start(). This
392 * reduces the fork()/exit() overhead for people who have cgroups
393 * compiled into their kernel but not actually in use */
394static int use_task_css_set_links __read_mostly;
395
376/* 396/*
377 * refcounted get/put for css_set objects 397 * refcounted get/put for css_set objects
378 */ 398 */
@@ -381,14 +401,26 @@ static inline void get_css_set(struct css_set *cg)
381 atomic_inc(&cg->refcount); 401 atomic_inc(&cg->refcount);
382} 402}
383 403
384static inline void put_css_set(struct css_set *cg) 404static void put_css_set(struct css_set *cg)
385{ 405{
386 __put_css_set(cg, 0); 406 /*
387} 407 * Ensure that the refcount doesn't hit zero while any readers
408 * can see it. Similar to atomic_dec_and_lock(), but for an
409 * rwlock
410 */
411 if (atomic_add_unless(&cg->refcount, -1, 1))
412 return;
413 write_lock(&css_set_lock);
414 if (!atomic_dec_and_test(&cg->refcount)) {
415 write_unlock(&css_set_lock);
416 return;
417 }
388 418
389static inline void put_css_set_taskexit(struct css_set *cg) 419 hlist_del(&cg->hlist);
390{ 420 css_set_count--;
391 __put_css_set(cg, 1); 421
422 write_unlock(&css_set_lock);
423 call_rcu(&cg->rcu_head, free_css_set_rcu);
392} 424}
393 425
394/* 426/*
@@ -720,9 +752,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
720 * cgroup_attach_task(), which overwrites one tasks cgroup pointer with 752 * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
721 * another. It does so using cgroup_mutex, however there are 753 * another. It does so using cgroup_mutex, however there are
722 * several performance critical places that need to reference 754 * several performance critical places that need to reference
723 * task->cgroup without the expense of grabbing a system global 755 * task->cgroups without the expense of grabbing a system global
724 * mutex. Therefore except as noted below, when dereferencing or, as 756 * mutex. Therefore except as noted below, when dereferencing or, as
725 * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use 757 * in cgroup_attach_task(), modifying a task's cgroups pointer we use
726 * task_lock(), which acts on a spinlock (task->alloc_lock) already in 758 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
727 * the task_struct routinely used for such matters. 759 * the task_struct routinely used for such matters.
728 * 760 *
@@ -912,33 +944,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
912} 944}
913 945
914/* 946/*
915 * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
916 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
917 * reference to css->refcnt. In general, this refcnt is expected to goes down
918 * to zero, soon.
919 *
920 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
921 */
922DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
923
924static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
925{
926 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
927 wake_up_all(&cgroup_rmdir_waitq);
928}
929
930void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
931{
932 css_get(css);
933}
934
935void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
936{
937 cgroup_wakeup_rmdir_waiter(css->cgroup);
938 css_put(css);
939}
940
941/*
942 * Call with cgroup_mutex held. Drops reference counts on modules, including 947 * Call with cgroup_mutex held. Drops reference counts on modules, including
943 * any duplicate ones that parse_cgroupfs_options took. If this function 948 * any duplicate ones that parse_cgroupfs_options took. If this function
944 * returns an error, no reference counts are touched. 949 * returns an error, no reference counts are touched.
@@ -1173,10 +1178,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1173 1178
1174 /* 1179 /*
1175 * If the 'all' option was specified select all the subsystems, 1180 * If the 'all' option was specified select all the subsystems,
1176 * otherwise 'all, 'none' and a subsystem name options were not 1181 * otherwise if 'none', 'name=' and a subsystem name options
1177 * specified, let's default to 'all' 1182 * were not specified, let's default to 'all'
1178 */ 1183 */
1179 if (all_ss || (!all_ss && !one_ss && !opts->none)) { 1184 if (all_ss || (!one_ss && !opts->none && !opts->name)) {
1180 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1185 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1181 struct cgroup_subsys *ss = subsys[i]; 1186 struct cgroup_subsys *ss = subsys[i];
1182 if (ss == NULL) 1187 if (ss == NULL)
@@ -1514,6 +1519,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1514 struct cgroup *root_cgrp = &root->top_cgroup; 1519 struct cgroup *root_cgrp = &root->top_cgroup;
1515 struct inode *inode; 1520 struct inode *inode;
1516 struct cgroupfs_root *existing_root; 1521 struct cgroupfs_root *existing_root;
1522 const struct cred *cred;
1517 int i; 1523 int i;
1518 1524
1519 BUG_ON(sb->s_root != NULL); 1525 BUG_ON(sb->s_root != NULL);
@@ -1593,7 +1599,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1593 BUG_ON(!list_empty(&root_cgrp->children)); 1599 BUG_ON(!list_empty(&root_cgrp->children));
1594 BUG_ON(root->number_of_cgroups != 1); 1600 BUG_ON(root->number_of_cgroups != 1);
1595 1601
1602 cred = override_creds(&init_cred);
1596 cgroup_populate_dir(root_cgrp); 1603 cgroup_populate_dir(root_cgrp);
1604 revert_creds(cred);
1597 mutex_unlock(&cgroup_mutex); 1605 mutex_unlock(&cgroup_mutex);
1598 mutex_unlock(&inode->i_mutex); 1606 mutex_unlock(&inode->i_mutex);
1599 } else { 1607 } else {
@@ -1697,7 +1705,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1697{ 1705{
1698 char *start; 1706 char *start;
1699 struct dentry *dentry = rcu_dereference_check(cgrp->dentry, 1707 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1700 rcu_read_lock_held() ||
1701 cgroup_lock_is_held()); 1708 cgroup_lock_is_held());
1702 1709
1703 if (!dentry || cgrp == dummytop) { 1710 if (!dentry || cgrp == dummytop) {
@@ -1723,7 +1730,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1723 break; 1730 break;
1724 1731
1725 dentry = rcu_dereference_check(cgrp->dentry, 1732 dentry = rcu_dereference_check(cgrp->dentry,
1726 rcu_read_lock_held() ||
1727 cgroup_lock_is_held()); 1733 cgroup_lock_is_held());
1728 if (!cgrp->parent) 1734 if (!cgrp->parent)
1729 continue; 1735 continue;
@@ -1820,6 +1826,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1820 struct cgroup_subsys *ss, *failed_ss = NULL; 1826 struct cgroup_subsys *ss, *failed_ss = NULL;
1821 struct cgroup *oldcgrp; 1827 struct cgroup *oldcgrp;
1822 struct cgroupfs_root *root = cgrp->root; 1828 struct cgroupfs_root *root = cgrp->root;
1829 struct css_set *cg;
1823 1830
1824 /* Nothing to do if the task is already in that cgroup */ 1831 /* Nothing to do if the task is already in that cgroup */
1825 oldcgrp = task_cgroup_from_root(tsk, root); 1832 oldcgrp = task_cgroup_from_root(tsk, root);
@@ -1849,6 +1856,11 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1849 } 1856 }
1850 } 1857 }
1851 1858
1859 task_lock(tsk);
1860 cg = tsk->cgroups;
1861 get_css_set(cg);
1862 task_unlock(tsk);
1863
1852 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); 1864 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
1853 if (retval) 1865 if (retval)
1854 goto out; 1866 goto out;
@@ -1861,8 +1873,9 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1861 if (ss->attach) 1873 if (ss->attach)
1862 ss->attach(ss, cgrp, oldcgrp, tsk); 1874 ss->attach(ss, cgrp, oldcgrp, tsk);
1863 } 1875 }
1864 1876 set_bit(CGRP_RELEASABLE, &cgrp->flags);
1865 synchronize_rcu(); 1877 /* put_css_set will not destroy cg until after an RCU grace period */
1878 put_css_set(cg);
1866 1879
1867 /* 1880 /*
1868 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1881 * wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -2095,11 +2108,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2095 continue; 2108 continue;
2096 /* get old css_set pointer */ 2109 /* get old css_set pointer */
2097 task_lock(tsk); 2110 task_lock(tsk);
2098 if (tsk->flags & PF_EXITING) {
2099 /* ignore this task if it's going away */
2100 task_unlock(tsk);
2101 continue;
2102 }
2103 oldcg = tsk->cgroups; 2111 oldcg = tsk->cgroups;
2104 get_css_set(oldcg); 2112 get_css_set(oldcg);
2105 task_unlock(tsk); 2113 task_unlock(tsk);
@@ -2189,6 +2197,24 @@ out_free_group_list:
2189 return retval; 2197 return retval;
2190} 2198}
2191 2199
2200static int cgroup_allow_attach(struct cgroup *cgrp, struct task_struct *tsk)
2201{
2202 struct cgroup_subsys *ss;
2203 int ret;
2204
2205 for_each_subsys(cgrp->root, ss) {
2206 if (ss->allow_attach) {
2207 ret = ss->allow_attach(cgrp, tsk);
2208 if (ret)
2209 return ret;
2210 } else {
2211 return -EACCES;
2212 }
2213 }
2214
2215 return 0;
2216}
2217
2192/* 2218/*
2193 * Find the task_struct of the task to attach by vpid and pass it along to the 2219 * Find the task_struct of the task to attach by vpid and pass it along to the
2194 * function to attach either it or all tasks in its threadgroup. Will take 2220 * function to attach either it or all tasks in its threadgroup. Will take
@@ -2234,9 +2260,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2234 if (cred->euid && 2260 if (cred->euid &&
2235 cred->euid != tcred->uid && 2261 cred->euid != tcred->uid &&
2236 cred->euid != tcred->suid) { 2262 cred->euid != tcred->suid) {
2237 rcu_read_unlock(); 2263 /*
2238 cgroup_unlock(); 2264 * if the default permission check fails, give each
2239 return -EACCES; 2265 * cgroup a chance to extend the permission check
2266 */
2267 ret = cgroup_allow_attach(cgrp, tsk);
2268 if (ret) {
2269 rcu_read_unlock();
2270 cgroup_unlock();
2271 return ret;
2272 }
2240 } 2273 }
2241 get_task_struct(tsk); 2274 get_task_struct(tsk);
2242 rcu_read_unlock(); 2275 rcu_read_unlock();
@@ -3542,7 +3575,8 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3542 } 3575 }
3543 3576
3544 /* the process need read permission on control file */ 3577 /* the process need read permission on control file */
3545 ret = file_permission(cfile, MAY_READ); 3578 /* AV: shouldn't we check that it's been opened for read instead? */
3579 ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
3546 if (ret < 0) 3580 if (ret < 0)
3547 goto fail; 3581 goto fail;
3548 3582
@@ -3810,6 +3844,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3810 if (err < 0) 3844 if (err < 0)
3811 goto err_remove; 3845 goto err_remove;
3812 3846
3847 set_bit(CGRP_RELEASABLE, &parent->flags);
3848
3813 /* The cgroup directory was pre-locked for us */ 3849 /* The cgroup directory was pre-locked for us */
3814 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); 3850 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
3815 3851
@@ -3941,6 +3977,21 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
3941 return !failed; 3977 return !failed;
3942} 3978}
3943 3979
3980/* checks if all of the css_sets attached to a cgroup have a refcount of 0.
3981 * Must be called with css_set_lock held */
3982static int cgroup_css_sets_empty(struct cgroup *cgrp)
3983{
3984 struct cg_cgroup_link *link;
3985
3986 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
3987 struct css_set *cg = link->cg;
3988 if (atomic_read(&cg->refcount) > 0)
3989 return 0;
3990 }
3991
3992 return 1;
3993}
3994
3944static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 3995static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
3945{ 3996{
3946 struct cgroup *cgrp = dentry->d_fsdata; 3997 struct cgroup *cgrp = dentry->d_fsdata;
@@ -3953,7 +4004,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
3953 /* the vfs holds both inode->i_mutex already */ 4004 /* the vfs holds both inode->i_mutex already */
3954again: 4005again:
3955 mutex_lock(&cgroup_mutex); 4006 mutex_lock(&cgroup_mutex);
3956 if (atomic_read(&cgrp->count) != 0) { 4007 if (!cgroup_css_sets_empty(cgrp)) {
3957 mutex_unlock(&cgroup_mutex); 4008 mutex_unlock(&cgroup_mutex);
3958 return -EBUSY; 4009 return -EBUSY;
3959 } 4010 }
@@ -3986,7 +4037,7 @@ again:
3986 4037
3987 mutex_lock(&cgroup_mutex); 4038 mutex_lock(&cgroup_mutex);
3988 parent = cgrp->parent; 4039 parent = cgrp->parent;
3989 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { 4040 if (!cgroup_css_sets_empty(cgrp) || !list_empty(&cgrp->children)) {
3990 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 4041 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
3991 mutex_unlock(&cgroup_mutex); 4042 mutex_unlock(&cgroup_mutex);
3992 return -EBUSY; 4043 return -EBUSY;
@@ -4026,7 +4077,6 @@ again:
4026 cgroup_d_remove_dir(d); 4077 cgroup_d_remove_dir(d);
4027 dput(d); 4078 dput(d);
4028 4079
4029 set_bit(CGRP_RELEASABLE, &parent->flags);
4030 check_for_release(parent); 4080 check_for_release(parent);
4031 4081
4032 /* 4082 /*
@@ -4626,7 +4676,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4626 task_unlock(tsk); 4676 task_unlock(tsk);
4627 4677
4628 if (cg) 4678 if (cg)
4629 put_css_set_taskexit(cg); 4679 put_css_set(cg);
4630} 4680}
4631 4681
4632/** 4682/**
@@ -4680,6 +4730,14 @@ static void check_for_release(struct cgroup *cgrp)
4680} 4730}
4681 4731
4682/* Caller must verify that the css is not for root cgroup */ 4732/* Caller must verify that the css is not for root cgroup */
4733void __css_get(struct cgroup_subsys_state *css, int count)
4734{
4735 atomic_add(count, &css->refcnt);
4736 set_bit(CGRP_RELEASABLE, &css->cgroup->flags);
4737}
4738EXPORT_SYMBOL_GPL(__css_get);
4739
4740/* Caller must verify that the css is not for root cgroup */
4683void __css_put(struct cgroup_subsys_state *css, int count) 4741void __css_put(struct cgroup_subsys_state *css, int count)
4684{ 4742{
4685 struct cgroup *cgrp = css->cgroup; 4743 struct cgroup *cgrp = css->cgroup;
@@ -4687,10 +4745,7 @@ void __css_put(struct cgroup_subsys_state *css, int count)
4687 rcu_read_lock(); 4745 rcu_read_lock();
4688 val = atomic_sub_return(count, &css->refcnt); 4746 val = atomic_sub_return(count, &css->refcnt);
4689 if (val == 1) { 4747 if (val == 1) {
4690 if (notify_on_release(cgrp)) { 4748 check_for_release(cgrp);
4691 set_bit(CGRP_RELEASABLE, &cgrp->flags);
4692 check_for_release(cgrp);
4693 }
4694 cgroup_wakeup_rmdir_waiter(cgrp); 4749 cgroup_wakeup_rmdir_waiter(cgrp);
4695 } 4750 }
4696 rcu_read_unlock(); 4751 rcu_read_unlock();
@@ -4813,8 +4868,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
4813 * on this or this is under rcu_read_lock(). Once css->id is allocated, 4868 * on this or this is under rcu_read_lock(). Once css->id is allocated,
4814 * it's unchanged until freed. 4869 * it's unchanged until freed.
4815 */ 4870 */
4816 cssid = rcu_dereference_check(css->id, 4871 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
4817 rcu_read_lock_held() || atomic_read(&css->refcnt));
4818 4872
4819 if (cssid) 4873 if (cssid)
4820 return cssid->id; 4874 return cssid->id;
@@ -4826,8 +4880,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
4826{ 4880{
4827 struct css_id *cssid; 4881 struct css_id *cssid;
4828 4882
4829 cssid = rcu_dereference_check(css->id, 4883 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
4830 rcu_read_lock_held() || atomic_read(&css->refcnt));
4831 4884
4832 if (cssid) 4885 if (cssid)
4833 return cssid->depth; 4886 return cssid->depth;
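
The new cgroup_allow_attach() path lets a subsystem extend the default uid-based attach permission check through an allow_attach() callback. A hedged sketch of such a callback follows; the capability test and uid comparison are illustrative assumptions, not taken from this diff.

	static int example_allow_attach(struct cgroup *cgrp, struct task_struct *tsk)
	{
		const struct cred *tcred = __task_cred(tsk);	/* caller holds rcu_read_lock() */

		/* e.g. let a capable system daemon move tasks it does not own */
		if (capable(CAP_SYS_NICE) || current_euid() == tcred->uid)
			return 0;
		return -EACCES;
	}
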
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e691818d7e4..a3f638ac3de 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -153,6 +153,13 @@ static void freezer_destroy(struct cgroup_subsys *ss,
153 kfree(cgroup_freezer(cgroup)); 153 kfree(cgroup_freezer(cgroup));
154} 154}
155 155
156/* task is frozen or will freeze immediately when next it gets woken */
157static bool is_task_frozen_enough(struct task_struct *task)
158{
159 return frozen(task) ||
160 (task_is_stopped_or_traced(task) && freezing(task));
161}
162
156/* 163/*
157 * The call to cgroup_lock() in the freezer.state write method prevents 164 * The call to cgroup_lock() in the freezer.state write method prevents
158 * a write to that file racing against an attach, and hence the 165 * a write to that file racing against an attach, and hence the
@@ -231,7 +238,7 @@ static void update_if_frozen(struct cgroup *cgroup,
231 cgroup_iter_start(cgroup, &it); 238 cgroup_iter_start(cgroup, &it);
232 while ((task = cgroup_iter_next(cgroup, &it))) { 239 while ((task = cgroup_iter_next(cgroup, &it))) {
233 ntotal++; 240 ntotal++;
234 if (frozen(task)) 241 if (is_task_frozen_enough(task))
235 nfrozen++; 242 nfrozen++;
236 } 243 }
237 244
@@ -284,7 +291,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
284 while ((task = cgroup_iter_next(cgroup, &it))) { 291 while ((task = cgroup_iter_next(cgroup, &it))) {
285 if (!freeze_task(task, true)) 292 if (!freeze_task(task, true))
286 continue; 293 continue;
287 if (frozen(task)) 294 if (is_task_frozen_enough(task))
288 continue; 295 continue;
289 if (!freezing(task) && !freezer_should_skip(task)) 296 if (!freezing(task) && !freezer_should_skip(task))
290 num_cant_freeze_now++; 297 num_cant_freeze_now++;
diff --git a/kernel/compat.c b/kernel/compat.c
index fc9eb093acd..e2435ee9993 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -158,6 +158,7 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
158 __put_user(ts->tv_sec, &cts->tv_sec) || 158 __put_user(ts->tv_sec, &cts->tv_sec) ||
159 __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; 159 __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
160} 160}
161EXPORT_SYMBOL_GPL(put_compat_timespec);
161 162
162static long compat_nanosleep_restart(struct restart_block *restart) 163static long compat_nanosleep_restart(struct restart_block *restart)
163{ 164{
@@ -890,6 +891,7 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
890 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); 891 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
891 } 892 }
892} 893}
894EXPORT_SYMBOL_GPL(sigset_from_compat);
893 895
894asmlinkage long 896asmlinkage long
895compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, 897compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
@@ -991,11 +993,8 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
991 sigset_from_compat(&newset, &newset32); 993 sigset_from_compat(&newset, &newset32);
992 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 994 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
993 995
994 spin_lock_irq(&current->sighand->siglock);
995 current->saved_sigmask = current->blocked; 996 current->saved_sigmask = current->blocked;
996 current->blocked = newset; 997 set_current_blocked(&newset);
997 recalc_sigpending();
998 spin_unlock_irq(&current->sighand->siglock);
999 998
1000 current->state = TASK_INTERRUPTIBLE; 999 current->state = TASK_INTERRUPTIBLE;
1001 schedule(); 1000 schedule();
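
The sigsuspend hunk above switches from open-coded siglock handling to set_current_blocked(), which updates the blocked mask and recalculates pending signals itself. A minimal sketch of the same pattern, with an arbitrary example mask:

	sigset_t newset;

	siginitset(&newset, sigmask(SIGUSR1));		/* arbitrary example signal */
	sigdelsetmask(&newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
	current->saved_sigmask = current->blocked;
	set_current_blocked(&newset);			/* takes siglock internally */
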
diff --git a/kernel/configs.c b/kernel/configs.c
index b4066b44a99..42e8fa075ee 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -92,8 +92,8 @@ static void __exit ikconfig_cleanup(void)
92module_init(ikconfig_init); 92module_init(ikconfig_init);
93module_exit(ikconfig_cleanup); 93module_exit(ikconfig_cleanup);
94 94
95#endif /* CONFIG_IKCONFIG_PROC */
96
95MODULE_LICENSE("GPL"); 97MODULE_LICENSE("GPL");
96MODULE_AUTHOR("Randy Dunlap"); 98MODULE_AUTHOR("Randy Dunlap");
97MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel"); 99MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
98
99#endif /* CONFIG_IKCONFIG_PROC */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 12b7458f23b..eae3d9b3957 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,6 +15,7 @@
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/gfp.h> 17#include <linux/gfp.h>
18#include <linux/suspend.h>
18 19
19#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
20/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 21/* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -476,6 +477,79 @@ static int alloc_frozen_cpus(void)
476 return 0; 477 return 0;
477} 478}
478core_initcall(alloc_frozen_cpus); 479core_initcall(alloc_frozen_cpus);
480
481/*
482 * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU
483 * hotplug when tasks are about to be frozen. Also, don't allow the freezer
484 * to continue until any currently running CPU hotplug operation gets
485 * completed.
486 * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the
487 * 'cpu_add_remove_lock'. And this same lock is also taken by the regular
488 * CPU hotplug path and released only after it is complete. Thus, we
489 * (and hence the freezer) will block here until any currently running CPU
490 * hotplug operation gets completed.
491 */
492void cpu_hotplug_disable_before_freeze(void)
493{
494 cpu_maps_update_begin();
495 cpu_hotplug_disabled = 1;
496 cpu_maps_update_done();
497}
498
499
500/*
501 * When tasks have been thawed, re-enable regular CPU hotplug (which had been
502 * disabled while beginning to freeze tasks).
503 */
504void cpu_hotplug_enable_after_thaw(void)
505{
506 cpu_maps_update_begin();
507 cpu_hotplug_disabled = 0;
508 cpu_maps_update_done();
509}
510
511/*
512 * When callbacks for CPU hotplug notifications are being executed, we must
513 * ensure that the state of the system with respect to the tasks being frozen
514 * or not, as reported by the notification, remains unchanged *throughout the
515 * duration* of the execution of the callbacks.
516 * Hence we need to prevent the freezer from racing with regular CPU hotplug.
517 *
518 * This synchronization is implemented by mutually excluding regular CPU
519 * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
520 * Hibernate notifications.
521 */
522static int
523cpu_hotplug_pm_callback(struct notifier_block *nb,
524 unsigned long action, void *ptr)
525{
526 switch (action) {
527
528 case PM_SUSPEND_PREPARE:
529 case PM_HIBERNATION_PREPARE:
530 cpu_hotplug_disable_before_freeze();
531 break;
532
533 case PM_POST_SUSPEND:
534 case PM_POST_HIBERNATION:
535 cpu_hotplug_enable_after_thaw();
536 break;
537
538 default:
539 return NOTIFY_DONE;
540 }
541
542 return NOTIFY_OK;
543}
544
545
546int cpu_hotplug_pm_sync_init(void)
547{
548 pm_notifier(cpu_hotplug_pm_callback, 0);
549 return 0;
550}
551core_initcall(cpu_hotplug_pm_sync_init);
552
479#endif /* CONFIG_PM_SLEEP_SMP */ 553#endif /* CONFIG_PM_SLEEP_SMP */
480 554
481/** 555/**
@@ -594,3 +668,23 @@ void init_cpu_online(const struct cpumask *src)
594{ 668{
595 cpumask_copy(to_cpumask(cpu_online_bits), src); 669 cpumask_copy(to_cpumask(cpu_online_bits), src);
596} 670}
671
672static ATOMIC_NOTIFIER_HEAD(idle_notifier);
673
674void idle_notifier_register(struct notifier_block *n)
675{
676 atomic_notifier_chain_register(&idle_notifier, n);
677}
678EXPORT_SYMBOL_GPL(idle_notifier_register);
679
680void idle_notifier_unregister(struct notifier_block *n)
681{
682 atomic_notifier_chain_unregister(&idle_notifier, n);
683}
684EXPORT_SYMBOL_GPL(idle_notifier_unregister);
685
686void idle_notifier_call_chain(unsigned long val)
687{
688 atomic_notifier_call_chain(&idle_notifier, val, NULL);
689}
690EXPORT_SYMBOL_GPL(idle_notifier_call_chain);
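
For the idle notifier hooks added at the end of cpu.c, a hedged usage sketch; the event value is whatever the architecture's idle loop passes to idle_notifier_call_chain(), so it is left uninterpreted here.

	static int example_idle_notify(struct notifier_block *nb,
				       unsigned long val, void *data)
	{
		/* val is defined by the arch idle loop calling idle_notifier_call_chain() */
		return NOTIFY_OK;
	}

	static struct notifier_block example_idle_nb = {
		.notifier_call = example_idle_notify,
	};

	static int __init example_idle_init(void)
	{
		idle_notifier_register(&example_idle_nb);
		return 0;
	}
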
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9c9b7545c81..10131fdaff7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -55,7 +55,7 @@
55#include <linux/sort.h> 55#include <linux/sort.h>
56 56
57#include <asm/uaccess.h> 57#include <asm/uaccess.h>
58#include <asm/atomic.h> 58#include <linux/atomic.h>
59#include <linux/mutex.h> 59#include <linux/mutex.h>
60#include <linux/workqueue.h> 60#include <linux/workqueue.h>
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
@@ -2460,11 +2460,19 @@ static int cpuset_spread_node(int *rotor)
2460 2460
2461int cpuset_mem_spread_node(void) 2461int cpuset_mem_spread_node(void)
2462{ 2462{
2463 if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
2464 current->cpuset_mem_spread_rotor =
2465 node_random(&current->mems_allowed);
2466
2463 return cpuset_spread_node(&current->cpuset_mem_spread_rotor); 2467 return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
2464} 2468}
2465 2469
2466int cpuset_slab_spread_node(void) 2470int cpuset_slab_spread_node(void)
2467{ 2471{
2472 if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
2473 current->cpuset_slab_spread_rotor =
2474 node_random(&current->mems_allowed);
2475
2468 return cpuset_spread_node(&current->cpuset_slab_spread_rotor); 2476 return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
2469} 2477}
2470 2478
diff --git a/kernel/cred.c b/kernel/cred.c
index 174fa84eca3..8ef31f53c44 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -508,10 +508,8 @@ int commit_creds(struct cred *new)
508 key_fsgid_changed(task); 508 key_fsgid_changed(task);
509 509
510 /* do it 510 /* do it
511 * - What if a process setreuid()'s and this brings the 511 * RLIMIT_NPROC limits on user->processes have already been checked
512 * new uid over his NPROC rlimit? We can check this now 512 * in set_user().
513 * cheaply with the new uid cache, so if it matters
514 * we should be checking for it. -DaveM
515 */ 513 */
516 alter_cred_subscribers(new, 2); 514 alter_cred_subscribers(new, 2);
517 if (new->user != old->user) 515 if (new->user != old->user)
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index bad6786dee8..0d7c08784ef 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -51,7 +51,7 @@
51 51
52#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
53#include <asm/byteorder.h> 53#include <asm/byteorder.h>
54#include <asm/atomic.h> 54#include <linux/atomic.h>
55#include <asm/system.h> 55#include <asm/system.h>
56 56
57#include "debug_core.h" 57#include "debug_core.h"
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index a11db956dd6..34872482315 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -42,6 +42,8 @@
42/* Our I/O buffers. */ 42/* Our I/O buffers. */
43static char remcom_in_buffer[BUFMAX]; 43static char remcom_in_buffer[BUFMAX];
44static char remcom_out_buffer[BUFMAX]; 44static char remcom_out_buffer[BUFMAX];
45static int gdbstub_use_prev_in_buf;
46static int gdbstub_prev_in_buf_pos;
45 47
46/* Storage for the registers, in GDB format. */ 48/* Storage for the registers, in GDB format. */
47static unsigned long gdb_regs[(NUMREGBYTES + 49static unsigned long gdb_regs[(NUMREGBYTES +
@@ -58,6 +60,13 @@ static int gdbstub_read_wait(void)
58 int ret = -1; 60 int ret = -1;
59 int i; 61 int i;
60 62
63 if (unlikely(gdbstub_use_prev_in_buf)) {
64 if (gdbstub_prev_in_buf_pos < gdbstub_use_prev_in_buf)
65 return remcom_in_buffer[gdbstub_prev_in_buf_pos++];
66 else
67 gdbstub_use_prev_in_buf = 0;
68 }
69
61 /* poll any additional I/O interfaces that are defined */ 70 /* poll any additional I/O interfaces that are defined */
62 while (ret < 0) 71 while (ret < 0)
63 for (i = 0; kdb_poll_funcs[i] != NULL; i++) { 72 for (i = 0; kdb_poll_funcs[i] != NULL; i++) {
@@ -109,7 +118,6 @@ static void get_packet(char *buffer)
109 buffer[count] = ch; 118 buffer[count] = ch;
110 count = count + 1; 119 count = count + 1;
111 } 120 }
112 buffer[count] = 0;
113 121
114 if (ch == '#') { 122 if (ch == '#') {
115 xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; 123 xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
@@ -124,6 +132,7 @@ static void get_packet(char *buffer)
124 if (dbg_io_ops->flush) 132 if (dbg_io_ops->flush)
125 dbg_io_ops->flush(); 133 dbg_io_ops->flush();
126 } 134 }
135 buffer[count] = 0;
127 } while (checksum != xmitcsum); 136 } while (checksum != xmitcsum);
128} 137}
129 138
@@ -1082,12 +1091,11 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
1082 case 'c': 1091 case 'c':
1083 strcpy(remcom_in_buffer, cmd); 1092 strcpy(remcom_in_buffer, cmd);
1084 return 0; 1093 return 0;
1085 case '?': 1094 case '$':
1086 gdb_cmd_status(ks); 1095 strcpy(remcom_in_buffer, cmd);
1087 break; 1096 gdbstub_use_prev_in_buf = strlen(remcom_in_buffer);
1088 case '\0': 1097 gdbstub_prev_in_buf_pos = 0;
1089 strcpy(remcom_out_buffer, ""); 1098 return 0;
1090 break;
1091 } 1099 }
1092 dbg_io_ops->write_char('+'); 1100 dbg_io_ops->write_char('+');
1093 put_packet(remcom_out_buffer); 1101 put_packet(remcom_out_buffer);
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 2f62fe85f16..7179eac7b41 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -112,9 +112,8 @@ kdb_bt(int argc, const char **argv)
112 unsigned long addr; 112 unsigned long addr;
113 long offset; 113 long offset;
114 114
115 kdbgetintenv("BTARGS", &argcount); /* Arguments to print */ 115 /* Prompt after each proc in bta */
116 kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each 116 kdbgetintenv("BTAPROMPT", &btaprompt);
117 * proc in bta */
118 117
119 if (strcmp(argv[0], "bta") == 0) { 118 if (strcmp(argv[0], "bta") == 0) {
120 struct task_struct *g, *p; 119 struct task_struct *g, *p;
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
index 56c88e4db30..9834ad303ab 100644
--- a/kernel/debug/kdb/kdb_cmds
+++ b/kernel/debug/kdb/kdb_cmds
@@ -18,16 +18,12 @@ defcmd dumpcommon "" "Common kdb debugging"
18endefcmd 18endefcmd
19 19
20defcmd dumpall "" "First line debugging" 20defcmd dumpall "" "First line debugging"
21 set BTSYMARG 1
22 set BTARGS 9
23 pid R 21 pid R
24 -dumpcommon 22 -dumpcommon
25 -bta 23 -bta
26endefcmd 24endefcmd
27 25
28defcmd dumpcpu "" "Same as dumpall but only tasks on cpus" 26defcmd dumpcpu "" "Same as dumpall but only tasks on cpus"
29 set BTSYMARG 1
30 set BTARGS 9
31 pid R 27 pid R
32 -dumpcommon 28 -dumpcommon
33 -btc 29 -btc
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index dd0b1b7dd02..d9ca9aa481e 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -30,6 +30,8 @@ EXPORT_SYMBOL_GPL(kdb_poll_funcs);
30int kdb_poll_idx = 1; 30int kdb_poll_idx = 1;
31EXPORT_SYMBOL_GPL(kdb_poll_idx); 31EXPORT_SYMBOL_GPL(kdb_poll_idx);
32 32
33static struct kgdb_state *kdb_ks;
34
33int kdb_stub(struct kgdb_state *ks) 35int kdb_stub(struct kgdb_state *ks)
34{ 36{
35 int error = 0; 37 int error = 0;
@@ -39,6 +41,7 @@ int kdb_stub(struct kgdb_state *ks)
39 kdb_dbtrap_t db_result = KDB_DB_NOBPT; 41 kdb_dbtrap_t db_result = KDB_DB_NOBPT;
40 int i; 42 int i;
41 43
44 kdb_ks = ks;
42 if (KDB_STATE(REENTRY)) { 45 if (KDB_STATE(REENTRY)) {
43 reason = KDB_REASON_SWITCH; 46 reason = KDB_REASON_SWITCH;
44 KDB_STATE_CLEAR(REENTRY); 47 KDB_STATE_CLEAR(REENTRY);
@@ -123,20 +126,8 @@ int kdb_stub(struct kgdb_state *ks)
123 KDB_STATE_CLEAR(PAGER); 126 KDB_STATE_CLEAR(PAGER);
124 kdbnearsym_cleanup(); 127 kdbnearsym_cleanup();
125 if (error == KDB_CMD_KGDB) { 128 if (error == KDB_CMD_KGDB) {
126 if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) { 129 if (KDB_STATE(DOING_KGDB))
127 /*
128 * This inteface glue which allows kdb to transition in into
129 * the gdb stub. In order to do this the '?' or '' gdb serial
130 * packet response is processed here. And then control is
131 * passed to the gdbstub.
132 */
133 if (KDB_STATE(DOING_KGDB))
134 gdbstub_state(ks, "?");
135 else
136 gdbstub_state(ks, "");
137 KDB_STATE_CLEAR(DOING_KGDB); 130 KDB_STATE_CLEAR(DOING_KGDB);
138 KDB_STATE_CLEAR(DOING_KGDB2);
139 }
140 return DBG_PASS_EVENT; 131 return DBG_PASS_EVENT;
141 } 132 }
142 kdb_bp_install(ks->linux_regs); 133 kdb_bp_install(ks->linux_regs);
@@ -166,3 +157,7 @@ int kdb_stub(struct kgdb_state *ks)
166 return kgdb_info[ks->cpu].ret_state; 157 return kgdb_info[ks->cpu].ret_state;
167} 158}
168 159
160void kdb_gdb_state_pass(char *buf)
161{
162 gdbstub_state(kdb_ks, buf);
163}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 96fdaac46a8..4802eb5840e 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -31,15 +31,21 @@ char kdb_prompt_str[CMD_BUFLEN];
31 31
32int kdb_trap_printk; 32int kdb_trap_printk;
33 33
34static void kgdb_transition_check(char *buffer) 34static int kgdb_transition_check(char *buffer)
35{ 35{
36 int slen = strlen(buffer); 36 if (buffer[0] != '+' && buffer[0] != '$') {
37 if (strncmp(buffer, "$?#3f", slen) != 0 &&
38 strncmp(buffer, "$qSupported#37", slen) != 0 &&
39 strncmp(buffer, "+$qSupported#37", slen) != 0) {
40 KDB_STATE_SET(KGDB_TRANS); 37 KDB_STATE_SET(KGDB_TRANS);
41 kdb_printf("%s", buffer); 38 kdb_printf("%s", buffer);
39 } else {
40 int slen = strlen(buffer);
41 if (slen > 3 && buffer[slen - 3] == '#') {
42 kdb_gdb_state_pass(buffer);
43 strcpy(buffer, "kgdb");
44 KDB_STATE_SET(DOING_KGDB);
45 return 1;
46 }
42 } 47 }
48 return 0;
43} 49}
44 50
45static int kdb_read_get_key(char *buffer, size_t bufsize) 51static int kdb_read_get_key(char *buffer, size_t bufsize)
@@ -251,6 +257,10 @@ poll_again:
251 case 13: /* enter */ 257 case 13: /* enter */
252 *lastchar++ = '\n'; 258 *lastchar++ = '\n';
253 *lastchar++ = '\0'; 259 *lastchar++ = '\0';
260 if (!KDB_STATE(KGDB_TRANS)) {
261 KDB_STATE_SET(KGDB_TRANS);
262 kdb_printf("%s", buffer);
263 }
254 kdb_printf("\n"); 264 kdb_printf("\n");
255 return buffer; 265 return buffer;
256 case 4: /* Del */ 266 case 4: /* Del */
@@ -382,22 +392,26 @@ poll_again:
382 * printed characters if we think that 392 * printed characters if we think that
383 * kgdb is connecting, until the check 393 * kgdb is connecting, until the check
384 * fails */ 394 * fails */
385 if (!KDB_STATE(KGDB_TRANS)) 395 if (!KDB_STATE(KGDB_TRANS)) {
386 kgdb_transition_check(buffer); 396 if (kgdb_transition_check(buffer))
387 else 397 return buffer;
398 } else {
388 kdb_printf("%c", key); 399 kdb_printf("%c", key);
400 }
389 } 401 }
390 /* Special escape to kgdb */ 402 /* Special escape to kgdb */
391 if (lastchar - buffer >= 5 && 403 if (lastchar - buffer >= 5 &&
392 strcmp(lastchar - 5, "$?#3f") == 0) { 404 strcmp(lastchar - 5, "$?#3f") == 0) {
405 kdb_gdb_state_pass(lastchar - 5);
393 strcpy(buffer, "kgdb"); 406 strcpy(buffer, "kgdb");
394 KDB_STATE_SET(DOING_KGDB); 407 KDB_STATE_SET(DOING_KGDB);
395 return buffer; 408 return buffer;
396 } 409 }
397 if (lastchar - buffer >= 14 && 410 if (lastchar - buffer >= 11 &&
398 strcmp(lastchar - 14, "$qSupported#37") == 0) { 411 strcmp(lastchar - 11, "$qSupported") == 0) {
412 kdb_gdb_state_pass(lastchar - 11);
399 strcpy(buffer, "kgdb"); 413 strcpy(buffer, "kgdb");
400 KDB_STATE_SET(DOING_KGDB2); 414 KDB_STATE_SET(DOING_KGDB);
401 return buffer; 415 return buffer;
402 } 416 }
403 } 417 }
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index be14779bcef..63786e71a3c 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -145,7 +145,6 @@ static char *__env[] = {
145#endif 145#endif
146 "RADIX=16", 146 "RADIX=16",
147 "MDCOUNT=8", /* lines of md output */ 147 "MDCOUNT=8", /* lines of md output */
148 "BTARGS=9", /* 9 possible args in bt */
149 KDB_PLATFORM_ENV, 148 KDB_PLATFORM_ENV,
150 "DTABCOUNT=30", 149 "DTABCOUNT=30",
151 "NOSECT=1", 150 "NOSECT=1",
@@ -172,6 +171,7 @@ static char *__env[] = {
172 (char *)0, 171 (char *)0,
173 (char *)0, 172 (char *)0,
174 (char *)0, 173 (char *)0,
174 (char *)0,
175}; 175};
176 176
177static const int __nenv = (sizeof(__env) / sizeof(char *)); 177static const int __nenv = (sizeof(__env) / sizeof(char *));
@@ -1386,7 +1386,7 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1386 } 1386 }
1387 1387
1388 if (result == KDB_CMD_KGDB) { 1388 if (result == KDB_CMD_KGDB) {
1389 if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2))) 1389 if (!KDB_STATE(DOING_KGDB))
1390 kdb_printf("Entering please attach debugger " 1390 kdb_printf("Entering please attach debugger "
1391 "or use $D#44+ or $3#33\n"); 1391 "or use $D#44+ or $3#33\n");
1392 break; 1392 break;
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 35d69ed1dfb..e381d105b40 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -21,7 +21,6 @@
21#define KDB_CMD_SS (-1003) 21#define KDB_CMD_SS (-1003)
22#define KDB_CMD_SSB (-1004) 22#define KDB_CMD_SSB (-1004)
23#define KDB_CMD_KGDB (-1005) 23#define KDB_CMD_KGDB (-1005)
24#define KDB_CMD_KGDB2 (-1006)
25 24
26/* Internal debug flags */ 25/* Internal debug flags */
27#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */ 26#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
@@ -146,7 +145,6 @@ extern int kdb_state;
146 * keyboard on this cpu */ 145 * keyboard on this cpu */
147#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */ 146#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */
148#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */ 147#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */
149#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */
150#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */ 148#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */
151#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch 149#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch
152 * specific use */ 150 * specific use */
@@ -218,6 +216,7 @@ extern void kdb_print_nameval(const char *name, unsigned long val);
218extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); 216extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
219extern void kdb_meminfo_proc_show(void); 217extern void kdb_meminfo_proc_show(void);
220extern char *kdb_getstr(char *, size_t, char *); 218extern char *kdb_getstr(char *, size_t, char *);
219extern void kdb_gdb_state_pass(char *buf);
221 220
222/* Defines for kdb_symbol_print */ 221/* Defines for kdb_symbol_print */
223#define KDB_SP_SPACEB 0x0001 /* Space before string */ 222#define KDB_SP_SPACEB 0x0001 /* Space before string */
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ead9b610aa7..418b3f7053a 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -19,8 +19,10 @@
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/sysctl.h> 20#include <linux/sysctl.h>
21#include <linux/delayacct.h> 21#include <linux/delayacct.h>
22#include <linux/module.h>
22 23
23int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ 24int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
25EXPORT_SYMBOL_GPL(delayacct_on);
24struct kmem_cache *delayacct_cache; 26struct kmem_cache *delayacct_cache;
25 27
26static int __init delayacct_setup_disable(char *str) 28static int __init delayacct_setup_disable(char *str)
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 1ce23d3d839..89e5e8aa4c3 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg 2CFLAGS_REMOVE_core.o = -pg
3endif 3endif
4 4
5obj-y := core.o 5obj-y := core.o ring_buffer.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9efe7108cca..0f857782d06 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,6 +36,8 @@
36#include <linux/ftrace_event.h> 36#include <linux/ftrace_event.h>
37#include <linux/hw_breakpoint.h> 37#include <linux/hw_breakpoint.h>
38 38
39#include "internal.h"
40
39#include <asm/irq_regs.h> 41#include <asm/irq_regs.h>
40 42
41struct remote_function_call { 43struct remote_function_call {
@@ -200,6 +202,22 @@ __get_cpu_context(struct perf_event_context *ctx)
200 return this_cpu_ptr(ctx->pmu->pmu_cpu_context); 202 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
201} 203}
202 204
205static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
206 struct perf_event_context *ctx)
207{
208 raw_spin_lock(&cpuctx->ctx.lock);
209 if (ctx)
210 raw_spin_lock(&ctx->lock);
211}
212
213static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
214 struct perf_event_context *ctx)
215{
216 if (ctx)
217 raw_spin_unlock(&ctx->lock);
218 raw_spin_unlock(&cpuctx->ctx.lock);
219}
220
203#ifdef CONFIG_CGROUP_PERF 221#ifdef CONFIG_CGROUP_PERF
204 222
205/* 223/*
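The new perf_ctx_lock()/perf_ctx_unlock() pair fixes the nesting order: the per-CPU context lock is always taken before the optional task context lock and released in reverse. Below is a minimal userspace model of that ordering discipline; the struct and helper names are made up for illustration, not taken from the kernel.

#include <pthread.h>
#include <stddef.h>

struct cpu_ctx  { pthread_mutex_t lock; };
struct task_ctx { pthread_mutex_t lock; };

/* Always take the CPU-level lock first, then the optional task-level lock. */
static void ctx_lock_pair(struct cpu_ctx *cpu, struct task_ctx *task)
{
        pthread_mutex_lock(&cpu->lock);
        if (task)
                pthread_mutex_lock(&task->lock);
}

/* Release in the reverse order so the nesting can never invert. */
static void ctx_unlock_pair(struct cpu_ctx *cpu, struct task_ctx *task)
{
        if (task)
                pthread_mutex_unlock(&task->lock);
        pthread_mutex_unlock(&cpu->lock);
}

int main(void)
{
        struct cpu_ctx cpu   = { PTHREAD_MUTEX_INITIALIZER };
        struct task_ctx task = { PTHREAD_MUTEX_INITIALIZER };

        ctx_lock_pair(&cpu, &task);
        ctx_unlock_pair(&cpu, &task);
        ctx_lock_pair(&cpu, NULL);      /* the task context may be absent */
        ctx_unlock_pair(&cpu, NULL);
        return 0;
}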
@@ -340,11 +358,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
340 rcu_read_lock(); 358 rcu_read_lock();
341 359
342 list_for_each_entry_rcu(pmu, &pmus, entry) { 360 list_for_each_entry_rcu(pmu, &pmus, entry) {
343
344 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 361 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
345 362
346 perf_pmu_disable(cpuctx->ctx.pmu);
347
348 /* 363 /*
349 * perf_cgroup_events says at least one 364 * perf_cgroup_events says at least one
350 * context on this CPU has cgroup events. 365 * context on this CPU has cgroup events.
@@ -353,6 +368,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
353 * events for a context. 368 * events for a context.
354 */ 369 */
355 if (cpuctx->ctx.nr_cgroups > 0) { 370 if (cpuctx->ctx.nr_cgroups > 0) {
371 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
372 perf_pmu_disable(cpuctx->ctx.pmu);
356 373
357 if (mode & PERF_CGROUP_SWOUT) { 374 if (mode & PERF_CGROUP_SWOUT) {
358 cpu_ctx_sched_out(cpuctx, EVENT_ALL); 375 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
@@ -372,9 +389,9 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
372 cpuctx->cgrp = perf_cgroup_from_task(task); 389 cpuctx->cgrp = perf_cgroup_from_task(task);
373 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); 390 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
374 } 391 }
392 perf_pmu_enable(cpuctx->ctx.pmu);
393 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
375 } 394 }
376
377 perf_pmu_enable(cpuctx->ctx.pmu);
378 } 395 }
379 396
380 rcu_read_unlock(); 397 rcu_read_unlock();
@@ -382,14 +399,54 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
382 local_irq_restore(flags); 399 local_irq_restore(flags);
383} 400}
384 401
385static inline void perf_cgroup_sched_out(struct task_struct *task) 402static inline void perf_cgroup_sched_out(struct task_struct *task,
403 struct task_struct *next)
386{ 404{
387 perf_cgroup_switch(task, PERF_CGROUP_SWOUT); 405 struct perf_cgroup *cgrp1;
406 struct perf_cgroup *cgrp2 = NULL;
407
408 /*
409 * we come here when we know perf_cgroup_events > 0
410 */
411 cgrp1 = perf_cgroup_from_task(task);
412
413 /*
414 * next is NULL when called from perf_event_enable_on_exec()
415 * that will systematically cause a cgroup_switch()
416 */
417 if (next)
418 cgrp2 = perf_cgroup_from_task(next);
419
420 /*
421 * only schedule out current cgroup events if we know
422 * that we are switching to a different cgroup. Otherwise,
 423 * do not touch the cgroup events.
424 */
425 if (cgrp1 != cgrp2)
426 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
388} 427}
389 428
390static inline void perf_cgroup_sched_in(struct task_struct *task) 429static inline void perf_cgroup_sched_in(struct task_struct *prev,
430 struct task_struct *task)
391{ 431{
392 perf_cgroup_switch(task, PERF_CGROUP_SWIN); 432 struct perf_cgroup *cgrp1;
433 struct perf_cgroup *cgrp2 = NULL;
434
435 /*
436 * we come here when we know perf_cgroup_events > 0
437 */
438 cgrp1 = perf_cgroup_from_task(task);
439
440 /* prev can never be NULL */
441 cgrp2 = perf_cgroup_from_task(prev);
442
443 /*
444 * only need to schedule in cgroup events if we are changing
445 * cgroup during ctxsw. Cgroup events were not scheduled
 446 * out during the ctxsw-out if that was not the case.
447 */
448 if (cgrp1 != cgrp2)
449 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
393} 450}
394 451
395static inline int perf_cgroup_connect(int fd, struct perf_event *event, 452static inline int perf_cgroup_connect(int fd, struct perf_event *event,
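The reworked sched-out/sched-in hooks above only pay for a full perf_cgroup_switch() when the outgoing and incoming tasks actually belong to different perf cgroups. A hedged sketch of that comparison, with a stand-in for perf_cgroup_from_task():

#include <stdbool.h>
#include <stdio.h>

struct cgroup { int id; };
struct task   { struct cgroup *cgrp; };

/* Stand-in for perf_cgroup_from_task(); next may be NULL (enable-on-exec path). */
static struct cgroup *cgroup_of(struct task *t)
{
        return t ? t->cgrp : NULL;
}

/* Only trigger the expensive switch when the cgroup really changes. */
static bool need_cgroup_switch(struct task *prev, struct task *next)
{
        return cgroup_of(prev) != cgroup_of(next);
}

int main(void)
{
        struct cgroup a = { 1 }, b = { 2 };
        struct task t1 = { &a }, t2 = { &a }, t3 = { &b };

        printf("%d %d\n", need_cgroup_switch(&t1, &t2),  /* 0: same cgroup, skip */
                          need_cgroup_switch(&t1, &t3)); /* 1: different, switch */
        return 0;
}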
@@ -501,11 +558,13 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
501{ 558{
502} 559}
503 560
504static inline void perf_cgroup_sched_out(struct task_struct *task) 561static inline void perf_cgroup_sched_out(struct task_struct *task,
562 struct task_struct *next)
505{ 563{
506} 564}
507 565
508static inline void perf_cgroup_sched_in(struct task_struct *task) 566static inline void perf_cgroup_sched_in(struct task_struct *prev,
567 struct task_struct *task)
509{ 568{
510} 569}
511 570
@@ -731,6 +790,7 @@ static u64 perf_event_time(struct perf_event *event)
731 790
732/* 791/*
733 * Update the total_time_enabled and total_time_running fields for a event. 792 * Update the total_time_enabled and total_time_running fields for a event.
793 * The caller of this function needs to hold the ctx->lock.
734 */ 794 */
735static void update_event_times(struct perf_event *event) 795static void update_event_times(struct perf_event *event)
736{ 796{
@@ -1105,6 +1165,10 @@ static int __perf_remove_from_context(void *info)
1105 raw_spin_lock(&ctx->lock); 1165 raw_spin_lock(&ctx->lock);
1106 event_sched_out(event, cpuctx, ctx); 1166 event_sched_out(event, cpuctx, ctx);
1107 list_del_event(event, ctx); 1167 list_del_event(event, ctx);
1168 if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1169 ctx->is_active = 0;
1170 cpuctx->task_ctx = NULL;
1171 }
1108 raw_spin_unlock(&ctx->lock); 1172 raw_spin_unlock(&ctx->lock);
1109 1173
1110 return 0; 1174 return 0;
@@ -1454,8 +1518,24 @@ static void add_event_to_ctx(struct perf_event *event,
1454 event->tstamp_stopped = tstamp; 1518 event->tstamp_stopped = tstamp;
1455} 1519}
1456 1520
1457static void perf_event_context_sched_in(struct perf_event_context *ctx, 1521static void task_ctx_sched_out(struct perf_event_context *ctx);
1458 struct task_struct *tsk); 1522static void
1523ctx_sched_in(struct perf_event_context *ctx,
1524 struct perf_cpu_context *cpuctx,
1525 enum event_type_t event_type,
1526 struct task_struct *task);
1527
1528static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
1529 struct perf_event_context *ctx,
1530 struct task_struct *task)
1531{
1532 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
1533 if (ctx)
1534 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1535 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1536 if (ctx)
1537 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1538}
1459 1539
1460/* 1540/*
1461 * Cross CPU call to install and enable a performance event 1541 * Cross CPU call to install and enable a performance event
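The new perf_event_sched_in() helper encodes the scheduling priority in one place: CPU pinned groups, then task pinned, then CPU flexible, then task flexible. The sketch below only models that ordering; the event classes and callback are illustrative, not the kernel API.

#include <stdio.h>

enum evt_class { EVT_PINNED, EVT_FLEXIBLE };

static void class_sched_in(const char *ctx_name, enum evt_class c)
{
        printf("sched in %s %s\n", ctx_name,
               c == EVT_PINNED ? "pinned" : "flexible");
}

/* Pinned groups of both contexts go first so they get the best chance of
 * claiming counters; flexible groups only fill what is left over. */
static void sched_in_both(int have_task_ctx)
{
        class_sched_in("cpu", EVT_PINNED);
        if (have_task_ctx)
                class_sched_in("task", EVT_PINNED);
        class_sched_in("cpu", EVT_FLEXIBLE);
        if (have_task_ctx)
                class_sched_in("task", EVT_FLEXIBLE);
}

int main(void)
{
        sched_in_both(1);
        return 0;
}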
@@ -1466,20 +1546,37 @@ static int __perf_install_in_context(void *info)
1466{ 1546{
1467 struct perf_event *event = info; 1547 struct perf_event *event = info;
1468 struct perf_event_context *ctx = event->ctx; 1548 struct perf_event_context *ctx = event->ctx;
1469 struct perf_event *leader = event->group_leader;
1470 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1549 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1471 int err; 1550 struct perf_event_context *task_ctx = cpuctx->task_ctx;
1551 struct task_struct *task = current;
1552
1553 perf_ctx_lock(cpuctx, task_ctx);
1554 perf_pmu_disable(cpuctx->ctx.pmu);
1472 1555
1473 /* 1556 /*
1474 * In case we're installing a new context to an already running task, 1557 * If there was an active task_ctx schedule it out.
1475 * could also happen before perf_event_task_sched_in() on architectures
1476 * which do context switches with IRQs enabled.
1477 */ 1558 */
1478 if (ctx->task && !cpuctx->task_ctx) 1559 if (task_ctx)
1479 perf_event_context_sched_in(ctx, ctx->task); 1560 task_ctx_sched_out(task_ctx);
1561
1562 /*
1563 * If the context we're installing events in is not the
1564 * active task_ctx, flip them.
1565 */
1566 if (ctx->task && task_ctx != ctx) {
1567 if (task_ctx)
1568 raw_spin_unlock(&task_ctx->lock);
1569 raw_spin_lock(&ctx->lock);
1570 task_ctx = ctx;
1571 }
1572
1573 if (task_ctx) {
1574 cpuctx->task_ctx = task_ctx;
1575 task = task_ctx->task;
1576 }
1577
1578 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
1480 1579
1481 raw_spin_lock(&ctx->lock);
1482 ctx->is_active = 1;
1483 update_context_time(ctx); 1580 update_context_time(ctx);
1484 /* 1581 /*
1485 * update cgrp time only if current cgrp 1582 * update cgrp time only if current cgrp
@@ -1490,43 +1587,13 @@ static int __perf_install_in_context(void *info)
1490 1587
1491 add_event_to_ctx(event, ctx); 1588 add_event_to_ctx(event, ctx);
1492 1589
1493 if (!event_filter_match(event))
1494 goto unlock;
1495
1496 /* 1590 /*
1497 * Don't put the event on if it is disabled or if 1591 * Schedule everything back in
1498 * it is in a group and the group isn't on.
1499 */ 1592 */
1500 if (event->state != PERF_EVENT_STATE_INACTIVE || 1593 perf_event_sched_in(cpuctx, task_ctx, task);
1501 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
1502 goto unlock;
1503 1594
1504 /* 1595 perf_pmu_enable(cpuctx->ctx.pmu);
1505 * An exclusive event can't go on if there are already active 1596 perf_ctx_unlock(cpuctx, task_ctx);
1506 * hardware events, and no hardware event can go on if there
1507 * is already an exclusive event on.
1508 */
1509 if (!group_can_go_on(event, cpuctx, 1))
1510 err = -EEXIST;
1511 else
1512 err = event_sched_in(event, cpuctx, ctx);
1513
1514 if (err) {
1515 /*
1516 * This event couldn't go on. If it is in a group
1517 * then we have to pull the whole group off.
1518 * If the event group is pinned then put it in error state.
1519 */
1520 if (leader != event)
1521 group_sched_out(leader, cpuctx, ctx);
1522 if (leader->attr.pinned) {
1523 update_group_times(leader);
1524 leader->state = PERF_EVENT_STATE_ERROR;
1525 }
1526 }
1527
1528unlock:
1529 raw_spin_unlock(&ctx->lock);
1530 1597
1531 return 0; 1598 return 0;
1532} 1599}
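Rather than trying to squeeze the new event into a live context, __perf_install_in_context() now quiesces everything, adds the event, and lets the normal sched-in path decide what fits. A userspace caricature of that quiesce/mutate/resume pattern, under obviously simplified assumptions:

#include <stdio.h>

#define MAX_EVENTS 8

struct ctx {
        int nr_events;
        int events[MAX_EVENTS];
        int active;             /* 1 while the context is scheduled in */
};

static void ctx_sched_out(struct ctx *c) { c->active = 0; }
static void ctx_sched_in(struct ctx *c)  { c->active = 1; }

/* Mutate only while the context is quiesced, then resume it as a whole. */
static void install_event(struct ctx *c, int ev)
{
        ctx_sched_out(c);
        if (c->nr_events < MAX_EVENTS)
                c->events[c->nr_events++] = ev;
        ctx_sched_in(c);
}

int main(void)
{
        struct ctx c = { 0 };

        install_event(&c, 42);
        printf("events=%d active=%d\n", c.nr_events, c.active);
        return 0;
}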
@@ -1739,7 +1806,7 @@ out:
1739 raw_spin_unlock_irq(&ctx->lock); 1806 raw_spin_unlock_irq(&ctx->lock);
1740} 1807}
1741 1808
1742static int perf_event_refresh(struct perf_event *event, int refresh) 1809int perf_event_refresh(struct perf_event *event, int refresh)
1743{ 1810{
1744 /* 1811 /*
1745 * not supported on inherited events 1812 * not supported on inherited events
@@ -1752,36 +1819,35 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1752 1819
1753 return 0; 1820 return 0;
1754} 1821}
1822EXPORT_SYMBOL_GPL(perf_event_refresh);
1755 1823
1756static void ctx_sched_out(struct perf_event_context *ctx, 1824static void ctx_sched_out(struct perf_event_context *ctx,
1757 struct perf_cpu_context *cpuctx, 1825 struct perf_cpu_context *cpuctx,
1758 enum event_type_t event_type) 1826 enum event_type_t event_type)
1759{ 1827{
1760 struct perf_event *event; 1828 struct perf_event *event;
1829 int is_active = ctx->is_active;
1761 1830
1762 raw_spin_lock(&ctx->lock); 1831 ctx->is_active &= ~event_type;
1763 perf_pmu_disable(ctx->pmu);
1764 ctx->is_active = 0;
1765 if (likely(!ctx->nr_events)) 1832 if (likely(!ctx->nr_events))
1766 goto out; 1833 return;
1834
1767 update_context_time(ctx); 1835 update_context_time(ctx);
1768 update_cgrp_time_from_cpuctx(cpuctx); 1836 update_cgrp_time_from_cpuctx(cpuctx);
1769
1770 if (!ctx->nr_active) 1837 if (!ctx->nr_active)
1771 goto out; 1838 return;
1772 1839
1773 if (event_type & EVENT_PINNED) { 1840 perf_pmu_disable(ctx->pmu);
1841 if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
1774 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 1842 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1775 group_sched_out(event, cpuctx, ctx); 1843 group_sched_out(event, cpuctx, ctx);
1776 } 1844 }
1777 1845
1778 if (event_type & EVENT_FLEXIBLE) { 1846 if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
1779 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 1847 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1780 group_sched_out(event, cpuctx, ctx); 1848 group_sched_out(event, cpuctx, ctx);
1781 } 1849 }
1782out:
1783 perf_pmu_enable(ctx->pmu); 1850 perf_pmu_enable(ctx->pmu);
1784 raw_spin_unlock(&ctx->lock);
1785} 1851}
1786 1852
1787/* 1853/*
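ctx->is_active is now a bitmask of event classes instead of a boolean, so pinned and flexible groups can be scheduled out (and, in the matching ctx_sched_in() change further down, scheduled in) independently and at most once. A standalone model of that bookkeeping, with illustrative flag values:

#include <stdio.h>

#define EVENT_PINNED   0x1
#define EVENT_FLEXIBLE 0x2
#define EVENT_ALL      (EVENT_PINNED | EVENT_FLEXIBLE)

static int is_active;   /* which event classes are currently scheduled in */

static void sched_out(int event_type)
{
        int was_active = is_active;

        is_active &= ~event_type;
        if ((was_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
                printf("sched out pinned\n");
        if ((was_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
                printf("sched out flexible\n");
}

static void sched_in(int event_type)
{
        int was_active = is_active;

        is_active |= event_type;
        if (!(was_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
                printf("sched in pinned\n");
        if (!(was_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
                printf("sched in flexible\n");
}

int main(void)
{
        sched_in(EVENT_ALL);
        sched_out(EVENT_FLEXIBLE);      /* pinned stays in */
        sched_in(EVENT_ALL);            /* only flexible comes back */
        return 0;
}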
@@ -1929,8 +1995,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1929 rcu_read_unlock(); 1995 rcu_read_unlock();
1930 1996
1931 if (do_switch) { 1997 if (do_switch) {
1998 raw_spin_lock(&ctx->lock);
1932 ctx_sched_out(ctx, cpuctx, EVENT_ALL); 1999 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1933 cpuctx->task_ctx = NULL; 2000 cpuctx->task_ctx = NULL;
2001 raw_spin_unlock(&ctx->lock);
1934 } 2002 }
1935} 2003}
1936 2004
@@ -1962,11 +2030,10 @@ void __perf_event_task_sched_out(struct task_struct *task,
1962 * cgroup event are system-wide mode only 2030 * cgroup event are system-wide mode only
1963 */ 2031 */
1964 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2032 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1965 perf_cgroup_sched_out(task); 2033 perf_cgroup_sched_out(task, next);
1966} 2034}
1967 2035
1968static void task_ctx_sched_out(struct perf_event_context *ctx, 2036static void task_ctx_sched_out(struct perf_event_context *ctx)
1969 enum event_type_t event_type)
1970{ 2037{
1971 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 2038 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1972 2039
@@ -1976,7 +2043,7 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
1976 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 2043 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1977 return; 2044 return;
1978 2045
1979 ctx_sched_out(ctx, cpuctx, event_type); 2046 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1980 cpuctx->task_ctx = NULL; 2047 cpuctx->task_ctx = NULL;
1981} 2048}
1982 2049
@@ -2055,11 +2122,11 @@ ctx_sched_in(struct perf_event_context *ctx,
2055 struct task_struct *task) 2122 struct task_struct *task)
2056{ 2123{
2057 u64 now; 2124 u64 now;
2125 int is_active = ctx->is_active;
2058 2126
2059 raw_spin_lock(&ctx->lock); 2127 ctx->is_active |= event_type;
2060 ctx->is_active = 1;
2061 if (likely(!ctx->nr_events)) 2128 if (likely(!ctx->nr_events))
2062 goto out; 2129 return;
2063 2130
2064 now = perf_clock(); 2131 now = perf_clock();
2065 ctx->timestamp = now; 2132 ctx->timestamp = now;
@@ -2068,15 +2135,12 @@ ctx_sched_in(struct perf_event_context *ctx,
2068 * First go through the list and put on any pinned groups 2135 * First go through the list and put on any pinned groups
2069 * in order to give them the best chance of going on. 2136 * in order to give them the best chance of going on.
2070 */ 2137 */
2071 if (event_type & EVENT_PINNED) 2138 if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2072 ctx_pinned_sched_in(ctx, cpuctx); 2139 ctx_pinned_sched_in(ctx, cpuctx);
2073 2140
2074 /* Then walk through the lower prio flexible groups */ 2141 /* Then walk through the lower prio flexible groups */
2075 if (event_type & EVENT_FLEXIBLE) 2142 if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2076 ctx_flexible_sched_in(ctx, cpuctx); 2143 ctx_flexible_sched_in(ctx, cpuctx);
2077
2078out:
2079 raw_spin_unlock(&ctx->lock);
2080} 2144}
2081 2145
2082static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2146static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
@@ -2088,19 +2152,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2088 ctx_sched_in(ctx, cpuctx, event_type, task); 2152 ctx_sched_in(ctx, cpuctx, event_type, task);
2089} 2153}
2090 2154
2091static void task_ctx_sched_in(struct perf_event_context *ctx,
2092 enum event_type_t event_type)
2093{
2094 struct perf_cpu_context *cpuctx;
2095
2096 cpuctx = __get_cpu_context(ctx);
2097 if (cpuctx->task_ctx == ctx)
2098 return;
2099
2100 ctx_sched_in(ctx, cpuctx, event_type, NULL);
2101 cpuctx->task_ctx = ctx;
2102}
2103
2104static void perf_event_context_sched_in(struct perf_event_context *ctx, 2155static void perf_event_context_sched_in(struct perf_event_context *ctx,
2105 struct task_struct *task) 2156 struct task_struct *task)
2106{ 2157{
@@ -2110,6 +2161,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2110 if (cpuctx->task_ctx == ctx) 2161 if (cpuctx->task_ctx == ctx)
2111 return; 2162 return;
2112 2163
2164 perf_ctx_lock(cpuctx, ctx);
2113 perf_pmu_disable(ctx->pmu); 2165 perf_pmu_disable(ctx->pmu);
2114 /* 2166 /*
2115 * We want to keep the following priority order: 2167 * We want to keep the following priority order:
@@ -2118,18 +2170,18 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2118 */ 2170 */
2119 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2171 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2120 2172
2121 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); 2173 perf_event_sched_in(cpuctx, ctx, task);
2122 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2123 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2124 2174
2125 cpuctx->task_ctx = ctx; 2175 cpuctx->task_ctx = ctx;
2126 2176
2177 perf_pmu_enable(ctx->pmu);
2178 perf_ctx_unlock(cpuctx, ctx);
2179
2127 /* 2180 /*
2128 * Since these rotations are per-cpu, we need to ensure the 2181 * Since these rotations are per-cpu, we need to ensure the
2129 * cpu-context we got scheduled on is actually rotating. 2182 * cpu-context we got scheduled on is actually rotating.
2130 */ 2183 */
2131 perf_pmu_rotate_start(ctx->pmu); 2184 perf_pmu_rotate_start(ctx->pmu);
2132 perf_pmu_enable(ctx->pmu);
2133} 2185}
2134 2186
2135/* 2187/*
@@ -2143,7 +2195,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2143 * accessing the event control register. If a NMI hits, then it will 2195 * accessing the event control register. If a NMI hits, then it will
2144 * keep the event running. 2196 * keep the event running.
2145 */ 2197 */
2146void __perf_event_task_sched_in(struct task_struct *task) 2198void __perf_event_task_sched_in(struct task_struct *prev,
2199 struct task_struct *task)
2147{ 2200{
2148 struct perf_event_context *ctx; 2201 struct perf_event_context *ctx;
2149 int ctxn; 2202 int ctxn;
@@ -2161,7 +2214,7 @@ void __perf_event_task_sched_in(struct task_struct *task)
2161 * cgroup event are system-wide mode only 2214 * cgroup event are system-wide mode only
2162 */ 2215 */
2163 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2216 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2164 perf_cgroup_sched_in(task); 2217 perf_cgroup_sched_in(prev, task);
2165} 2218}
2166 2219
2167static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2220static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2269,7 +2322,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2269 u64 interrupts, now; 2322 u64 interrupts, now;
2270 s64 delta; 2323 s64 delta;
2271 2324
2272 raw_spin_lock(&ctx->lock);
2273 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 2325 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2274 if (event->state != PERF_EVENT_STATE_ACTIVE) 2326 if (event->state != PERF_EVENT_STATE_ACTIVE)
2275 continue; 2327 continue;
@@ -2301,7 +2353,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2301 if (delta > 0) 2353 if (delta > 0)
2302 perf_adjust_period(event, period, delta); 2354 perf_adjust_period(event, period, delta);
2303 } 2355 }
2304 raw_spin_unlock(&ctx->lock);
2305} 2356}
2306 2357
2307/* 2358/*
@@ -2309,16 +2360,12 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2309 */ 2360 */
2310static void rotate_ctx(struct perf_event_context *ctx) 2361static void rotate_ctx(struct perf_event_context *ctx)
2311{ 2362{
2312 raw_spin_lock(&ctx->lock);
2313
2314 /* 2363 /*
2315 * Rotate the first entry last of non-pinned groups. Rotation might be 2364 * Rotate the first entry last of non-pinned groups. Rotation might be
2316 * disabled by the inheritance code. 2365 * disabled by the inheritance code.
2317 */ 2366 */
2318 if (!ctx->rotate_disable) 2367 if (!ctx->rotate_disable)
2319 list_rotate_left(&ctx->flexible_groups); 2368 list_rotate_left(&ctx->flexible_groups);
2320
2321 raw_spin_unlock(&ctx->lock);
2322} 2369}
2323 2370
2324/* 2371/*
@@ -2345,6 +2392,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2345 rotate = 1; 2392 rotate = 1;
2346 } 2393 }
2347 2394
2395 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2348 perf_pmu_disable(cpuctx->ctx.pmu); 2396 perf_pmu_disable(cpuctx->ctx.pmu);
2349 perf_ctx_adjust_freq(&cpuctx->ctx, interval); 2397 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2350 if (ctx) 2398 if (ctx)
@@ -2355,21 +2403,20 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2355 2403
2356 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2404 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2357 if (ctx) 2405 if (ctx)
2358 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 2406 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2359 2407
2360 rotate_ctx(&cpuctx->ctx); 2408 rotate_ctx(&cpuctx->ctx);
2361 if (ctx) 2409 if (ctx)
2362 rotate_ctx(ctx); 2410 rotate_ctx(ctx);
2363 2411
2364 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); 2412 perf_event_sched_in(cpuctx, ctx, current);
2365 if (ctx)
2366 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
2367 2413
2368done: 2414done:
2369 if (remove) 2415 if (remove)
2370 list_del_init(&cpuctx->rotation_list); 2416 list_del_init(&cpuctx->rotation_list);
2371 2417
2372 perf_pmu_enable(cpuctx->ctx.pmu); 2418 perf_pmu_enable(cpuctx->ctx.pmu);
2419 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2373} 2420}
2374 2421
2375void perf_event_task_tick(void) 2422void perf_event_task_tick(void)
@@ -2423,10 +2470,10 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2423 * ctxswin cgroup events which are already scheduled 2470 * ctxswin cgroup events which are already scheduled
2424 * in. 2471 * in.
2425 */ 2472 */
2426 perf_cgroup_sched_out(current); 2473 perf_cgroup_sched_out(current, NULL);
2427 task_ctx_sched_out(ctx, EVENT_ALL);
2428 2474
2429 raw_spin_lock(&ctx->lock); 2475 raw_spin_lock(&ctx->lock);
2476 task_ctx_sched_out(ctx);
2430 2477
2431 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 2478 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2432 ret = event_enable_on_exec(event, ctx); 2479 ret = event_enable_on_exec(event, ctx);
@@ -2835,16 +2882,12 @@ retry:
2835 unclone_ctx(ctx); 2882 unclone_ctx(ctx);
2836 ++ctx->pin_count; 2883 ++ctx->pin_count;
2837 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2884 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2838 } 2885 } else {
2839
2840 if (!ctx) {
2841 ctx = alloc_perf_context(pmu, task); 2886 ctx = alloc_perf_context(pmu, task);
2842 err = -ENOMEM; 2887 err = -ENOMEM;
2843 if (!ctx) 2888 if (!ctx)
2844 goto errout; 2889 goto errout;
2845 2890
2846 get_ctx(ctx);
2847
2848 err = 0; 2891 err = 0;
2849 mutex_lock(&task->perf_event_mutex); 2892 mutex_lock(&task->perf_event_mutex);
2850 /* 2893 /*
@@ -2856,14 +2899,14 @@ retry:
2856 else if (task->perf_event_ctxp[ctxn]) 2899 else if (task->perf_event_ctxp[ctxn])
2857 err = -EAGAIN; 2900 err = -EAGAIN;
2858 else { 2901 else {
2902 get_ctx(ctx);
2859 ++ctx->pin_count; 2903 ++ctx->pin_count;
2860 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 2904 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2861 } 2905 }
2862 mutex_unlock(&task->perf_event_mutex); 2906 mutex_unlock(&task->perf_event_mutex);
2863 2907
2864 if (unlikely(err)) { 2908 if (unlikely(err)) {
2865 put_task_struct(task); 2909 put_ctx(ctx);
2866 kfree(ctx);
2867 2910
2868 if (err == -EAGAIN) 2911 if (err == -EAGAIN)
2869 goto retry; 2912 goto retry;
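find_get_context() now takes its extra context reference only once installation under perf_event_mutex has succeeded, and uses put_ctx() rather than a bare kfree() on failure so the refcounting stays balanced. A much-simplified refcount sketch of that allocate/install-or-release flow (the helpers are hypothetical, not the kernel ones):

#include <stdio.h>
#include <stdlib.h>

struct ctx { int refcount; };

static struct ctx *ctx_alloc(void)
{
        struct ctx *c = calloc(1, sizeof(*c));

        if (c)
                c->refcount = 1;        /* the allocation holds the first ref */
        return c;
}

static void ctx_get(struct ctx *c) { c->refcount++; }

static void ctx_put(struct ctx *c)
{
        if (--c->refcount == 0)
                free(c);
}

/* slot plays the role of task->perf_event_ctxp[] */
static int install(struct ctx **slot, struct ctx *c)
{
        if (*slot)
                return -1;              /* somebody else raced us in */
        ctx_get(c);                     /* reference now owned by the slot */
        *slot = c;
        return 0;
}

int main(void)
{
        struct ctx *slot = NULL;
        struct ctx *c = ctx_alloc();

        if (!c)
                return 1;
        if (install(&slot, c) == 0)
                printf("installed, refcount=%d\n", c->refcount);    /* 2 */
        ctx_put(c);                     /* drop the allocation ref either way */
        if (slot)
                ctx_put(slot);          /* teardown: drop the slot's ref */
        return 0;
}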
@@ -2890,7 +2933,7 @@ static void free_event_rcu(struct rcu_head *head)
2890 kfree(event); 2933 kfree(event);
2891} 2934}
2892 2935
2893static void perf_buffer_put(struct perf_buffer *buffer); 2936static void ring_buffer_put(struct ring_buffer *rb);
2894 2937
2895static void free_event(struct perf_event *event) 2938static void free_event(struct perf_event *event)
2896{ 2939{
@@ -2913,9 +2956,9 @@ static void free_event(struct perf_event *event)
2913 } 2956 }
2914 } 2957 }
2915 2958
2916 if (event->buffer) { 2959 if (event->rb) {
2917 perf_buffer_put(event->buffer); 2960 ring_buffer_put(event->rb);
2918 event->buffer = NULL; 2961 event->rb = NULL;
2919 } 2962 }
2920 2963
2921 if (is_cgroup_event(event)) 2964 if (is_cgroup_event(event))
@@ -2934,12 +2977,6 @@ int perf_event_release_kernel(struct perf_event *event)
2934{ 2977{
2935 struct perf_event_context *ctx = event->ctx; 2978 struct perf_event_context *ctx = event->ctx;
2936 2979
2937 /*
2938 * Remove from the PMU, can't get re-enabled since we got
2939 * here because the last ref went.
2940 */
2941 perf_event_disable(event);
2942
2943 WARN_ON_ONCE(ctx->parent_ctx); 2980 WARN_ON_ONCE(ctx->parent_ctx);
2944 /* 2981 /*
2945 * There are two ways this annotation is useful: 2982 * There are two ways this annotation is useful:
@@ -2956,8 +2993,8 @@ int perf_event_release_kernel(struct perf_event *event)
2956 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); 2993 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
2957 raw_spin_lock_irq(&ctx->lock); 2994 raw_spin_lock_irq(&ctx->lock);
2958 perf_group_detach(event); 2995 perf_group_detach(event);
2959 list_del_event(event, ctx);
2960 raw_spin_unlock_irq(&ctx->lock); 2996 raw_spin_unlock_irq(&ctx->lock);
2997 perf_remove_from_context(event);
2961 mutex_unlock(&ctx->mutex); 2998 mutex_unlock(&ctx->mutex);
2962 2999
2963 free_event(event); 3000 free_event(event);
@@ -3149,13 +3186,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3149static unsigned int perf_poll(struct file *file, poll_table *wait) 3186static unsigned int perf_poll(struct file *file, poll_table *wait)
3150{ 3187{
3151 struct perf_event *event = file->private_data; 3188 struct perf_event *event = file->private_data;
3152 struct perf_buffer *buffer; 3189 struct ring_buffer *rb;
3153 unsigned int events = POLL_HUP; 3190 unsigned int events = POLL_HUP;
3154 3191
3155 rcu_read_lock(); 3192 rcu_read_lock();
3156 buffer = rcu_dereference(event->buffer); 3193 rb = rcu_dereference(event->rb);
3157 if (buffer) 3194 if (rb)
3158 events = atomic_xchg(&buffer->poll, 0); 3195 events = atomic_xchg(&rb->poll, 0);
3159 rcu_read_unlock(); 3196 rcu_read_unlock();
3160 3197
3161 poll_wait(file, &event->waitq, wait); 3198 poll_wait(file, &event->waitq, wait);
@@ -3358,6 +3395,18 @@ static int perf_event_index(struct perf_event *event)
3358 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; 3395 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
3359} 3396}
3360 3397
3398static void calc_timer_values(struct perf_event *event,
3399 u64 *enabled,
3400 u64 *running)
3401{
3402 u64 now, ctx_time;
3403
3404 now = perf_clock();
3405 ctx_time = event->shadow_ctx_time + now;
3406 *enabled = ctx_time - event->tstamp_enabled;
3407 *running = ctx_time - event->tstamp_running;
3408}
3409
3361/* 3410/*
3362 * Callers need to ensure there can be no nesting of this function, otherwise 3411 * Callers need to ensure there can be no nesting of this function, otherwise
3363 * the seqlock logic goes bad. We can not serialize this because the arch 3412 * the seqlock logic goes bad. We can not serialize this because the arch
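calc_timer_values() derives the enabled/running times from the shadow_ctx_time snapshot plus the current clock, so it is safe in NMI context where ctx->lock cannot be taken. A minimal arithmetic model, with a faked clock in place of perf_clock():

#include <stdio.h>
#include <stdint.h>

struct event {
        uint64_t shadow_ctx_time;       /* ctx time base captured at sched-in */
        uint64_t tstamp_enabled;        /* ctx time when the event was enabled */
        uint64_t tstamp_running;        /* ctx time when the event last started */
};

static uint64_t fake_clock = 1000;      /* stand-in for perf_clock() */

/* Lock-free: only reads snapshot fields plus the clock. */
static void calc_timer_values(const struct event *e,
                              uint64_t *enabled, uint64_t *running)
{
        uint64_t ctx_time = e->shadow_ctx_time + fake_clock;

        *enabled = ctx_time - e->tstamp_enabled;
        *running = ctx_time - e->tstamp_running;
}

int main(void)
{
        struct event e = { .shadow_ctx_time = 50,
                           .tstamp_enabled = 200, .tstamp_running = 900 };
        uint64_t en, run;

        calc_timer_values(&e, &en, &run);
        printf("enabled=%llu running=%llu\n",
               (unsigned long long)en, (unsigned long long)run);
        return 0;
}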
@@ -3366,14 +3415,25 @@ static int perf_event_index(struct perf_event *event)
3366void perf_event_update_userpage(struct perf_event *event) 3415void perf_event_update_userpage(struct perf_event *event)
3367{ 3416{
3368 struct perf_event_mmap_page *userpg; 3417 struct perf_event_mmap_page *userpg;
3369 struct perf_buffer *buffer; 3418 struct ring_buffer *rb;
3419 u64 enabled, running;
3370 3420
3371 rcu_read_lock(); 3421 rcu_read_lock();
3372 buffer = rcu_dereference(event->buffer); 3422 /*
3373 if (!buffer) 3423 * compute total_time_enabled, total_time_running
3424 * based on snapshot values taken when the event
3425 * was last scheduled in.
3426 *
 3427 * we cannot simply call update_context_time()
 3428 * because of locking issues, as we can be called in
3429 * NMI context
3430 */
3431 calc_timer_values(event, &enabled, &running);
3432 rb = rcu_dereference(event->rb);
3433 if (!rb)
3374 goto unlock; 3434 goto unlock;
3375 3435
3376 userpg = buffer->user_page; 3436 userpg = rb->user_page;
3377 3437
3378 /* 3438 /*
3379 * Disable preemption so as to not let the corresponding user-space 3439 * Disable preemption so as to not let the corresponding user-space
@@ -3387,10 +3447,10 @@ void perf_event_update_userpage(struct perf_event *event)
3387 if (event->state == PERF_EVENT_STATE_ACTIVE) 3447 if (event->state == PERF_EVENT_STATE_ACTIVE)
3388 userpg->offset -= local64_read(&event->hw.prev_count); 3448 userpg->offset -= local64_read(&event->hw.prev_count);
3389 3449
3390 userpg->time_enabled = event->total_time_enabled + 3450 userpg->time_enabled = enabled +
3391 atomic64_read(&event->child_total_time_enabled); 3451 atomic64_read(&event->child_total_time_enabled);
3392 3452
3393 userpg->time_running = event->total_time_running + 3453 userpg->time_running = running +
3394 atomic64_read(&event->child_total_time_running); 3454 atomic64_read(&event->child_total_time_running);
3395 3455
3396 barrier(); 3456 barrier();
@@ -3400,220 +3460,10 @@ unlock:
3400 rcu_read_unlock(); 3460 rcu_read_unlock();
3401} 3461}
3402 3462
3403static unsigned long perf_data_size(struct perf_buffer *buffer);
3404
3405static void
3406perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
3407{
3408 long max_size = perf_data_size(buffer);
3409
3410 if (watermark)
3411 buffer->watermark = min(max_size, watermark);
3412
3413 if (!buffer->watermark)
3414 buffer->watermark = max_size / 2;
3415
3416 if (flags & PERF_BUFFER_WRITABLE)
3417 buffer->writable = 1;
3418
3419 atomic_set(&buffer->refcount, 1);
3420}
3421
3422#ifndef CONFIG_PERF_USE_VMALLOC
3423
3424/*
3425 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
3426 */
3427
3428static struct page *
3429perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
3430{
3431 if (pgoff > buffer->nr_pages)
3432 return NULL;
3433
3434 if (pgoff == 0)
3435 return virt_to_page(buffer->user_page);
3436
3437 return virt_to_page(buffer->data_pages[pgoff - 1]);
3438}
3439
3440static void *perf_mmap_alloc_page(int cpu)
3441{
3442 struct page *page;
3443 int node;
3444
3445 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
3446 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
3447 if (!page)
3448 return NULL;
3449
3450 return page_address(page);
3451}
3452
3453static struct perf_buffer *
3454perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
3455{
3456 struct perf_buffer *buffer;
3457 unsigned long size;
3458 int i;
3459
3460 size = sizeof(struct perf_buffer);
3461 size += nr_pages * sizeof(void *);
3462
3463 buffer = kzalloc(size, GFP_KERNEL);
3464 if (!buffer)
3465 goto fail;
3466
3467 buffer->user_page = perf_mmap_alloc_page(cpu);
3468 if (!buffer->user_page)
3469 goto fail_user_page;
3470
3471 for (i = 0; i < nr_pages; i++) {
3472 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
3473 if (!buffer->data_pages[i])
3474 goto fail_data_pages;
3475 }
3476
3477 buffer->nr_pages = nr_pages;
3478
3479 perf_buffer_init(buffer, watermark, flags);
3480
3481 return buffer;
3482
3483fail_data_pages:
3484 for (i--; i >= 0; i--)
3485 free_page((unsigned long)buffer->data_pages[i]);
3486
3487 free_page((unsigned long)buffer->user_page);
3488
3489fail_user_page:
3490 kfree(buffer);
3491
3492fail:
3493 return NULL;
3494}
3495
3496static void perf_mmap_free_page(unsigned long addr)
3497{
3498 struct page *page = virt_to_page((void *)addr);
3499
3500 page->mapping = NULL;
3501 __free_page(page);
3502}
3503
3504static void perf_buffer_free(struct perf_buffer *buffer)
3505{
3506 int i;
3507
3508 perf_mmap_free_page((unsigned long)buffer->user_page);
3509 for (i = 0; i < buffer->nr_pages; i++)
3510 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
3511 kfree(buffer);
3512}
3513
3514static inline int page_order(struct perf_buffer *buffer)
3515{
3516 return 0;
3517}
3518
3519#else
3520
3521/*
3522 * Back perf_mmap() with vmalloc memory.
3523 *
3524 * Required for architectures that have d-cache aliasing issues.
3525 */
3526
3527static inline int page_order(struct perf_buffer *buffer)
3528{
3529 return buffer->page_order;
3530}
3531
3532static struct page *
3533perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
3534{
3535 if (pgoff > (1UL << page_order(buffer)))
3536 return NULL;
3537
3538 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
3539}
3540
3541static void perf_mmap_unmark_page(void *addr)
3542{
3543 struct page *page = vmalloc_to_page(addr);
3544
3545 page->mapping = NULL;
3546}
3547
3548static void perf_buffer_free_work(struct work_struct *work)
3549{
3550 struct perf_buffer *buffer;
3551 void *base;
3552 int i, nr;
3553
3554 buffer = container_of(work, struct perf_buffer, work);
3555 nr = 1 << page_order(buffer);
3556
3557 base = buffer->user_page;
3558 for (i = 0; i < nr + 1; i++)
3559 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
3560
3561 vfree(base);
3562 kfree(buffer);
3563}
3564
3565static void perf_buffer_free(struct perf_buffer *buffer)
3566{
3567 schedule_work(&buffer->work);
3568}
3569
3570static struct perf_buffer *
3571perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
3572{
3573 struct perf_buffer *buffer;
3574 unsigned long size;
3575 void *all_buf;
3576
3577 size = sizeof(struct perf_buffer);
3578 size += sizeof(void *);
3579
3580 buffer = kzalloc(size, GFP_KERNEL);
3581 if (!buffer)
3582 goto fail;
3583
3584 INIT_WORK(&buffer->work, perf_buffer_free_work);
3585
3586 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
3587 if (!all_buf)
3588 goto fail_all_buf;
3589
3590 buffer->user_page = all_buf;
3591 buffer->data_pages[0] = all_buf + PAGE_SIZE;
3592 buffer->page_order = ilog2(nr_pages);
3593 buffer->nr_pages = 1;
3594
3595 perf_buffer_init(buffer, watermark, flags);
3596
3597 return buffer;
3598
3599fail_all_buf:
3600 kfree(buffer);
3601
3602fail:
3603 return NULL;
3604}
3605
3606#endif
3607
3608static unsigned long perf_data_size(struct perf_buffer *buffer)
3609{
3610 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
3611}
3612
3613static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 3463static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3614{ 3464{
3615 struct perf_event *event = vma->vm_file->private_data; 3465 struct perf_event *event = vma->vm_file->private_data;
3616 struct perf_buffer *buffer; 3466 struct ring_buffer *rb;
3617 int ret = VM_FAULT_SIGBUS; 3467 int ret = VM_FAULT_SIGBUS;
3618 3468
3619 if (vmf->flags & FAULT_FLAG_MKWRITE) { 3469 if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -3623,14 +3473,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3623 } 3473 }
3624 3474
3625 rcu_read_lock(); 3475 rcu_read_lock();
3626 buffer = rcu_dereference(event->buffer); 3476 rb = rcu_dereference(event->rb);
3627 if (!buffer) 3477 if (!rb)
3628 goto unlock; 3478 goto unlock;
3629 3479
3630 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 3480 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3631 goto unlock; 3481 goto unlock;
3632 3482
3633 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); 3483 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
3634 if (!vmf->page) 3484 if (!vmf->page)
3635 goto unlock; 3485 goto unlock;
3636 3486
@@ -3645,35 +3495,35 @@ unlock:
3645 return ret; 3495 return ret;
3646} 3496}
3647 3497
3648static void perf_buffer_free_rcu(struct rcu_head *rcu_head) 3498static void rb_free_rcu(struct rcu_head *rcu_head)
3649{ 3499{
3650 struct perf_buffer *buffer; 3500 struct ring_buffer *rb;
3651 3501
3652 buffer = container_of(rcu_head, struct perf_buffer, rcu_head); 3502 rb = container_of(rcu_head, struct ring_buffer, rcu_head);
3653 perf_buffer_free(buffer); 3503 rb_free(rb);
3654} 3504}
3655 3505
3656static struct perf_buffer *perf_buffer_get(struct perf_event *event) 3506static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3657{ 3507{
3658 struct perf_buffer *buffer; 3508 struct ring_buffer *rb;
3659 3509
3660 rcu_read_lock(); 3510 rcu_read_lock();
3661 buffer = rcu_dereference(event->buffer); 3511 rb = rcu_dereference(event->rb);
3662 if (buffer) { 3512 if (rb) {
3663 if (!atomic_inc_not_zero(&buffer->refcount)) 3513 if (!atomic_inc_not_zero(&rb->refcount))
3664 buffer = NULL; 3514 rb = NULL;
3665 } 3515 }
3666 rcu_read_unlock(); 3516 rcu_read_unlock();
3667 3517
3668 return buffer; 3518 return rb;
3669} 3519}
3670 3520
3671static void perf_buffer_put(struct perf_buffer *buffer) 3521static void ring_buffer_put(struct ring_buffer *rb)
3672{ 3522{
3673 if (!atomic_dec_and_test(&buffer->refcount)) 3523 if (!atomic_dec_and_test(&rb->refcount))
3674 return; 3524 return;
3675 3525
3676 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); 3526 call_rcu(&rb->rcu_head, rb_free_rcu);
3677} 3527}
3678 3528
3679static void perf_mmap_open(struct vm_area_struct *vma) 3529static void perf_mmap_open(struct vm_area_struct *vma)
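ring_buffer_get()/ring_buffer_put() are the usual RCU-protected refcount pattern: a reference may only be taken while the count is still non-zero, and the final put defers the free past a grace period. The sketch below models just the inc-not-zero part with C11 atomics; the call_rcu()/rb_free_rcu() deferral is assumed rather than reproduced.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct rb { atomic_int refcount; };

/* Take a reference only if the object is still live (refcount > 0). */
static bool rb_tryget(struct rb *rb)
{
        int old = atomic_load(&rb->refcount);

        while (old != 0) {
                if (atomic_compare_exchange_weak(&rb->refcount, &old, old + 1))
                        return true;
        }
        return false;   /* already on its way to being freed */
}

static void rb_put(struct rb *rb)
{
        if (atomic_fetch_sub(&rb->refcount, 1) == 1)
                printf("last ref dropped: would call_rcu() the free here\n");
}

int main(void)
{
        struct rb rb = { 1 };

        if (rb_tryget(&rb))
                rb_put(&rb);    /* drop the extra ref */
        rb_put(&rb);            /* drop the initial ref; triggers the free path */
        return 0;
}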
@@ -3688,16 +3538,16 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3688 struct perf_event *event = vma->vm_file->private_data; 3538 struct perf_event *event = vma->vm_file->private_data;
3689 3539
3690 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 3540 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
3691 unsigned long size = perf_data_size(event->buffer); 3541 unsigned long size = perf_data_size(event->rb);
3692 struct user_struct *user = event->mmap_user; 3542 struct user_struct *user = event->mmap_user;
3693 struct perf_buffer *buffer = event->buffer; 3543 struct ring_buffer *rb = event->rb;
3694 3544
3695 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3545 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3696 vma->vm_mm->locked_vm -= event->mmap_locked; 3546 vma->vm_mm->locked_vm -= event->mmap_locked;
3697 rcu_assign_pointer(event->buffer, NULL); 3547 rcu_assign_pointer(event->rb, NULL);
3698 mutex_unlock(&event->mmap_mutex); 3548 mutex_unlock(&event->mmap_mutex);
3699 3549
3700 perf_buffer_put(buffer); 3550 ring_buffer_put(rb);
3701 free_uid(user); 3551 free_uid(user);
3702 } 3552 }
3703} 3553}
@@ -3715,7 +3565,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3715 unsigned long user_locked, user_lock_limit; 3565 unsigned long user_locked, user_lock_limit;
3716 struct user_struct *user = current_user(); 3566 struct user_struct *user = current_user();
3717 unsigned long locked, lock_limit; 3567 unsigned long locked, lock_limit;
3718 struct perf_buffer *buffer; 3568 struct ring_buffer *rb;
3719 unsigned long vma_size; 3569 unsigned long vma_size;
3720 unsigned long nr_pages; 3570 unsigned long nr_pages;
3721 long user_extra, extra; 3571 long user_extra, extra;
@@ -3724,7 +3574,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3724 /* 3574 /*
3725 * Don't allow mmap() of inherited per-task counters. This would 3575 * Don't allow mmap() of inherited per-task counters. This would
3726 * create a performance issue due to all children writing to the 3576 * create a performance issue due to all children writing to the
3727 * same buffer. 3577 * same rb.
3728 */ 3578 */
3729 if (event->cpu == -1 && event->attr.inherit) 3579 if (event->cpu == -1 && event->attr.inherit)
3730 return -EINVAL; 3580 return -EINVAL;
@@ -3736,7 +3586,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3736 nr_pages = (vma_size / PAGE_SIZE) - 1; 3586 nr_pages = (vma_size / PAGE_SIZE) - 1;
3737 3587
3738 /* 3588 /*
3739 * If we have buffer pages ensure they're a power-of-two number, so we 3589 * If we have rb pages ensure they're a power-of-two number, so we
3740 * can do bitmasks instead of modulo. 3590 * can do bitmasks instead of modulo.
3741 */ 3591 */
3742 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 3592 if (nr_pages != 0 && !is_power_of_2(nr_pages))
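The buffer must still be a power-of-two number of data pages so head/tail offsets can wrap with a single AND instead of a modulo. A tiny demonstration of that masking trick with arbitrary sizes:

#include <stdio.h>
#include <stdint.h>

static int is_power_of_2(uint64_t n)
{
        return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
        uint64_t nr_pages = 8, page_size = 4096;
        uint64_t data_size = nr_pages * page_size;
        uint64_t mask = data_size - 1;          /* valid only for powers of two */
        uint64_t head = 3 * data_size + 123;    /* a monotonically growing offset */

        if (!is_power_of_2(nr_pages))
                return 1;

        /* head & mask gives the position inside the buffer, no '%' needed */
        printf("head=%llu wraps to offset %llu\n",
               (unsigned long long)head, (unsigned long long)(head & mask));
        return 0;
}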
@@ -3750,9 +3600,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3750 3600
3751 WARN_ON_ONCE(event->ctx->parent_ctx); 3601 WARN_ON_ONCE(event->ctx->parent_ctx);
3752 mutex_lock(&event->mmap_mutex); 3602 mutex_lock(&event->mmap_mutex);
3753 if (event->buffer) { 3603 if (event->rb) {
3754 if (event->buffer->nr_pages == nr_pages) 3604 if (event->rb->nr_pages == nr_pages)
3755 atomic_inc(&event->buffer->refcount); 3605 atomic_inc(&event->rb->refcount);
3756 else 3606 else
3757 ret = -EINVAL; 3607 ret = -EINVAL;
3758 goto unlock; 3608 goto unlock;
@@ -3782,18 +3632,20 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3782 goto unlock; 3632 goto unlock;
3783 } 3633 }
3784 3634
3785 WARN_ON(event->buffer); 3635 WARN_ON(event->rb);
3786 3636
3787 if (vma->vm_flags & VM_WRITE) 3637 if (vma->vm_flags & VM_WRITE)
3788 flags |= PERF_BUFFER_WRITABLE; 3638 flags |= RING_BUFFER_WRITABLE;
3639
3640 rb = rb_alloc(nr_pages,
3641 event->attr.watermark ? event->attr.wakeup_watermark : 0,
3642 event->cpu, flags);
3789 3643
3790 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, 3644 if (!rb) {
3791 event->cpu, flags);
3792 if (!buffer) {
3793 ret = -ENOMEM; 3645 ret = -ENOMEM;
3794 goto unlock; 3646 goto unlock;
3795 } 3647 }
3796 rcu_assign_pointer(event->buffer, buffer); 3648 rcu_assign_pointer(event->rb, rb);
3797 3649
3798 atomic_long_add(user_extra, &user->locked_vm); 3650 atomic_long_add(user_extra, &user->locked_vm);
3799 event->mmap_locked = extra; 3651 event->mmap_locked = extra;
@@ -3892,117 +3744,6 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3892} 3744}
3893EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); 3745EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
3894 3746
3895/*
3896 * Output
3897 */
3898static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
3899 unsigned long offset, unsigned long head)
3900{
3901 unsigned long mask;
3902
3903 if (!buffer->writable)
3904 return true;
3905
3906 mask = perf_data_size(buffer) - 1;
3907
3908 offset = (offset - tail) & mask;
3909 head = (head - tail) & mask;
3910
3911 if ((int)(head - offset) < 0)
3912 return false;
3913
3914 return true;
3915}
3916
3917static void perf_output_wakeup(struct perf_output_handle *handle)
3918{
3919 atomic_set(&handle->buffer->poll, POLL_IN);
3920
3921 if (handle->nmi) {
3922 handle->event->pending_wakeup = 1;
3923 irq_work_queue(&handle->event->pending);
3924 } else
3925 perf_event_wakeup(handle->event);
3926}
3927
3928/*
3929 * We need to ensure a later event_id doesn't publish a head when a former
3930 * event isn't done writing. However since we need to deal with NMIs we
3931 * cannot fully serialize things.
3932 *
3933 * We only publish the head (and generate a wakeup) when the outer-most
3934 * event completes.
3935 */
3936static void perf_output_get_handle(struct perf_output_handle *handle)
3937{
3938 struct perf_buffer *buffer = handle->buffer;
3939
3940 preempt_disable();
3941 local_inc(&buffer->nest);
3942 handle->wakeup = local_read(&buffer->wakeup);
3943}
3944
3945static void perf_output_put_handle(struct perf_output_handle *handle)
3946{
3947 struct perf_buffer *buffer = handle->buffer;
3948 unsigned long head;
3949
3950again:
3951 head = local_read(&buffer->head);
3952
3953 /*
3954 * IRQ/NMI can happen here, which means we can miss a head update.
3955 */
3956
3957 if (!local_dec_and_test(&buffer->nest))
3958 goto out;
3959
3960 /*
3961 * Publish the known good head. Rely on the full barrier implied
3962 * by atomic_dec_and_test() order the buffer->head read and this
3963 * write.
3964 */
3965 buffer->user_page->data_head = head;
3966
3967 /*
3968 * Now check if we missed an update, rely on the (compiler)
3969 * barrier in atomic_dec_and_test() to re-read buffer->head.
3970 */
3971 if (unlikely(head != local_read(&buffer->head))) {
3972 local_inc(&buffer->nest);
3973 goto again;
3974 }
3975
3976 if (handle->wakeup != local_read(&buffer->wakeup))
3977 perf_output_wakeup(handle);
3978
3979out:
3980 preempt_enable();
3981}
3982
3983__always_inline void perf_output_copy(struct perf_output_handle *handle,
3984 const void *buf, unsigned int len)
3985{
3986 do {
3987 unsigned long size = min_t(unsigned long, handle->size, len);
3988
3989 memcpy(handle->addr, buf, size);
3990
3991 len -= size;
3992 handle->addr += size;
3993 buf += size;
3994 handle->size -= size;
3995 if (!handle->size) {
3996 struct perf_buffer *buffer = handle->buffer;
3997
3998 handle->page++;
3999 handle->page &= buffer->nr_pages - 1;
4000 handle->addr = buffer->data_pages[handle->page];
4001 handle->size = PAGE_SIZE << page_order(buffer);
4002 }
4003 } while (len);
4004}
4005
4006static void __perf_event_header__init_id(struct perf_event_header *header, 3747static void __perf_event_header__init_id(struct perf_event_header *header,
4007 struct perf_sample_data *data, 3748 struct perf_sample_data *data,
4008 struct perf_event *event) 3749 struct perf_event *event)
@@ -4033,9 +3774,9 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
4033 } 3774 }
4034} 3775}
4035 3776
4036static void perf_event_header__init_id(struct perf_event_header *header, 3777void perf_event_header__init_id(struct perf_event_header *header,
4037 struct perf_sample_data *data, 3778 struct perf_sample_data *data,
4038 struct perf_event *event) 3779 struct perf_event *event)
4039{ 3780{
4040 if (event->attr.sample_id_all) 3781 if (event->attr.sample_id_all)
4041 __perf_event_header__init_id(header, data, event); 3782 __perf_event_header__init_id(header, data, event);
@@ -4062,121 +3803,14 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4062 perf_output_put(handle, data->cpu_entry); 3803 perf_output_put(handle, data->cpu_entry);
4063} 3804}
4064 3805
4065static void perf_event__output_id_sample(struct perf_event *event, 3806void perf_event__output_id_sample(struct perf_event *event,
4066 struct perf_output_handle *handle, 3807 struct perf_output_handle *handle,
4067 struct perf_sample_data *sample) 3808 struct perf_sample_data *sample)
4068{ 3809{
4069 if (event->attr.sample_id_all) 3810 if (event->attr.sample_id_all)
4070 __perf_event__output_id_sample(handle, sample); 3811 __perf_event__output_id_sample(handle, sample);
4071} 3812}
4072 3813
4073int perf_output_begin(struct perf_output_handle *handle,
4074 struct perf_event *event, unsigned int size,
4075 int nmi, int sample)
4076{
4077 struct perf_buffer *buffer;
4078 unsigned long tail, offset, head;
4079 int have_lost;
4080 struct perf_sample_data sample_data;
4081 struct {
4082 struct perf_event_header header;
4083 u64 id;
4084 u64 lost;
4085 } lost_event;
4086
4087 rcu_read_lock();
4088 /*
4089 * For inherited events we send all the output towards the parent.
4090 */
4091 if (event->parent)
4092 event = event->parent;
4093
4094 buffer = rcu_dereference(event->buffer);
4095 if (!buffer)
4096 goto out;
4097
4098 handle->buffer = buffer;
4099 handle->event = event;
4100 handle->nmi = nmi;
4101 handle->sample = sample;
4102
4103 if (!buffer->nr_pages)
4104 goto out;
4105
4106 have_lost = local_read(&buffer->lost);
4107 if (have_lost) {
4108 lost_event.header.size = sizeof(lost_event);
4109 perf_event_header__init_id(&lost_event.header, &sample_data,
4110 event);
4111 size += lost_event.header.size;
4112 }
4113
4114 perf_output_get_handle(handle);
4115
4116 do {
4117 /*
4118 * Userspace could choose to issue a mb() before updating the
4119 * tail pointer. So that all reads will be completed before the
4120 * write is issued.
4121 */
4122 tail = ACCESS_ONCE(buffer->user_page->data_tail);
4123 smp_rmb();
4124 offset = head = local_read(&buffer->head);
4125 head += size;
4126 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
4127 goto fail;
4128 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
4129
4130 if (head - local_read(&buffer->wakeup) > buffer->watermark)
4131 local_add(buffer->watermark, &buffer->wakeup);
4132
4133 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
4134 handle->page &= buffer->nr_pages - 1;
4135 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
4136 handle->addr = buffer->data_pages[handle->page];
4137 handle->addr += handle->size;
4138 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
4139
4140 if (have_lost) {
4141 lost_event.header.type = PERF_RECORD_LOST;
4142 lost_event.header.misc = 0;
4143 lost_event.id = event->id;
4144 lost_event.lost = local_xchg(&buffer->lost, 0);
4145
4146 perf_output_put(handle, lost_event);
4147 perf_event__output_id_sample(event, handle, &sample_data);
4148 }
4149
4150 return 0;
4151
4152fail:
4153 local_inc(&buffer->lost);
4154 perf_output_put_handle(handle);
4155out:
4156 rcu_read_unlock();
4157
4158 return -ENOSPC;
4159}
4160
4161void perf_output_end(struct perf_output_handle *handle)
4162{
4163 struct perf_event *event = handle->event;
4164 struct perf_buffer *buffer = handle->buffer;
4165
4166 int wakeup_events = event->attr.wakeup_events;
4167
4168 if (handle->sample && wakeup_events) {
4169 int events = local_inc_return(&buffer->events);
4170 if (events >= wakeup_events) {
4171 local_sub(wakeup_events, &buffer->events);
4172 local_inc(&buffer->wakeup);
4173 }
4174 }
4175
4176 perf_output_put_handle(handle);
4177 rcu_read_unlock();
4178}
4179
4180static void perf_output_read_one(struct perf_output_handle *handle, 3814static void perf_output_read_one(struct perf_output_handle *handle,
4181 struct perf_event *event, 3815 struct perf_event *event,
4182 u64 enabled, u64 running) 3816 u64 enabled, u64 running)
@@ -4197,7 +3831,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
4197 if (read_format & PERF_FORMAT_ID) 3831 if (read_format & PERF_FORMAT_ID)
4198 values[n++] = primary_event_id(event); 3832 values[n++] = primary_event_id(event);
4199 3833
4200 perf_output_copy(handle, values, n * sizeof(u64)); 3834 __output_copy(handle, values, n * sizeof(u64));
4201} 3835}
4202 3836
4203/* 3837/*
@@ -4227,7 +3861,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4227 if (read_format & PERF_FORMAT_ID) 3861 if (read_format & PERF_FORMAT_ID)
4228 values[n++] = primary_event_id(leader); 3862 values[n++] = primary_event_id(leader);
4229 3863
4230 perf_output_copy(handle, values, n * sizeof(u64)); 3864 __output_copy(handle, values, n * sizeof(u64));
4231 3865
4232 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 3866 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4233 n = 0; 3867 n = 0;
@@ -4239,7 +3873,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4239 if (read_format & PERF_FORMAT_ID) 3873 if (read_format & PERF_FORMAT_ID)
4240 values[n++] = primary_event_id(sub); 3874 values[n++] = primary_event_id(sub);
4241 3875
4242 perf_output_copy(handle, values, n * sizeof(u64)); 3876 __output_copy(handle, values, n * sizeof(u64));
4243 } 3877 }
4244} 3878}
4245 3879
@@ -4249,7 +3883,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4249static void perf_output_read(struct perf_output_handle *handle, 3883static void perf_output_read(struct perf_output_handle *handle,
4250 struct perf_event *event) 3884 struct perf_event *event)
4251{ 3885{
4252 u64 enabled = 0, running = 0, now, ctx_time; 3886 u64 enabled = 0, running = 0;
4253 u64 read_format = event->attr.read_format; 3887 u64 read_format = event->attr.read_format;
4254 3888
4255 /* 3889 /*
@@ -4261,12 +3895,8 @@ static void perf_output_read(struct perf_output_handle *handle,
4261 * because of locking issue as we are called in 3895 * because of locking issue as we are called in
4262 * NMI context 3896 * NMI context
4263 */ 3897 */
4264 if (read_format & PERF_FORMAT_TOTAL_TIMES) { 3898 if (read_format & PERF_FORMAT_TOTAL_TIMES)
4265 now = perf_clock(); 3899 calc_timer_values(event, &enabled, &running);
4266 ctx_time = event->shadow_ctx_time + now;
4267 enabled = ctx_time - event->tstamp_enabled;
4268 running = ctx_time - event->tstamp_running;
4269 }
4270 3900
4271 if (event->attr.read_format & PERF_FORMAT_GROUP) 3901 if (event->attr.read_format & PERF_FORMAT_GROUP)
4272 perf_output_read_group(handle, event, enabled, running); 3902 perf_output_read_group(handle, event, enabled, running);
@@ -4319,7 +3949,7 @@ void perf_output_sample(struct perf_output_handle *handle,
4319 3949
4320 size *= sizeof(u64); 3950 size *= sizeof(u64);
4321 3951
4322 perf_output_copy(handle, data->callchain, size); 3952 __output_copy(handle, data->callchain, size);
4323 } else { 3953 } else {
4324 u64 nr = 0; 3954 u64 nr = 0;
4325 perf_output_put(handle, nr); 3955 perf_output_put(handle, nr);
@@ -4329,8 +3959,8 @@ void perf_output_sample(struct perf_output_handle *handle,
4329 if (sample_type & PERF_SAMPLE_RAW) { 3959 if (sample_type & PERF_SAMPLE_RAW) {
4330 if (data->raw) { 3960 if (data->raw) {
4331 perf_output_put(handle, data->raw->size); 3961 perf_output_put(handle, data->raw->size);
4332 perf_output_copy(handle, data->raw->data, 3962 __output_copy(handle, data->raw->data,
4333 data->raw->size); 3963 data->raw->size);
4334 } else { 3964 } else {
4335 struct { 3965 struct {
4336 u32 size; 3966 u32 size;
@@ -4342,6 +3972,20 @@ void perf_output_sample(struct perf_output_handle *handle,
4342 perf_output_put(handle, raw); 3972 perf_output_put(handle, raw);
4343 } 3973 }
4344 } 3974 }
3975
3976 if (!event->attr.watermark) {
3977 int wakeup_events = event->attr.wakeup_events;
3978
3979 if (wakeup_events) {
3980 struct ring_buffer *rb = handle->rb;
3981 int events = local_inc_return(&rb->events);
3982
3983 if (events >= wakeup_events) {
3984 local_sub(wakeup_events, &rb->events);
3985 local_inc(&rb->wakeup);
3986 }
3987 }
3988 }
4345} 3989}
4346 3990
4347void perf_prepare_sample(struct perf_event_header *header, 3991void perf_prepare_sample(struct perf_event_header *header,
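With the wakeup bookkeeping folded into perf_output_sample(), each sample (when attr.wakeup_events is set and no watermark is in use) bumps a per-buffer counter; on reaching the threshold the counter is wound back and a wakeup is requested. A single-threaded model of that counter, without the kernel's local_t atomics:

#include <stdio.h>

struct rb_counters {
        long events;    /* samples written since the last wakeup */
        long wakeup;    /* how many wakeups have been requested */
};

static void account_sample(struct rb_counters *rb, int wakeup_events)
{
        if (!wakeup_events)
                return;
        if (++rb->events >= wakeup_events) {
                rb->events -= wakeup_events;    /* keep the remainder */
                rb->wakeup++;                   /* reader gets poked */
        }
}

int main(void)
{
        struct rb_counters rb = { 0, 0 };

        for (int i = 0; i < 10; i++)
                account_sample(&rb, 4);
        printf("events=%ld wakeups=%ld\n", rb.events, rb.wakeup);  /* 2 and 2 */
        return 0;
}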
@@ -4386,7 +4030,7 @@ void perf_prepare_sample(struct perf_event_header *header,
4386 } 4030 }
4387} 4031}
4388 4032
4389static void perf_event_output(struct perf_event *event, int nmi, 4033static void perf_event_output(struct perf_event *event,
4390 struct perf_sample_data *data, 4034 struct perf_sample_data *data,
4391 struct pt_regs *regs) 4035 struct pt_regs *regs)
4392{ 4036{
@@ -4398,7 +4042,7 @@ static void perf_event_output(struct perf_event *event, int nmi,
4398 4042
4399 perf_prepare_sample(&header, data, event, regs); 4043 perf_prepare_sample(&header, data, event, regs);
4400 4044
4401 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 4045 if (perf_output_begin(&handle, event, header.size))
4402 goto exit; 4046 goto exit;
4403 4047
4404 perf_output_sample(&handle, &header, data, event); 4048 perf_output_sample(&handle, &header, data, event);
@@ -4438,7 +4082,7 @@ perf_event_read_event(struct perf_event *event,
4438 int ret; 4082 int ret;
4439 4083
4440 perf_event_header__init_id(&read_event.header, &sample, event); 4084 perf_event_header__init_id(&read_event.header, &sample, event);
4441 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); 4085 ret = perf_output_begin(&handle, event, read_event.header.size);
4442 if (ret) 4086 if (ret)
4443 return; 4087 return;
4444 4088
@@ -4481,7 +4125,7 @@ static void perf_event_task_output(struct perf_event *event,
4481 perf_event_header__init_id(&task_event->event_id.header, &sample, event); 4125 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4482 4126
4483 ret = perf_output_begin(&handle, event, 4127 ret = perf_output_begin(&handle, event,
4484 task_event->event_id.header.size, 0, 0); 4128 task_event->event_id.header.size);
4485 if (ret) 4129 if (ret)
4486 goto out; 4130 goto out;
4487 4131
@@ -4618,7 +4262,7 @@ static void perf_event_comm_output(struct perf_event *event,
4618 4262
4619 perf_event_header__init_id(&comm_event->event_id.header, &sample, event); 4263 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4620 ret = perf_output_begin(&handle, event, 4264 ret = perf_output_begin(&handle, event,
4621 comm_event->event_id.header.size, 0, 0); 4265 comm_event->event_id.header.size);
4622 4266
4623 if (ret) 4267 if (ret)
4624 goto out; 4268 goto out;
@@ -4627,7 +4271,7 @@ static void perf_event_comm_output(struct perf_event *event,
4627 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 4271 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
4628 4272
4629 perf_output_put(&handle, comm_event->event_id); 4273 perf_output_put(&handle, comm_event->event_id);
4630 perf_output_copy(&handle, comm_event->comm, 4274 __output_copy(&handle, comm_event->comm,
4631 comm_event->comm_size); 4275 comm_event->comm_size);
4632 4276
4633 perf_event__output_id_sample(event, &handle, &sample); 4277 perf_event__output_id_sample(event, &handle, &sample);
@@ -4765,7 +4409,7 @@ static void perf_event_mmap_output(struct perf_event *event,
4765 4409
4766 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 4410 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4767 ret = perf_output_begin(&handle, event, 4411 ret = perf_output_begin(&handle, event,
4768 mmap_event->event_id.header.size, 0, 0); 4412 mmap_event->event_id.header.size);
4769 if (ret) 4413 if (ret)
4770 goto out; 4414 goto out;
4771 4415
@@ -4773,7 +4417,7 @@ static void perf_event_mmap_output(struct perf_event *event,
4773 mmap_event->event_id.tid = perf_event_tid(event, current); 4417 mmap_event->event_id.tid = perf_event_tid(event, current);
4774 4418
4775 perf_output_put(&handle, mmap_event->event_id); 4419 perf_output_put(&handle, mmap_event->event_id);
4776 perf_output_copy(&handle, mmap_event->file_name, 4420 __output_copy(&handle, mmap_event->file_name,
4777 mmap_event->file_size); 4421 mmap_event->file_size);
4778 4422
4779 perf_event__output_id_sample(event, &handle, &sample); 4423 perf_event__output_id_sample(event, &handle, &sample);
@@ -4829,7 +4473,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4829 4473
4830 if (file) { 4474 if (file) {
4831 /* 4475 /*
4832 * d_path works from the end of the buffer backwards, so we 4476 * d_path works from the end of the rb backwards, so we
4833 * need to add enough zero bytes after the string to handle 4477 * need to add enough zero bytes after the string to handle
4834 * the 64bit alignment we do later. 4478 * the 64bit alignment we do later.
4835 */ 4479 */
@@ -4960,7 +4604,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4960 perf_event_header__init_id(&throttle_event.header, &sample, event); 4604 perf_event_header__init_id(&throttle_event.header, &sample, event);
4961 4605
4962 ret = perf_output_begin(&handle, event, 4606 ret = perf_output_begin(&handle, event,
4963 throttle_event.header.size, 1, 0); 4607 throttle_event.header.size);
4964 if (ret) 4608 if (ret)
4965 return; 4609 return;
4966 4610
@@ -4973,7 +4617,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4973 * Generic event overflow handling, sampling. 4617 * Generic event overflow handling, sampling.
4974 */ 4618 */
4975 4619
4976static int __perf_event_overflow(struct perf_event *event, int nmi, 4620static int __perf_event_overflow(struct perf_event *event,
4977 int throttle, struct perf_sample_data *data, 4621 int throttle, struct perf_sample_data *data,
4978 struct pt_regs *regs) 4622 struct pt_regs *regs)
4979{ 4623{
@@ -5016,34 +4660,28 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
5016 if (events && atomic_dec_and_test(&event->event_limit)) { 4660 if (events && atomic_dec_and_test(&event->event_limit)) {
5017 ret = 1; 4661 ret = 1;
5018 event->pending_kill = POLL_HUP; 4662 event->pending_kill = POLL_HUP;
5019 if (nmi) { 4663 event->pending_disable = 1;
5020 event->pending_disable = 1; 4664 irq_work_queue(&event->pending);
5021 irq_work_queue(&event->pending);
5022 } else
5023 perf_event_disable(event);
5024 } 4665 }
5025 4666
5026 if (event->overflow_handler) 4667 if (event->overflow_handler)
5027 event->overflow_handler(event, nmi, data, regs); 4668 event->overflow_handler(event, data, regs);
5028 else 4669 else
5029 perf_event_output(event, nmi, data, regs); 4670 perf_event_output(event, data, regs);
5030 4671
5031 if (event->fasync && event->pending_kill) { 4672 if (event->fasync && event->pending_kill) {
5032 if (nmi) { 4673 event->pending_wakeup = 1;
5033 event->pending_wakeup = 1; 4674 irq_work_queue(&event->pending);
5034 irq_work_queue(&event->pending);
5035 } else
5036 perf_event_wakeup(event);
5037 } 4675 }
5038 4676
5039 return ret; 4677 return ret;
5040} 4678}
5041 4679
5042int perf_event_overflow(struct perf_event *event, int nmi, 4680int perf_event_overflow(struct perf_event *event,
5043 struct perf_sample_data *data, 4681 struct perf_sample_data *data,
5044 struct pt_regs *regs) 4682 struct pt_regs *regs)
5045{ 4683{
5046 return __perf_event_overflow(event, nmi, 1, data, regs); 4684 return __perf_event_overflow(event, 1, data, regs);
5047} 4685}
5048 4686
5049/* 4687/*
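
With the nmi argument gone, __perf_event_overflow() no longer chooses between calling perf_event_disable()/perf_event_wakeup() directly and deferring; it always sets the pending flag and queues the event's irq_work, which is safe from any context including NMI. A minimal sketch of that deferral pattern (hypothetical names, not the perf code itself; assumes <linux/irq_work.h> as present in this tree):

#include <linux/irq_work.h>
#include <linux/kernel.h>

struct my_state {
        struct irq_work pending;        /* queued from NMI/IRQ context */
        int need_wakeup;
};

static struct my_state state;

/* runs later, once we are out of NMI context */
static void my_pending_fn(struct irq_work *work)
{
        struct my_state *s = container_of(work, struct my_state, pending);

        if (s->need_wakeup) {
                s->need_wakeup = 0;
                pr_info("deferred wakeup ran\n");
        }
}

void my_setup(void)
{
        init_irq_work(&state.pending, my_pending_fn);
}

/* safe to call from NMI context: just mark and queue */
void my_overflow(void)
{
        state.need_wakeup = 1;
        irq_work_queue(&state.pending);
}
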
@@ -5092,7 +4730,7 @@ again:
5092} 4730}
5093 4731
5094static void perf_swevent_overflow(struct perf_event *event, u64 overflow, 4732static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5095 int nmi, struct perf_sample_data *data, 4733 struct perf_sample_data *data,
5096 struct pt_regs *regs) 4734 struct pt_regs *regs)
5097{ 4735{
5098 struct hw_perf_event *hwc = &event->hw; 4736 struct hw_perf_event *hwc = &event->hw;
@@ -5106,7 +4744,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5106 return; 4744 return;
5107 4745
5108 for (; overflow; overflow--) { 4746 for (; overflow; overflow--) {
5109 if (__perf_event_overflow(event, nmi, throttle, 4747 if (__perf_event_overflow(event, throttle,
5110 data, regs)) { 4748 data, regs)) {
5111 /* 4749 /*
5112 * We inhibit the overflow from happening when 4750 * We inhibit the overflow from happening when
@@ -5119,7 +4757,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5119} 4757}
5120 4758
5121static void perf_swevent_event(struct perf_event *event, u64 nr, 4759static void perf_swevent_event(struct perf_event *event, u64 nr,
5122 int nmi, struct perf_sample_data *data, 4760 struct perf_sample_data *data,
5123 struct pt_regs *regs) 4761 struct pt_regs *regs)
5124{ 4762{
5125 struct hw_perf_event *hwc = &event->hw; 4763 struct hw_perf_event *hwc = &event->hw;
@@ -5133,12 +4771,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
5133 return; 4771 return;
5134 4772
5135 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4773 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5136 return perf_swevent_overflow(event, 1, nmi, data, regs); 4774 return perf_swevent_overflow(event, 1, data, regs);
5137 4775
5138 if (local64_add_negative(nr, &hwc->period_left)) 4776 if (local64_add_negative(nr, &hwc->period_left))
5139 return; 4777 return;
5140 4778
5141 perf_swevent_overflow(event, 0, nmi, data, regs); 4779 perf_swevent_overflow(event, 0, data, regs);
5142} 4780}
5143 4781
5144static int perf_exclude_event(struct perf_event *event, 4782static int perf_exclude_event(struct perf_event *event,
@@ -5226,7 +4864,7 @@ find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
5226} 4864}
5227 4865
5228static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 4866static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5229 u64 nr, int nmi, 4867 u64 nr,
5230 struct perf_sample_data *data, 4868 struct perf_sample_data *data,
5231 struct pt_regs *regs) 4869 struct pt_regs *regs)
5232{ 4870{
@@ -5242,7 +4880,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5242 4880
5243 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4881 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5244 if (perf_swevent_match(event, type, event_id, data, regs)) 4882 if (perf_swevent_match(event, type, event_id, data, regs))
5245 perf_swevent_event(event, nr, nmi, data, regs); 4883 perf_swevent_event(event, nr, data, regs);
5246 } 4884 }
5247end: 4885end:
5248 rcu_read_unlock(); 4886 rcu_read_unlock();
@@ -5263,8 +4901,7 @@ inline void perf_swevent_put_recursion_context(int rctx)
5263 put_recursion_context(swhash->recursion, rctx); 4901 put_recursion_context(swhash->recursion, rctx);
5264} 4902}
5265 4903
5266void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4904void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5267 struct pt_regs *regs, u64 addr)
5268{ 4905{
5269 struct perf_sample_data data; 4906 struct perf_sample_data data;
5270 int rctx; 4907 int rctx;
@@ -5276,7 +4913,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
5276 4913
5277 perf_sample_data_init(&data, addr); 4914 perf_sample_data_init(&data, addr);
5278 4915
5279 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4916 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
5280 4917
5281 perf_swevent_put_recursion_context(rctx); 4918 perf_swevent_put_recursion_context(rctx);
5282 preempt_enable_notrace(); 4919 preempt_enable_notrace();
@@ -5524,7 +5161,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5524 5161
5525 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5162 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5526 if (perf_tp_event_match(event, &data, regs)) 5163 if (perf_tp_event_match(event, &data, regs))
5527 perf_swevent_event(event, count, 1, &data, regs); 5164 perf_swevent_event(event, count, &data, regs);
5528 } 5165 }
5529 5166
5530 perf_swevent_put_recursion_context(rctx); 5167 perf_swevent_put_recursion_context(rctx);
@@ -5617,7 +5254,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
5617 perf_sample_data_init(&sample, bp->attr.bp_addr); 5254 perf_sample_data_init(&sample, bp->attr.bp_addr);
5618 5255
5619 if (!bp->hw.state && !perf_exclude_event(bp, regs)) 5256 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5620 perf_swevent_event(bp, 1, 1, &sample, regs); 5257 perf_swevent_event(bp, 1, &sample, regs);
5621} 5258}
5622#endif 5259#endif
5623 5260
@@ -5646,7 +5283,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5646 5283
5647 if (regs && !perf_exclude_event(event, regs)) { 5284 if (regs && !perf_exclude_event(event, regs)) {
5648 if (!(event->attr.exclude_idle && current->pid == 0)) 5285 if (!(event->attr.exclude_idle && current->pid == 0))
5649 if (perf_event_overflow(event, 0, &data, regs)) 5286 if (perf_event_overflow(event, &data, regs))
5650 ret = HRTIMER_NORESTART; 5287 ret = HRTIMER_NORESTART;
5651 } 5288 }
5652 5289
@@ -5986,6 +5623,7 @@ free_dev:
5986} 5623}
5987 5624
5988static struct lock_class_key cpuctx_mutex; 5625static struct lock_class_key cpuctx_mutex;
5626static struct lock_class_key cpuctx_lock;
5989 5627
5990int perf_pmu_register(struct pmu *pmu, char *name, int type) 5628int perf_pmu_register(struct pmu *pmu, char *name, int type)
5991{ 5629{
@@ -6036,6 +5674,7 @@ skip_type:
6036 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 5674 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6037 __perf_event_init_context(&cpuctx->ctx); 5675 __perf_event_init_context(&cpuctx->ctx);
6038 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); 5676 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
5677 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6039 cpuctx->ctx.type = cpu_context; 5678 cpuctx->ctx.type = cpu_context;
6040 cpuctx->ctx.pmu = pmu; 5679 cpuctx->ctx.pmu = pmu;
6041 cpuctx->jiffies_interval = 1; 5680 cpuctx->jiffies_interval = 1;
@@ -6150,7 +5789,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6150 struct task_struct *task, 5789 struct task_struct *task,
6151 struct perf_event *group_leader, 5790 struct perf_event *group_leader,
6152 struct perf_event *parent_event, 5791 struct perf_event *parent_event,
6153 perf_overflow_handler_t overflow_handler) 5792 perf_overflow_handler_t overflow_handler,
5793 void *context)
6154{ 5794{
6155 struct pmu *pmu; 5795 struct pmu *pmu;
6156 struct perf_event *event; 5796 struct perf_event *event;
@@ -6208,10 +5848,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6208#endif 5848#endif
6209 } 5849 }
6210 5850
6211 if (!overflow_handler && parent_event) 5851 if (!overflow_handler && parent_event) {
6212 overflow_handler = parent_event->overflow_handler; 5852 overflow_handler = parent_event->overflow_handler;
5853 context = parent_event->overflow_handler_context;
5854 }
6213 5855
6214 event->overflow_handler = overflow_handler; 5856 event->overflow_handler = overflow_handler;
5857 event->overflow_handler_context = context;
6215 5858
6216 if (attr->disabled) 5859 if (attr->disabled)
6217 event->state = PERF_EVENT_STATE_OFF; 5860 event->state = PERF_EVENT_STATE_OFF;
@@ -6326,13 +5969,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6326 if (ret) 5969 if (ret)
6327 return -EFAULT; 5970 return -EFAULT;
6328 5971
6329 /*
6330 * If the type exists, the corresponding creation will verify
6331 * the attr->config.
6332 */
6333 if (attr->type >= PERF_TYPE_MAX)
6334 return -EINVAL;
6335
6336 if (attr->__reserved_1) 5972 if (attr->__reserved_1)
6337 return -EINVAL; 5973 return -EINVAL;
6338 5974
@@ -6354,7 +5990,7 @@ err_size:
6354static int 5990static int
6355perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 5991perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6356{ 5992{
6357 struct perf_buffer *buffer = NULL, *old_buffer = NULL; 5993 struct ring_buffer *rb = NULL, *old_rb = NULL;
6358 int ret = -EINVAL; 5994 int ret = -EINVAL;
6359 5995
6360 if (!output_event) 5996 if (!output_event)
@@ -6371,7 +6007,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6371 goto out; 6007 goto out;
6372 6008
6373 /* 6009 /*
6374 * If it's not a per-cpu buffer, it must be the same task. 6010 * If it's not a per-cpu rb, it must be the same task.

6375 */ 6011 */
6376 if (output_event->cpu == -1 && output_event->ctx != event->ctx) 6012 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
6377 goto out; 6013 goto out;
@@ -6383,20 +6019,20 @@ set:
6383 goto unlock; 6019 goto unlock;
6384 6020
6385 if (output_event) { 6021 if (output_event) {
6386 /* get the buffer we want to redirect to */ 6022 /* get the rb we want to redirect to */
6387 buffer = perf_buffer_get(output_event); 6023 rb = ring_buffer_get(output_event);
6388 if (!buffer) 6024 if (!rb)
6389 goto unlock; 6025 goto unlock;
6390 } 6026 }
6391 6027
6392 old_buffer = event->buffer; 6028 old_rb = event->rb;
6393 rcu_assign_pointer(event->buffer, buffer); 6029 rcu_assign_pointer(event->rb, rb);
6394 ret = 0; 6030 ret = 0;
6395unlock: 6031unlock:
6396 mutex_unlock(&event->mmap_mutex); 6032 mutex_unlock(&event->mmap_mutex);
6397 6033
6398 if (old_buffer) 6034 if (old_rb)
6399 perf_buffer_put(old_buffer); 6035 ring_buffer_put(old_rb);
6400out: 6036out:
6401 return ret; 6037 return ret;
6402} 6038}
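
perf_event_set_output() is what backs the PERF_EVENT_IOC_SET_OUTPUT ioctl; after this rename it swaps the event's struct ring_buffer rather than struct perf_buffer, but the user-visible behaviour is unchanged: both events must share a CPU or a task context, and the target supplies the mmap()ed buffer. A hedged userspace sketch (assumes two already-opened, compatible perf fds):

#include <linux/perf_event.h>
#include <sys/ioctl.h>

/*
 * Redirect the sample stream of 'fd_src' into the ring buffer that was
 * mmap()ed for 'fd_dst'.  Returns 0 on success, -1 with errno set otherwise.
 */
static int redirect_output(int fd_src, int fd_dst)
{
        return ioctl(fd_src, PERF_EVENT_IOC_SET_OUTPUT, fd_dst);
}
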
@@ -6478,7 +6114,8 @@ SYSCALL_DEFINE5(perf_event_open,
6478 } 6114 }
6479 } 6115 }
6480 6116
6481 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); 6117 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
6118 NULL, NULL);
6482 if (IS_ERR(event)) { 6119 if (IS_ERR(event)) {
6483 err = PTR_ERR(event); 6120 err = PTR_ERR(event);
6484 goto err_task; 6121 goto err_task;
@@ -6663,7 +6300,8 @@ err_fd:
6663struct perf_event * 6300struct perf_event *
6664perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 6301perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6665 struct task_struct *task, 6302 struct task_struct *task,
6666 perf_overflow_handler_t overflow_handler) 6303 perf_overflow_handler_t overflow_handler,
6304 void *context)
6667{ 6305{
6668 struct perf_event_context *ctx; 6306 struct perf_event_context *ctx;
6669 struct perf_event *event; 6307 struct perf_event *event;
@@ -6673,7 +6311,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6673 * Get the target context (task or percpu): 6311 * Get the target context (task or percpu):
6674 */ 6312 */
6675 6313
6676 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); 6314 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
6315 overflow_handler, context);
6677 if (IS_ERR(event)) { 6316 if (IS_ERR(event)) {
6678 err = PTR_ERR(event); 6317 err = PTR_ERR(event);
6679 goto err; 6318 goto err;
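
perf_event_create_kernel_counter() now takes the caller's context pointer next to the overflow handler, and perf_event_alloc() stores it in event->overflow_handler_context, so a callback can find its private state without a global. A minimal in-kernel sketch (hypothetical watcher; the handler signature is the nmi-less prototype introduced by this series, and attr/cpu are assumed to come from the caller):

#include <linux/perf_event.h>
#include <linux/slab.h>
#include <linux/err.h>

struct my_watch {
        unsigned long hits;
};

static void my_overflow(struct perf_event *event,
                        struct perf_sample_data *data, struct pt_regs *regs)
{
        struct my_watch *w = event->overflow_handler_context;

        w->hits++;      /* private state travels with the event */
}

static struct perf_event *my_start(struct perf_event_attr *attr, int cpu)
{
        struct my_watch *w = kzalloc(sizeof(*w), GFP_KERNEL);

        if (!w)
                return ERR_PTR(-ENOMEM);

        return perf_event_create_kernel_counter(attr, cpu, NULL,
                                                 my_overflow, w);
}
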
@@ -6780,7 +6419,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6780 * our context. 6419 * our context.
6781 */ 6420 */
6782 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); 6421 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
6783 task_ctx_sched_out(child_ctx, EVENT_ALL);
6784 6422
6785 /* 6423 /*
6786 * Take the context lock here so that if find_get_context is 6424 * Take the context lock here so that if find_get_context is
@@ -6788,6 +6426,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6788 * incremented the context's refcount before we do put_ctx below. 6426 * incremented the context's refcount before we do put_ctx below.
6789 */ 6427 */
6790 raw_spin_lock(&child_ctx->lock); 6428 raw_spin_lock(&child_ctx->lock);
6429 task_ctx_sched_out(child_ctx);
6791 child->perf_event_ctxp[ctxn] = NULL; 6430 child->perf_event_ctxp[ctxn] = NULL;
6792 /* 6431 /*
6793 * If this context is a clone; unclone it so it can't get 6432 * If this context is a clone; unclone it so it can't get
@@ -6957,7 +6596,7 @@ inherit_event(struct perf_event *parent_event,
6957 parent_event->cpu, 6596 parent_event->cpu,
6958 child, 6597 child,
6959 group_leader, parent_event, 6598 group_leader, parent_event,
6960 NULL); 6599 NULL, NULL);
6961 if (IS_ERR(child_event)) 6600 if (IS_ERR(child_event))
6962 return child_event; 6601 return child_event;
6963 get_ctx(child_ctx); 6602 get_ctx(child_ctx);
@@ -6984,6 +6623,8 @@ inherit_event(struct perf_event *parent_event,
6984 6623
6985 child_event->ctx = child_ctx; 6624 child_event->ctx = child_ctx;
6986 child_event->overflow_handler = parent_event->overflow_handler; 6625 child_event->overflow_handler = parent_event->overflow_handler;
6626 child_event->overflow_handler_context
6627 = parent_event->overflow_handler_context;
6987 6628
6988 /* 6629 /*
6989 * Precalculate sample_data sizes 6630 * Precalculate sample_data sizes
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 086adf25a55..b7971d6f38b 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -431,9 +431,11 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
431struct perf_event * 431struct perf_event *
432register_user_hw_breakpoint(struct perf_event_attr *attr, 432register_user_hw_breakpoint(struct perf_event_attr *attr,
433 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
434 void *context,
434 struct task_struct *tsk) 435 struct task_struct *tsk)
435{ 436{
436 return perf_event_create_kernel_counter(attr, -1, tsk, triggered); 437 return perf_event_create_kernel_counter(attr, -1, tsk, triggered,
438 context);
437} 439}
438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 440EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
439 441
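
register_user_hw_breakpoint() grows the same context argument and simply forwards it to perf_event_create_kernel_counter(). A hedged caller sketch (hypothetical names; assumes <linux/hw_breakpoint.h> and that 'tsk' and 'watched_addr' are supplied by the caller):

#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <linux/kernel.h>

static char watch_tag[] = "my watched word";    /* illustrative context */

static void my_bp_triggered(struct perf_event *bp,
                            struct perf_sample_data *data,
                            struct pt_regs *regs)
{
        char *tag = bp->overflow_handler_context;

        pr_info("breakpoint hit: %s\n", tag);
}

static struct perf_event *watch_write(struct task_struct *tsk,
                                      unsigned long watched_addr)
{
        struct perf_event_attr attr;

        hw_breakpoint_init(&attr);
        attr.bp_addr = watched_addr;
        attr.bp_len  = HW_BREAKPOINT_LEN_4;
        attr.bp_type = HW_BREAKPOINT_W;

        return register_user_hw_breakpoint(&attr, my_bp_triggered,
                                           watch_tag, tsk);
}
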
@@ -502,7 +504,8 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
502 */ 504 */
503struct perf_event * __percpu * 505struct perf_event * __percpu *
504register_wide_hw_breakpoint(struct perf_event_attr *attr, 506register_wide_hw_breakpoint(struct perf_event_attr *attr,
505 perf_overflow_handler_t triggered) 507 perf_overflow_handler_t triggered,
508 void *context)
506{ 509{
507 struct perf_event * __percpu *cpu_events, **pevent, *bp; 510 struct perf_event * __percpu *cpu_events, **pevent, *bp;
508 long err; 511 long err;
@@ -515,7 +518,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
515 get_online_cpus(); 518 get_online_cpus();
516 for_each_online_cpu(cpu) { 519 for_each_online_cpu(cpu) {
517 pevent = per_cpu_ptr(cpu_events, cpu); 520 pevent = per_cpu_ptr(cpu_events, cpu);
518 bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); 521 bp = perf_event_create_kernel_counter(attr, cpu, NULL,
522 triggered, context);
519 523
520 *pevent = bp; 524 *pevent = bp;
521 525
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
new file mode 100644
index 00000000000..09097dd8116
--- /dev/null
+++ b/kernel/events/internal.h
@@ -0,0 +1,96 @@
1#ifndef _KERNEL_EVENTS_INTERNAL_H
2#define _KERNEL_EVENTS_INTERNAL_H
3
4#define RING_BUFFER_WRITABLE 0x01
5
6struct ring_buffer {
7 atomic_t refcount;
8 struct rcu_head rcu_head;
9#ifdef CONFIG_PERF_USE_VMALLOC
10 struct work_struct work;
11 int page_order; /* allocation order */
12#endif
13 int nr_pages; /* nr of data pages */
14 int writable; /* are we writable */
15
16 atomic_t poll; /* POLL_ for wakeups */
17
18 local_t head; /* write position */
19 local_t nest; /* nested writers */
20 local_t events; /* event limit */
21 local_t wakeup; /* wakeup stamp */
22 local_t lost; /* nr records lost */
23
24 long watermark; /* wakeup watermark */
25
26 struct perf_event_mmap_page *user_page;
27 void *data_pages[0];
28};
29
30extern void rb_free(struct ring_buffer *rb);
31extern struct ring_buffer *
32rb_alloc(int nr_pages, long watermark, int cpu, int flags);
33extern void perf_event_wakeup(struct perf_event *event);
34
35extern void
36perf_event_header__init_id(struct perf_event_header *header,
37 struct perf_sample_data *data,
38 struct perf_event *event);
39extern void
40perf_event__output_id_sample(struct perf_event *event,
41 struct perf_output_handle *handle,
42 struct perf_sample_data *sample);
43
44extern struct page *
45perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
46
47#ifdef CONFIG_PERF_USE_VMALLOC
48/*
49 * Back perf_mmap() with vmalloc memory.
50 *
51 * Required for architectures that have d-cache aliasing issues.
52 */
53
54static inline int page_order(struct ring_buffer *rb)
55{
56 return rb->page_order;
57}
58
59#else
60
61static inline int page_order(struct ring_buffer *rb)
62{
63 return 0;
64}
65#endif
66
67static unsigned long perf_data_size(struct ring_buffer *rb)
68{
69 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
70}
71
72static inline void
73__output_copy(struct perf_output_handle *handle,
74 const void *buf, unsigned int len)
75{
76 do {
77 unsigned long size = min_t(unsigned long, handle->size, len);
78
79 memcpy(handle->addr, buf, size);
80
81 len -= size;
82 handle->addr += size;
83 buf += size;
84 handle->size -= size;
85 if (!handle->size) {
86 struct ring_buffer *rb = handle->rb;
87
88 handle->page++;
89 handle->page &= rb->nr_pages - 1;
90 handle->addr = rb->data_pages[handle->page];
91 handle->size = PAGE_SIZE << page_order(rb);
92 }
93 } while (len);
94}
95
96#endif /* _KERNEL_EVENTS_INTERNAL_H */
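
__output_copy() above is the work-horse behind the __output_copy() calls in the core.c hunks: it copies across page boundaries by chasing data_pages[], masking the page index with nr_pages - 1 (hence the data area must be a power-of-two number of pages). A userspace model of the same wrap logic (tiny malloc'd "pages"; all names here are made up for illustration):

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define FAKE_PAGE_SIZE 8        /* tiny pages so the wrap is visible */
#define NR_PAGES       4        /* must be a power of two */

struct fake_rb {
        char *data_pages[NR_PAGES];
};

struct fake_handle {
        struct fake_rb *rb;
        int page;               /* current data page */
        char *addr;             /* write cursor inside that page */
        size_t size;            /* bytes left in the current page */
};

static void output_copy(struct fake_handle *h, const void *buf, size_t len)
{
        while (len) {
                size_t chunk = h->size < len ? h->size : len;

                memcpy(h->addr, buf, chunk);
                len -= chunk;
                buf = (const char *)buf + chunk;
                h->addr += chunk;
                h->size -= chunk;

                if (!h->size) {                         /* page exhausted: wrap */
                        h->page = (h->page + 1) & (NR_PAGES - 1);
                        h->addr = h->rb->data_pages[h->page];
                        h->size = FAKE_PAGE_SIZE;
                }
        }
}

int main(void)
{
        struct fake_rb rb;
        struct fake_handle h = { .rb = &rb, .page = 0 };
        int i;

        for (i = 0; i < NR_PAGES; i++)
                rb.data_pages[i] = calloc(1, FAKE_PAGE_SIZE);
        h.addr = rb.data_pages[0];
        h.size = FAKE_PAGE_SIZE;

        output_copy(&h, "spans-three-fake-pages", 22);
        printf("page1: %.8s\n", rb.data_pages[1]);      /* middle chunk */
        return 0;
}
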
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
new file mode 100644
index 00000000000..a2a29205cc0
--- /dev/null
+++ b/kernel/events/ring_buffer.c
@@ -0,0 +1,380 @@
1/*
2 * Performance events ring-buffer code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/perf_event.h>
13#include <linux/vmalloc.h>
14#include <linux/slab.h>
15
16#include "internal.h"
17
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head)
20{
21 unsigned long mask;
22
23 if (!rb->writable)
24 return true;
25
26 mask = perf_data_size(rb) - 1;
27
28 offset = (offset - tail) & mask;
29 head = (head - tail) & mask;
30
31 if ((int)(head - offset) < 0)
32 return false;
33
34 return true;
35}
36
37static void perf_output_wakeup(struct perf_output_handle *handle)
38{
39 atomic_set(&handle->rb->poll, POLL_IN);
40
41 handle->event->pending_wakeup = 1;
42 irq_work_queue(&handle->event->pending);
43}
44
45/*
46 * We need to ensure a later event_id doesn't publish a head when a former
47 * event isn't done writing. However since we need to deal with NMIs we
48 * cannot fully serialize things.
49 *
50 * We only publish the head (and generate a wakeup) when the outer-most
51 * event completes.
52 */
53static void perf_output_get_handle(struct perf_output_handle *handle)
54{
55 struct ring_buffer *rb = handle->rb;
56
57 preempt_disable();
58 local_inc(&rb->nest);
59 handle->wakeup = local_read(&rb->wakeup);
60}
61
62static void perf_output_put_handle(struct perf_output_handle *handle)
63{
64 struct ring_buffer *rb = handle->rb;
65 unsigned long head;
66
67again:
68 head = local_read(&rb->head);
69
70 /*
71 * IRQ/NMI can happen here, which means we can miss a head update.
72 */
73
74 if (!local_dec_and_test(&rb->nest))
75 goto out;
76
77 /*
78 * Publish the known good head. Rely on the full barrier implied
79 * by atomic_dec_and_test() to order the rb->head read and this

80 * write.
81 */
82 rb->user_page->data_head = head;
83
84 /*
85 * Now check if we missed an update, rely on the (compiler)
86 * barrier in atomic_dec_and_test() to re-read rb->head.
87 */
88 if (unlikely(head != local_read(&rb->head))) {
89 local_inc(&rb->nest);
90 goto again;
91 }
92
93 if (handle->wakeup != local_read(&rb->wakeup))
94 perf_output_wakeup(handle);
95
96out:
97 preempt_enable();
98}
99
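
perf_output_get_handle()/perf_output_put_handle() implement the "outer-most writer publishes" rule from the comment above: rb->nest counts writers on this CPU, and only the writer that drops it to zero copies the locally tracked head into user_page->data_head, re-checking afterwards in case an interrupting writer advanced it. A toy single-threaded model of that control flow, where a nested call stands in for an interrupting NMI/IRQ writer (the real code relies on local_t operations and the barrier they imply; none of that is modelled here):

#include <stdio.h>

static int nest;
static long head;               /* private write position            */
static long data_head;          /* what the reader is allowed to see */

static void writer(int depth, long bytes)
{
        long seen;

        nest++;
        head += bytes;          /* reserve + copy would happen here */

        if (depth)              /* simulate an NMI landing mid-write */
                writer(depth - 1, 16);

again:
        seen = head;
        if (--nest)             /* inner writer: leave publishing to the outer one */
                return;

        data_head = seen;       /* outer-most writer publishes */

        if (seen != head) {     /* missed an update from a nested writer? */
                nest++;
                goto again;
        }
}

int main(void)
{
        writer(1, 64);
        printf("head=%ld data_head=%ld\n", head, data_head);   /* 80 80 */
        return 0;
}
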
100int perf_output_begin(struct perf_output_handle *handle,
101 struct perf_event *event, unsigned int size)
102{
103 struct ring_buffer *rb;
104 unsigned long tail, offset, head;
105 int have_lost;
106 struct perf_sample_data sample_data;
107 struct {
108 struct perf_event_header header;
109 u64 id;
110 u64 lost;
111 } lost_event;
112
113 rcu_read_lock();
114 /*
115 * For inherited events we send all the output towards the parent.
116 */
117 if (event->parent)
118 event = event->parent;
119
120 rb = rcu_dereference(event->rb);
121 if (!rb)
122 goto out;
123
124 handle->rb = rb;
125 handle->event = event;
126
127 if (!rb->nr_pages)
128 goto out;
129
130 have_lost = local_read(&rb->lost);
131 if (have_lost) {
132 lost_event.header.size = sizeof(lost_event);
133 perf_event_header__init_id(&lost_event.header, &sample_data,
134 event);
135 size += lost_event.header.size;
136 }
137
138 perf_output_get_handle(handle);
139
140 do {
141 /*
142 * Userspace could choose to issue a mb() before updating the
143 * tail pointer, so that all reads will be completed before the
144 * write is issued.
145 */
146 tail = ACCESS_ONCE(rb->user_page->data_tail);
147 smp_rmb();
148 offset = head = local_read(&rb->head);
149 head += size;
150 if (unlikely(!perf_output_space(rb, tail, offset, head)))
151 goto fail;
152 } while (local_cmpxchg(&rb->head, offset, head) != offset);
153
154 if (head - local_read(&rb->wakeup) > rb->watermark)
155 local_add(rb->watermark, &rb->wakeup);
156
157 handle->page = offset >> (PAGE_SHIFT + page_order(rb));
158 handle->page &= rb->nr_pages - 1;
159 handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
160 handle->addr = rb->data_pages[handle->page];
161 handle->addr += handle->size;
162 handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
163
164 if (have_lost) {
165 lost_event.header.type = PERF_RECORD_LOST;
166 lost_event.header.misc = 0;
167 lost_event.id = event->id;
168 lost_event.lost = local_xchg(&rb->lost, 0);
169
170 perf_output_put(handle, lost_event);
171 perf_event__output_id_sample(event, handle, &sample_data);
172 }
173
174 return 0;
175
176fail:
177 local_inc(&rb->lost);
178 perf_output_put_handle(handle);
179out:
180 rcu_read_unlock();
181
182 return -ENOSPC;
183}
184
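
perf_output_begin() reserves space by advancing rb->head with local_cmpxchg() in a retry loop and only then lets the caller copy the record; the data_tail read plus smp_rmb() pairs with the barrier userspace is expected to issue before publishing its tail. A hedged sketch of the matching reader side in userspace (this-era layout with the data area starting one page after the control page; __sync_synchronize() stands in for a proper rmb/mb, and records wrapping the buffer end are not re-assembled):

#include <linux/perf_event.h>
#include <stdint.h>
#include <unistd.h>

/*
 * Consume whatever is currently in an mmap()ed perf ring buffer.
 * 'base' is the mmap address, 'data_size' the power-of-two data area size.
 */
static void drain(void *base, uint64_t data_size,
                  void (*cb)(struct perf_event_header *))
{
        struct perf_event_mmap_page *pc = base;
        char *data = (char *)base + sysconf(_SC_PAGESIZE);
        uint64_t tail = pc->data_tail;
        uint64_t head = pc->data_head;

        __sync_synchronize();           /* read head before reading records */

        while (tail < head) {
                struct perf_event_header *hdr =
                        (void *)(data + (tail & (data_size - 1)));

                cb(hdr);
                tail += hdr->size;
        }

        __sync_synchronize();           /* finish reading before freeing space */
        pc->data_tail = tail;
}
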
185void perf_output_copy(struct perf_output_handle *handle,
186 const void *buf, unsigned int len)
187{
188 __output_copy(handle, buf, len);
189}
190
191void perf_output_end(struct perf_output_handle *handle)
192{
193 perf_output_put_handle(handle);
194 rcu_read_unlock();
195}
196
197static void
198ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
199{
200 long max_size = perf_data_size(rb);
201
202 if (watermark)
203 rb->watermark = min(max_size, watermark);
204
205 if (!rb->watermark)
206 rb->watermark = max_size / 2;
207
208 if (flags & RING_BUFFER_WRITABLE)
209 rb->writable = 1;
210
211 atomic_set(&rb->refcount, 1);
212}
213
214#ifndef CONFIG_PERF_USE_VMALLOC
215
216/*
217 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
218 */
219
220struct page *
221perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
222{
223 if (pgoff > rb->nr_pages)
224 return NULL;
225
226 if (pgoff == 0)
227 return virt_to_page(rb->user_page);
228
229 return virt_to_page(rb->data_pages[pgoff - 1]);
230}
231
232static void *perf_mmap_alloc_page(int cpu)
233{
234 struct page *page;
235 int node;
236
237 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
238 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
239 if (!page)
240 return NULL;
241
242 return page_address(page);
243}
244
245struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
246{
247 struct ring_buffer *rb;
248 unsigned long size;
249 int i;
250
251 size = sizeof(struct ring_buffer);
252 size += nr_pages * sizeof(void *);
253
254 rb = kzalloc(size, GFP_KERNEL);
255 if (!rb)
256 goto fail;
257
258 rb->user_page = perf_mmap_alloc_page(cpu);
259 if (!rb->user_page)
260 goto fail_user_page;
261
262 for (i = 0; i < nr_pages; i++) {
263 rb->data_pages[i] = perf_mmap_alloc_page(cpu);
264 if (!rb->data_pages[i])
265 goto fail_data_pages;
266 }
267
268 rb->nr_pages = nr_pages;
269
270 ring_buffer_init(rb, watermark, flags);
271
272 return rb;
273
274fail_data_pages:
275 for (i--; i >= 0; i--)
276 free_page((unsigned long)rb->data_pages[i]);
277
278 free_page((unsigned long)rb->user_page);
279
280fail_user_page:
281 kfree(rb);
282
283fail:
284 return NULL;
285}
286
287static void perf_mmap_free_page(unsigned long addr)
288{
289 struct page *page = virt_to_page((void *)addr);
290
291 page->mapping = NULL;
292 __free_page(page);
293}
294
295void rb_free(struct ring_buffer *rb)
296{
297 int i;
298
299 perf_mmap_free_page((unsigned long)rb->user_page);
300 for (i = 0; i < rb->nr_pages; i++)
301 perf_mmap_free_page((unsigned long)rb->data_pages[i]);
302 kfree(rb);
303}
304
305#else
306
307struct page *
308perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
309{
310 if (pgoff > (1UL << page_order(rb)))
311 return NULL;
312
313 return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
314}
315
316static void perf_mmap_unmark_page(void *addr)
317{
318 struct page *page = vmalloc_to_page(addr);
319
320 page->mapping = NULL;
321}
322
323static void rb_free_work(struct work_struct *work)
324{
325 struct ring_buffer *rb;
326 void *base;
327 int i, nr;
328
329 rb = container_of(work, struct ring_buffer, work);
330 nr = 1 << page_order(rb);
331
332 base = rb->user_page;
333 for (i = 0; i < nr + 1; i++)
334 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
335
336 vfree(base);
337 kfree(rb);
338}
339
340void rb_free(struct ring_buffer *rb)
341{
342 schedule_work(&rb->work);
343}
344
345struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
346{
347 struct ring_buffer *rb;
348 unsigned long size;
349 void *all_buf;
350
351 size = sizeof(struct ring_buffer);
352 size += sizeof(void *);
353
354 rb = kzalloc(size, GFP_KERNEL);
355 if (!rb)
356 goto fail;
357
358 INIT_WORK(&rb->work, rb_free_work);
359
360 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
361 if (!all_buf)
362 goto fail_all_buf;
363
364 rb->user_page = all_buf;
365 rb->data_pages[0] = all_buf + PAGE_SIZE;
366 rb->page_order = ilog2(nr_pages);
367 rb->nr_pages = 1;
368
369 ring_buffer_init(rb, watermark, flags);
370
371 return rb;
372
373fail_all_buf:
374 kfree(rb);
375
376fail:
377 return NULL;
378}
379
380#endif
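
Note how the vmalloc variant leaves the rest of the code untouched: it backs the whole buffer with one vmalloc_user() area, then reports nr_pages = 1 with page_order = ilog2(nr_pages), so perf_data_size() still yields the full size and the page masking in __output_copy() degenerates to staying on "page" 0. A quick check of that arithmetic (userspace, hypothetical values):

#include <stdio.h>

int main(void)
{
        unsigned long page_shift = 12;          /* assume 4 KiB pages */
        unsigned long nr_data_pages = 8;        /* what the user mmap()ed */

        /* regular backing: 8 pages of order 0 */
        unsigned long size_a = nr_data_pages << (page_shift + 0);

        /* vmalloc backing: "1 page" of order ilog2(8) == 3 */
        unsigned long size_b = 1UL << (page_shift + 3);

        printf("%lu %lu\n", size_a, size_b);    /* both 32768 */
        return 0;
}
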
diff --git a/kernel/exit.c b/kernel/exit.c
index 64879bdff92..9d13da8a8c2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -87,7 +87,6 @@ static void __exit_signal(struct task_struct *tsk)
87 struct tty_struct *uninitialized_var(tty); 87 struct tty_struct *uninitialized_var(tty);
88 88
89 sighand = rcu_dereference_check(tsk->sighand, 89 sighand = rcu_dereference_check(tsk->sighand,
90 rcu_read_lock_held() ||
91 lockdep_tasklist_lock_is_held()); 90 lockdep_tasklist_lock_is_held());
92 spin_lock(&sighand->siglock); 91 spin_lock(&sighand->siglock);
93 92
@@ -171,7 +170,6 @@ void release_task(struct task_struct * p)
171 struct task_struct *leader; 170 struct task_struct *leader;
172 int zap_leader; 171 int zap_leader;
173repeat: 172repeat:
174 tracehook_prepare_release_task(p);
175 /* don't need to get the RCU readlock here - the process is dead and 173 /* don't need to get the RCU readlock here - the process is dead and
176 * can't be modifying its own credentials. But shut RCU-lockdep up */ 174 * can't be modifying its own credentials. But shut RCU-lockdep up */
177 rcu_read_lock(); 175 rcu_read_lock();
@@ -181,7 +179,7 @@ repeat:
181 proc_flush_task(p); 179 proc_flush_task(p);
182 180
183 write_lock_irq(&tasklist_lock); 181 write_lock_irq(&tasklist_lock);
184 tracehook_finish_release_task(p); 182 ptrace_release_task(p);
185 __exit_signal(p); 183 __exit_signal(p);
186 184
187 /* 185 /*
@@ -192,22 +190,12 @@ repeat:
192 zap_leader = 0; 190 zap_leader = 0;
193 leader = p->group_leader; 191 leader = p->group_leader;
194 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { 192 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
195 BUG_ON(task_detached(leader));
196 do_notify_parent(leader, leader->exit_signal);
197 /* 193 /*
198 * If we were the last child thread and the leader has 194 * If we were the last child thread and the leader has
199 * exited already, and the leader's parent ignores SIGCHLD, 195 * exited already, and the leader's parent ignores SIGCHLD,
200 * then we are the one who should release the leader. 196 * then we are the one who should release the leader.
201 *
202 * do_notify_parent() will have marked it self-reaping in
203 * that case.
204 */
205 zap_leader = task_detached(leader);
206
207 /*
208 * This maintains the invariant that release_task()
209 * only runs on a task in EXIT_DEAD, just for sanity.
210 */ 197 */
198 zap_leader = do_notify_parent(leader, leader->exit_signal);
211 if (zap_leader) 199 if (zap_leader)
212 leader->exit_state = EXIT_DEAD; 200 leader->exit_state = EXIT_DEAD;
213 } 201 }
@@ -279,18 +267,16 @@ int is_current_pgrp_orphaned(void)
279 return retval; 267 return retval;
280} 268}
281 269
282static int has_stopped_jobs(struct pid *pgrp) 270static bool has_stopped_jobs(struct pid *pgrp)
283{ 271{
284 int retval = 0;
285 struct task_struct *p; 272 struct task_struct *p;
286 273
287 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 274 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
288 if (!task_is_stopped(p)) 275 if (p->signal->flags & SIGNAL_STOP_STOPPED)
289 continue; 276 return true;
290 retval = 1;
291 break;
292 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 277 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
293 return retval; 278
279 return false;
294} 280}
295 281
296/* 282/*
@@ -753,7 +739,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
753{ 739{
754 list_move_tail(&p->sibling, &p->real_parent->children); 740 list_move_tail(&p->sibling, &p->real_parent->children);
755 741
756 if (task_detached(p)) 742 if (p->exit_state == EXIT_DEAD)
757 return; 743 return;
758 /* 744 /*
759 * If this is a threaded reparent there is no need to 745 * If this is a threaded reparent there is no need to
@@ -766,10 +752,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
766 p->exit_signal = SIGCHLD; 752 p->exit_signal = SIGCHLD;
767 753
768 /* If it has exited notify the new parent about this child's death. */ 754 /* If it has exited notify the new parent about this child's death. */
769 if (!task_ptrace(p) && 755 if (!p->ptrace &&
770 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 756 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
771 do_notify_parent(p, p->exit_signal); 757 if (do_notify_parent(p, p->exit_signal)) {
772 if (task_detached(p)) {
773 p->exit_state = EXIT_DEAD; 758 p->exit_state = EXIT_DEAD;
774 list_move_tail(&p->sibling, dead); 759 list_move_tail(&p->sibling, dead);
775 } 760 }
@@ -796,7 +781,7 @@ static void forget_original_parent(struct task_struct *father)
796 do { 781 do {
797 t->real_parent = reaper; 782 t->real_parent = reaper;
798 if (t->parent == father) { 783 if (t->parent == father) {
799 BUG_ON(task_ptrace(t)); 784 BUG_ON(t->ptrace);
800 t->parent = t->real_parent; 785 t->parent = t->real_parent;
801 } 786 }
802 if (t->pdeath_signal) 787 if (t->pdeath_signal)
@@ -821,8 +806,7 @@ static void forget_original_parent(struct task_struct *father)
821 */ 806 */
822static void exit_notify(struct task_struct *tsk, int group_dead) 807static void exit_notify(struct task_struct *tsk, int group_dead)
823{ 808{
824 int signal; 809 bool autoreap;
825 void *cookie;
826 810
827 /* 811 /*
828 * This does two things: 812 * This does two things:
@@ -853,26 +837,33 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
853 * we have changed execution domain as these two values started 837 * we have changed execution domain as these two values started
854 * the same after a fork. 838 * the same after a fork.
855 */ 839 */
856 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && 840 if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
857 (tsk->parent_exec_id != tsk->real_parent->self_exec_id || 841 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
858 tsk->self_exec_id != tsk->parent_exec_id)) 842 tsk->self_exec_id != tsk->parent_exec_id))
859 tsk->exit_signal = SIGCHLD; 843 tsk->exit_signal = SIGCHLD;
860 844
861 signal = tracehook_notify_death(tsk, &cookie, group_dead); 845 if (unlikely(tsk->ptrace)) {
862 if (signal >= 0) 846 int sig = thread_group_leader(tsk) &&
863 signal = do_notify_parent(tsk, signal); 847 thread_group_empty(tsk) &&
848 !ptrace_reparented(tsk) ?
849 tsk->exit_signal : SIGCHLD;
850 autoreap = do_notify_parent(tsk, sig);
851 } else if (thread_group_leader(tsk)) {
852 autoreap = thread_group_empty(tsk) &&
853 do_notify_parent(tsk, tsk->exit_signal);
854 } else {
855 autoreap = true;
856 }
864 857
865 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; 858 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
866 859
867 /* mt-exec, de_thread() is waiting for group leader */ 860 /* mt-exec, de_thread() is waiting for group leader */
868 if (unlikely(tsk->signal->notify_count < 0)) 861 if (unlikely(tsk->signal->notify_count < 0))
869 wake_up_process(tsk->signal->group_exit_task); 862 wake_up_process(tsk->signal->group_exit_task);
870 write_unlock_irq(&tasklist_lock); 863 write_unlock_irq(&tasklist_lock);
871 864
872 tracehook_report_death(tsk, signal, cookie, group_dead);
873
874 /* If the process is dead, release it - nobody will wait for it */ 865 /* If the process is dead, release it - nobody will wait for it */
875 if (signal == DEATH_REAP) 866 if (autoreap)
876 release_task(tsk); 867 release_task(tsk);
877} 868}
878 869
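
exit_notify() now folds the old tracehook_notify_death() logic into an explicit three-way choice: a ptraced task always notifies its tracer (with exit_signal only when it is a whole, non-reparented group leader), an untraced group leader notifies the real parent only once the group is empty, and a plain sub-thread is always auto-reaped; do_notify_parent()'s return value then says whether the parent ignored the death. A condensed restatement of that decision as a fragment (the helpers are the kernel's own; tasklist_lock is assumed held, as in exit_notify()):

/* Would 'tsk' be released immediately, or left as a zombie for wait()? */
static bool exit_autoreap(struct task_struct *tsk)
{
        if (unlikely(tsk->ptrace)) {
                int sig = thread_group_leader(tsk) &&
                          thread_group_empty(tsk) &&
                          !ptrace_reparented(tsk) ? tsk->exit_signal : SIGCHLD;

                /* the tracer always hears about it; it may still ignore it */
                return do_notify_parent(tsk, sig);
        }

        if (thread_group_leader(tsk))
                /* notify only when the last thread goes; autoreap if ignored */
                return thread_group_empty(tsk) &&
                       do_notify_parent(tsk, tsk->exit_signal);

        /* sub-threads never turn into zombies themselves */
        return true;
}
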
@@ -908,7 +899,6 @@ NORET_TYPE void do_exit(long code)
908 899
909 profile_task_exit(tsk); 900 profile_task_exit(tsk);
910 901
911 WARN_ON(atomic_read(&tsk->fs_excl));
912 WARN_ON(blk_needs_flush_plug(tsk)); 902 WARN_ON(blk_needs_flush_plug(tsk));
913 903
914 if (unlikely(in_interrupt())) 904 if (unlikely(in_interrupt()))
@@ -925,7 +915,7 @@ NORET_TYPE void do_exit(long code)
925 */ 915 */
926 set_fs(USER_DS); 916 set_fs(USER_DS);
927 917
928 tracehook_report_exit(&code); 918 ptrace_event(PTRACE_EVENT_EXIT, code);
929 919
930 validate_creds_for_do_exit(tsk); 920 validate_creds_for_do_exit(tsk);
931 921
@@ -994,6 +984,7 @@ NORET_TYPE void do_exit(long code)
994 trace_sched_process_exit(tsk); 984 trace_sched_process_exit(tsk);
995 985
996 exit_sem(tsk); 986 exit_sem(tsk);
987 exit_shm(tsk);
997 exit_files(tsk); 988 exit_files(tsk);
998 exit_fs(tsk); 989 exit_fs(tsk);
999 check_stack_usage(); 990 check_stack_usage();
@@ -1239,9 +1230,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1239 traced = ptrace_reparented(p); 1230 traced = ptrace_reparented(p);
1240 /* 1231 /*
1241 * It can be ptraced but not reparented, check 1232 * It can be ptraced but not reparented, check
1242 * !task_detached() to filter out sub-threads. 1233 * thread_group_leader() to filter out sub-threads.
1243 */ 1234 */
1244 if (likely(!traced) && likely(!task_detached(p))) { 1235 if (likely(!traced) && thread_group_leader(p)) {
1245 struct signal_struct *psig; 1236 struct signal_struct *psig;
1246 struct signal_struct *sig; 1237 struct signal_struct *sig;
1247 unsigned long maxrss; 1238 unsigned long maxrss;
@@ -1349,16 +1340,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1349 /* We dropped tasklist, ptracer could die and untrace */ 1340 /* We dropped tasklist, ptracer could die and untrace */
1350 ptrace_unlink(p); 1341 ptrace_unlink(p);
1351 /* 1342 /*
1352 * If this is not a detached task, notify the parent. 1343 * If this is not a sub-thread, notify the parent.
1353 * If it's still not detached after that, don't release 1344 * If parent wants a zombie, don't release it now.
1354 * it now.
1355 */ 1345 */
1356 if (!task_detached(p)) { 1346 if (thread_group_leader(p) &&
1357 do_notify_parent(p, p->exit_signal); 1347 !do_notify_parent(p, p->exit_signal)) {
1358 if (!task_detached(p)) { 1348 p->exit_state = EXIT_ZOMBIE;
1359 p->exit_state = EXIT_ZOMBIE; 1349 p = NULL;
1360 p = NULL;
1361 }
1362 } 1350 }
1363 write_unlock_irq(&tasklist_lock); 1351 write_unlock_irq(&tasklist_lock);
1364 } 1352 }
@@ -1371,7 +1359,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1371static int *task_stopped_code(struct task_struct *p, bool ptrace) 1359static int *task_stopped_code(struct task_struct *p, bool ptrace)
1372{ 1360{
1373 if (ptrace) { 1361 if (ptrace) {
1374 if (task_is_stopped_or_traced(p)) 1362 if (task_is_stopped_or_traced(p) &&
1363 !(p->jobctl & JOBCTL_LISTENING))
1375 return &p->exit_code; 1364 return &p->exit_code;
1376 } else { 1365 } else {
1377 if (p->signal->flags & SIGNAL_STOP_STOPPED) 1366 if (p->signal->flags & SIGNAL_STOP_STOPPED)
@@ -1557,8 +1546,15 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1557 } 1546 }
1558 1547
1559 /* dead body doesn't have much to contribute */ 1548 /* dead body doesn't have much to contribute */
1560 if (p->exit_state == EXIT_DEAD) 1549 if (unlikely(p->exit_state == EXIT_DEAD)) {
1550 /*
1551 * But do not ignore this task until the tracer does
1552 * wait_task_zombie()->do_notify_parent().
1553 */
1554 if (likely(!ptrace) && unlikely(ptrace_reparented(p)))
1555 wo->notask_error = 0;
1561 return 0; 1556 return 0;
1557 }
1562 1558
1563 /* slay zombie? */ 1559 /* slay zombie? */
1564 if (p->exit_state == EXIT_ZOMBIE) { 1560 if (p->exit_state == EXIT_ZOMBIE) {
@@ -1567,7 +1563,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1567 * Notification and reaping will be cascaded to the real 1563 * Notification and reaping will be cascaded to the real
1568 * parent when the ptracer detaches. 1564 * parent when the ptracer detaches.
1569 */ 1565 */
1570 if (likely(!ptrace) && unlikely(task_ptrace(p))) { 1566 if (likely(!ptrace) && unlikely(p->ptrace)) {
1571 /* it will become visible, clear notask_error */ 1567 /* it will become visible, clear notask_error */
1572 wo->notask_error = 0; 1568 wo->notask_error = 0;
1573 return 0; 1569 return 0;
@@ -1610,8 +1606,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1610 * own children, it should create a separate process which 1606 * own children, it should create a separate process which
1611 * takes the role of real parent. 1607 * takes the role of real parent.
1612 */ 1608 */
1613 if (likely(!ptrace) && task_ptrace(p) && 1609 if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
1614 same_thread_group(p->parent, p->real_parent))
1615 return 0; 1610 return 0;
1616 1611
1617 /* 1612 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 25c6111fe3a..067992d4838 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,7 +37,6 @@
37#include <linux/swap.h> 37#include <linux/swap.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/jiffies.h> 39#include <linux/jiffies.h>
40#include <linux/tracehook.h>
41#include <linux/futex.h> 40#include <linux/futex.h>
42#include <linux/compat.h> 41#include <linux/compat.h>
43#include <linux/kthread.h> 42#include <linux/kthread.h>
@@ -84,7 +83,7 @@
84 * Protected counters by write_lock_irq(&tasklist_lock) 83 * Protected counters by write_lock_irq(&tasklist_lock)
85 */ 84 */
86unsigned long total_forks; /* Handle normal Linux uptimes. */ 85unsigned long total_forks; /* Handle normal Linux uptimes. */
87int nr_threads; /* The idle threads do not count.. */ 86int nr_threads; /* The idle threads do not count.. */
88 87
89int max_threads; /* tunable limit on nr_threads */ 88int max_threads; /* tunable limit on nr_threads */
90 89
@@ -157,6 +156,9 @@ struct kmem_cache *vm_area_cachep;
157/* SLAB cache for mm_struct structures (tsk->mm) */ 156/* SLAB cache for mm_struct structures (tsk->mm) */
158static struct kmem_cache *mm_cachep; 157static struct kmem_cache *mm_cachep;
159 158
159/* Notifier list called when a task struct is freed */
160static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
161
160static void account_kernel_stack(struct thread_info *ti, int account) 162static void account_kernel_stack(struct thread_info *ti, int account)
161{ 163{
162 struct zone *zone = page_zone(virt_to_page(ti)); 164 struct zone *zone = page_zone(virt_to_page(ti));
@@ -188,6 +190,18 @@ static inline void put_signal_struct(struct signal_struct *sig)
188 free_signal_struct(sig); 190 free_signal_struct(sig);
189} 191}
190 192
193int task_free_register(struct notifier_block *n)
194{
195 return atomic_notifier_chain_register(&task_free_notifier, n);
196}
197EXPORT_SYMBOL(task_free_register);
198
199int task_free_unregister(struct notifier_block *n)
200{
201 return atomic_notifier_chain_unregister(&task_free_notifier, n);
202}
203EXPORT_SYMBOL(task_free_unregister);
204
191void __put_task_struct(struct task_struct *tsk) 205void __put_task_struct(struct task_struct *tsk)
192{ 206{
193 WARN_ON(!tsk->exit_state); 207 WARN_ON(!tsk->exit_state);
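
The new task_free_notifier chain lets other code hear about every __put_task_struct(); task_free_register()/task_free_unregister() are exported, and callbacks run with the usual atomic-notifier signature, receiving the task as the data pointer. This hook is an addition carried in this tree, not a mainline API. A minimal module-side sketch (assumes the matching declarations this tree adds to sched.h; the callback may run in atomic context, so it only logs):

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/sched.h>

static int my_task_free(struct notifier_block *nb, unsigned long action,
                        void *data)
{
        struct task_struct *tsk = data;

        pr_debug("task %d is being freed\n", tsk->pid);
        return NOTIFY_OK;
}

static struct notifier_block my_task_free_nb = {
        .notifier_call = my_task_free,
};

static int __init my_init(void)
{
        return task_free_register(&my_task_free_nb);
}

static void __exit my_exit(void)
{
        task_free_unregister(&my_task_free_nb);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
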
@@ -199,6 +213,7 @@ void __put_task_struct(struct task_struct *tsk)
199 delayacct_tsk_free(tsk); 213 delayacct_tsk_free(tsk);
200 put_signal_struct(tsk->signal); 214 put_signal_struct(tsk->signal);
201 215
216 atomic_notifier_call_chain(&task_free_notifier, 0, tsk);
202 if (!profile_handoff_task(tsk)) 217 if (!profile_handoff_task(tsk))
203 free_task(tsk); 218 free_task(tsk);
204} 219}
@@ -237,7 +252,7 @@ void __init fork_init(unsigned long mempages)
237 /* 252 /*
238 * we need to allow at least 20 threads to boot a system 253 * we need to allow at least 20 threads to boot a system
239 */ 254 */
240 if(max_threads < 20) 255 if (max_threads < 20)
241 max_threads = 20; 256 max_threads = 20;
242 257
243 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; 258 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
@@ -273,7 +288,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
273 return NULL; 288 return NULL;
274 } 289 }
275 290
276 err = arch_dup_task_struct(tsk, orig); 291 err = arch_dup_task_struct(tsk, orig);
277 if (err) 292 if (err)
278 goto out; 293 goto out;
279 294
@@ -296,9 +311,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
296 tsk->stack_canary = get_random_int(); 311 tsk->stack_canary = get_random_int();
297#endif 312#endif
298 313
299 /* One for us, one for whoever does the "release_task()" (usually parent) */ 314 /*
300 atomic_set(&tsk->usage,2); 315 * One for us, one for whoever does the "release_task()" (usually
301 atomic_set(&tsk->fs_excl, 0); 316 * parent)
317 */
318 atomic_set(&tsk->usage, 2);
302#ifdef CONFIG_BLK_DEV_IO_TRACE 319#ifdef CONFIG_BLK_DEV_IO_TRACE
303 tsk->btrace_seq = 0; 320 tsk->btrace_seq = 0;
304#endif 321#endif
@@ -446,7 +463,7 @@ fail_nomem:
446 goto out; 463 goto out;
447} 464}
448 465
449static inline int mm_alloc_pgd(struct mm_struct * mm) 466static inline int mm_alloc_pgd(struct mm_struct *mm)
450{ 467{
451 mm->pgd = pgd_alloc(mm); 468 mm->pgd = pgd_alloc(mm);
452 if (unlikely(!mm->pgd)) 469 if (unlikely(!mm->pgd))
@@ -454,7 +471,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm)
454 return 0; 471 return 0;
455} 472}
456 473
457static inline void mm_free_pgd(struct mm_struct * mm) 474static inline void mm_free_pgd(struct mm_struct *mm)
458{ 475{
459 pgd_free(mm, mm->pgd); 476 pgd_free(mm, mm->pgd);
460} 477}
@@ -491,7 +508,7 @@ static void mm_init_aio(struct mm_struct *mm)
491#endif 508#endif
492} 509}
493 510
494static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 511static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
495{ 512{
496 atomic_set(&mm->mm_users, 1); 513 atomic_set(&mm->mm_users, 1);
497 atomic_set(&mm->mm_count, 1); 514 atomic_set(&mm->mm_count, 1);
@@ -522,9 +539,9 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
522/* 539/*
523 * Allocate and initialize an mm_struct. 540 * Allocate and initialize an mm_struct.
524 */ 541 */
525struct mm_struct * mm_alloc(void) 542struct mm_struct *mm_alloc(void)
526{ 543{
527 struct mm_struct * mm; 544 struct mm_struct *mm;
528 545
529 mm = allocate_mm(); 546 mm = allocate_mm();
530 if (!mm) 547 if (!mm)
@@ -592,7 +609,7 @@ void added_exe_file_vma(struct mm_struct *mm)
592void removed_exe_file_vma(struct mm_struct *mm) 609void removed_exe_file_vma(struct mm_struct *mm)
593{ 610{
594 mm->num_exe_file_vmas--; 611 mm->num_exe_file_vmas--;
595 if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ 612 if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
596 fput(mm->exe_file); 613 fput(mm->exe_file);
597 mm->exe_file = NULL; 614 mm->exe_file = NULL;
598 } 615 }
@@ -784,9 +801,9 @@ fail_nocontext:
784 return NULL; 801 return NULL;
785} 802}
786 803
787static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) 804static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
788{ 805{
789 struct mm_struct * mm, *oldmm; 806 struct mm_struct *mm, *oldmm;
790 int retval; 807 int retval;
791 808
792 tsk->min_flt = tsk->maj_flt = 0; 809 tsk->min_flt = tsk->maj_flt = 0;
@@ -853,7 +870,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
853 return 0; 870 return 0;
854} 871}
855 872
856static int copy_files(unsigned long clone_flags, struct task_struct * tsk) 873static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
857{ 874{
858 struct files_struct *oldf, *newf; 875 struct files_struct *oldf, *newf;
859 int error = 0; 876 int error = 0;
@@ -1020,7 +1037,7 @@ static void rt_mutex_init_task(struct task_struct *p)
1020{ 1037{
1021 raw_spin_lock_init(&p->pi_lock); 1038 raw_spin_lock_init(&p->pi_lock);
1022#ifdef CONFIG_RT_MUTEXES 1039#ifdef CONFIG_RT_MUTEXES
1023 plist_head_init_raw(&p->pi_waiters, &p->pi_lock); 1040 plist_head_init(&p->pi_waiters);
1024 p->pi_blocked_on = NULL; 1041 p->pi_blocked_on = NULL;
1025#endif 1042#endif
1026} 1043}
@@ -1117,6 +1134,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1117 p->real_cred->user != INIT_USER) 1134 p->real_cred->user != INIT_USER)
1118 goto bad_fork_free; 1135 goto bad_fork_free;
1119 } 1136 }
1137 current->flags &= ~PF_NPROC_EXCEEDED;
1120 1138
1121 retval = copy_creds(p, clone_flags); 1139 retval = copy_creds(p, clone_flags);
1122 if (retval < 0) 1140 if (retval < 0)
@@ -1175,13 +1193,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1175 cgroup_fork(p); 1193 cgroup_fork(p);
1176#ifdef CONFIG_NUMA 1194#ifdef CONFIG_NUMA
1177 p->mempolicy = mpol_dup(p->mempolicy); 1195 p->mempolicy = mpol_dup(p->mempolicy);
1178 if (IS_ERR(p->mempolicy)) { 1196 if (IS_ERR(p->mempolicy)) {
1179 retval = PTR_ERR(p->mempolicy); 1197 retval = PTR_ERR(p->mempolicy);
1180 p->mempolicy = NULL; 1198 p->mempolicy = NULL;
1181 goto bad_fork_cleanup_cgroup; 1199 goto bad_fork_cleanup_cgroup;
1182 } 1200 }
1183 mpol_fix_fork_child_flag(p); 1201 mpol_fix_fork_child_flag(p);
1184#endif 1202#endif
1203#ifdef CONFIG_CPUSETS
1204 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
1205 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
1206#endif
1185#ifdef CONFIG_TRACE_IRQFLAGS 1207#ifdef CONFIG_TRACE_IRQFLAGS
1186 p->irq_events = 0; 1208 p->irq_events = 0;
1187#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 1209#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
@@ -1221,25 +1243,33 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1221 retval = perf_event_init_task(p); 1243 retval = perf_event_init_task(p);
1222 if (retval) 1244 if (retval)
1223 goto bad_fork_cleanup_policy; 1245 goto bad_fork_cleanup_policy;
1224 1246 retval = audit_alloc(p);
1225 if ((retval = audit_alloc(p))) 1247 if (retval)
1226 goto bad_fork_cleanup_policy; 1248 goto bad_fork_cleanup_policy;
1227 /* copy all the process information */ 1249 /* copy all the process information */
1228 if ((retval = copy_semundo(clone_flags, p))) 1250 retval = copy_semundo(clone_flags, p);
1251 if (retval)
1229 goto bad_fork_cleanup_audit; 1252 goto bad_fork_cleanup_audit;
1230 if ((retval = copy_files(clone_flags, p))) 1253 retval = copy_files(clone_flags, p);
1254 if (retval)
1231 goto bad_fork_cleanup_semundo; 1255 goto bad_fork_cleanup_semundo;
1232 if ((retval = copy_fs(clone_flags, p))) 1256 retval = copy_fs(clone_flags, p);
1257 if (retval)
1233 goto bad_fork_cleanup_files; 1258 goto bad_fork_cleanup_files;
1234 if ((retval = copy_sighand(clone_flags, p))) 1259 retval = copy_sighand(clone_flags, p);
1260 if (retval)
1235 goto bad_fork_cleanup_fs; 1261 goto bad_fork_cleanup_fs;
1236 if ((retval = copy_signal(clone_flags, p))) 1262 retval = copy_signal(clone_flags, p);
1263 if (retval)
1237 goto bad_fork_cleanup_sighand; 1264 goto bad_fork_cleanup_sighand;
1238 if ((retval = copy_mm(clone_flags, p))) 1265 retval = copy_mm(clone_flags, p);
1266 if (retval)
1239 goto bad_fork_cleanup_signal; 1267 goto bad_fork_cleanup_signal;
1240 if ((retval = copy_namespaces(clone_flags, p))) 1268 retval = copy_namespaces(clone_flags, p);
1269 if (retval)
1241 goto bad_fork_cleanup_mm; 1270 goto bad_fork_cleanup_mm;
1242 if ((retval = copy_io(clone_flags, p))) 1271 retval = copy_io(clone_flags, p);
1272 if (retval)
1243 goto bad_fork_cleanup_namespaces; 1273 goto bad_fork_cleanup_namespaces;
1244 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); 1274 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
1245 if (retval) 1275 if (retval)
@@ -1261,7 +1291,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1261 /* 1291 /*
1262 * Clear TID on mm_release()? 1292 * Clear TID on mm_release()?
1263 */ 1293 */
1264 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1294 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
1265#ifdef CONFIG_BLOCK 1295#ifdef CONFIG_BLOCK
1266 p->plug = NULL; 1296 p->plug = NULL;
1267#endif 1297#endif
@@ -1329,7 +1359,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1329 * its process group. 1359 * its process group.

1330 * A fatal signal pending means that current will exit, so the new 1360 * A fatal signal pending means that current will exit, so the new
1331 * thread can't slip out of an OOM kill (or normal SIGKILL). 1361 * thread can't slip out of an OOM kill (or normal SIGKILL).
1332 */ 1362 */
1333 recalc_sigpending(); 1363 recalc_sigpending();
1334 if (signal_pending(current)) { 1364 if (signal_pending(current)) {
1335 spin_unlock(&current->sighand->siglock); 1365 spin_unlock(&current->sighand->siglock);
@@ -1347,7 +1377,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1347 } 1377 }
1348 1378
1349 if (likely(p->pid)) { 1379 if (likely(p->pid)) {
1350 tracehook_finish_clone(p, clone_flags, trace); 1380 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1351 1381
1352 if (thread_group_leader(p)) { 1382 if (thread_group_leader(p)) {
1353 if (is_child_reaper(pid)) 1383 if (is_child_reaper(pid))
@@ -1488,10 +1518,22 @@ long do_fork(unsigned long clone_flags,
1488 } 1518 }
1489 1519
1490 /* 1520 /*
1491 * When called from kernel_thread, don't do user tracing stuff. 1521 * Determine whether and which event to report to ptracer. When
1522 * called from kernel_thread or CLONE_UNTRACED is explicitly
1523 * requested, no event is reported; otherwise, report if the event
1524 * for the type of forking is enabled.
1492 */ 1525 */
1493 if (likely(user_mode(regs))) 1526 if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
1494 trace = tracehook_prepare_clone(clone_flags); 1527 if (clone_flags & CLONE_VFORK)
1528 trace = PTRACE_EVENT_VFORK;
1529 else if ((clone_flags & CSIGNAL) != SIGCHLD)
1530 trace = PTRACE_EVENT_CLONE;
1531 else
1532 trace = PTRACE_EVENT_FORK;
1533
1534 if (likely(!ptrace_event_enabled(current, trace)))
1535 trace = 0;
1536 }
1495 1537
1496 p = copy_process(clone_flags, stack_start, regs, stack_size, 1538 p = copy_process(clone_flags, stack_start, regs, stack_size,
1497 child_tidptr, NULL, trace); 1539 child_tidptr, NULL, trace);
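
The block above picks the ptrace event from the clone flags: CLONE_UNTRACED (or a kernel_thread caller) suppresses reporting, CLONE_VFORK maps to PTRACE_EVENT_VFORK, a non-SIGCHLD exit signal maps to PTRACE_EVENT_CLONE, and everything else to PTRACE_EVENT_FORK, with the event dropped again if the tracer has not enabled it. The fragment below is a small userspace restatement of that mapping for experimentation only; the #ifndef fallbacks mirror the usual uapi values, and the user_mode() and ptrace_event_enabled() checks are deliberately left out.

/* Minimal sketch of the clone-flag -> ptrace event selection in do_fork().
 * PTRACE_EVENT_* values below are the ones from <linux/ptrace.h>; the
 * #ifndef fallbacks only keep the sketch self-contained. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <signal.h>

#ifndef CSIGNAL
#define CSIGNAL			0x000000ff	/* signal mask to be sent at exit */
#endif
#ifndef CLONE_UNTRACED
#define CLONE_UNTRACED		0x00800000
#endif
#define PTRACE_EVENT_FORK	1
#define PTRACE_EVENT_VFORK	2
#define PTRACE_EVENT_CLONE	3

static int clone_trace_event(unsigned long clone_flags)
{
	if (clone_flags & CLONE_UNTRACED)
		return 0;			/* caller asked not to be reported */
	if (clone_flags & CLONE_VFORK)
		return PTRACE_EVENT_VFORK;
	if ((clone_flags & CSIGNAL) != SIGCHLD)
		return PTRACE_EVENT_CLONE;	/* non-SIGCHLD exit signal, e.g. threads */
	return PTRACE_EVENT_FORK;		/* plain fork()-like clone */
}

int main(void)
{
	printf("fork():  event %d\n", clone_trace_event(SIGCHLD));
	printf("vfork(): event %d\n",
	       clone_trace_event(CLONE_VFORK | CLONE_VM | SIGCHLD));
	printf("thread:  event %d\n",
	       clone_trace_event(CLONE_VM | CLONE_THREAD | CLONE_SIGHAND));
	return 0;
}
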
@@ -1515,26 +1557,26 @@ long do_fork(unsigned long clone_flags,
1515 } 1557 }
1516 1558
1517 audit_finish_fork(p); 1559 audit_finish_fork(p);
1518 tracehook_report_clone(regs, clone_flags, nr, p);
1519 1560
1520 /* 1561 /*
1521 * We set PF_STARTING at creation in case tracing wants to 1562 * We set PF_STARTING at creation in case tracing wants to
1522 * use this to distinguish a fully live task from one that 1563 * use this to distinguish a fully live task from one that
1523 * hasn't gotten to tracehook_report_clone() yet. Now we 1564 * hasn't finished SIGSTOP raising yet. Now we clear it
1524 * clear it and set the child going. 1565 * and set the child going.
1525 */ 1566 */
1526 p->flags &= ~PF_STARTING; 1567 p->flags &= ~PF_STARTING;
1527 1568
1528 wake_up_new_task(p); 1569 wake_up_new_task(p);
1529 1570
1530 tracehook_report_clone_complete(trace, regs, 1571 /* forking complete and child started to run, tell ptracer */
1531 clone_flags, nr, p); 1572 if (unlikely(trace))
1573 ptrace_event(trace, nr);
1532 1574
1533 if (clone_flags & CLONE_VFORK) { 1575 if (clone_flags & CLONE_VFORK) {
1534 freezer_do_not_count(); 1576 freezer_do_not_count();
1535 wait_for_completion(&vfork); 1577 wait_for_completion(&vfork);
1536 freezer_count(); 1578 freezer_count();
1537 tracehook_report_vfork_done(p, nr); 1579 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
1538 } 1580 }
1539 } else { 1581 } else {
1540 nr = PTR_ERR(p); 1582 nr = PTR_ERR(p);
@@ -1581,6 +1623,7 @@ void __init proc_caches_init(void)
1581 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); 1623 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1582 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); 1624 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1583 mmap_init(); 1625 mmap_init();
1626 nsproxy_cache_init();
1584} 1627}
1585 1628
1586/* 1629/*
@@ -1677,12 +1720,14 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1677 */ 1720 */
1678 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) 1721 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1679 do_sysvsem = 1; 1722 do_sysvsem = 1;
1680 if ((err = unshare_fs(unshare_flags, &new_fs))) 1723 err = unshare_fs(unshare_flags, &new_fs);
1724 if (err)
1681 goto bad_unshare_out; 1725 goto bad_unshare_out;
1682 if ((err = unshare_fd(unshare_flags, &new_fd))) 1726 err = unshare_fd(unshare_flags, &new_fd);
1727 if (err)
1683 goto bad_unshare_cleanup_fs; 1728 goto bad_unshare_cleanup_fs;
1684 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1729 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
1685 new_fs))) 1730 if (err)
1686 goto bad_unshare_cleanup_fd; 1731 goto bad_unshare_cleanup_fd;
1687 1732
1688 if (new_fs || new_fd || do_sysvsem || new_nsproxy) { 1733 if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
diff --git a/kernel/futex.c b/kernel/futex.c
index fe28dc282ea..e6160fa842e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -218,6 +218,8 @@ static void drop_futex_key_refs(union futex_key *key)
218 * @uaddr: virtual address of the futex 218 * @uaddr: virtual address of the futex
219 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 219 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
220 * @key: address where result is stored. 220 * @key: address where result is stored.
221 * @rw: mapping needs to be read/write (values: VERIFY_READ,
222 * VERIFY_WRITE)
221 * 223 *
222 * Returns a negative error code or 0 224 * Returns a negative error code or 0
223 * The key words are stored in *key on success. 225 * The key words are stored in *key on success.
@@ -229,12 +231,12 @@ static void drop_futex_key_refs(union futex_key *key)
229 * lock_page() might sleep, the caller should not hold a spinlock. 231 * lock_page() might sleep, the caller should not hold a spinlock.
230 */ 232 */
231static int 233static int
232get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) 234get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
233{ 235{
234 unsigned long address = (unsigned long)uaddr; 236 unsigned long address = (unsigned long)uaddr;
235 struct mm_struct *mm = current->mm; 237 struct mm_struct *mm = current->mm;
236 struct page *page, *page_head; 238 struct page *page, *page_head;
237 int err; 239 int err, ro = 0;
238 240
239 /* 241 /*
240 * The futex address must be "naturally" aligned. 242 * The futex address must be "naturally" aligned.
@@ -262,8 +264,18 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
262 264
263again: 265again:
264 err = get_user_pages_fast(address, 1, 1, &page); 266 err = get_user_pages_fast(address, 1, 1, &page);
267 /*
268 * If write access is not required (eg. FUTEX_WAIT), try
269 * and get read-only access.
270 */
271 if (err == -EFAULT && rw == VERIFY_READ) {
272 err = get_user_pages_fast(address, 1, 0, &page);
273 ro = 1;
274 }
265 if (err < 0) 275 if (err < 0)
266 return err; 276 return err;
277 else
278 err = 0;
267 279
268#ifdef CONFIG_TRANSPARENT_HUGEPAGE 280#ifdef CONFIG_TRANSPARENT_HUGEPAGE
269 page_head = page; 281 page_head = page;
@@ -302,10 +314,29 @@ again:
302#endif 314#endif
303 315
304 lock_page(page_head); 316 lock_page(page_head);
317
318 /*
319 * If page_head->mapping is NULL, then it cannot be a PageAnon
320 * page; but it might be the ZERO_PAGE or in the gate area or
321 * in a special mapping (all cases which we are happy to fail);
322 * or it may have been a good file page when get_user_pages_fast
323 * found it, but truncated or holepunched or subjected to
324 * invalidate_complete_page2 before we got the page lock (also
325 * cases which we are happy to fail). And we hold a reference,
326 * so refcount care in invalidate_complete_page's remove_mapping
327 * prevents drop_caches from setting mapping to NULL beneath us.
328 *
329 * The case we do have to guard against is when memory pressure made
330 * shmem_writepage move it from filecache to swapcache beneath us:
331 * an unlikely race, but we do need to retry for page_head->mapping.
332 */
305 if (!page_head->mapping) { 333 if (!page_head->mapping) {
334 int shmem_swizzled = PageSwapCache(page_head);
306 unlock_page(page_head); 335 unlock_page(page_head);
307 put_page(page_head); 336 put_page(page_head);
308 goto again; 337 if (shmem_swizzled)
338 goto again;
339 return -EFAULT;
309 } 340 }
310 341
311 /* 342 /*
@@ -316,6 +347,15 @@ again:
316 * the object not the particular process. 347 * the object not the particular process.
317 */ 348 */
318 if (PageAnon(page_head)) { 349 if (PageAnon(page_head)) {
350 /*
351 * A RO anonymous page will never change and thus doesn't make
352 * sense for futex operations.
353 */
354 if (ro) {
355 err = -EFAULT;
356 goto out;
357 }
358
319 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ 359 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
320 key->private.mm = mm; 360 key->private.mm = mm;
321 key->private.address = address; 361 key->private.address = address;
@@ -327,9 +367,10 @@ again:
327 367
328 get_futex_key_refs(key); 368 get_futex_key_refs(key);
329 369
370out:
330 unlock_page(page_head); 371 unlock_page(page_head);
331 put_page(page_head); 372 put_page(page_head);
332 return 0; 373 return err;
333} 374}
334 375
335static inline void put_futex_key(union futex_key *key) 376static inline void put_futex_key(union futex_key *key)
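
With the new rw argument, get_futex_key() can fall back to a read-only gup when the caller only loads the futex word, and it rejects read-only anonymous pages outright since those can never be woken through a store. The table below is only a cheat sheet of the access mode each converted call site in this patch now requests; VERIFY_READ and VERIFY_WRITE stay kernel-internal, so plain strings stand in for them here.

#include <stdio.h>

/* Access mode each get_futex_key() call site requests after this patch.
 * Illustration only; the strings mirror the kernel's VERIFY_* symbols. */
struct futex_key_site {
	const char *site;
	const char *access;
};

static const struct futex_key_site sites[] = {
	{ "futex_wake(uaddr)",			"VERIFY_READ"  },
	{ "futex_wake_op(uaddr1)",		"VERIFY_READ"  },
	{ "futex_wake_op(uaddr2)",		"VERIFY_WRITE" },
	{ "futex_requeue(uaddr1)",		"VERIFY_READ"  },
	{ "futex_requeue(uaddr2), requeue_pi",	"VERIFY_WRITE" },
	{ "futex_requeue(uaddr2), !requeue_pi",	"VERIFY_READ"  },
	{ "futex_wait_setup(uaddr)",		"VERIFY_READ"  },
	{ "futex_lock_pi(uaddr)",		"VERIFY_WRITE" },
	{ "futex_unlock_pi(uaddr)",		"VERIFY_WRITE" },
	{ "futex_wait_requeue_pi(uaddr2)",	"VERIFY_WRITE" },
};

int main(void)
{
	for (unsigned int i = 0; i < sizeof(sites) / sizeof(sites[0]); i++)
		printf("%-38s -> %s\n", sites[i].site, sites[i].access);
	return 0;
}
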
@@ -355,8 +396,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
355 int ret; 396 int ret;
356 397
357 down_read(&mm->mmap_sem); 398 down_read(&mm->mmap_sem);
358 ret = get_user_pages(current, mm, (unsigned long)uaddr, 399 ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
359 1, 1, 0, NULL, NULL); 400 FAULT_FLAG_WRITE);
360 up_read(&mm->mmap_sem); 401 up_read(&mm->mmap_sem);
361 402
362 return ret < 0 ? ret : 0; 403 return ret < 0 ? ret : 0;
@@ -940,7 +981,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
940 if (!bitset) 981 if (!bitset)
941 return -EINVAL; 982 return -EINVAL;
942 983
943 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); 984 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
944 if (unlikely(ret != 0)) 985 if (unlikely(ret != 0))
945 goto out; 986 goto out;
946 987
@@ -986,10 +1027,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
986 int ret, op_ret; 1027 int ret, op_ret;
987 1028
988retry: 1029retry:
989 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); 1030 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
990 if (unlikely(ret != 0)) 1031 if (unlikely(ret != 0))
991 goto out; 1032 goto out;
992 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); 1033 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
993 if (unlikely(ret != 0)) 1034 if (unlikely(ret != 0))
994 goto out_put_key1; 1035 goto out_put_key1;
995 1036
@@ -1243,10 +1284,11 @@ retry:
1243 pi_state = NULL; 1284 pi_state = NULL;
1244 } 1285 }
1245 1286
1246 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); 1287 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1247 if (unlikely(ret != 0)) 1288 if (unlikely(ret != 0))
1248 goto out; 1289 goto out;
1249 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); 1290 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
1291 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1250 if (unlikely(ret != 0)) 1292 if (unlikely(ret != 0))
1251 goto out_put_key1; 1293 goto out_put_key1;
1252 1294
@@ -1790,7 +1832,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1790 * while the syscall executes. 1832 * while the syscall executes.
1791 */ 1833 */
1792retry: 1834retry:
1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); 1835 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
1794 if (unlikely(ret != 0)) 1836 if (unlikely(ret != 0))
1795 return ret; 1837 return ret;
1796 1838
@@ -1941,7 +1983,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
1941 } 1983 }
1942 1984
1943retry: 1985retry:
1944 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key); 1986 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
1945 if (unlikely(ret != 0)) 1987 if (unlikely(ret != 0))
1946 goto out; 1988 goto out;
1947 1989
@@ -2060,7 +2102,7 @@ retry:
2060 if ((uval & FUTEX_TID_MASK) != vpid) 2102 if ((uval & FUTEX_TID_MASK) != vpid)
2061 return -EPERM; 2103 return -EPERM;
2062 2104
2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); 2105 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2064 if (unlikely(ret != 0)) 2106 if (unlikely(ret != 0))
2065 goto out; 2107 goto out;
2066 2108
@@ -2249,7 +2291,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2249 debug_rt_mutex_init_waiter(&rt_waiter); 2291 debug_rt_mutex_init_waiter(&rt_waiter);
2250 rt_waiter.task = NULL; 2292 rt_waiter.task = NULL;
2251 2293
2252 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); 2294 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
2253 if (unlikely(ret != 0)) 2295 if (unlikely(ret != 0))
2254 goto out; 2296 goto out;
2255 2297
@@ -2697,7 +2739,7 @@ static int __init futex_init(void)
2697 futex_cmpxchg_enabled = 1; 2739 futex_cmpxchg_enabled = 1;
2698 2740
2699 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2741 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
2700 plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); 2742 plist_head_init(&futex_queues[i].chain);
2701 spin_lock_init(&futex_queues[i].lock); 2743 spin_lock_init(&futex_queues[i].lock);
2702 } 2744 }
2703 2745
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 5bf924d80b5..824b741925b 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -3,7 +3,7 @@ menu "GCOV-based kernel profiling"
3config GCOV_KERNEL 3config GCOV_KERNEL
4 bool "Enable gcov-based kernel profiling" 4 bool "Enable gcov-based kernel profiling"
5 depends on DEBUG_FS 5 depends on DEBUG_FS
6 select CONSTRUCTORS 6 select CONSTRUCTORS if !UML
7 default n 7 default n
8 ---help--- 8 ---help---
9 This option enables gcov-based code profiling (e.g. for code coverage 9 This option enables gcov-based code profiling (e.g. for code coverage
@@ -35,7 +35,7 @@ config GCOV_KERNEL
35config GCOV_PROFILE_ALL 35config GCOV_PROFILE_ALL
36 bool "Profile entire Kernel" 36 bool "Profile entire Kernel"
37 depends on GCOV_KERNEL 37 depends on GCOV_KERNEL
38 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE 38 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE || ARM
39 default n 39 default n
40 ---help--- 40 ---help---
 41 This option activates profiling for the entire kernel. 41 This option activates profiling for the entire kernel.
@@ -46,4 +46,10 @@ config GCOV_PROFILE_ALL
46 larger and run slower. Also be sure to exclude files from profiling 46 larger and run slower. Also be sure to exclude files from profiling
47 which are not linked to the kernel image to prevent linker errors. 47 which are not linked to the kernel image to prevent linker errors.
48 48
49config GCOV_CTORS
50 string
51 depends on CONSTRUCTORS
52 default ".init_array" if ARM && AEABI
53 default ".ctors"
54
49endmenu 55endmenu
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index ae5bb426003..d753d1152b7 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -297,16 +297,30 @@ void gcov_iter_start(struct gcov_iterator *iter)
297} 297}
298 298
299/* Mapping of logical record number to actual file content. */ 299/* Mapping of logical record number to actual file content. */
300#define RECORD_FILE_MAGIC 0 300#define RECORD_FILE_MAGIC 0
301#define RECORD_GCOV_VERSION 1 301#define RECORD_GCOV_VERSION 1
302#define RECORD_TIME_STAMP 2 302#define RECORD_TIME_STAMP 2
303#define RECORD_FUNCTION_TAG 3 303#define RECORD_FUNCTION_TAG 3
304#define RECORD_FUNCTON_TAG_LEN 4 304#define RECORD_FUNCTON_TAG_LEN 4
305#define RECORD_FUNCTION_IDENT 5 305#define RECORD_FUNCTION_IDENT 5
306#define RECORD_FUNCTION_CHECK 6 306#define RECORD_FUNCTION_CHECK_LINE 6
307#define RECORD_COUNT_TAG 7 307#define RECORD_FUNCTION_CHECK_CFG 7
308#define RECORD_COUNT_LEN 8 308#define RECORD_FUNCTION_NAME_LEN 8
309#define RECORD_COUNT 9 309#define RECORD_FUNCTION_NAME 9
310#define RECORD_COUNT_TAG 10
311#define RECORD_COUNT_LEN 11
312#define RECORD_COUNT 12
313
314/* Return length of string encoded in GCOV format. */
315static size_t
316sizeof_str(const char *str)
317{
318 size_t len;
319 len = (str) ? strlen(str) : 0;
320 if (len == 0)
321 return 1;
322 return 1 + ((len + 4) >> 2);
323}
310 324
311/** 325/**
312 * gcov_iter_next - advance file iterator to next logical record 326 * gcov_iter_next - advance file iterator to next logical record
@@ -323,6 +337,9 @@ int gcov_iter_next(struct gcov_iterator *iter)
323 case RECORD_FUNCTON_TAG_LEN: 337 case RECORD_FUNCTON_TAG_LEN:
324 case RECORD_FUNCTION_IDENT: 338 case RECORD_FUNCTION_IDENT:
325 case RECORD_COUNT_TAG: 339 case RECORD_COUNT_TAG:
340 case RECORD_FUNCTION_CHECK_LINE:
341 case RECORD_FUNCTION_CHECK_CFG:
342 case RECORD_FUNCTION_NAME_LEN:
326 /* Advance to next record */ 343 /* Advance to next record */
327 iter->record++; 344 iter->record++;
328 break; 345 break;
@@ -332,7 +349,7 @@ int gcov_iter_next(struct gcov_iterator *iter)
332 /* fall through */ 349 /* fall through */
333 case RECORD_COUNT_LEN: 350 case RECORD_COUNT_LEN:
334 if (iter->count < get_func(iter)->n_ctrs[iter->type]) { 351 if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
335 iter->record = 9; 352 iter->record = 12;
336 break; 353 break;
337 } 354 }
338 /* Advance to next counter type */ 355 /* Advance to next counter type */
@@ -340,9 +357,9 @@ int gcov_iter_next(struct gcov_iterator *iter)
340 iter->count = 0; 357 iter->count = 0;
341 iter->type++; 358 iter->type++;
342 /* fall through */ 359 /* fall through */
343 case RECORD_FUNCTION_CHECK: 360 case RECORD_FUNCTION_NAME:
344 if (iter->type < iter->num_types) { 361 if (iter->type < iter->num_types) {
345 iter->record = 7; 362 iter->record = 10;
346 break; 363 break;
347 } 364 }
348 /* Advance to next function */ 365 /* Advance to next function */
@@ -395,6 +412,34 @@ static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
395 data[1] = (v >> 32); 412 data[1] = (v >> 32);
396 return seq_write(seq, data, sizeof(data)); 413 return seq_write(seq, data, sizeof(data));
397} 414}
415/**
416 * seq_write_gcov_str - write string in gcov format to seq_file
417 * @seq: seq_file handle
418 * @str: string to be stored
419 *
 420 * String format defined by gcc: strings are stored as a sequence of 32 bit
 421 * words in the endianness of the machine generating the file; the string
 422 * bytes are followed by a terminating NUL and zero padding up to the next
 423 * 32 bit boundary.
424 */
425static int seq_write_gcov_str(struct seq_file *seq, const char *str)
426{
427 if (str) {
428 size_t len;
429 int str_off;
430 u32 data;
431 len = strlen(str);
432 for (str_off = 0; str_off < (sizeof_str(str) - 2) ; str_off++) {
433 memcpy(&data, (str + str_off * 4), 4);
434 seq_write(seq, &data, sizeof(data));
435 }
436 data = 0;
437 memcpy(&data, (str + str_off * 4), (len - str_off * 4));
438 return seq_write(seq, &data, sizeof(data));
439 } else {
440 return 0;
441 }
442}
398 443
399/** 444/**
400 * gcov_iter_write - write data for current pos to seq_file 445 * gcov_iter_write - write data for current pos to seq_file
@@ -421,13 +466,24 @@ int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
421 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION); 466 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
422 break; 467 break;
423 case RECORD_FUNCTON_TAG_LEN: 468 case RECORD_FUNCTON_TAG_LEN:
424 rc = seq_write_gcov_u32(seq, 2); 469 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION_LENGTH +
470 (sizeof_str(get_func(iter)->name)));
425 break; 471 break;
426 case RECORD_FUNCTION_IDENT: 472 case RECORD_FUNCTION_IDENT:
427 rc = seq_write_gcov_u32(seq, get_func(iter)->ident); 473 rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
428 break; 474 break;
429 case RECORD_FUNCTION_CHECK: 475 case RECORD_FUNCTION_CHECK_LINE:
430 rc = seq_write_gcov_u32(seq, get_func(iter)->checksum); 476 rc = seq_write_gcov_u32(seq, get_func(iter)->lineno_checksum);
477 break;
478 case RECORD_FUNCTION_CHECK_CFG:
479 rc = seq_write_gcov_u32(seq, get_func(iter)->cfg_checksum);
480 break;
481 case RECORD_FUNCTION_NAME_LEN:
482 rc = seq_write_gcov_u32(seq,
483 (sizeof_str(get_func(iter)->name) - 1));
484 break;
485 case RECORD_FUNCTION_NAME:
486 rc = seq_write_gcov_str(seq, get_func(iter)->name);
431 break; 487 break;
432 case RECORD_COUNT_TAG: 488 case RECORD_COUNT_TAG:
433 rc = seq_write_gcov_u32(seq, 489 rc = seq_write_gcov_u32(seq,
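
The new records lean on gcov's word-based string encoding: sizeof_str() counts one 32-bit word for the length plus enough words to hold the string, its terminating NUL and zero padding, and seq_write_gcov_str() emits exactly those payload words. A standalone worked example of the same arithmetic, with no kernel dependencies, is sketched below.

/* Worked example of the gcov string sizing used by sizeof_str() above:
 * 1 length word + ceil((strlen + 1) / 4) payload words, i.e. the string
 * plus its NUL, zero-padded to a 32-bit boundary. */
#include <stdio.h>
#include <string.h>

static size_t sizeof_str_words(const char *str)
{
	size_t len = str ? strlen(str) : 0;

	if (len == 0)
		return 1;			/* empty string: length word only */
	return 1 + ((len + 4) >> 2);		/* same expression as the kernel code */
}

int main(void)
{
	const char *samples[] = { "", "a", "abc", "abcd", "copy_process" };

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%-14s len=%2zu -> %zu words (%zu bytes in the record)\n",
		       samples[i], strlen(samples[i]),
		       sizeof_str_words(samples[i]),
		       4 * sizeof_str_words(samples[i]));
	return 0;
}
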
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index 060073ebf7a..040c6980df0 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -21,9 +21,10 @@
21 * gcc and need to be kept as close to the original definition as possible to 21 * gcc and need to be kept as close to the original definition as possible to
22 * remain compatible. 22 * remain compatible.
23 */ 23 */
24#define GCOV_COUNTERS 5 24#define GCOV_COUNTERS 10
25#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461) 25#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
26#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000) 26#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
27#define GCOV_TAG_FUNCTION_LENGTH 3
27#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000) 28#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
28#define GCOV_TAG_FOR_COUNTER(count) \ 29#define GCOV_TAG_FOR_COUNTER(count) \
29 (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17)) 30 (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))
@@ -34,10 +35,38 @@ typedef long gcov_type;
34typedef long long gcov_type; 35typedef long long gcov_type;
35#endif 36#endif
36 37
38/*
39 * Source module info. The data structure is used in both runtime and
40 * profile-use phase.
41 */
42struct gcov_module_info {
43 unsigned int ident;
44/*
45 * This is overloaded to mean two things:
46 * (1) means FDO/LIPO in instrumented binary.
47 * (2) means IS_PRIMARY in persistent file or memory copy used in profile-use.
48 */
49 unsigned int is_primary;
50 unsigned int is_exported;
51 unsigned int lang;
52 char *da_filename;
53 char *source_filename;
54 unsigned int num_quote_paths;
55 unsigned int num_bracket_paths;
56 unsigned int num_cpp_defines;
57 unsigned int num_cpp_includes;
58 unsigned int num_cl_args;
59 char *string_array[1];
60};
61
62
37/** 63/**
38 * struct gcov_fn_info - profiling meta data per function 64 * struct gcov_fn_info - profiling meta data per function
39 * @ident: object file-unique function identifier 65 * @ident: object file-unique function identifier
40 * @checksum: function checksum 66 * @lineno_checksum: function lineno checksum
67 * @cfg_checksum: function cfg checksum
68 * @dc_offset: direct call offset
69 * @name: function name
41 * @n_ctrs: number of values per counter type belonging to this function 70 * @n_ctrs: number of values per counter type belonging to this function
42 * 71 *
43 * This data is generated by gcc during compilation and doesn't change 72 * This data is generated by gcc during compilation and doesn't change
@@ -45,7 +74,10 @@ typedef long long gcov_type;
45 */ 74 */
46struct gcov_fn_info { 75struct gcov_fn_info {
47 unsigned int ident; 76 unsigned int ident;
48 unsigned int checksum; 77 unsigned int lineno_checksum;
78 unsigned int cfg_checksum;
79 unsigned int dc_offset;
80 const char *name;
49 unsigned int n_ctrs[0]; 81 unsigned int n_ctrs[0];
50}; 82};
51 83
@@ -67,9 +99,11 @@ struct gcov_ctr_info {
67/** 99/**
68 * struct gcov_info - profiling data per object file 100 * struct gcov_info - profiling data per object file
69 * @version: gcov version magic indicating the gcc version used for compilation 101 * @version: gcov version magic indicating the gcc version used for compilation
102 * @modinfo: additional module information
70 * @next: list head for a singly-linked list 103 * @next: list head for a singly-linked list
71 * @stamp: time stamp 104 * @stamp: time stamp
72 * @filename: name of the associated gcov data file 105 * @filename: name of the associated gcov data file
106 * @eof_pos: end position of profile data
73 * @n_functions: number of instrumented functions 107 * @n_functions: number of instrumented functions
74 * @functions: function data 108 * @functions: function data
75 * @ctr_mask: mask specifying which counter types are active 109 * @ctr_mask: mask specifying which counter types are active
@@ -80,9 +114,11 @@ struct gcov_ctr_info {
80 */ 114 */
81struct gcov_info { 115struct gcov_info {
82 unsigned int version; 116 unsigned int version;
117 struct gcov_module_info *mod_info;
83 struct gcov_info *next; 118 struct gcov_info *next;
84 unsigned int stamp; 119 unsigned int stamp;
85 const char *filename; 120 const char *filename;
121 unsigned int eof_pos;
86 unsigned int n_functions; 122 unsigned int n_functions;
87 const struct gcov_fn_info *functions; 123 const struct gcov_fn_info *functions;
88 unsigned int ctr_mask; 124 unsigned int ctr_mask;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 11e89690382..2391745f656 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -887,10 +887,13 @@ static void __remove_hrtimer(struct hrtimer *timer,
887 struct hrtimer_clock_base *base, 887 struct hrtimer_clock_base *base,
888 unsigned long newstate, int reprogram) 888 unsigned long newstate, int reprogram)
889{ 889{
890 struct timerqueue_node *next_timer;
890 if (!(timer->state & HRTIMER_STATE_ENQUEUED)) 891 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
891 goto out; 892 goto out;
892 893
893 if (&timer->node == timerqueue_getnext(&base->active)) { 894 next_timer = timerqueue_getnext(&base->active);
895 timerqueue_del(&base->active, &timer->node);
896 if (&timer->node == next_timer) {
894#ifdef CONFIG_HIGH_RES_TIMERS 897#ifdef CONFIG_HIGH_RES_TIMERS
895 /* Reprogram the clock event device. if enabled */ 898 /* Reprogram the clock event device. if enabled */
896 if (reprogram && hrtimer_hres_active()) { 899 if (reprogram && hrtimer_hres_active()) {
@@ -903,7 +906,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
903 } 906 }
904#endif 907#endif
905 } 908 }
906 timerqueue_del(&base->active, &timer->node);
907 if (!timerqueue_getnext(&base->active)) 909 if (!timerqueue_getnext(&base->active))
908 base->cpu_base->active_bases &= ~(1 << base->index); 910 base->cpu_base->active_bases &= ~(1 << base->index);
909out: 911out:
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ea640120ab8..e972276f12f 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -74,11 +74,17 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
74 74
75 /* 75 /*
76 * Ensure the task is not frozen. 76 * Ensure the task is not frozen.
77 * Also, when a freshly created task is scheduled once, changes 77 * Also, skip vfork and any other user process that freezer should skip.
78 * its state to TASK_UNINTERRUPTIBLE without having ever been
79 * switched out once, it musn't be checked.
80 */ 78 */
81 if (unlikely(t->flags & PF_FROZEN || !switch_count)) 79 if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
80 return;
81
82 /*
83 * When a freshly created task is scheduled once, changes its state to
84 * TASK_UNINTERRUPTIBLE without having ever been switched out once, it
 85 * mustn't be checked.
86 */
87 if (unlikely(!switch_count))
82 return; 88 return;
83 89
84 if (switch_count != t->last_switch_count) { 90 if (switch_count != t->last_switch_count) {
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d1d051b38e0..5a38bf4de64 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -52,6 +52,10 @@ config IRQ_EDGE_EOI_HANDLER
52config GENERIC_IRQ_CHIP 52config GENERIC_IRQ_CHIP
53 bool 53 bool
54 54
55# Generic irq_domain hw <--> linux irq number translation
56config IRQ_DOMAIN
57 bool
58
55# Support forced irq threading 59# Support forced irq threading
56config IRQ_FORCED_THREADING 60config IRQ_FORCED_THREADING
57 bool 61 bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 73290056cfb..fff17381f0a 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -2,6 +2,7 @@
2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o 2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o 3obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
4obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 4obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
5obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
5obj-$(CONFIG_PROC_FS) += proc.o 6obj-$(CONFIG_PROC_FS) += proc.o
6obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 7obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
7obj-$(CONFIG_PM_SLEEP) += pm.o 8obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index d5a3009da71..dc5114b4c16 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -178,7 +178,7 @@ void irq_shutdown(struct irq_desc *desc)
178 desc->depth = 1; 178 desc->depth = 1;
179 if (desc->irq_data.chip->irq_shutdown) 179 if (desc->irq_data.chip->irq_shutdown)
180 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 180 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
181 if (desc->irq_data.chip->irq_disable) 181 else if (desc->irq_data.chip->irq_disable)
182 desc->irq_data.chip->irq_disable(&desc->irq_data); 182 desc->irq_data.chip->irq_disable(&desc->irq_data);
183 else 183 else
184 desc->irq_data.chip->irq_mask(&desc->irq_data); 184 desc->irq_data.chip->irq_mask(&desc->irq_data);
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 1ef4ffcdfa5..bd8e788d71e 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -87,8 +87,8 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
87{ 87{
88 struct irq_devres match_data = { irq, dev_id }; 88 struct irq_devres match_data = { irq, dev_id };
89 89
90 free_irq(irq, dev_id);
91 WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, 90 WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match,
92 &match_data)); 91 &match_data));
92 free_irq(irq, dev_id);
93} 93}
94EXPORT_SYMBOL(devm_free_irq); 94EXPORT_SYMBOL(devm_free_irq);
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 3a2cab407b9..e38544dddb1 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -246,7 +246,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
246 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); 246 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
247 247
248 for (i = gc->irq_base; msk; msk >>= 1, i++) { 248 for (i = gc->irq_base; msk; msk >>= 1, i++) {
249 if (!msk & 0x01) 249 if (!(msk & 0x01))
250 continue; 250 continue;
251 251
252 if (flags & IRQ_GC_INIT_NESTED_LOCK) 252 if (flags & IRQ_GC_INIT_NESTED_LOCK)
@@ -301,7 +301,7 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
301 raw_spin_unlock(&gc_lock); 301 raw_spin_unlock(&gc_lock);
302 302
303 for (; msk; msk >>= 1, i++) { 303 for (; msk; msk >>= 1, i++) {
304 if (!msk & 0x01) 304 if (!(msk & 0x01))
305 continue; 305 continue;
306 306
307 /* Remove handler first. That will mask the irq line */ 307 /* Remove handler first. That will mask the irq line */
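
Both hunks in this file fix the same precedence slip: `!msk & 0x01` parses as `(!msk) & 0x01`, and because msk is never zero inside these loops the `continue` never fired, so entries whose mask bit was clear were set up or torn down anyway. The tiny standalone program below just prints the two expressions side by side; recent compilers flag this pattern with -Wlogical-not-parentheses.

/* Demonstrates why the parentheses matter in the generic-chip loops above:
 * `!msk & 0x01' is parsed as `(!msk) & 0x01', not `!(msk & 0x01)'. */
#include <stdio.h>

int main(void)
{
	for (unsigned int msk = 0; msk <= 5; msk++)
		printf("msk=%u  !msk & 0x01 -> %u   !(msk & 0x01) -> %u\n",
		       msk, !msk & 0x01, !(msk & 0x01));
	return 0;
}
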
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 4c60a50e66b..039b889ea05 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -70,7 +70,8 @@ static inline void desc_smp_init(struct irq_desc *desc, int node) { }
70static inline int desc_node(struct irq_desc *desc) { return 0; } 70static inline int desc_node(struct irq_desc *desc) { return 0; }
71#endif 71#endif
72 72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) 73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
74 struct module *owner)
74{ 75{
75 int cpu; 76 int cpu;
76 77
@@ -86,6 +87,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
86 desc->irq_count = 0; 87 desc->irq_count = 0;
87 desc->irqs_unhandled = 0; 88 desc->irqs_unhandled = 0;
88 desc->name = NULL; 89 desc->name = NULL;
90 desc->owner = owner;
89 for_each_possible_cpu(cpu) 91 for_each_possible_cpu(cpu)
90 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; 92 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
91 desc_smp_init(desc, node); 93 desc_smp_init(desc, node);
@@ -128,7 +130,7 @@ static void free_masks(struct irq_desc *desc)
128static inline void free_masks(struct irq_desc *desc) { } 130static inline void free_masks(struct irq_desc *desc) { }
129#endif 131#endif
130 132
131static struct irq_desc *alloc_desc(int irq, int node) 133static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
132{ 134{
133 struct irq_desc *desc; 135 struct irq_desc *desc;
134 gfp_t gfp = GFP_KERNEL; 136 gfp_t gfp = GFP_KERNEL;
@@ -147,7 +149,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
147 raw_spin_lock_init(&desc->lock); 149 raw_spin_lock_init(&desc->lock);
148 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 150 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
149 151
150 desc_set_defaults(irq, desc, node); 152 desc_set_defaults(irq, desc, node, owner);
151 153
152 return desc; 154 return desc;
153 155
@@ -173,13 +175,14 @@ static void free_desc(unsigned int irq)
173 kfree(desc); 175 kfree(desc);
174} 176}
175 177
176static int alloc_descs(unsigned int start, unsigned int cnt, int node) 178static int alloc_descs(unsigned int start, unsigned int cnt, int node,
179 struct module *owner)
177{ 180{
178 struct irq_desc *desc; 181 struct irq_desc *desc;
179 int i; 182 int i;
180 183
181 for (i = 0; i < cnt; i++) { 184 for (i = 0; i < cnt; i++) {
182 desc = alloc_desc(start + i, node); 185 desc = alloc_desc(start + i, node, owner);
183 if (!desc) 186 if (!desc)
184 goto err; 187 goto err;
185 mutex_lock(&sparse_irq_lock); 188 mutex_lock(&sparse_irq_lock);
@@ -227,7 +230,7 @@ int __init early_irq_init(void)
227 nr_irqs = initcnt; 230 nr_irqs = initcnt;
228 231
229 for (i = 0; i < initcnt; i++) { 232 for (i = 0; i < initcnt; i++) {
230 desc = alloc_desc(i, node); 233 desc = alloc_desc(i, node, NULL);
231 set_bit(i, allocated_irqs); 234 set_bit(i, allocated_irqs);
232 irq_insert_desc(i, desc); 235 irq_insert_desc(i, desc);
233 } 236 }
@@ -261,7 +264,7 @@ int __init early_irq_init(void)
261 alloc_masks(&desc[i], GFP_KERNEL, node); 264 alloc_masks(&desc[i], GFP_KERNEL, node);
262 raw_spin_lock_init(&desc[i].lock); 265 raw_spin_lock_init(&desc[i].lock);
263 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 266 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
264 desc_set_defaults(i, &desc[i], node); 267 desc_set_defaults(i, &desc[i], node, NULL);
265 } 268 }
266 return arch_early_irq_init(); 269 return arch_early_irq_init();
267} 270}
@@ -276,8 +279,16 @@ static void free_desc(unsigned int irq)
276 dynamic_irq_cleanup(irq); 279 dynamic_irq_cleanup(irq);
277} 280}
278 281
279static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 282static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
283 struct module *owner)
280{ 284{
285 u32 i;
286
287 for (i = 0; i < cnt; i++) {
288 struct irq_desc *desc = irq_to_desc(start + i);
289
290 desc->owner = owner;
291 }
281 return start; 292 return start;
282} 293}
283 294
@@ -333,11 +344,13 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
333 * @from: Start the search from this irq number 344 * @from: Start the search from this irq number
334 * @cnt: Number of consecutive irqs to allocate. 345 * @cnt: Number of consecutive irqs to allocate.
335 * @node: Preferred node on which the irq descriptor should be allocated 346 * @node: Preferred node on which the irq descriptor should be allocated
347 * @owner: Owning module (can be NULL)
336 * 348 *
337 * Returns the first irq number or error code 349 * Returns the first irq number or error code
338 */ 350 */
339int __ref 351int __ref
340irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) 352__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
353 struct module *owner)
341{ 354{
342 int start, ret; 355 int start, ret;
343 356
@@ -366,13 +379,13 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
366 379
367 bitmap_set(allocated_irqs, start, cnt); 380 bitmap_set(allocated_irqs, start, cnt);
368 mutex_unlock(&sparse_irq_lock); 381 mutex_unlock(&sparse_irq_lock);
369 return alloc_descs(start, cnt, node); 382 return alloc_descs(start, cnt, node, owner);
370 383
371err: 384err:
372 mutex_unlock(&sparse_irq_lock); 385 mutex_unlock(&sparse_irq_lock);
373 return ret; 386 return ret;
374} 387}
375EXPORT_SYMBOL_GPL(irq_alloc_descs); 388EXPORT_SYMBOL_GPL(__irq_alloc_descs);
376 389
377/** 390/**
378 * irq_reserve_irqs - mark irqs allocated 391 * irq_reserve_irqs - mark irqs allocated
@@ -440,7 +453,7 @@ void dynamic_irq_cleanup(unsigned int irq)
440 unsigned long flags; 453 unsigned long flags;
441 454
442 raw_spin_lock_irqsave(&desc->lock, flags); 455 raw_spin_lock_irqsave(&desc->lock, flags);
443 desc_set_defaults(irq, desc, desc_node(desc)); 456 desc_set_defaults(irq, desc, desc_node(desc), NULL);
444 raw_spin_unlock_irqrestore(&desc->lock, flags); 457 raw_spin_unlock_irqrestore(&desc->lock, flags);
445} 458}
446 459
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
new file mode 100644
index 00000000000..b57a3776de4
--- /dev/null
+++ b/kernel/irq/irqdomain.c
@@ -0,0 +1,184 @@
1#include <linux/irq.h>
2#include <linux/irqdomain.h>
3#include <linux/module.h>
4#include <linux/mutex.h>
5#include <linux/of.h>
6#include <linux/of_address.h>
7#include <linux/slab.h>
8
9static LIST_HEAD(irq_domain_list);
10static DEFINE_MUTEX(irq_domain_mutex);
11
12/**
13 * irq_domain_add() - Register an irq_domain
14 * @domain: ptr to initialized irq_domain structure
15 *
16 * Registers an irq_domain structure. The irq_domain must at a minimum be
17 * initialized with an ops structure pointer, and either a ->to_irq hook or
18 * a valid irq_base value. Everything else is optional.
19 */
20void irq_domain_add(struct irq_domain *domain)
21{
22 struct irq_data *d;
23 int hwirq;
24
25 /*
26 * This assumes that the irq_domain owner has already allocated
27 * the irq_descs. This block will be removed when support for dynamic
28 * allocation of irq_descs is added to irq_domain.
29 */
30 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
31 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
32 if (!d) {
 33 WARN(1, "error: assigning domain to non-existent irq_desc");
34 return;
35 }
36 if (d->domain) {
37 /* things are broken; just report, don't clean up */
38 WARN(1, "error: irq_desc already assigned to a domain");
39 return;
40 }
41 d->domain = domain;
42 d->hwirq = hwirq;
43 }
44
45 mutex_lock(&irq_domain_mutex);
46 list_add(&domain->list, &irq_domain_list);
47 mutex_unlock(&irq_domain_mutex);
48}
49
50/**
51 * irq_domain_del() - Unregister an irq_domain
52 * @domain: ptr to registered irq_domain.
53 */
54void irq_domain_del(struct irq_domain *domain)
55{
56 struct irq_data *d;
57 int hwirq;
58
59 mutex_lock(&irq_domain_mutex);
60 list_del(&domain->list);
61 mutex_unlock(&irq_domain_mutex);
62
63 /* Clear the irq_domain assignments */
64 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
65 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
66 d->domain = NULL;
67 }
68}
69
70#if defined(CONFIG_OF_IRQ)
71/**
72 * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec
73 *
74 * Used by the device tree interrupt mapping code to translate a device tree
75 * interrupt specifier to a valid linux irq number. Returns either a valid
76 * linux IRQ number or 0.
77 *
 78 * When the caller no longer needs the irq number returned by this function it
79 * should arrange to call irq_dispose_mapping().
80 */
81unsigned int irq_create_of_mapping(struct device_node *controller,
82 const u32 *intspec, unsigned int intsize)
83{
84 struct irq_domain *domain;
85 unsigned long hwirq;
86 unsigned int irq, type;
87 int rc = -EINVAL;
88
89 /* Find a domain which can translate the irq spec */
90 mutex_lock(&irq_domain_mutex);
91 list_for_each_entry(domain, &irq_domain_list, list) {
92 if (!domain->ops->dt_translate)
93 continue;
94 rc = domain->ops->dt_translate(domain, controller,
95 intspec, intsize, &hwirq, &type);
96 if (rc == 0)
97 break;
98 }
99 mutex_unlock(&irq_domain_mutex);
100
101 if (rc != 0)
102 return 0;
103
104 irq = irq_domain_to_irq(domain, hwirq);
105 if (type != IRQ_TYPE_NONE)
106 irq_set_irq_type(irq, type);
107 pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n",
108 controller->full_name, (int)hwirq, irq, type);
109 return irq;
110}
111EXPORT_SYMBOL_GPL(irq_create_of_mapping);
112
113/**
114 * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping()
115 * @irq: linux irq number to be discarded
116 *
117 * Calling this function indicates the caller no longer needs a reference to
118 * the linux irq number returned by a prior call to irq_create_of_mapping().
119 */
120void irq_dispose_mapping(unsigned int irq)
121{
122 /*
123 * nothing yet; will be filled when support for dynamic allocation of
124 * irq_descs is added to irq_domain
125 */
126}
127EXPORT_SYMBOL_GPL(irq_dispose_mapping);
128
129int irq_domain_simple_dt_translate(struct irq_domain *d,
130 struct device_node *controller,
131 const u32 *intspec, unsigned int intsize,
132 unsigned long *out_hwirq, unsigned int *out_type)
133{
134 if (d->of_node != controller)
135 return -EINVAL;
136 if (intsize < 1)
137 return -EINVAL;
138
139 *out_hwirq = intspec[0];
140 *out_type = IRQ_TYPE_NONE;
141 if (intsize > 1)
142 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
143 return 0;
144}
145
146struct irq_domain_ops irq_domain_simple_ops = {
147 .dt_translate = irq_domain_simple_dt_translate,
148};
149EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
150
151/**
 152 * irq_domain_add_simple() - Set up a 'simple' translation range
153 */
154void irq_domain_add_simple(struct device_node *controller, int irq_base)
155{
156 struct irq_domain *domain;
157
158 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
159 if (!domain) {
160 WARN_ON(1);
161 return;
162 }
163
164 domain->irq_base = irq_base;
165 domain->of_node = of_node_get(controller);
166 domain->ops = &irq_domain_simple_ops;
167 irq_domain_add(domain);
168}
169EXPORT_SYMBOL_GPL(irq_domain_add_simple);
170
171void irq_domain_generate_simple(const struct of_device_id *match,
172 u64 phys_base, unsigned int irq_start)
173{
174 struct device_node *node;
175 pr_info("looking for phys_base=%llx, irq_start=%i\n",
176 (unsigned long long) phys_base, (int) irq_start);
177 node = of_find_matching_node_by_address(NULL, match, phys_base);
178 if (node)
179 irq_domain_add_simple(node, irq_start);
180 else
181 pr_info("no node found\n");
182}
183EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
184#endif /* CONFIG_OF_IRQ */
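
For a controller whose irq_descs are already allocated, the simple helpers above are intended to be called once at init time so that DT interrupt specifiers resolve through irq_create_of_mapping(). The sketch below shows one plausible shape of such a call; the "acme,intc" compatible, DEMO_PHYS_BASE and DEMO_IRQ_BASE are made-up placeholders, and only the irq_domain_generate_simple() signature comes from this patch.

/* Hypothetical controller init using the helpers added above; names and
 * numbers are placeholders, not real bindings. */
#include <linux/init.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

#define DEMO_PHYS_BASE	0x10140000ULL	/* placeholder MMIO base from the DT reg */
#define DEMO_IRQ_BASE	32U		/* first of the pre-allocated linux irqs */

static const struct of_device_id demo_intc_match[] = {
	{ .compatible = "acme,intc" },
	{ }
};

static void __init demo_intc_init(void)
{
	/*
	 * Finds the matching DT node by its physical address and attaches a
	 * simple domain whose translations start at DEMO_IRQ_BASE, so DT
	 * interrupt specifiers resolve through irq_create_of_mapping().
	 */
	irq_domain_generate_simple(demo_intc_match, DEMO_PHYS_BASE,
				   DEMO_IRQ_BASE);
}
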
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0a7840aeb0f..d6c4adc2804 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -620,8 +620,9 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
620 620
621static int irq_wait_for_interrupt(struct irqaction *action) 621static int irq_wait_for_interrupt(struct irqaction *action)
622{ 622{
623 set_current_state(TASK_INTERRUPTIBLE);
624
623 while (!kthread_should_stop()) { 625 while (!kthread_should_stop()) {
624 set_current_state(TASK_INTERRUPTIBLE);
625 626
626 if (test_and_clear_bit(IRQTF_RUNTHREAD, 627 if (test_and_clear_bit(IRQTF_RUNTHREAD,
627 &action->thread_flags)) { 628 &action->thread_flags)) {
@@ -629,7 +630,9 @@ static int irq_wait_for_interrupt(struct irqaction *action)
629 return 0; 630 return 0;
630 } 631 }
631 schedule(); 632 schedule();
633 set_current_state(TASK_INTERRUPTIBLE);
632 } 634 }
635 __set_current_state(TASK_RUNNING);
633 return -1; 636 return -1;
634} 637}
635 638
@@ -883,6 +886,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
883 886
884 if (desc->irq_data.chip == &no_irq_chip) 887 if (desc->irq_data.chip == &no_irq_chip)
885 return -ENOSYS; 888 return -ENOSYS;
889 if (!try_module_get(desc->owner))
890 return -ENODEV;
886 /* 891 /*
887 * Some drivers like serial.c use request_irq() heavily, 892 * Some drivers like serial.c use request_irq() heavily,
888 * so we have to be careful not to interfere with a 893 * so we have to be careful not to interfere with a
@@ -906,8 +911,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
906 */ 911 */
907 nested = irq_settings_is_nested_thread(desc); 912 nested = irq_settings_is_nested_thread(desc);
908 if (nested) { 913 if (nested) {
909 if (!new->thread_fn) 914 if (!new->thread_fn) {
910 return -EINVAL; 915 ret = -EINVAL;
916 goto out_mput;
917 }
911 /* 918 /*
912 * Replace the primary handler which was provided from 919 * Replace the primary handler which was provided from
913 * the driver for non nested interrupt handling by the 920 * the driver for non nested interrupt handling by the
@@ -929,8 +936,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
929 936
930 t = kthread_create(irq_thread, new, "irq/%d-%s", irq, 937 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
931 new->name); 938 new->name);
932 if (IS_ERR(t)) 939 if (IS_ERR(t)) {
933 return PTR_ERR(t); 940 ret = PTR_ERR(t);
941 goto out_mput;
942 }
934 /* 943 /*
935 * We keep the reference to the task struct even if 944 * We keep the reference to the task struct even if
936 * the thread dies to avoid that the interrupt code 945 * the thread dies to avoid that the interrupt code
@@ -1095,6 +1104,8 @@ out_thread:
1095 kthread_stop(t); 1104 kthread_stop(t);
1096 put_task_struct(t); 1105 put_task_struct(t);
1097 } 1106 }
1107out_mput:
1108 module_put(desc->owner);
1098 return ret; 1109 return ret;
1099} 1110}
1100 1111
@@ -1203,6 +1214,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1203 put_task_struct(action->thread); 1214 put_task_struct(action->thread);
1204 } 1215 }
1205 1216
1217 module_put(desc->owner);
1206 return action; 1218 return action;
1207} 1219}
1208 1220
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index f76fc00c987..fe4b09cf829 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,6 +9,7 @@
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/syscore_ops.h>
12 13
13#include "internals.h" 14#include "internals.h"
14 15
@@ -39,25 +40,58 @@ void suspend_device_irqs(void)
39} 40}
40EXPORT_SYMBOL_GPL(suspend_device_irqs); 41EXPORT_SYMBOL_GPL(suspend_device_irqs);
41 42
42/** 43static void resume_irqs(bool want_early)
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQS_SUSPENDED flag set.
47 */
48void resume_device_irqs(void)
49{ 44{
50 struct irq_desc *desc; 45 struct irq_desc *desc;
51 int irq; 46 int irq;
52 47
53 for_each_irq_desc(irq, desc) { 48 for_each_irq_desc(irq, desc) {
54 unsigned long flags; 49 unsigned long flags;
50 bool is_early = desc->action &&
51 desc->action->flags & IRQF_EARLY_RESUME;
52
53 if (is_early != want_early)
54 continue;
55 55
56 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
57 __enable_irq(desc, irq, true); 57 __enable_irq(desc, irq, true);
58 raw_spin_unlock_irqrestore(&desc->lock, flags); 58 raw_spin_unlock_irqrestore(&desc->lock, flags);
59 } 59 }
60} 60}
61
62/**
63 * irq_pm_syscore_ops - enable interrupt lines early
64 *
65 * Enable all interrupt lines with %IRQF_EARLY_RESUME set.
66 */
67static void irq_pm_syscore_resume(void)
68{
69 resume_irqs(true);
70}
71
72static struct syscore_ops irq_pm_syscore_ops = {
73 .resume = irq_pm_syscore_resume,
74};
75
76static int __init irq_pm_init_ops(void)
77{
78 register_syscore_ops(&irq_pm_syscore_ops);
79 return 0;
80}
81
82device_initcall(irq_pm_init_ops);
83
84/**
85 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
86 *
87 * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously
88 * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag
89 * set as well as those with %IRQF_FORCE_RESUME.
90 */
91void resume_device_irqs(void)
92{
93 resume_irqs(false);
94}
61EXPORT_SYMBOL_GPL(resume_device_irqs); 95EXPORT_SYMBOL_GPL(resume_device_irqs);
62 96
63/** 97/**
@@ -70,8 +104,13 @@ int check_wakeup_irqs(void)
70 104
71 for_each_irq_desc(irq, desc) { 105 for_each_irq_desc(irq, desc) {
72 if (irqd_is_wakeup_set(&desc->irq_data)) { 106 if (irqd_is_wakeup_set(&desc->irq_data)) {
73 if (desc->istate & IRQS_PENDING) 107 if (desc->istate & IRQS_PENDING) {
108 pr_info("Wakeup IRQ %d %s pending, suspend aborted\n",
109 irq,
110 desc->action && desc->action->name ?
111 desc->action->name : "");
74 return -EBUSY; 112 return -EBUSY;
113 }
75 continue; 114 continue;
76 } 115 }
77 /* 116 /*
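
Lines flagged IRQF_EARLY_RESUME are now re-enabled from the syscore resume callback registered above, which runs before the ordinary resume_device_irqs() pass; everything else keeps resuming at the usual point. From a driver's perspective the only visible change is the extra flag at request time, as in the hypothetical fragment below (handler, name and arguments are placeholders).

/* Hypothetical driver opting into the early resume pass added above. */
#include <linux/interrupt.h>

static irqreturn_t demo_early_handler(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int demo_request(unsigned int irq, void *dev)
{
	/*
	 * IRQF_EARLY_RESUME means this line is re-enabled by
	 * irq_pm_syscore_resume() before the normal resume_device_irqs()
	 * pass runs for everything else.
	 */
	return request_irq(irq, demo_early_handler, IRQF_EARLY_RESUME,
			   "demo-early", dev);
}
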
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 14dd5761e8c..ef60772d2fe 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -55,17 +55,18 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
55 */ 55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq) 56void check_irq_resend(struct irq_desc *desc, unsigned int irq)
57{ 57{
58 /*
59 * We do not resend level type interrupts. Level type
60 * interrupts are resent by hardware when they are still
61 * active.
62 */
63 if (irq_settings_is_level(desc))
64 return;
65 if (desc->istate & IRQS_REPLAY)
66 return;
67 if (desc->istate & IRQS_PENDING) { 58 if (desc->istate & IRQS_PENDING) {
68 desc->istate &= ~IRQS_PENDING; 59 desc->istate &= ~IRQS_PENDING;
60 /*
61 * We do not resend level type interrupts. Level type
62 * interrupts are resent by hardware when they are still
63 * active.
64 */
65 if (irq_settings_is_level(desc))
66 return;
67 if (desc->istate & IRQS_REPLAY)
68 return;
69
69 desc->istate |= IRQS_REPLAY; 70 desc->istate |= IRQS_REPLAY;
70 71
71 if (!desc->irq_data.chip->irq_retrigger || 72 if (!desc->irq_data.chip->irq_retrigger ||
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index aa57d5da18c..dc813a948be 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -84,7 +84,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
84 */ 84 */
85 action = desc->action; 85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) || 86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || !action->next) 87 (action->flags & __IRQF_TIMER) ||
88 (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
89 !action->next)
88 goto out; 90 goto out;
89 91
90 /* Already running on another processor */ 92 /* Already running on another processor */
@@ -115,7 +117,7 @@ static int misrouted_irq(int irq)
115 struct irq_desc *desc; 117 struct irq_desc *desc;
116 int i, ok = 0; 118 int i, ok = 0;
117 119
118 if (atomic_inc_return(&irq_poll_active) == 1) 120 if (atomic_inc_return(&irq_poll_active) != 1)
119 goto out; 121 goto out;
120 122
121 irq_poll_cpu = smp_processor_id(); 123 irq_poll_cpu = smp_processor_id();
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index a8ce45097f3..e6f1f24ad57 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -66,8 +66,9 @@ void jump_label_inc(struct jump_label_key *key)
66 return; 66 return;
67 67
68 jump_label_lock(); 68 jump_label_lock();
69 if (atomic_add_return(1, &key->enabled) == 1) 69 if (atomic_read(&key->enabled) == 0)
70 jump_label_update(key, JUMP_LABEL_ENABLE); 70 jump_label_update(key, JUMP_LABEL_ENABLE);
71 atomic_inc(&key->enabled);
71 jump_label_unlock(); 72 jump_label_unlock();
72} 73}
73 74
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 8d814cbc810..296fbc84d65 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1095,7 +1095,7 @@ size_t crash_get_memory_size(void)
1095 size_t size = 0; 1095 size_t size = 0;
1096 mutex_lock(&kexec_mutex); 1096 mutex_lock(&kexec_mutex);
1097 if (crashk_res.end != crashk_res.start) 1097 if (crashk_res.end != crashk_res.start)
1098 size = crashk_res.end - crashk_res.start + 1; 1098 size = resource_size(&crashk_res);
1099 mutex_unlock(&kexec_mutex); 1099 mutex_unlock(&kexec_mutex);
1100 return size; 1100 return size;
1101} 1101}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 47613dfb7b2..a4bea97c75b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -114,10 +114,12 @@ int __request_module(bool wait, const char *fmt, ...)
114 atomic_inc(&kmod_concurrent); 114 atomic_inc(&kmod_concurrent);
115 if (atomic_read(&kmod_concurrent) > max_modprobes) { 115 if (atomic_read(&kmod_concurrent) > max_modprobes) {
116 /* We may be blaming an innocent here, but unlikely */ 116 /* We may be blaming an innocent here, but unlikely */
117 if (kmod_loop_msg++ < 5) 117 if (kmod_loop_msg < 5) {
118 printk(KERN_ERR 118 printk(KERN_ERR
119 "request_module: runaway loop modprobe %s\n", 119 "request_module: runaway loop modprobe %s\n",
120 module_name); 120 module_name);
121 kmod_loop_msg++;
122 }
121 atomic_dec(&kmod_concurrent); 123 atomic_dec(&kmod_concurrent);
122 return -ENOMEM; 124 return -ENOMEM;
123 } 125 }
@@ -274,7 +276,7 @@ static void __call_usermodehelper(struct work_struct *work)
274 * (used for preventing user land processes from being created after the user 276 * (used for preventing user land processes from being created after the user
275 * land has been frozen during a system-wide hibernation or suspend operation). 277 * land has been frozen during a system-wide hibernation or suspend operation).
276 */ 278 */
277static int usermodehelper_disabled; 279static int usermodehelper_disabled = 1;
278 280
279/* Number of helpers running */ 281/* Number of helpers running */
280static atomic_t running_helpers = ATOMIC_INIT(0); 282static atomic_t running_helpers = ATOMIC_INIT(0);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 77981813a1e..b30fd54eb98 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1255,19 +1255,29 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
1255/* 1255/*
1256 * If we have a symbol_name argument, look it up and add the offset field 1256 * If we have a symbol_name argument, look it up and add the offset field
1257 * to it. This way, we can specify a relative address to a symbol. 1257 * to it. This way, we can specify a relative address to a symbol.
1258 * This returns encoded errors if it fails to look up symbol or invalid
1259 * combination of parameters.
1258 */ 1260 */
1259static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) 1261static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
1260{ 1262{
1261 kprobe_opcode_t *addr = p->addr; 1263 kprobe_opcode_t *addr = p->addr;
1264
1265 if ((p->symbol_name && p->addr) ||
1266 (!p->symbol_name && !p->addr))
1267 goto invalid;
1268
1262 if (p->symbol_name) { 1269 if (p->symbol_name) {
1263 if (addr)
1264 return NULL;
1265 kprobe_lookup_name(p->symbol_name, addr); 1270 kprobe_lookup_name(p->symbol_name, addr);
1271 if (!addr)
1272 return ERR_PTR(-ENOENT);
1266 } 1273 }
1267 1274
1268 if (!addr) 1275 addr = (kprobe_opcode_t *)(((char *)addr) + p->offset);
1269 return NULL; 1276 if (addr)
1270 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 1277 return addr;
1278
1279invalid:
1280 return ERR_PTR(-EINVAL);
1271} 1281}
1272 1282
1273/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1283/* Check passed kprobe is valid and return kprobe in kprobe_table. */
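
kprobe_addr() now reports failures through the kernel's encoded-pointer convention rather than a bare NULL, so its callers can hand back -EINVAL for a bad symbol/addr combination and -ENOENT for a failed symbol lookup. The snippet below imitates that ERR_PTR/IS_ERR/PTR_ERR pattern in plain userspace C purely for illustration; the real macros live in <linux/err.h>.

/* Userspace imitation of the ERR_PTR convention kprobe_addr() now uses:
 * small negative errno values are smuggled through a pointer so a single
 * return slot can carry either an address or an error code. */
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095

static inline void *err_ptr(long error)	{ return (void *)error; }
static inline long ptr_err(const void *ptr)	{ return (long)ptr; }
static inline int is_err(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Mimics the argument checking added to kprobe_addr(): exactly one of
 * symbol_name / addr may be supplied. */
static void *resolve_probe_addr(const char *symbol_name, void *addr)
{
	if ((symbol_name && addr) || (!symbol_name && !addr))
		return err_ptr(-EINVAL);
	if (symbol_name)
		return err_ptr(-ENOENT);	/* no symbol lookup in this sketch */
	return addr;
}

int main(void)
{
	void *p = resolve_probe_addr("do_fork", (void *)0x1234);

	if (is_err(p))
		printf("rejected: %ld\n", ptr_err(p));
	return 0;
}
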
@@ -1311,8 +1321,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1311 kprobe_opcode_t *addr; 1321 kprobe_opcode_t *addr;
1312 1322
1313 addr = kprobe_addr(p); 1323 addr = kprobe_addr(p);
1314 if (!addr) 1324 if (IS_ERR(addr))
1315 return -EINVAL; 1325 return PTR_ERR(addr);
1316 p->addr = addr; 1326 p->addr = addr;
1317 1327
1318 ret = check_kprobe_rereg(p); 1328 ret = check_kprobe_rereg(p);
@@ -1335,6 +1345,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1335 */ 1345 */
1336 probed_mod = __module_text_address((unsigned long) p->addr); 1346 probed_mod = __module_text_address((unsigned long) p->addr);
1337 if (probed_mod) { 1347 if (probed_mod) {
1348 /* Return -ENOENT if fail. */
1349 ret = -ENOENT;
1338 /* 1350 /*
1339 * We must hold a refcount of the probed module while updating 1351 * We must hold a refcount of the probed module while updating
1340 * its code to prohibit unexpected unloading. 1352 * its code to prohibit unexpected unloading.
@@ -1351,6 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p)
1351 module_put(probed_mod); 1363 module_put(probed_mod);
1352 goto fail_with_jump_label; 1364 goto fail_with_jump_label;
1353 } 1365 }
1366 /* ret will be updated by following code */
1354 } 1367 }
1355 preempt_enable(); 1368 preempt_enable();
1356 jump_label_unlock(); 1369 jump_label_unlock();
@@ -1399,7 +1412,7 @@ out:
1399fail_with_jump_label: 1412fail_with_jump_label:
1400 preempt_enable(); 1413 preempt_enable();
1401 jump_label_unlock(); 1414 jump_label_unlock();
1402 return -EINVAL; 1415 return ret;
1403} 1416}
1404EXPORT_SYMBOL_GPL(register_kprobe); 1417EXPORT_SYMBOL_GPL(register_kprobe);
1405 1418
@@ -1686,8 +1699,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1686 1699
1687 if (kretprobe_blacklist_size) { 1700 if (kretprobe_blacklist_size) {
1688 addr = kprobe_addr(&rp->kp); 1701 addr = kprobe_addr(&rp->kp);
1689 if (!addr) 1702 if (IS_ERR(addr))
1690 return -EINVAL; 1703 return PTR_ERR(addr);
1691 1704
1692 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { 1705 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
1693 if (kretprobe_blacklist[i].addr == addr) 1706 if (kretprobe_blacklist[i].addr == addr)
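For reference, a minimal sketch of a caller consuming the new kprobe_addr() error encoding: register_kprobe()/register_kretprobe() now propagate -ENOENT (symbol lookup failed) or -EINVAL (both or neither of symbol_name/addr supplied) instead of a bare -EINVAL. Hypothetical module, not part of this patch:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

static int example_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("kprobe hit at %p\n", p->addr);
	return 0;	/* let execution continue normally */
}

static struct kprobe example_kp = {
	.symbol_name	= "do_fork",	/* symbol_name XOR addr, as kprobe_addr() now enforces */
	.pre_handler	= example_pre,
};

static int __init example_init(void)
{
	int ret = register_kprobe(&example_kp);

	/* -ENOENT: symbol not found; -EINVAL: invalid symbol_name/addr combination */
	if (ret < 0)
		pr_err("register_kprobe failed: %d\n", ret);
	return ret;
}

static void __exit example_exit(void)
{
	unregister_kprobe(&example_kp);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");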
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 298c9276dfd..447960603fb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -44,6 +44,7 @@
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h> 45#include <linux/bitops.h>
46#include <linux/gfp.h> 46#include <linux/gfp.h>
47#include <linux/kmemcheck.h>
47 48
48#include <asm/sections.h> 49#include <asm/sections.h>
49 50
@@ -2468,6 +2469,9 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2468 2469
2469 BUG_ON(usage_bit >= LOCK_USAGE_STATES); 2470 BUG_ON(usage_bit >= LOCK_USAGE_STATES);
2470 2471
2472 if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys)
2473 continue;
2474
2471 if (!mark_lock(curr, hlock, usage_bit)) 2475 if (!mark_lock(curr, hlock, usage_bit))
2472 return 0; 2476 return 0;
2473 } 2477 }
@@ -2478,34 +2482,13 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2478/* 2482/*
2479 * Hardirqs will be enabled: 2483 * Hardirqs will be enabled:
2480 */ 2484 */
2481void trace_hardirqs_on_caller(unsigned long ip) 2485static void __trace_hardirqs_on_caller(unsigned long ip)
2482{ 2486{
2483 struct task_struct *curr = current; 2487 struct task_struct *curr = current;
2484 2488
2485 time_hardirqs_on(CALLER_ADDR0, ip);
2486
2487 if (unlikely(!debug_locks || current->lockdep_recursion))
2488 return;
2489
2490 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2491 return;
2492
2493 if (unlikely(curr->hardirqs_enabled)) {
2494 /*
2495 * Neither irq nor preemption are disabled here
2496 * so this is racy by nature but losing one hit
2497 * in a stat is not a big deal.
2498 */
2499 __debug_atomic_inc(redundant_hardirqs_on);
2500 return;
2501 }
2502 /* we'll do an OFF -> ON transition: */ 2489 /* we'll do an OFF -> ON transition: */
2503 curr->hardirqs_enabled = 1; 2490 curr->hardirqs_enabled = 1;
2504 2491
2505 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2506 return;
2507 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
2508 return;
2509 /* 2492 /*
2510 * We are going to turn hardirqs on, so set the 2493 * We are going to turn hardirqs on, so set the
2511 * usage bit for all held locks: 2494 * usage bit for all held locks:
@@ -2525,6 +2508,37 @@ void trace_hardirqs_on_caller(unsigned long ip)
2525 curr->hardirq_enable_event = ++curr->irq_events; 2508 curr->hardirq_enable_event = ++curr->irq_events;
2526 debug_atomic_inc(hardirqs_on_events); 2509 debug_atomic_inc(hardirqs_on_events);
2527} 2510}
2511
2512void trace_hardirqs_on_caller(unsigned long ip)
2513{
2514 time_hardirqs_on(CALLER_ADDR0, ip);
2515
2516 if (unlikely(!debug_locks || current->lockdep_recursion))
2517 return;
2518
2519 if (unlikely(current->hardirqs_enabled)) {
2520 /*
2521 * Neither irq nor preemption are disabled here
2522 * so this is racy by nature but losing one hit
2523 * in a stat is not a big deal.
2524 */
2525 __debug_atomic_inc(redundant_hardirqs_on);
2526 return;
2527 }
2528
2529 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2530 return;
2531
2532 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2533 return;
2534
2535 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
2536 return;
2537
2538 current->lockdep_recursion = 1;
2539 __trace_hardirqs_on_caller(ip);
2540 current->lockdep_recursion = 0;
2541}
2528EXPORT_SYMBOL(trace_hardirqs_on_caller); 2542EXPORT_SYMBOL(trace_hardirqs_on_caller);
2529 2543
2530void trace_hardirqs_on(void) 2544void trace_hardirqs_on(void)
@@ -2574,7 +2588,7 @@ void trace_softirqs_on(unsigned long ip)
2574{ 2588{
2575 struct task_struct *curr = current; 2589 struct task_struct *curr = current;
2576 2590
2577 if (unlikely(!debug_locks)) 2591 if (unlikely(!debug_locks || current->lockdep_recursion))
2578 return; 2592 return;
2579 2593
2580 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2594 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
@@ -2585,6 +2599,7 @@ void trace_softirqs_on(unsigned long ip)
2585 return; 2599 return;
2586 } 2600 }
2587 2601
2602 current->lockdep_recursion = 1;
2588 /* 2603 /*
2589 * We'll do an OFF -> ON transition: 2604 * We'll do an OFF -> ON transition:
2590 */ 2605 */
@@ -2599,6 +2614,7 @@ void trace_softirqs_on(unsigned long ip)
2599 */ 2614 */
2600 if (curr->hardirqs_enabled) 2615 if (curr->hardirqs_enabled)
2601 mark_held_locks(curr, SOFTIRQ); 2616 mark_held_locks(curr, SOFTIRQ);
2617 current->lockdep_recursion = 0;
2602} 2618}
2603 2619
2604/* 2620/*
@@ -2608,7 +2624,7 @@ void trace_softirqs_off(unsigned long ip)
2608{ 2624{
2609 struct task_struct *curr = current; 2625 struct task_struct *curr = current;
2610 2626
2611 if (unlikely(!debug_locks)) 2627 if (unlikely(!debug_locks || current->lockdep_recursion))
2612 return; 2628 return;
2613 2629
2614 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2630 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
@@ -2861,6 +2877,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2861{ 2877{
2862 int i; 2878 int i;
2863 2879
2880 kmemcheck_mark_initialized(lock, sizeof(*lock));
2881
2864 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) 2882 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
2865 lock->class_cache[i] = NULL; 2883 lock->class_cache[i] = NULL;
2866 2884
@@ -3099,7 +3117,13 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
3099 if (!class) 3117 if (!class)
3100 class = look_up_lock_class(lock, 0); 3118 class = look_up_lock_class(lock, 0);
3101 3119
3102 if (DEBUG_LOCKS_WARN_ON(!class)) 3120 /*
3121 * If look_up_lock_class() failed to find a class, we're trying
3122 * to test if we hold a lock that has never yet been acquired.
3123 * Clearly if the lock hasn't been acquired _ever_, we're not
3124 * holding it either, so report failure.
3125 */
3126 if (!class)
3103 return 0; 3127 return 0;
3104 3128
3105 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) 3129 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
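The new check in mark_held_locks() skips lock classes keyed by __lockdep_no_validate__.subkeys. A class gets that key through lockdep_set_novalidate_class(), which the driver core uses for dev->mutex; a rough sketch of the pattern (hypothetical structure, not from this patch):

#include <linux/mutex.h>
#include <linux/lockdep.h>

struct my_dev {				/* hypothetical */
	struct mutex lock;
};

static void my_dev_init(struct my_dev *d)
{
	mutex_init(&d->lock);
	/* opt this lock out of lockdep validation; with the hunk above,
	 * mark_held_locks() also skips it when marking IRQ usage bits */
	lockdep_set_novalidate_class(&d->lock);
}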
diff --git a/kernel/module.c b/kernel/module.c
index 795bdc7f5c3..e0ddcece2be 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -545,9 +545,9 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \
545 mod->field = kstrdup(s, GFP_KERNEL); \ 545 mod->field = kstrdup(s, GFP_KERNEL); \
546} \ 546} \
547static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ 547static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
548 struct module *mod, char *buffer) \ 548 struct module_kobject *mk, char *buffer) \
549{ \ 549{ \
550 return sprintf(buffer, "%s\n", mod->field); \ 550 return sprintf(buffer, "%s\n", mk->mod->field); \
551} \ 551} \
552static int modinfo_##field##_exists(struct module *mod) \ 552static int modinfo_##field##_exists(struct module *mod) \
553{ \ 553{ \
@@ -902,9 +902,9 @@ void symbol_put_addr(void *addr)
902EXPORT_SYMBOL_GPL(symbol_put_addr); 902EXPORT_SYMBOL_GPL(symbol_put_addr);
903 903
904static ssize_t show_refcnt(struct module_attribute *mattr, 904static ssize_t show_refcnt(struct module_attribute *mattr,
905 struct module *mod, char *buffer) 905 struct module_kobject *mk, char *buffer)
906{ 906{
907 return sprintf(buffer, "%u\n", module_refcount(mod)); 907 return sprintf(buffer, "%u\n", module_refcount(mk->mod));
908} 908}
909 909
910static struct module_attribute refcnt = { 910static struct module_attribute refcnt = {
@@ -952,11 +952,11 @@ static inline int module_unload_init(struct module *mod)
952#endif /* CONFIG_MODULE_UNLOAD */ 952#endif /* CONFIG_MODULE_UNLOAD */
953 953
954static ssize_t show_initstate(struct module_attribute *mattr, 954static ssize_t show_initstate(struct module_attribute *mattr,
955 struct module *mod, char *buffer) 955 struct module_kobject *mk, char *buffer)
956{ 956{
957 const char *state = "unknown"; 957 const char *state = "unknown";
958 958
959 switch (mod->state) { 959 switch (mk->mod->state) {
960 case MODULE_STATE_LIVE: 960 case MODULE_STATE_LIVE:
961 state = "live"; 961 state = "live";
962 break; 962 break;
@@ -975,10 +975,27 @@ static struct module_attribute initstate = {
975 .show = show_initstate, 975 .show = show_initstate,
976}; 976};
977 977
978static ssize_t store_uevent(struct module_attribute *mattr,
979 struct module_kobject *mk,
980 const char *buffer, size_t count)
981{
982 enum kobject_action action;
983
984 if (kobject_action_type(buffer, count, &action) == 0)
985 kobject_uevent(&mk->kobj, action);
986 return count;
987}
988
989struct module_attribute module_uevent = {
990 .attr = { .name = "uevent", .mode = 0200 },
991 .store = store_uevent,
992};
993
978static struct module_attribute *modinfo_attrs[] = { 994static struct module_attribute *modinfo_attrs[] = {
979 &modinfo_version, 995 &modinfo_version,
980 &modinfo_srcversion, 996 &modinfo_srcversion,
981 &initstate, 997 &initstate,
998 &module_uevent,
982#ifdef CONFIG_MODULE_UNLOAD 999#ifdef CONFIG_MODULE_UNLOAD
983 &refcnt, 1000 &refcnt,
984#endif 1001#endif
@@ -1187,7 +1204,7 @@ struct module_sect_attrs
1187}; 1204};
1188 1205
1189static ssize_t module_sect_show(struct module_attribute *mattr, 1206static ssize_t module_sect_show(struct module_attribute *mattr,
1190 struct module *mod, char *buf) 1207 struct module_kobject *mk, char *buf)
1191{ 1208{
1192 struct module_sect_attr *sattr = 1209 struct module_sect_attr *sattr =
1193 container_of(mattr, struct module_sect_attr, mattr); 1210 container_of(mattr, struct module_sect_attr, mattr);
@@ -1697,6 +1714,15 @@ static void unset_module_core_ro_nx(struct module *mod) { }
1697static void unset_module_init_ro_nx(struct module *mod) { } 1714static void unset_module_init_ro_nx(struct module *mod) { }
1698#endif 1715#endif
1699 1716
1717void __weak module_free(struct module *mod, void *module_region)
1718{
1719 vfree(module_region);
1720}
1721
1722void __weak module_arch_cleanup(struct module *mod)
1723{
1724}
1725
1700/* Free a module, remove from lists, etc. */ 1726/* Free a module, remove from lists, etc. */
1701static void free_module(struct module *mod) 1727static void free_module(struct module *mod)
1702{ 1728{
@@ -1851,6 +1877,26 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1851 return ret; 1877 return ret;
1852} 1878}
1853 1879
1880int __weak apply_relocate(Elf_Shdr *sechdrs,
1881 const char *strtab,
1882 unsigned int symindex,
1883 unsigned int relsec,
1884 struct module *me)
1885{
1886 pr_err("module %s: REL relocation unsupported\n", me->name);
1887 return -ENOEXEC;
1888}
1889
1890int __weak apply_relocate_add(Elf_Shdr *sechdrs,
1891 const char *strtab,
1892 unsigned int symindex,
1893 unsigned int relsec,
1894 struct module *me)
1895{
1896 pr_err("module %s: RELA relocation unsupported\n", me->name);
1897 return -ENOEXEC;
1898}
1899
1854static int apply_relocations(struct module *mod, const struct load_info *info) 1900static int apply_relocations(struct module *mod, const struct load_info *info)
1855{ 1901{
1856 unsigned int i; 1902 unsigned int i;
@@ -2235,6 +2281,11 @@ static void dynamic_debug_remove(struct _ddebug *debug)
2235 ddebug_remove_module(debug->modname); 2281 ddebug_remove_module(debug->modname);
2236} 2282}
2237 2283
2284void * __weak module_alloc(unsigned long size)
2285{
2286 return size == 0 ? NULL : vmalloc_exec(size);
2287}
2288
2238static void *module_alloc_update_bounds(unsigned long size) 2289static void *module_alloc_update_bounds(unsigned long size)
2239{ 2290{
2240 void *ret = module_alloc(size); 2291 void *ret = module_alloc(size);
@@ -2477,7 +2528,7 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2477 mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl"); 2528 mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
2478#endif 2529#endif
2479#ifdef CONFIG_CONSTRUCTORS 2530#ifdef CONFIG_CONSTRUCTORS
2480 mod->ctors = section_objs(info, ".ctors", 2531 mod->ctors = section_objs(info, CONFIG_GCOV_CTORS,
2481 sizeof(*mod->ctors), &mod->num_ctors); 2532 sizeof(*mod->ctors), &mod->num_ctors);
2482#endif 2533#endif
2483 2534
@@ -2645,6 +2696,14 @@ static void flush_module_icache(const struct module *mod)
2645 set_fs(old_fs); 2696 set_fs(old_fs);
2646} 2697}
2647 2698
2699int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
2700 Elf_Shdr *sechdrs,
2701 char *secstrings,
2702 struct module *mod)
2703{
2704 return 0;
2705}
2706
2648static struct module *layout_and_allocate(struct load_info *info) 2707static struct module *layout_and_allocate(struct load_info *info)
2649{ 2708{
2650 /* Module within temporary copy. */ 2709 /* Module within temporary copy. */
@@ -2716,6 +2775,13 @@ static void module_deallocate(struct module *mod, struct load_info *info)
2716 module_free(mod, mod->module_core); 2775 module_free(mod, mod->module_core);
2717} 2776}
2718 2777
2778int __weak module_finalize(const Elf_Ehdr *hdr,
2779 const Elf_Shdr *sechdrs,
2780 struct module *me)
2781{
2782 return 0;
2783}
2784
2719static int post_relocation(struct module *mod, const struct load_info *info) 2785static int post_relocation(struct module *mod, const struct load_info *info)
2720{ 2786{
2721 /* Sort exception table now relocations are done. */ 2787 /* Sort exception table now relocations are done. */
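The __weak definitions added above (module_alloc(), module_free(), apply_relocate(), apply_relocate_add(), module_frob_arch_sections(), module_finalize()) are generic fallbacks; an architecture overrides one simply by providing a non-weak definition. A hedged sketch, loosely modeled on architectures that confine module text to a dedicated VA window — MODULES_VADDR/MODULES_END are assumed arch-provided constants and the __vmalloc_node_range() signature is assumed from this kernel generation:

#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/moduleloader.h>

void *module_alloc(unsigned long size)
{
	if (PAGE_ALIGN(size) > MODULES_END - MODULES_VADDR)
		return NULL;
	/* keep module text inside the arch module area so relative
	 * branches to/from the kernel image stay in range */
	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
				    GFP_KERNEL, PAGE_KERNEL_EXEC, -1,
				    __builtin_return_address(0));
}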
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2488ba7eb56..8d7b435806c 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -525,37 +525,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
525} 525}
526EXPORT_SYMBOL_GPL(srcu_init_notifier_head); 526EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
527 527
528/**
529 * register_reboot_notifier - Register function to be called at reboot time
530 * @nb: Info about notifier function to be called
531 *
532 * Registers a function with the list of functions
533 * to be called at reboot time.
534 *
535 * Currently always returns zero, as blocking_notifier_chain_register()
536 * always returns zero.
537 */
538int register_reboot_notifier(struct notifier_block *nb)
539{
540 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
541}
542EXPORT_SYMBOL(register_reboot_notifier);
543
544/**
545 * unregister_reboot_notifier - Unregister previously registered reboot notifier
546 * @nb: Hook to be unregistered
547 *
548 * Unregisters a previously registered reboot
549 * notifier function.
550 *
551 * Returns zero on success, or %-ENOENT on failure.
552 */
553int unregister_reboot_notifier(struct notifier_block *nb)
554{
555 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
556}
557EXPORT_SYMBOL(unregister_reboot_notifier);
558
559static ATOMIC_NOTIFIER_HEAD(die_chain); 528static ATOMIC_NOTIFIER_HEAD(die_chain);
560 529
561int notrace __kprobes notify_die(enum die_val val, const char *str, 530int notrace __kprobes notify_die(enum die_val val, const char *str,
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index d6a00f3de15..9aeab4b98c6 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -271,10 +271,8 @@ out:
271 return err; 271 return err;
272} 272}
273 273
274static int __init nsproxy_cache_init(void) 274int __init nsproxy_cache_init(void)
275{ 275{
276 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); 276 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
277 return 0; 277 return 0;
278} 278}
279
280module_init(nsproxy_cache_init);
diff --git a/kernel/panic.c b/kernel/panic.c
index 69231670eb9..41fc78ea3db 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,13 +27,19 @@
27#define PANIC_TIMER_STEP 100 27#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18 28#define PANIC_BLINK_SPD 18
29 29
30/* Machine specific panic information string */
31char *mach_panic_string;
32
30int panic_on_oops; 33int panic_on_oops;
31static unsigned long tainted_mask; 34static unsigned long tainted_mask;
32static int pause_on_oops; 35static int pause_on_oops;
33static int pause_on_oops_flag; 36static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 37static DEFINE_SPINLOCK(pause_on_oops_lock);
35 38
36int panic_timeout; 39#ifndef CONFIG_PANIC_TIMEOUT
40#define CONFIG_PANIC_TIMEOUT 0
41#endif
42int panic_timeout = CONFIG_PANIC_TIMEOUT;
37EXPORT_SYMBOL_GPL(panic_timeout); 43EXPORT_SYMBOL_GPL(panic_timeout);
38 44
39ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 45ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -119,6 +125,8 @@ NORET_TYPE void panic(const char * fmt, ...)
119 } 125 }
120 mdelay(PANIC_TIMER_STEP); 126 mdelay(PANIC_TIMER_STEP);
121 } 127 }
128 }
129 if (panic_timeout != 0) {
122 /* 130 /*
123 * This will not be a clean reboot, with everything 131 * This will not be a clean reboot, with everything
124 * shutting down. But if there is a chance of 132 * shutting down. But if there is a chance of
@@ -342,6 +350,11 @@ late_initcall(init_oops_id);
342void print_oops_end_marker(void) 350void print_oops_end_marker(void)
343{ 351{
344 init_oops_id(); 352 init_oops_id();
353
354 if (mach_panic_string)
355 printk(KERN_WARNING "Board Information: %s\n",
356 mach_panic_string);
357
345 printk(KERN_WARNING "---[ end trace %016llx ]---\n", 358 printk(KERN_WARNING "---[ end trace %016llx ]---\n",
346 (unsigned long long)oops_id); 359 (unsigned long long)oops_id);
347} 360}
diff --git a/kernel/params.c b/kernel/params.c
index ed72e133086..22df3e0d142 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -225,8 +225,8 @@ int parse_args(const char *name,
225 int ret; \ 225 int ret; \
226 \ 226 \
227 ret = strtolfn(val, 0, &l); \ 227 ret = strtolfn(val, 0, &l); \
228 if (ret == -EINVAL || ((type)l != l)) \ 228 if (ret < 0 || ((type)l != l)) \
229 return -EINVAL; \ 229 return ret < 0 ? ret : -EINVAL; \
230 *((type *)kp->arg) = l; \ 230 *((type *)kp->arg) = l; \
231 return 0; \ 231 return 0; \
232 } \ 232 } \
@@ -511,7 +511,7 @@ struct module_param_attrs
511#define to_param_attr(n) container_of(n, struct param_attribute, mattr) 511#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
512 512
513static ssize_t param_attr_show(struct module_attribute *mattr, 513static ssize_t param_attr_show(struct module_attribute *mattr,
514 struct module *mod, char *buf) 514 struct module_kobject *mk, char *buf)
515{ 515{
516 int count; 516 int count;
517 struct param_attribute *attribute = to_param_attr(mattr); 517 struct param_attribute *attribute = to_param_attr(mattr);
@@ -531,7 +531,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
531 531
532/* sysfs always hands a nul-terminated string in buf. We rely on that. */ 532/* sysfs always hands a nul-terminated string in buf. We rely on that. */
533static ssize_t param_attr_store(struct module_attribute *mattr, 533static ssize_t param_attr_store(struct module_attribute *mattr,
534 struct module *owner, 534 struct module_kobject *km,
535 const char *buf, size_t len) 535 const char *buf, size_t len)
536{ 536{
537 int err; 537 int err;
@@ -730,6 +730,10 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
730 mk->kobj.kset = module_kset; 730 mk->kobj.kset = module_kset;
731 err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, 731 err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
732 "%s", name); 732 "%s", name);
733#ifdef CONFIG_MODULES
734 if (!err)
735 err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
736#endif
733 if (err) { 737 if (err) {
734 kobject_put(&mk->kobj); 738 kobject_put(&mk->kobj);
735 printk(KERN_ERR 739 printk(KERN_ERR
@@ -807,7 +811,7 @@ static void __init param_sysfs_builtin(void)
807} 811}
808 812
809ssize_t __modver_version_show(struct module_attribute *mattr, 813ssize_t __modver_version_show(struct module_attribute *mattr,
810 struct module *mod, char *buf) 814 struct module_kobject *mk, char *buf)
811{ 815{
812 struct module_version_attribute *vattr = 816 struct module_version_attribute *vattr =
813 container_of(mattr, struct module_version_attribute, mattr); 817 container_of(mattr, struct module_version_attribute, mattr);
@@ -852,7 +856,7 @@ static ssize_t module_attr_show(struct kobject *kobj,
852 if (!attribute->show) 856 if (!attribute->show)
853 return -EIO; 857 return -EIO;
854 858
855 ret = attribute->show(attribute, mk->mod, buf); 859 ret = attribute->show(attribute, mk, buf);
856 860
857 return ret; 861 return ret;
858} 862}
@@ -871,7 +875,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
871 if (!attribute->store) 875 if (!attribute->store)
872 return -EIO; 876 return -EIO;
873 877
874 ret = attribute->store(attribute, mk->mod, buf, len); 878 ret = attribute->store(attribute, mk, buf, len);
875 879
876 return ret; 880 return ret;
877} 881}
diff --git a/kernel/pid.c b/kernel/pid.c
index 57a8346a270..e432057f3b2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -405,7 +405,6 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
405 if (pid) { 405 if (pid) {
406 struct hlist_node *first; 406 struct hlist_node *first;
407 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), 407 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
408 rcu_read_lock_held() ||
409 lockdep_tasklist_lock_is_held()); 408 lockdep_tasklist_lock_is_held());
410 if (first) 409 if (first)
411 result = hlist_entry(first, struct task_struct, pids[(type)].node); 410 result = hlist_entry(first, struct task_struct, pids[(type)].node);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 6824ca7d4d0..82da7ac3b1f 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -74,7 +74,7 @@ static DEFINE_SPINLOCK(pm_qos_lock);
74static struct pm_qos_object null_pm_qos; 74static struct pm_qos_object null_pm_qos;
75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
76static struct pm_qos_object cpu_dma_pm_qos = { 76static struct pm_qos_object cpu_dma_pm_qos = {
77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), 77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests),
78 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
79 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, 80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
@@ -84,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
84 84
85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
86static struct pm_qos_object network_lat_pm_qos = { 86static struct pm_qos_object network_lat_pm_qos = {
87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), 87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests),
88 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
89 .name = "network_latency", 89 .name = "network_latency",
90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, 90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
@@ -95,7 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = {
95 95
96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
97static struct pm_qos_object network_throughput_pm_qos = { 97static struct pm_qos_object network_throughput_pm_qos = {
98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), 98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests),
99 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput", 100 .name = "network_throughput",
101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, 101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
@@ -104,11 +104,59 @@ static struct pm_qos_object network_throughput_pm_qos = {
104}; 104};
105 105
106 106
107static BLOCKING_NOTIFIER_HEAD(min_online_cpus_notifier);
108static struct pm_qos_object min_online_cpus_pm_qos = {
109 .requests = PLIST_HEAD_INIT(min_online_cpus_pm_qos.requests),
110 .notifiers = &min_online_cpus_notifier,
111 .name = "min_online_cpus",
112 .target_value = PM_QOS_MIN_ONLINE_CPUS_DEFAULT_VALUE,
113 .default_value = PM_QOS_MIN_ONLINE_CPUS_DEFAULT_VALUE,
114 .type = PM_QOS_MAX,
115};
116
117
118static BLOCKING_NOTIFIER_HEAD(max_online_cpus_notifier);
119static struct pm_qos_object max_online_cpus_pm_qos = {
120 .requests = PLIST_HEAD_INIT(max_online_cpus_pm_qos.requests),
121 .notifiers = &max_online_cpus_notifier,
122 .name = "max_online_cpus",
123 .target_value = PM_QOS_MAX_ONLINE_CPUS_DEFAULT_VALUE,
124 .default_value = PM_QOS_MAX_ONLINE_CPUS_DEFAULT_VALUE,
125 .type = PM_QOS_MIN,
126};
127
128
129static BLOCKING_NOTIFIER_HEAD(cpu_freq_min_notifier);
130static struct pm_qos_object cpu_freq_min_pm_qos = {
131 .requests = PLIST_HEAD_INIT(cpu_freq_min_pm_qos.requests),
132 .notifiers = &cpu_freq_min_notifier,
133 .name = "cpu_freq_min",
134 .target_value = PM_QOS_CPU_FREQ_MIN_DEFAULT_VALUE,
135 .default_value = PM_QOS_CPU_FREQ_MIN_DEFAULT_VALUE,
136 .type = PM_QOS_MAX,
137};
138
139
140static BLOCKING_NOTIFIER_HEAD(cpu_freq_max_notifier);
141static struct pm_qos_object cpu_freq_max_pm_qos = {
142 .requests = PLIST_HEAD_INIT(cpu_freq_max_pm_qos.requests),
143 .notifiers = &cpu_freq_max_notifier,
144 .name = "cpu_freq_max",
145 .target_value = PM_QOS_CPU_FREQ_MAX_DEFAULT_VALUE,
146 .default_value = PM_QOS_CPU_FREQ_MAX_DEFAULT_VALUE,
147 .type = PM_QOS_MIN,
148};
149
150
107static struct pm_qos_object *pm_qos_array[] = { 151static struct pm_qos_object *pm_qos_array[] = {
108 &null_pm_qos, 152 &null_pm_qos,
109 &cpu_dma_pm_qos, 153 &cpu_dma_pm_qos,
110 &network_lat_pm_qos, 154 &network_lat_pm_qos,
111 &network_throughput_pm_qos 155 &network_throughput_pm_qos,
156 &min_online_cpus_pm_qos,
157 &max_online_cpus_pm_qos,
158 &cpu_freq_min_pm_qos,
159 &cpu_freq_max_pm_qos
112}; 160};
113 161
114static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 162static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
@@ -459,21 +507,18 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
459static int __init pm_qos_power_init(void) 507static int __init pm_qos_power_init(void)
460{ 508{
461 int ret = 0; 509 int ret = 0;
510 int i;
462 511
463 ret = register_pm_qos_misc(&cpu_dma_pm_qos); 512 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
464 if (ret < 0) { 513
465 printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); 514 for (i = 1; i < PM_QOS_NUM_CLASSES; i++) {
466 return ret; 515 ret = register_pm_qos_misc(pm_qos_array[i]);
467 } 516 if (ret < 0) {
468 ret = register_pm_qos_misc(&network_lat_pm_qos); 517 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
469 if (ret < 0) { 518 pm_qos_array[i]->name);
470 printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); 519 return ret;
471 return ret; 520 }
472 } 521 }
473 ret = register_pm_qos_misc(&network_throughput_pm_qos);
474 if (ret < 0)
475 printk(KERN_ERR
476 "pm_qos_param: network_throughput setup failed\n");
477 522
478 return ret; 523 return ret;
479} 524}
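The four new classes register through the same misc-device path as the existing ones, and kernel code constrains them with the usual pm_qos request calls. A sketch, assuming the companion pm_qos_params.h change defines PM_QOS_CPU_FREQ_MIN and that the request type keeps its 3.0-era name (the value units, e.g. kHz, are defined by that header):

#include <linux/pm_qos_params.h>

static struct pm_qos_request_list freq_req;

static void start_latency_sensitive_work(void)
{
	/* ask for at least 500000 (500 MHz if the class is expressed in kHz) */
	pm_qos_add_request(&freq_req, PM_QOS_CPU_FREQ_MIN, 500000);
}

static void stop_latency_sensitive_work(void)
{
	pm_qos_remove_request(&freq_req);
}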
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 58f405b581e..640ded8f5c4 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -250,7 +250,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
250 do { 250 do {
251 times->utime = cputime_add(times->utime, t->utime); 251 times->utime = cputime_add(times->utime, t->utime);
252 times->stime = cputime_add(times->stime, t->stime); 252 times->stime = cputime_add(times->stime, t->stime);
253 times->sum_exec_runtime += t->se.sum_exec_runtime; 253 times->sum_exec_runtime += task_sched_runtime(t);
254 } while_each_thread(tsk, t); 254 } while_each_thread(tsk, t);
255out: 255out:
256 rcu_read_unlock(); 256 rcu_read_unlock();
@@ -274,9 +274,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
274 struct task_cputime sum; 274 struct task_cputime sum;
275 unsigned long flags; 275 unsigned long flags;
276 276
277 spin_lock_irqsave(&cputimer->lock, flags);
278 if (!cputimer->running) { 277 if (!cputimer->running) {
279 cputimer->running = 1;
280 /* 278 /*
281 * The POSIX timer interface allows for absolute time expiry 279 * The POSIX timer interface allows for absolute time expiry
282 * values through the TIMER_ABSTIME flag, therefore we have 280 * values through the TIMER_ABSTIME flag, therefore we have
@@ -284,8 +282,11 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
284 * it. 282 * it.
285 */ 283 */
286 thread_group_cputime(tsk, &sum); 284 thread_group_cputime(tsk, &sum);
285 spin_lock_irqsave(&cputimer->lock, flags);
286 cputimer->running = 1;
287 update_gt_cputime(&cputimer->cputime, &sum); 287 update_gt_cputime(&cputimer->cputime, &sum);
288 } 288 } else
289 spin_lock_irqsave(&cputimer->lock, flags);
289 *times = cputimer->cputime; 290 *times = cputimer->cputime;
290 spin_unlock_irqrestore(&cputimer->lock, flags); 291 spin_unlock_irqrestore(&cputimer->lock, flags);
291} 292}
@@ -312,7 +313,8 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
312 cpu->cpu = cputime.utime; 313 cpu->cpu = cputime.utime;
313 break; 314 break;
314 case CPUCLOCK_SCHED: 315 case CPUCLOCK_SCHED:
315 cpu->sched = thread_group_sched_runtime(p); 316 thread_group_cputime(p, &cputime);
317 cpu->sched = cputime.sum_exec_runtime;
316 break; 318 break;
317 } 319 }
318 return 0; 320 return 0;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 87f4d24b55b..fcf5a834c4e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -18,6 +18,73 @@ config SUSPEND_FREEZER
18 18
19 Turning OFF this setting is NOT recommended! If in doubt, say Y. 19 Turning OFF this setting is NOT recommended! If in doubt, say Y.
20 20
21config HAS_WAKELOCK
22 bool
23
24config HAS_EARLYSUSPEND
25 bool
26
27config WAKELOCK
28 bool "Wake lock"
29 depends on PM && RTC_CLASS
30 default n
31 select HAS_WAKELOCK
32 ---help---
 33 Enable wakelocks. When user space requests a sleep state, the
34 sleep request will be delayed until no wake locks are held.
35
36config WAKELOCK_STAT
37 bool "Wake lock stats"
38 depends on WAKELOCK
39 default y
40 ---help---
41 Report wake lock stats in /proc/wakelocks
42
43config USER_WAKELOCK
44 bool "Userspace wake locks"
45 depends on WAKELOCK
46 default y
47 ---help---
 48 User-space wake lock API. Write "lockname" or "lockname timeout"
 49 to /sys/power/wake_lock to take (and, if needed, create) a wake lock.
50 Write "lockname" to /sys/power/wake_unlock to unlock a user wake
51 lock.
52
53config EARLYSUSPEND
54 bool "Early suspend"
55 depends on WAKELOCK
56 default y
57 select HAS_EARLYSUSPEND
58 ---help---
59 Call early suspend handlers when the user requested sleep state
60 changes.
61
62choice
63 prompt "User-space screen access"
64 default FB_EARLYSUSPEND if !FRAMEBUFFER_CONSOLE
65 default CONSOLE_EARLYSUSPEND
66 depends on HAS_EARLYSUSPEND
67
68 config NO_USER_SPACE_SCREEN_ACCESS_CONTROL
69 bool "None"
70
71 config CONSOLE_EARLYSUSPEND
72 bool "Console switch on early-suspend"
73 depends on HAS_EARLYSUSPEND && VT
74 ---help---
 75 Register an early suspend handler that performs a console switch
 76 when user-space should stop drawing to the screen and a switch
77 back when it should resume.
78
79 config FB_EARLYSUSPEND
80 bool "Sysfs interface"
81 depends on HAS_EARLYSUSPEND
82 ---help---
 83 Register an early suspend handler that notifies and waits for
84 user-space through sysfs when user-space should stop drawing
85 to the screen and notifies user-space when it should resume.
86endchoice
87
21config HIBERNATE_CALLBACKS 88config HIBERNATE_CALLBACKS
22 bool 89 bool
23 90
@@ -193,8 +260,8 @@ config APM_EMULATION
193 notification of APM "events" (e.g. battery status change). 260 notification of APM "events" (e.g. battery status change).
194 261
195 In order to use APM, you will need supporting software. For location 262 In order to use APM, you will need supporting software. For location
196 and more information, read <file:Documentation/power/pm.txt> and the 263 and more information, read <file:Documentation/power/apm-acpi.txt>
197 Battery Powered Linux mini-HOWTO, available from 264 and the Battery Powered Linux mini-HOWTO, available from
198 <http://www.tldp.org/docs.html#howto>. 265 <http://www.tldp.org/docs.html#howto>.
199 266
200 This driver does not spin down disk drives (see the hdparm(8) 267 This driver does not spin down disk drives (see the hdparm(8)
@@ -224,6 +291,21 @@ config PM_OPP
224 implementations a ready to use framework to manage OPPs. 291 implementations a ready to use framework to manage OPPs.
225 For more information, read <file:Documentation/power/opp.txt> 292 For more information, read <file:Documentation/power/opp.txt>
226 293
227config PM_RUNTIME_CLK 294config PM_CLK
295 def_bool y
296 depends on PM && HAVE_CLK
297
298config PM_GENERIC_DOMAINS
299 bool
300 depends on PM
301
302config PM_GENERIC_DOMAINS_RUNTIME
228 def_bool y 303 def_bool y
229 depends on PM_RUNTIME && HAVE_CLK 304 depends on PM_RUNTIME && PM_GENERIC_DOMAINS
305
306config SUSPEND_TIME
307 bool "Log time spent in suspend"
308 ---help---
309 Prints the time spent in suspend in the kernel log, and
310 keeps statistics on the time spent in suspend in
311 /sys/kernel/debug/suspend_time
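With USER_WAKELOCK enabled, the interface described in the help text is a pair of sysfs files under /sys/power. A user-space sketch (the optional timeout field and its units are defined by the wakelock code, not shown here):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* write "name" to wake_lock (take/create) or wake_unlock (release) */
static void user_wake_lock(const char *name, int lock)
{
	const char *path = lock ? "/sys/power/wake_lock"
				: "/sys/power/wake_unlock";
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return;
	if (write(fd, name, strlen(name)) < 0)
		perror(path);
	close(fd);
}

/* usage: user_wake_lock("net-rx", 1); ... do work ...; user_wake_lock("net-rx", 0); */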
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c5ebc6a9064..9b224e16b19 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,5 +8,11 @@ obj-$(CONFIG_SUSPEND) += suspend.o
8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
9obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ 9obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
10 block_io.o 10 block_io.o
11obj-$(CONFIG_WAKELOCK) += wakelock.o
12obj-$(CONFIG_USER_WAKELOCK) += userwakelock.o
13obj-$(CONFIG_EARLYSUSPEND) += earlysuspend.o
14obj-$(CONFIG_CONSOLE_EARLYSUSPEND) += consoleearlysuspend.o
15obj-$(CONFIG_FB_EARLYSUSPEND) += fbearlysuspend.o
16obj-$(CONFIG_SUSPEND_TIME) += suspend_time.o
11 17
12obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 18obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/consoleearlysuspend.c b/kernel/power/consoleearlysuspend.c
new file mode 100644
index 00000000000..a3edcb26738
--- /dev/null
+++ b/kernel/power/consoleearlysuspend.c
@@ -0,0 +1,78 @@
1/* kernel/power/consoleearlysuspend.c
2 *
3 * Copyright (C) 2005-2008 Google, Inc.
4 *
5 * This software is licensed under the terms of the GNU General Public
6 * License version 2, as published by the Free Software Foundation, and
7 * may be copied, distributed, and modified under those terms.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 */
15
16#include <linux/console.h>
17#include <linux/earlysuspend.h>
18#include <linux/kbd_kern.h>
19#include <linux/module.h>
20#include <linux/vt_kern.h>
21#include <linux/wait.h>
22
23#define EARLY_SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
24
25static int orig_fgconsole;
26static void console_early_suspend(struct early_suspend *h)
27{
28 acquire_console_sem();
29 orig_fgconsole = fg_console;
30 if (vc_allocate(EARLY_SUSPEND_CONSOLE))
31 goto err;
32 if (set_console(EARLY_SUSPEND_CONSOLE))
33 goto err;
34 release_console_sem();
35
36 if (vt_waitactive(EARLY_SUSPEND_CONSOLE + 1))
37 pr_warning("console_early_suspend: Can't switch VCs.\n");
38 return;
39err:
40 pr_warning("console_early_suspend: Can't set console\n");
41 release_console_sem();
42}
43
44static void console_late_resume(struct early_suspend *h)
45{
46 int ret;
47 acquire_console_sem();
48 ret = set_console(orig_fgconsole);
49 release_console_sem();
50 if (ret) {
51 pr_warning("console_late_resume: Can't set console.\n");
52 return;
53 }
54
55 if (vt_waitactive(orig_fgconsole + 1))
56 pr_warning("console_late_resume: Can't switch VCs.\n");
57}
58
59static struct early_suspend console_early_suspend_desc = {
60 .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING,
61 .suspend = console_early_suspend,
62 .resume = console_late_resume,
63};
64
65static int __init console_early_suspend_init(void)
66{
67 register_early_suspend(&console_early_suspend_desc);
68 return 0;
69}
70
71static void __exit console_early_suspend_exit(void)
72{
73 unregister_early_suspend(&console_early_suspend_desc);
74}
75
76module_init(console_early_suspend_init);
77module_exit(console_early_suspend_exit);
78
diff --git a/kernel/power/earlysuspend.c b/kernel/power/earlysuspend.c
new file mode 100644
index 00000000000..b15f02eba45
--- /dev/null
+++ b/kernel/power/earlysuspend.c
@@ -0,0 +1,187 @@
1/* kernel/power/earlysuspend.c
2 *
3 * Copyright (C) 2005-2008 Google, Inc.
4 *
5 * This software is licensed under the terms of the GNU General Public
6 * License version 2, as published by the Free Software Foundation, and
7 * may be copied, distributed, and modified under those terms.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 */
15
16#include <linux/earlysuspend.h>
17#include <linux/module.h>
18#include <linux/mutex.h>
19#include <linux/rtc.h>
20#include <linux/syscalls.h> /* sys_sync */
21#include <linux/wakelock.h>
22#include <linux/workqueue.h>
23
24#include "power.h"
25
26enum {
27 DEBUG_USER_STATE = 1U << 0,
28 DEBUG_SUSPEND = 1U << 2,
29 DEBUG_VERBOSE = 1U << 3,
30};
31static int debug_mask = DEBUG_USER_STATE;
32module_param_named(debug_mask, debug_mask, int, S_IRUGO | S_IWUSR | S_IWGRP);
33
34static DEFINE_MUTEX(early_suspend_lock);
35static LIST_HEAD(early_suspend_handlers);
36static void early_suspend(struct work_struct *work);
37static void late_resume(struct work_struct *work);
38static DECLARE_WORK(early_suspend_work, early_suspend);
39static DECLARE_WORK(late_resume_work, late_resume);
40static DEFINE_SPINLOCK(state_lock);
41enum {
42 SUSPEND_REQUESTED = 0x1,
43 SUSPENDED = 0x2,
44 SUSPEND_REQUESTED_AND_SUSPENDED = SUSPEND_REQUESTED | SUSPENDED,
45};
46static int state;
47
48void register_early_suspend(struct early_suspend *handler)
49{
50 struct list_head *pos;
51
52 mutex_lock(&early_suspend_lock);
53 list_for_each(pos, &early_suspend_handlers) {
54 struct early_suspend *e;
55 e = list_entry(pos, struct early_suspend, link);
56 if (e->level > handler->level)
57 break;
58 }
59 list_add_tail(&handler->link, pos);
60 if ((state & SUSPENDED) && handler->suspend)
61 handler->suspend(handler);
62 mutex_unlock(&early_suspend_lock);
63}
64EXPORT_SYMBOL(register_early_suspend);
65
66void unregister_early_suspend(struct early_suspend *handler)
67{
68 mutex_lock(&early_suspend_lock);
69 list_del(&handler->link);
70 mutex_unlock(&early_suspend_lock);
71}
72EXPORT_SYMBOL(unregister_early_suspend);
73
74static void early_suspend(struct work_struct *work)
75{
76 struct early_suspend *pos;
77 unsigned long irqflags;
78 int abort = 0;
79
80 mutex_lock(&early_suspend_lock);
81 spin_lock_irqsave(&state_lock, irqflags);
82 if (state == SUSPEND_REQUESTED)
83 state |= SUSPENDED;
84 else
85 abort = 1;
86 spin_unlock_irqrestore(&state_lock, irqflags);
87
88 if (abort) {
89 if (debug_mask & DEBUG_SUSPEND)
90 pr_info("early_suspend: abort, state %d\n", state);
91 mutex_unlock(&early_suspend_lock);
92 goto abort;
93 }
94
95 if (debug_mask & DEBUG_SUSPEND)
96 pr_info("early_suspend: call handlers\n");
97 list_for_each_entry(pos, &early_suspend_handlers, link) {
98 if (pos->suspend != NULL) {
99 if (debug_mask & DEBUG_VERBOSE)
100 pr_info("early_suspend: calling %pf\n", pos->suspend);
101 pos->suspend(pos);
102 }
103 }
104 mutex_unlock(&early_suspend_lock);
105
106 if (debug_mask & DEBUG_SUSPEND)
107 pr_info("early_suspend: sync\n");
108
109 sys_sync();
110abort:
111 spin_lock_irqsave(&state_lock, irqflags);
112 if (state == SUSPEND_REQUESTED_AND_SUSPENDED)
113 wake_unlock(&main_wake_lock);
114 spin_unlock_irqrestore(&state_lock, irqflags);
115}
116
117static void late_resume(struct work_struct *work)
118{
119 struct early_suspend *pos;
120 unsigned long irqflags;
121 int abort = 0;
122
123 mutex_lock(&early_suspend_lock);
124 spin_lock_irqsave(&state_lock, irqflags);
125 if (state == SUSPENDED)
126 state &= ~SUSPENDED;
127 else
128 abort = 1;
129 spin_unlock_irqrestore(&state_lock, irqflags);
130
131 if (abort) {
132 if (debug_mask & DEBUG_SUSPEND)
133 pr_info("late_resume: abort, state %d\n", state);
134 goto abort;
135 }
136 if (debug_mask & DEBUG_SUSPEND)
137 pr_info("late_resume: call handlers\n");
138 list_for_each_entry_reverse(pos, &early_suspend_handlers, link) {
139 if (pos->resume != NULL) {
140 if (debug_mask & DEBUG_VERBOSE)
141 pr_info("late_resume: calling %pf\n", pos->resume);
142
143 pos->resume(pos);
144 }
145 }
146 if (debug_mask & DEBUG_SUSPEND)
147 pr_info("late_resume: done\n");
148abort:
149 mutex_unlock(&early_suspend_lock);
150}
151
152void request_suspend_state(suspend_state_t new_state)
153{
154 unsigned long irqflags;
155 int old_sleep;
156
157 spin_lock_irqsave(&state_lock, irqflags);
158 old_sleep = state & SUSPEND_REQUESTED;
159 if (debug_mask & DEBUG_USER_STATE) {
160 struct timespec ts;
161 struct rtc_time tm;
162 getnstimeofday(&ts);
163 rtc_time_to_tm(ts.tv_sec, &tm);
164 pr_info("request_suspend_state: %s (%d->%d) at %lld "
165 "(%d-%02d-%02d %02d:%02d:%02d.%09lu UTC)\n",
166 new_state != PM_SUSPEND_ON ? "sleep" : "wakeup",
167 requested_suspend_state, new_state,
168 ktime_to_ns(ktime_get()),
169 tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
170 tm.tm_hour, tm.tm_min, tm.tm_sec, ts.tv_nsec);
171 }
172 if (!old_sleep && new_state != PM_SUSPEND_ON) {
173 state |= SUSPEND_REQUESTED;
174 queue_work(suspend_work_queue, &early_suspend_work);
175 } else if (old_sleep && new_state == PM_SUSPEND_ON) {
176 state &= ~SUSPEND_REQUESTED;
177 wake_lock(&main_wake_lock);
178 queue_work(suspend_work_queue, &late_resume_work);
179 }
180 requested_suspend_state = new_state;
181 spin_unlock_irqrestore(&state_lock, irqflags);
182}
183
184suspend_state_t get_suspend_state(void)
185{
186 return requested_suspend_state;
187}
diff --git a/kernel/power/fbearlysuspend.c b/kernel/power/fbearlysuspend.c
new file mode 100644
index 00000000000..15137650149
--- /dev/null
+++ b/kernel/power/fbearlysuspend.c
@@ -0,0 +1,153 @@
1/* kernel/power/fbearlysuspend.c
2 *
3 * Copyright (C) 2005-2008 Google, Inc.
4 *
5 * This software is licensed under the terms of the GNU General Public
6 * License version 2, as published by the Free Software Foundation, and
7 * may be copied, distributed, and modified under those terms.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 */
15
16#include <linux/earlysuspend.h>
17#include <linux/module.h>
18#include <linux/wait.h>
19
20#include "power.h"
21
22static wait_queue_head_t fb_state_wq;
23static DEFINE_SPINLOCK(fb_state_lock);
24static enum {
25 FB_STATE_STOPPED_DRAWING,
26 FB_STATE_REQUEST_STOP_DRAWING,
27 FB_STATE_DRAWING_OK,
28} fb_state;
29
30/* tell userspace to stop drawing, wait for it to stop */
31static void stop_drawing_early_suspend(struct early_suspend *h)
32{
33 int ret;
34 unsigned long irq_flags;
35
36 spin_lock_irqsave(&fb_state_lock, irq_flags);
37 fb_state = FB_STATE_REQUEST_STOP_DRAWING;
38 spin_unlock_irqrestore(&fb_state_lock, irq_flags);
39
40 wake_up_all(&fb_state_wq);
41 ret = wait_event_timeout(fb_state_wq,
42 fb_state == FB_STATE_STOPPED_DRAWING,
43 HZ);
44 if (unlikely(fb_state != FB_STATE_STOPPED_DRAWING))
45 pr_warning("stop_drawing_early_suspend: timeout waiting for "
46 "userspace to stop drawing\n");
47}
48
49/* tell userspace to start drawing */
50static void start_drawing_late_resume(struct early_suspend *h)
51{
52 unsigned long irq_flags;
53
54 spin_lock_irqsave(&fb_state_lock, irq_flags);
55 fb_state = FB_STATE_DRAWING_OK;
56 spin_unlock_irqrestore(&fb_state_lock, irq_flags);
57 wake_up(&fb_state_wq);
58}
59
60static struct early_suspend stop_drawing_early_suspend_desc = {
61 .level = EARLY_SUSPEND_LEVEL_STOP_DRAWING,
62 .suspend = stop_drawing_early_suspend,
63 .resume = start_drawing_late_resume,
64};
65
66static ssize_t wait_for_fb_sleep_show(struct kobject *kobj,
67 struct kobj_attribute *attr, char *buf)
68{
69 char *s = buf;
70 int ret;
71
72 ret = wait_event_interruptible(fb_state_wq,
73 fb_state != FB_STATE_DRAWING_OK);
74 if (ret && fb_state == FB_STATE_DRAWING_OK)
75 return ret;
76 else
77 s += sprintf(buf, "sleeping");
78 return s - buf;
79}
80
81static ssize_t wait_for_fb_wake_show(struct kobject *kobj,
82 struct kobj_attribute *attr, char *buf)
83{
84 char *s = buf;
85 int ret;
86 unsigned long irq_flags;
87
88 spin_lock_irqsave(&fb_state_lock, irq_flags);
89 if (fb_state == FB_STATE_REQUEST_STOP_DRAWING) {
90 fb_state = FB_STATE_STOPPED_DRAWING;
91 wake_up(&fb_state_wq);
92 }
93 spin_unlock_irqrestore(&fb_state_lock, irq_flags);
94
95 ret = wait_event_interruptible(fb_state_wq,
96 fb_state == FB_STATE_DRAWING_OK);
97 if (ret && fb_state != FB_STATE_DRAWING_OK)
98 return ret;
99 else
100 s += sprintf(buf, "awake");
101
102 return s - buf;
103}
104
105#define power_ro_attr(_name) \
106static struct kobj_attribute _name##_attr = { \
107 .attr = { \
108 .name = __stringify(_name), \
109 .mode = 0444, \
110 }, \
111 .show = _name##_show, \
112 .store = NULL, \
113}
114
115power_ro_attr(wait_for_fb_sleep);
116power_ro_attr(wait_for_fb_wake);
117
118static struct attribute *g[] = {
119 &wait_for_fb_sleep_attr.attr,
120 &wait_for_fb_wake_attr.attr,
121 NULL,
122};
123
124static struct attribute_group attr_group = {
125 .attrs = g,
126};
127
128static int __init android_power_init(void)
129{
130 int ret;
131
132 init_waitqueue_head(&fb_state_wq);
133 fb_state = FB_STATE_DRAWING_OK;
134
135 ret = sysfs_create_group(power_kobj, &attr_group);
136 if (ret) {
137 pr_err("android_power_init: sysfs_create_group failed\n");
138 return ret;
139 }
140
141 register_early_suspend(&stop_drawing_early_suspend_desc);
142 return 0;
143}
144
145static void __exit android_power_exit(void)
146{
147 unregister_early_suspend(&stop_drawing_early_suspend_desc);
148 sysfs_remove_group(power_kobj, &attr_group);
149}
150
151module_init(android_power_init);
152module_exit(android_power_exit);
153
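The two sysfs files implement a handshake with the display server: reading wait_for_fb_sleep blocks until early suspend begins, and the following read of wait_for_fb_wake acknowledges that drawing has stopped and then blocks until late resume. A user-space consumer might loop like this (sketch):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char buf[16];

	for (;;) {
		/* blocks until the kernel asks user space to stop drawing */
		int fd = open("/sys/power/wait_for_fb_sleep", O_RDONLY);
		if (fd < 0 || read(fd, buf, sizeof(buf)) < 0)
			return 1;
		close(fd);
		/* ... stop drawing, release the framebuffer ... */

		/* acknowledges the stop, then blocks until resume */
		fd = open("/sys/power/wait_for_fb_wake", O_RDONLY);
		if (fd < 0 || read(fd, buf, sizeof(buf)) < 0)
			return 1;
		close(fd);
		/* ... resume drawing ... */
	}
}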
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 2981af4ce7c..3304594553c 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -37,8 +37,9 @@ EXPORT_SYMBOL_GPL(unregister_pm_notifier);
37 37
38int pm_notifier_call_chain(unsigned long val) 38int pm_notifier_call_chain(unsigned long val)
39{ 39{
40 return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) 40 int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL);
41 == NOTIFY_BAD) ? -EINVAL : 0; 41
42 return notifier_to_errno(ret);
42} 43}
43 44
44/* If set, devices may be suspended and resumed asynchronously. */ 45/* If set, devices may be suspended and resumed asynchronously. */
@@ -170,7 +171,11 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
170 const char *buf, size_t n) 171 const char *buf, size_t n)
171{ 172{
172#ifdef CONFIG_SUSPEND 173#ifdef CONFIG_SUSPEND
174#ifdef CONFIG_EARLYSUSPEND
175 suspend_state_t state = PM_SUSPEND_ON;
176#else
173 suspend_state_t state = PM_SUSPEND_STANDBY; 177 suspend_state_t state = PM_SUSPEND_STANDBY;
178#endif
174 const char * const *s; 179 const char * const *s;
175#endif 180#endif
176 char *p; 181 char *p;
@@ -192,8 +197,15 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
192 break; 197 break;
193 } 198 }
194 if (state < PM_SUSPEND_MAX && *s) 199 if (state < PM_SUSPEND_MAX && *s)
200#ifdef CONFIG_EARLYSUSPEND
201 if (state == PM_SUSPEND_ON || valid_state(state)) {
202 error = 0;
203 request_suspend_state(state);
204 }
205#else
195 error = enter_state(state); 206 error = enter_state(state);
196#endif 207#endif
208#endif
197 209
198 Exit: 210 Exit:
199 return error ? error : n; 211 return error ? error : n;
@@ -297,6 +309,11 @@ power_attr(pm_trace_dev_match);
297 309
298#endif /* CONFIG_PM_TRACE */ 310#endif /* CONFIG_PM_TRACE */
299 311
312#ifdef CONFIG_USER_WAKELOCK
313power_attr(wake_lock);
314power_attr(wake_unlock);
315#endif
316
300static struct attribute * g[] = { 317static struct attribute * g[] = {
301 &state_attr.attr, 318 &state_attr.attr,
302#ifdef CONFIG_PM_TRACE 319#ifdef CONFIG_PM_TRACE
@@ -309,6 +326,10 @@ static struct attribute * g[] = {
309#ifdef CONFIG_PM_DEBUG 326#ifdef CONFIG_PM_DEBUG
310 &pm_test_attr.attr, 327 &pm_test_attr.attr,
311#endif 328#endif
329#ifdef CONFIG_USER_WAKELOCK
330 &wake_lock_attr.attr,
331 &wake_unlock_attr.attr,
332#endif
312#endif 333#endif
313 NULL, 334 NULL,
314}; 335};
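Since pm_notifier_call_chain() now goes through notifier_to_errno(), a PM notifier can veto a transition with a specific error code instead of everything collapsing to -EINVAL. A hypothetical notifier:

#include <linux/suspend.h>
#include <linux/notifier.h>

extern bool my_device_busy(void);	/* hypothetical driver state check */

static int my_pm_callback(struct notifier_block *nb,
			  unsigned long event, void *unused)
{
	if (event == PM_SUSPEND_PREPARE && my_device_busy())
		return notifier_from_errno(-EBUSY);	/* surfaces as -EBUSY to the caller */
	return NOTIFY_OK;
}

static struct notifier_block my_pm_nb = {
	.notifier_call = my_pm_callback,
};

/* in driver init: register_pm_notifier(&my_pm_nb); */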
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9a00a0a2628..b6b9006480f 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -245,3 +245,27 @@ static inline void suspend_thaw_processes(void)
245{ 245{
246} 246}
247#endif 247#endif
248
249#ifdef CONFIG_WAKELOCK
250/* kernel/power/wakelock.c */
251extern struct workqueue_struct *suspend_work_queue;
252extern struct wake_lock main_wake_lock;
253extern suspend_state_t requested_suspend_state;
254#endif
255
256#ifdef CONFIG_USER_WAKELOCK
257ssize_t wake_lock_show(struct kobject *kobj, struct kobj_attribute *attr,
258 char *buf);
259ssize_t wake_lock_store(struct kobject *kobj, struct kobj_attribute *attr,
260 const char *buf, size_t n);
261ssize_t wake_unlock_show(struct kobject *kobj, struct kobj_attribute *attr,
262 char *buf);
263ssize_t wake_unlock_store(struct kobject *kobj, struct kobj_attribute *attr,
264 const char *buf, size_t n);
265#endif
266
267#ifdef CONFIG_EARLYSUSPEND
268/* kernel/power/earlysuspend.c */
269void request_suspend_state(suspend_state_t state);
270suspend_state_t get_suspend_state(void);
271#endif
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0cf3a27a6c9..31338cdeafc 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -16,6 +16,7 @@
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/workqueue.h> 18#include <linux/workqueue.h>
19#include <linux/wakelock.h>
19 20
20/* 21/*
21 * Timeout for stopping processes 22 * Timeout for stopping processes
@@ -82,6 +83,10 @@ static int try_to_freeze_tasks(bool sig_only)
82 todo += wq_busy; 83 todo += wq_busy;
83 } 84 }
84 85
86 if (todo && has_wake_lock(WAKE_LOCK_SUSPEND)) {
87 wakeup = 1;
88 break;
89 }
85 if (!todo || time_after(jiffies, end_time)) 90 if (!todo || time_after(jiffies, end_time))
86 break; 91 break;
87 92
@@ -108,19 +113,25 @@ static int try_to_freeze_tasks(bool sig_only)
108 * and caller must call thaw_processes() if something fails), 113 * and caller must call thaw_processes() if something fails),
109 * but it cleans up leftover PF_FREEZE requests. 114 * but it cleans up leftover PF_FREEZE requests.
110 */ 115 */
 111 printk("\n"); 116 if (wakeup) {
112 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " 117 printk("\n");
113 "(%d tasks refusing to freeze, wq_busy=%d):\n", 118 printk(KERN_ERR "Freezing of %s aborted\n",
114 wakeup ? "aborted" : "failed", 119 sig_only ? "user space " : "tasks ");
115 elapsed_csecs / 100, elapsed_csecs % 100, 120 }
116 todo - wq_busy, wq_busy); 121 else {
117 122 printk("\n");
123 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
124 "(%d tasks refusing to freeze, wq_busy=%d):\n",
125 elapsed_csecs / 100, elapsed_csecs % 100,
126 todo - wq_busy, wq_busy);
127 }
118 thaw_workqueues(); 128 thaw_workqueues();
119 129
120 read_lock(&tasklist_lock); 130 read_lock(&tasklist_lock);
121 do_each_thread(g, p) { 131 do_each_thread(g, p) {
122 task_lock(p); 132 task_lock(p);
123 if (!wakeup && freezing(p) && !freezer_should_skip(p)) 133 if (freezing(p) && !freezer_should_skip(p) &&
134 elapsed_csecs > 100)
124 sched_show_task(p); 135 sched_show_task(p);
125 cancel_freezing(p); 136 cancel_freezing(p);
126 task_unlock(p); 137 task_unlock(p);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 1c41ba21541..a6f6e3114a2 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -28,6 +28,9 @@
28#include "power.h" 28#include "power.h"
29 29
30const char *const pm_states[PM_SUSPEND_MAX] = { 30const char *const pm_states[PM_SUSPEND_MAX] = {
31#ifdef CONFIG_EARLYSUSPEND
32 [PM_SUSPEND_ON] = "on",
33#endif
31 [PM_SUSPEND_STANDBY] = "standby", 34 [PM_SUSPEND_STANDBY] = "standby",
32 [PM_SUSPEND_MEM] = "mem", 35 [PM_SUSPEND_MEM] = "mem",
33}; 36};
@@ -44,6 +47,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops)
44 suspend_ops = ops; 47 suspend_ops = ops;
45 mutex_unlock(&pm_mutex); 48 mutex_unlock(&pm_mutex);
46} 49}
50EXPORT_SYMBOL_GPL(suspend_set_ops);
47 51
48bool valid_state(suspend_state_t state) 52bool valid_state(suspend_state_t state)
49{ 53{
@@ -65,6 +69,7 @@ int suspend_valid_only_mem(suspend_state_t state)
65{ 69{
66 return state == PM_SUSPEND_MEM; 70 return state == PM_SUSPEND_MEM;
67} 71}
72EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
68 73
69static int suspend_test(int level) 74static int suspend_test(int level)
70{ 75{
@@ -126,12 +131,13 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
126} 131}
127 132
128/** 133/**
129 * suspend_enter - enter the desired system sleep state. 134 * suspend_enter - enter the desired system sleep state.
130 * @state: state to enter 135 * @state: State to enter
 136 * @wakeup: Set when a wakeup event is pending; suspend should not be entered again.
131 * 137 *
132 * This function should be called after devices have been suspended. 138 * This function should be called after devices have been suspended.
133 */ 139 */
134static int suspend_enter(suspend_state_t state) 140static int suspend_enter(suspend_state_t state, bool *wakeup)
135{ 141{
136 int error; 142 int error;
137 143
@@ -165,7 +171,8 @@ static int suspend_enter(suspend_state_t state)
165 171
166 error = syscore_suspend(); 172 error = syscore_suspend();
167 if (!error) { 173 if (!error) {
168 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { 174 *wakeup = pm_wakeup_pending();
175 if (!(suspend_test(TEST_CORE) || *wakeup)) {
169 error = suspend_ops->enter(state); 176 error = suspend_ops->enter(state);
170 events_check_enabled = false; 177 events_check_enabled = false;
171 } 178 }
@@ -199,6 +206,7 @@ static int suspend_enter(suspend_state_t state)
199int suspend_devices_and_enter(suspend_state_t state) 206int suspend_devices_and_enter(suspend_state_t state)
200{ 207{
201 int error; 208 int error;
209 bool wakeup = false;
202 210
203 if (!suspend_ops) 211 if (!suspend_ops)
204 return -ENOSYS; 212 return -ENOSYS;
@@ -220,7 +228,10 @@ int suspend_devices_and_enter(suspend_state_t state)
220 if (suspend_test(TEST_DEVICES)) 228 if (suspend_test(TEST_DEVICES))
221 goto Recover_platform; 229 goto Recover_platform;
222 230
223 error = suspend_enter(state); 231 do {
232 error = suspend_enter(state, &wakeup);
233 } while (!error && !wakeup
234 && suspend_ops->suspend_again && suspend_ops->suspend_again());
224 235
225 Resume_devices: 236 Resume_devices:
226 suspend_test_start(); 237 suspend_test_start();
@@ -307,7 +318,7 @@ int enter_state(suspend_state_t state)
307 */ 318 */
308int pm_suspend(suspend_state_t state) 319int pm_suspend(suspend_state_t state)
309{ 320{
310 if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) 321 if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX)
311 return enter_state(state); 322 return enter_state(state);
312 return -EINVAL; 323 return -EINVAL;
313} 324}
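The new loop stays in suspend for as long as the platform's optional suspend_again() callback asks for it and no wakeup event is pending, e.g. to poll a charger without fully resuming. A hedged sketch of suspend_ops using it (the board helpers are hypothetical):

#include <linux/suspend.h>

extern int my_board_enter_sleep(void);		/* hypothetical */
extern bool my_board_charger_active(void);	/* hypothetical */

static int my_suspend_enter(suspend_state_t state)
{
	return my_board_enter_sleep();
}

static bool my_suspend_again(void)
{
	/* true: run suspend_enter() again without waking user space */
	return my_board_charger_active();
}

static const struct platform_suspend_ops my_suspend_ops = {
	.valid		= suspend_valid_only_mem,
	.enter		= my_suspend_enter,
	.suspend_again	= my_suspend_again,
};

/* board init: suspend_set_ops(&my_suspend_ops); */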
diff --git a/kernel/power/suspend_time.c b/kernel/power/suspend_time.c
new file mode 100644
index 00000000000..d2a65da9f22
--- /dev/null
+++ b/kernel/power/suspend_time.c
@@ -0,0 +1,111 @@
1/*
2 * debugfs file to track time spent in suspend
3 *
4 * Copyright (c) 2011, Google, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 */
16
17#include <linux/debugfs.h>
18#include <linux/err.h>
19#include <linux/init.h>
20#include <linux/kernel.h>
21#include <linux/seq_file.h>
22#include <linux/syscore_ops.h>
23#include <linux/time.h>
24
25static struct timespec suspend_time_before;
26static unsigned int time_in_suspend_bins[32];
27
28#ifdef CONFIG_DEBUG_FS
29static int suspend_time_debug_show(struct seq_file *s, void *data)
30{
31 int bin;
32 seq_printf(s, "time (secs) count\n");
33 seq_printf(s, "------------------\n");
34 for (bin = 0; bin < 32; bin++) {
35 if (time_in_suspend_bins[bin] == 0)
36 continue;
37 seq_printf(s, "%4d - %4d %4u\n",
38 bin ? 1 << (bin - 1) : 0, 1 << bin,
39 time_in_suspend_bins[bin]);
40 }
41 return 0;
42}
43
44static int suspend_time_debug_open(struct inode *inode, struct file *file)
45{
46 return single_open(file, suspend_time_debug_show, NULL);
47}
48
49static const struct file_operations suspend_time_debug_fops = {
50 .open = suspend_time_debug_open,
51 .read = seq_read,
52 .llseek = seq_lseek,
53 .release = single_release,
54};
55
56static int __init suspend_time_debug_init(void)
57{
58 struct dentry *d;
59
60 d = debugfs_create_file("suspend_time", 0755, NULL, NULL,
61 &suspend_time_debug_fops);
62 if (!d) {
63 pr_err("Failed to create suspend_time debug file\n");
64 return -ENOMEM;
65 }
66
67 return 0;
68}
69
70late_initcall(suspend_time_debug_init);
71#endif
72
73static int suspend_time_syscore_suspend(void)
74{
75 read_persistent_clock(&suspend_time_before);
76
77 return 0;
78}
79
80static void suspend_time_syscore_resume(void)
81{
82 struct timespec after;
83
84 read_persistent_clock(&after);
85
86 after = timespec_sub(after, suspend_time_before);
87
88 time_in_suspend_bins[fls(after.tv_sec)]++;
89
90 pr_info("Suspended for %lu.%03lu seconds\n", after.tv_sec,
91 after.tv_nsec / NSEC_PER_MSEC);
92}
93
94static struct syscore_ops suspend_time_syscore_ops = {
95 .suspend = suspend_time_syscore_suspend,
96 .resume = suspend_time_syscore_resume,
97};
98
99static int suspend_time_syscore_init(void)
100{
101 register_syscore_ops(&suspend_time_syscore_ops);
102
103 return 0;
104}
105
106static void suspend_time_syscore_exit(void)
107{
108 unregister_syscore_ops(&suspend_time_syscore_ops);
109}
110module_init(suspend_time_syscore_init);
111module_exit(suspend_time_syscore_exit);
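
suspend_time.c buckets each suspend duration into power-of-two bins via fls(after.tv_sec), so bin N covers [2^(N-1), 2^N) seconds and bin 0 holds sub-second suspends. A small user-space check of that mapping, mirroring the kernel's fls() with a compiler builtin (illustration only, not kernel code):

#include <stdio.h>

static int fls_local(unsigned int x)    /* mirrors the kernel's fls() */
{
        return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
        unsigned int secs[] = { 0, 1, 2, 3, 7, 8, 100, 4096 };
        unsigned int i;

        for (i = 0; i < sizeof(secs) / sizeof(secs[0]); i++) {
                int bin = fls_local(secs[i]);

                printf("%5u s -> bin %2d (%d - %d)\n", secs[i], bin,
                       bin ? 1 << (bin - 1) : 0, 1 << bin);
        }
        return 0;
}
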
diff --git a/kernel/power/userwakelock.c b/kernel/power/userwakelock.c
new file mode 100644
index 00000000000..a28a8db4146
--- /dev/null
+++ b/kernel/power/userwakelock.c
@@ -0,0 +1,219 @@
1/* kernel/power/userwakelock.c
2 *
3 * Copyright (C) 2005-2008 Google, Inc.
4 *
5 * This software is licensed under the terms of the GNU General Public
6 * License version 2, as published by the Free Software Foundation, and
7 * may be copied, distributed, and modified under those terms.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 */
15
16#include <linux/ctype.h>
17#include <linux/module.h>
18#include <linux/wakelock.h>
19#include <linux/slab.h>
20
21#include "power.h"
22
23enum {
24 DEBUG_FAILURE = BIT(0),
25 DEBUG_ERROR = BIT(1),
26 DEBUG_NEW = BIT(2),
27 DEBUG_ACCESS = BIT(3),
28 DEBUG_LOOKUP = BIT(4),
29};
30static int debug_mask = DEBUG_FAILURE;
31module_param_named(debug_mask, debug_mask, int, S_IRUGO | S_IWUSR | S_IWGRP);
32
33static DEFINE_MUTEX(tree_lock);
34
35struct user_wake_lock {
36 struct rb_node node;
37 struct wake_lock wake_lock;
38 char name[0];
39};
40struct rb_root user_wake_locks;
41
42static struct user_wake_lock *lookup_wake_lock_name(
43 const char *buf, int allocate, long *timeoutptr)
44{
45 struct rb_node **p = &user_wake_locks.rb_node;
46 struct rb_node *parent = NULL;
47 struct user_wake_lock *l;
48 int diff;
49 u64 timeout;
50 int name_len;
51 const char *arg;
52
53 /* Find length of lock name and start of optional timeout string */
54 arg = buf;
55 while (*arg && !isspace(*arg))
56 arg++;
57 name_len = arg - buf;
58 if (!name_len)
59 goto bad_arg;
60 while (isspace(*arg))
61 arg++;
62
63 /* Process timeout string */
64 if (timeoutptr && *arg) {
65 timeout = simple_strtoull(arg, (char **)&arg, 0);
66 while (isspace(*arg))
67 arg++;
68 if (*arg)
69 goto bad_arg;
70 /* convert timeout from nanoseconds to jiffies > 0 */
71 timeout += (NSEC_PER_SEC / HZ) - 1;
72 do_div(timeout, (NSEC_PER_SEC / HZ));
73 if (timeout <= 0)
74 timeout = 1;
75 *timeoutptr = timeout;
76 } else if (*arg)
77 goto bad_arg;
78 else if (timeoutptr)
79 *timeoutptr = 0;
80
81 /* Lookup wake lock in rbtree */
82 while (*p) {
83 parent = *p;
84 l = rb_entry(parent, struct user_wake_lock, node);
85 diff = strncmp(buf, l->name, name_len);
86 if (!diff && l->name[name_len])
87 diff = -1;
88 if (debug_mask & DEBUG_ERROR)
89 pr_info("lookup_wake_lock_name: compare %.*s %s %d\n",
90 name_len, buf, l->name, diff);
91
92 if (diff < 0)
93 p = &(*p)->rb_left;
94 else if (diff > 0)
95 p = &(*p)->rb_right;
96 else
97 return l;
98 }
99
100 /* Allocate and add new wakelock to rbtree */
101 if (!allocate) {
102 if (debug_mask & DEBUG_ERROR)
103 pr_info("lookup_wake_lock_name: %.*s not found\n",
104 name_len, buf);
105 return ERR_PTR(-EINVAL);
106 }
107 l = kzalloc(sizeof(*l) + name_len + 1, GFP_KERNEL);
108 if (l == NULL) {
109 if (debug_mask & DEBUG_FAILURE)
110 pr_err("lookup_wake_lock_name: failed to allocate "
111 "memory for %.*s\n", name_len, buf);
112 return ERR_PTR(-ENOMEM);
113 }
114 memcpy(l->name, buf, name_len);
115 if (debug_mask & DEBUG_NEW)
116 pr_info("lookup_wake_lock_name: new wake lock %s\n", l->name);
117 wake_lock_init(&l->wake_lock, WAKE_LOCK_SUSPEND, l->name);
118 rb_link_node(&l->node, parent, p);
119 rb_insert_color(&l->node, &user_wake_locks);
120 return l;
121
122bad_arg:
123 if (debug_mask & DEBUG_ERROR)
124 pr_info("lookup_wake_lock_name: wake lock, %.*s, bad arg, %s\n",
125 name_len, buf, arg);
126 return ERR_PTR(-EINVAL);
127}
128
129ssize_t wake_lock_show(
130 struct kobject *kobj, struct kobj_attribute *attr, char *buf)
131{
132 char *s = buf;
133 char *end = buf + PAGE_SIZE;
134 struct rb_node *n;
135 struct user_wake_lock *l;
136
137 mutex_lock(&tree_lock);
138
139 for (n = rb_first(&user_wake_locks); n != NULL; n = rb_next(n)) {
140 l = rb_entry(n, struct user_wake_lock, node);
141 if (wake_lock_active(&l->wake_lock))
142 s += scnprintf(s, end - s, "%s ", l->name);
143 }
144 s += scnprintf(s, end - s, "\n");
145
146 mutex_unlock(&tree_lock);
147 return (s - buf);
148}
149
150ssize_t wake_lock_store(
151 struct kobject *kobj, struct kobj_attribute *attr,
152 const char *buf, size_t n)
153{
154 long timeout;
155 struct user_wake_lock *l;
156
157 mutex_lock(&tree_lock);
158 l = lookup_wake_lock_name(buf, 1, &timeout);
159 if (IS_ERR(l)) {
160 n = PTR_ERR(l);
161 goto bad_name;
162 }
163
164 if (debug_mask & DEBUG_ACCESS)
165 pr_info("wake_lock_store: %s, timeout %ld\n", l->name, timeout);
166
167 if (timeout)
168 wake_lock_timeout(&l->wake_lock, timeout);
169 else
170 wake_lock(&l->wake_lock);
171bad_name:
172 mutex_unlock(&tree_lock);
173 return n;
174}
175
176
177ssize_t wake_unlock_show(
178 struct kobject *kobj, struct kobj_attribute *attr, char *buf)
179{
180 char *s = buf;
181 char *end = buf + PAGE_SIZE;
182 struct rb_node *n;
183 struct user_wake_lock *l;
184
185 mutex_lock(&tree_lock);
186
187 for (n = rb_first(&user_wake_locks); n != NULL; n = rb_next(n)) {
188 l = rb_entry(n, struct user_wake_lock, node);
189 if (!wake_lock_active(&l->wake_lock))
190 s += scnprintf(s, end - s, "%s ", l->name);
191 }
192 s += scnprintf(s, end - s, "\n");
193
194 mutex_unlock(&tree_lock);
195 return (s - buf);
196}
197
198ssize_t wake_unlock_store(
199 struct kobject *kobj, struct kobj_attribute *attr,
200 const char *buf, size_t n)
201{
202 struct user_wake_lock *l;
203
204 mutex_lock(&tree_lock);
205 l = lookup_wake_lock_name(buf, 0, NULL);
206 if (IS_ERR(l)) {
207 n = PTR_ERR(l);
208 goto not_found;
209 }
210
211 if (debug_mask & DEBUG_ACCESS)
212 pr_info("wake_unlock_store: %s\n", l->name);
213
214 wake_unlock(&l->wake_lock);
215not_found:
216 mutex_unlock(&tree_lock);
217 return n;
218}
219
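
wake_lock_store() parses a lock name plus an optional timeout in nanoseconds, and wake_unlock_store() releases by name. Assuming the /sys/power/wake_lock and /sys/power/wake_unlock attributes that the rest of this patch set binds these handlers to, a user-space client could look like the following sketch (the paths and the 5 s timeout are illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

static int write_str(const char *path, const char *s)
{
        int fd = open(path, O_WRONLY);
        ssize_t n;

        if (fd < 0)
                return -1;
        n = write(fd, s, strlen(s));
        close(fd);
        return n < 0 ? -1 : 0;
}

int main(void)
{
        /* Hold "myapp" for 5 seconds (5e9 ns, converted to jiffies in-kernel). */
        if (write_str("/sys/power/wake_lock", "myapp 5000000000"))
                perror("wake_lock");

        /* ... latency-sensitive work ... */

        /* Drop the lock early instead of waiting for the timeout. */
        if (write_str("/sys/power/wake_unlock", "myapp"))
                perror("wake_unlock");
        return 0;
}
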
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
new file mode 100644
index 00000000000..81e1b7c65ca
--- /dev/null
+++ b/kernel/power/wakelock.c
@@ -0,0 +1,634 @@
1/* kernel/power/wakelock.c
2 *
3 * Copyright (C) 2005-2008 Google, Inc.
4 *
5 * This software is licensed under the terms of the GNU General Public
6 * License version 2, as published by the Free Software Foundation, and
7 * may be copied, distributed, and modified under those terms.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 */
15
16#include <linux/module.h>
17#include <linux/platform_device.h>
18#include <linux/rtc.h>
19#include <linux/suspend.h>
20#include <linux/syscalls.h> /* sys_sync */
21#include <linux/wakelock.h>
22#ifdef CONFIG_WAKELOCK_STAT
23#include <linux/proc_fs.h>
24#endif
25#include "power.h"
26
27enum {
28 DEBUG_EXIT_SUSPEND = 1U << 0,
29 DEBUG_WAKEUP = 1U << 1,
30 DEBUG_SUSPEND = 1U << 2,
31 DEBUG_EXPIRE = 1U << 3,
32 DEBUG_WAKE_LOCK = 1U << 4,
33};
34static int debug_mask = DEBUG_EXIT_SUSPEND | DEBUG_WAKEUP;
35module_param_named(debug_mask, debug_mask, int, S_IRUGO | S_IWUSR | S_IWGRP);
36
37#define WAKE_LOCK_TYPE_MASK (0x0f)
38#define WAKE_LOCK_INITIALIZED (1U << 8)
39#define WAKE_LOCK_ACTIVE (1U << 9)
40#define WAKE_LOCK_AUTO_EXPIRE (1U << 10)
41#define WAKE_LOCK_PREVENTING_SUSPEND (1U << 11)
42
43static DEFINE_SPINLOCK(list_lock);
44static LIST_HEAD(inactive_locks);
45static struct list_head active_wake_locks[WAKE_LOCK_TYPE_COUNT];
46static int current_event_num;
47struct workqueue_struct *suspend_work_queue;
48struct wake_lock main_wake_lock;
49suspend_state_t requested_suspend_state = PM_SUSPEND_MEM;
50static struct wake_lock unknown_wakeup;
51static struct wake_lock suspend_backoff_lock;
52
53#define SUSPEND_BACKOFF_THRESHOLD 10
54#define SUSPEND_BACKOFF_INTERVAL 10000
55
56static unsigned suspend_short_count;
57
58#ifdef CONFIG_WAKELOCK_STAT
59static struct wake_lock deleted_wake_locks;
60static ktime_t last_sleep_time_update;
61static int wait_for_wakeup;
62
63int get_expired_time(struct wake_lock *lock, ktime_t *expire_time)
64{
65 struct timespec ts;
66 struct timespec kt;
67 struct timespec tomono;
68 struct timespec delta;
69 struct timespec sleep;
70 long timeout;
71
72 if (!(lock->flags & WAKE_LOCK_AUTO_EXPIRE))
73 return 0;
74 get_xtime_and_monotonic_and_sleep_offset(&kt, &tomono, &sleep);
75 timeout = lock->expires - jiffies;
76 if (timeout > 0)
77 return 0;
78 jiffies_to_timespec(-timeout, &delta);
79 set_normalized_timespec(&ts, kt.tv_sec + tomono.tv_sec - delta.tv_sec,
80 kt.tv_nsec + tomono.tv_nsec - delta.tv_nsec);
81 *expire_time = timespec_to_ktime(ts);
82 return 1;
83}
84
85
86static int print_lock_stat(struct seq_file *m, struct wake_lock *lock)
87{
88 int lock_count = lock->stat.count;
89 int expire_count = lock->stat.expire_count;
90 ktime_t active_time = ktime_set(0, 0);
91 ktime_t total_time = lock->stat.total_time;
92 ktime_t max_time = lock->stat.max_time;
93
94 ktime_t prevent_suspend_time = lock->stat.prevent_suspend_time;
95 if (lock->flags & WAKE_LOCK_ACTIVE) {
96 ktime_t now, add_time;
97 int expired = get_expired_time(lock, &now);
98 if (!expired)
99 now = ktime_get();
100 add_time = ktime_sub(now, lock->stat.last_time);
101 lock_count++;
102 if (!expired)
103 active_time = add_time;
104 else
105 expire_count++;
106 total_time = ktime_add(total_time, add_time);
107 if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND)
108 prevent_suspend_time = ktime_add(prevent_suspend_time,
109 ktime_sub(now, last_sleep_time_update));
110 if (add_time.tv64 > max_time.tv64)
111 max_time = add_time;
112 }
113
114 return seq_printf(m,
115 "\"%s\"\t%d\t%d\t%d\t%lld\t%lld\t%lld\t%lld\t%lld\n",
116 lock->name, lock_count, expire_count,
117 lock->stat.wakeup_count, ktime_to_ns(active_time),
118 ktime_to_ns(total_time),
119 ktime_to_ns(prevent_suspend_time), ktime_to_ns(max_time),
120 ktime_to_ns(lock->stat.last_time));
121}
122
123static int wakelock_stats_show(struct seq_file *m, void *unused)
124{
125 unsigned long irqflags;
126 struct wake_lock *lock;
127 int ret;
128 int type;
129
130 spin_lock_irqsave(&list_lock, irqflags);
131
132 ret = seq_puts(m, "name\tcount\texpire_count\twake_count\tactive_since"
133 "\ttotal_time\tsleep_time\tmax_time\tlast_change\n");
134 list_for_each_entry(lock, &inactive_locks, link)
135 ret = print_lock_stat(m, lock);
136 for (type = 0; type < WAKE_LOCK_TYPE_COUNT; type++) {
137 list_for_each_entry(lock, &active_wake_locks[type], link)
138 ret = print_lock_stat(m, lock);
139 }
140 spin_unlock_irqrestore(&list_lock, irqflags);
141 return 0;
142}
143
144static void wake_unlock_stat_locked(struct wake_lock *lock, int expired)
145{
146 ktime_t duration;
147 ktime_t now;
148 if (!(lock->flags & WAKE_LOCK_ACTIVE))
149 return;
150 if (get_expired_time(lock, &now))
151 expired = 1;
152 else
153 now = ktime_get();
154 lock->stat.count++;
155 if (expired)
156 lock->stat.expire_count++;
157 duration = ktime_sub(now, lock->stat.last_time);
158 lock->stat.total_time = ktime_add(lock->stat.total_time, duration);
159 if (ktime_to_ns(duration) > ktime_to_ns(lock->stat.max_time))
160 lock->stat.max_time = duration;
161 lock->stat.last_time = ktime_get();
162 if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND) {
163 duration = ktime_sub(now, last_sleep_time_update);
164 lock->stat.prevent_suspend_time = ktime_add(
165 lock->stat.prevent_suspend_time, duration);
166 lock->flags &= ~WAKE_LOCK_PREVENTING_SUSPEND;
167 }
168}
169
170static void update_sleep_wait_stats_locked(int done)
171{
172 struct wake_lock *lock;
173 ktime_t now, etime, elapsed, add;
174 int expired;
175
176 now = ktime_get();
177 elapsed = ktime_sub(now, last_sleep_time_update);
178 list_for_each_entry(lock, &active_wake_locks[WAKE_LOCK_SUSPEND], link) {
179 expired = get_expired_time(lock, &etime);
180 if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND) {
181 if (expired)
182 add = ktime_sub(etime, last_sleep_time_update);
183 else
184 add = elapsed;
185 lock->stat.prevent_suspend_time = ktime_add(
186 lock->stat.prevent_suspend_time, add);
187 }
188 if (done || expired)
189 lock->flags &= ~WAKE_LOCK_PREVENTING_SUSPEND;
190 else
191 lock->flags |= WAKE_LOCK_PREVENTING_SUSPEND;
192 }
193 last_sleep_time_update = now;
194}
195#endif
196
197
198static void expire_wake_lock(struct wake_lock *lock)
199{
200#ifdef CONFIG_WAKELOCK_STAT
201 wake_unlock_stat_locked(lock, 1);
202#endif
203 lock->flags &= ~(WAKE_LOCK_ACTIVE | WAKE_LOCK_AUTO_EXPIRE);
204 list_del(&lock->link);
205 list_add(&lock->link, &inactive_locks);
206 if (debug_mask & (DEBUG_WAKE_LOCK | DEBUG_EXPIRE))
207 pr_info("expired wake lock %s\n", lock->name);
208}
209
210/* Caller must acquire the list_lock spinlock */
211static void print_active_locks(int type)
212{
213 struct wake_lock *lock;
214 bool print_expired = true;
215
216 BUG_ON(type >= WAKE_LOCK_TYPE_COUNT);
217 list_for_each_entry(lock, &active_wake_locks[type], link) {
218 if (lock->flags & WAKE_LOCK_AUTO_EXPIRE) {
219 long timeout = lock->expires - jiffies;
220 if (timeout > 0)
221 pr_info("active wake lock %s, time left %ld\n",
222 lock->name, timeout);
223 else if (print_expired)
224 pr_info("wake lock %s, expired\n", lock->name);
225 } else {
226 pr_info("active wake lock %s\n", lock->name);
227 if (!(debug_mask & DEBUG_EXPIRE))
228 print_expired = false;
229 }
230 }
231}
232
233static long has_wake_lock_locked(int type)
234{
235 struct wake_lock *lock, *n;
236 long max_timeout = 0;
237
238 BUG_ON(type >= WAKE_LOCK_TYPE_COUNT);
239 list_for_each_entry_safe(lock, n, &active_wake_locks[type], link) {
240 if (lock->flags & WAKE_LOCK_AUTO_EXPIRE) {
241 long timeout = lock->expires - jiffies;
242 if (timeout <= 0)
243 expire_wake_lock(lock);
244 else if (timeout > max_timeout)
245 max_timeout = timeout;
246 } else
247 return -1;
248 }
249 return max_timeout;
250}
251
252long has_wake_lock(int type)
253{
254 long ret;
255 unsigned long irqflags;
256 spin_lock_irqsave(&list_lock, irqflags);
257 ret = has_wake_lock_locked(type);
258 if (ret && (debug_mask & DEBUG_WAKEUP) && type == WAKE_LOCK_SUSPEND)
259 print_active_locks(type);
260 spin_unlock_irqrestore(&list_lock, irqflags);
261 return ret;
262}
263
264static void suspend_backoff(void)
265{
266 pr_info("suspend: too many immediate wakeups, back off\n");
267 wake_lock_timeout(&suspend_backoff_lock,
268 msecs_to_jiffies(SUSPEND_BACKOFF_INTERVAL));
269}
270
271static void suspend(struct work_struct *work)
272{
273 int ret;
274 int entry_event_num;
275 struct timespec ts_entry, ts_exit;
276
277 if (has_wake_lock(WAKE_LOCK_SUSPEND)) {
278 if (debug_mask & DEBUG_SUSPEND)
279 pr_info("suspend: abort suspend\n");
280 return;
281 }
282
283 entry_event_num = current_event_num;
284 sys_sync();
285 if (debug_mask & DEBUG_SUSPEND)
286 pr_info("suspend: enter suspend\n");
287 getnstimeofday(&ts_entry);
288 ret = pm_suspend(requested_suspend_state);
289 getnstimeofday(&ts_exit);
290
291 if (debug_mask & DEBUG_EXIT_SUSPEND) {
292 struct rtc_time tm;
293 rtc_time_to_tm(ts_exit.tv_sec, &tm);
294 pr_info("suspend: exit suspend, ret = %d "
295 "(%d-%02d-%02d %02d:%02d:%02d.%09lu UTC)\n", ret,
296 tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
297 tm.tm_hour, tm.tm_min, tm.tm_sec, ts_exit.tv_nsec);
298 }
299
300 if (ts_exit.tv_sec - ts_entry.tv_sec <= 1) {
301 ++suspend_short_count;
302
303 if (suspend_short_count == SUSPEND_BACKOFF_THRESHOLD) {
304 suspend_backoff();
305 suspend_short_count = 0;
306 }
307 } else {
308 suspend_short_count = 0;
309 }
310
311 if (current_event_num == entry_event_num) {
312 if (debug_mask & DEBUG_SUSPEND)
313 pr_info("suspend: pm_suspend returned with no event\n");
314 wake_lock_timeout(&unknown_wakeup, HZ / 2);
315 }
316}
317static DECLARE_WORK(suspend_work, suspend);
318
319static void expire_wake_locks(unsigned long data)
320{
321 long has_lock;
322 unsigned long irqflags;
323 if (debug_mask & DEBUG_EXPIRE)
324 pr_info("expire_wake_locks: start\n");
325 spin_lock_irqsave(&list_lock, irqflags);
326 if (debug_mask & DEBUG_SUSPEND)
327 print_active_locks(WAKE_LOCK_SUSPEND);
328 has_lock = has_wake_lock_locked(WAKE_LOCK_SUSPEND);
329 if (debug_mask & DEBUG_EXPIRE)
330 pr_info("expire_wake_locks: done, has_lock %ld\n", has_lock);
331 if (has_lock == 0)
332 queue_work(suspend_work_queue, &suspend_work);
333 spin_unlock_irqrestore(&list_lock, irqflags);
334}
335static DEFINE_TIMER(expire_timer, expire_wake_locks, 0, 0);
336
337static int power_suspend_late(struct device *dev)
338{
339 int ret = has_wake_lock(WAKE_LOCK_SUSPEND) ? -EAGAIN : 0;
340#ifdef CONFIG_WAKELOCK_STAT
341 wait_for_wakeup = !ret;
342#endif
343 if (debug_mask & DEBUG_SUSPEND)
344 pr_info("power_suspend_late return %d\n", ret);
345 return ret;
346}
347
348static struct dev_pm_ops power_driver_pm_ops = {
349 .suspend_noirq = power_suspend_late,
350};
351
352static struct platform_driver power_driver = {
353 .driver.name = "power",
354 .driver.pm = &power_driver_pm_ops,
355};
356static struct platform_device power_device = {
357 .name = "power",
358};
359
360void wake_lock_init(struct wake_lock *lock, int type, const char *name)
361{
362 unsigned long irqflags = 0;
363
364 if (name)
365 lock->name = name;
366 BUG_ON(!lock->name);
367
368 if (debug_mask & DEBUG_WAKE_LOCK)
369 pr_info("wake_lock_init name=%s\n", lock->name);
370#ifdef CONFIG_WAKELOCK_STAT
371 lock->stat.count = 0;
372 lock->stat.expire_count = 0;
373 lock->stat.wakeup_count = 0;
374 lock->stat.total_time = ktime_set(0, 0);
375 lock->stat.prevent_suspend_time = ktime_set(0, 0);
376 lock->stat.max_time = ktime_set(0, 0);
377 lock->stat.last_time = ktime_set(0, 0);
378#endif
379 lock->flags = (type & WAKE_LOCK_TYPE_MASK) | WAKE_LOCK_INITIALIZED;
380
381 INIT_LIST_HEAD(&lock->link);
382 spin_lock_irqsave(&list_lock, irqflags);
383 list_add(&lock->link, &inactive_locks);
384 spin_unlock_irqrestore(&list_lock, irqflags);
385}
386EXPORT_SYMBOL(wake_lock_init);
387
388void wake_lock_destroy(struct wake_lock *lock)
389{
390 unsigned long irqflags;
391 if (debug_mask & DEBUG_WAKE_LOCK)
392 pr_info("wake_lock_destroy name=%s\n", lock->name);
393 spin_lock_irqsave(&list_lock, irqflags);
394 lock->flags &= ~WAKE_LOCK_INITIALIZED;
395#ifdef CONFIG_WAKELOCK_STAT
396 if (lock->stat.count) {
397 deleted_wake_locks.stat.count += lock->stat.count;
398 deleted_wake_locks.stat.expire_count += lock->stat.expire_count;
399 deleted_wake_locks.stat.total_time =
400 ktime_add(deleted_wake_locks.stat.total_time,
401 lock->stat.total_time);
402 deleted_wake_locks.stat.prevent_suspend_time =
403 ktime_add(deleted_wake_locks.stat.prevent_suspend_time,
404 lock->stat.prevent_suspend_time);
405 deleted_wake_locks.stat.max_time =
406 ktime_add(deleted_wake_locks.stat.max_time,
407 lock->stat.max_time);
408 }
409#endif
410 list_del(&lock->link);
411 spin_unlock_irqrestore(&list_lock, irqflags);
412}
413EXPORT_SYMBOL(wake_lock_destroy);
414
415static void wake_lock_internal(
416 struct wake_lock *lock, long timeout, int has_timeout)
417{
418 int type;
419 unsigned long irqflags;
420 long expire_in;
421
422 spin_lock_irqsave(&list_lock, irqflags);
423 type = lock->flags & WAKE_LOCK_TYPE_MASK;
424 BUG_ON(type >= WAKE_LOCK_TYPE_COUNT);
425 BUG_ON(!(lock->flags & WAKE_LOCK_INITIALIZED));
426#ifdef CONFIG_WAKELOCK_STAT
427 if (type == WAKE_LOCK_SUSPEND && wait_for_wakeup) {
428 if (debug_mask & DEBUG_WAKEUP)
429 pr_info("wakeup wake lock: %s\n", lock->name);
430 wait_for_wakeup = 0;
431 lock->stat.wakeup_count++;
432 }
433 if ((lock->flags & WAKE_LOCK_AUTO_EXPIRE) &&
434 (long)(lock->expires - jiffies) <= 0) {
435 wake_unlock_stat_locked(lock, 0);
436 lock->stat.last_time = ktime_get();
437 }
438#endif
439 if (!(lock->flags & WAKE_LOCK_ACTIVE)) {
440 lock->flags |= WAKE_LOCK_ACTIVE;
441#ifdef CONFIG_WAKELOCK_STAT
442 lock->stat.last_time = ktime_get();
443#endif
444 }
445 list_del(&lock->link);
446 if (has_timeout) {
447 if (debug_mask & DEBUG_WAKE_LOCK)
448 pr_info("wake_lock: %s, type %d, timeout %ld.%03lu\n",
449 lock->name, type, timeout / HZ,
450 (timeout % HZ) * MSEC_PER_SEC / HZ);
451 lock->expires = jiffies + timeout;
452 lock->flags |= WAKE_LOCK_AUTO_EXPIRE;
453 list_add_tail(&lock->link, &active_wake_locks[type]);
454 } else {
455 if (debug_mask & DEBUG_WAKE_LOCK)
456 pr_info("wake_lock: %s, type %d\n", lock->name, type);
457 lock->expires = LONG_MAX;
458 lock->flags &= ~WAKE_LOCK_AUTO_EXPIRE;
459 list_add(&lock->link, &active_wake_locks[type]);
460 }
461 if (type == WAKE_LOCK_SUSPEND) {
462 current_event_num++;
463#ifdef CONFIG_WAKELOCK_STAT
464 if (lock == &main_wake_lock)
465 update_sleep_wait_stats_locked(1);
466 else if (!wake_lock_active(&main_wake_lock))
467 update_sleep_wait_stats_locked(0);
468#endif
469 if (has_timeout)
470 expire_in = has_wake_lock_locked(type);
471 else
472 expire_in = -1;
473 if (expire_in > 0) {
474 if (debug_mask & DEBUG_EXPIRE)
475 pr_info("wake_lock: %s, start expire timer, "
476 "%ld\n", lock->name, expire_in);
477 mod_timer(&expire_timer, jiffies + expire_in);
478 } else {
479 if (del_timer(&expire_timer))
480 if (debug_mask & DEBUG_EXPIRE)
481 pr_info("wake_lock: %s, stop expire timer\n",
482 lock->name);
483 if (expire_in == 0)
484 queue_work(suspend_work_queue, &suspend_work);
485 }
486 }
487 spin_unlock_irqrestore(&list_lock, irqflags);
488}
489
490void wake_lock(struct wake_lock *lock)
491{
492 wake_lock_internal(lock, 0, 0);
493}
494EXPORT_SYMBOL(wake_lock);
495
496void wake_lock_timeout(struct wake_lock *lock, long timeout)
497{
498 wake_lock_internal(lock, timeout, 1);
499}
500EXPORT_SYMBOL(wake_lock_timeout);
501
502void wake_unlock(struct wake_lock *lock)
503{
504 int type;
505 unsigned long irqflags;
506 spin_lock_irqsave(&list_lock, irqflags);
507 type = lock->flags & WAKE_LOCK_TYPE_MASK;
508#ifdef CONFIG_WAKELOCK_STAT
509 wake_unlock_stat_locked(lock, 0);
510#endif
511 if (debug_mask & DEBUG_WAKE_LOCK)
512 pr_info("wake_unlock: %s\n", lock->name);
513 lock->flags &= ~(WAKE_LOCK_ACTIVE | WAKE_LOCK_AUTO_EXPIRE);
514 list_del(&lock->link);
515 list_add(&lock->link, &inactive_locks);
516 if (type == WAKE_LOCK_SUSPEND) {
517 long has_lock = has_wake_lock_locked(type);
518 if (has_lock > 0) {
519 if (debug_mask & DEBUG_EXPIRE)
520 pr_info("wake_unlock: %s, start expire timer, "
521 "%ld\n", lock->name, has_lock);
522 mod_timer(&expire_timer, jiffies + has_lock);
523 } else {
524 if (del_timer(&expire_timer))
525 if (debug_mask & DEBUG_EXPIRE)
526 pr_info("wake_unlock: %s, stop expire "
527 "timer\n", lock->name);
528 if (has_lock == 0)
529 queue_work(suspend_work_queue, &suspend_work);
530 }
531 if (lock == &main_wake_lock) {
532 if (debug_mask & DEBUG_SUSPEND)
533 print_active_locks(WAKE_LOCK_SUSPEND);
534#ifdef CONFIG_WAKELOCK_STAT
535 update_sleep_wait_stats_locked(0);
536#endif
537 }
538 }
539 spin_unlock_irqrestore(&list_lock, irqflags);
540}
541EXPORT_SYMBOL(wake_unlock);
542
543int wake_lock_active(struct wake_lock *lock)
544{
545 return !!(lock->flags & WAKE_LOCK_ACTIVE);
546}
547EXPORT_SYMBOL(wake_lock_active);
548
549static int wakelock_stats_open(struct inode *inode, struct file *file)
550{
551 return single_open(file, wakelock_stats_show, NULL);
552}
553
554static const struct file_operations wakelock_stats_fops = {
555 .owner = THIS_MODULE,
556 .open = wakelock_stats_open,
557 .read = seq_read,
558 .llseek = seq_lseek,
559 .release = single_release,
560};
561
562static int __init wakelocks_init(void)
563{
564 int ret;
565 int i;
566
567 for (i = 0; i < ARRAY_SIZE(active_wake_locks); i++)
568 INIT_LIST_HEAD(&active_wake_locks[i]);
569
570#ifdef CONFIG_WAKELOCK_STAT
571 wake_lock_init(&deleted_wake_locks, WAKE_LOCK_SUSPEND,
572 "deleted_wake_locks");
573#endif
574 wake_lock_init(&main_wake_lock, WAKE_LOCK_SUSPEND, "main");
575 wake_lock(&main_wake_lock);
576 wake_lock_init(&unknown_wakeup, WAKE_LOCK_SUSPEND, "unknown_wakeups");
577 wake_lock_init(&suspend_backoff_lock, WAKE_LOCK_SUSPEND,
578 "suspend_backoff");
579
580 ret = platform_device_register(&power_device);
581 if (ret) {
582 pr_err("wakelocks_init: platform_device_register failed\n");
583 goto err_platform_device_register;
584 }
585 ret = platform_driver_register(&power_driver);
586 if (ret) {
587 pr_err("wakelocks_init: platform_driver_register failed\n");
588 goto err_platform_driver_register;
589 }
590
591 suspend_work_queue = create_singlethread_workqueue("suspend");
592 if (suspend_work_queue == NULL) {
593 ret = -ENOMEM;
594 goto err_suspend_work_queue;
595 }
596
597#ifdef CONFIG_WAKELOCK_STAT
598 proc_create("wakelocks", S_IRUGO, NULL, &wakelock_stats_fops);
599#endif
600
601 return 0;
602
603err_suspend_work_queue:
604 platform_driver_unregister(&power_driver);
605err_platform_driver_register:
606 platform_device_unregister(&power_device);
607err_platform_device_register:
608 wake_lock_destroy(&suspend_backoff_lock);
609 wake_lock_destroy(&unknown_wakeup);
610 wake_lock_destroy(&main_wake_lock);
611#ifdef CONFIG_WAKELOCK_STAT
612 wake_lock_destroy(&deleted_wake_locks);
613#endif
614 return ret;
615}
616
617static void __exit wakelocks_exit(void)
618{
619#ifdef CONFIG_WAKELOCK_STAT
620 remove_proc_entry("wakelocks", NULL);
621#endif
622 destroy_workqueue(suspend_work_queue);
623 platform_driver_unregister(&power_driver);
624 platform_device_unregister(&power_device);
625 wake_lock_destroy(&suspend_backoff_lock);
626 wake_lock_destroy(&unknown_wakeup);
627 wake_lock_destroy(&main_wake_lock);
628#ifdef CONFIG_WAKELOCK_STAT
629 wake_lock_destroy(&deleted_wake_locks);
630#endif
631}
632
633core_initcall(wakelocks_init);
634module_exit(wakelocks_exit);
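
For in-kernel users, the API added above is wake_lock_init()/wake_lock()/wake_lock_timeout()/wake_unlock(). A hypothetical driver that holds the system awake for about two seconds after a wakeup interrupt so user space can consume the event; DEMO_IRQ and the handler are placeholders, not taken from this patch:

#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/wakelock.h>

#define DEMO_IRQ 42     /* placeholder wakeup interrupt number */

static struct wake_lock demo_wake_lock;

static irqreturn_t demo_irq_handler(int irq, void *dev_id)
{
        /* Keep the system awake for ~2 s so user space can drain the event. */
        wake_lock_timeout(&demo_wake_lock, 2 * HZ);
        return IRQ_HANDLED;
}

static int __init demo_init(void)
{
        wake_lock_init(&demo_wake_lock, WAKE_LOCK_SUSPEND, "demo_irq");
        return request_irq(DEMO_IRQ, demo_irq_handler, IRQF_TRIGGER_RISING,
                           "demo_wakeup", NULL);
}
module_init(demo_init);

static void __exit demo_exit(void)
{
        free_irq(DEMO_IRQ, NULL);
        wake_lock_destroy(&demo_wake_lock);
}
module_exit(demo_exit);

MODULE_LICENSE("GPL");
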
diff --git a/kernel/printk.c b/kernel/printk.c
index b799a2ee96e..cbebc142be1 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -53,6 +53,10 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
53 53
54#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 54#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
55 55
56#ifdef CONFIG_DEBUG_LL
57extern void printascii(char *);
58#endif
59
56/* printk's without a loglevel use this.. */ 60/* printk's without a loglevel use this.. */
57#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL 61#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
58 62
@@ -297,6 +301,53 @@ static inline void boot_delay_msec(void)
297} 301}
298#endif 302#endif
299 303
304/*
305 * Return the number of unread characters in the log buffer.
306 */
307static int log_buf_get_len(void)
308{
309 return logged_chars;
310}
311
312/*
313 * Clears the ring-buffer
314 */
315void log_buf_clear(void)
316{
317 logged_chars = 0;
318}
319
320/*
321 * Copy a range of characters from the log buffer.
322 */
323int log_buf_copy(char *dest, int idx, int len)
324{
325 int ret, max;
326 bool took_lock = false;
327
328 if (!oops_in_progress) {
329 spin_lock_irq(&logbuf_lock);
330 took_lock = true;
331 }
332
333 max = log_buf_get_len();
334 if (idx < 0 || idx >= max) {
335 ret = -1;
336 } else {
337 if (len > max - idx)
338 len = max - idx;
339 ret = len;
340 idx += (log_end - max);
341 while (len-- > 0)
342 dest[len] = LOG_BUF(idx + len);
343 }
344
345 if (took_lock)
346 spin_unlock_irq(&logbuf_lock);
347
348 return ret;
349}
350
300#ifdef CONFIG_SECURITY_DMESG_RESTRICT 351#ifdef CONFIG_SECURITY_DMESG_RESTRICT
301int dmesg_restrict = 1; 352int dmesg_restrict = 1;
302#else 353#else
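
log_buf_copy() returns up to len unread characters starting at idx (or -1 if idx is out of range) and does not NUL-terminate, while log_buf_clear() resets logged_chars. A sketch of the kind of in-kernel consumer these helpers were added for (e.g. an Android ram_console/last-kmsg style dumper); the consumer itself is assumed, not shown in this patch:

/* Declarations match the helpers added above. */
extern int log_buf_copy(char *dest, int idx, int len);
extern void log_buf_clear(void);

static char snapshot[4096];

static void snapshot_kernel_log(void)
{
        int copied = log_buf_copy(snapshot, 0, sizeof(snapshot) - 1);

        if (copied < 0)
                return;                 /* nothing logged yet, or bad index */
        snapshot[copied] = '\0';        /* log_buf_copy() does not terminate */
        log_buf_clear();                /* start the next snapshot from scratch */
}
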
@@ -325,8 +376,10 @@ static int check_syslog_permissions(int type, bool from_file)
325 return 0; 376 return 0;
326 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ 377 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
327 if (capable(CAP_SYS_ADMIN)) { 378 if (capable(CAP_SYS_ADMIN)) {
328 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " 379 printk_once(KERN_WARNING "%s (%d): "
329 "but no CAP_SYSLOG (deprecated).\n"); 380 "Attempt to access syslog with CAP_SYS_ADMIN "
381 "but no CAP_SYSLOG (deprecated).\n",
382 current->comm, task_pid_nr(current));
330 return 0; 383 return 0;
331 } 384 }
332 return -EPERM; 385 return -EPERM;
@@ -789,7 +842,7 @@ static inline int can_use_console(unsigned int cpu)
789static int console_trylock_for_printk(unsigned int cpu) 842static int console_trylock_for_printk(unsigned int cpu)
790 __releases(&logbuf_lock) 843 __releases(&logbuf_lock)
791{ 844{
792 int retval = 0; 845 int retval = 0, wake = 0;
793 846
794 if (console_trylock()) { 847 if (console_trylock()) {
795 retval = 1; 848 retval = 1;
@@ -802,12 +855,14 @@ static int console_trylock_for_printk(unsigned int cpu)
802 */ 855 */
803 if (!can_use_console(cpu)) { 856 if (!can_use_console(cpu)) {
804 console_locked = 0; 857 console_locked = 0;
805 up(&console_sem); 858 wake = 1;
806 retval = 0; 859 retval = 0;
807 } 860 }
808 } 861 }
809 printk_cpu = UINT_MAX; 862 printk_cpu = UINT_MAX;
810 spin_unlock(&logbuf_lock); 863 spin_unlock(&logbuf_lock);
864 if (wake)
865 up(&console_sem);
811 return retval; 866 return retval;
812} 867}
813static const char recursion_bug_msg [] = 868static const char recursion_bug_msg [] =
@@ -882,6 +937,10 @@ asmlinkage int vprintk(const char *fmt, va_list args)
882 if (trace_override && !trace_recurse) 937 if (trace_override && !trace_recurse)
883 TRACE("%s", printk_buf); 938 TRACE("%s", printk_buf);
884 939
940#ifdef CONFIG_DEBUG_LL
941 printascii(printk_buf);
942#endif
943
885 p = printk_buf; 944 p = printk_buf;
886 945
887 /* Read log level and handle special printk prefix */ 946 /* Read log level and handle special printk prefix */
@@ -1156,7 +1215,6 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
1156 switch (action) { 1215 switch (action) {
1157 case CPU_ONLINE: 1216 case CPU_ONLINE:
1158 case CPU_DEAD: 1217 case CPU_DEAD:
1159 case CPU_DYING:
1160 case CPU_DOWN_FAILED: 1218 case CPU_DOWN_FAILED:
1161 case CPU_UP_CANCELED: 1219 case CPU_UP_CANCELED:
1162 console_lock(); 1220 console_lock();
@@ -1252,7 +1310,7 @@ void console_unlock(void)
1252{ 1310{
1253 unsigned long flags; 1311 unsigned long flags;
1254 unsigned _con_start, _log_end; 1312 unsigned _con_start, _log_end;
1255 unsigned wake_klogd = 0; 1313 unsigned wake_klogd = 0, retry = 0;
1256 1314
1257 if (console_suspended) { 1315 if (console_suspended) {
1258 up(&console_sem); 1316 up(&console_sem);
@@ -1261,6 +1319,7 @@ void console_unlock(void)
1261 1319
1262 console_may_schedule = 0; 1320 console_may_schedule = 0;
1263 1321
1322again:
1264 for ( ; ; ) { 1323 for ( ; ; ) {
1265 spin_lock_irqsave(&logbuf_lock, flags); 1324 spin_lock_irqsave(&logbuf_lock, flags);
1266 wake_klogd |= log_start - log_end; 1325 wake_klogd |= log_start - log_end;
@@ -1281,8 +1340,23 @@ void console_unlock(void)
1281 if (unlikely(exclusive_console)) 1340 if (unlikely(exclusive_console))
1282 exclusive_console = NULL; 1341 exclusive_console = NULL;
1283 1342
1343 spin_unlock(&logbuf_lock);
1344
1284 up(&console_sem); 1345 up(&console_sem);
1346
1347 /*
1348 * Someone could have filled up the buffer again, so re-check if there's
1349 * something to flush. In case we cannot trylock the console_sem again,
1350 * there's a new owner and the console_unlock() from them will do the
1351 * flush, no worries.
1352 */
1353 spin_lock(&logbuf_lock);
1354 if (con_start != log_end)
1355 retry = 1;
1285 spin_unlock_irqrestore(&logbuf_lock, flags); 1356 spin_unlock_irqrestore(&logbuf_lock, flags);
1357 if (retry && console_trylock())
1358 goto again;
1359
1286 if (wake_klogd) 1360 if (wake_klogd)
1287 wake_up_klogd(); 1361 wake_up_klogd();
1288} 1362}
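
The retry added to console_unlock() closes the window where another CPU appends to the log buffer after the final flush but before console_sem is released. The same unlock/re-check/trylock idiom in isolation, with placeholder names standing in for the printk internals:

#include <linux/semaphore.h>
#include <linux/spinlock.h>

static DEFINE_SEMAPHORE(demo_sem);      /* stands in for console_sem */
static DEFINE_SPINLOCK(demo_buf_lock);  /* stands in for logbuf_lock */
static int demo_head, demo_tail;

static void demo_flush(void)
{
        /* Placeholder: drain demo_tail..demo_head under demo_buf_lock. */
        spin_lock(&demo_buf_lock);
        demo_tail = demo_head;
        spin_unlock(&demo_buf_lock);
}

static void demo_unlock(void)
{
        int retry;

again:
        demo_flush();
        up(&demo_sem);

        /*
         * New data may have been queued after the final flush but before
         * the semaphore was released; if so, and nobody else has taken the
         * lock in the meantime, loop back and flush it ourselves.
         */
        spin_lock(&demo_buf_lock);
        retry = (demo_tail != demo_head);
        spin_unlock(&demo_buf_lock);

        if (retry && !down_trylock(&demo_sem))
                goto again;
}
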
@@ -1594,7 +1668,7 @@ static int __init printk_late_init(void)
1594 struct console *con; 1668 struct console *con;
1595 1669
1596 for_each_console(con) { 1670 for_each_console(con) {
1597 if (con->flags & CON_BOOT) { 1671 if (!keep_bootcon && con->flags & CON_BOOT) {
1598 printk(KERN_INFO "turn off boot console %s%d\n", 1672 printk(KERN_INFO "turn off boot console %s%d\n",
1599 con->name, con->index); 1673 con->name, con->index);
1600 unregister_console(con); 1674 unregister_console(con);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2df115790cd..67d1fdd3c55 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -23,8 +23,15 @@
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/regset.h> 24#include <linux/regset.h>
25#include <linux/hw_breakpoint.h> 25#include <linux/hw_breakpoint.h>
26#include <linux/cn_proc.h>
26 27
27 28
29static int ptrace_trapping_sleep_fn(void *flags)
30{
31 schedule();
32 return 0;
33}
34
28/* 35/*
29 * ptrace a task: make the debugger its new parent and 36 * ptrace a task: make the debugger its new parent and
30 * move it to the ptrace list. 37 * move it to the ptrace list.
@@ -77,13 +84,31 @@ void __ptrace_unlink(struct task_struct *child)
77 spin_lock(&child->sighand->siglock); 84 spin_lock(&child->sighand->siglock);
78 85
79 /* 86 /*
80 * Reinstate GROUP_STOP_PENDING if group stop is in effect and 87 * Clear all pending traps and TRAPPING. TRAPPING should be
88 * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly.
89 */
90 task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK);
91 task_clear_jobctl_trapping(child);
92
93 /*
94 * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and
81 * @child isn't dead. 95 * @child isn't dead.
82 */ 96 */
83 if (!(child->flags & PF_EXITING) && 97 if (!(child->flags & PF_EXITING) &&
84 (child->signal->flags & SIGNAL_STOP_STOPPED || 98 (child->signal->flags & SIGNAL_STOP_STOPPED ||
85 child->signal->group_stop_count)) 99 child->signal->group_stop_count)) {
86 child->group_stop |= GROUP_STOP_PENDING; 100 child->jobctl |= JOBCTL_STOP_PENDING;
101
102 /*
103 * This is only possible if this thread was cloned by the
104 * traced task while the group stop was in effect; set the
105 * signal to be used for future stop reports.
106 * FIXME: we should change ptrace_init_task() to handle this
107 * case.
108 */
109 if (!(child->jobctl & JOBCTL_STOP_SIGMASK))
110 child->jobctl |= SIGSTOP;
111 }
87 112
88 /* 113 /*
89 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick 114 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
@@ -91,16 +116,30 @@ void __ptrace_unlink(struct task_struct *child)
91 * is in TASK_TRACED; otherwise, we might unduly disrupt 116 * is in TASK_TRACED; otherwise, we might unduly disrupt
92 * TASK_KILLABLE sleeps. 117 * TASK_KILLABLE sleeps.
93 */ 118 */
94 if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child)) 119 if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
95 signal_wake_up(child, task_is_traced(child)); 120 signal_wake_up(child, task_is_traced(child));
96 121
97 spin_unlock(&child->sighand->siglock); 122 spin_unlock(&child->sighand->siglock);
98} 123}
99 124
100/* 125/**
101 * Check that we have indeed attached to the thing.. 126 * ptrace_check_attach - check whether ptracee is ready for ptrace operation
127 * @child: ptracee to check for
128 * @ignore_state: don't check whether @child is currently %TASK_TRACED
129 *
130 * Check whether @child is being ptraced by %current and ready for further
131 * ptrace operations. If @ignore_state is %false, @child also should be in
132 * %TASK_TRACED state and on return the child is guaranteed to be traced
133 * and not executing. If @ignore_state is %true, @child can be in any
134 * state.
135 *
136 * CONTEXT:
137 * Grabs and releases tasklist_lock and @child->sighand->siglock.
138 *
139 * RETURNS:
140 * 0 on success, -ESRCH if %child is not ready.
102 */ 141 */
103int ptrace_check_attach(struct task_struct *child, int kill) 142int ptrace_check_attach(struct task_struct *child, bool ignore_state)
104{ 143{
105 int ret = -ESRCH; 144 int ret = -ESRCH;
106 145
@@ -119,13 +158,14 @@ int ptrace_check_attach(struct task_struct *child, int kill)
119 */ 158 */
120 spin_lock_irq(&child->sighand->siglock); 159 spin_lock_irq(&child->sighand->siglock);
121 WARN_ON_ONCE(task_is_stopped(child)); 160 WARN_ON_ONCE(task_is_stopped(child));
122 if (task_is_traced(child) || kill) 161 if (ignore_state || (task_is_traced(child) &&
162 !(child->jobctl & JOBCTL_LISTENING)))
123 ret = 0; 163 ret = 0;
124 spin_unlock_irq(&child->sighand->siglock); 164 spin_unlock_irq(&child->sighand->siglock);
125 } 165 }
126 read_unlock(&tasklist_lock); 166 read_unlock(&tasklist_lock);
127 167
128 if (!ret && !kill) 168 if (!ret && !ignore_state)
129 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; 169 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
130 170
131 /* All systems go.. */ 171 /* All systems go.. */
@@ -182,11 +222,28 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
182 return !err; 222 return !err;
183} 223}
184 224
185static int ptrace_attach(struct task_struct *task) 225static int ptrace_attach(struct task_struct *task, long request,
226 unsigned long flags)
186{ 227{
187 bool wait_trap = false; 228 bool seize = (request == PTRACE_SEIZE);
188 int retval; 229 int retval;
189 230
231 /*
232 * SEIZE will enable new ptrace behaviors which will be implemented
233 * gradually. SEIZE_DEVEL is used to prevent applications
234 * expecting full SEIZE behaviors trapping on kernel commits which
235 * are still in the process of implementing them.
236 *
237 * Only test programs for new ptrace behaviors being implemented
238 * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
239 *
240 * Once SEIZE behaviors are completely implemented, this flag and
241 * the following test will be removed.
242 */
243 retval = -EIO;
244 if (seize && !(flags & PTRACE_SEIZE_DEVEL))
245 goto out;
246
190 audit_ptrace(task); 247 audit_ptrace(task);
191 248
192 retval = -EPERM; 249 retval = -EPERM;
@@ -218,16 +275,21 @@ static int ptrace_attach(struct task_struct *task)
218 goto unlock_tasklist; 275 goto unlock_tasklist;
219 276
220 task->ptrace = PT_PTRACED; 277 task->ptrace = PT_PTRACED;
278 if (seize)
279 task->ptrace |= PT_SEIZED;
221 if (task_ns_capable(task, CAP_SYS_PTRACE)) 280 if (task_ns_capable(task, CAP_SYS_PTRACE))
222 task->ptrace |= PT_PTRACE_CAP; 281 task->ptrace |= PT_PTRACE_CAP;
223 282
224 __ptrace_link(task, current); 283 __ptrace_link(task, current);
225 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); 284
285 /* SEIZE doesn't trap tracee on attach */
286 if (!seize)
287 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
226 288
227 spin_lock(&task->sighand->siglock); 289 spin_lock(&task->sighand->siglock);
228 290
229 /* 291 /*
230 * If the task is already STOPPED, set GROUP_STOP_PENDING and 292 * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
231 * TRAPPING, and kick it so that it transits to TRACED. TRAPPING 293 * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
232 * will be cleared if the child completes the transition or any 294 * will be cleared if the child completes the transition or any
233 * event which clears the group stop states happens. We'll wait 295 * event which clears the group stop states happens. We'll wait
@@ -243,11 +305,9 @@ static int ptrace_attach(struct task_struct *task)
243 * The following task_is_stopped() test is safe as both transitions 305 * The following task_is_stopped() test is safe as both transitions
244 * in and out of STOPPED are protected by siglock. 306 * in and out of STOPPED are protected by siglock.
245 */ 307 */
246 if (task_is_stopped(task)) { 308 if (task_is_stopped(task) &&
247 task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING; 309 task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
248 signal_wake_up(task, 1); 310 signal_wake_up(task, 1);
249 wait_trap = true;
250 }
251 311
252 spin_unlock(&task->sighand->siglock); 312 spin_unlock(&task->sighand->siglock);
253 313
@@ -257,9 +317,12 @@ unlock_tasklist:
257unlock_creds: 317unlock_creds:
258 mutex_unlock(&task->signal->cred_guard_mutex); 318 mutex_unlock(&task->signal->cred_guard_mutex);
259out: 319out:
260 if (wait_trap) 320 if (!retval) {
261 wait_event(current->signal->wait_chldexit, 321 wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
262 !(task->group_stop & GROUP_STOP_TRAPPING)); 322 ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE);
323 proc_ptrace_connector(task, PTRACE_ATTACH);
324 }
325
263 return retval; 326 return retval;
264} 327}
265 328
@@ -322,25 +385,27 @@ static int ignoring_children(struct sighand_struct *sigh)
322 */ 385 */
323static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) 386static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
324{ 387{
388 bool dead;
389
325 __ptrace_unlink(p); 390 __ptrace_unlink(p);
326 391
327 if (p->exit_state == EXIT_ZOMBIE) { 392 if (p->exit_state != EXIT_ZOMBIE)
328 if (!task_detached(p) && thread_group_empty(p)) { 393 return false;
329 if (!same_thread_group(p->real_parent, tracer)) 394
330 do_notify_parent(p, p->exit_signal); 395 dead = !thread_group_leader(p);
331 else if (ignoring_children(tracer->sighand)) { 396
332 __wake_up_parent(p, tracer); 397 if (!dead && thread_group_empty(p)) {
333 p->exit_signal = -1; 398 if (!same_thread_group(p->real_parent, tracer))
334 } 399 dead = do_notify_parent(p, p->exit_signal);
335 } 400 else if (ignoring_children(tracer->sighand)) {
336 if (task_detached(p)) { 401 __wake_up_parent(p, tracer);
337 /* Mark it as in the process of being reaped. */ 402 dead = true;
338 p->exit_state = EXIT_DEAD;
339 return true;
340 } 403 }
341 } 404 }
342 405 /* Mark it as in the process of being reaped. */
343 return false; 406 if (dead)
407 p->exit_state = EXIT_DEAD;
408 return dead;
344} 409}
345 410
346static int ptrace_detach(struct task_struct *child, unsigned int data) 411static int ptrace_detach(struct task_struct *child, unsigned int data)
@@ -365,6 +430,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
365 } 430 }
366 write_unlock_irq(&tasklist_lock); 431 write_unlock_irq(&tasklist_lock);
367 432
433 proc_ptrace_connector(child, PTRACE_DETACH);
368 if (unlikely(dead)) 434 if (unlikely(dead))
369 release_task(child); 435 release_task(child);
370 436
@@ -611,10 +677,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
611int ptrace_request(struct task_struct *child, long request, 677int ptrace_request(struct task_struct *child, long request,
612 unsigned long addr, unsigned long data) 678 unsigned long addr, unsigned long data)
613{ 679{
680 bool seized = child->ptrace & PT_SEIZED;
614 int ret = -EIO; 681 int ret = -EIO;
615 siginfo_t siginfo; 682 siginfo_t siginfo, *si;
616 void __user *datavp = (void __user *) data; 683 void __user *datavp = (void __user *) data;
617 unsigned long __user *datalp = datavp; 684 unsigned long __user *datalp = datavp;
685 unsigned long flags;
618 686
619 switch (request) { 687 switch (request) {
620 case PTRACE_PEEKTEXT: 688 case PTRACE_PEEKTEXT:
@@ -647,6 +715,59 @@ int ptrace_request(struct task_struct *child, long request,
647 ret = ptrace_setsiginfo(child, &siginfo); 715 ret = ptrace_setsiginfo(child, &siginfo);
648 break; 716 break;
649 717
718 case PTRACE_INTERRUPT:
719 /*
720 * Stop tracee without any side-effect on signal or job
721 * control. At least one trap is guaranteed to happen
722 * after this request. If @child is already trapped, the
723 * current trap is not disturbed and another trap will
724 * happen after the current trap is ended with PTRACE_CONT.
725 *
726 * The actual trap might not be PTRACE_EVENT_STOP trap but
727 * the pending condition is cleared regardless.
728 */
729 if (unlikely(!seized || !lock_task_sighand(child, &flags)))
730 break;
731
732 /*
733 * INTERRUPT doesn't disturb existing trap sans one
734 * exception. If ptracer issued LISTEN for the current
735 * STOP, this INTERRUPT should clear LISTEN and re-trap
736 * tracee into STOP.
737 */
738 if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
739 signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
740
741 unlock_task_sighand(child, &flags);
742 ret = 0;
743 break;
744
745 case PTRACE_LISTEN:
746 /*
747 * Listen for events. Tracee must be in STOP. It's not
748 * resumed per-se but is not considered to be in TRACED by
749 * wait(2) or ptrace(2). If an async event (e.g. group
750 * stop state change) happens, tracee will enter STOP trap
751 * again. Alternatively, ptracer can issue INTERRUPT to
752 * finish listening and re-trap tracee into STOP.
753 */
754 if (unlikely(!seized || !lock_task_sighand(child, &flags)))
755 break;
756
757 si = child->last_siginfo;
758 if (likely(si && (si->si_code >> 8) == PTRACE_EVENT_STOP)) {
759 child->jobctl |= JOBCTL_LISTENING;
760 /*
761 * If NOTIFY is set, it means event happened between
762 * start of this trap and now. Trigger re-trap.
763 */
764 if (child->jobctl & JOBCTL_TRAP_NOTIFY)
765 signal_wake_up(child, true);
766 ret = 0;
767 }
768 unlock_task_sighand(child, &flags);
769 break;
770
650 case PTRACE_DETACH: /* detach a process that was attached. */ 771 case PTRACE_DETACH: /* detach a process that was attached. */
651 ret = ptrace_detach(child, data); 772 ret = ptrace_detach(child, data);
652 break; 773 break;
@@ -761,8 +882,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
761 goto out; 882 goto out;
762 } 883 }
763 884
764 if (request == PTRACE_ATTACH) { 885 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
765 ret = ptrace_attach(child); 886 ret = ptrace_attach(child, request, data);
766 /* 887 /*
767 * Some architectures need to do book-keeping after 888 * Some architectures need to do book-keeping after
768 * a ptrace attach. 889 * a ptrace attach.
@@ -772,7 +893,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
772 goto out_put_task_struct; 893 goto out_put_task_struct;
773 } 894 }
774 895
775 ret = ptrace_check_attach(child, request == PTRACE_KILL); 896 ret = ptrace_check_attach(child, request == PTRACE_KILL ||
897 request == PTRACE_INTERRUPT);
776 if (ret < 0) 898 if (ret < 0)
777 goto out_put_task_struct; 899 goto out_put_task_struct;
778 900
@@ -903,8 +1025,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
903 goto out; 1025 goto out;
904 } 1026 }
905 1027
906 if (request == PTRACE_ATTACH) { 1028 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
907 ret = ptrace_attach(child); 1029 ret = ptrace_attach(child, request, data);
908 /* 1030 /*
909 * Some architectures need to do book-keeping after 1031 * Some architectures need to do book-keeping after
910 * a ptrace attach. 1032 * a ptrace attach.
@@ -914,7 +1036,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
914 goto out_put_task_struct; 1036 goto out_put_task_struct;
915 } 1037 }
916 1038
917 ret = ptrace_check_attach(child, request == PTRACE_KILL); 1039 ret = ptrace_check_attach(child, request == PTRACE_KILL ||
1040 request == PTRACE_INTERRUPT);
918 if (!ret) 1041 if (!ret)
919 ret = compat_arch_ptrace(child, request, addr, data); 1042 ret = compat_arch_ptrace(child, request, addr, data);
920 1043
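
Taken together, PTRACE_SEIZE attaches without stopping the tracee, PTRACE_INTERRUPT traps it on demand without touching signals or job control, and PTRACE_LISTEN waits for async events from a PTRACE_EVENT_STOP trap without resuming it. A user-space sketch of that flow; the fallback constants, including the interim PTRACE_SEIZE_DEVEL gate checked in ptrace_attach() above, are assumptions for headers of this vintage:

#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

#ifndef PTRACE_SEIZE
#define PTRACE_SEIZE      0x4206
#define PTRACE_INTERRUPT  0x4207
#define PTRACE_LISTEN     0x4208
#endif
#ifndef PTRACE_SEIZE_DEVEL
#define PTRACE_SEIZE_DEVEL 0x80000000UL /* required while SEIZE is -EIO-gated */
#endif

int main(int argc, char **argv)
{
        pid_t pid;
        int status;

        if (argc != 2 || sscanf(argv[1], "%d", &pid) != 1) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }

        /* Attach without sending SIGSTOP; the tracee keeps running. */
        if (ptrace(PTRACE_SEIZE, pid, 0, PTRACE_SEIZE_DEVEL) < 0) {
                perror("PTRACE_SEIZE");
                return 1;
        }

        /* Ask for a trap without disturbing signals or job control. */
        ptrace(PTRACE_INTERRUPT, pid, 0, 0);
        waitpid(pid, &status, 0);

        /* Wait for async events (e.g. group stop changes) without resuming. */
        ptrace(PTRACE_LISTEN, pid, 0, 0);
        waitpid(pid, &status, 0);

        ptrace(PTRACE_DETACH, pid, 0, 0);
        return 0;
}
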
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 7784bd216b6..ddddb320be6 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -37,7 +37,7 @@
37#include <linux/smp.h> 37#include <linux/smp.h>
38#include <linux/interrupt.h> 38#include <linux/interrupt.h>
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <asm/atomic.h> 40#include <linux/atomic.h>
41#include <linux/bitops.h> 41#include <linux/bitops.h>
42#include <linux/percpu.h> 42#include <linux/percpu.h>
43#include <linux/notifier.h> 43#include <linux/notifier.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 2e138db0338..98f51b13bb7 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -33,7 +33,7 @@
33#include <linux/rcupdate.h> 33#include <linux/rcupdate.h>
34#include <linux/interrupt.h> 34#include <linux/interrupt.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <asm/atomic.h> 36#include <linux/atomic.h>
37#include <linux/bitops.h> 37#include <linux/bitops.h>
38#include <linux/completion.h> 38#include <linux/completion.h>
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
@@ -941,7 +941,6 @@ static void rcu_torture_timer(unsigned long unused)
941 idx = cur_ops->readlock(); 941 idx = cur_ops->readlock();
942 completed = cur_ops->completed(); 942 completed = cur_ops->completed();
943 p = rcu_dereference_check(rcu_torture_current, 943 p = rcu_dereference_check(rcu_torture_current,
944 rcu_read_lock_held() ||
945 rcu_read_lock_bh_held() || 944 rcu_read_lock_bh_held() ||
946 rcu_read_lock_sched_held() || 945 rcu_read_lock_sched_held() ||
947 srcu_read_lock_held(&srcu_ctl)); 946 srcu_read_lock_held(&srcu_ctl));
@@ -1002,7 +1001,6 @@ rcu_torture_reader(void *arg)
1002 idx = cur_ops->readlock(); 1001 idx = cur_ops->readlock();
1003 completed = cur_ops->completed(); 1002 completed = cur_ops->completed();
1004 p = rcu_dereference_check(rcu_torture_current, 1003 p = rcu_dereference_check(rcu_torture_current,
1005 rcu_read_lock_held() ||
1006 rcu_read_lock_bh_held() || 1004 rcu_read_lock_bh_held() ||
1007 rcu_read_lock_sched_held() || 1005 rcu_read_lock_sched_held() ||
1008 srcu_read_lock_held(&srcu_ctl)); 1006 srcu_read_lock_held(&srcu_ctl));
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4e144876dc6..3b0c0986afc 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -31,7 +31,7 @@
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/interrupt.h> 32#include <linux/interrupt.h>
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <asm/atomic.h> 34#include <linux/atomic.h>
35#include <linux/bitops.h> 35#include <linux/bitops.h>
36#include <linux/module.h> 36#include <linux/module.h>
37#include <linux/completion.h> 37#include <linux/completion.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index 3ff40178dce..c8dc249da5c 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -419,6 +419,9 @@ static int __find_resource(struct resource *root, struct resource *old,
419 else 419 else
420 tmp.end = root->end; 420 tmp.end = root->end;
421 421
422 if (tmp.end < tmp.start)
423 goto next;
424
422 resource_clip(&tmp, constraint->min, constraint->max); 425 resource_clip(&tmp, constraint->min, constraint->max);
423 arch_remove_reservations(&tmp); 426 arch_remove_reservations(&tmp);
424 427
@@ -436,8 +439,10 @@ static int __find_resource(struct resource *root, struct resource *old,
436 return 0; 439 return 0;
437 } 440 }
438 } 441 }
439 if (!this) 442
443next: if (!this || this->end == root->end)
440 break; 444 break;
445
441 if (this != old) 446 if (this != old)
442 tmp.start = this->end + 1; 447 tmp.start = this->end + 1;
443 this = this->sibling; 448 this = this->sibling;
@@ -553,6 +558,27 @@ int allocate_resource(struct resource *root, struct resource *new,
553 558
554EXPORT_SYMBOL(allocate_resource); 559EXPORT_SYMBOL(allocate_resource);
555 560
561/**
562 * lookup_resource - find an existing resource by a resource start address
563 * @root: root resource descriptor
564 * @start: resource start address
565 *
566 * Returns a pointer to the resource if found, NULL otherwise
567 */
568struct resource *lookup_resource(struct resource *root, resource_size_t start)
569{
570 struct resource *res;
571
572 read_lock(&resource_lock);
573 for (res = root->child; res; res = res->sibling) {
574 if (res->start == start)
575 break;
576 }
577 read_unlock(&resource_lock);
578
579 return res;
580}
581
556/* 582/*
557 * Insert a resource into the resource tree. If successful, return NULL, 583 * Insert a resource into the resource tree. If successful, return NULL,
558 * otherwise return the conflicting resource (compare to __request_resource()) 584 * otherwise return the conflicting resource (compare to __request_resource())
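
lookup_resource() walks root->child under resource_lock and returns the first entry whose start address matches exactly, or NULL. A hypothetical caller releasing an I/O memory region known only by its start address (the surrounding usage is illustrative, not from this patch):

#include <linux/ioport.h>
#include <linux/kernel.h>

static void demo_release_by_start(resource_size_t start)
{
        struct resource *res;

        res = lookup_resource(&iomem_resource, start);
        if (!res) {
                pr_warn("no resource registered at %#llx\n",
                        (unsigned long long)start);
                return;
        }

        /* e.g. drop a region that an earlier driver registered by address */
        release_resource(res);
}
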
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index ab449117aaf..255e1662acd 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -890,7 +890,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name)
890{ 890{
891 lock->owner = NULL; 891 lock->owner = NULL;
892 raw_spin_lock_init(&lock->wait_lock); 892 raw_spin_lock_init(&lock->wait_lock);
893 plist_head_init_raw(&lock->wait_list, &lock->wait_lock); 893 plist_head_init(&lock->wait_list);
894 894
895 debug_rt_mutex_init(lock, name); 895 debug_rt_mutex_init(lock, name);
896} 896}
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index cae050b05f5..9f48f3d82e9 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -11,7 +11,7 @@
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12 12
13#include <asm/system.h> 13#include <asm/system.h>
14#include <asm/atomic.h> 14#include <linux/atomic.h>
15 15
16/* 16/*
17 * lock for reading 17 * lock for reading
@@ -117,15 +117,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
117 117
118EXPORT_SYMBOL(down_read_nested); 118EXPORT_SYMBOL(down_read_nested);
119 119
120void down_read_non_owner(struct rw_semaphore *sem)
121{
122 might_sleep();
123
124 __down_read(sem);
125}
126
127EXPORT_SYMBOL(down_read_non_owner);
128
129void down_write_nested(struct rw_semaphore *sem, int subclass) 120void down_write_nested(struct rw_semaphore *sem, int subclass)
130{ 121{
131 might_sleep(); 122 might_sleep();
@@ -136,13 +127,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
136 127
137EXPORT_SYMBOL(down_write_nested); 128EXPORT_SYMBOL(down_write_nested);
138 129
139void up_read_non_owner(struct rw_semaphore *sem)
140{
141 __up_read(sem);
142}
143
144EXPORT_SYMBOL(up_read_non_owner);
145
146#endif 130#endif
147 131
148 132
diff --git a/kernel/sched.c b/kernel/sched.c
index c4b6bd5151f..a1bf2646d12 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,10 +71,14 @@
71#include <linux/ctype.h> 71#include <linux/ctype.h>
72#include <linux/ftrace.h> 72#include <linux/ftrace.h>
73#include <linux/slab.h> 73#include <linux/slab.h>
74#include <linux/cpuacct.h>
74 75
75#include <asm/tlb.h> 76#include <asm/tlb.h>
76#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
77#include <asm/mutex.h> 78#include <asm/mutex.h>
79#ifdef CONFIG_PARAVIRT
80#include <asm/paravirt.h>
81#endif
78 82
79#include "sched_cpupri.h" 83#include "sched_cpupri.h"
80#include "workqueue_sched.h" 84#include "workqueue_sched.h"
@@ -129,7 +133,7 @@ static void litmus_tick(struct rq*, struct task_struct*);
129 133
130static inline int rt_policy(int policy) 134static inline int rt_policy(int policy)
131{ 135{
132 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 136 if (policy == SCHED_FIFO || policy == SCHED_RR)
133 return 1; 137 return 1;
134 return 0; 138 return 0;
135} 139}
@@ -433,6 +437,7 @@ struct litmus_rq {
433 */ 437 */
434struct root_domain { 438struct root_domain {
435 atomic_t refcount; 439 atomic_t refcount;
440 atomic_t rto_count;
436 struct rcu_head rcu; 441 struct rcu_head rcu;
437 cpumask_var_t span; 442 cpumask_var_t span;
438 cpumask_var_t online; 443 cpumask_var_t online;
@@ -442,7 +447,6 @@ struct root_domain {
442 * one runnable RT task. 447 * one runnable RT task.
443 */ 448 */
444 cpumask_var_t rto_mask; 449 cpumask_var_t rto_mask;
445 atomic_t rto_count;
446 struct cpupri cpupri; 450 struct cpupri cpupri;
447}; 451};
448 452
@@ -540,6 +544,12 @@ struct rq {
540#ifdef CONFIG_IRQ_TIME_ACCOUNTING 544#ifdef CONFIG_IRQ_TIME_ACCOUNTING
541 u64 prev_irq_time; 545 u64 prev_irq_time;
542#endif 546#endif
547#ifdef CONFIG_PARAVIRT
548 u64 prev_steal_time;
549#endif
550#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
551 u64 prev_steal_time_rq;
552#endif
543 553
544 /* calc_load related fields */ 554 /* calc_load related fields */
545 unsigned long calc_load_update; 555 unsigned long calc_load_update;
@@ -593,7 +603,6 @@ static inline int cpu_of(struct rq *rq)
593 603
594#define rcu_dereference_check_sched_domain(p) \ 604#define rcu_dereference_check_sched_domain(p) \
595 rcu_dereference_check((p), \ 605 rcu_dereference_check((p), \
596 rcu_read_lock_held() || \
597 lockdep_is_held(&sched_domains_mutex)) 606 lockdep_is_held(&sched_domains_mutex))
598 607
599/* 608/*
@@ -1581,38 +1590,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1581 return rq->avg_load_per_task; 1590 return rq->avg_load_per_task;
1582} 1591}
1583 1592
1584#ifdef CONFIG_FAIR_GROUP_SCHED
1585
1586/*
1587 * Compute the cpu's hierarchical load factor for each task group.
1588 * This needs to be done in a top-down fashion because the load of a child
1589 * group is a fraction of its parents load.
1590 */
1591static int tg_load_down(struct task_group *tg, void *data)
1592{
1593 unsigned long load;
1594 long cpu = (long)data;
1595
1596 if (!tg->parent) {
1597 load = cpu_rq(cpu)->load.weight;
1598 } else {
1599 load = tg->parent->cfs_rq[cpu]->h_load;
1600 load *= tg->se[cpu]->load.weight;
1601 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1602 }
1603
1604 tg->cfs_rq[cpu]->h_load = load;
1605
1606 return 0;
1607}
1608
1609static void update_h_load(long cpu)
1610{
1611 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1612}
1613
1614#endif
1615
1616#ifdef CONFIG_PREEMPT 1593#ifdef CONFIG_PREEMPT
1617 1594
1618static void double_rq_lock(struct rq *rq1, struct rq *rq2); 1595static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -1966,10 +1943,28 @@ void account_system_vtime(struct task_struct *curr)
1966} 1943}
1967EXPORT_SYMBOL_GPL(account_system_vtime); 1944EXPORT_SYMBOL_GPL(account_system_vtime);
1968 1945
1969static void update_rq_clock_task(struct rq *rq, s64 delta) 1946#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1947
1948#ifdef CONFIG_PARAVIRT
1949static inline u64 steal_ticks(u64 steal)
1970{ 1950{
1971 s64 irq_delta; 1951 if (unlikely(steal > NSEC_PER_SEC))
1952 return div_u64(steal, TICK_NSEC);
1953
1954 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
1955}
1956#endif
1972 1957
1958static void update_rq_clock_task(struct rq *rq, s64 delta)
1959{
1960/*
 1961 * In theory, the compiler should just see 0 here, and optimize out the call
1962 * to sched_rt_avg_update. But I don't trust it...
1963 */
1964#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
1965 s64 steal = 0, irq_delta = 0;
1966#endif
1967#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1973 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 1968 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1974 1969
1975 /* 1970 /*
@@ -1992,12 +1987,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
1992 1987
1993 rq->prev_irq_time += irq_delta; 1988 rq->prev_irq_time += irq_delta;
1994 delta -= irq_delta; 1989 delta -= irq_delta;
1990#endif
1991#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
1992 if (static_branch((&paravirt_steal_rq_enabled))) {
1993 u64 st;
1994
1995 steal = paravirt_steal_clock(cpu_of(rq));
1996 steal -= rq->prev_steal_time_rq;
1997
1998 if (unlikely(steal > delta))
1999 steal = delta;
2000
2001 st = steal_ticks(steal);
2002 steal = st * TICK_NSEC;
2003
2004 rq->prev_steal_time_rq += steal;
2005
2006 delta -= steal;
2007 }
2008#endif
2009
1995 rq->clock_task += delta; 2010 rq->clock_task += delta;
1996 2011
1997 if (irq_delta && sched_feat(NONIRQ_POWER)) 2012#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
1998 sched_rt_avg_update(rq, irq_delta); 2013 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
2014 sched_rt_avg_update(rq, irq_delta + steal);
2015#endif
1999} 2016}
2000 2017
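To make the steal-time arithmetic above concrete, here is a small standalone model (plain C, not kernel code) of how only whole steal ticks are folded out of clock_task while the sub-tick remainder carries over via prev_steal_time_rq; the HZ=1000 tick length and the sample values are assumptions.

#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL			/* assumes HZ=1000: one tick = 1 ms */

int main(void)
{
	/* cumulative steal clock readings (ns), as a hypervisor might report them */
	uint64_t steal_clock[] = { 1500000ULL, 2200000ULL, 3100000ULL };
	uint64_t prev_steal_time_rq = 0;

	for (int i = 0; i < 3; i++) {
		uint64_t steal = steal_clock[i] - prev_steal_time_rq;
		uint64_t ticks = steal / TICK_NSEC;	/* what steal_ticks() computes */

		/* only whole ticks are removed from clock_task; the rest carries over */
		prev_steal_time_rq += ticks * TICK_NSEC;
		printf("update %d: %llu tick(s) of steal accounted, %llu ns carried over\n",
		       i, (unsigned long long)ticks,
		       (unsigned long long)(steal - ticks * TICK_NSEC));
	}
	return 0;
}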
2018#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2001static int irqtime_account_hi_update(void) 2019static int irqtime_account_hi_update(void)
2002{ 2020{
2003 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2021 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -2032,12 +2050,7 @@ static int irqtime_account_si_update(void)
2032 2050
2033#define sched_clock_irqtime (0) 2051#define sched_clock_irqtime (0)
2034 2052
2035static void update_rq_clock_task(struct rq *rq, s64 delta) 2053#endif
2036{
2037 rq->clock_task += delta;
2038}
2039
2040#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2041 2054
2042#include "sched_idletask.c" 2055#include "sched_idletask.c"
2043#include "sched_fair.c" 2056#include "sched_fair.c"
@@ -2238,7 +2251,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2238 2251
2239 if (task_cpu(p) != new_cpu) { 2252 if (task_cpu(p) != new_cpu) {
2240 p->se.nr_migrations++; 2253 p->se.nr_migrations++;
2241 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); 2254 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
2242 } 2255 }
2243 2256
2244 __set_task_cpu(p, new_cpu); 2257 __set_task_cpu(p, new_cpu);
@@ -2515,7 +2528,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2515 if (p->sched_class->task_woken) 2528 if (p->sched_class->task_woken)
2516 p->sched_class->task_woken(rq, p); 2529 p->sched_class->task_woken(rq, p);
2517 2530
2518 if (unlikely(rq->idle_stamp)) { 2531 if (rq->idle_stamp) {
2519 u64 delta = rq->clock - rq->idle_stamp; 2532 u64 delta = rq->clock - rq->idle_stamp;
2520 u64 max = 2*sysctl_sched_migration_cost; 2533 u64 max = 2*sysctl_sched_migration_cost;
2521 2534
@@ -2927,7 +2940,7 @@ void sched_fork(struct task_struct *p)
2927#if defined(CONFIG_SMP) 2940#if defined(CONFIG_SMP)
2928 p->on_cpu = 0; 2941 p->on_cpu = 0;
2929#endif 2942#endif
2930#ifdef CONFIG_PREEMPT 2943#ifdef CONFIG_PREEMPT_COUNT
2931 /* Want to start with kernel preemption disabled. */ 2944 /* Want to start with kernel preemption disabled. */
2932 task_thread_info(p)->preempt_count = 1; 2945 task_thread_info(p)->preempt_count = 1;
2933#endif 2946#endif
@@ -3096,7 +3109,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3096#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 3109#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3097 local_irq_disable(); 3110 local_irq_disable();
3098#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 3111#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3099 perf_event_task_sched_in(current); 3112 perf_event_task_sched_in(prev, current);
3100#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 3113#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3101 local_irq_enable(); 3114 local_irq_enable();
3102#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 3115#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
@@ -3775,30 +3788,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3775} 3788}
3776 3789
3777/* 3790/*
3778 * Return sum_exec_runtime for the thread group.
3779 * In case the task is currently running, return the sum plus current's
3780 * pending runtime that have not been accounted yet.
3781 *
3782 * Note that the thread group might have other running tasks as well,
3783 * so the return value not includes other pending runtime that other
3784 * running tasks might have.
3785 */
3786unsigned long long thread_group_sched_runtime(struct task_struct *p)
3787{
3788 struct task_cputime totals;
3789 unsigned long flags;
3790 struct rq *rq;
3791 u64 ns;
3792
3793 rq = task_rq_lock(p, &flags);
3794 thread_group_cputime(p, &totals);
3795 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3796 task_rq_unlock(rq, p, &flags);
3797
3798 return ns;
3799}
3800
3801/*
3802 * Account user cpu time to a process. 3791 * Account user cpu time to a process.
3803 * @p: the process that the cpu time gets accounted to 3792 * @p: the process that the cpu time gets accounted to
3804 * @cputime: the cpu time spent in user space since the last update 3793 * @cputime: the cpu time spent in user space since the last update
@@ -3939,6 +3928,25 @@ void account_idle_time(cputime_t cputime)
3939 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 3928 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3940} 3929}
3941 3930
3931static __always_inline bool steal_account_process_tick(void)
3932{
3933#ifdef CONFIG_PARAVIRT
3934 if (static_branch(&paravirt_steal_enabled)) {
3935 u64 steal, st = 0;
3936
3937 steal = paravirt_steal_clock(smp_processor_id());
3938 steal -= this_rq()->prev_steal_time;
3939
3940 st = steal_ticks(steal);
3941 this_rq()->prev_steal_time += st * TICK_NSEC;
3942
3943 account_steal_time(st);
3944 return st;
3945 }
3946#endif
3947 return false;
3948}
3949
3942#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3950#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3943 3951
3944#ifdef CONFIG_IRQ_TIME_ACCOUNTING 3952#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3970,6 +3978,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3970 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 3978 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3971 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3979 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3972 3980
3981 if (steal_account_process_tick())
3982 return;
3983
3973 if (irqtime_account_hi_update()) { 3984 if (irqtime_account_hi_update()) {
3974 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3985 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3975 } else if (irqtime_account_si_update()) { 3986 } else if (irqtime_account_si_update()) {
@@ -4023,6 +4034,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
4023 return; 4034 return;
4024 } 4035 }
4025 4036
4037 if (steal_account_process_tick())
4038 return;
4039
4026 if (user_tick) 4040 if (user_tick)
4027 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 4041 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
4028 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 4042 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -4320,9 +4334,9 @@ pick_next_task(struct rq *rq)
4320} 4334}
4321 4335
4322/* 4336/*
4323 * schedule() is the main scheduler function. 4337 * __schedule() is the main scheduler function.
4324 */ 4338 */
4325asmlinkage void __sched schedule(void) 4339static void __sched __schedule(void)
4326{ 4340{
4327 struct task_struct *prev, *next; 4341 struct task_struct *prev, *next;
4328 unsigned long *switch_count; 4342 unsigned long *switch_count;
@@ -4371,16 +4385,6 @@ litmus_need_resched_nonpreemptible:
4371 if (to_wakeup) 4385 if (to_wakeup)
4372 try_to_wake_up_local(to_wakeup); 4386 try_to_wake_up_local(to_wakeup);
4373 } 4387 }
4374
4375 /*
4376 * If we are going to sleep and we have plugged IO
4377 * queued, make sure to submit it to avoid deadlocks.
4378 */
4379 if (blk_needs_flush_plug(prev)) {
4380 raw_spin_unlock(&rq->lock);
4381 blk_schedule_flush_plug(prev);
4382 raw_spin_lock(&rq->lock);
4383 }
4384 } 4388 }
4385 switch_count = &prev->nvcsw; 4389 switch_count = &prev->nvcsw;
4386 } 4390 }
@@ -4436,17 +4440,34 @@ litmus_need_resched_nonpreemptible:
4436 4440
4437 srp_ceiling_block(); 4441 srp_ceiling_block();
4438} 4442}
4443
4444static inline void sched_submit_work(struct task_struct *tsk)
4445{
4446 if (!tsk->state)
4447 return;
4448 /*
4449 * If we are going to sleep and we have plugged IO queued,
4450 * make sure to submit it to avoid deadlocks.
4451 */
4452 if (blk_needs_flush_plug(tsk))
4453 blk_schedule_flush_plug(tsk);
4454}
4455
4456asmlinkage void __sched schedule(void)
4457{
4458 struct task_struct *tsk = current;
4459
4460 sched_submit_work(tsk);
4461 __schedule();
4462}
4439EXPORT_SYMBOL(schedule); 4463EXPORT_SYMBOL(schedule);
4440 4464
4441#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4465#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4442 4466
4443static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 4467static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4444{ 4468{
4445 bool ret = false;
4446
4447 rcu_read_lock();
4448 if (lock->owner != owner) 4469 if (lock->owner != owner)
4449 goto fail; 4470 return false;
4450 4471
4451 /* 4472 /*
4452 * Ensure we emit the owner->on_cpu, dereference _after_ checking 4473 * Ensure we emit the owner->on_cpu, dereference _after_ checking
@@ -4456,11 +4477,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4456 */ 4477 */
4457 barrier(); 4478 barrier();
4458 4479
4459 ret = owner->on_cpu; 4480 return owner->on_cpu;
4460fail:
4461 rcu_read_unlock();
4462
4463 return ret;
4464} 4481}
4465 4482
4466/* 4483/*
@@ -4472,21 +4489,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4472 if (!sched_feat(OWNER_SPIN)) 4489 if (!sched_feat(OWNER_SPIN))
4473 return 0; 4490 return 0;
4474 4491
4492 rcu_read_lock();
4475 while (owner_running(lock, owner)) { 4493 while (owner_running(lock, owner)) {
4476 if (need_resched()) 4494 if (need_resched())
4477 return 0; 4495 break;
4478 4496
4479 arch_mutex_cpu_relax(); 4497 arch_mutex_cpu_relax();
4480 } 4498 }
4499 rcu_read_unlock();
4481 4500
4482 /* 4501 /*
4483 * If the owner changed to another task there is likely 4502 * We break out the loop above on need_resched() and when the
4484 * heavy contention, stop spinning. 4503 * owner changed, which is a sign for heavy contention. Return
4504 * success only when lock->owner is NULL.
4485 */ 4505 */
4486 if (lock->owner) 4506 return lock->owner == NULL;
4487 return 0;
4488
4489 return 1;
4490} 4507}
4491#endif 4508#endif
4492 4509
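As a rough model of the reworked spin loop (assumptions only, using C11 atomics in place of the kernel's primitives): spin while the recorded owner is still on a CPU, stop when rescheduling is needed or the owner changes, and report success only if the lock ended up free.

#include <stdbool.h>
#include <stdatomic.h>
#include <stddef.h>

struct fake_task {
	atomic_bool on_cpu;		/* is this task currently running? */
	atomic_bool need_resched;	/* does the spinner need to reschedule? */
};

struct fake_mutex {
	_Atomic(struct fake_task *) owner;
};

static bool owner_running(struct fake_mutex *lock, struct fake_task *owner)
{
	if (atomic_load(&lock->owner) != owner)
		return false;			/* owner changed: heavy contention */
	return atomic_load(&owner->on_cpu);
}

static bool spin_on_owner(struct fake_mutex *lock, struct fake_task *owner,
			  struct fake_task *self)
{
	while (owner_running(lock, owner)) {
		if (atomic_load(&self->need_resched))
			break;			/* give up and go schedule */
	}
	/* success only when the lock is actually free now */
	return atomic_load(&lock->owner) == NULL;
}

int main(void)
{
	struct fake_task owner = { 0 }, self = { 0 };
	struct fake_mutex lock = { .owner = NULL };	/* lock already released */

	return spin_on_owner(&lock, &owner, &self) ? 0 : 1;
}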
@@ -4509,7 +4526,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
4509 4526
4510 do { 4527 do {
4511 add_preempt_count_notrace(PREEMPT_ACTIVE); 4528 add_preempt_count_notrace(PREEMPT_ACTIVE);
4512 schedule(); 4529 __schedule();
4513 sub_preempt_count_notrace(PREEMPT_ACTIVE); 4530 sub_preempt_count_notrace(PREEMPT_ACTIVE);
4514 4531
4515 /* 4532 /*
@@ -4537,7 +4554,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
4537 do { 4554 do {
4538 add_preempt_count(PREEMPT_ACTIVE); 4555 add_preempt_count(PREEMPT_ACTIVE);
4539 local_irq_enable(); 4556 local_irq_enable();
4540 schedule(); 4557 __schedule();
4541 local_irq_disable(); 4558 local_irq_disable();
4542 sub_preempt_count(PREEMPT_ACTIVE); 4559 sub_preempt_count(PREEMPT_ACTIVE);
4543 4560
@@ -5682,7 +5699,7 @@ static inline int should_resched(void)
5682static void __cond_resched(void) 5699static void __cond_resched(void)
5683{ 5700{
5684 add_preempt_count(PREEMPT_ACTIVE); 5701 add_preempt_count(PREEMPT_ACTIVE);
5685 schedule(); 5702 __schedule();
5686 sub_preempt_count(PREEMPT_ACTIVE); 5703 sub_preempt_count(PREEMPT_ACTIVE);
5687} 5704}
5688 5705
@@ -6618,7 +6635,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
6618 unsigned long action, void *hcpu) 6635 unsigned long action, void *hcpu)
6619{ 6636{
6620 switch (action & ~CPU_TASKS_FROZEN) { 6637 switch (action & ~CPU_TASKS_FROZEN) {
6621 case CPU_ONLINE: 6638 case CPU_STARTING:
6622 case CPU_DOWN_FAILED: 6639 case CPU_DOWN_FAILED:
6623 set_cpu_active((long)hcpu, true); 6640 set_cpu_active((long)hcpu, true);
6624 return NOTIFY_OK; 6641 return NOTIFY_OK;
@@ -7537,6 +7554,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
7537 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); 7554 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
7538 if (sd && (sd->flags & SD_OVERLAP)) 7555 if (sd && (sd->flags & SD_OVERLAP))
7539 free_sched_groups(sd->groups, 0); 7556 free_sched_groups(sd->groups, 0);
7557 kfree(*per_cpu_ptr(sdd->sd, j));
7540 kfree(*per_cpu_ptr(sdd->sg, j)); 7558 kfree(*per_cpu_ptr(sdd->sg, j));
7541 kfree(*per_cpu_ptr(sdd->sgp, j)); 7559 kfree(*per_cpu_ptr(sdd->sgp, j));
7542 } 7560 }
@@ -8022,17 +8040,10 @@ int in_sched_functions(unsigned long addr)
8022 && addr < (unsigned long)__sched_text_end); 8040 && addr < (unsigned long)__sched_text_end);
8023} 8041}
8024 8042
8025static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 8043static void init_cfs_rq(struct cfs_rq *cfs_rq)
8026{ 8044{
8027 cfs_rq->tasks_timeline = RB_ROOT; 8045 cfs_rq->tasks_timeline = RB_ROOT;
8028 INIT_LIST_HEAD(&cfs_rq->tasks); 8046 INIT_LIST_HEAD(&cfs_rq->tasks);
8029#ifdef CONFIG_FAIR_GROUP_SCHED
8030 cfs_rq->rq = rq;
8031 /* allow initial update_cfs_load() to truncate */
8032#ifdef CONFIG_SMP
8033 cfs_rq->load_stamp = 1;
8034#endif
8035#endif
8036 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 8047 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8037#ifndef CONFIG_64BIT 8048#ifndef CONFIG_64BIT
8038 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 8049 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
@@ -8052,27 +8063,18 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8052 /* delimiter for bitsearch: */ 8063 /* delimiter for bitsearch: */
8053 __set_bit(MAX_RT_PRIO, array->bitmap); 8064 __set_bit(MAX_RT_PRIO, array->bitmap);
8054 8065
8055#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 8066#if defined CONFIG_SMP
8056 rt_rq->highest_prio.curr = MAX_RT_PRIO; 8067 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8057#ifdef CONFIG_SMP
8058 rt_rq->highest_prio.next = MAX_RT_PRIO; 8068 rt_rq->highest_prio.next = MAX_RT_PRIO;
8059#endif
8060#endif
8061#ifdef CONFIG_SMP
8062 rt_rq->rt_nr_migratory = 0; 8069 rt_rq->rt_nr_migratory = 0;
8063 rt_rq->overloaded = 0; 8070 rt_rq->overloaded = 0;
8064 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); 8071 plist_head_init(&rt_rq->pushable_tasks);
8065#endif 8072#endif
8066 8073
8067 rt_rq->rt_time = 0; 8074 rt_rq->rt_time = 0;
8068 rt_rq->rt_throttled = 0; 8075 rt_rq->rt_throttled = 0;
8069 rt_rq->rt_runtime = 0; 8076 rt_rq->rt_runtime = 0;
8070 raw_spin_lock_init(&rt_rq->rt_runtime_lock); 8077 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
8071
8072#ifdef CONFIG_RT_GROUP_SCHED
8073 rt_rq->rt_nr_boosted = 0;
8074 rt_rq->rq = rq;
8075#endif
8076} 8078}
8077 8079
8078#ifdef CONFIG_FAIR_GROUP_SCHED 8080#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8081,11 +8083,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8081 struct sched_entity *parent) 8083 struct sched_entity *parent)
8082{ 8084{
8083 struct rq *rq = cpu_rq(cpu); 8085 struct rq *rq = cpu_rq(cpu);
8084 tg->cfs_rq[cpu] = cfs_rq; 8086
8085 init_cfs_rq(cfs_rq, rq);
8086 cfs_rq->tg = tg; 8087 cfs_rq->tg = tg;
8088 cfs_rq->rq = rq;
8089#ifdef CONFIG_SMP
8090 /* allow initial update_cfs_load() to truncate */
8091 cfs_rq->load_stamp = 1;
8092#endif
8087 8093
8094 tg->cfs_rq[cpu] = cfs_rq;
8088 tg->se[cpu] = se; 8095 tg->se[cpu] = se;
8096
8089 /* se could be NULL for root_task_group */ 8097 /* se could be NULL for root_task_group */
8090 if (!se) 8098 if (!se)
8091 return; 8099 return;
@@ -8108,12 +8116,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8108{ 8116{
8109 struct rq *rq = cpu_rq(cpu); 8117 struct rq *rq = cpu_rq(cpu);
8110 8118
8111 tg->rt_rq[cpu] = rt_rq; 8119 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8112 init_rt_rq(rt_rq, rq); 8120 rt_rq->rt_nr_boosted = 0;
8121 rt_rq->rq = rq;
8113 rt_rq->tg = tg; 8122 rt_rq->tg = tg;
8114 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8115 8123
8124 tg->rt_rq[cpu] = rt_rq;
8116 tg->rt_se[cpu] = rt_se; 8125 tg->rt_se[cpu] = rt_se;
8126
8117 if (!rt_se) 8127 if (!rt_se)
8118 return; 8128 return;
8119 8129
@@ -8195,7 +8205,7 @@ void __init sched_init(void)
8195 rq->nr_running = 0; 8205 rq->nr_running = 0;
8196 rq->calc_load_active = 0; 8206 rq->calc_load_active = 0;
8197 rq->calc_load_update = jiffies + LOAD_FREQ; 8207 rq->calc_load_update = jiffies + LOAD_FREQ;
8198 init_cfs_rq(&rq->cfs, rq); 8208 init_cfs_rq(&rq->cfs);
8199 init_rt_rq(&rq->rt, rq); 8209 init_rt_rq(&rq->rt, rq);
8200#ifdef CONFIG_FAIR_GROUP_SCHED 8210#ifdef CONFIG_FAIR_GROUP_SCHED
8201 root_task_group.shares = root_task_group_load; 8211 root_task_group.shares = root_task_group_load;
@@ -8266,7 +8276,7 @@ void __init sched_init(void)
8266#endif 8276#endif
8267 8277
8268#ifdef CONFIG_RT_MUTEXES 8278#ifdef CONFIG_RT_MUTEXES
8269 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); 8279 plist_head_init(&init_task.pi_waiters);
8270#endif 8280#endif
8271 8281
8272 /* 8282 /*
@@ -8300,6 +8310,7 @@ void __init sched_init(void)
8300 atomic_set(&nohz.load_balancer, nr_cpu_ids); 8310 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8301 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); 8311 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8302 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); 8312 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8313 nohz.next_balance = jiffies;
8303#endif 8314#endif
8304 /* May be allocated at isolcpus cmdline parse time */ 8315 /* May be allocated at isolcpus cmdline parse time */
8305 if (cpu_isolated_map == NULL) 8316 if (cpu_isolated_map == NULL)
@@ -8309,7 +8320,7 @@ void __init sched_init(void)
8309 scheduler_running = 1; 8320 scheduler_running = 1;
8310} 8321}
8311 8322
8312#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 8323#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
8313static inline int preempt_count_equals(int preempt_offset) 8324static inline int preempt_count_equals(int preempt_offset)
8314{ 8325{
8315 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8326 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
@@ -8317,13 +8328,23 @@ static inline int preempt_count_equals(int preempt_offset)
8317 return (nested == preempt_offset); 8328 return (nested == preempt_offset);
8318} 8329}
8319 8330
8331static int __might_sleep_init_called;
8332int __init __might_sleep_init(void)
8333{
8334 __might_sleep_init_called = 1;
8335 return 0;
8336}
8337early_initcall(__might_sleep_init);
8338
8320void __might_sleep(const char *file, int line, int preempt_offset) 8339void __might_sleep(const char *file, int line, int preempt_offset)
8321{ 8340{
8322#ifdef in_atomic
8323 static unsigned long prev_jiffy; /* ratelimiting */ 8341 static unsigned long prev_jiffy; /* ratelimiting */
8324 8342
8325 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 8343 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
8326 system_state != SYSTEM_RUNNING || oops_in_progress) 8344 oops_in_progress)
8345 return;
8346 if (system_state != SYSTEM_RUNNING &&
8347 (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
8327 return; 8348 return;
8328 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8349 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8329 return; 8350 return;
@@ -8341,7 +8362,6 @@ void __might_sleep(const char *file, int line, int preempt_offset)
8341 if (irqs_disabled()) 8362 if (irqs_disabled())
8342 print_irqtrace_events(current); 8363 print_irqtrace_events(current);
8343 dump_stack(); 8364 dump_stack();
8344#endif
8345} 8365}
8346EXPORT_SYMBOL(__might_sleep); 8366EXPORT_SYMBOL(__might_sleep);
8347#endif 8367#endif
@@ -8500,6 +8520,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8500 if (!se) 8520 if (!se)
8501 goto err_free_rq; 8521 goto err_free_rq;
8502 8522
8523 init_cfs_rq(cfs_rq);
8503 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 8524 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8504 } 8525 }
8505 8526
@@ -8527,7 +8548,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8527 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 8548 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8528 raw_spin_unlock_irqrestore(&rq->lock, flags); 8549 raw_spin_unlock_irqrestore(&rq->lock, flags);
8529} 8550}
8530#else /* !CONFG_FAIR_GROUP_SCHED */ 8551#else /* !CONFIG_FAIR_GROUP_SCHED */
8531static inline void free_fair_sched_group(struct task_group *tg) 8552static inline void free_fair_sched_group(struct task_group *tg)
8532{ 8553{
8533} 8554}
@@ -8548,7 +8569,8 @@ static void free_rt_sched_group(struct task_group *tg)
8548{ 8569{
8549 int i; 8570 int i;
8550 8571
8551 destroy_rt_bandwidth(&tg->rt_bandwidth); 8572 if (tg->rt_se)
8573 destroy_rt_bandwidth(&tg->rt_bandwidth);
8552 8574
8553 for_each_possible_cpu(i) { 8575 for_each_possible_cpu(i) {
8554 if (tg->rt_rq) 8576 if (tg->rt_rq)
@@ -8589,6 +8611,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8589 if (!rt_se) 8611 if (!rt_se)
8590 goto err_free_rq; 8612 goto err_free_rq;
8591 8613
8614 init_rt_rq(rt_rq, cpu_rq(i));
8615 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8592 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); 8616 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8593 } 8617 }
8594 8618
@@ -9067,6 +9091,20 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9067} 9091}
9068 9092
9069static int 9093static int
9094cpu_cgroup_allow_attach(struct cgroup *cgrp, struct task_struct *tsk)
9095{
9096 const struct cred *cred = current_cred(), *tcred;
9097
9098 tcred = __task_cred(tsk);
9099
9100 if ((current != tsk) && !capable(CAP_SYS_NICE) &&
9101 cred->euid != tcred->uid && cred->euid != tcred->suid)
9102 return -EACCES;
9103
9104 return 0;
9105}
9106
9107static int
9070cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 9108cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
9071{ 9109{
9072#ifdef CONFIG_RT_GROUP_SCHED 9110#ifdef CONFIG_RT_GROUP_SCHED
@@ -9171,6 +9209,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9171 .name = "cpu", 9209 .name = "cpu",
9172 .create = cpu_cgroup_create, 9210 .create = cpu_cgroup_create,
9173 .destroy = cpu_cgroup_destroy, 9211 .destroy = cpu_cgroup_destroy,
9212 .allow_attach = cpu_cgroup_allow_attach,
9174 .can_attach_task = cpu_cgroup_can_attach_task, 9213 .can_attach_task = cpu_cgroup_can_attach_task,
9175 .attach_task = cpu_cgroup_attach_task, 9214 .attach_task = cpu_cgroup_attach_task,
9176 .exit = cpu_cgroup_exit, 9215 .exit = cpu_cgroup_exit,
@@ -9197,8 +9236,30 @@ struct cpuacct {
9197 u64 __percpu *cpuusage; 9236 u64 __percpu *cpuusage;
9198 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 9237 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9199 struct cpuacct *parent; 9238 struct cpuacct *parent;
9239 struct cpuacct_charge_calls *cpufreq_fn;
9240 void *cpuacct_data;
9200}; 9241};
9201 9242
9243static struct cpuacct *cpuacct_root;
9244
9245/* Default calls for cpufreq accounting */
9246static struct cpuacct_charge_calls *cpuacct_cpufreq;
9247int cpuacct_register_cpufreq(struct cpuacct_charge_calls *fn)
9248{
9249 cpuacct_cpufreq = fn;
9250
9251 /*
9252 * Root node is created before platform can register callbacks,
9253 * initialize here.
9254 */
9255 if (cpuacct_root && fn) {
9256 cpuacct_root->cpufreq_fn = fn;
9257 if (fn->init)
9258 fn->init(&cpuacct_root->cpuacct_data);
9259 }
9260 return 0;
9261}
9262
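A hypothetical platform hook-up, for illustration: how board code might register its accounting callbacks through the new cpuacct_register_cpufreq() interface. The layout of struct cpuacct_charge_calls (init/charge plus the optional show/power hooks) is inferred from the calls above and presumably declared in <linux/cpuacct.h>, which this patch starts including from sched.c; the my_* names and GFP usage are made up.

#include <linux/cpuacct.h>
#include <linux/slab.h>
#include <linux/init.h>

static void my_acct_init(void **data)
{
	*data = kzalloc(sizeof(u64), GFP_KERNEL);	/* per-cgroup scratch space */
}

static void my_acct_charge(void *data, u64 cputime, unsigned int cpu)
{
	if (data)
		*(u64 *)data += cputime;	/* a real hook would weight this by the current cpufreq */
}

static struct cpuacct_charge_calls my_acct_calls = {
	.init	= my_acct_init,
	.charge	= my_acct_charge,
	/* .cpufreq_show and .power_usage left NULL; the callers above treat them as optional */
};

static int __init my_acct_setup(void)
{
	return cpuacct_register_cpufreq(&my_acct_calls);
}
late_initcall(my_acct_setup);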
9202struct cgroup_subsys cpuacct_subsys; 9263struct cgroup_subsys cpuacct_subsys;
9203 9264
9204/* return cpu accounting group corresponding to this container */ 9265/* return cpu accounting group corresponding to this container */
@@ -9233,8 +9294,16 @@ static struct cgroup_subsys_state *cpuacct_create(
9233 if (percpu_counter_init(&ca->cpustat[i], 0)) 9294 if (percpu_counter_init(&ca->cpustat[i], 0))
9234 goto out_free_counters; 9295 goto out_free_counters;
9235 9296
9297 ca->cpufreq_fn = cpuacct_cpufreq;
9298
9299 /* If available, have platform code initialize cpu frequency table */
9300 if (ca->cpufreq_fn && ca->cpufreq_fn->init)
9301 ca->cpufreq_fn->init(&ca->cpuacct_data);
9302
9236 if (cgrp->parent) 9303 if (cgrp->parent)
9237 ca->parent = cgroup_ca(cgrp->parent); 9304 ca->parent = cgroup_ca(cgrp->parent);
9305 else
9306 cpuacct_root = ca;
9238 9307
9239 return &ca->css; 9308 return &ca->css;
9240 9309
@@ -9362,6 +9431,32 @@ static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9362 return 0; 9431 return 0;
9363} 9432}
9364 9433
9434static int cpuacct_cpufreq_show(struct cgroup *cgrp, struct cftype *cft,
9435 struct cgroup_map_cb *cb)
9436{
9437 struct cpuacct *ca = cgroup_ca(cgrp);
9438 if (ca->cpufreq_fn && ca->cpufreq_fn->cpufreq_show)
9439 ca->cpufreq_fn->cpufreq_show(ca->cpuacct_data, cb);
9440
9441 return 0;
9442}
9443
9444/* return total cpu power usage (milliWatt second) of a group */
9445static u64 cpuacct_powerusage_read(struct cgroup *cgrp, struct cftype *cft)
9446{
9447 int i;
9448 struct cpuacct *ca = cgroup_ca(cgrp);
9449 u64 totalpower = 0;
9450
9451 if (ca->cpufreq_fn && ca->cpufreq_fn->power_usage)
9452 for_each_present_cpu(i) {
9453 totalpower += ca->cpufreq_fn->power_usage(
9454 ca->cpuacct_data);
9455 }
9456
9457 return totalpower;
9458}
9459
9365static struct cftype files[] = { 9460static struct cftype files[] = {
9366 { 9461 {
9367 .name = "usage", 9462 .name = "usage",
@@ -9376,6 +9471,14 @@ static struct cftype files[] = {
9376 .name = "stat", 9471 .name = "stat",
9377 .read_map = cpuacct_stats_show, 9472 .read_map = cpuacct_stats_show,
9378 }, 9473 },
9474 {
9475 .name = "cpufreq",
9476 .read_map = cpuacct_cpufreq_show,
9477 },
9478 {
9479 .name = "power",
9480 .read_u64 = cpuacct_powerusage_read
9481 },
9379}; 9482};
9380 9483
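For illustration, a small userspace reader for the new per-cgroup "cpuacct.power" file exported above (the cgroup mount point is an assumption; it depends on where the cpuacct controller is mounted):

#include <stdio.h>

int main(void)
{
	/* assumed mount point for the cpuacct controller */
	FILE *f = fopen("/sys/fs/cgroup/cpuacct/cpuacct.power", "r");
	unsigned long long mws;

	if (!f)
		return 1;
	if (fscanf(f, "%llu", &mws) == 1)
		printf("total power usage: %llu mW*s\n", mws);
	fclose(f);
	return 0;
}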
9381static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 9484static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9405,6 +9508,10 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9405 for (; ca; ca = ca->parent) { 9508 for (; ca; ca = ca->parent) {
9406 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 9509 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9407 *cpuusage += cputime; 9510 *cpuusage += cputime;
9511
9512 /* Call back into platform code to account for CPU speeds */
9513 if (ca->cpufreq_fn && ca->cpufreq_fn->charge)
9514 ca->cpufreq_fn->charge(ca->cpuacct_data, cputime, cpu);
9408 } 9515 }
9409 9516
9410 rcu_read_unlock(); 9517 rcu_read_unlock();
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 05577055cfc..c2f0e7248dc 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -13,6 +13,7 @@ struct autogroup {
13 int nice; 13 int nice;
14}; 14};
15 15
16static inline bool task_group_is_autogroup(struct task_group *tg);
16static inline struct task_group * 17static inline struct task_group *
17autogroup_task_group(struct task_struct *p, struct task_group *tg); 18autogroup_task_group(struct task_struct *p, struct task_group *tg);
18 19
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 334eb474af9..22999b257ad 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -135,14 +135,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
135 return grp->my_q; 135 return grp->my_q;
136} 136}
137 137
138/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
139 * another cpu ('this_cpu')
140 */
141static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
142{
143 return cfs_rq->tg->cfs_rq[this_cpu];
144}
145
146static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 138static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
147{ 139{
148 if (!cfs_rq->on_list) { 140 if (!cfs_rq->on_list) {
@@ -271,11 +263,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
271 return NULL; 263 return NULL;
272} 264}
273 265
274static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
275{
276 return &cpu_rq(this_cpu)->cfs;
277}
278
279static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 266static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
280{ 267{
281} 268}
@@ -334,11 +321,6 @@ static inline int entity_before(struct sched_entity *a,
334 return (s64)(a->vruntime - b->vruntime) < 0; 321 return (s64)(a->vruntime - b->vruntime) < 0;
335} 322}
336 323
337static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
338{
339 return se->vruntime - cfs_rq->min_vruntime;
340}
341
342static void update_min_vruntime(struct cfs_rq *cfs_rq) 324static void update_min_vruntime(struct cfs_rq *cfs_rq)
343{ 325{
344 u64 vruntime = cfs_rq->min_vruntime; 326 u64 vruntime = cfs_rq->min_vruntime;
@@ -372,7 +354,6 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
372 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; 354 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
373 struct rb_node *parent = NULL; 355 struct rb_node *parent = NULL;
374 struct sched_entity *entry; 356 struct sched_entity *entry;
375 s64 key = entity_key(cfs_rq, se);
376 int leftmost = 1; 357 int leftmost = 1;
377 358
378 /* 359 /*
@@ -385,7 +366,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
385 * We dont care about collisions. Nodes with 366 * We dont care about collisions. Nodes with
386 * the same key stay together. 367 * the same key stay together.
387 */ 368 */
388 if (key < entity_key(cfs_rq, entry)) { 369 if (entity_before(se, entry)) {
389 link = &parent->rb_left; 370 link = &parent->rb_left;
390 } else { 371 } else {
391 link = &parent->rb_right; 372 link = &parent->rb_right;
@@ -1336,7 +1317,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1336 } 1317 }
1337 1318
1338 for_each_sched_entity(se) { 1319 for_each_sched_entity(se) {
1339 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1320 cfs_rq = cfs_rq_of(se);
1340 1321
1341 update_cfs_load(cfs_rq, 0); 1322 update_cfs_load(cfs_rq, 0);
1342 update_cfs_shares(cfs_rq); 1323 update_cfs_shares(cfs_rq);
@@ -1370,13 +1351,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1370 */ 1351 */
1371 if (task_sleep && parent_entity(se)) 1352 if (task_sleep && parent_entity(se))
1372 set_next_buddy(parent_entity(se)); 1353 set_next_buddy(parent_entity(se));
1354
1355 /* avoid re-evaluating load for this entity */
1356 se = parent_entity(se);
1373 break; 1357 break;
1374 } 1358 }
1375 flags |= DEQUEUE_SLEEP; 1359 flags |= DEQUEUE_SLEEP;
1376 } 1360 }
1377 1361
1378 for_each_sched_entity(se) { 1362 for_each_sched_entity(se) {
1379 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1363 cfs_rq = cfs_rq_of(se);
1380 1364
1381 update_cfs_load(cfs_rq, 0); 1365 update_cfs_load(cfs_rq, 0);
1382 update_cfs_shares(cfs_rq); 1366 update_cfs_shares(cfs_rq);
@@ -1481,7 +1465,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1481 * effect of the currently running task from the load 1465 * effect of the currently running task from the load
1482 * of the current CPU: 1466 * of the current CPU:
1483 */ 1467 */
1484 rcu_read_lock();
1485 if (sync) { 1468 if (sync) {
1486 tg = task_group(current); 1469 tg = task_group(current);
1487 weight = current->se.load.weight; 1470 weight = current->se.load.weight;
@@ -1517,7 +1500,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1517 balanced = this_eff_load <= prev_eff_load; 1500 balanced = this_eff_load <= prev_eff_load;
1518 } else 1501 } else
1519 balanced = true; 1502 balanced = true;
1520 rcu_read_unlock();
1521 1503
1522 /* 1504 /*
1523 * If the currently running task will sleep within 1505 * If the currently running task will sleep within
@@ -1924,8 +1906,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1924 if (!sched_feat(WAKEUP_PREEMPT)) 1906 if (!sched_feat(WAKEUP_PREEMPT))
1925 return; 1907 return;
1926 1908
1927 update_curr(cfs_rq);
1928 find_matching_se(&se, &pse); 1909 find_matching_se(&se, &pse);
1910 update_curr(cfs_rq_of(se));
1929 BUG_ON(!pse); 1911 BUG_ON(!pse);
1930 if (wakeup_preempt_entity(se, pse) == 1) { 1912 if (wakeup_preempt_entity(se, pse) == 1) {
1931 /* 1913 /*
@@ -2234,11 +2216,43 @@ static void update_shares(int cpu)
2234 struct rq *rq = cpu_rq(cpu); 2216 struct rq *rq = cpu_rq(cpu);
2235 2217
2236 rcu_read_lock(); 2218 rcu_read_lock();
2219 /*
2220 * Iterates the task_group tree in a bottom up fashion, see
2221 * list_add_leaf_cfs_rq() for details.
2222 */
2237 for_each_leaf_cfs_rq(rq, cfs_rq) 2223 for_each_leaf_cfs_rq(rq, cfs_rq)
2238 update_shares_cpu(cfs_rq->tg, cpu); 2224 update_shares_cpu(cfs_rq->tg, cpu);
2239 rcu_read_unlock(); 2225 rcu_read_unlock();
2240} 2226}
2241 2227
2228/*
2229 * Compute the cpu's hierarchical load factor for each task group.
2230 * This needs to be done in a top-down fashion because the load of a child
2232 * group is a fraction of its parent's load.
2232 */
2233static int tg_load_down(struct task_group *tg, void *data)
2234{
2235 unsigned long load;
2236 long cpu = (long)data;
2237
2238 if (!tg->parent) {
2239 load = cpu_rq(cpu)->load.weight;
2240 } else {
2241 load = tg->parent->cfs_rq[cpu]->h_load;
2242 load *= tg->se[cpu]->load.weight;
2243 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
2244 }
2245
2246 tg->cfs_rq[cpu]->h_load = load;
2247
2248 return 0;
2249}
2250
2251static void update_h_load(long cpu)
2252{
2253 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
2254}
2255
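For a concrete feel of the top-down factor computed by tg_load_down(), here is a small standalone calculation (plain C, not kernel code) with made-up weights: a child group's h_load is the parent's h_load scaled by the group entity's share of the parent runqueue's weight, with the "+ 1" guarding against a zero divisor.

#include <stdio.h>

int main(void)
{
	unsigned long root_rq_weight = 3072;	/* cpu_rq(cpu)->load.weight (assumed) */
	unsigned long tg_se_weight = 1024;	/* tg->se[cpu]->load.weight (assumed) */

	unsigned long root_h_load = root_rq_weight;	/* the !tg->parent case */
	unsigned long child_h_load =
		root_h_load * tg_se_weight / (root_rq_weight + 1);

	printf("root h_load = %lu, child h_load = %lu\n",
	       root_h_load, child_h_load);
	return 0;
}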
2242static unsigned long 2256static unsigned long
2243load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2257load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2244 unsigned long max_load_move, 2258 unsigned long max_load_move,
@@ -2246,14 +2260,12 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2246 int *all_pinned) 2260 int *all_pinned)
2247{ 2261{
2248 long rem_load_move = max_load_move; 2262 long rem_load_move = max_load_move;
2249 int busiest_cpu = cpu_of(busiest); 2263 struct cfs_rq *busiest_cfs_rq;
2250 struct task_group *tg;
2251 2264
2252 rcu_read_lock(); 2265 rcu_read_lock();
2253 update_h_load(busiest_cpu); 2266 update_h_load(cpu_of(busiest));
2254 2267
2255 list_for_each_entry_rcu(tg, &task_groups, list) { 2268 for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
2256 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
2257 unsigned long busiest_h_load = busiest_cfs_rq->h_load; 2269 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
2258 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 2270 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
2259 u64 rem_load, moved_load; 2271 u64 rem_load, moved_load;
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 1e7066d76c2..2e74677cb04 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,9 +61,9 @@ SCHED_FEAT(LB_BIAS, 1)
61SCHED_FEAT(OWNER_SPIN, 1) 61SCHED_FEAT(OWNER_SPIN, 1)
62 62
63/* 63/*
64 * Decrement CPU power based on irq activity 64 * Decrement CPU power based on time not spent running tasks
65 */ 65 */
66SCHED_FEAT(NONIRQ_POWER, 1) 66SCHED_FEAT(NONTASK_POWER, 1)
67 67
68/* 68/*
69 * Queue remote wakeups on the target CPU and process them 69 * Queue remote wakeups on the target CPU and process them
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index db04161fe37..b827550a0d0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -187,11 +187,23 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
187 187
188typedef struct task_group *rt_rq_iter_t; 188typedef struct task_group *rt_rq_iter_t;
189 189
190#define for_each_rt_rq(rt_rq, iter, rq) \ 190static inline struct task_group *next_task_group(struct task_group *tg)
191 for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \ 191{
192 (&iter->list != &task_groups) && \ 192 do {
193 (rt_rq = iter->rt_rq[cpu_of(rq)]); \ 193 tg = list_entry_rcu(tg->list.next,
194 iter = list_entry_rcu(iter->list.next, typeof(*iter), list)) 194 typeof(struct task_group), list);
195 } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
196
197 if (&tg->list == &task_groups)
198 tg = NULL;
199
200 return tg;
201}
202
203#define for_each_rt_rq(rt_rq, iter, rq) \
204 for (iter = container_of(&task_groups, typeof(*iter), list); \
205 (iter = next_task_group(iter)) && \
206 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
195 207
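A compact userspace analogue (assumptions only) of the reworked iterator: start from the list head, advance with a "next" helper that skips entries failing a predicate (autogroups in the kernel case), and stop once the walk wraps back around to the head.

#include <stdio.h>
#include <stddef.h>

struct node {
	struct node *next;
	int is_autogroup;	/* stands in for task_group_is_autogroup() */
	int id;
};

/* advance to the next node that is not an autogroup, or NULL at the end */
static struct node *next_node(struct node *head, struct node *n)
{
	do {
		n = n->next;
	} while (n != head && n->is_autogroup);

	return n == head ? NULL : n;
}

#define for_each_node(n, head) \
	for ((n) = (head); ((n) = next_node((head), (n)));)

int main(void)
{
	struct node head = { 0 };
	struct node a = { .id = 1 }, b = { .id = 2, .is_autogroup = 1 }, c = { .id = 3 };
	struct node *n;

	head.next = &a; a.next = &b; b.next = &c; c.next = &head;	/* circular list */

	for_each_node(n, &head)
		printf("visit %d\n", n->id);	/* prints 1 then 3, skipping the autogroup */
	return 0;
}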
196static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 208static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
197{ 209{
@@ -1045,7 +1057,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1045 */ 1057 */
1046 if (curr && unlikely(rt_task(curr)) && 1058 if (curr && unlikely(rt_task(curr)) &&
1047 (curr->rt.nr_cpus_allowed < 2 || 1059 (curr->rt.nr_cpus_allowed < 2 ||
1048 curr->prio < p->prio) && 1060 curr->prio <= p->prio) &&
1049 (p->rt.nr_cpus_allowed > 1)) { 1061 (p->rt.nr_cpus_allowed > 1)) {
1050 int target = find_lowest_rq(p); 1062 int target = find_lowest_rq(p);
1051 1063
@@ -1133,7 +1145,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1133 1145
1134 rt_rq = &rq->rt; 1146 rt_rq = &rq->rt;
1135 1147
1136 if (unlikely(!rt_rq->rt_nr_running)) 1148 if (!rt_rq->rt_nr_running)
1137 return NULL; 1149 return NULL;
1138 1150
1139 if (rt_rq_throttled(rt_rq)) 1151 if (rt_rq_throttled(rt_rq))
@@ -1555,7 +1567,7 @@ skip:
1555static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) 1567static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1556{ 1568{
1557 /* Try to pull RT tasks here if we lower this rq's prio */ 1569 /* Try to pull RT tasks here if we lower this rq's prio */
1558 if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) 1570 if (rq->rt.highest_prio.curr > prev->prio)
1559 pull_rt_task(rq); 1571 pull_rt_task(rq);
1560} 1572}
1561 1573
@@ -1576,7 +1588,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1576 p->rt.nr_cpus_allowed > 1 && 1588 p->rt.nr_cpus_allowed > 1 &&
1577 rt_task(rq->curr) && 1589 rt_task(rq->curr) &&
1578 (rq->curr->rt.nr_cpus_allowed < 2 || 1590 (rq->curr->rt.nr_cpus_allowed < 2 ||
1579 rq->curr->prio < p->prio)) 1591 rq->curr->prio <= p->prio))
1580 push_rt_tasks(rq); 1592 push_rt_tasks(rq);
1581} 1593}
1582 1594
diff --git a/kernel/signal.c b/kernel/signal.c
index 415d85d6f6c..195331c56ad 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -87,7 +87,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
87 /* 87 /*
88 * Tracers may want to know about even ignored signals. 88 * Tracers may want to know about even ignored signals.
89 */ 89 */
90 return !tracehook_consider_ignored_signal(t, sig); 90 return !t->ptrace;
91} 91}
92 92
93/* 93/*
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
124 124
125static int recalc_sigpending_tsk(struct task_struct *t) 125static int recalc_sigpending_tsk(struct task_struct *t)
126{ 126{
127 if ((t->group_stop & GROUP_STOP_PENDING) || 127 if ((t->jobctl & JOBCTL_PENDING_MASK) ||
128 PENDING(&t->pending, &t->blocked) || 128 PENDING(&t->pending, &t->blocked) ||
129 PENDING(&t->signal->shared_pending, &t->blocked)) { 129 PENDING(&t->signal->shared_pending, &t->blocked)) {
130 set_tsk_thread_flag(t, TIF_SIGPENDING); 130 set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -150,9 +150,7 @@ void recalc_sigpending_and_wake(struct task_struct *t)
150 150
151void recalc_sigpending(void) 151void recalc_sigpending(void)
152{ 152{
153 if (unlikely(tracehook_force_sigpending())) 153 if (!recalc_sigpending_tsk(current) && !freezing(current))
154 set_thread_flag(TIF_SIGPENDING);
155 else if (!recalc_sigpending_tsk(current) && !freezing(current))
156 clear_thread_flag(TIF_SIGPENDING); 154 clear_thread_flag(TIF_SIGPENDING);
157 155
158} 156}
@@ -224,47 +222,93 @@ static inline void print_dropped_signal(int sig)
224} 222}
225 223
226/** 224/**
227 * task_clear_group_stop_trapping - clear group stop trapping bit 225 * task_set_jobctl_pending - set jobctl pending bits
228 * @task: target task 226 * @task: target task
227 * @mask: pending bits to set
229 * 228 *
230 * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it 229 * Set @mask bits in @task->jobctl. @mask must be subset of
231 * and wake up the ptracer. Note that we don't need any further locking. 230 * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
232 * @task->siglock guarantees that @task->parent points to the ptracer. 231 * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is
232 * cleared. If @task is already being killed or exiting, this function
233 * becomes noop.
234 *
235 * CONTEXT:
236 * Must be called with @task->sighand->siglock held.
237 *
238 * RETURNS:
239 * %true if @mask is set, %false if made noop because @task was dying.
240 */
241bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
242{
243 BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
244 JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
245 BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
246
247 if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
248 return false;
249
250 if (mask & JOBCTL_STOP_SIGMASK)
251 task->jobctl &= ~JOBCTL_STOP_SIGMASK;
252
253 task->jobctl |= mask;
254 return true;
255}
256
257/**
258 * task_clear_jobctl_trapping - clear jobctl trapping bit
259 * @task: target task
260 *
261 * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED.
262 * Clear it and wake up the ptracer. Note that we don't need any further
263 * locking. @task->siglock guarantees that @task->parent points to the
264 * ptracer.
233 * 265 *
234 * CONTEXT: 266 * CONTEXT:
235 * Must be called with @task->sighand->siglock held. 267 * Must be called with @task->sighand->siglock held.
236 */ 268 */
237static void task_clear_group_stop_trapping(struct task_struct *task) 269void task_clear_jobctl_trapping(struct task_struct *task)
238{ 270{
239 if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) { 271 if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
240 task->group_stop &= ~GROUP_STOP_TRAPPING; 272 task->jobctl &= ~JOBCTL_TRAPPING;
241 __wake_up_sync_key(&task->parent->signal->wait_chldexit, 273 wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
242 TASK_UNINTERRUPTIBLE, 1, task);
243 } 274 }
244} 275}
245 276
246/** 277/**
247 * task_clear_group_stop_pending - clear pending group stop 278 * task_clear_jobctl_pending - clear jobctl pending bits
248 * @task: target task 279 * @task: target task
280 * @mask: pending bits to clear
249 * 281 *
250 * Clear group stop states for @task. 282 * Clear @mask from @task->jobctl. @mask must be subset of
283 * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other
284 * STOP bits are cleared together.
285 *
286 * If clearing of @mask leaves no stop or trap pending, this function calls
287 * task_clear_jobctl_trapping().
251 * 288 *
252 * CONTEXT: 289 * CONTEXT:
253 * Must be called with @task->sighand->siglock held. 290 * Must be called with @task->sighand->siglock held.
254 */ 291 */
255void task_clear_group_stop_pending(struct task_struct *task) 292void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
256{ 293{
257 task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME | 294 BUG_ON(mask & ~JOBCTL_PENDING_MASK);
258 GROUP_STOP_DEQUEUED); 295
296 if (mask & JOBCTL_STOP_PENDING)
297 mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;
298
299 task->jobctl &= ~mask;
300
301 if (!(task->jobctl & JOBCTL_PENDING_MASK))
302 task_clear_jobctl_trapping(task);
259} 303}
260 304
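A toy userspace model of the signo-replacement behaviour documented above (bit values assumed; the real JOBCTL_* constants live in include/linux/sched.h): the low bits of ->jobctl carry the pending stop signal number, so setting a new one clears the old signo first.

#include <stdio.h>

#define JOBCTL_STOP_SIGMASK	0xffffU		/* assumed: signr of the pending stop */
#define JOBCTL_STOP_PENDING	(1U << 17)	/* assumed flag bit */

static unsigned int set_jobctl_pending(unsigned int jobctl, unsigned int mask)
{
	if (mask & JOBCTL_STOP_SIGMASK)
		jobctl &= ~JOBCTL_STOP_SIGMASK;	/* drop any previously recorded signo */
	return jobctl | mask;
}

int main(void)
{
	unsigned int jobctl = 0;

	jobctl = set_jobctl_pending(jobctl, 20 | JOBCTL_STOP_PENDING);	/* SIGTSTP queued */
	jobctl = set_jobctl_pending(jobctl, 19 | JOBCTL_STOP_PENDING);	/* SIGSTOP replaces it */

	printf("recorded signo=%u, stop pending=%u\n",
	       jobctl & JOBCTL_STOP_SIGMASK, !!(jobctl & JOBCTL_STOP_PENDING));
	return 0;
}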
261/** 305/**
262 * task_participate_group_stop - participate in a group stop 306 * task_participate_group_stop - participate in a group stop
263 * @task: task participating in a group stop 307 * @task: task participating in a group stop
264 * 308 *
265 * @task has GROUP_STOP_PENDING set and is participating in a group stop. 309 * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
266 * Group stop states are cleared and the group stop count is consumed if 310 * Group stop states are cleared and the group stop count is consumed if
267 * %GROUP_STOP_CONSUME was set. If the consumption completes the group 311 * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group
268 * stop, the appropriate %SIGNAL_* flags are set. 312 * stop, the appropriate %SIGNAL_* flags are set.
269 * 313 *
270 * CONTEXT: 314 * CONTEXT:
@@ -277,11 +321,11 @@ void task_clear_group_stop_pending(struct task_struct *task)
277static bool task_participate_group_stop(struct task_struct *task) 321static bool task_participate_group_stop(struct task_struct *task)
278{ 322{
279 struct signal_struct *sig = task->signal; 323 struct signal_struct *sig = task->signal;
280 bool consume = task->group_stop & GROUP_STOP_CONSUME; 324 bool consume = task->jobctl & JOBCTL_STOP_CONSUME;
281 325
282 WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING)); 326 WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));
283 327
284 task_clear_group_stop_pending(task); 328 task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);
285 329
286 if (!consume) 330 if (!consume)
287 return false; 331 return false;
@@ -449,7 +493,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
449 return 1; 493 return 1;
450 if (handler != SIG_IGN && handler != SIG_DFL) 494 if (handler != SIG_IGN && handler != SIG_DFL)
451 return 0; 495 return 0;
452 return !tracehook_consider_fatal_signal(tsk, sig); 496 /* if ptraced, let the tracer determine */
497 return !tsk->ptrace;
453} 498}
454 499
455/* 500/*
@@ -604,7 +649,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
604 * is to alert stop-signal processing code when another 649 * is to alert stop-signal processing code when another
605 * processor has come along and cleared the flag. 650 * processor has come along and cleared the flag.
606 */ 651 */
607 current->group_stop |= GROUP_STOP_DEQUEUED; 652 current->jobctl |= JOBCTL_STOP_DEQUEUED;
608 } 653 }
609 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { 654 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
610 /* 655 /*
@@ -773,6 +818,32 @@ static int check_kill_permission(int sig, struct siginfo *info,
773 return security_task_kill(t, info, sig, 0); 818 return security_task_kill(t, info, sig, 0);
774} 819}
775 820
821/**
822 * ptrace_trap_notify - schedule trap to notify ptracer
823 * @t: tracee wanting to notify tracer
824 *
825 * This function schedules sticky ptrace trap which is cleared on the next
826 * TRAP_STOP to notify ptracer of an event. @t must have been seized by
827 * ptracer.
828 *
829 * If @t is running, STOP trap will be taken. If trapped for STOP and
830 * ptracer is listening for events, tracee is woken up so that it can
831 * re-trap for the new event. If trapped otherwise, STOP trap will be
832 * eventually taken without returning to userland after the existing traps
833 * are finished by PTRACE_CONT.
834 *
835 * CONTEXT:
836 * Must be called with @task->sighand->siglock held.
837 */
838static void ptrace_trap_notify(struct task_struct *t)
839{
840 WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
841 assert_spin_locked(&t->sighand->siglock);
842
843 task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
844 signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
845}
846
776/* 847/*
777 * Handle magic process-wide effects of stop/continue signals. Unlike 848 * Handle magic process-wide effects of stop/continue signals. Unlike
778 * the signal actions, these happen immediately at signal-generation 849 * the signal actions, these happen immediately at signal-generation
@@ -809,9 +880,12 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
809 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); 880 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
810 t = p; 881 t = p;
811 do { 882 do {
812 task_clear_group_stop_pending(t); 883 task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
813 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); 884 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
814 wake_up_state(t, __TASK_STOPPED); 885 if (likely(!(t->ptrace & PT_SEIZED)))
886 wake_up_state(t, __TASK_STOPPED);
887 else
888 ptrace_trap_notify(t);
815 } while_each_thread(p, t); 889 } while_each_thread(p, t);
816 890
817 /* 891 /*
@@ -908,8 +982,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
908 if (sig_fatal(p, sig) && 982 if (sig_fatal(p, sig) &&
909 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && 983 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
910 !sigismember(&t->real_blocked, sig) && 984 !sigismember(&t->real_blocked, sig) &&
911 (sig == SIGKILL || 985 (sig == SIGKILL || !t->ptrace)) {
912 !tracehook_consider_fatal_signal(t, sig))) {
913 /* 986 /*
914 * This signal will be fatal to the whole group. 987 * This signal will be fatal to the whole group.
915 */ 988 */
@@ -925,7 +998,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
925 signal->group_stop_count = 0; 998 signal->group_stop_count = 0;
926 t = p; 999 t = p;
927 do { 1000 do {
928 task_clear_group_stop_pending(t); 1001 task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
929 sigaddset(&t->pending.signal, SIGKILL); 1002 sigaddset(&t->pending.signal, SIGKILL);
930 signal_wake_up(t, 1); 1003 signal_wake_up(t, 1);
931 } while_each_thread(p, t); 1004 } while_each_thread(p, t);
@@ -1160,7 +1233,7 @@ int zap_other_threads(struct task_struct *p)
1160 p->signal->group_stop_count = 0; 1233 p->signal->group_stop_count = 0;
1161 1234
1162 while_each_thread(p, t) { 1235 while_each_thread(p, t) {
1163 task_clear_group_stop_pending(t); 1236 task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
1164 count++; 1237 count++;
1165 1238
1166 /* Don't bother with already dead threads */ 1239 /* Don't bother with already dead threads */
@@ -1511,22 +1584,22 @@ ret:
1511 * Let a parent know about the death of a child. 1584 * Let a parent know about the death of a child.
1512 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1585 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1513 * 1586 *
1514 * Returns -1 if our parent ignored us and so we've switched to 1587 * Returns true if our parent ignored us and so we've switched to
1515 * self-reaping, or else @sig. 1588 * self-reaping.
1516 */ 1589 */
1517int do_notify_parent(struct task_struct *tsk, int sig) 1590bool do_notify_parent(struct task_struct *tsk, int sig)
1518{ 1591{
1519 struct siginfo info; 1592 struct siginfo info;
1520 unsigned long flags; 1593 unsigned long flags;
1521 struct sighand_struct *psig; 1594 struct sighand_struct *psig;
1522 int ret = sig; 1595 bool autoreap = false;
1523 1596
1524 BUG_ON(sig == -1); 1597 BUG_ON(sig == -1);
1525 1598
1526 /* do_notify_parent_cldstop should have been called instead. */ 1599 /* do_notify_parent_cldstop should have been called instead. */
1527 BUG_ON(task_is_stopped_or_traced(tsk)); 1600 BUG_ON(task_is_stopped_or_traced(tsk));
1528 1601
1529 BUG_ON(!task_ptrace(tsk) && 1602 BUG_ON(!tsk->ptrace &&
1530 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1603 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1531 1604
1532 info.si_signo = sig; 1605 info.si_signo = sig;
@@ -1565,7 +1638,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1565 1638
1566 psig = tsk->parent->sighand; 1639 psig = tsk->parent->sighand;
1567 spin_lock_irqsave(&psig->siglock, flags); 1640 spin_lock_irqsave(&psig->siglock, flags);
1568 if (!task_ptrace(tsk) && sig == SIGCHLD && 1641 if (!tsk->ptrace && sig == SIGCHLD &&
1569 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || 1642 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
1570 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { 1643 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
1571 /* 1644 /*
@@ -1583,16 +1656,16 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1583 * is implementation-defined: we do (if you don't want 1656 * is implementation-defined: we do (if you don't want
1584 * it, just use SIG_IGN instead). 1657 * it, just use SIG_IGN instead).
1585 */ 1658 */
1586 ret = tsk->exit_signal = -1; 1659 autoreap = true;
1587 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) 1660 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
1588 sig = -1; 1661 sig = 0;
1589 } 1662 }
1590 if (valid_signal(sig) && sig > 0) 1663 if (valid_signal(sig) && sig)
1591 __group_send_sig_info(sig, &info, tsk->parent); 1664 __group_send_sig_info(sig, &info, tsk->parent);
1592 __wake_up_parent(tsk, tsk->parent); 1665 __wake_up_parent(tsk, tsk->parent);
1593 spin_unlock_irqrestore(&psig->siglock, flags); 1666 spin_unlock_irqrestore(&psig->siglock, flags);
1594 1667
1595 return ret; 1668 return autoreap;
1596} 1669}
1597 1670
1598/** 1671/**
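The conversion from an int return to a bool changes what callers of do_notify_parent() act on. A minimal caller sketch under the new convention (illustrative of the exit path, not quoted from this diff):

	/* Decide between self-reaping and leaving a zombie based on the
	 * boolean result; release_task() immediately when nobody waits. */
	bool autoreap = do_notify_parent(tsk, tsk->exit_signal);

	tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
	if (autoreap)
		release_task(tsk);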
@@ -1665,7 +1738,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1665 1738
1666static inline int may_ptrace_stop(void) 1739static inline int may_ptrace_stop(void)
1667{ 1740{
1668 if (!likely(task_ptrace(current))) 1741 if (!likely(current->ptrace))
1669 return 0; 1742 return 0;
1670 /* 1743 /*
1671 * Are we in the middle of do_coredump? 1744 * Are we in the middle of do_coredump?
@@ -1694,15 +1767,6 @@ static int sigkill_pending(struct task_struct *tsk)
1694} 1767}
1695 1768
1696/* 1769/*
1697 * Test whether the target task of the usual cldstop notification - the
1698 * real_parent of @child - is in the same group as the ptracer.
1699 */
1700static bool real_parent_is_ptracer(struct task_struct *child)
1701{
1702 return same_thread_group(child->parent, child->real_parent);
1703}
1704
1705/*
1706 * This must be called with current->sighand->siglock held. 1770 * This must be called with current->sighand->siglock held.
1707 * 1771 *
1708 * This should be the path for all ptrace stops. 1772 * This should be the path for all ptrace stops.
@@ -1739,31 +1803,34 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1739 } 1803 }
1740 1804
1741 /* 1805 /*
1742 * If @why is CLD_STOPPED, we're trapping to participate in a group 1806 * We're committing to trapping. TRACED should be visible before
 1743 * stop. Do the bookkeeping. Note that if SIGCONT was delivered 1807 * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
1744 * while siglock was released for the arch hook, PENDING could be 1808 * Also, transition to TRACED and updates to ->jobctl should be
1745 * clear now. We act as if SIGCONT is received after TASK_TRACED 1809 * atomic with respect to siglock and should be done after the arch
1746 * is entered - ignore it. 1810 * hook as siglock is released and regrabbed across it.
1747 */ 1811 */
1748 if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING)) 1812 set_current_state(TASK_TRACED);
1749 gstop_done = task_participate_group_stop(current);
1750 1813
1751 current->last_siginfo = info; 1814 current->last_siginfo = info;
1752 current->exit_code = exit_code; 1815 current->exit_code = exit_code;
1753 1816
1754 /* 1817 /*
1755 * TRACED should be visible before TRAPPING is cleared; otherwise, 1818 * If @why is CLD_STOPPED, we're trapping to participate in a group
1756 * the tracer might fail do_wait(). 1819 * stop. Do the bookkeeping. Note that if SIGCONT was delievered
1820 * across siglock relocks since INTERRUPT was scheduled, PENDING
1821 * could be clear now. We act as if SIGCONT is received after
1822 * TASK_TRACED is entered - ignore it.
1757 */ 1823 */
1758 set_current_state(TASK_TRACED); 1824 if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
1825 gstop_done = task_participate_group_stop(current);
1759 1826
1760 /* 1827 /* any trap clears pending STOP trap, STOP trap clears NOTIFY */
1761 * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and 1828 task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
1762 * transition to TASK_TRACED should be atomic with respect to 1829 if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
 1763 * siglock. This should be done after the arch hook as siglock is 1830 task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);
1764 * released and regrabbed across it. 1831
1765 */ 1832 /* entering a trap, clear TRAPPING */
1766 task_clear_group_stop_trapping(current); 1833 task_clear_jobctl_trapping(current);
1767 1834
1768 spin_unlock_irq(&current->sighand->siglock); 1835 spin_unlock_irq(&current->sighand->siglock);
1769 read_lock(&tasklist_lock); 1836 read_lock(&tasklist_lock);
@@ -1779,7 +1846,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1779 * separately unless they're gonna be duplicates. 1846 * separately unless they're gonna be duplicates.
1780 */ 1847 */
1781 do_notify_parent_cldstop(current, true, why); 1848 do_notify_parent_cldstop(current, true, why);
1782 if (gstop_done && !real_parent_is_ptracer(current)) 1849 if (gstop_done && ptrace_reparented(current))
1783 do_notify_parent_cldstop(current, false, why); 1850 do_notify_parent_cldstop(current, false, why);
1784 1851
1785 /* 1852 /*
@@ -1799,9 +1866,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1799 * 1866 *
1800 * If @gstop_done, the ptracer went away between group stop 1867 * If @gstop_done, the ptracer went away between group stop
1801 * completion and here. During detach, it would have set 1868 * completion and here. During detach, it would have set
1802 * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED 1869 * JOBCTL_STOP_PENDING on us and we'll re-enter
1803 * in do_signal_stop() on return, so notifying the real 1870 * TASK_STOPPED in do_signal_stop() on return, so notifying
1804 * parent of the group stop completion is enough. 1871 * the real parent of the group stop completion is enough.
1805 */ 1872 */
1806 if (gstop_done) 1873 if (gstop_done)
1807 do_notify_parent_cldstop(current, false, why); 1874 do_notify_parent_cldstop(current, false, why);
@@ -1827,6 +1894,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1827 spin_lock_irq(&current->sighand->siglock); 1894 spin_lock_irq(&current->sighand->siglock);
1828 current->last_siginfo = NULL; 1895 current->last_siginfo = NULL;
1829 1896
1897 /* LISTENING can be set only during STOP traps, clear it */
1898 current->jobctl &= ~JOBCTL_LISTENING;
1899
1830 /* 1900 /*
1831 * Queued signals ignored us while we were stopped for tracing. 1901 * Queued signals ignored us while we were stopped for tracing.
1832 * So check for any that we should take before resuming user mode. 1902 * So check for any that we should take before resuming user mode.
@@ -1835,44 +1905,66 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1835 recalc_sigpending_tsk(current); 1905 recalc_sigpending_tsk(current);
1836} 1906}
1837 1907
1838void ptrace_notify(int exit_code) 1908static void ptrace_do_notify(int signr, int exit_code, int why)
1839{ 1909{
1840 siginfo_t info; 1910 siginfo_t info;
1841 1911
1842 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1843
1844 memset(&info, 0, sizeof info); 1912 memset(&info, 0, sizeof info);
1845 info.si_signo = SIGTRAP; 1913 info.si_signo = signr;
1846 info.si_code = exit_code; 1914 info.si_code = exit_code;
1847 info.si_pid = task_pid_vnr(current); 1915 info.si_pid = task_pid_vnr(current);
1848 info.si_uid = current_uid(); 1916 info.si_uid = current_uid();
1849 1917
1850 /* Let the debugger run. */ 1918 /* Let the debugger run. */
1919 ptrace_stop(exit_code, why, 1, &info);
1920}
1921
1922void ptrace_notify(int exit_code)
1923{
1924 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1925
1851 spin_lock_irq(&current->sighand->siglock); 1926 spin_lock_irq(&current->sighand->siglock);
1852 ptrace_stop(exit_code, CLD_TRAPPED, 1, &info); 1927 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
1853 spin_unlock_irq(&current->sighand->siglock); 1928 spin_unlock_irq(&current->sighand->siglock);
1854} 1929}
1855 1930
1856/* 1931/**
1857 * This performs the stopping for SIGSTOP and other stop signals. 1932 * do_signal_stop - handle group stop for SIGSTOP and other stop signals
1858 * We have to stop all threads in the thread group. 1933 * @signr: signr causing group stop if initiating
1859 * Returns non-zero if we've actually stopped and released the siglock. 1934 *
1860 * Returns zero if we didn't stop and still hold the siglock. 1935 * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
1936 * and participate in it. If already set, participate in the existing
1937 * group stop. If participated in a group stop (and thus slept), %true is
1938 * returned with siglock released.
1939 *
1940 * If ptraced, this function doesn't handle stop itself. Instead,
1941 * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
1942 * untouched. The caller must ensure that INTERRUPT trap handling takes
 1943 * place afterwards.
1944 *
1945 * CONTEXT:
1946 * Must be called with @current->sighand->siglock held, which is released
1947 * on %true return.
1948 *
1949 * RETURNS:
1950 * %false if group stop is already cancelled or ptrace trap is scheduled.
1951 * %true if participated in group stop.
1861 */ 1952 */
1862static int do_signal_stop(int signr) 1953static bool do_signal_stop(int signr)
1954 __releases(&current->sighand->siglock)
1863{ 1955{
1864 struct signal_struct *sig = current->signal; 1956 struct signal_struct *sig = current->signal;
1865 1957
1866 if (!(current->group_stop & GROUP_STOP_PENDING)) { 1958 if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
1867 unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME; 1959 unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
1868 struct task_struct *t; 1960 struct task_struct *t;
1869 1961
1870 /* signr will be recorded in task->group_stop for retries */ 1962 /* signr will be recorded in task->jobctl for retries */
1871 WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK); 1963 WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);
1872 1964
1873 if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) || 1965 if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
1874 unlikely(signal_group_exit(sig))) 1966 unlikely(signal_group_exit(sig)))
1875 return 0; 1967 return false;
1876 /* 1968 /*
1877 * There is no group stop already in progress. We must 1969 * There is no group stop already in progress. We must
1878 * initiate one now. 1970 * initiate one now.
@@ -1894,29 +1986,31 @@ static int do_signal_stop(int signr)
1894 */ 1986 */
1895 if (!(sig->flags & SIGNAL_STOP_STOPPED)) 1987 if (!(sig->flags & SIGNAL_STOP_STOPPED))
1896 sig->group_exit_code = signr; 1988 sig->group_exit_code = signr;
1897 else
1898 WARN_ON_ONCE(!task_ptrace(current));
1899 1989
1900 current->group_stop &= ~GROUP_STOP_SIGMASK; 1990 sig->group_stop_count = 0;
1901 current->group_stop |= signr | gstop; 1991
1902 sig->group_stop_count = 1; 1992 if (task_set_jobctl_pending(current, signr | gstop))
1993 sig->group_stop_count++;
1994
1903 for (t = next_thread(current); t != current; 1995 for (t = next_thread(current); t != current;
1904 t = next_thread(t)) { 1996 t = next_thread(t)) {
1905 t->group_stop &= ~GROUP_STOP_SIGMASK;
1906 /* 1997 /*
1907 * Setting state to TASK_STOPPED for a group 1998 * Setting state to TASK_STOPPED for a group
1908 * stop is always done with the siglock held, 1999 * stop is always done with the siglock held,
1909 * so this check has no races. 2000 * so this check has no races.
1910 */ 2001 */
1911 if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) { 2002 if (!task_is_stopped(t) &&
1912 t->group_stop |= signr | gstop; 2003 task_set_jobctl_pending(t, signr | gstop)) {
1913 sig->group_stop_count++; 2004 sig->group_stop_count++;
1914 signal_wake_up(t, 0); 2005 if (likely(!(t->ptrace & PT_SEIZED)))
2006 signal_wake_up(t, 0);
2007 else
2008 ptrace_trap_notify(t);
1915 } 2009 }
1916 } 2010 }
1917 } 2011 }
1918retry: 2012
1919 if (likely(!task_ptrace(current))) { 2013 if (likely(!current->ptrace)) {
1920 int notify = 0; 2014 int notify = 0;
1921 2015
1922 /* 2016 /*
@@ -1947,43 +2041,65 @@ retry:
1947 2041
1948 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2042 /* Now we don't run again until woken by SIGCONT or SIGKILL */
1949 schedule(); 2043 schedule();
1950 2044 return true;
1951 spin_lock_irq(&current->sighand->siglock);
1952 } else { 2045 } else {
1953 ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK, 2046 /*
1954 CLD_STOPPED, 0, NULL); 2047 * While ptraced, group stop is handled by STOP trap.
1955 current->exit_code = 0; 2048 * Schedule it and let the caller deal with it.
2049 */
2050 task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
2051 return false;
1956 } 2052 }
2053}
1957 2054
1958 /* 2055/**
1959 * GROUP_STOP_PENDING could be set if another group stop has 2056 * do_jobctl_trap - take care of ptrace jobctl traps
1960 * started since being woken up or ptrace wants us to transit 2057 *
1961 * between TASK_STOPPED and TRACED. Retry group stop. 2058 * When PT_SEIZED, it's used for both group stop and explicit
1962 */ 2059 * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with
1963 if (current->group_stop & GROUP_STOP_PENDING) { 2060 * accompanying siginfo. If stopped, lower eight bits of exit_code contain
1964 WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK)); 2061 * the stop signal; otherwise, %SIGTRAP.
1965 goto retry; 2062 *
2063 * When !PT_SEIZED, it's used only for group stop trap with stop signal
2064 * number as exit_code and no siginfo.
2065 *
2066 * CONTEXT:
2067 * Must be called with @current->sighand->siglock held, which may be
2068 * released and re-acquired before returning with intervening sleep.
2069 */
2070static void do_jobctl_trap(void)
2071{
2072 struct signal_struct *signal = current->signal;
2073 int signr = current->jobctl & JOBCTL_STOP_SIGMASK;
2074
2075 if (current->ptrace & PT_SEIZED) {
2076 if (!signal->group_stop_count &&
2077 !(signal->flags & SIGNAL_STOP_STOPPED))
2078 signr = SIGTRAP;
2079 WARN_ON_ONCE(!signr);
2080 ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
2081 CLD_STOPPED);
2082 } else {
2083 WARN_ON_ONCE(!signr);
2084 ptrace_stop(signr, CLD_STOPPED, 0, NULL);
2085 current->exit_code = 0;
1966 } 2086 }
1967
1968 /* PTRACE_ATTACH might have raced with task killing, clear trapping */
1969 task_clear_group_stop_trapping(current);
1970
1971 spin_unlock_irq(&current->sighand->siglock);
1972
1973 tracehook_finish_jctl();
1974
1975 return 1;
1976} 2087}
1977 2088
1978static int ptrace_signal(int signr, siginfo_t *info, 2089static int ptrace_signal(int signr, siginfo_t *info,
1979 struct pt_regs *regs, void *cookie) 2090 struct pt_regs *regs, void *cookie)
1980{ 2091{
1981 if (!task_ptrace(current))
1982 return signr;
1983
1984 ptrace_signal_deliver(regs, cookie); 2092 ptrace_signal_deliver(regs, cookie);
1985 2093 /*
1986 /* Let the debugger run. */ 2094 * We do not check sig_kernel_stop(signr) but set this marker
2095 * unconditionally because we do not know whether debugger will
2096 * change signr. This flag has no meaning unless we are going
2097 * to stop after return from ptrace_stop(). In this case it will
2098 * be checked in do_signal_stop(), we should only stop if it was
2099 * not cleared by SIGCONT while we were sleeping. See also the
2100 * comment in dequeue_signal().
2101 */
2102 current->jobctl |= JOBCTL_STOP_DEQUEUED;
1987 ptrace_stop(signr, CLD_TRAPPED, 0, info); 2103 ptrace_stop(signr, CLD_TRAPPED, 0, info);
1988 2104
1989 /* We're back. Did the debugger cancel the sig? */ 2105 /* We're back. Did the debugger cancel the sig? */
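The GROUP_STOP_* bits in ->group_stop are replaced throughout this hunk by JOBCTL_* bits in ->jobctl. For orientation, the corresponding <linux/sched.h> layout looks roughly like the sketch below (reconstructed from memory; an approximation rather than the authoritative header):

	#define JOBCTL_STOP_SIGMASK	0xffff		/* signr of the last group stop */
	#define JOBCTL_STOP_DEQUEUED	(1 << 16)	/* stop signal dequeued */
	#define JOBCTL_STOP_PENDING	(1 << 17)	/* task should stop for group stop */
	#define JOBCTL_STOP_CONSUME	(1 << 18)	/* consume group stop count */
	#define JOBCTL_TRAP_STOP	(1 << 19)	/* trap for STOP */
	#define JOBCTL_TRAP_NOTIFY	(1 << 20)	/* trap for NOTIFY */
	#define JOBCTL_TRAPPING		(1 << 21)	/* switching to TRACED */
	#define JOBCTL_LISTENING	(1 << 22)	/* ptracer is listening for events */

	#define JOBCTL_TRAP_MASK	(JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
	#define JOBCTL_PENDING_MASK	(JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)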
@@ -2039,7 +2155,6 @@ relock:
2039 * the CLD_ si_code into SIGNAL_CLD_MASK bits. 2155 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
2040 */ 2156 */
2041 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { 2157 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
2042 struct task_struct *leader;
2043 int why; 2158 int why;
2044 2159
2045 if (signal->flags & SIGNAL_CLD_CONTINUED) 2160 if (signal->flags & SIGNAL_CLD_CONTINUED)
@@ -2060,13 +2175,11 @@ relock:
2060 * a duplicate. 2175 * a duplicate.
2061 */ 2176 */
2062 read_lock(&tasklist_lock); 2177 read_lock(&tasklist_lock);
2063
2064 do_notify_parent_cldstop(current, false, why); 2178 do_notify_parent_cldstop(current, false, why);
2065 2179
2066 leader = current->group_leader; 2180 if (ptrace_reparented(current->group_leader))
2067 if (task_ptrace(leader) && !real_parent_is_ptracer(leader)) 2181 do_notify_parent_cldstop(current->group_leader,
2068 do_notify_parent_cldstop(leader, true, why); 2182 true, why);
2069
2070 read_unlock(&tasklist_lock); 2183 read_unlock(&tasklist_lock);
2071 2184
2072 goto relock; 2185 goto relock;
@@ -2074,37 +2187,31 @@ relock:
2074 2187
2075 for (;;) { 2188 for (;;) {
2076 struct k_sigaction *ka; 2189 struct k_sigaction *ka;
2077 /* 2190
2078 * Tracing can induce an artificial signal and choose sigaction. 2191 if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
2079 * The return value in @signr determines the default action, 2192 do_signal_stop(0))
2080 * but @info->si_signo is the signal number we will report.
2081 */
2082 signr = tracehook_get_signal(current, regs, info, return_ka);
2083 if (unlikely(signr < 0))
2084 goto relock; 2193 goto relock;
2085 if (unlikely(signr != 0))
2086 ka = return_ka;
2087 else {
2088 if (unlikely(current->group_stop &
2089 GROUP_STOP_PENDING) && do_signal_stop(0))
2090 goto relock;
2091 2194
2092 signr = dequeue_signal(current, &current->blocked, 2195 if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
2093 info); 2196 do_jobctl_trap();
2197 spin_unlock_irq(&sighand->siglock);
2198 goto relock;
2199 }
2094 2200
2095 if (!signr) 2201 signr = dequeue_signal(current, &current->blocked, info);
2096 break; /* will return 0 */
2097 2202
2098 if (signr != SIGKILL) { 2203 if (!signr)
2099 signr = ptrace_signal(signr, info, 2204 break; /* will return 0 */
2100 regs, cookie);
2101 if (!signr)
2102 continue;
2103 }
2104 2205
2105 ka = &sighand->action[signr-1]; 2206 if (unlikely(current->ptrace) && signr != SIGKILL) {
2207 signr = ptrace_signal(signr, info,
2208 regs, cookie);
2209 if (!signr)
2210 continue;
2106 } 2211 }
2107 2212
2213 ka = &sighand->action[signr-1];
2214
2108 /* Trace actually delivered signals. */ 2215 /* Trace actually delivered signals. */
2109 trace_signal_deliver(signr, info, ka); 2216 trace_signal_deliver(signr, info, ka);
2110 2217
@@ -2260,7 +2367,7 @@ void exit_signals(struct task_struct *tsk)
2260 signotset(&unblocked); 2367 signotset(&unblocked);
2261 retarget_shared_pending(tsk, &unblocked); 2368 retarget_shared_pending(tsk, &unblocked);
2262 2369
2263 if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) && 2370 if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
2264 task_participate_group_stop(tsk)) 2371 task_participate_group_stop(tsk))
2265 group_stop = CLD_STOPPED; 2372 group_stop = CLD_STOPPED;
2266out: 2373out:
@@ -2993,15 +3100,11 @@ SYSCALL_DEFINE0(sgetmask)
2993 3100
2994SYSCALL_DEFINE1(ssetmask, int, newmask) 3101SYSCALL_DEFINE1(ssetmask, int, newmask)
2995{ 3102{
2996 int old; 3103 int old = current->blocked.sig[0];
2997 3104 sigset_t newset;
2998 spin_lock_irq(&current->sighand->siglock);
2999 old = current->blocked.sig[0];
3000 3105
3001 siginitset(&current->blocked, newmask & ~(sigmask(SIGKILL)| 3106 siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP)));
3002 sigmask(SIGSTOP))); 3107 set_current_blocked(&newset);
3003 recalc_sigpending();
3004 spin_unlock_irq(&current->sighand->siglock);
3005 3108
3006 return old; 3109 return old;
3007} 3110}
@@ -3058,11 +3161,8 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
3058 return -EFAULT; 3161 return -EFAULT;
3059 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 3162 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3060 3163
3061 spin_lock_irq(&current->sighand->siglock);
3062 current->saved_sigmask = current->blocked; 3164 current->saved_sigmask = current->blocked;
3063 current->blocked = newset; 3165 set_current_blocked(&newset);
3064 recalc_sigpending();
3065 spin_unlock_irq(&current->sighand->siglock);
3066 3166
3067 current->state = TASK_INTERRUPTIBLE; 3167 current->state = TASK_INTERRUPTIBLE;
3068 schedule(); 3168 schedule();
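sys_ssetmask() and sys_rt_sigsuspend() now route through set_current_blocked() instead of open-coding the siglock sequence. A rough sketch of what that helper centralizes (paraphrased from memory; treat the exact body as an assumption):

	void set_current_blocked(const sigset_t *newset)
	{
		struct task_struct *tsk = current;

		spin_lock_irq(&tsk->sighand->siglock);
		if (signal_pending(tsk) && !thread_group_empty(tsk)) {
			sigset_t newblocked;

			/* signals that become blocked here must be retargeted
			 * to a thread that can still take them */
			sigandnsets(&newblocked, newset, &tsk->blocked);
			retarget_shared_pending(tsk, &newblocked);
		}
		tsk->blocked = *newset;
		recalc_sigpending();
		spin_unlock_irq(&tsk->sighand->siglock);
	}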
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index eb212f8f8bc..d20c6983aad 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -26,12 +26,18 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
26EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
27 27
28/* 28/*
29 * Architectures that do not implement save_stack_trace_tsk get this 29 * Architectures that do not implement save_stack_trace_tsk or
30 * weak alias and a once-per-bootup warning (whenever this facility 30 * save_stack_trace_regs get this weak alias and a once-per-bootup warning
31 * is utilized - for example by procfs): 31 * (whenever this facility is utilized - for example by procfs):
32 */ 32 */
33__weak void 33__weak void
34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
35{ 35{
36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n"); 36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n");
37} 37}
38
39__weak void
40save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
41{
42 WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n");
43}
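These weak stubs exist only so generic code links everywhere; an architecture opts in simply by providing a strong definition with the same signature. A hypothetical override might look like:

	void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
	{
		/* arch-specific: unwind starting from regs and record return
		 * addresses into trace->entries, honouring trace->max_entries
		 * and trace->skip */
	}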
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index e3516b29076..ba5070ce576 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -19,7 +19,7 @@
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21 21
22#include <asm/atomic.h> 22#include <linux/atomic.h>
23 23
24/* 24/*
25 * Structure to determine completion condition and record errors. May 25 * Structure to determine completion condition and record errors. May
@@ -136,10 +136,11 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
136static DEFINE_MUTEX(stop_cpus_mutex); 136static DEFINE_MUTEX(stop_cpus_mutex);
137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); 137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
138 138
139int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) 139static void queue_stop_cpus_work(const struct cpumask *cpumask,
140 cpu_stop_fn_t fn, void *arg,
141 struct cpu_stop_done *done)
140{ 142{
141 struct cpu_stop_work *work; 143 struct cpu_stop_work *work;
142 struct cpu_stop_done done;
143 unsigned int cpu; 144 unsigned int cpu;
144 145
145 /* initialize works and done */ 146 /* initialize works and done */
@@ -147,9 +148,8 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
147 work = &per_cpu(stop_cpus_work, cpu); 148 work = &per_cpu(stop_cpus_work, cpu);
148 work->fn = fn; 149 work->fn = fn;
149 work->arg = arg; 150 work->arg = arg;
150 work->done = &done; 151 work->done = done;
151 } 152 }
152 cpu_stop_init_done(&done, cpumask_weight(cpumask));
153 153
154 /* 154 /*
155 * Disable preemption while queueing to avoid getting 155 * Disable preemption while queueing to avoid getting
@@ -161,7 +161,15 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), 161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
162 &per_cpu(stop_cpus_work, cpu)); 162 &per_cpu(stop_cpus_work, cpu));
163 preempt_enable(); 163 preempt_enable();
164}
164 165
166static int __stop_cpus(const struct cpumask *cpumask,
167 cpu_stop_fn_t fn, void *arg)
168{
169 struct cpu_stop_done done;
170
171 cpu_stop_init_done(&done, cpumask_weight(cpumask));
172 queue_stop_cpus_work(cpumask, fn, arg, &done);
165 wait_for_completion(&done.completion); 173 wait_for_completion(&done.completion);
166 return done.executed ? done.ret : -ENOENT; 174 return done.executed ? done.ret : -ENOENT;
167} 175}
@@ -431,8 +439,15 @@ static int stop_machine_cpu_stop(void *data)
431 struct stop_machine_data *smdata = data; 439 struct stop_machine_data *smdata = data;
432 enum stopmachine_state curstate = STOPMACHINE_NONE; 440 enum stopmachine_state curstate = STOPMACHINE_NONE;
433 int cpu = smp_processor_id(), err = 0; 441 int cpu = smp_processor_id(), err = 0;
442 unsigned long flags;
434 bool is_active; 443 bool is_active;
435 444
445 /*
446 * When called from stop_machine_from_inactive_cpu(), irq might
447 * already be disabled. Save the state and restore it on exit.
448 */
449 local_save_flags(flags);
450
436 if (!smdata->active_cpus) 451 if (!smdata->active_cpus)
437 is_active = cpu == cpumask_first(cpu_online_mask); 452 is_active = cpu == cpumask_first(cpu_online_mask);
438 else 453 else
@@ -460,7 +475,7 @@ static int stop_machine_cpu_stop(void *data)
460 } 475 }
461 } while (curstate != STOPMACHINE_EXIT); 476 } while (curstate != STOPMACHINE_EXIT);
462 477
463 local_irq_enable(); 478 local_irq_restore(flags);
464 return err; 479 return err;
465} 480}
466 481
@@ -487,4 +502,57 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
487} 502}
488EXPORT_SYMBOL_GPL(stop_machine); 503EXPORT_SYMBOL_GPL(stop_machine);
489 504
505/**
506 * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU
507 * @fn: the function to run
508 * @data: the data ptr for the @fn()
509 * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
510 *
511 * This is identical to stop_machine() but can be called from a CPU which
512 * is not active. The local CPU is in the process of hotplug (so no other
513 * CPU hotplug can start) and not marked active and doesn't have enough
514 * context to sleep.
515 *
516 * This function provides stop_machine() functionality for such state by
517 * using busy-wait for synchronization and executing @fn directly for local
518 * CPU.
519 *
520 * CONTEXT:
521 * Local CPU is inactive. Temporarily stops all active CPUs.
522 *
523 * RETURNS:
524 * 0 if all executions of @fn returned 0, any non zero return value if any
525 * returned non zero.
526 */
527int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
528 const struct cpumask *cpus)
529{
530 struct stop_machine_data smdata = { .fn = fn, .data = data,
531 .active_cpus = cpus };
532 struct cpu_stop_done done;
533 int ret;
534
535 /* Local CPU must be inactive and CPU hotplug in progress. */
536 BUG_ON(cpu_active(raw_smp_processor_id()));
537 smdata.num_threads = num_active_cpus() + 1; /* +1 for local */
538
539 /* No proper task established and can't sleep - busy wait for lock. */
540 while (!mutex_trylock(&stop_cpus_mutex))
541 cpu_relax();
542
543 /* Schedule work on other CPUs and execute directly for local CPU */
544 set_state(&smdata, STOPMACHINE_PREPARE);
545 cpu_stop_init_done(&done, num_active_cpus());
546 queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata,
547 &done);
548 ret = stop_machine_cpu_stop(&smdata);
549
550 /* Busy wait for completion. */
551 while (!completion_done(&done.completion))
552 cpu_relax();
553
554 mutex_unlock(&stop_cpus_mutex);
555 return ret ?: done.ret;
556}
557
490#endif /* CONFIG_STOP_MACHINE */ 558#endif /* CONFIG_STOP_MACHINE */
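A hypothetical caller sketch for the new helper (the callback and call site are illustrative, not taken from this diff): a CPU that is being brought online, and is therefore not yet marked active, needs every active CPU parked while it updates globally visible state.

	static int apply_shared_state(void *data)
	{
		/* runs on each CPU while all active CPUs spin in stop_machine
		 * context; must not sleep */
		return 0;
	}

	/* on the inactive (onlining) CPU: */
	int err = stop_machine_from_inactive_cpu(apply_shared_state, NULL, NULL);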
diff --git a/kernel/sys.c b/kernel/sys.c
index e4128b278f2..1dbbe695a5e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
11#include <linux/notifier.h>
12#include <linux/reboot.h> 11#include <linux/reboot.h>
13#include <linux/prctl.h> 12#include <linux/prctl.h>
14#include <linux/highuid.h> 13#include <linux/highuid.h>
@@ -38,6 +37,8 @@
38#include <linux/fs_struct.h> 37#include <linux/fs_struct.h>
39#include <linux/gfp.h> 38#include <linux/gfp.h>
40#include <linux/syscore_ops.h> 39#include <linux/syscore_ops.h>
40#include <linux/version.h>
41#include <linux/ctype.h>
41 42
42#include <linux/compat.h> 43#include <linux/compat.h>
43#include <linux/syscalls.h> 44#include <linux/syscalls.h>
@@ -45,6 +46,8 @@
45#include <linux/user_namespace.h> 46#include <linux/user_namespace.h>
46 47
47#include <linux/kmsg_dump.h> 48#include <linux/kmsg_dump.h>
49/* Move somewhere else to avoid recompiling? */
50#include <generated/utsrelease.h>
48 51
49#include <asm/uaccess.h> 52#include <asm/uaccess.h>
50#include <asm/io.h> 53#include <asm/io.h>
@@ -320,6 +323,37 @@ void kernel_restart_prepare(char *cmd)
320} 323}
321 324
322/** 325/**
326 * register_reboot_notifier - Register function to be called at reboot time
327 * @nb: Info about notifier function to be called
328 *
329 * Registers a function with the list of functions
330 * to be called at reboot time.
331 *
332 * Currently always returns zero, as blocking_notifier_chain_register()
333 * always returns zero.
334 */
335int register_reboot_notifier(struct notifier_block *nb)
336{
337 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
338}
339EXPORT_SYMBOL(register_reboot_notifier);
340
341/**
342 * unregister_reboot_notifier - Unregister previously registered reboot notifier
343 * @nb: Hook to be unregistered
344 *
345 * Unregisters a previously registered reboot
346 * notifier function.
347 *
348 * Returns zero on success, or %-ENOENT on failure.
349 */
350int unregister_reboot_notifier(struct notifier_block *nb)
351{
352 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
353}
354EXPORT_SYMBOL(unregister_reboot_notifier);
355
356/**
323 * kernel_restart - reboot the system 357 * kernel_restart - reboot the system
324 * @cmd: pointer to buffer containing command to execute for restart 358 * @cmd: pointer to buffer containing command to execute for restart
325 * or %NULL 359 * or %NULL
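The register_reboot_notifier()/unregister_reboot_notifier() wrappers added above keep the usual notifier calling convention. A typical, illustrative user looks like:

	static int my_reboot_event(struct notifier_block *nb,
				   unsigned long action, void *data)
	{
		/* action is e.g. SYS_RESTART, SYS_HALT or SYS_POWER_OFF */
		return NOTIFY_DONE;
	}

	static struct notifier_block my_reboot_nb = {
		.notifier_call = my_reboot_event,
	};

	/* at init / teardown: */
	register_reboot_notifier(&my_reboot_nb);
	unregister_reboot_notifier(&my_reboot_nb);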
@@ -591,11 +625,18 @@ static int set_user(struct cred *new)
591 if (!new_user) 625 if (!new_user)
592 return -EAGAIN; 626 return -EAGAIN;
593 627
628 /*
629 * We don't fail in case of NPROC limit excess here because too many
630 * poorly written programs don't check set*uid() return code, assuming
631 * it never fails if called by root. We may still enforce NPROC limit
632 * for programs doing set*uid()+execve() by harmlessly deferring the
633 * failure to the execve() stage.
634 */
594 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && 635 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
595 new_user != INIT_USER) { 636 new_user != INIT_USER)
596 free_uid(new_user); 637 current->flags |= PF_NPROC_EXCEEDED;
597 return -EAGAIN; 638 else
598 } 639 current->flags &= ~PF_NPROC_EXCEEDED;
599 640
600 free_uid(new->user); 641 free_uid(new->user);
601 new->user = new_user; 642 new->user = new_user;
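The deferred enforcement behind PF_NPROC_EXCEEDED happens on the execve() side. A sketch of the corresponding check (an assumption about the companion fs/exec.c change, which is not part of this hunk):

	/* early in do_execve_common(): */
	if ((current->flags & PF_NPROC_EXCEEDED) &&
	    atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
		retval = -EAGAIN;
		goto out_ret;
	}

	/* below the limit again, so later execve() calls should not fail */
	current->flags &= ~PF_NPROC_EXCEEDED;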
@@ -1124,6 +1165,34 @@ DECLARE_RWSEM(uts_sem);
1124#define override_architecture(name) 0 1165#define override_architecture(name) 0
1125#endif 1166#endif
1126 1167
1168/*
1169 * Work around broken programs that cannot handle "Linux 3.0".
1170 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
1171 */
1172static int override_release(char __user *release, int len)
1173{
1174 int ret = 0;
1175 char buf[65];
1176
1177 if (current->personality & UNAME26) {
1178 char *rest = UTS_RELEASE;
1179 int ndots = 0;
1180 unsigned v;
1181
1182 while (*rest) {
1183 if (*rest == '.' && ++ndots >= 3)
1184 break;
1185 if (!isdigit(*rest) && *rest != '.')
1186 break;
1187 rest++;
1188 }
1189 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
1190 snprintf(buf, len, "2.6.%u%s", v, rest);
1191 ret = copy_to_user(release, buf, len);
1192 }
1193 return ret;
1194}
1195
1127SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1196SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1128{ 1197{
1129 int errno = 0; 1198 int errno = 0;
@@ -1133,6 +1202,8 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1133 errno = -EFAULT; 1202 errno = -EFAULT;
1134 up_read(&uts_sem); 1203 up_read(&uts_sem);
1135 1204
1205 if (!errno && override_release(name->release, sizeof(name->release)))
1206 errno = -EFAULT;
1136 if (!errno && override_architecture(name)) 1207 if (!errno && override_architecture(name))
1137 errno = -EFAULT; 1208 errno = -EFAULT;
1138 return errno; 1209 return errno;
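A worked example of the mapping, using made-up values:

	UTS_RELEASE        = "3.1.0-foo"
	LINUX_VERSION_CODE = KERNEL_VERSION(3, 1, 0)
	v    = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40 = 1 + 40 = 41
	rest = "-foo"        (the scan stops at the first non-digit, non-dot char)
	release reported to a UNAME26 task: "2.6.41-foo"

Note that any sublevel digits after the second dot are consumed by the scan and replaced by the 2.6.4x scheme.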
@@ -1154,6 +1225,8 @@ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
1154 error = -EFAULT; 1225 error = -EFAULT;
1155 up_read(&uts_sem); 1226 up_read(&uts_sem);
1156 1227
1228 if (!error && override_release(name->release, sizeof(name->release)))
1229 error = -EFAULT;
1157 if (!error && override_architecture(name)) 1230 if (!error && override_architecture(name))
1158 error = -EFAULT; 1231 error = -EFAULT;
1159 return error; 1232 return error;
@@ -1188,6 +1261,8 @@ SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
1188 1261
1189 if (!error && override_architecture(name)) 1262 if (!error && override_architecture(name))
1190 error = -EFAULT; 1263 error = -EFAULT;
1264 if (!error && override_release(name->release, sizeof(name->release)))
1265 error = -EFAULT;
1191 return error ? -EFAULT : 0; 1266 return error ? -EFAULT : 0;
1192} 1267}
1193#endif 1268#endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 62cbc8877fe..a9a5de07c4f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -16,7 +16,6 @@ asmlinkage long sys_ni_syscall(void)
16 return -ENOSYS; 16 return -ENOSYS;
17} 17}
18 18
19cond_syscall(sys_nfsservctl);
20cond_syscall(sys_quotactl); 19cond_syscall(sys_quotactl);
21cond_syscall(sys32_quotactl); 20cond_syscall(sys32_quotactl);
22cond_syscall(sys_acct); 21cond_syscall(sys_acct);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f175d98bd35..fd15163f360 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -96,6 +96,7 @@ extern char core_pattern[];
96extern unsigned int core_pipe_limit; 96extern unsigned int core_pipe_limit;
97extern int pid_max; 97extern int pid_max;
98extern int min_free_kbytes; 98extern int min_free_kbytes;
99extern int min_free_order_shift;
99extern int pid_max_min, pid_max_max; 100extern int pid_max_min, pid_max_max;
100extern int sysctl_drop_caches; 101extern int sysctl_drop_caches;
101extern int percpu_pagelist_fraction; 102extern int percpu_pagelist_fraction;
@@ -1189,6 +1190,13 @@ static struct ctl_table vm_table[] = {
1189 .extra1 = &zero, 1190 .extra1 = &zero,
1190 }, 1191 },
1191 { 1192 {
1193 .procname = "min_free_order_shift",
1194 .data = &min_free_order_shift,
1195 .maxlen = sizeof(min_free_order_shift),
1196 .mode = 0644,
1197 .proc_handler = &proc_dointvec
1198 },
1199 {
1192 .procname = "percpu_pagelist_fraction", 1200 .procname = "percpu_pagelist_fraction",
1193 .data = &percpu_pagelist_fraction, 1201 .data = &percpu_pagelist_fraction,
1194 .maxlen = sizeof(percpu_pagelist_fraction), 1202 .maxlen = sizeof(percpu_pagelist_fraction),
@@ -1590,16 +1598,11 @@ void sysctl_head_get(struct ctl_table_header *head)
1590 spin_unlock(&sysctl_lock); 1598 spin_unlock(&sysctl_lock);
1591} 1599}
1592 1600
1593static void free_head(struct rcu_head *rcu)
1594{
1595 kfree(container_of(rcu, struct ctl_table_header, rcu));
1596}
1597
1598void sysctl_head_put(struct ctl_table_header *head) 1601void sysctl_head_put(struct ctl_table_header *head)
1599{ 1602{
1600 spin_lock(&sysctl_lock); 1603 spin_lock(&sysctl_lock);
1601 if (!--head->count) 1604 if (!--head->count)
1602 call_rcu(&head->rcu, free_head); 1605 kfree_rcu(head, rcu);
1603 spin_unlock(&sysctl_lock); 1606 spin_unlock(&sysctl_lock);
1604} 1607}
1605 1608
@@ -1971,10 +1974,10 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1971 start_unregistering(header); 1974 start_unregistering(header);
1972 if (!--header->parent->count) { 1975 if (!--header->parent->count) {
1973 WARN_ON(1); 1976 WARN_ON(1);
1974 call_rcu(&header->parent->rcu, free_head); 1977 kfree_rcu(header->parent, rcu);
1975 } 1978 }
1976 if (!--header->count) 1979 if (!--header->count)
1977 call_rcu(&header->rcu, free_head); 1980 kfree_rcu(header, rcu);
1978 spin_unlock(&sysctl_lock); 1981 spin_unlock(&sysctl_lock);
1979} 1982}
1980 1983
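kfree_rcu() removes the need for the dedicated free_head() callback: it frees the enclosing object after a grace period, deriving it from the offset of the named rcu_head member. Roughly (illustrative, assuming ctl_table_header embeds its rcu_head as 'rcu'):

	struct ctl_table_header {
		/* ... */
		struct rcu_head rcu;	/* must be embedded in the object */
	};

	/* equivalent to the removed call_rcu(&head->rcu, free_head) pair */
	kfree_rcu(head, rcu);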
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 3b8e028b960..2ce1b308672 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1,6 +1,6 @@
1#include <linux/stat.h> 1#include <linux/stat.h>
2#include <linux/sysctl.h> 2#include <linux/sysctl.h>
3#include "../fs/xfs/linux-2.6/xfs_sysctl.h" 3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h> 4#include <linux/sunrpc/debug.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <net/ip_vs.h> 6#include <net/ip_vs.h>
@@ -1354,7 +1354,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1354 1354
1355 fput(file); 1355 fput(file);
1356out_putname: 1356out_putname:
1357 putname(pathname); 1357 __putname(pathname);
1358out: 1358out:
1359 return result; 1359 return result;
1360} 1360}
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 4e4932a7b36..362da653813 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1,6 +1,6 @@
1#include <linux/stat.h> 1#include <linux/stat.h>
2#include <linux/sysctl.h> 2#include <linux/sysctl.h>
3#include "../fs/xfs/linux-2.6/xfs_sysctl.h" 3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h> 4#include <linux/sunrpc/debug.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <net/ip_vs.h> 6#include <net/ip_vs.h>
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index fc0f2200541..e66046456f4 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -28,7 +28,7 @@
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/file.h> 29#include <linux/file.h>
30#include <net/genetlink.h> 30#include <net/genetlink.h>
31#include <asm/atomic.h> 31#include <linux/atomic.h>
32 32
33/* 33/*
34 * Maximum length of a cpumask that can be specified in 34 * Maximum length of a cpumask that can be specified in
@@ -291,30 +291,28 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
291 if (!cpumask_subset(mask, cpu_possible_mask)) 291 if (!cpumask_subset(mask, cpu_possible_mask))
292 return -EINVAL; 292 return -EINVAL;
293 293
294 s = NULL;
295 if (isadd == REGISTER) { 294 if (isadd == REGISTER) {
296 for_each_cpu(cpu, mask) { 295 for_each_cpu(cpu, mask) {
297 if (!s) 296 s = kmalloc_node(sizeof(struct listener),
298 s = kmalloc_node(sizeof(struct listener), 297 GFP_KERNEL, cpu_to_node(cpu));
299 GFP_KERNEL, cpu_to_node(cpu));
300 if (!s) 298 if (!s)
301 goto cleanup; 299 goto cleanup;
300
302 s->pid = pid; 301 s->pid = pid;
303 INIT_LIST_HEAD(&s->list);
304 s->valid = 1; 302 s->valid = 1;
305 303
306 listeners = &per_cpu(listener_array, cpu); 304 listeners = &per_cpu(listener_array, cpu);
307 down_write(&listeners->sem); 305 down_write(&listeners->sem);
308 list_for_each_entry_safe(s2, tmp, &listeners->list, list) { 306 list_for_each_entry(s2, &listeners->list, list) {
309 if (s2->pid == pid) 307 if (s2->pid == pid && s2->valid)
310 goto next_cpu; 308 goto exists;
311 } 309 }
312 list_add(&s->list, &listeners->list); 310 list_add(&s->list, &listeners->list);
313 s = NULL; 311 s = NULL;
314next_cpu: 312exists:
315 up_write(&listeners->sem); 313 up_write(&listeners->sem);
314 kfree(s); /* nop if NULL */
316 } 315 }
317 kfree(s);
318 return 0; 316 return 0;
319 } 317 }
320 318
@@ -657,6 +655,7 @@ static struct genl_ops taskstats_ops = {
657 .cmd = TASKSTATS_CMD_GET, 655 .cmd = TASKSTATS_CMD_GET,
658 .doit = taskstats_user_cmd, 656 .doit = taskstats_user_cmd,
659 .policy = taskstats_cmd_get_policy, 657 .policy = taskstats_cmd_get_policy,
658 .flags = GENL_ADMIN_PERM,
660}; 659};
661 660
662static struct genl_ops cgroupstats_ops = { 661static struct genl_ops cgroupstats_ops = {
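The reworked REGISTER loop leans on kfree(NULL) being a no-op: a listener is allocated for every CPU in the mask, ownership passes to the list (and the pointer is cleared) when no live duplicate exists, and the pointer is freed unconditionally afterwards. A stripped-down sketch of the pattern (already_listed() is a hypothetical helper, not the exact taskstats code):

	for_each_cpu(cpu, mask) {
		s = kmalloc_node(sizeof(*s), GFP_KERNEL, cpu_to_node(cpu));
		if (!s)
			goto cleanup;
		/* ... initialize s ... */
		down_write(&listeners->sem);
		if (!already_listed(listeners, pid)) {
			list_add(&s->list, &listeners->list);
			s = NULL;		/* ownership transferred */
		}
		up_write(&listeners->sem);
		kfree(s);			/* no-op when s == NULL */
	}

Separately, GENL_ADMIN_PERM on TASKSTATS_CMD_GET makes generic netlink reject the command for callers lacking CAP_NET_ADMIN.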
diff --git a/kernel/time.c b/kernel/time.c
index 8e8dc6d705c..d7760621452 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -575,7 +575,7 @@ EXPORT_SYMBOL(jiffies_to_timeval);
575/* 575/*
576 * Convert jiffies/jiffies_64 to clock_t and back. 576 * Convert jiffies/jiffies_64 to clock_t and back.
577 */ 577 */
578clock_t jiffies_to_clock_t(long x) 578clock_t jiffies_to_clock_t(unsigned long x)
579{ 579{
580#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 580#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
581# if HZ < USER_HZ 581# if HZ < USER_HZ
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index e2fd74b8e8c..cae2ad7491b 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,5 +1,5 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2obj-y += timeconv.o posix-clock.o alarmtimer.o 2obj-y += timeconv.o posix-clock.o #alarmtimer.o
3 3
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 59f369f98a0..8b70c76910a 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -181,7 +181,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
181 struct alarm *alarm; 181 struct alarm *alarm;
182 ktime_t expired = next->expires; 182 ktime_t expired = next->expires;
183 183
184 if (expired.tv64 >= now.tv64) 184 if (expired.tv64 > now.tv64)
185 break; 185 break;
186 186
187 alarm = container_of(next, struct alarm, node); 187 alarm = container_of(next, struct alarm, node);
@@ -441,6 +441,8 @@ static int alarm_timer_create(struct k_itimer *new_timer)
441static void alarm_timer_get(struct k_itimer *timr, 441static void alarm_timer_get(struct k_itimer *timr,
442 struct itimerspec *cur_setting) 442 struct itimerspec *cur_setting)
443{ 443{
444 memset(cur_setting, 0, sizeof(struct itimerspec));
445
444 cur_setting->it_interval = 446 cur_setting->it_interval =
445 ktime_to_timespec(timr->it.alarmtimer.period); 447 ktime_to_timespec(timr->it.alarmtimer.period);
446 cur_setting->it_value = 448 cur_setting->it_value =
@@ -479,11 +481,17 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
479 if (!rtcdev) 481 if (!rtcdev)
480 return -ENOTSUPP; 482 return -ENOTSUPP;
481 483
482 /* Save old values */ 484 /*
483 old_setting->it_interval = 485 * XXX HACK! Currently we can DOS a system if the interval
484 ktime_to_timespec(timr->it.alarmtimer.period); 486 * period on alarmtimers is too small. Cap the interval here
485 old_setting->it_value = 487 * to 100us and solve this properly in a future patch! -jstultz
486 ktime_to_timespec(timr->it.alarmtimer.node.expires); 488 */
489 if ((new_setting->it_interval.tv_sec == 0) &&
490 (new_setting->it_interval.tv_nsec < 100000))
491 new_setting->it_interval.tv_nsec = 100000;
492
493 if (old_setting)
494 alarm_timer_get(timr, old_setting);
487 495
488 /* If the timer was already set, cancel it */ 496 /* If the timer was already set, cancel it */
489 alarm_cancel(&timr->it.alarmtimer); 497 alarm_cancel(&timr->it.alarmtimer);
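A quick worked example of the cap, with made-up request values: an it_interval of { .tv_sec = 0, .tv_nsec = 50000 } (50us) is silently bumped to 100000ns (100us), while { .tv_sec = 0, .tv_nsec = 250000 } or any interval with a non-zero tv_sec passes through unchanged.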
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e0980f0d9a0..8f77da18fef 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -494,6 +494,22 @@ void clocksource_touch_watchdog(void)
494} 494}
495 495
496/** 496/**
497 * clocksource_max_adjustment- Returns max adjustment amount
498 * @cs: Pointer to clocksource
499 *
500 */
501static u32 clocksource_max_adjustment(struct clocksource *cs)
502{
503 u64 ret;
504 /*
 505 * We won't try to correct for more than 11% adjustments (110,000 ppm).
506 */
507 ret = (u64)cs->mult * 11;
508 do_div(ret,100);
509 return (u32)ret;
510}
511
512/**
497 * clocksource_max_deferment - Returns max time the clocksource can be deferred 513 * clocksource_max_deferment - Returns max time the clocksource can be deferred
498 * @cs: Pointer to clocksource 514 * @cs: Pointer to clocksource
499 * 515 *
@@ -505,25 +521,28 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
505 /* 521 /*
506 * Calculate the maximum number of cycles that we can pass to the 522 * Calculate the maximum number of cycles that we can pass to the
507 * cyc2ns function without overflowing a 64-bit signed result. The 523 * cyc2ns function without overflowing a 64-bit signed result. The
508 * maximum number of cycles is equal to ULLONG_MAX/cs->mult which 524 * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj)
509 * is equivalent to the below. 525 * which is equivalent to the below.
510 * max_cycles < (2^63)/cs->mult 526 * max_cycles < (2^63)/(cs->mult + cs->maxadj)
511 * max_cycles < 2^(log2((2^63)/cs->mult)) 527 * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj)))
512 * max_cycles < 2^(log2(2^63) - log2(cs->mult)) 528 * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj))
513 * max_cycles < 2^(63 - log2(cs->mult)) 529 * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj))
514 * max_cycles < 1 << (63 - log2(cs->mult)) 530 * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj))
515 * Please note that we add 1 to the result of the log2 to account for 531 * Please note that we add 1 to the result of the log2 to account for
516 * any rounding errors, ensure the above inequality is satisfied and 532 * any rounding errors, ensure the above inequality is satisfied and
517 * no overflow will occur. 533 * no overflow will occur.
518 */ 534 */
519 max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); 535 max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1));
520 536
521 /* 537 /*
522 * The actual maximum number of cycles we can defer the clocksource is 538 * The actual maximum number of cycles we can defer the clocksource is
523 * determined by the minimum of max_cycles and cs->mask. 539 * determined by the minimum of max_cycles and cs->mask.
540 * Note: Here we subtract the maxadj to make sure we don't sleep for
541 * too long if there's a large negative adjustment.
524 */ 542 */
525 max_cycles = min_t(u64, max_cycles, (u64) cs->mask); 543 max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
526 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); 544 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj,
545 cs->shift);
527 546
528 /* 547 /*
529 * To ensure that the clocksource does not wrap whilst we are idle, 548 * To ensure that the clocksource does not wrap whilst we are idle,
@@ -531,7 +550,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
531 * note a margin of 12.5% is used because this can be computed with 550 * note a margin of 12.5% is used because this can be computed with
532 * a shift, versus say 10% which would require division. 551 * a shift, versus say 10% which would require division.
533 */ 552 */
534 return max_nsecs - (max_nsecs >> 5); 553 return max_nsecs - (max_nsecs >> 3);
535} 554}
536 555
537#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET 556#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
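A worked example with round numbers (illustrative, not a real clocksource):

	mult               = 16777216 (2^24)
	maxadj             = 16777216 * 11 / 100 = 1845493
	ilog2(mult+maxadj) = 24
	max_cycles         = 1ULL << (63 - (24 + 1)) = 2^38
	max_nsecs          = cyc2ns(min(max_cycles, mask), mult - maxadj, shift)
	returned value     = max_nsecs - max_nsecs/8	(the 12.5% margin)

Using mult + maxadj for the overflow bound and mult - maxadj for the conversion keeps the deferment safe at both extremes of the allowed NTP adjustment.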
@@ -642,7 +661,6 @@ static void clocksource_enqueue(struct clocksource *cs)
642void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) 661void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
643{ 662{
644 u64 sec; 663 u64 sec;
645
646 /* 664 /*
647 * Calc the maximum number of seconds which we can run before 665 * Calc the maximum number of seconds which we can run before
648 * wrapping around. For clocksources which have a mask > 32bit 666 * wrapping around. For clocksources which have a mask > 32bit
@@ -653,7 +671,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
653 * ~ 0.06ppm granularity for NTP. We apply the same 12.5% 671 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
654 * margin as we do in clocksource_max_deferment() 672 * margin as we do in clocksource_max_deferment()
655 */ 673 */
656 sec = (cs->mask - (cs->mask >> 5)); 674 sec = (cs->mask - (cs->mask >> 3));
657 do_div(sec, freq); 675 do_div(sec, freq);
658 do_div(sec, scale); 676 do_div(sec, scale);
659 if (!sec) 677 if (!sec)
@@ -663,6 +681,20 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
663 681
664 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, 682 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
665 NSEC_PER_SEC / scale, sec * scale); 683 NSEC_PER_SEC / scale, sec * scale);
684
685 /*
 686 * For clocksources with large mults, reduce mult to avoid overflow.
 687 * Since mult may be adjusted by ntp, add an extra safety margin.
688 *
689 */
690 cs->maxadj = clocksource_max_adjustment(cs);
691 while ((cs->mult + cs->maxadj < cs->mult)
692 || (cs->mult - cs->maxadj > cs->mult)) {
693 cs->mult >>= 1;
694 cs->shift--;
695 cs->maxadj = clocksource_max_adjustment(cs);
696 }
697
666 cs->max_idle_ns = clocksource_max_deferment(cs); 698 cs->max_idle_ns = clocksource_max_deferment(cs);
667} 699}
668EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); 700EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
@@ -703,6 +735,12 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale);
703 */ 735 */
704int clocksource_register(struct clocksource *cs) 736int clocksource_register(struct clocksource *cs)
705{ 737{
738 /* calculate max adjustment for given mult/shift */
739 cs->maxadj = clocksource_max_adjustment(cs);
740 WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
741 "Clocksource %s might overflow on 11%% adjustment\n",
742 cs->name);
743
706 /* calculate max idle time permitted for this clocksource */ 744 /* calculate max idle time permitted for this clocksource */
707 cs->max_idle_ns = clocksource_max_deferment(cs); 745 cs->max_idle_ns = clocksource_max_deferment(cs);
708 746
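The overflow check in both hunks relies on unsigned wrap-around being well defined: for a u32, cs->mult + cs->maxadj < cs->mult holds exactly when the addition wrapped past 2^32. A minimal standalone illustration of the idiom:

	#include <stdbool.h>
	#include <stdint.h>

	/* true when a + b does not fit in 32 bits */
	static bool u32_add_overflows(uint32_t a, uint32_t b)
	{
		return a + b < a;	/* wrap-around implies overflow */
	}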
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index c7218d13273..7a90d021b79 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -71,7 +71,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
71 (dev->features & CLOCK_EVT_FEAT_C3STOP)) 71 (dev->features & CLOCK_EVT_FEAT_C3STOP))
72 return 0; 72 return 0;
73 73
74 clockevents_exchange_device(NULL, dev); 74 clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
75 tick_broadcast_device.evtdev = dev; 75 tick_broadcast_device.evtdev = dev;
76 if (!cpumask_empty(tick_get_broadcast_mask())) 76 if (!cpumask_empty(tick_get_broadcast_mask()))
77 tick_broadcast_start_periodic(dev); 77 tick_broadcast_start_periodic(dev);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 342408cf68d..6f9798bf240 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -249,6 +249,8 @@ ktime_t ktime_get(void)
249 secs = xtime.tv_sec + wall_to_monotonic.tv_sec; 249 secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
250 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; 250 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
251 nsecs += timekeeping_get_ns(); 251 nsecs += timekeeping_get_ns();
252 /* If arch requires, add in gettimeoffset() */
253 nsecs += arch_gettimeoffset();
252 254
253 } while (read_seqretry(&xtime_lock, seq)); 255 } while (read_seqretry(&xtime_lock, seq));
254 /* 256 /*
@@ -280,6 +282,8 @@ void ktime_get_ts(struct timespec *ts)
280 *ts = xtime; 282 *ts = xtime;
281 tomono = wall_to_monotonic; 283 tomono = wall_to_monotonic;
282 nsecs = timekeeping_get_ns(); 284 nsecs = timekeeping_get_ns();
285 /* If arch requires, add in gettimeoffset() */
286 nsecs += arch_gettimeoffset();
283 287
284 } while (read_seqretry(&xtime_lock, seq)); 288 } while (read_seqretry(&xtime_lock, seq));
285 289
@@ -604,6 +608,12 @@ static struct timespec timekeeping_suspend_time;
604 */ 608 */
605static void __timekeeping_inject_sleeptime(struct timespec *delta) 609static void __timekeeping_inject_sleeptime(struct timespec *delta)
606{ 610{
611 if (!timespec_valid(delta)) {
612 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
613 "sleep delta value!\n");
614 return;
615 }
616
607 xtime = timespec_add(xtime, *delta); 617 xtime = timespec_add(xtime, *delta);
608 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); 618 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
609 total_sleep_time = timespec_add(total_sleep_time, *delta); 619 total_sleep_time = timespec_add(total_sleep_time, *delta);
@@ -686,12 +696,34 @@ static void timekeeping_resume(void)
686static int timekeeping_suspend(void) 696static int timekeeping_suspend(void)
687{ 697{
688 unsigned long flags; 698 unsigned long flags;
699 struct timespec delta, delta_delta;
700 static struct timespec old_delta;
689 701
690 read_persistent_clock(&timekeeping_suspend_time); 702 read_persistent_clock(&timekeeping_suspend_time);
691 703
692 write_seqlock_irqsave(&xtime_lock, flags); 704 write_seqlock_irqsave(&xtime_lock, flags);
693 timekeeping_forward_now(); 705 timekeeping_forward_now();
694 timekeeping_suspended = 1; 706 timekeeping_suspended = 1;
707
708 /*
709 * To avoid drift caused by repeated suspend/resumes,
710 * which each can add ~1 second drift error,
711 * try to compensate so the difference in system time
712 * and persistent_clock time stays close to constant.
713 */
714 delta = timespec_sub(xtime, timekeeping_suspend_time);
715 delta_delta = timespec_sub(delta, old_delta);
716 if (abs(delta_delta.tv_sec) >= 2) {
717 /*
718 * if delta_delta is too large, assume time correction
719 * has occured and set old_delta to the current delta.
720 */
721 old_delta = delta;
722 } else {
723 /* Otherwise try to adjust old_system to compensate */
724 timekeeping_suspend_time =
725 timespec_add(timekeeping_suspend_time, delta_delta);
726 }
695 write_sequnlock_irqrestore(&xtime_lock, flags); 727 write_sequnlock_irqrestore(&xtime_lock, flags);
696 728
697 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 729 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
@@ -792,6 +824,13 @@ static void timekeeping_adjust(s64 offset)
792 } else 824 } else
793 return; 825 return;
794 826
827 WARN_ONCE(timekeeper.clock->maxadj &&
828 (timekeeper.mult + adj > timekeeper.clock->mult +
829 timekeeper.clock->maxadj),
830 "Adjusting %s more then 11%% (%ld vs %ld)\n",
831 timekeeper.clock->name, (long)timekeeper.mult + adj,
832 (long)timekeeper.clock->mult +
833 timekeeper.clock->maxadj);
795 timekeeper.mult += adj; 834 timekeeper.mult += adj;
796 timekeeper.xtime_interval += interval; 835 timekeeper.xtime_interval += interval;
797 timekeeper.xtime_nsec -= offset; 836 timekeeper.xtime_nsec -= offset;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2ad39e556cb..93168c0f991 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -82,7 +82,7 @@ config EVENT_POWER_TRACING_DEPRECATED
82 power:power_frequency 82 power:power_frequency
83 This is for userspace compatibility 83 This is for userspace compatibility
84 and will vanish after 5 kernel iterations, 84 and will vanish after 5 kernel iterations,
85 namely 2.6.41. 85 namely 3.1.
86 86
87config CONTEXT_SWITCH_TRACER 87config CONTEXT_SWITCH_TRACER
88 bool 88 bool
@@ -487,6 +487,39 @@ config RING_BUFFER_BENCHMARK
487 487
488 If unsure, say N. 488 If unsure, say N.
489 489
490config TRACELEVEL
491 bool "Add capability to prioritize traces"
492 depends on EVENT_TRACING
493 help
494 This option allows subsystem programmers to add priorities to trace
 495 events by calling tracelevel_register. Traces of high priority
 496 will automatically be enabled on kernel boot, and users can change
 497 the trace level via a kernel parameter.
498
499config TRACEDUMP
500 bool "Dumping functionality for ftrace"
501 depends on FUNCTION_TRACER
502 help
 503 This option adds functionality to dump tracing data in several forms.
 504 Data can be dumped in ASCII form or as raw pages from the tracing
505 ring buffers, along with the saved cmdlines. This is specified by
506 the module parameter tracedump_ascii. Data will be compressed
507 using zlib.
508
509config TRACEDUMP_PANIC
510 bool "Tracedump to console on panic"
511 depends on TRACEDUMP
512 help
513 With this option, tracedump will automatically dump to the console
514 on a kernel panic.
515
516config TRACEDUMP_PROCFS
517 bool "Tracedump via proc file"
518 depends on TRACEDUMP
519 help
520 With this option, tracedump can be dumped from user space by reading
521 from /proc/tracedump.
522
490endif # FTRACE 523endif # FTRACE
491 524
492endif # TRACING_SUPPORT 525endif # TRACING_SUPPORT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 761c510a06c..1360a1a90d5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -56,5 +56,7 @@ obj-$(CONFIG_TRACEPOINTS) += power-traces.o
56ifeq ($(CONFIG_TRACING),y) 56ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif 58endif
59obj-$(CONFIG_TRACELEVEL) += tracelevel.o
60obj-$(CONFIG_TRACEDUMP) += tracedump.o
59 61
60libftrace-y := ftrace.o 62libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 6957aa298df..7c910a5593a 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -206,6 +206,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
206 what |= MASK_TC_BIT(rw, RAHEAD); 206 what |= MASK_TC_BIT(rw, RAHEAD);
207 what |= MASK_TC_BIT(rw, META); 207 what |= MASK_TC_BIT(rw, META);
208 what |= MASK_TC_BIT(rw, DISCARD); 208 what |= MASK_TC_BIT(rw, DISCARD);
209 what |= MASK_TC_BIT(rw, FLUSH);
210 what |= MASK_TC_BIT(rw, FUA);
209 211
210 pid = tsk->pid; 212 pid = tsk->pid;
211 if (act_log_check(bt, what, sector, pid)) 213 if (act_log_check(bt, what, sector, pid))
@@ -1054,6 +1056,9 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
1054 goto out; 1056 goto out;
1055 } 1057 }
1056 1058
1059 if (tc & BLK_TC_FLUSH)
1060 rwbs[i++] = 'F';
1061
1057 if (tc & BLK_TC_DISCARD) 1062 if (tc & BLK_TC_DISCARD)
1058 rwbs[i++] = 'D'; 1063 rwbs[i++] = 'D';
1059 else if (tc & BLK_TC_WRITE) 1064 else if (tc & BLK_TC_WRITE)
@@ -1063,10 +1068,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
1063 else 1068 else
1064 rwbs[i++] = 'N'; 1069 rwbs[i++] = 'N';
1065 1070
1071 if (tc & BLK_TC_FUA)
1072 rwbs[i++] = 'F';
1066 if (tc & BLK_TC_AHEAD) 1073 if (tc & BLK_TC_AHEAD)
1067 rwbs[i++] = 'A'; 1074 rwbs[i++] = 'A';
1068 if (tc & BLK_TC_BARRIER)
1069 rwbs[i++] = 'B';
1070 if (tc & BLK_TC_SYNC) 1075 if (tc & BLK_TC_SYNC)
1071 rwbs[i++] = 'S'; 1076 rwbs[i++] = 'S';
1072 if (tc & BLK_TC_META) 1077 if (tc & BLK_TC_META)
@@ -1132,7 +1137,7 @@ typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
1132 1137
1133static int blk_log_action_classic(struct trace_iterator *iter, const char *act) 1138static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1134{ 1139{
1135 char rwbs[6]; 1140 char rwbs[RWBS_LEN];
1136 unsigned long long ts = iter->ts; 1141 unsigned long long ts = iter->ts;
1137 unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); 1142 unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
1138 unsigned secs = (unsigned long)ts; 1143 unsigned secs = (unsigned long)ts;
@@ -1148,7 +1153,7 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1148 1153
1149static int blk_log_action(struct trace_iterator *iter, const char *act) 1154static int blk_log_action(struct trace_iterator *iter, const char *act)
1150{ 1155{
1151 char rwbs[6]; 1156 char rwbs[RWBS_LEN];
1152 const struct blk_io_trace *t = te_blk_io_trace(iter->ent); 1157 const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1153 1158
1154 fill_rwbs(rwbs, t); 1159 fill_rwbs(rwbs, t);
@@ -1561,7 +1566,7 @@ static const struct {
1561} mask_maps[] = { 1566} mask_maps[] = {
1562 { BLK_TC_READ, "read" }, 1567 { BLK_TC_READ, "read" },
1563 { BLK_TC_WRITE, "write" }, 1568 { BLK_TC_WRITE, "write" },
1564 { BLK_TC_BARRIER, "barrier" }, 1569 { BLK_TC_FLUSH, "flush" },
1565 { BLK_TC_SYNC, "sync" }, 1570 { BLK_TC_SYNC, "sync" },
1566 { BLK_TC_QUEUE, "queue" }, 1571 { BLK_TC_QUEUE, "queue" },
1567 { BLK_TC_REQUEUE, "requeue" }, 1572 { BLK_TC_REQUEUE, "requeue" },
@@ -1573,6 +1578,7 @@ static const struct {
1573 { BLK_TC_META, "meta" }, 1578 { BLK_TC_META, "meta" },
1574 { BLK_TC_DISCARD, "discard" }, 1579 { BLK_TC_DISCARD, "discard" },
1575 { BLK_TC_DRV_DATA, "drv_data" }, 1580 { BLK_TC_DRV_DATA, "drv_data" },
1581 { BLK_TC_FUA, "fua" },
1576}; 1582};
1577 1583
1578static int blk_trace_str2mask(const char *str) 1584static int blk_trace_str2mask(const char *str)
@@ -1788,6 +1794,9 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1788{ 1794{
1789 int i = 0; 1795 int i = 0;
1790 1796
1797 if (rw & REQ_FLUSH)
1798 rwbs[i++] = 'F';
1799
1791 if (rw & WRITE) 1800 if (rw & WRITE)
1792 rwbs[i++] = 'W'; 1801 rwbs[i++] = 'W';
1793 else if (rw & REQ_DISCARD) 1802 else if (rw & REQ_DISCARD)
@@ -1797,6 +1806,8 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1797 else 1806 else
1798 rwbs[i++] = 'N'; 1807 rwbs[i++] = 'N';
1799 1808
1809 if (rw & REQ_FUA)
1810 rwbs[i++] = 'F';
1800 if (rw & REQ_RAHEAD) 1811 if (rw & REQ_RAHEAD)
1801 rwbs[i++] = 'A'; 1812 rwbs[i++] = 'A';
1802 if (rw & REQ_SYNC) 1813 if (rw & REQ_SYNC)
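
The blktrace changes above replace the old barrier bit with FLUSH and FUA decoding: a leading 'F' for REQ_FLUSH and a second 'F' after the direction letter for REQ_FUA. A minimal sketch of what blk_fill_rwbs() now produces for a flush+FUA synchronous write; RWBS_LEN comes from the matching blktrace_api.h change (not shown in this hunk), and the REQ_* names are the standard block-layer request flags.

#include <linux/blkdev.h>
#include <linux/blktrace_api.h>

static void rwbs_example(void)
{
	char rwbs[RWBS_LEN];

	/* FLUSH + WRITE + FUA + SYNC decodes as "FWFS" with the ordering above. */
	blk_fill_rwbs(rwbs, REQ_FLUSH | WRITE | REQ_FUA | REQ_SYNC, 4096);
	pr_info("rwbs=%s\n", rwbs);
}
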
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 908038f5744..798b16cd40f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -32,7 +32,6 @@
32 32
33#include <trace/events/sched.h> 33#include <trace/events/sched.h>
34 34
35#include <asm/ftrace.h>
36#include <asm/setup.h> 35#include <asm/setup.h>
37 36
38#include "trace_output.h" 37#include "trace_output.h"
@@ -82,14 +81,14 @@ static int ftrace_disabled __read_mostly;
82 81
83static DEFINE_MUTEX(ftrace_lock); 82static DEFINE_MUTEX(ftrace_lock);
84 83
85static struct ftrace_ops ftrace_list_end __read_mostly = 84static struct ftrace_ops ftrace_list_end __read_mostly = {
86{
87 .func = ftrace_stub, 85 .func = ftrace_stub,
88}; 86};
89 87
90static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; 88static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
91static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 89static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
92ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 90ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
91static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;
93ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 92ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
94ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 93ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
95static struct ftrace_ops global_ops; 94static struct ftrace_ops global_ops;
@@ -148,6 +147,7 @@ void clear_ftrace_function(void)
148{ 147{
149 ftrace_trace_function = ftrace_stub; 148 ftrace_trace_function = ftrace_stub;
150 __ftrace_trace_function = ftrace_stub; 149 __ftrace_trace_function = ftrace_stub;
150 __ftrace_trace_function_delay = ftrace_stub;
151 ftrace_pid_function = ftrace_stub; 151 ftrace_pid_function = ftrace_stub;
152} 152}
153 153
@@ -210,7 +210,12 @@ static void update_ftrace_function(void)
210#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 210#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
211 ftrace_trace_function = func; 211 ftrace_trace_function = func;
212#else 212#else
213#ifdef CONFIG_DYNAMIC_FTRACE
214 /* do not update till all functions have been modified */
215 __ftrace_trace_function_delay = func;
216#else
213 __ftrace_trace_function = func; 217 __ftrace_trace_function = func;
218#endif
214 ftrace_trace_function = ftrace_test_stop_func; 219 ftrace_trace_function = ftrace_test_stop_func;
215#endif 220#endif
216} 221}
@@ -785,8 +790,7 @@ static void unregister_ftrace_profiler(void)
785 unregister_ftrace_graph(); 790 unregister_ftrace_graph();
786} 791}
787#else 792#else
788static struct ftrace_ops ftrace_profile_ops __read_mostly = 793static struct ftrace_ops ftrace_profile_ops __read_mostly = {
789{
790 .func = function_profile_call, 794 .func = function_profile_call,
791}; 795};
792 796
@@ -806,19 +810,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
806 size_t cnt, loff_t *ppos) 810 size_t cnt, loff_t *ppos)
807{ 811{
808 unsigned long val; 812 unsigned long val;
809 char buf[64]; /* big enough to hold a number */
810 int ret; 813 int ret;
811 814
812 if (cnt >= sizeof(buf)) 815 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
813 return -EINVAL; 816 if (ret)
814
815 if (copy_from_user(&buf, ubuf, cnt))
816 return -EFAULT;
817
818 buf[cnt] = 0;
819
820 ret = strict_strtoul(buf, 10, &val);
821 if (ret < 0)
822 return ret; 817 return ret;
823 818
824 val = !!val; 819 val = !!val;
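
This hunk (and several more below in ring_buffer.c, trace.c and trace_events.c) replaces the open-coded copy_from_user()/strict_strtoul() sequence with kstrtoul_from_user(), which bounds-checks, copies and parses the user buffer in one call and returns 0 on success or a negative error. A minimal sketch of the resulting write-handler pattern, with placeholder names:

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>

static ssize_t
example_write(struct file *filp, const char __user *ubuf,
	      size_t cnt, loff_t *ppos)
{
	unsigned long val;
	int ret;

	/* Copies at most cnt bytes from user space and parses them as base 10. */
	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
	if (ret)
		return ret;

	/* ... act on val ... */

	*ppos += cnt;
	return cnt;
}
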
@@ -952,7 +947,7 @@ struct ftrace_func_probe {
952}; 947};
953 948
954enum { 949enum {
955 FTRACE_ENABLE_CALLS = (1 << 0), 950 FTRACE_UPDATE_CALLS = (1 << 0),
956 FTRACE_DISABLE_CALLS = (1 << 1), 951 FTRACE_DISABLE_CALLS = (1 << 1),
957 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 952 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
958 FTRACE_START_FUNC_RET = (1 << 3), 953 FTRACE_START_FUNC_RET = (1 << 3),
@@ -1182,8 +1177,14 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1182 return NULL; 1177 return NULL;
1183} 1178}
1184 1179
1180static void
1181ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash);
1182static void
1183ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash);
1184
1185static int 1185static int
1186ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) 1186ftrace_hash_move(struct ftrace_ops *ops, int enable,
1187 struct ftrace_hash **dst, struct ftrace_hash *src)
1187{ 1188{
1188 struct ftrace_func_entry *entry; 1189 struct ftrace_func_entry *entry;
1189 struct hlist_node *tp, *tn; 1190 struct hlist_node *tp, *tn;
@@ -1193,9 +1194,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1193 unsigned long key; 1194 unsigned long key;
1194 int size = src->count; 1195 int size = src->count;
1195 int bits = 0; 1196 int bits = 0;
1197 int ret;
1196 int i; 1198 int i;
1197 1199
1198 /* 1200 /*
1201 * Remove the current set, update the hash and add
1202 * them back.
1203 */
1204 ftrace_hash_rec_disable(ops, enable);
1205
1206 /*
1199 * If the new source is empty, just free dst and assign it 1207 * If the new source is empty, just free dst and assign it
1200 * the empty_hash. 1208 * the empty_hash.
1201 */ 1209 */
@@ -1215,9 +1223,10 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1215 if (bits > FTRACE_HASH_MAX_BITS) 1223 if (bits > FTRACE_HASH_MAX_BITS)
1216 bits = FTRACE_HASH_MAX_BITS; 1224 bits = FTRACE_HASH_MAX_BITS;
1217 1225
1226 ret = -ENOMEM;
1218 new_hash = alloc_ftrace_hash(bits); 1227 new_hash = alloc_ftrace_hash(bits);
1219 if (!new_hash) 1228 if (!new_hash)
1220 return -ENOMEM; 1229 goto out;
1221 1230
1222 size = 1 << src->size_bits; 1231 size = 1 << src->size_bits;
1223 for (i = 0; i < size; i++) { 1232 for (i = 0; i < size; i++) {
@@ -1236,7 +1245,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1236 rcu_assign_pointer(*dst, new_hash); 1245 rcu_assign_pointer(*dst, new_hash);
1237 free_ftrace_hash_rcu(old_hash); 1246 free_ftrace_hash_rcu(old_hash);
1238 1247
1239 return 0; 1248 ret = 0;
1249 out:
1250 /*
1251 * Enable regardless of ret:
1252 * On success, we enable the new hash.
1253 * On failure, we re-enable the original hash.
1254 */
1255 ftrace_hash_rec_enable(ops, enable);
1256
1257 return ret;
1240} 1258}
1241 1259
1242/* 1260/*
@@ -1498,7 +1516,7 @@ int ftrace_text_reserved(void *start, void *end)
1498 1516
1499 1517
1500static int 1518static int
1501__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1519__ftrace_replace_code(struct dyn_ftrace *rec, int update)
1502{ 1520{
1503 unsigned long ftrace_addr; 1521 unsigned long ftrace_addr;
1504 unsigned long flag = 0UL; 1522 unsigned long flag = 0UL;
@@ -1506,17 +1524,17 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1506 ftrace_addr = (unsigned long)FTRACE_ADDR; 1524 ftrace_addr = (unsigned long)FTRACE_ADDR;
1507 1525
1508 /* 1526 /*
1509 * If we are enabling tracing: 1527 * If we are updating calls:
1510 * 1528 *
1511 * If the record has a ref count, then we need to enable it 1529 * If the record has a ref count, then we need to enable it
1512 * because someone is using it. 1530 * because someone is using it.
1513 * 1531 *
1514 * Otherwise we make sure its disabled. 1532 * Otherwise we make sure its disabled.
1515 * 1533 *
1516 * If we are disabling tracing, then disable all records that 1534 * If we are disabling calls, then disable all records that
1517 * are enabled. 1535 * are enabled.
1518 */ 1536 */
1519 if (enable && (rec->flags & ~FTRACE_FL_MASK)) 1537 if (update && (rec->flags & ~FTRACE_FL_MASK))
1520 flag = FTRACE_FL_ENABLED; 1538 flag = FTRACE_FL_ENABLED;
1521 1539
1522 /* If the state of this record hasn't changed, then do nothing */ 1540 /* If the state of this record hasn't changed, then do nothing */
@@ -1532,7 +1550,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1532 return ftrace_make_nop(NULL, rec, ftrace_addr); 1550 return ftrace_make_nop(NULL, rec, ftrace_addr);
1533} 1551}
1534 1552
1535static void ftrace_replace_code(int enable) 1553static void ftrace_replace_code(int update)
1536{ 1554{
1537 struct dyn_ftrace *rec; 1555 struct dyn_ftrace *rec;
1538 struct ftrace_page *pg; 1556 struct ftrace_page *pg;
@@ -1546,7 +1564,7 @@ static void ftrace_replace_code(int enable)
1546 if (rec->flags & FTRACE_FL_FREE) 1564 if (rec->flags & FTRACE_FL_FREE)
1547 continue; 1565 continue;
1548 1566
1549 failed = __ftrace_replace_code(rec, enable); 1567 failed = __ftrace_replace_code(rec, update);
1550 if (failed) { 1568 if (failed) {
1551 ftrace_bug(failed, rec->ip); 1569 ftrace_bug(failed, rec->ip);
1552 /* Stop processing */ 1570 /* Stop processing */
@@ -1596,7 +1614,13 @@ static int __ftrace_modify_code(void *data)
1596{ 1614{
1597 int *command = data; 1615 int *command = data;
1598 1616
1599 if (*command & FTRACE_ENABLE_CALLS) 1617 /*
1618 * Do not call function tracer while we update the code.
1619 * We are in stop machine, no worrying about races.
1620 */
1621 function_trace_stop++;
1622
1623 if (*command & FTRACE_UPDATE_CALLS)
1600 ftrace_replace_code(1); 1624 ftrace_replace_code(1);
1601 else if (*command & FTRACE_DISABLE_CALLS) 1625 else if (*command & FTRACE_DISABLE_CALLS)
1602 ftrace_replace_code(0); 1626 ftrace_replace_code(0);
@@ -1609,6 +1633,18 @@ static int __ftrace_modify_code(void *data)
1609 else if (*command & FTRACE_STOP_FUNC_RET) 1633 else if (*command & FTRACE_STOP_FUNC_RET)
1610 ftrace_disable_ftrace_graph_caller(); 1634 ftrace_disable_ftrace_graph_caller();
1611 1635
1636#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1637 /*
1638 * For archs that call ftrace_test_stop_func(), we must
1639 * wait till after we update all the function callers
1640 * before we update the callback. This keeps different
1641 * ops that record different functions from corrupting
1642 * each other.
1643 */
1644 __ftrace_trace_function = __ftrace_trace_function_delay;
1645#endif
1646 function_trace_stop--;
1647
1612 return 0; 1648 return 0;
1613} 1649}
1614 1650
@@ -1652,7 +1688,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
1652 return -ENODEV; 1688 return -ENODEV;
1653 1689
1654 ftrace_start_up++; 1690 ftrace_start_up++;
1655 command |= FTRACE_ENABLE_CALLS; 1691 command |= FTRACE_UPDATE_CALLS;
1656 1692
1657 /* ops marked global share the filter hashes */ 1693 /* ops marked global share the filter hashes */
1658 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 1694 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
@@ -1704,8 +1740,7 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
1704 if (ops != &global_ops || !global_start_up) 1740 if (ops != &global_ops || !global_start_up)
1705 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 1741 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
1706 1742
1707 if (!ftrace_start_up) 1743 command |= FTRACE_UPDATE_CALLS;
1708 command |= FTRACE_DISABLE_CALLS;
1709 1744
1710 if (saved_ftrace_func != ftrace_trace_function) { 1745 if (saved_ftrace_func != ftrace_trace_function) {
1711 saved_ftrace_func = ftrace_trace_function; 1746 saved_ftrace_func = ftrace_trace_function;
@@ -1727,7 +1762,7 @@ static void ftrace_startup_sysctl(void)
1727 saved_ftrace_func = NULL; 1762 saved_ftrace_func = NULL;
1728 /* ftrace_start_up is true if we want ftrace running */ 1763 /* ftrace_start_up is true if we want ftrace running */
1729 if (ftrace_start_up) 1764 if (ftrace_start_up)
1730 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 1765 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
1731} 1766}
1732 1767
1733static void ftrace_shutdown_sysctl(void) 1768static void ftrace_shutdown_sysctl(void)
@@ -1744,10 +1779,36 @@ static cycle_t ftrace_update_time;
1744static unsigned long ftrace_update_cnt; 1779static unsigned long ftrace_update_cnt;
1745unsigned long ftrace_update_tot_cnt; 1780unsigned long ftrace_update_tot_cnt;
1746 1781
1782static int ops_traces_mod(struct ftrace_ops *ops)
1783{
1784 struct ftrace_hash *hash;
1785
1786 hash = ops->filter_hash;
1787 return !!(!hash || !hash->count);
1788}
1789
1747static int ftrace_update_code(struct module *mod) 1790static int ftrace_update_code(struct module *mod)
1748{ 1791{
1749 struct dyn_ftrace *p; 1792 struct dyn_ftrace *p;
1750 cycle_t start, stop; 1793 cycle_t start, stop;
1794 unsigned long ref = 0;
1795
1796 /*
1797 * When adding a module, we need to check if tracers are
1798 * currently enabled and if they are set to trace all functions.
1799 * If they are, we need to enable the module functions as well
1800 * as update the reference counts for those function records.
1801 */
1802 if (mod) {
1803 struct ftrace_ops *ops;
1804
1805 for (ops = ftrace_ops_list;
1806 ops != &ftrace_list_end; ops = ops->next) {
1807 if (ops->flags & FTRACE_OPS_FL_ENABLED &&
1808 ops_traces_mod(ops))
1809 ref++;
1810 }
1811 }
1751 1812
1752 start = ftrace_now(raw_smp_processor_id()); 1813 start = ftrace_now(raw_smp_processor_id());
1753 ftrace_update_cnt = 0; 1814 ftrace_update_cnt = 0;
@@ -1760,7 +1821,7 @@ static int ftrace_update_code(struct module *mod)
1760 1821
1761 p = ftrace_new_addrs; 1822 p = ftrace_new_addrs;
1762 ftrace_new_addrs = p->newlist; 1823 ftrace_new_addrs = p->newlist;
1763 p->flags = 0L; 1824 p->flags = ref;
1764 1825
1765 /* 1826 /*
1766 * Do the initial record conversion from mcount jump 1827 * Do the initial record conversion from mcount jump
@@ -1783,7 +1844,7 @@ static int ftrace_update_code(struct module *mod)
1783 * conversion puts the module to the correct state, thus 1844 * conversion puts the module to the correct state, thus
1784 * passing the ftrace_make_call check. 1845 * passing the ftrace_make_call check.
1785 */ 1846 */
1786 if (ftrace_start_up) { 1847 if (ftrace_start_up && ref) {
1787 int failed = __ftrace_replace_code(p, 1); 1848 int failed = __ftrace_replace_code(p, 1);
1788 if (failed) { 1849 if (failed) {
1789 ftrace_bug(failed, p->ip); 1850 ftrace_bug(failed, p->ip);
@@ -2407,10 +2468,9 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
2407 */ 2468 */
2408 2469
2409static int 2470static int
2410ftrace_mod_callback(char *func, char *cmd, char *param, int enable) 2471ftrace_mod_callback(struct ftrace_hash *hash,
2472 char *func, char *cmd, char *param, int enable)
2411{ 2473{
2412 struct ftrace_ops *ops = &global_ops;
2413 struct ftrace_hash *hash;
2414 char *mod; 2474 char *mod;
2415 int ret = -EINVAL; 2475 int ret = -EINVAL;
2416 2476
@@ -2430,11 +2490,6 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
2430 if (!strlen(mod)) 2490 if (!strlen(mod))
2431 return ret; 2491 return ret;
2432 2492
2433 if (enable)
2434 hash = ops->filter_hash;
2435 else
2436 hash = ops->notrace_hash;
2437
2438 ret = ftrace_match_module_records(hash, func, mod); 2493 ret = ftrace_match_module_records(hash, func, mod);
2439 if (!ret) 2494 if (!ret)
2440 ret = -EINVAL; 2495 ret = -EINVAL;
@@ -2760,7 +2815,7 @@ static int ftrace_process_regex(struct ftrace_hash *hash,
2760 mutex_lock(&ftrace_cmd_mutex); 2815 mutex_lock(&ftrace_cmd_mutex);
2761 list_for_each_entry(p, &ftrace_commands, list) { 2816 list_for_each_entry(p, &ftrace_commands, list) {
2762 if (strcmp(p->name, command) == 0) { 2817 if (strcmp(p->name, command) == 0) {
2763 ret = p->func(func, command, next, enable); 2818 ret = p->func(hash, func, command, next, enable);
2764 goto out_unlock; 2819 goto out_unlock;
2765 } 2820 }
2766 } 2821 }
@@ -2857,7 +2912,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
2857 ftrace_match_records(hash, buf, len); 2912 ftrace_match_records(hash, buf, len);
2858 2913
2859 mutex_lock(&ftrace_lock); 2914 mutex_lock(&ftrace_lock);
2860 ret = ftrace_hash_move(orig_hash, hash); 2915 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
2916 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
2917 && ftrace_enabled)
2918 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
2919
2861 mutex_unlock(&ftrace_lock); 2920 mutex_unlock(&ftrace_lock);
2862 2921
2863 mutex_unlock(&ftrace_regex_lock); 2922 mutex_unlock(&ftrace_regex_lock);
@@ -3040,18 +3099,12 @@ ftrace_regex_release(struct inode *inode, struct file *file)
3040 orig_hash = &iter->ops->notrace_hash; 3099 orig_hash = &iter->ops->notrace_hash;
3041 3100
3042 mutex_lock(&ftrace_lock); 3101 mutex_lock(&ftrace_lock);
3043 /* 3102 ret = ftrace_hash_move(iter->ops, filter_hash,
3044 * Remove the current set, update the hash and add 3103 orig_hash, iter->hash);
3045 * them back. 3104 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
3046 */ 3105 && ftrace_enabled)
3047 ftrace_hash_rec_disable(iter->ops, filter_hash); 3106 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3048 ret = ftrace_hash_move(orig_hash, iter->hash); 3107
3049 if (!ret) {
3050 ftrace_hash_rec_enable(iter->ops, filter_hash);
3051 if (iter->ops->flags & FTRACE_OPS_FL_ENABLED
3052 && ftrace_enabled)
3053 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
3054 }
3055 mutex_unlock(&ftrace_lock); 3108 mutex_unlock(&ftrace_lock);
3056 } 3109 }
3057 free_ftrace_hash(iter->hash); 3110 free_ftrace_hash(iter->hash);
@@ -3330,7 +3383,7 @@ static int ftrace_process_locs(struct module *mod,
3330{ 3383{
3331 unsigned long *p; 3384 unsigned long *p;
3332 unsigned long addr; 3385 unsigned long addr;
3333 unsigned long flags; 3386 unsigned long flags = 0; /* Shut up gcc */
3334 3387
3335 mutex_lock(&ftrace_lock); 3388 mutex_lock(&ftrace_lock);
3336 p = start; 3389 p = start;
@@ -3348,12 +3401,18 @@ static int ftrace_process_locs(struct module *mod,
3348 } 3401 }
3349 3402
3350 /* 3403 /*
3351 * Disable interrupts to prevent interrupts from executing 3404 * We only need to disable interrupts on start up
3352 * code that is being modified. 3405 * because we are modifying code that an interrupt
3406 * may execute, and the modification is not atomic.
3407 * But for modules, nothing runs the code we modify
3408 * until we are finished with it, and there's no
3409 * reason to cause large interrupt latencies while we do it.
3353 */ 3410 */
3354 local_irq_save(flags); 3411 if (!mod)
3412 local_irq_save(flags);
3355 ftrace_update_code(mod); 3413 ftrace_update_code(mod);
3356 local_irq_restore(flags); 3414 if (!mod)
3415 local_irq_restore(flags);
3357 mutex_unlock(&ftrace_lock); 3416 mutex_unlock(&ftrace_lock);
3358 3417
3359 return 0; 3418 return 0;
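
The ftrace_process_locs() change above restricts the irq-off window to the boot-time case: module text cannot run while its mcount sites are being converted, so there is no reason to take the interrupt-latency hit there, and flags is pre-initialised only to silence gcc's may-be-used-uninitialized warning. A minimal sketch of that conditional irq-disable idiom, with placeholder names:

#include <linux/irqflags.h>
#include <linux/types.h>

static void convert_call_sites(bool code_is_live)
{
	unsigned long flags = 0;	/* shut up gcc: only used if code_is_live */

	if (code_is_live)
		local_irq_save(flags);

	/* ... patch instructions an interrupt might otherwise execute ... */

	if (code_is_live)
		local_irq_restore(flags);
}
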
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b0c7aa40794..731201bf4ac 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -997,15 +997,21 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
997 unsigned nr_pages) 997 unsigned nr_pages)
998{ 998{
999 struct buffer_page *bpage, *tmp; 999 struct buffer_page *bpage, *tmp;
1000 unsigned long addr;
1001 LIST_HEAD(pages); 1000 LIST_HEAD(pages);
1002 unsigned i; 1001 unsigned i;
1003 1002
1004 WARN_ON(!nr_pages); 1003 WARN_ON(!nr_pages);
1005 1004
1006 for (i = 0; i < nr_pages; i++) { 1005 for (i = 0; i < nr_pages; i++) {
1006 struct page *page;
1007 /*
1008 * __GFP_NORETRY flag makes sure that the allocation fails
1009 * gracefully without invoking oom-killer and the system is
1010 * not destabilized.
1011 */
1007 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1012 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1008 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 1013 GFP_KERNEL | __GFP_NORETRY,
1014 cpu_to_node(cpu_buffer->cpu));
1009 if (!bpage) 1015 if (!bpage)
1010 goto free_pages; 1016 goto free_pages;
1011 1017
@@ -1013,10 +1019,11 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
1013 1019
1014 list_add(&bpage->list, &pages); 1020 list_add(&bpage->list, &pages);
1015 1021
1016 addr = __get_free_page(GFP_KERNEL); 1022 page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
1017 if (!addr) 1023 GFP_KERNEL | __GFP_NORETRY, 0);
1024 if (!page)
1018 goto free_pages; 1025 goto free_pages;
1019 bpage->page = (void *)addr; 1026 bpage->page = page_address(page);
1020 rb_init_page(bpage->page); 1027 rb_init_page(bpage->page);
1021 } 1028 }
1022 1029
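
Two things change in the allocation above: __GFP_NORETRY lets a failed buffer grow fail cleanly instead of pushing the system toward the OOM killer, and alloc_pages_node() replaces __get_free_page() so each data page is allocated on the node of the CPU that owns the buffer. A minimal sketch of that allocation pattern in isolation:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/topology.h>

static void *alloc_node_local_page(int cpu)
{
	struct page *page;

	/*
	 * Node-local, order-0 allocation; __GFP_NORETRY means a failure is
	 * returned to the caller instead of invoking the OOM killer.
	 */
	page = alloc_pages_node(cpu_to_node(cpu),
				GFP_KERNEL | __GFP_NORETRY, 0);
	if (!page)
		return NULL;

	return page_address(page);	/* GFP_KERNEL page is direct-mapped */
}
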
@@ -1045,7 +1052,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1045{ 1052{
1046 struct ring_buffer_per_cpu *cpu_buffer; 1053 struct ring_buffer_per_cpu *cpu_buffer;
1047 struct buffer_page *bpage; 1054 struct buffer_page *bpage;
1048 unsigned long addr; 1055 struct page *page;
1049 int ret; 1056 int ret;
1050 1057
1051 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), 1058 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
@@ -1067,10 +1074,10 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1067 rb_check_bpage(cpu_buffer, bpage); 1074 rb_check_bpage(cpu_buffer, bpage);
1068 1075
1069 cpu_buffer->reader_page = bpage; 1076 cpu_buffer->reader_page = bpage;
1070 addr = __get_free_page(GFP_KERNEL); 1077 page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
1071 if (!addr) 1078 if (!page)
1072 goto fail_free_reader; 1079 goto fail_free_reader;
1073 bpage->page = (void *)addr; 1080 bpage->page = page_address(page);
1074 rb_init_page(bpage->page); 1081 rb_init_page(bpage->page);
1075 1082
1076 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1083 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
@@ -1314,7 +1321,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1314 unsigned nr_pages, rm_pages, new_pages; 1321 unsigned nr_pages, rm_pages, new_pages;
1315 struct buffer_page *bpage, *tmp; 1322 struct buffer_page *bpage, *tmp;
1316 unsigned long buffer_size; 1323 unsigned long buffer_size;
1317 unsigned long addr;
1318 LIST_HEAD(pages); 1324 LIST_HEAD(pages);
1319 int i, cpu; 1325 int i, cpu;
1320 1326
@@ -1375,16 +1381,24 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1375 1381
1376 for_each_buffer_cpu(buffer, cpu) { 1382 for_each_buffer_cpu(buffer, cpu) {
1377 for (i = 0; i < new_pages; i++) { 1383 for (i = 0; i < new_pages; i++) {
1384 struct page *page;
1385 /*
1386 * __GFP_NORETRY flag makes sure that the allocation
1387 * fails gracefully without invoking oom-killer and
1388 * the system is not destabilized.
1389 */
1378 bpage = kzalloc_node(ALIGN(sizeof(*bpage), 1390 bpage = kzalloc_node(ALIGN(sizeof(*bpage),
1379 cache_line_size()), 1391 cache_line_size()),
1380 GFP_KERNEL, cpu_to_node(cpu)); 1392 GFP_KERNEL | __GFP_NORETRY,
1393 cpu_to_node(cpu));
1381 if (!bpage) 1394 if (!bpage)
1382 goto free_pages; 1395 goto free_pages;
1383 list_add(&bpage->list, &pages); 1396 list_add(&bpage->list, &pages);
1384 addr = __get_free_page(GFP_KERNEL); 1397 page = alloc_pages_node(cpu_to_node(cpu),
1385 if (!addr) 1398 GFP_KERNEL | __GFP_NORETRY, 0);
1399 if (!page)
1386 goto free_pages; 1400 goto free_pages;
1387 bpage->page = (void *)addr; 1401 bpage->page = page_address(page);
1388 rb_init_page(bpage->page); 1402 rb_init_page(bpage->page);
1389 } 1403 }
1390 } 1404 }
@@ -3730,16 +3744,17 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
3730 * Returns: 3744 * Returns:
3731 * The page allocated, or NULL on error. 3745 * The page allocated, or NULL on error.
3732 */ 3746 */
3733void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) 3747void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
3734{ 3748{
3735 struct buffer_data_page *bpage; 3749 struct buffer_data_page *bpage;
3736 unsigned long addr; 3750 struct page *page;
3737 3751
3738 addr = __get_free_page(GFP_KERNEL); 3752 page = alloc_pages_node(cpu_to_node(cpu),
3739 if (!addr) 3753 GFP_KERNEL | __GFP_NORETRY, 0);
3754 if (!page)
3740 return NULL; 3755 return NULL;
3741 3756
3742 bpage = (void *)addr; 3757 bpage = page_address(page);
3743 3758
3744 rb_init_page(bpage); 3759 rb_init_page(bpage);
3745 3760
@@ -3978,20 +3993,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
3978 size_t cnt, loff_t *ppos) 3993 size_t cnt, loff_t *ppos)
3979{ 3994{
3980 unsigned long *p = filp->private_data; 3995 unsigned long *p = filp->private_data;
3981 char buf[64];
3982 unsigned long val; 3996 unsigned long val;
3983 int ret; 3997 int ret;
3984 3998
3985 if (cnt >= sizeof(buf)) 3999 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3986 return -EINVAL; 4000 if (ret)
3987
3988 if (copy_from_user(&buf, ubuf, cnt))
3989 return -EFAULT;
3990
3991 buf[cnt] = 0;
3992
3993 ret = strict_strtoul(buf, 10, &val);
3994 if (ret < 0)
3995 return ret; 4001 return ret;
3996 4002
3997 if (val) 4003 if (val)
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 302f8a61463..a5457d577b9 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -106,7 +106,7 @@ static enum event_status read_page(int cpu)
106 int inc; 106 int inc;
107 int i; 107 int i;
108 108
109 bpage = ring_buffer_alloc_read_page(buffer); 109 bpage = ring_buffer_alloc_read_page(buffer, cpu);
110 if (!bpage) 110 if (!bpage)
111 return EVENT_DROPPED; 111 return EVENT_DROPPED;
112 112
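
ring_buffer_alloc_read_page() now takes the target CPU so the spare page can be allocated on that CPU's node; the benchmark above and the trace.c callers below simply pass the cpu they already track. A minimal sketch of the read-page round trip, using only the signatures visible in this diff:

#include <linux/mm.h>
#include <linux/ring_buffer.h>

static void read_one_page(struct ring_buffer *buffer, int cpu)
{
	void *page;
	int ret;

	page = ring_buffer_alloc_read_page(buffer, cpu);
	if (!page)
		return;

	/* len = PAGE_SIZE, full = 1: only swap out a completely filled page. */
	ret = ring_buffer_read_page(buffer, &page, PAGE_SIZE, cpu, 1);
	if (ret >= 0) {
		/* ... consume the events copied into page ... */
	}

	ring_buffer_free_read_page(buffer, page);
}
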
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ee9c921d7f2..17a2d44e1af 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -343,26 +343,27 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
343static int trace_stop_count; 343static int trace_stop_count;
344static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
345 345
346static void wakeup_work_handler(struct work_struct *work)
347{
348 wake_up(&trace_wait);
349}
350
351static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler);
352
346/** 353/**
347 * trace_wake_up - wake up tasks waiting for trace input 354 * trace_wake_up - wake up tasks waiting for trace input
348 * 355 *
349 * Simply wakes up any task that is blocked on the trace_wait 356 * Schedules a delayed work to wake up any task that is blocked on the
 350 * queue. This is used with trace_poll for tasks polling the trace. 357 * trace_wait queue. This is used with trace_poll for tasks polling the
358 * trace.
351 */ 359 */
352void trace_wake_up(void) 360void trace_wake_up(void)
353{ 361{
354 int cpu; 362 const unsigned long delay = msecs_to_jiffies(2);
355 363
356 if (trace_flags & TRACE_ITER_BLOCK) 364 if (trace_flags & TRACE_ITER_BLOCK)
357 return; 365 return;
358 /* 366 schedule_delayed_work(&wakeup_work, delay);
359 * The runqueue_is_locked() can fail, but this is the best we
360 * have for now:
361 */
362 cpu = get_cpu();
363 if (!runqueue_is_locked(cpu))
364 wake_up(&trace_wait);
365 put_cpu();
366} 367}
367 368
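
The rewrite above drops the fragile runqueue_is_locked() check: instead of calling wake_up() directly from whatever context the tracer fires in, trace_wake_up() now schedules a short delayed work item so the wake-up always runs from safe workqueue context. A minimal sketch of that pattern with placeholder names:

#include <linux/workqueue.h>
#include <linux/wait.h>
#include <linux/jiffies.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wait);

static void example_wakeup_fn(struct work_struct *work)
{
	wake_up(&example_wait);
}
static DECLARE_DELAYED_WORK(example_wakeup_work, example_wakeup_fn);

static void example_poke_readers(void)
{
	/* Defer the wake-up by roughly 2ms so it runs from process context. */
	schedule_delayed_work(&example_wakeup_work, msecs_to_jiffies(2));
}
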
368static int __init set_buf_size(char *str) 369static int __init set_buf_size(char *str)
@@ -424,6 +425,7 @@ static const char *trace_options[] = {
424 "graph-time", 425 "graph-time",
425 "record-cmd", 426 "record-cmd",
426 "overwrite", 427 "overwrite",
428 "disable_on_free",
427 NULL 429 NULL
428}; 430};
429 431
@@ -1191,6 +1193,18 @@ void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
1191} 1193}
1192EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); 1194EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
1193 1195
1196void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
1197 struct ring_buffer_event *event,
1198 unsigned long flags, int pc,
1199 struct pt_regs *regs)
1200{
1201 ring_buffer_unlock_commit(buffer, event);
1202
1203 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
1204 ftrace_trace_userstack(buffer, flags, pc);
1205}
1206EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs);
1207
1194void trace_current_buffer_discard_commit(struct ring_buffer *buffer, 1208void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1195 struct ring_buffer_event *event) 1209 struct ring_buffer_event *event)
1196{ 1210{
@@ -1234,30 +1248,103 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1234} 1248}
1235 1249
1236#ifdef CONFIG_STACKTRACE 1250#ifdef CONFIG_STACKTRACE
1251
1252#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
1253struct ftrace_stack {
1254 unsigned long calls[FTRACE_STACK_MAX_ENTRIES];
1255};
1256
1257static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack);
1258static DEFINE_PER_CPU(int, ftrace_stack_reserve);
1259
1237static void __ftrace_trace_stack(struct ring_buffer *buffer, 1260static void __ftrace_trace_stack(struct ring_buffer *buffer,
1238 unsigned long flags, 1261 unsigned long flags,
1239 int skip, int pc) 1262 int skip, int pc, struct pt_regs *regs)
1240{ 1263{
1241 struct ftrace_event_call *call = &event_kernel_stack; 1264 struct ftrace_event_call *call = &event_kernel_stack;
1242 struct ring_buffer_event *event; 1265 struct ring_buffer_event *event;
1243 struct stack_entry *entry; 1266 struct stack_entry *entry;
1244 struct stack_trace trace; 1267 struct stack_trace trace;
1268 int use_stack;
1269 int size = FTRACE_STACK_ENTRIES;
1270
1271 trace.nr_entries = 0;
1272 trace.skip = skip;
1273
1274 /*
1275 * Since events can happen in NMIs there's no safe way to
1276 * use the per cpu ftrace_stacks. We reserve it and if an interrupt
1277 * or NMI comes in, it will just have to use the default
1278 * FTRACE_STACK_SIZE.
1279 */
1280 preempt_disable_notrace();
1281
1282 use_stack = ++__get_cpu_var(ftrace_stack_reserve);
1283 /*
1284 * We don't need any atomic variables, just a barrier.
1285 * If an interrupt comes in, we don't care, because it would
1286 * have exited and put the counter back to what we want.
1287 * We just need a barrier to keep gcc from moving things
1288 * around.
1289 */
1290 barrier();
1291 if (use_stack == 1) {
1292 trace.entries = &__get_cpu_var(ftrace_stack).calls[0];
1293 trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
1294
1295 if (regs)
1296 save_stack_trace_regs(regs, &trace);
1297 else
1298 save_stack_trace(&trace);
1299
1300 if (trace.nr_entries > size)
1301 size = trace.nr_entries;
1302 } else
1303 /* From now on, use_stack is a boolean */
1304 use_stack = 0;
1305
1306 size *= sizeof(unsigned long);
1245 1307
1246 event = trace_buffer_lock_reserve(buffer, TRACE_STACK, 1308 event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
1247 sizeof(*entry), flags, pc); 1309 sizeof(*entry) + size, flags, pc);
1248 if (!event) 1310 if (!event)
1249 return; 1311 goto out;
1250 entry = ring_buffer_event_data(event); 1312 entry = ring_buffer_event_data(event);
1251 memset(&entry->caller, 0, sizeof(entry->caller));
1252 1313
1253 trace.nr_entries = 0; 1314 memset(&entry->caller, 0, size);
1254 trace.max_entries = FTRACE_STACK_ENTRIES; 1315
1255 trace.skip = skip; 1316 if (use_stack)
1256 trace.entries = entry->caller; 1317 memcpy(&entry->caller, trace.entries,
1318 trace.nr_entries * sizeof(unsigned long));
1319 else {
1320 trace.max_entries = FTRACE_STACK_ENTRIES;
1321 trace.entries = entry->caller;
1322 if (regs)
1323 save_stack_trace_regs(regs, &trace);
1324 else
1325 save_stack_trace(&trace);
1326 }
1327
1328 entry->size = trace.nr_entries;
1257 1329
1258 save_stack_trace(&trace);
1259 if (!filter_check_discard(call, entry, buffer, event)) 1330 if (!filter_check_discard(call, entry, buffer, event))
1260 ring_buffer_unlock_commit(buffer, event); 1331 ring_buffer_unlock_commit(buffer, event);
1332
1333 out:
1334 /* Again, don't let gcc optimize things here */
1335 barrier();
1336 __get_cpu_var(ftrace_stack_reserve)--;
1337 preempt_enable_notrace();
1338
1339}
1340
1341void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
1342 int skip, int pc, struct pt_regs *regs)
1343{
1344 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1345 return;
1346
1347 __ftrace_trace_stack(buffer, flags, skip, pc, regs);
1261} 1348}
1262 1349
1263void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, 1350void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
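
The stack-trace rewrite above avoids a large on-stack array by keeping one preallocated buffer per CPU and "reserving" it with a plain per-cpu counter: only the outermost user on a CPU gets the big buffer, a nested interrupt or NMI falls back to the small inline array, and barrier() keeps the compiler from reordering around the window. A stripped-down sketch of just that reservation idiom (names are placeholders; __get_cpu_var() matches the accessor used above):

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(int, example_reserve);

static void example_use_percpu_buffer(void)
{
	int first;

	preempt_disable_notrace();		/* stay on this CPU */
	first = ++__get_cpu_var(example_reserve);
	barrier();				/* order against a nested IRQ/NMI */

	if (first == 1) {
		/* ... outermost user: safe to use the big per-cpu buffer ... */
	} else {
		/* ... nested context: fall back to a small local buffer ... */
	}

	barrier();
	__get_cpu_var(example_reserve)--;
	preempt_enable_notrace();
}
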
@@ -1266,13 +1353,13 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1266 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 1353 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1267 return; 1354 return;
1268 1355
1269 __ftrace_trace_stack(buffer, flags, skip, pc); 1356 __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
1270} 1357}
1271 1358
1272void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 1359void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1273 int pc) 1360 int pc)
1274{ 1361{
1275 __ftrace_trace_stack(tr->buffer, flags, skip, pc); 1362 __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL);
1276} 1363}
1277 1364
1278/** 1365/**
@@ -1288,7 +1375,7 @@ void trace_dump_stack(void)
1288 local_save_flags(flags); 1375 local_save_flags(flags);
1289 1376
1290 /* skipping 3 traces, seems to get us at the caller of this function */ 1377 /* skipping 3 traces, seems to get us at the caller of this function */
1291 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); 1378 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL);
1292} 1379}
1293 1380
1294static DEFINE_PER_CPU(int, user_stack_count); 1381static DEFINE_PER_CPU(int, user_stack_count);
@@ -1536,7 +1623,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1536 1623
1537 ftrace_enable_cpu(); 1624 ftrace_enable_cpu();
1538 1625
1539 return event ? ring_buffer_event_data(event) : NULL; 1626 if (event) {
1627 iter->ent_size = ring_buffer_event_length(event);
1628 return ring_buffer_event_data(event);
1629 }
1630 iter->ent_size = 0;
1631 return NULL;
1540} 1632}
1541 1633
1542static struct trace_entry * 1634static struct trace_entry *
@@ -2051,6 +2143,9 @@ void trace_default_header(struct seq_file *m)
2051{ 2143{
2052 struct trace_iterator *iter = m->private; 2144 struct trace_iterator *iter = m->private;
2053 2145
2146 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
2147 return;
2148
2054 if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2149 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2055 /* print nothing if the buffers are empty */ 2150 /* print nothing if the buffers are empty */
2056 if (trace_empty(iter)) 2151 if (trace_empty(iter))
@@ -2701,20 +2796,11 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2701 size_t cnt, loff_t *ppos) 2796 size_t cnt, loff_t *ppos)
2702{ 2797{
2703 struct trace_array *tr = filp->private_data; 2798 struct trace_array *tr = filp->private_data;
2704 char buf[64];
2705 unsigned long val; 2799 unsigned long val;
2706 int ret; 2800 int ret;
2707 2801
2708 if (cnt >= sizeof(buf)) 2802 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2709 return -EINVAL; 2803 if (ret)
2710
2711 if (copy_from_user(&buf, ubuf, cnt))
2712 return -EFAULT;
2713
2714 buf[cnt] = 0;
2715
2716 ret = strict_strtoul(buf, 10, &val);
2717 if (ret < 0)
2718 return ret; 2804 return ret;
2719 2805
2720 val = !!val; 2806 val = !!val;
@@ -2767,7 +2853,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
2767 return t->init(tr); 2853 return t->init(tr);
2768} 2854}
2769 2855
2770static int tracing_resize_ring_buffer(unsigned long size) 2856static int __tracing_resize_ring_buffer(unsigned long size)
2771{ 2857{
2772 int ret; 2858 int ret;
2773 2859
@@ -2819,6 +2905,41 @@ static int tracing_resize_ring_buffer(unsigned long size)
2819 return ret; 2905 return ret;
2820} 2906}
2821 2907
2908static ssize_t tracing_resize_ring_buffer(unsigned long size)
2909{
2910 int cpu, ret = size;
2911
2912 mutex_lock(&trace_types_lock);
2913
2914 tracing_stop();
2915
2916 /* disable all cpu buffers */
2917 for_each_tracing_cpu(cpu) {
2918 if (global_trace.data[cpu])
2919 atomic_inc(&global_trace.data[cpu]->disabled);
2920 if (max_tr.data[cpu])
2921 atomic_inc(&max_tr.data[cpu]->disabled);
2922 }
2923
2924 if (size != global_trace.entries)
2925 ret = __tracing_resize_ring_buffer(size);
2926
2927 if (ret < 0)
2928 ret = -ENOMEM;
2929
2930 for_each_tracing_cpu(cpu) {
2931 if (global_trace.data[cpu])
2932 atomic_dec(&global_trace.data[cpu]->disabled);
2933 if (max_tr.data[cpu])
2934 atomic_dec(&max_tr.data[cpu]->disabled);
2935 }
2936
2937 tracing_start();
2938 mutex_unlock(&trace_types_lock);
2939
2940 return ret;
2941}
2942
2822 2943
2823/** 2944/**
2824 * tracing_update_buffers - used by tracing facility to expand ring buffers 2945 * tracing_update_buffers - used by tracing facility to expand ring buffers
@@ -2836,7 +2957,7 @@ int tracing_update_buffers(void)
2836 2957
2837 mutex_lock(&trace_types_lock); 2958 mutex_lock(&trace_types_lock);
2838 if (!ring_buffer_expanded) 2959 if (!ring_buffer_expanded)
2839 ret = tracing_resize_ring_buffer(trace_buf_size); 2960 ret = __tracing_resize_ring_buffer(trace_buf_size);
2840 mutex_unlock(&trace_types_lock); 2961 mutex_unlock(&trace_types_lock);
2841 2962
2842 return ret; 2963 return ret;
@@ -2860,7 +2981,7 @@ static int tracing_set_tracer(const char *buf)
2860 mutex_lock(&trace_types_lock); 2981 mutex_lock(&trace_types_lock);
2861 2982
2862 if (!ring_buffer_expanded) { 2983 if (!ring_buffer_expanded) {
2863 ret = tracing_resize_ring_buffer(trace_buf_size); 2984 ret = __tracing_resize_ring_buffer(trace_buf_size);
2864 if (ret < 0) 2985 if (ret < 0)
2865 goto out; 2986 goto out;
2866 ret = 0; 2987 ret = 0;
@@ -2966,20 +3087,11 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
2966 size_t cnt, loff_t *ppos) 3087 size_t cnt, loff_t *ppos)
2967{ 3088{
2968 unsigned long *ptr = filp->private_data; 3089 unsigned long *ptr = filp->private_data;
2969 char buf[64];
2970 unsigned long val; 3090 unsigned long val;
2971 int ret; 3091 int ret;
2972 3092
2973 if (cnt >= sizeof(buf)) 3093 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2974 return -EINVAL; 3094 if (ret)
2975
2976 if (copy_from_user(&buf, ubuf, cnt))
2977 return -EFAULT;
2978
2979 buf[cnt] = 0;
2980
2981 ret = strict_strtoul(buf, 10, &val);
2982 if (ret < 0)
2983 return ret; 3095 return ret;
2984 3096
2985 *ptr = val * 1000; 3097 *ptr = val * 1000;
@@ -3434,67 +3546,54 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3434 size_t cnt, loff_t *ppos) 3546 size_t cnt, loff_t *ppos)
3435{ 3547{
3436 unsigned long val; 3548 unsigned long val;
3437 char buf[64]; 3549 int ret;
3438 int ret, cpu;
3439
3440 if (cnt >= sizeof(buf))
3441 return -EINVAL;
3442
3443 if (copy_from_user(&buf, ubuf, cnt))
3444 return -EFAULT;
3445
3446 buf[cnt] = 0;
3447 3550
3448 ret = strict_strtoul(buf, 10, &val); 3551 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3449 if (ret < 0) 3552 if (ret)
3450 return ret; 3553 return ret;
3451 3554
3452 /* must have at least 1 entry */ 3555 /* must have at least 1 entry */
3453 if (!val) 3556 if (!val)
3454 return -EINVAL; 3557 return -EINVAL;
3455 3558
3456 mutex_lock(&trace_types_lock);
3457
3458 tracing_stop();
3459
3460 /* disable all cpu buffers */
3461 for_each_tracing_cpu(cpu) {
3462 if (global_trace.data[cpu])
3463 atomic_inc(&global_trace.data[cpu]->disabled);
3464 if (max_tr.data[cpu])
3465 atomic_inc(&max_tr.data[cpu]->disabled);
3466 }
3467
3468 /* value is in KB */ 3559 /* value is in KB */
3469 val <<= 10; 3560 val <<= 10;
3470 3561
3471 if (val != global_trace.entries) { 3562 ret = tracing_resize_ring_buffer(val);
3472 ret = tracing_resize_ring_buffer(val); 3563 if (ret < 0)
3473 if (ret < 0) { 3564 return ret;
3474 cnt = ret;
3475 goto out;
3476 }
3477 }
3478 3565
3479 *ppos += cnt; 3566 *ppos += cnt;
3480 3567
3481 /* If check pages failed, return ENOMEM */ 3568 return cnt;
3482 if (tracing_disabled) 3569}
3483 cnt = -ENOMEM;
3484 out:
3485 for_each_tracing_cpu(cpu) {
3486 if (global_trace.data[cpu])
3487 atomic_dec(&global_trace.data[cpu]->disabled);
3488 if (max_tr.data[cpu])
3489 atomic_dec(&max_tr.data[cpu]->disabled);
3490 }
3491 3570
3492 tracing_start(); 3571static ssize_t
3493 mutex_unlock(&trace_types_lock); 3572tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3573 size_t cnt, loff_t *ppos)
3574{
3575 /*
 3576 * There is no need to read what the user has written; this function
 3577 * is just to make sure that there is no error when "echo" is used.
3578 */
3579
3580 *ppos += cnt;
3494 3581
3495 return cnt; 3582 return cnt;
3496} 3583}
3497 3584
3585static int
3586tracing_free_buffer_release(struct inode *inode, struct file *filp)
3587{
3588 /* disable tracing ? */
3589 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
3590 tracing_off();
3591 /* resize the ring buffer to 0 */
3592 tracing_resize_ring_buffer(0);
3593
3594 return 0;
3595}
3596
3498static int mark_printk(const char *fmt, ...) 3597static int mark_printk(const char *fmt, ...)
3499{ 3598{
3500 int ret; 3599 int ret;
@@ -3640,6 +3739,11 @@ static const struct file_operations tracing_entries_fops = {
3640 .llseek = generic_file_llseek, 3739 .llseek = generic_file_llseek,
3641}; 3740};
3642 3741
3742static const struct file_operations tracing_free_buffer_fops = {
3743 .write = tracing_free_buffer_write,
3744 .release = tracing_free_buffer_release,
3745};
3746
3643static const struct file_operations tracing_mark_fops = { 3747static const struct file_operations tracing_mark_fops = {
3644 .open = tracing_open_generic, 3748 .open = tracing_open_generic,
3645 .write = tracing_mark_write, 3749 .write = tracing_mark_write,
@@ -3696,7 +3800,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3696 return 0; 3800 return 0;
3697 3801
3698 if (!info->spare) 3802 if (!info->spare)
3699 info->spare = ring_buffer_alloc_read_page(info->tr->buffer); 3803 info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu);
3700 if (!info->spare) 3804 if (!info->spare)
3701 return -ENOMEM; 3805 return -ENOMEM;
3702 3806
@@ -3704,8 +3808,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3704 if (info->read < PAGE_SIZE) 3808 if (info->read < PAGE_SIZE)
3705 goto read; 3809 goto read;
3706 3810
3707 info->read = 0;
3708
3709 trace_access_lock(info->cpu); 3811 trace_access_lock(info->cpu);
3710 ret = ring_buffer_read_page(info->tr->buffer, 3812 ret = ring_buffer_read_page(info->tr->buffer,
3711 &info->spare, 3813 &info->spare,
@@ -3715,6 +3817,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3715 if (ret < 0) 3817 if (ret < 0)
3716 return 0; 3818 return 0;
3717 3819
3820 info->read = 0;
3821
3718read: 3822read:
3719 size = PAGE_SIZE - info->read; 3823 size = PAGE_SIZE - info->read;
3720 if (size > count) 3824 if (size > count)
@@ -3853,7 +3957,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3853 3957
3854 ref->ref = 1; 3958 ref->ref = 1;
3855 ref->buffer = info->tr->buffer; 3959 ref->buffer = info->tr->buffer;
3856 ref->page = ring_buffer_alloc_read_page(ref->buffer); 3960 ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu);
3857 if (!ref->page) { 3961 if (!ref->page) {
3858 kfree(ref); 3962 kfree(ref);
3859 break; 3963 break;
@@ -3862,8 +3966,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3862 r = ring_buffer_read_page(ref->buffer, &ref->page, 3966 r = ring_buffer_read_page(ref->buffer, &ref->page,
3863 len, info->cpu, 1); 3967 len, info->cpu, 1);
3864 if (r < 0) { 3968 if (r < 0) {
3865 ring_buffer_free_read_page(ref->buffer, 3969 ring_buffer_free_read_page(ref->buffer, ref->page);
3866 ref->page);
3867 kfree(ref); 3970 kfree(ref);
3868 break; 3971 break;
3869 } 3972 }
@@ -4099,19 +4202,10 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
4099{ 4202{
4100 struct trace_option_dentry *topt = filp->private_data; 4203 struct trace_option_dentry *topt = filp->private_data;
4101 unsigned long val; 4204 unsigned long val;
4102 char buf[64];
4103 int ret; 4205 int ret;
4104 4206
4105 if (cnt >= sizeof(buf)) 4207 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4106 return -EINVAL; 4208 if (ret)
4107
4108 if (copy_from_user(&buf, ubuf, cnt))
4109 return -EFAULT;
4110
4111 buf[cnt] = 0;
4112
4113 ret = strict_strtoul(buf, 10, &val);
4114 if (ret < 0)
4115 return ret; 4209 return ret;
4116 4210
4117 if (val != 0 && val != 1) 4211 if (val != 0 && val != 1)
@@ -4159,20 +4253,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4159 loff_t *ppos) 4253 loff_t *ppos)
4160{ 4254{
4161 long index = (long)filp->private_data; 4255 long index = (long)filp->private_data;
4162 char buf[64];
4163 unsigned long val; 4256 unsigned long val;
4164 int ret; 4257 int ret;
4165 4258
4166 if (cnt >= sizeof(buf)) 4259 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4167 return -EINVAL; 4260 if (ret)
4168
4169 if (copy_from_user(&buf, ubuf, cnt))
4170 return -EFAULT;
4171
4172 buf[cnt] = 0;
4173
4174 ret = strict_strtoul(buf, 10, &val);
4175 if (ret < 0)
4176 return ret; 4261 return ret;
4177 4262
4178 if (val != 0 && val != 1) 4263 if (val != 0 && val != 1)
@@ -4365,6 +4450,9 @@ static __init int tracer_init_debugfs(void)
4365 trace_create_file("buffer_size_kb", 0644, d_tracer, 4450 trace_create_file("buffer_size_kb", 0644, d_tracer,
4366 &global_trace, &tracing_entries_fops); 4451 &global_trace, &tracing_entries_fops);
4367 4452
4453 trace_create_file("free_buffer", 0644, d_tracer,
4454 &global_trace, &tracing_free_buffer_fops);
4455
4368 trace_create_file("trace_marker", 0220, d_tracer, 4456 trace_create_file("trace_marker", 0220, d_tracer,
4369 NULL, &tracing_mark_fops); 4457 NULL, &tracing_mark_fops);
4370 4458
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 229f8591f61..616846bcfee 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -2,7 +2,7 @@
2#define _LINUX_KERNEL_TRACE_H 2#define _LINUX_KERNEL_TRACE_H
3 3
4#include <linux/fs.h> 4#include <linux/fs.h>
5#include <asm/atomic.h> 5#include <linux/atomic.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/clocksource.h> 7#include <linux/clocksource.h>
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
@@ -278,6 +278,29 @@ struct tracer {
278}; 278};
279 279
280 280
281/* Only current can touch trace_recursion */
282#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
283#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
284
285/* Ring buffer has the 10 LSB bits to count */
286#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
287
288/* for function tracing recursion */
289#define TRACE_INTERNAL_BIT (1<<11)
290#define TRACE_GLOBAL_BIT (1<<12)
291/*
292 * Abuse of the trace_recursion.
293 * As we need a way to maintain state if we are tracing the function
294 * graph in irq because we want to trace a particular function that
295 * was called in irq context but we have irq tracing off. Since this
296 * can only be modified by current, we can reuse trace_recursion.
297 */
298#define TRACE_IRQ_BIT (1<<13)
299
300#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
301#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
302#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
303
281#define TRACE_PIPE_ALL_CPU -1 304#define TRACE_PIPE_ALL_CPU -1
282 305
283int tracer_init(struct tracer *t, struct trace_array *tr); 306int tracer_init(struct tracer *t, struct trace_array *tr);
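
With the trace_recursion helpers relocated here, a caller flags per-task state by setting or clearing bits in current->trace_recursion; the new TRACE_IRQ_BIT is used exactly that way by ftrace_graph_addr() further down in this patch. A minimal usage sketch:

#include <linux/hardirq.h>
#include "trace.h"	/* trace_recursion_*() and TRACE_IRQ_BIT from above */

static void note_irq_context(void)
{
	/* Remember whether the matched function was hit from irq context. */
	if (in_irq())
		trace_recursion_set(TRACE_IRQ_BIT);
	else
		trace_recursion_clear(TRACE_IRQ_BIT);
}

static int hit_in_irq(void)
{
	return trace_recursion_test(TRACE_IRQ_BIT) != 0;
}
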
@@ -389,6 +412,9 @@ void update_max_tr_single(struct trace_array *tr,
389void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, 412void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
390 int skip, int pc); 413 int skip, int pc);
391 414
415void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
416 int skip, int pc, struct pt_regs *regs);
417
392void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, 418void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
393 int pc); 419 int pc);
394 420
@@ -400,6 +426,12 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer,
400{ 426{
401} 427}
402 428
429static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer,
430 unsigned long flags, int skip,
431 int pc, struct pt_regs *regs)
432{
433}
434
403static inline void ftrace_trace_userstack(struct ring_buffer *buffer, 435static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
404 unsigned long flags, int pc) 436 unsigned long flags, int pc)
405{ 437{
@@ -507,8 +539,18 @@ static inline int ftrace_graph_addr(unsigned long addr)
507 return 1; 539 return 1;
508 540
509 for (i = 0; i < ftrace_graph_count; i++) { 541 for (i = 0; i < ftrace_graph_count; i++) {
510 if (addr == ftrace_graph_funcs[i]) 542 if (addr == ftrace_graph_funcs[i]) {
543 /*
544 * If no irqs are to be traced, but a set_graph_function
545 * is set, and called by an interrupt handler, we still
546 * want to trace it.
547 */
548 if (in_irq())
549 trace_recursion_set(TRACE_IRQ_BIT);
550 else
551 trace_recursion_clear(TRACE_IRQ_BIT);
511 return 1; 552 return 1;
553 }
512 } 554 }
513 555
514 return 0; 556 return 0;
@@ -609,6 +651,7 @@ enum trace_iterator_flags {
609 TRACE_ITER_GRAPH_TIME = 0x80000, 651 TRACE_ITER_GRAPH_TIME = 0x80000,
610 TRACE_ITER_RECORD_CMD = 0x100000, 652 TRACE_ITER_RECORD_CMD = 0x100000,
611 TRACE_ITER_OVERWRITE = 0x200000, 653 TRACE_ITER_OVERWRITE = 0x200000,
654 TRACE_ITER_STOP_ON_FREE = 0x400000,
612}; 655};
613 656
614/* 657/*
@@ -677,6 +720,7 @@ struct event_subsystem {
677 struct dentry *entry; 720 struct dentry *entry;
678 struct event_filter *filter; 721 struct event_filter *filter;
679 int nr_events; 722 int nr_events;
723 int ref_count;
680}; 724};
681 725
682#define FILTER_PRED_INVALID ((unsigned short)-1) 726#define FILTER_PRED_INVALID ((unsigned short)-1)
@@ -784,19 +828,4 @@ extern const char *__stop___trace_bprintk_fmt[];
784 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 828 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
785#include "trace_entries.h" 829#include "trace_entries.h"
786 830
787/* Only current can touch trace_recursion */
788#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
789#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
790
791/* Ring buffer has the 10 LSB bits to count */
792#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
793
794/* for function tracing recursion */
795#define TRACE_INTERNAL_BIT (1<<11)
796#define TRACE_GLOBAL_BIT (1<<12)
797
798#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
799#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
800#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
801
802#endif /* _LINUX_KERNEL_TRACE_H */ 831#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e32744c84d9..93365907f21 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -161,7 +161,8 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
161 TRACE_STACK, 161 TRACE_STACK,
162 162
163 F_STRUCT( 163 F_STRUCT(
164 __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) 164 __field( int, size )
165 __dynamic_array(unsigned long, caller )
165 ), 166 ),
166 167
167 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" 168 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 686ec399f2a..c212a7f934e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -244,6 +244,35 @@ static void ftrace_clear_events(void)
244 mutex_unlock(&event_mutex); 244 mutex_unlock(&event_mutex);
245} 245}
246 246
247static void __put_system(struct event_subsystem *system)
248{
249 struct event_filter *filter = system->filter;
250
251 WARN_ON_ONCE(system->ref_count == 0);
252 if (--system->ref_count)
253 return;
254
255 if (filter) {
256 kfree(filter->filter_string);
257 kfree(filter);
258 }
259 kfree(system->name);
260 kfree(system);
261}
262
263static void __get_system(struct event_subsystem *system)
264{
265 WARN_ON_ONCE(system->ref_count == 0);
266 system->ref_count++;
267}
268
269static void put_system(struct event_subsystem *system)
270{
271 mutex_lock(&event_mutex);
272 __put_system(system);
273 mutex_unlock(&event_mutex);
274}
275
247/* 276/*
248 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 277 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
249 */ 278 */
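
The helpers above add a plain reference count to event_subsystem: __get_system()/__put_system() expect event_mutex to be held, and put_system() is the locking convenience wrapper. Their pairing is established later in this patch (subsystem_open() takes the reference, subsystem_release() drops it); a compressed sketch of that lifetime, with placeholder file-operation names:

/* Placeholder names; the real pairing is subsystem_open()/subsystem_release(). */
static int example_open(struct inode *inode, struct file *filp)
{
	struct event_subsystem *system = inode->i_private;

	mutex_lock(&event_mutex);
	__get_system(system);		/* keep it alive while the file is open */
	mutex_unlock(&event_mutex);

	return 0;
}

static int example_release(struct inode *inode, struct file *filp)
{
	put_system(inode->i_private);	/* may free the subsystem on the last put */
	return 0;
}
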
@@ -486,20 +515,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
486 loff_t *ppos) 515 loff_t *ppos)
487{ 516{
488 struct ftrace_event_call *call = filp->private_data; 517 struct ftrace_event_call *call = filp->private_data;
489 char buf[64];
490 unsigned long val; 518 unsigned long val;
491 int ret; 519 int ret;
492 520
493 if (cnt >= sizeof(buf)) 521 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
494 return -EINVAL; 522 if (ret)
495
496 if (copy_from_user(&buf, ubuf, cnt))
497 return -EFAULT;
498
499 buf[cnt] = 0;
500
501 ret = strict_strtoul(buf, 10, &val);
502 if (ret < 0)
503 return ret; 523 return ret;
504 524
505 ret = tracing_update_buffers(); 525 ret = tracing_update_buffers();
@@ -528,7 +548,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
528 loff_t *ppos) 548 loff_t *ppos)
529{ 549{
530 const char set_to_char[4] = { '?', '0', '1', 'X' }; 550 const char set_to_char[4] = { '?', '0', '1', 'X' };
531 const char *system = filp->private_data; 551 struct event_subsystem *system = filp->private_data;
532 struct ftrace_event_call *call; 552 struct ftrace_event_call *call;
533 char buf[2]; 553 char buf[2];
534 int set = 0; 554 int set = 0;
@@ -539,7 +559,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
539 if (!call->name || !call->class || !call->class->reg) 559 if (!call->name || !call->class || !call->class->reg)
540 continue; 560 continue;
541 561
542 if (system && strcmp(call->class->system, system) != 0) 562 if (system && strcmp(call->class->system, system->name) != 0)
543 continue; 563 continue;
544 564
545 /* 565 /*
@@ -569,21 +589,13 @@ static ssize_t
569system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 589system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
570 loff_t *ppos) 590 loff_t *ppos)
571{ 591{
572 const char *system = filp->private_data; 592 struct event_subsystem *system = filp->private_data;
593 const char *name = NULL;
573 unsigned long val; 594 unsigned long val;
574 char buf[64];
575 ssize_t ret; 595 ssize_t ret;
576 596
577 if (cnt >= sizeof(buf)) 597 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
578 return -EINVAL; 598 if (ret)
579
580 if (copy_from_user(&buf, ubuf, cnt))
581 return -EFAULT;
582
583 buf[cnt] = 0;
584
585 ret = strict_strtoul(buf, 10, &val);
586 if (ret < 0)
587 return ret; 599 return ret;
588 600
589 ret = tracing_update_buffers(); 601 ret = tracing_update_buffers();
@@ -593,7 +605,14 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
593 if (val != 0 && val != 1) 605 if (val != 0 && val != 1)
594 return -EINVAL; 606 return -EINVAL;
595 607
596 ret = __ftrace_set_clr_event(NULL, system, NULL, val); 608 /*
609 * Opening of "enable" adds a ref count to system,
610 * so the name is safe to use.
611 */
612 if (system)
613 name = system->name;
614
615 ret = __ftrace_set_clr_event(NULL, name, NULL, val);
597 if (ret) 616 if (ret)
598 goto out; 617 goto out;
599 618
@@ -826,6 +845,52 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
826 return cnt; 845 return cnt;
827} 846}
828 847
848static LIST_HEAD(event_subsystems);
849
850static int subsystem_open(struct inode *inode, struct file *filp)
851{
852 struct event_subsystem *system = NULL;
853 int ret;
854
855 if (!inode->i_private)
856 goto skip_search;
857
858 /* Make sure the system still exists */
859 mutex_lock(&event_mutex);
860 list_for_each_entry(system, &event_subsystems, list) {
861 if (system == inode->i_private) {
862 /* Don't open systems with no events */
863 if (!system->nr_events) {
864 system = NULL;
865 break;
866 }
867 __get_system(system);
868 break;
869 }
870 }
871 mutex_unlock(&event_mutex);
872
873 if (system != inode->i_private)
874 return -ENODEV;
875
876 skip_search:
877 ret = tracing_open_generic(inode, filp);
878 if (ret < 0 && system)
879 put_system(system);
880
881 return ret;
882}
883
884static int subsystem_release(struct inode *inode, struct file *file)
885{
886 struct event_subsystem *system = inode->i_private;
887
888 if (system)
889 put_system(system);
890
891 return 0;
892}
893
829static ssize_t 894static ssize_t
830subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 895subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
831 loff_t *ppos) 896 loff_t *ppos)
@@ -963,17 +1028,19 @@ static const struct file_operations ftrace_event_filter_fops = {
963}; 1028};
964 1029
965static const struct file_operations ftrace_subsystem_filter_fops = { 1030static const struct file_operations ftrace_subsystem_filter_fops = {
966 .open = tracing_open_generic, 1031 .open = subsystem_open,
967 .read = subsystem_filter_read, 1032 .read = subsystem_filter_read,
968 .write = subsystem_filter_write, 1033 .write = subsystem_filter_write,
969 .llseek = default_llseek, 1034 .llseek = default_llseek,
1035 .release = subsystem_release,
970}; 1036};
971 1037
972static const struct file_operations ftrace_system_enable_fops = { 1038static const struct file_operations ftrace_system_enable_fops = {
973 .open = tracing_open_generic, 1039 .open = subsystem_open,
974 .read = system_enable_read, 1040 .read = system_enable_read,
975 .write = system_enable_write, 1041 .write = system_enable_write,
976 .llseek = default_llseek, 1042 .llseek = default_llseek,
1043 .release = subsystem_release,
977}; 1044};
978 1045
979static const struct file_operations ftrace_show_header_fops = { 1046static const struct file_operations ftrace_show_header_fops = {
@@ -1002,8 +1069,6 @@ static struct dentry *event_trace_events_dir(void)
1002 return d_events; 1069 return d_events;
1003} 1070}
1004 1071
1005static LIST_HEAD(event_subsystems);
1006
1007static struct dentry * 1072static struct dentry *
1008event_subsystem_dir(const char *name, struct dentry *d_events) 1073event_subsystem_dir(const char *name, struct dentry *d_events)
1009{ 1074{
@@ -1035,6 +1100,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1035 } 1100 }
1036 1101
1037 system->nr_events = 1; 1102 system->nr_events = 1;
1103 system->ref_count = 1;
1038 system->name = kstrdup(name, GFP_KERNEL); 1104 system->name = kstrdup(name, GFP_KERNEL);
1039 if (!system->name) { 1105 if (!system->name) {
1040 debugfs_remove(system->entry); 1106 debugfs_remove(system->entry);
@@ -1062,8 +1128,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1062 "'%s/filter' entry\n", name); 1128 "'%s/filter' entry\n", name);
1063 } 1129 }
1064 1130
1065 trace_create_file("enable", 0644, system->entry, 1131 trace_create_file("enable", 0644, system->entry, system,
1066 (void *)system->name,
1067 &ftrace_system_enable_fops); 1132 &ftrace_system_enable_fops);
1068 1133
1069 return system->entry; 1134 return system->entry;
@@ -1184,16 +1249,9 @@ static void remove_subsystem_dir(const char *name)
1184 list_for_each_entry(system, &event_subsystems, list) { 1249 list_for_each_entry(system, &event_subsystems, list) {
1185 if (strcmp(system->name, name) == 0) { 1250 if (strcmp(system->name, name) == 0) {
1186 if (!--system->nr_events) { 1251 if (!--system->nr_events) {
1187 struct event_filter *filter = system->filter;
1188
1189 debugfs_remove_recursive(system->entry); 1252 debugfs_remove_recursive(system->entry);
1190 list_del(&system->list); 1253 list_del(&system->list);
1191 if (filter) { 1254 __put_system(system);
1192 kfree(filter->filter_string);
1193 kfree(filter);
1194 }
1195 kfree(system->name);
1196 kfree(system);
1197 } 1255 }
1198 break; 1256 break;
1199 } 1257 }
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8008ddcfbf2..bd3c6369f80 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1766,7 +1766,7 @@ static int replace_system_preds(struct event_subsystem *system,
1766 * replace the filter for the call. 1766 * replace the filter for the call.
1767 */ 1767 */
1768 filter = call->filter; 1768 filter = call->filter;
1769 call->filter = filter_item->filter; 1769 rcu_assign_pointer(call->filter, filter_item->filter);
1770 filter_item->filter = filter; 1770 filter_item->filter = filter;
1771 1771
1772 fail = false; 1772 fail = false;
@@ -1821,7 +1821,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1821 filter = call->filter; 1821 filter = call->filter;
1822 if (!filter) 1822 if (!filter)
1823 goto out_unlock; 1823 goto out_unlock;
1824 call->filter = NULL; 1824 RCU_INIT_POINTER(call->filter, NULL);
1825 /* Make sure the filter is not being used */ 1825 /* Make sure the filter is not being used */
1826 synchronize_sched(); 1826 synchronize_sched();
1827 __free_filter(filter); 1827 __free_filter(filter);
@@ -1862,7 +1862,7 @@ out:
1862 * string 1862 * string
1863 */ 1863 */
1864 tmp = call->filter; 1864 tmp = call->filter;
1865 call->filter = filter; 1865 rcu_assign_pointer(call->filter, filter);
1866 if (tmp) { 1866 if (tmp) {
1867 /* Make sure the call is done with the filter */ 1867 /* Make sure the call is done with the filter */
1868 synchronize_sched(); 1868 synchronize_sched();
@@ -1886,6 +1886,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1886 1886
1887 mutex_lock(&event_mutex); 1887 mutex_lock(&event_mutex);
1888 1888
1889 /* Make sure the system still has events */
1890 if (!system->nr_events) {
1891 err = -ENODEV;
1892 goto out_unlock;
1893 }
1894
1889 if (!strcmp(strstrip(filter_string), "0")) { 1895 if (!strcmp(strstrip(filter_string), "0")) {
1890 filter_free_subsystem_preds(system); 1896 filter_free_subsystem_preds(system);
1891 remove_filter_string(system->filter); 1897 remove_filter_string(system->filter);
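The filter changes above switch bare pointer assignments to rcu_assign_pointer()/RCU_INIT_POINTER(): a new filter must be fully initialized and published with release semantics before readers can see it, and the old one is only freed after synchronize_sched() guarantees no reader still holds it. The sketch below imitates only the publish side with C11 release/acquire atomics; the grace-period wait is reduced to a comment, since a faithful user-space version would need liburcu. struct filter, current_filter and install_filter() are names invented for the sketch.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct filter {
    char *filter_string;
};

/* Readers load this with acquire ordering, like rcu_dereference(). */
static _Atomic(struct filter *) current_filter;

/* Publish a new filter, then reclaim the old one. */
static void install_filter(const char *str)
{
    struct filter *nf = malloc(sizeof(*nf));
    struct filter *old;

    nf->filter_string = strdup(str);

    /* Release store: the fully built filter becomes visible first. */
    old = atomic_exchange_explicit(&current_filter, nf,
                                   memory_order_acq_rel);

    /*
     * The kernel calls synchronize_sched() at this point so no reader can
     * still hold "old" when it is freed; a user-space port would use
     * synchronize_rcu() from liburcu instead.
     */
    if (old) {
        free(old->filter_string);
        free(old);
    }
}

static void read_filter(void)
{
    struct filter *f = atomic_load_explicit(&current_filter,
                                            memory_order_acquire);
    if (f)
        printf("active filter: %s\n", f->filter_string);
}

int main(void)
{
    install_filter("common_pid != 0");
    read_filter();
    install_filter("0");    /* replacing frees the previous filter */
    read_filter();
    return 0;
}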
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8d0e1cc4e97..c7b0c6a7db0 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -324,7 +324,8 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
324} 324}
325 325
326static int 326static int
327ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) 327ftrace_trace_onoff_callback(struct ftrace_hash *hash,
328 char *glob, char *cmd, char *param, int enable)
328{ 329{
329 struct ftrace_probe_ops *ops; 330 struct ftrace_probe_ops *ops;
330 void *count = (void *)-1; 331 void *count = (void *)-1;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 962cdb24ed8..a7d2a4c653d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -74,6 +74,20 @@ static struct tracer_flags tracer_flags = {
74 74
75static struct trace_array *graph_array; 75static struct trace_array *graph_array;
76 76
77/*
 78 * The DURATION column is also used to display IRQ signs; the
 79 * following values are used by print_graph_irq and others
 80 * to fill in space in the DURATION column.
81 */
82enum {
83 DURATION_FILL_FULL = -1,
84 DURATION_FILL_START = -2,
85 DURATION_FILL_END = -3,
86};
87
88static enum print_line_t
89print_graph_duration(unsigned long long duration, struct trace_seq *s,
90 u32 flags);
77 91
78/* Add a function return address to the trace stack on thread info.*/ 92/* Add a function return address to the trace stack on thread info.*/
79int 93int
@@ -213,7 +227,7 @@ int __trace_graph_entry(struct trace_array *tr,
213 227
214static inline int ftrace_graph_ignore_irqs(void) 228static inline int ftrace_graph_ignore_irqs(void)
215{ 229{
216 if (!ftrace_graph_skip_irqs) 230 if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT))
217 return 0; 231 return 0;
218 232
219 return in_irq(); 233 return in_irq();
@@ -577,32 +591,6 @@ get_return_for_leaf(struct trace_iterator *iter,
577 return next; 591 return next;
578} 592}
579 593
580/* Signal a overhead of time execution to the output */
581static int
582print_graph_overhead(unsigned long long duration, struct trace_seq *s,
583 u32 flags)
584{
585 /* If duration disappear, we don't need anything */
586 if (!(flags & TRACE_GRAPH_PRINT_DURATION))
587 return 1;
588
589 /* Non nested entry or return */
590 if (duration == -1)
591 return trace_seq_printf(s, " ");
592
593 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
594 /* Duration exceeded 100 msecs */
595 if (duration > 100000ULL)
596 return trace_seq_printf(s, "! ");
597
598 /* Duration exceeded 10 msecs */
599 if (duration > 10000ULL)
600 return trace_seq_printf(s, "+ ");
601 }
602
603 return trace_seq_printf(s, " ");
604}
605
606static int print_graph_abs_time(u64 t, struct trace_seq *s) 594static int print_graph_abs_time(u64 t, struct trace_seq *s)
607{ 595{
608 unsigned long usecs_rem; 596 unsigned long usecs_rem;
@@ -625,34 +613,36 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
625 addr >= (unsigned long)__irqentry_text_end) 613 addr >= (unsigned long)__irqentry_text_end)
626 return TRACE_TYPE_UNHANDLED; 614 return TRACE_TYPE_UNHANDLED;
627 615
628 /* Absolute time */ 616 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
629 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 617 /* Absolute time */
630 ret = print_graph_abs_time(iter->ts, s); 618 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
631 if (!ret) 619 ret = print_graph_abs_time(iter->ts, s);
632 return TRACE_TYPE_PARTIAL_LINE; 620 if (!ret)
633 } 621 return TRACE_TYPE_PARTIAL_LINE;
622 }
634 623
635 /* Cpu */ 624 /* Cpu */
636 if (flags & TRACE_GRAPH_PRINT_CPU) { 625 if (flags & TRACE_GRAPH_PRINT_CPU) {
637 ret = print_graph_cpu(s, cpu); 626 ret = print_graph_cpu(s, cpu);
638 if (ret == TRACE_TYPE_PARTIAL_LINE) 627 if (ret == TRACE_TYPE_PARTIAL_LINE)
639 return TRACE_TYPE_PARTIAL_LINE; 628 return TRACE_TYPE_PARTIAL_LINE;
640 } 629 }
641 630
642 /* Proc */ 631 /* Proc */
643 if (flags & TRACE_GRAPH_PRINT_PROC) { 632 if (flags & TRACE_GRAPH_PRINT_PROC) {
644 ret = print_graph_proc(s, pid); 633 ret = print_graph_proc(s, pid);
645 if (ret == TRACE_TYPE_PARTIAL_LINE) 634 if (ret == TRACE_TYPE_PARTIAL_LINE)
646 return TRACE_TYPE_PARTIAL_LINE; 635 return TRACE_TYPE_PARTIAL_LINE;
647 ret = trace_seq_printf(s, " | "); 636 ret = trace_seq_printf(s, " | ");
648 if (!ret) 637 if (!ret)
649 return TRACE_TYPE_PARTIAL_LINE; 638 return TRACE_TYPE_PARTIAL_LINE;
639 }
650 } 640 }
651 641
652 /* No overhead */ 642 /* No overhead */
653 ret = print_graph_overhead(-1, s, flags); 643 ret = print_graph_duration(DURATION_FILL_START, s, flags);
654 if (!ret) 644 if (ret != TRACE_TYPE_HANDLED)
655 return TRACE_TYPE_PARTIAL_LINE; 645 return ret;
656 646
657 if (type == TRACE_GRAPH_ENT) 647 if (type == TRACE_GRAPH_ENT)
658 ret = trace_seq_printf(s, "==========>"); 648 ret = trace_seq_printf(s, "==========>");
@@ -662,9 +652,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
662 if (!ret) 652 if (!ret)
663 return TRACE_TYPE_PARTIAL_LINE; 653 return TRACE_TYPE_PARTIAL_LINE;
664 654
665 /* Don't close the duration column if haven't one */ 655 ret = print_graph_duration(DURATION_FILL_END, s, flags);
666 if (flags & TRACE_GRAPH_PRINT_DURATION) 656 if (ret != TRACE_TYPE_HANDLED)
667 trace_seq_printf(s, " |"); 657 return ret;
658
668 ret = trace_seq_printf(s, "\n"); 659 ret = trace_seq_printf(s, "\n");
669 660
670 if (!ret) 661 if (!ret)
@@ -716,9 +707,49 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
716} 707}
717 708
718static enum print_line_t 709static enum print_line_t
719print_graph_duration(unsigned long long duration, struct trace_seq *s) 710print_graph_duration(unsigned long long duration, struct trace_seq *s,
711 u32 flags)
720{ 712{
721 int ret; 713 int ret = -1;
714
715 if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
716 !(trace_flags & TRACE_ITER_CONTEXT_INFO))
717 return TRACE_TYPE_HANDLED;
718
 719 /* No real data, just filling the column with spaces */
720 switch (duration) {
721 case DURATION_FILL_FULL:
722 ret = trace_seq_printf(s, " | ");
723 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
724 case DURATION_FILL_START:
725 ret = trace_seq_printf(s, " ");
726 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
727 case DURATION_FILL_END:
728 ret = trace_seq_printf(s, " |");
729 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
730 }
731
 732 /* Signal an overhead of execution time to the output */
733 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
734 /* Duration exceeded 100 msecs */
735 if (duration > 100000ULL)
736 ret = trace_seq_printf(s, "! ");
737 /* Duration exceeded 10 msecs */
738 else if (duration > 10000ULL)
739 ret = trace_seq_printf(s, "+ ");
740 }
741
742 /*
 743 * The -1 means we either did not exceed the duration thresholds
 744 * or we don't want to print out the overhead. Either way we need
745 * to fill out the space.
746 */
747 if (ret == -1)
748 ret = trace_seq_printf(s, " ");
749
 750 /* Catch any failure that happened above */
751 if (!ret)
752 return TRACE_TYPE_PARTIAL_LINE;
722 753
723 ret = trace_print_graph_duration(duration, s); 754 ret = trace_print_graph_duration(duration, s);
724 if (ret != TRACE_TYPE_HANDLED) 755 if (ret != TRACE_TYPE_HANDLED)
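After this hunk, print_graph_duration() absorbs the old print_graph_overhead(): the negative DURATION_FILL_* sentinels only pad the column, real durations get a '!' or '+' overhead marker ahead of the microsecond value, and everything is skipped when context info is disabled. Below is a condensed user-space sketch of that decision logic; the sentinel values and the 100 ms / 10 ms thresholds come from the patch, while the flag names, column widths and print_duration() itself are simplified stand-ins.

#include <stdio.h>

enum {
    DURATION_FILL_FULL  = -1,   /* whole column of spaces plus "|" */
    DURATION_FILL_START = -2,   /* leading pad before an IRQ marker */
    DURATION_FILL_END   = -3,   /* trailing " |" after an IRQ marker */
};

#define PRINT_DURATION 0x1
#define PRINT_OVERHEAD 0x2

/* Emit the DURATION column for one trace line. */
static void print_duration(long long duration, unsigned int flags)
{
    if (!(flags & PRINT_DURATION))
        return;

    /* No real data, just fill the column with spaces. */
    switch (duration) {
    case DURATION_FILL_FULL:
        fputs("          |  ", stdout);
        return;
    case DURATION_FILL_START:
        fputs("  ", stdout);
        return;
    case DURATION_FILL_END:
        fputs(" |", stdout);
        return;
    }

    /* Overhead marker: '!' above 100 ms, '+' above 10 ms (values in usecs). */
    if ((flags & PRINT_OVERHEAD) && duration > 100000LL)
        fputs("! ", stdout);
    else if ((flags & PRINT_OVERHEAD) && duration > 10000LL)
        fputs("+ ", stdout);
    else
        fputs("  ", stdout);

    printf("%llu us |", (unsigned long long)duration);
}

int main(void)
{
    print_duration(DURATION_FILL_FULL, PRINT_DURATION);
    puts("funcgraph_entry()");
    print_duration(123456, PRINT_DURATION | PRINT_OVERHEAD);
    puts(" }");     /* prints "! 123456 us |" before the closing brace */
    return 0;
}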
@@ -767,18 +798,11 @@ print_graph_entry_leaf(struct trace_iterator *iter,
767 cpu_data->enter_funcs[call->depth] = 0; 798 cpu_data->enter_funcs[call->depth] = 0;
768 } 799 }
769 800
770 /* Overhead */ 801 /* Overhead and duration */
771 ret = print_graph_overhead(duration, s, flags); 802 ret = print_graph_duration(duration, s, flags);
772 if (!ret) 803 if (ret == TRACE_TYPE_PARTIAL_LINE)
773 return TRACE_TYPE_PARTIAL_LINE; 804 return TRACE_TYPE_PARTIAL_LINE;
774 805
775 /* Duration */
776 if (flags & TRACE_GRAPH_PRINT_DURATION) {
777 ret = print_graph_duration(duration, s);
778 if (ret == TRACE_TYPE_PARTIAL_LINE)
779 return TRACE_TYPE_PARTIAL_LINE;
780 }
781
782 /* Function */ 806 /* Function */
783 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 807 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
784 ret = trace_seq_printf(s, " "); 808 ret = trace_seq_printf(s, " ");
@@ -815,17 +839,10 @@ print_graph_entry_nested(struct trace_iterator *iter,
815 cpu_data->enter_funcs[call->depth] = call->func; 839 cpu_data->enter_funcs[call->depth] = call->func;
816 } 840 }
817 841
818 /* No overhead */
819 ret = print_graph_overhead(-1, s, flags);
820 if (!ret)
821 return TRACE_TYPE_PARTIAL_LINE;
822
823 /* No time */ 842 /* No time */
824 if (flags & TRACE_GRAPH_PRINT_DURATION) { 843 ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
825 ret = trace_seq_printf(s, " | "); 844 if (ret != TRACE_TYPE_HANDLED)
826 if (!ret) 845 return ret;
827 return TRACE_TYPE_PARTIAL_LINE;
828 }
829 846
830 /* Function */ 847 /* Function */
831 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 848 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
@@ -865,6 +882,9 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
865 return TRACE_TYPE_PARTIAL_LINE; 882 return TRACE_TYPE_PARTIAL_LINE;
866 } 883 }
867 884
885 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
886 return 0;
887
868 /* Absolute time */ 888 /* Absolute time */
869 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 889 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
870 ret = print_graph_abs_time(iter->ts, s); 890 ret = print_graph_abs_time(iter->ts, s);
@@ -1078,18 +1098,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1078 if (print_graph_prologue(iter, s, 0, 0, flags)) 1098 if (print_graph_prologue(iter, s, 0, 0, flags))
1079 return TRACE_TYPE_PARTIAL_LINE; 1099 return TRACE_TYPE_PARTIAL_LINE;
1080 1100
1081 /* Overhead */ 1101 /* Overhead and duration */
1082 ret = print_graph_overhead(duration, s, flags); 1102 ret = print_graph_duration(duration, s, flags);
1083 if (!ret) 1103 if (ret == TRACE_TYPE_PARTIAL_LINE)
1084 return TRACE_TYPE_PARTIAL_LINE; 1104 return TRACE_TYPE_PARTIAL_LINE;
1085 1105
1086 /* Duration */
1087 if (flags & TRACE_GRAPH_PRINT_DURATION) {
1088 ret = print_graph_duration(duration, s);
1089 if (ret == TRACE_TYPE_PARTIAL_LINE)
1090 return TRACE_TYPE_PARTIAL_LINE;
1091 }
1092
1093 /* Closing brace */ 1106 /* Closing brace */
1094 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 1107 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
1095 ret = trace_seq_printf(s, " "); 1108 ret = trace_seq_printf(s, " ");
@@ -1146,17 +1159,10 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1146 if (print_graph_prologue(iter, s, 0, 0, flags)) 1159 if (print_graph_prologue(iter, s, 0, 0, flags))
1147 return TRACE_TYPE_PARTIAL_LINE; 1160 return TRACE_TYPE_PARTIAL_LINE;
1148 1161
1149 /* No overhead */
1150 ret = print_graph_overhead(-1, s, flags);
1151 if (!ret)
1152 return TRACE_TYPE_PARTIAL_LINE;
1153
1154 /* No time */ 1162 /* No time */
1155 if (flags & TRACE_GRAPH_PRINT_DURATION) { 1163 ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
1156 ret = trace_seq_printf(s, " | "); 1164 if (ret != TRACE_TYPE_HANDLED)
1157 if (!ret) 1165 return ret;
1158 return TRACE_TYPE_PARTIAL_LINE;
1159 }
1160 1166
1161 /* Indentation */ 1167 /* Indentation */
1162 if (depth > 0) 1168 if (depth > 0)
@@ -1207,7 +1213,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1207 1213
1208 1214
1209enum print_line_t 1215enum print_line_t
1210__print_graph_function_flags(struct trace_iterator *iter, u32 flags) 1216print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1211{ 1217{
1212 struct ftrace_graph_ent_entry *field; 1218 struct ftrace_graph_ent_entry *field;
1213 struct fgraph_data *data = iter->private; 1219 struct fgraph_data *data = iter->private;
@@ -1270,18 +1276,7 @@ __print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1270static enum print_line_t 1276static enum print_line_t
1271print_graph_function(struct trace_iterator *iter) 1277print_graph_function(struct trace_iterator *iter)
1272{ 1278{
1273 return __print_graph_function_flags(iter, tracer_flags.val); 1279 return print_graph_function_flags(iter, tracer_flags.val);
1274}
1275
1276enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
1277 u32 flags)
1278{
1279 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1280 flags |= TRACE_GRAPH_PRINT_DURATION;
1281 else
1282 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1283
1284 return __print_graph_function_flags(iter, flags);
1285} 1280}
1286 1281
1287static enum print_line_t 1282static enum print_line_t
@@ -1309,8 +1304,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
1309 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); 1304 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
1310 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); 1305 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
1311 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); 1306 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
1312 seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); 1307 seq_printf(s, "#%.*s||| / \n", size, spaces);
1313 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1314} 1308}
1315 1309
1316static void __print_graph_headers_flags(struct seq_file *s, u32 flags) 1310static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
@@ -1329,7 +1323,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1329 if (flags & TRACE_GRAPH_PRINT_PROC) 1323 if (flags & TRACE_GRAPH_PRINT_PROC)
1330 seq_printf(s, " TASK/PID "); 1324 seq_printf(s, " TASK/PID ");
1331 if (lat) 1325 if (lat)
1332 seq_printf(s, "|||||"); 1326 seq_printf(s, "||||");
1333 if (flags & TRACE_GRAPH_PRINT_DURATION) 1327 if (flags & TRACE_GRAPH_PRINT_DURATION)
1334 seq_printf(s, " DURATION "); 1328 seq_printf(s, " DURATION ");
1335 seq_printf(s, " FUNCTION CALLS\n"); 1329 seq_printf(s, " FUNCTION CALLS\n");
@@ -1343,7 +1337,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1343 if (flags & TRACE_GRAPH_PRINT_PROC) 1337 if (flags & TRACE_GRAPH_PRINT_PROC)
1344 seq_printf(s, " | | "); 1338 seq_printf(s, " | | ");
1345 if (lat) 1339 if (lat)
1346 seq_printf(s, "|||||"); 1340 seq_printf(s, "||||");
1347 if (flags & TRACE_GRAPH_PRINT_DURATION) 1341 if (flags & TRACE_GRAPH_PRINT_DURATION)
1348 seq_printf(s, " | | "); 1342 seq_printf(s, " | | ");
1349 seq_printf(s, " | | | |\n"); 1343 seq_printf(s, " | | | |\n");
@@ -1358,15 +1352,16 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
1358{ 1352{
1359 struct trace_iterator *iter = s->private; 1353 struct trace_iterator *iter = s->private;
1360 1354
1355 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
1356 return;
1357
1361 if (trace_flags & TRACE_ITER_LATENCY_FMT) { 1358 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
1362 /* print nothing if the buffers are empty */ 1359 /* print nothing if the buffers are empty */
1363 if (trace_empty(iter)) 1360 if (trace_empty(iter))
1364 return; 1361 return;
1365 1362
1366 print_trace_header(s, iter); 1363 print_trace_header(s, iter);
1367 flags |= TRACE_GRAPH_PRINT_DURATION; 1364 }
1368 } else
1369 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1370 1365
1371 __print_graph_headers_flags(s, flags); 1366 __print_graph_headers_flags(s, flags);
1372} 1367}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index c77424be284..667aa8cc0cf 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -226,7 +226,9 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
226} 226}
227 227
228#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ 228#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
229 TRACE_GRAPH_PRINT_PROC) 229 TRACE_GRAPH_PRINT_PROC | \
230 TRACE_GRAPH_PRINT_ABS_TIME | \
231 TRACE_GRAPH_PRINT_DURATION)
230 232
231static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) 233static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
232{ 234{
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 27d13b36b8b..00d527c945a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -343,6 +343,14 @@ DEFINE_BASIC_FETCH_FUNCS(deref)
343DEFINE_FETCH_deref(string) 343DEFINE_FETCH_deref(string)
344DEFINE_FETCH_deref(string_size) 344DEFINE_FETCH_deref(string_size)
345 345
346static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
347{
348 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
349 update_deref_fetch_param(data->orig.data);
350 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
351 update_symbol_cache(data->orig.data);
352}
353
346static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) 354static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
347{ 355{
348 if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) 356 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
@@ -377,6 +385,19 @@ DEFINE_BASIC_FETCH_FUNCS(bitfield)
377#define fetch_bitfield_string_size NULL 385#define fetch_bitfield_string_size NULL
378 386
379static __kprobes void 387static __kprobes void
388update_bitfield_fetch_param(struct bitfield_fetch_param *data)
389{
390 /*
391 * Don't check the bitfield itself, because this must be the
392 * last fetch function.
393 */
394 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
395 update_deref_fetch_param(data->orig.data);
396 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
397 update_symbol_cache(data->orig.data);
398}
399
400static __kprobes void
380free_bitfield_fetch_param(struct bitfield_fetch_param *data) 401free_bitfield_fetch_param(struct bitfield_fetch_param *data)
381{ 402{
382 /* 403 /*
@@ -389,6 +410,7 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
389 free_symbol_cache(data->orig.data); 410 free_symbol_cache(data->orig.data);
390 kfree(data); 411 kfree(data);
391} 412}
413
392/* Default (unsigned long) fetch type */ 414/* Default (unsigned long) fetch type */
393#define __DEFAULT_FETCH_TYPE(t) u##t 415#define __DEFAULT_FETCH_TYPE(t) u##t
394#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 416#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
@@ -536,6 +558,7 @@ struct probe_arg {
536/* Flags for trace_probe */ 558/* Flags for trace_probe */
537#define TP_FLAG_TRACE 1 559#define TP_FLAG_TRACE 1
538#define TP_FLAG_PROFILE 2 560#define TP_FLAG_PROFILE 2
561#define TP_FLAG_REGISTERED 4
539 562
540struct trace_probe { 563struct trace_probe {
541 struct list_head list; 564 struct list_head list;
@@ -555,16 +578,49 @@ struct trace_probe {
555 (sizeof(struct probe_arg) * (n))) 578 (sizeof(struct probe_arg) * (n)))
556 579
557 580
558static __kprobes int probe_is_return(struct trace_probe *tp) 581static __kprobes int trace_probe_is_return(struct trace_probe *tp)
559{ 582{
560 return tp->rp.handler != NULL; 583 return tp->rp.handler != NULL;
561} 584}
562 585
563static __kprobes const char *probe_symbol(struct trace_probe *tp) 586static __kprobes const char *trace_probe_symbol(struct trace_probe *tp)
564{ 587{
565 return tp->symbol ? tp->symbol : "unknown"; 588 return tp->symbol ? tp->symbol : "unknown";
566} 589}
567 590
591static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp)
592{
593 return tp->rp.kp.offset;
594}
595
596static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp)
597{
598 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
599}
600
601static __kprobes bool trace_probe_is_registered(struct trace_probe *tp)
602{
603 return !!(tp->flags & TP_FLAG_REGISTERED);
604}
605
606static __kprobes bool trace_probe_has_gone(struct trace_probe *tp)
607{
608 return !!(kprobe_gone(&tp->rp.kp));
609}
610
611static __kprobes bool trace_probe_within_module(struct trace_probe *tp,
612 struct module *mod)
613{
614 int len = strlen(mod->name);
615 const char *name = trace_probe_symbol(tp);
616 return strncmp(mod->name, name, len) == 0 && name[len] == ':';
617}
618
619static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp)
620{
621 return !!strchr(trace_probe_symbol(tp), ':');
622}
623
568static int register_probe_event(struct trace_probe *tp); 624static int register_probe_event(struct trace_probe *tp);
569static void unregister_probe_event(struct trace_probe *tp); 625static void unregister_probe_event(struct trace_probe *tp);
570 626
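trace_probe_within_module() and trace_probe_is_on_module() above decide whether a probe's symbol has the "module:symbol" form and whether it belongs to a given module. The string test is simple enough to show standalone; symbol_is_on_module() and symbol_within_module() below are hypothetical names for the same comparisons.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* True if the probed symbol names any module at all ("mod:sym"). */
static bool symbol_is_on_module(const char *symbol)
{
    return strchr(symbol, ':') != NULL;
}

/* True if the probed symbol belongs to the module named mod. */
static bool symbol_within_module(const char *symbol, const char *mod)
{
    size_t len = strlen(mod);

    return strncmp(symbol, mod, len) == 0 && symbol[len] == ':';
}

int main(void)
{
    const char *sym = "ext4:ext4_sync_file";

    printf("on a module: %d\n", symbol_is_on_module(sym));          /* 1 */
    printf("within ext4: %d\n", symbol_within_module(sym, "ext4")); /* 1 */
    printf("within ext3: %d\n", symbol_within_module(sym, "ext3")); /* 0 */
    return 0;
}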
@@ -646,6 +702,16 @@ error:
646 return ERR_PTR(ret); 702 return ERR_PTR(ret);
647} 703}
648 704
705static void update_probe_arg(struct probe_arg *arg)
706{
707 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
708 update_bitfield_fetch_param(arg->fetch.data);
709 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
710 update_deref_fetch_param(arg->fetch.data);
711 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
712 update_symbol_cache(arg->fetch.data);
713}
714
649static void free_probe_arg(struct probe_arg *arg) 715static void free_probe_arg(struct probe_arg *arg)
650{ 716{
651 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) 717 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
@@ -671,7 +737,7 @@ static void free_trace_probe(struct trace_probe *tp)
671 kfree(tp); 737 kfree(tp);
672} 738}
673 739
674static struct trace_probe *find_probe_event(const char *event, 740static struct trace_probe *find_trace_probe(const char *event,
675 const char *group) 741 const char *group)
676{ 742{
677 struct trace_probe *tp; 743 struct trace_probe *tp;
@@ -683,15 +749,104 @@ static struct trace_probe *find_probe_event(const char *event,
683 return NULL; 749 return NULL;
684} 750}
685 751
686/* Unregister a trace_probe and probe_event: call with locking probe_lock */ 752/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
687static void unregister_trace_probe(struct trace_probe *tp) 753static int enable_trace_probe(struct trace_probe *tp, int flag)
688{ 754{
689 if (probe_is_return(tp)) 755 int ret = 0;
690 unregister_kretprobe(&tp->rp); 756
757 tp->flags |= flag;
758 if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) &&
759 !trace_probe_has_gone(tp)) {
760 if (trace_probe_is_return(tp))
761 ret = enable_kretprobe(&tp->rp);
762 else
763 ret = enable_kprobe(&tp->rp.kp);
764 }
765
766 return ret;
767}
768
769/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
770static void disable_trace_probe(struct trace_probe *tp, int flag)
771{
772 tp->flags &= ~flag;
773 if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) {
774 if (trace_probe_is_return(tp))
775 disable_kretprobe(&tp->rp);
776 else
777 disable_kprobe(&tp->rp.kp);
778 }
779}
780
781/* Internal register function - just handle k*probes and flags */
782static int __register_trace_probe(struct trace_probe *tp)
783{
784 int i, ret;
785
786 if (trace_probe_is_registered(tp))
787 return -EINVAL;
788
789 for (i = 0; i < tp->nr_args; i++)
790 update_probe_arg(&tp->args[i]);
791
792 /* Set/clear disabled flag according to tp->flag */
793 if (trace_probe_is_enabled(tp))
794 tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED;
795 else
796 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
797
798 if (trace_probe_is_return(tp))
799 ret = register_kretprobe(&tp->rp);
691 else 800 else
692 unregister_kprobe(&tp->rp.kp); 801 ret = register_kprobe(&tp->rp.kp);
802
803 if (ret == 0)
804 tp->flags |= TP_FLAG_REGISTERED;
805 else {
806 pr_warning("Could not insert probe at %s+%lu: %d\n",
807 trace_probe_symbol(tp), trace_probe_offset(tp), ret);
808 if (ret == -ENOENT && trace_probe_is_on_module(tp)) {
809 pr_warning("This probe might be able to register after"
810 "target module is loaded. Continue.\n");
811 ret = 0;
812 } else if (ret == -EILSEQ) {
813 pr_warning("Probing address(0x%p) is not an "
814 "instruction boundary.\n",
815 tp->rp.kp.addr);
816 ret = -EINVAL;
817 }
818 }
819
820 return ret;
821}
822
823/* Internal unregister function - just handle k*probes and flags */
824static void __unregister_trace_probe(struct trace_probe *tp)
825{
826 if (trace_probe_is_registered(tp)) {
827 if (trace_probe_is_return(tp))
828 unregister_kretprobe(&tp->rp);
829 else
830 unregister_kprobe(&tp->rp.kp);
831 tp->flags &= ~TP_FLAG_REGISTERED;
832 /* Cleanup kprobe for reuse */
833 if (tp->rp.kp.symbol_name)
834 tp->rp.kp.addr = NULL;
835 }
836}
837
838/* Unregister a trace_probe and probe_event: call with locking probe_lock */
839static int unregister_trace_probe(struct trace_probe *tp)
840{
841 /* Enabled event can not be unregistered */
842 if (trace_probe_is_enabled(tp))
843 return -EBUSY;
844
845 __unregister_trace_probe(tp);
693 list_del(&tp->list); 846 list_del(&tp->list);
694 unregister_probe_event(tp); 847 unregister_probe_event(tp);
848
849 return 0;
695} 850}
696 851
697/* Register a trace_probe and probe_event */ 852/* Register a trace_probe and probe_event */
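enable_trace_probe() and disable_trace_probe() above replace the four per-consumer enable/disable functions deleted later in this patch: TP_FLAG_TRACE and TP_FLAG_PROFILE record which consumers want the probe, TP_FLAG_REGISTERED records whether the k*probe is actually inserted, and the probe is only armed while a registered probe has at least one consumer. Here is a small user-space model of that flag logic; probe_arm()/probe_disarm() are invented stand-ins for enable_kprobe()/disable_kprobe(), and the "gone" check is omitted.

#include <stdbool.h>
#include <stdio.h>

#define TP_FLAG_TRACE      0x1  /* ftrace event consumer */
#define TP_FLAG_PROFILE    0x2  /* perf consumer */
#define TP_FLAG_REGISTERED 0x4  /* k*probe actually inserted */

struct probe {
    const char *name;
    unsigned int flags;
};

static bool probe_is_enabled(const struct probe *p)
{
    return p->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
}

/* Stand-ins for arming and disarming the underlying k*probe. */
static void probe_arm(struct probe *p)    { printf("%s: armed\n", p->name); }
static void probe_disarm(struct probe *p) { printf("%s: disarmed\n", p->name); }

static void probe_enable(struct probe *p, unsigned int flag)
{
    p->flags |= flag;
    if (probe_is_enabled(p) && (p->flags & TP_FLAG_REGISTERED))
        probe_arm(p);   /* arming twice is harmless in this sketch */
}

static void probe_disable(struct probe *p, unsigned int flag)
{
    p->flags &= ~flag;
    /* Disarm only once no consumer (trace or perf) is left. */
    if (!probe_is_enabled(p) && (p->flags & TP_FLAG_REGISTERED))
        probe_disarm(p);
}

int main(void)
{
    struct probe p = { .name = "testprobe", .flags = TP_FLAG_REGISTERED };

    probe_enable(&p, TP_FLAG_TRACE);    /* armed */
    probe_enable(&p, TP_FLAG_PROFILE);  /* still armed */
    probe_disable(&p, TP_FLAG_TRACE);   /* still held by perf */
    probe_disable(&p, TP_FLAG_PROFILE); /* disarmed */
    return 0;
}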
@@ -702,41 +857,68 @@ static int register_trace_probe(struct trace_probe *tp)
702 857
703 mutex_lock(&probe_lock); 858 mutex_lock(&probe_lock);
704 859
705 /* register as an event */ 860 /* Delete old (same name) event if it exists */
706 old_tp = find_probe_event(tp->call.name, tp->call.class->system); 861 old_tp = find_trace_probe(tp->call.name, tp->call.class->system);
707 if (old_tp) { 862 if (old_tp) {
708 /* delete old event */ 863 ret = unregister_trace_probe(old_tp);
709 unregister_trace_probe(old_tp); 864 if (ret < 0)
865 goto end;
710 free_trace_probe(old_tp); 866 free_trace_probe(old_tp);
711 } 867 }
868
869 /* Register new event */
712 ret = register_probe_event(tp); 870 ret = register_probe_event(tp);
713 if (ret) { 871 if (ret) {
714 pr_warning("Failed to register probe event(%d)\n", ret); 872 pr_warning("Failed to register probe event(%d)\n", ret);
715 goto end; 873 goto end;
716 } 874 }
717 875
718 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; 876 /* Register k*probe */
719 if (probe_is_return(tp)) 877 ret = __register_trace_probe(tp);
720 ret = register_kretprobe(&tp->rp); 878 if (ret < 0)
721 else
722 ret = register_kprobe(&tp->rp.kp);
723
724 if (ret) {
725 pr_warning("Could not insert probe(%d)\n", ret);
726 if (ret == -EILSEQ) {
727 pr_warning("Probing address(0x%p) is not an "
728 "instruction boundary.\n",
729 tp->rp.kp.addr);
730 ret = -EINVAL;
731 }
732 unregister_probe_event(tp); 879 unregister_probe_event(tp);
733 } else 880 else
734 list_add_tail(&tp->list, &probe_list); 881 list_add_tail(&tp->list, &probe_list);
882
735end: 883end:
736 mutex_unlock(&probe_lock); 884 mutex_unlock(&probe_lock);
737 return ret; 885 return ret;
738} 886}
739 887
888/* Module notifier call back, checking event on the module */
889static int trace_probe_module_callback(struct notifier_block *nb,
890 unsigned long val, void *data)
891{
892 struct module *mod = data;
893 struct trace_probe *tp;
894 int ret;
895
896 if (val != MODULE_STATE_COMING)
897 return NOTIFY_DONE;
898
899 /* Update probes on coming module */
900 mutex_lock(&probe_lock);
901 list_for_each_entry(tp, &probe_list, list) {
902 if (trace_probe_within_module(tp, mod)) {
903 /* Don't need to check busy - this should have gone. */
904 __unregister_trace_probe(tp);
905 ret = __register_trace_probe(tp);
906 if (ret)
907 pr_warning("Failed to re-register probe %s on"
908 "%s: %d\n",
909 tp->call.name, mod->name, ret);
910 }
911 }
912 mutex_unlock(&probe_lock);
913
914 return NOTIFY_DONE;
915}
916
917static struct notifier_block trace_probe_module_nb = {
918 .notifier_call = trace_probe_module_callback,
919 .priority = 1 /* Invoked after kprobe module callback */
920};
921
740/* Split symbol and offset. */ 922/* Split symbol and offset. */
741static int split_symbol_offset(char *symbol, unsigned long *offset) 923static int split_symbol_offset(char *symbol, unsigned long *offset)
742{ 924{
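The module notifier registered above lets a probe created with a "MOD:symbol" target before MOD is loaded get re-registered when the module actually arrives (MODULE_STATE_COMING). The toy notifier chain below only illustrates the shape of that mechanism — callbacks on a list, invoked with an event code and a payload; every identifier in it is invented, and it has none of the locking or priority handling of the real notifier API.

#include <stdio.h>

enum { MODULE_COMING = 1 };     /* stand-in for MODULE_STATE_COMING */

struct notifier {
    int (*call)(struct notifier *nb, unsigned long event, void *data);
    struct notifier *next;
};

static struct notifier *chain;

static void notifier_register(struct notifier *nb)
{
    nb->next = chain;
    chain = nb;
}

static void notifier_call_chain(unsigned long event, void *data)
{
    for (struct notifier *nb = chain; nb; nb = nb->next)
        nb->call(nb, event, data);
}

/* Probe-side callback: re-register anything that waited for this module. */
static int probe_module_callback(struct notifier *nb, unsigned long event,
                                 void *data)
{
    const char *mod = data;

    (void)nb;
    if (event != MODULE_COMING)
        return 0;
    printf("module %s loaded, re-registering matching probes\n", mod);
    return 0;
}

int main(void)
{
    struct notifier nb = { .call = probe_module_callback };

    notifier_register(&nb);
    notifier_call_chain(MODULE_COMING, "ext4");
    return 0;
}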
@@ -962,8 +1144,8 @@ static int create_trace_probe(int argc, char **argv)
962{ 1144{
963 /* 1145 /*
964 * Argument syntax: 1146 * Argument syntax:
965 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] 1147 * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS]
966 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] 1148 * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS]
967 * Fetch args: 1149 * Fetch args:
968 * $retval : fetch return value 1150 * $retval : fetch return value
969 * $stack : fetch stack address 1151 * $stack : fetch stack address
@@ -1025,17 +1207,18 @@ static int create_trace_probe(int argc, char **argv)
1025 return -EINVAL; 1207 return -EINVAL;
1026 } 1208 }
1027 mutex_lock(&probe_lock); 1209 mutex_lock(&probe_lock);
1028 tp = find_probe_event(event, group); 1210 tp = find_trace_probe(event, group);
1029 if (!tp) { 1211 if (!tp) {
1030 mutex_unlock(&probe_lock); 1212 mutex_unlock(&probe_lock);
1031 pr_info("Event %s/%s doesn't exist.\n", group, event); 1213 pr_info("Event %s/%s doesn't exist.\n", group, event);
1032 return -ENOENT; 1214 return -ENOENT;
1033 } 1215 }
1034 /* delete an event */ 1216 /* delete an event */
1035 unregister_trace_probe(tp); 1217 ret = unregister_trace_probe(tp);
1036 free_trace_probe(tp); 1218 if (ret == 0)
1219 free_trace_probe(tp);
1037 mutex_unlock(&probe_lock); 1220 mutex_unlock(&probe_lock);
1038 return 0; 1221 return ret;
1039 } 1222 }
1040 1223
1041 if (argc < 2) { 1224 if (argc < 2) {
@@ -1144,20 +1327,30 @@ error:
1144 return ret; 1327 return ret;
1145} 1328}
1146 1329
1147static void cleanup_all_probes(void) 1330static int release_all_trace_probes(void)
1148{ 1331{
1149 struct trace_probe *tp; 1332 struct trace_probe *tp;
1333 int ret = 0;
1150 1334
1151 mutex_lock(&probe_lock); 1335 mutex_lock(&probe_lock);
1336 /* Ensure no probe is in use. */
1337 list_for_each_entry(tp, &probe_list, list)
1338 if (trace_probe_is_enabled(tp)) {
1339 ret = -EBUSY;
1340 goto end;
1341 }
1152 /* TODO: Use batch unregistration */ 1342 /* TODO: Use batch unregistration */
1153 while (!list_empty(&probe_list)) { 1343 while (!list_empty(&probe_list)) {
1154 tp = list_entry(probe_list.next, struct trace_probe, list); 1344 tp = list_entry(probe_list.next, struct trace_probe, list);
1155 unregister_trace_probe(tp); 1345 unregister_trace_probe(tp);
1156 free_trace_probe(tp); 1346 free_trace_probe(tp);
1157 } 1347 }
1348
1349end:
1158 mutex_unlock(&probe_lock); 1350 mutex_unlock(&probe_lock);
1159}
1160 1351
1352 return ret;
1353}
1161 1354
1162/* Probes listing interfaces */ 1355/* Probes listing interfaces */
1163static void *probes_seq_start(struct seq_file *m, loff_t *pos) 1356static void *probes_seq_start(struct seq_file *m, loff_t *pos)
@@ -1181,15 +1374,16 @@ static int probes_seq_show(struct seq_file *m, void *v)
1181 struct trace_probe *tp = v; 1374 struct trace_probe *tp = v;
1182 int i; 1375 int i;
1183 1376
1184 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 1377 seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p');
1185 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); 1378 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
1186 1379
1187 if (!tp->symbol) 1380 if (!tp->symbol)
1188 seq_printf(m, " 0x%p", tp->rp.kp.addr); 1381 seq_printf(m, " 0x%p", tp->rp.kp.addr);
1189 else if (tp->rp.kp.offset) 1382 else if (tp->rp.kp.offset)
1190 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); 1383 seq_printf(m, " %s+%u", trace_probe_symbol(tp),
1384 tp->rp.kp.offset);
1191 else 1385 else
1192 seq_printf(m, " %s", probe_symbol(tp)); 1386 seq_printf(m, " %s", trace_probe_symbol(tp));
1193 1387
1194 for (i = 0; i < tp->nr_args; i++) 1388 for (i = 0; i < tp->nr_args; i++)
1195 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); 1389 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
@@ -1207,9 +1401,13 @@ static const struct seq_operations probes_seq_op = {
1207 1401
1208static int probes_open(struct inode *inode, struct file *file) 1402static int probes_open(struct inode *inode, struct file *file)
1209{ 1403{
1210 if ((file->f_mode & FMODE_WRITE) && 1404 int ret;
1211 (file->f_flags & O_TRUNC)) 1405
1212 cleanup_all_probes(); 1406 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
1407 ret = release_all_trace_probes();
1408 if (ret < 0)
1409 return ret;
1410 }
1213 1411
1214 return seq_open(file, &probes_seq_op); 1412 return seq_open(file, &probes_seq_op);
1215} 1413}
@@ -1397,7 +1595,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1397 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1595 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1398 1596
1399 if (!filter_current_check_discard(buffer, call, entry, event)) 1597 if (!filter_current_check_discard(buffer, call, entry, event))
1400 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1598 trace_nowake_buffer_unlock_commit_regs(buffer, event,
1599 irq_flags, pc, regs);
1401} 1600}
1402 1601
1403/* Kretprobe handler */ 1602/* Kretprobe handler */
@@ -1429,7 +1628,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1429 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1628 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1430 1629
1431 if (!filter_current_check_discard(buffer, call, entry, event)) 1630 if (!filter_current_check_discard(buffer, call, entry, event))
1432 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1631 trace_nowake_buffer_unlock_commit_regs(buffer, event,
1632 irq_flags, pc, regs);
1433} 1633}
1434 1634
1435/* Event entry printers */ 1635/* Event entry printers */
@@ -1511,30 +1711,6 @@ partial:
1511 return TRACE_TYPE_PARTIAL_LINE; 1711 return TRACE_TYPE_PARTIAL_LINE;
1512} 1712}
1513 1713
1514static int probe_event_enable(struct ftrace_event_call *call)
1515{
1516 struct trace_probe *tp = (struct trace_probe *)call->data;
1517
1518 tp->flags |= TP_FLAG_TRACE;
1519 if (probe_is_return(tp))
1520 return enable_kretprobe(&tp->rp);
1521 else
1522 return enable_kprobe(&tp->rp.kp);
1523}
1524
1525static void probe_event_disable(struct ftrace_event_call *call)
1526{
1527 struct trace_probe *tp = (struct trace_probe *)call->data;
1528
1529 tp->flags &= ~TP_FLAG_TRACE;
1530 if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
1531 if (probe_is_return(tp))
1532 disable_kretprobe(&tp->rp);
1533 else
1534 disable_kprobe(&tp->rp.kp);
1535 }
1536}
1537
1538#undef DEFINE_FIELD 1714#undef DEFINE_FIELD
1539#define DEFINE_FIELD(type, item, name, is_signed) \ 1715#define DEFINE_FIELD(type, item, name, is_signed) \
1540 do { \ 1716 do { \
@@ -1596,7 +1772,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1596 1772
1597 const char *fmt, *arg; 1773 const char *fmt, *arg;
1598 1774
1599 if (!probe_is_return(tp)) { 1775 if (!trace_probe_is_return(tp)) {
1600 fmt = "(%lx)"; 1776 fmt = "(%lx)";
1601 arg = "REC->" FIELD_STRING_IP; 1777 arg = "REC->" FIELD_STRING_IP;
1602 } else { 1778 } else {
@@ -1713,49 +1889,25 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1713 head = this_cpu_ptr(call->perf_events); 1889 head = this_cpu_ptr(call->perf_events);
1714 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1890 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
1715} 1891}
1716
1717static int probe_perf_enable(struct ftrace_event_call *call)
1718{
1719 struct trace_probe *tp = (struct trace_probe *)call->data;
1720
1721 tp->flags |= TP_FLAG_PROFILE;
1722
1723 if (probe_is_return(tp))
1724 return enable_kretprobe(&tp->rp);
1725 else
1726 return enable_kprobe(&tp->rp.kp);
1727}
1728
1729static void probe_perf_disable(struct ftrace_event_call *call)
1730{
1731 struct trace_probe *tp = (struct trace_probe *)call->data;
1732
1733 tp->flags &= ~TP_FLAG_PROFILE;
1734
1735 if (!(tp->flags & TP_FLAG_TRACE)) {
1736 if (probe_is_return(tp))
1737 disable_kretprobe(&tp->rp);
1738 else
1739 disable_kprobe(&tp->rp.kp);
1740 }
1741}
1742#endif /* CONFIG_PERF_EVENTS */ 1892#endif /* CONFIG_PERF_EVENTS */
1743 1893
1744static __kprobes 1894static __kprobes
1745int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) 1895int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1746{ 1896{
1897 struct trace_probe *tp = (struct trace_probe *)event->data;
1898
1747 switch (type) { 1899 switch (type) {
1748 case TRACE_REG_REGISTER: 1900 case TRACE_REG_REGISTER:
1749 return probe_event_enable(event); 1901 return enable_trace_probe(tp, TP_FLAG_TRACE);
1750 case TRACE_REG_UNREGISTER: 1902 case TRACE_REG_UNREGISTER:
1751 probe_event_disable(event); 1903 disable_trace_probe(tp, TP_FLAG_TRACE);
1752 return 0; 1904 return 0;
1753 1905
1754#ifdef CONFIG_PERF_EVENTS 1906#ifdef CONFIG_PERF_EVENTS
1755 case TRACE_REG_PERF_REGISTER: 1907 case TRACE_REG_PERF_REGISTER:
1756 return probe_perf_enable(event); 1908 return enable_trace_probe(tp, TP_FLAG_PROFILE);
1757 case TRACE_REG_PERF_UNREGISTER: 1909 case TRACE_REG_PERF_UNREGISTER:
1758 probe_perf_disable(event); 1910 disable_trace_probe(tp, TP_FLAG_PROFILE);
1759 return 0; 1911 return 0;
1760#endif 1912#endif
1761 } 1913 }
@@ -1805,7 +1957,7 @@ static int register_probe_event(struct trace_probe *tp)
1805 1957
1806 /* Initialize ftrace_event_call */ 1958 /* Initialize ftrace_event_call */
1807 INIT_LIST_HEAD(&call->class->fields); 1959 INIT_LIST_HEAD(&call->class->fields);
1808 if (probe_is_return(tp)) { 1960 if (trace_probe_is_return(tp)) {
1809 call->event.funcs = &kretprobe_funcs; 1961 call->event.funcs = &kretprobe_funcs;
1810 call->class->define_fields = kretprobe_event_define_fields; 1962 call->class->define_fields = kretprobe_event_define_fields;
1811 } else { 1963 } else {
@@ -1844,6 +1996,9 @@ static __init int init_kprobe_trace(void)
1844 struct dentry *d_tracer; 1996 struct dentry *d_tracer;
1845 struct dentry *entry; 1997 struct dentry *entry;
1846 1998
1999 if (register_module_notifier(&trace_probe_module_nb))
2000 return -EINVAL;
2001
1847 d_tracer = tracing_init_dentry(); 2002 d_tracer = tracing_init_dentry();
1848 if (!d_tracer) 2003 if (!d_tracer)
1849 return 0; 2004 return 0;
@@ -1897,12 +2052,12 @@ static __init int kprobe_trace_self_tests_init(void)
1897 warn++; 2052 warn++;
1898 } else { 2053 } else {
1899 /* Enable trace point */ 2054 /* Enable trace point */
1900 tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); 2055 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
1901 if (WARN_ON_ONCE(tp == NULL)) { 2056 if (WARN_ON_ONCE(tp == NULL)) {
1902 pr_warning("error on getting new probe.\n"); 2057 pr_warning("error on getting new probe.\n");
1903 warn++; 2058 warn++;
1904 } else 2059 } else
1905 probe_event_enable(&tp->call); 2060 enable_trace_probe(tp, TP_FLAG_TRACE);
1906 } 2061 }
1907 2062
1908 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 2063 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
@@ -1912,12 +2067,12 @@ static __init int kprobe_trace_self_tests_init(void)
1912 warn++; 2067 warn++;
1913 } else { 2068 } else {
1914 /* Enable trace point */ 2069 /* Enable trace point */
1915 tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); 2070 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
1916 if (WARN_ON_ONCE(tp == NULL)) { 2071 if (WARN_ON_ONCE(tp == NULL)) {
1917 pr_warning("error on getting new probe.\n"); 2072 pr_warning("error on getting new probe.\n");
1918 warn++; 2073 warn++;
1919 } else 2074 } else
1920 probe_event_enable(&tp->call); 2075 enable_trace_probe(tp, TP_FLAG_TRACE);
1921 } 2076 }
1922 2077
1923 if (warn) 2078 if (warn)
@@ -1925,6 +2080,21 @@ static __init int kprobe_trace_self_tests_init(void)
1925 2080
1926 ret = target(1, 2, 3, 4, 5, 6); 2081 ret = target(1, 2, 3, 4, 5, 6);
1927 2082
 2083 /* Disable trace points before removing them */
2084 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
2085 if (WARN_ON_ONCE(tp == NULL)) {
2086 pr_warning("error on getting test probe.\n");
2087 warn++;
2088 } else
2089 disable_trace_probe(tp, TP_FLAG_TRACE);
2090
2091 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
2092 if (WARN_ON_ONCE(tp == NULL)) {
2093 pr_warning("error on getting 2nd test probe.\n");
2094 warn++;
2095 } else
2096 disable_trace_probe(tp, TP_FLAG_TRACE);
2097
1928 ret = command_trace_probe("-:testprobe"); 2098 ret = command_trace_probe("-:testprobe");
1929 if (WARN_ON_ONCE(ret)) { 2099 if (WARN_ON_ONCE(ret)) {
1930 pr_warning("error on deleting a probe.\n"); 2100 pr_warning("error on deleting a probe.\n");
@@ -1938,7 +2108,7 @@ static __init int kprobe_trace_self_tests_init(void)
1938 } 2108 }
1939 2109
1940end: 2110end:
1941 cleanup_all_probes(); 2111 release_all_trace_probes();
1942 if (warn) 2112 if (warn)
1943 pr_cont("NG: Some tests are failed. Please check them.\n"); 2113 pr_cont("NG: Some tests are failed. Please check them.\n");
1944 else 2114 else
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 017fa376505..fd3c8aae55e 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -12,7 +12,7 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/time.h> 13#include <linux/time.h>
14 14
15#include <asm/atomic.h> 15#include <linux/atomic.h>
16 16
17#include "trace.h" 17#include "trace.h"
18#include "trace_output.h" 18#include "trace_output.h"
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e37de492a9e..51999309a6c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1107,19 +1107,20 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1107{ 1107{
1108 struct stack_entry *field; 1108 struct stack_entry *field;
1109 struct trace_seq *s = &iter->seq; 1109 struct trace_seq *s = &iter->seq;
1110 int i; 1110 unsigned long *p;
1111 unsigned long *end;
1111 1112
1112 trace_assign_type(field, iter->ent); 1113 trace_assign_type(field, iter->ent);
1114 end = (unsigned long *)((long)iter->ent + iter->ent_size);
1113 1115
1114 if (!trace_seq_puts(s, "<stack trace>\n")) 1116 if (!trace_seq_puts(s, "<stack trace>\n"))
1115 goto partial; 1117 goto partial;
1116 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 1118
1117 if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) 1119 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
1118 break;
1119 if (!trace_seq_puts(s, " => ")) 1120 if (!trace_seq_puts(s, " => "))
1120 goto partial; 1121 goto partial;
1121 1122
1122 if (!seq_print_ip_sym(s, field->caller[i], flags)) 1123 if (!seq_print_ip_sym(s, *p, flags))
1123 goto partial; 1124 goto partial;
1124 if (!trace_seq_puts(s, "\n")) 1125 if (!trace_seq_puts(s, "\n"))
1125 goto partial; 1126 goto partial;
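Since kernel_stack is now a __dynamic_array (see the trace_entries.h hunk earlier), trace_stack_print() can no longer iterate over a fixed FTRACE_STACK_ENTRIES; it walks caller[] until it hits the ULONG_MAX terminator or the end of the entry as computed from iter->ent_size. A user-space sketch of that bounded walk follows; the sentinel and the size bound follow the patch, while print_stack() and the sample frame values are made up, and the sketch checks the bound before dereferencing.

#include <limits.h>
#include <stdio.h>

/*
 * Walk a size-bounded array of return addresses, as trace_stack_print()
 * now does: stop at the end of the entry or at the ULONG_MAX terminator,
 * whichever comes first.
 */
static void print_stack(const unsigned long *caller, size_t ent_size)
{
    const unsigned long *p = caller;
    const unsigned long *end =
        (const unsigned long *)((const char *)caller + ent_size);

    puts("<stack trace>");
    for (; p < end && *p != ULONG_MAX; p++)
        printf(" => %#lx\n", *p);
}

int main(void)
{
    /* Three frames plus the terminator, as the ring buffer might store them. */
    unsigned long frames[] = {
        0x81000010UL, 0x81000020UL, 0x81000030UL, ULONG_MAX,
    };

    print_stack(frames, sizeof(frames));
    return 0;
}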
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index f029dd4fd2c..e4a70c0c71b 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -227,7 +227,9 @@ static void wakeup_trace_close(struct trace_iterator *iter)
227 graph_trace_close(iter); 227 graph_trace_close(iter);
228} 228}
229 229
230#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) 230#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \
231 TRACE_GRAPH_PRINT_ABS_TIME | \
232 TRACE_GRAPH_PRINT_DURATION)
231 233
232static enum print_line_t wakeup_print_line(struct trace_iterator *iter) 234static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
233{ 235{
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index b0b53b8e4c2..77575b386d9 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -156,20 +156,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
156{ 156{
157 long *ptr = filp->private_data; 157 long *ptr = filp->private_data;
158 unsigned long val, flags; 158 unsigned long val, flags;
159 char buf[64];
160 int ret; 159 int ret;
161 int cpu; 160 int cpu;
162 161
163 if (count >= sizeof(buf)) 162 ret = kstrtoul_from_user(ubuf, count, 10, &val);
164 return -EINVAL; 163 if (ret)
165
166 if (copy_from_user(&buf, ubuf, count))
167 return -EFAULT;
168
169 buf[count] = 0;
170
171 ret = strict_strtoul(buf, 10, &val);
172 if (ret < 0)
173 return ret; 164 return ret;
174 165
175 local_irq_save(flags); 166 local_irq_save(flags);
diff --git a/kernel/trace/tracedump.c b/kernel/trace/tracedump.c
new file mode 100644
index 00000000000..a83532bc36d
--- /dev/null
+++ b/kernel/trace/tracedump.c
@@ -0,0 +1,682 @@
1/*
2 * kernel/trace/tracedump.c
3 *
4 * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 */
20
21#include <linux/console.h>
22#include <linux/cpumask.h>
23#include <linux/init.h>
24#include <linux/irqflags.h>
25#include <linux/module.h>
26#include <linux/moduleparam.h>
27#include <linux/mutex.h>
28#include <linux/notifier.h>
29#include <linux/proc_fs.h>
30#include <linux/ring_buffer.h>
31#include <linux/sched.h>
32#include <linux/smp.h>
33#include <linux/string.h>
34#include <linux/threads.h>
35#include <linux/tracedump.h>
36#include <linux/uaccess.h>
37#include <linux/vmalloc.h>
38#include <linux/zlib.h>
39
40#include "trace.h"
41#include "trace_output.h"
42
43#define CPU_MAX (NR_CPUS-1)
44
45#define TRYM(fn, ...) do { \
46 int try_error = (fn); \
47 if (try_error < 0) { \
48 printk(__VA_ARGS__); \
49 return try_error; \
50 } \
51} while (0)
52
53#define TRY(fn) TRYM(fn, TAG "Caught error from %s in %s\n", #fn, __func__)
54
55/* Stolen from printk.c */
56#define for_each_console(con) \
57 for (con = console_drivers; con != NULL; con = con->next)
58
59#define TAG KERN_ERR "tracedump: "
60
61#define TD_MIN_CONSUME 2000
62#define TD_COMPRESS_CHUNK 0x8000
63
64static DEFINE_MUTEX(tracedump_proc_lock);
65
66static const char MAGIC_NUMBER[9] = "TRACEDUMP";
67static const char CPU_DELIM[7] = "CPU_END";
68#define CMDLINE_DELIM "|"
69
70/* Type of output */
71static bool current_format;
72static bool format_ascii;
73module_param(format_ascii, bool, S_IRUGO | S_IWUSR);
74MODULE_PARM_DESC(format_ascii, "Dump ascii or raw data");
75
76/* Max size of output */
77static uint panic_size = 0x80000;
78module_param(panic_size, uint, S_IRUGO | S_IWUSR);
79MODULE_PARM_DESC(panic_size, "Max dump size during kernel panic (bytes)");
80
81static uint compress_level = 9;
82module_param(compress_level, uint, S_IRUGO | S_IWUSR);
83MODULE_PARM_DESC(compress_level, "Level of compression to use. [0-9]");
84
85static char out_buf[TD_COMPRESS_CHUNK];
86static z_stream stream;
87static int compress_done;
88static int flush;
89
90static int old_trace_flags;
91
92static struct trace_iterator iter;
93static struct pager_s {
94 struct trace_array *tr;
95 void *spare;
96 int cpu;
97 int len;
98 char __user *ubuf;
99} pager;
100
101static char cmdline_buf[16+TASK_COMM_LEN];
102
103static int print_to_console(const char *buf, size_t len)
104{
105 struct console *con;
106
107 /* Stolen from printk.c */
108 for_each_console(con) {
109 if ((con->flags & CON_ENABLED) && con->write &&
110 (cpu_online(smp_processor_id()) ||
111 (con->flags & CON_ANYTIME)))
112 con->write(con, buf, len);
113 }
114 return 0;
115}
116
117static int print_to_user(const char *buf, size_t len)
118{
119 int size;
120 size = copy_to_user(pager.ubuf, buf, len);
121 if (size > 0) {
122 printk(TAG "Failed to copy to user %d bytes\n", size);
123 return -EINVAL;
124 }
125 return 0;
126}
127
128static int print(const char *buf, size_t len, int print_to)
129{
130 if (print_to == TD_PRINT_CONSOLE)
131 TRY(print_to_console(buf, len));
132 else if (print_to == TD_PRINT_USER)
133 TRY(print_to_user(buf, len));
134 return 0;
135}
136
137/* print_magic will print MAGIC_NUMBER using the
138 * print function selected by print_to.
139 */
140static inline ssize_t print_magic(int print_to)
141{
142 print(MAGIC_NUMBER, sizeof(MAGIC_NUMBER), print_to);
143 return sizeof(MAGIC_NUMBER);
144}
145
146static int iter_init(void)
147{
148 int cpu;
149
150 /* Make iter point to global ring buffer used in trace. */
151 trace_init_global_iter(&iter);
152
153 /* Disable tracing */
154 for_each_tracing_cpu(cpu) {
155 atomic_inc(&iter.tr->data[cpu]->disabled);
156 }
157
158 /* Save flags */
159 old_trace_flags = trace_flags;
160
161	/* Don't look at memory in panic mode. */
162 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
163
164 /* Prepare ring buffer iter */
165 for_each_tracing_cpu(cpu) {
166 iter.buffer_iter[cpu] =
167 ring_buffer_read_prepare(iter.tr->buffer, cpu);
168 }
169 ring_buffer_read_prepare_sync();
170 for_each_tracing_cpu(cpu) {
171 ring_buffer_read_start(iter.buffer_iter[cpu]);
172 tracing_iter_reset(&iter, cpu);
173 }
174 return 0;
175}
176
177/* iter_next gets the next entry in the ring buffer, ordered by time.
178 * If there are no more entries, returns 0.
179 */
180static ssize_t iter_next(void)
181{
182 /* Zero out the iterator's seq */
183 memset(&iter.seq, 0,
184 sizeof(struct trace_iterator) -
185 offsetof(struct trace_iterator, seq));
186
187 while (!trace_empty(&iter)) {
188 if (trace_find_next_entry_inc(&iter) == NULL) {
189 printk(TAG "trace_find_next_entry failed!\n");
190 return -EINVAL;
191 }
192
193 /* Copy the ring buffer data to iterator's seq */
194 print_trace_line(&iter);
195 if (iter.seq.len != 0)
196 return iter.seq.len;
197 }
198 return 0;
199}
200
201static int iter_deinit(void)
202{
203 int cpu;
204	/* Free the ring buffer iterators and re-enable tracing */
205 for_each_tracing_cpu(cpu) {
206 ring_buffer_read_finish(iter.buffer_iter[cpu]);
207 }
208 for_each_tracing_cpu(cpu) {
209 atomic_dec(&iter.tr->data[cpu]->disabled);
210 }
211
212 /* Restore flags */
213 trace_flags = old_trace_flags;
214 return 0;
215}
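Taken together, iter_init(), iter_next() and iter_deinit() are meant to be used as an init / next-loop / deinit sequence; a minimal consumer might look like the following sketch (example_walk_entries is illustrative only):

static ssize_t example_walk_entries(void)
{
	ssize_t len;

	TRY(iter_init());
	/* iter_next() formats one entry at a time into iter.seq, ordered
	 * by timestamp, and returns 0 once the ring buffer is exhausted. */
	while ((len = iter_next()) > 0)
		TRY(print_to_console(iter.seq.buffer, len));
	TRY(iter_deinit());
	return len;	/* 0 when done, negative on error */
}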
216
217static int pager_init(void)
218{
219 int cpu;
220
221 /* Need to do this to get a pointer to global_trace (iter.tr).
222	 * Lame, I know. */
223 trace_init_global_iter(&iter);
224
225 /* Turn off tracing */
226 for_each_tracing_cpu(cpu) {
227 atomic_inc(&iter.tr->data[cpu]->disabled);
228 }
229
230 memset(&pager, 0, sizeof(pager));
231 pager.tr = iter.tr;
232 pager.len = TD_COMPRESS_CHUNK;
233
234 return 0;
235}
236
237/* pager_next_cpu moves the pager to the next cpu.
238 * Returns 0 if pager is done, else 1.
239 */
240static ssize_t pager_next_cpu(void)
241{
242 if (pager.cpu <= CPU_MAX) {
243 pager.cpu += 1;
244 return 1;
245 }
246
247 return 0;
248}
249
250/* pager_next gets the next page of data from the ring buffer
251 * of the current cpu. Returns page size or 0 if no more data.
252 */
253static ssize_t pager_next(void)
254{
255 int ret;
256
257 if (pager.cpu > CPU_MAX)
258 return 0;
259
260 if (!pager.spare)
261 pager.spare = ring_buffer_alloc_read_page(pager.tr->buffer, pager.cpu);
262 if (!pager.spare) {
263 printk(TAG "ring_buffer_alloc_read_page failed!");
264 return -ENOMEM;
265 }
266
267 ret = ring_buffer_read_page(pager.tr->buffer,
268 &pager.spare,
269 pager.len,
270 pager.cpu, 0);
271 if (ret < 0)
272 return 0;
273
274 return PAGE_SIZE;
275}
276
277static int pager_deinit(void)
278{
279 int cpu;
280 if (pager.spare != NULL)
281 ring_buffer_free_read_page(pager.tr->buffer, pager.spare);
282
283 for_each_tracing_cpu(cpu) {
284 atomic_dec(&iter.tr->data[cpu]->disabled);
285 }
286 return 0;
287}
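The raw (non-ASCII) path follows the same shape with the pager helpers; a sketch of that flow, again illustrative only:

static ssize_t example_walk_raw_pages(void)
{
	ssize_t size = 0;

	TRY(pager_init());
	do {
		/* Drain whole pages from the current CPU's ring buffer... */
		while ((size = pager_next()) > 0)
			TRY(print_to_console(pager.spare, size));
		/* ...then move on until every CPU has been consumed. */
	} while (size >= 0 && pager_next_cpu());
	TRY(pager_deinit());
	return size;
}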
288
289/* cmdline_next gets the next saved cmdline from the trace and
290 * puts it in cmdline_buf. Returns the size of the cmdline, or 0 once all
291 * cmdlines have been returned; it resets itself on a subsequent call.
292 */
293static ssize_t cmdline_next(void)
294{
295 static int pid;
296 ssize_t size = 0;
297
298 if (pid >= PID_MAX_DEFAULT)
299 pid = -1;
300
301 while (size == 0 && pid < PID_MAX_DEFAULT) {
302 pid++;
303 trace_find_cmdline(pid, cmdline_buf);
304 if (!strncmp(cmdline_buf, "<...>", 5))
305 continue;
306
307 sprintf(&cmdline_buf[strlen(cmdline_buf)], " %d"
308 CMDLINE_DELIM, pid);
309 size = strlen(cmdline_buf);
310 }
311 return size;
312}
313
314/* consume_events removes the first 'num' entries from the ring buffer. */
315static int consume_events(size_t num)
316{
317 TRY(iter_init());
318 for (; num > 0 && !trace_empty(&iter); num--) {
319 trace_find_next_entry_inc(&iter);
320 ring_buffer_consume(iter.tr->buffer, iter.cpu, &iter.ts,
321 &iter.lost_events);
322 }
323 TRY(iter_deinit());
324 return 0;
325}
326
327static int data_init(void)
328{
329 if (current_format)
330 TRY(iter_init());
331 else
332 TRY(pager_init());
333 return 0;
334}
335
336/* data_next will figure out the right 'next' function to
337 * call and will select the right buffer to pass back
338 * to compress_next.
339 *
340 * iter_next should be used to get data entry-by-entry, ordered
341 * by time, which is what we need in order to convert it to ascii.
342 *
343 * pager_next will return a full page of raw data at a time, one
344 * CPU at a time. pager_next_cpu must be called to get the next CPU.
345 * cmdline_next will get the next saved cmdline
346 */
347static ssize_t data_next(const char **buf)
348{
349 ssize_t size;
350
351 if (current_format) {
352 TRY(size = iter_next());
353 *buf = iter.seq.buffer;
354 } else {
355 TRY(size = pager_next());
356 *buf = pager.spare;
357 if (size == 0) {
358 if (pager_next_cpu()) {
359 size = sizeof(CPU_DELIM);
360 *buf = CPU_DELIM;
361 } else {
362 TRY(size = cmdline_next());
363 *buf = cmdline_buf;
364 }
365 }
366 }
367 return size;
368}
369
370static int data_deinit(void)
371{
372 if (current_format)
373 TRY(iter_deinit());
374 else
375 TRY(pager_deinit());
376 return 0;
377}
378
379static int compress_init(void)
380{
381 int workspacesize, ret;
382
383 compress_done = 0;
384 flush = Z_NO_FLUSH;
385 stream.data_type = current_format ? Z_ASCII : Z_BINARY;
386 workspacesize = zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL);
387 stream.workspace = vmalloc(workspacesize);
388 if (!stream.workspace) {
389 printk(TAG "Could not allocate "
390 "enough memory for zlib!\n");
391 return -ENOMEM;
392 }
393 memset(stream.workspace, 0, workspacesize);
394
395 ret = zlib_deflateInit(&stream, compress_level);
396 if (ret != Z_OK) {
397 printk(TAG "%s\n", stream.msg);
398 return ret;
399 }
400 stream.avail_in = 0;
401 stream.avail_out = 0;
402 TRY(data_init());
403 return 0;
404}
405
406/* compress_next will compress up to min(max_out, TD_COMPRESS_CHUNK) bytes
407 * of data into the output buffer. It gets the data by calling data_next.
408 * It will return the most data it possibly can. If it returns 0, then
409 * there is no more data.
410 *
411 * Because of the way zlib works, each call to zlib_deflate may consume
412 * up to avail_in bytes from next_in, and will fill up to
413 * avail_out bytes in next_out. Once flush == Z_FINISH, it cannot take
414 * any more input. It will output until it is finished, and will return
415 * Z_STREAM_END.
416 */
417static ssize_t compress_next(size_t max_out)
418{
419 ssize_t ret;
420 max_out = min(max_out, (size_t)TD_COMPRESS_CHUNK);
421 stream.next_out = out_buf;
422 stream.avail_out = max_out;
423 while (stream.avail_out > 0 && !compress_done) {
424 if (stream.avail_in == 0 && flush != Z_FINISH) {
425 TRY(stream.avail_in =
426 data_next((const char **)&stream.next_in));
427 flush = (stream.avail_in == 0) ? Z_FINISH : Z_NO_FLUSH;
428 }
429 if (stream.next_in != NULL) {
430 TRYM((ret = zlib_deflate(&stream, flush)),
431 "zlib: %s\n", stream.msg);
432 compress_done = (ret == Z_STREAM_END);
433 }
434 }
435 ret = max_out - stream.avail_out;
436 return ret;
437}
438
439static int compress_deinit(void)
440{
441 TRY(data_deinit());
442
443 zlib_deflateEnd(&stream);
444 vfree(stream.workspace);
445
446 /* TODO: remove */
447 printk(TAG "Total in: %ld\n", stream.total_in);
448 printk(TAG "Total out: %ld\n", stream.total_out);
449 return stream.total_out;
450}
451
452static int compress_reset(void)
453{
454 TRY(compress_deinit());
455 TRY(compress_init());
456 return 0;
457}
458
459/* tracedump_init initializes all tracedump components.
460 * Call this before tracedump_next
461 */
462int tracedump_init(void)
463{
464 TRY(compress_init());
465 return 0;
466}
467
468/* tracedump_next will print up to max_out bytes from the tracing ring
469 * buffers using the print function selected by print_to. The data is
470 * compressed using zlib.
471 *
472 * The output type of the data is specified by the format_ascii module
473 * parameter. If format_ascii == 1, human-readable data will be output.
474 * Otherwise, it will output raw data from the ring buffer in cpu order,
475 * followed by the saved_cmdlines data.
476 */
477ssize_t tracedump_next(size_t max_out, int print_to)
478{
479 ssize_t size;
480 TRY(size = compress_next(max_out));
481 print(out_buf, size, print_to);
482 return size;
483}
484
485/* tracedump_all will print all data in the tracing ring buffers using
486 * the print function selected by print_to. The data is compressed using
487 * zlib, and is surrounded by MAGIC_NUMBER.
488 *
489 * The output type of the data is specified by the format_ascii module
490 * parameter. If format_ascii == 1, human-readable data will be output.
491 * Otherwise, it will output raw data from the ring buffer in cpu order,
492 * followed by the saved_cmdlines data.
493 */
494ssize_t tracedump_all(int print_to)
495{
496 ssize_t ret, size = 0;
497 TRY(size += print_magic(print_to));
498
499 do {
500 /* Here the size used doesn't really matter,
501 * since we're dumping everything. */
502 TRY(ret = tracedump_next(0xFFFFFFFF, print_to));
503 size += ret;
504 } while (ret > 0);
505
506 TRY(size += print_magic(print_to));
507
508 return size;
509}
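For reference, the payload between the two MAGIC_NUMBER markers is an ordinary zlib stream, so a userspace consumer could decode a dump captured from the procfs interface along these lines (illustrative sketch, not kernel code; error handling and buffering are simplified):

#include <stdio.h>
#include <string.h>
#include <zlib.h>

/* Inflate a dump that was read() in full from /proc/tracedump. */
static int inflate_tracedump(const unsigned char *buf, size_t len, FILE *out)
{
	unsigned char chunk[0x8000];
	z_stream zs = { 0 };
	int ret;

	if (len < 18 || memcmp(buf, "TRACEDUMP", 9))
		return -1;
	buf += 9;		/* skip the leading magic... */
	len -= 18;		/* ...and drop the trailing one */

	if (inflateInit(&zs) != Z_OK)
		return -1;
	zs.next_in  = (unsigned char *)buf;
	zs.avail_in = len;
	do {
		zs.next_out  = chunk;
		zs.avail_out = sizeof(chunk);
		ret = inflate(&zs, Z_NO_FLUSH);
		if (ret != Z_OK && ret != Z_STREAM_END)
			break;
		fwrite(chunk, 1, sizeof(chunk) - zs.avail_out, out);
	} while (ret != Z_STREAM_END);
	inflateEnd(&zs);
	return ret == Z_STREAM_END ? 0 : -1;
}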
510
511/* tracedump_deinit deinitializes all tracedump components.
512 * This must be called, even on error.
513 */
514int tracedump_deinit(void)
515{
516 TRY(compress_deinit());
517 return 0;
518}
519
520/* tracedump_reset reinitializes all tracedump components. */
521int tracedump_reset(void)
522{
523 TRY(compress_reset());
524 return 0;
525}
526
527
528
529/* tracedump_open opens the tracedump file for reading. */
530static int tracedump_open(struct inode *inode, struct file *file)
531{
532 int ret;
533 mutex_lock(&tracedump_proc_lock);
534 current_format = format_ascii;
535 ret = tracedump_init();
536 if (ret < 0)
537 goto err;
538
539 ret = nonseekable_open(inode, file);
540 if (ret < 0)
541 goto err;
542 return ret;
543
544err:
545 mutex_unlock(&tracedump_proc_lock);
546 return ret;
547}
548
549/* tracedump_read reads data from tracedump_next and prints
550 * it to userspace. It will surround the data with MAGIC_NUMBER.
551 */
552static ssize_t tracedump_read(struct file *file, char __user *buf,
553 size_t len, loff_t *offset)
554{
555 static int done;
556 ssize_t size = 0;
557
558 pager.ubuf = buf;
559
560 if (*offset == 0) {
561 done = 0;
562 TRY(size = print_magic(TD_PRINT_USER));
563 } else if (!done) {
564 TRY(size = tracedump_next(len, TD_PRINT_USER));
565 if (size == 0) {
566 TRY(size = print_magic(TD_PRINT_USER));
567 done = 1;
568 }
569 }
570
571 *offset += size;
572
573 return size;
574}
575
576static int tracedump_release(struct inode *inode, struct file *file)
577{
578 int ret;
579 ret = tracedump_deinit();
580 mutex_unlock(&tracedump_proc_lock);
581 return ret;
582}
583
584/* tracedump_dump dumps all tracing data from the tracing ring buffers
585 * to all consoles. For details about the output format, see
586 * tracedump_all.
587 *
588 * At most max_out bytes are dumped. To accomplish this,
589 * tracedump_dump calls tracedump_all several times without writing the data,
590 * each time tossing out old data until it reaches its goal.
591 *
592 * Note: dumping raw pages currently does NOT follow the size limit.
593 */
594
595int tracedump_dump(size_t max_out)
596{
597 ssize_t size;
598 size_t consume;
599
600 printk(TAG "\n");
601
602 tracedump_init();
603
604 if (format_ascii) {
605 size = tracedump_all(TD_NO_PRINT);
606 if (size < 0) {
607 printk(TAG "failed to dump\n");
608 goto out;
609 }
610 while (size > max_out) {
611 TRY(tracedump_deinit());
612			/* Events take roughly 60 ASCII bytes each,
613			 * not counting compression */
614 consume = TD_MIN_CONSUME + (size - max_out) /
615 (60 / (compress_level + 1));
616 TRY(consume_events(consume));
617 TRY(tracedump_init());
618 size = tracedump_all(TD_NO_PRINT);
619 if (size < 0) {
620 printk(TAG "failed to dump\n");
621 goto out;
622 }
623 }
624
625 TRY(tracedump_reset());
626 }
627 size = tracedump_all(TD_PRINT_CONSOLE);
628 if (size < 0) {
629 printk(TAG "failed to dump\n");
630 goto out;
631 }
632
633out:
634 tracedump_deinit();
635 printk(KERN_INFO "\n" TAG " end\n");
636 return size;
637}
638
639static const struct file_operations tracedump_fops = {
640 .owner = THIS_MODULE,
641 .open = tracedump_open,
642 .read = tracedump_read,
643 .release = tracedump_release,
644};
645
646#ifdef CONFIG_TRACEDUMP_PANIC
647static int tracedump_panic_handler(struct notifier_block *this,
648 unsigned long event, void *unused)
649{
650 tracedump_dump(panic_size);
651 return 0;
652}
653
654static struct notifier_block tracedump_panic_notifier = {
655 .notifier_call = tracedump_panic_handler,
656 .next = NULL,
657 .priority = 150 /* priority: INT_MAX >= x >= 0 */
658};
659#endif
660
661static int __init tracedump_initcall(void)
662{
663#ifdef CONFIG_TRACEDUMP_PROCFS
664 struct proc_dir_entry *entry;
665
666 /* Create a procfs file for easy dumping */
667 entry = create_proc_entry("tracedump", S_IFREG | S_IRUGO, NULL);
668 if (!entry)
669 printk(TAG "failed to create proc entry\n");
670 else
671 entry->proc_fops = &tracedump_fops;
672#endif
673
674#ifdef CONFIG_TRACEDUMP_PANIC
675 /* Automatically dump to console on a kernel panic */
676 atomic_notifier_chain_register(&panic_notifier_list,
677 &tracedump_panic_notifier);
678#endif
679 return 0;
680}
681
682early_initcall(tracedump_initcall);
diff --git a/kernel/trace/tracelevel.c b/kernel/trace/tracelevel.c
new file mode 100644
index 00000000000..9f8b8eedbb5
--- /dev/null
+++ b/kernel/trace/tracelevel.c
@@ -0,0 +1,142 @@
1/*
2 * kernel/trace/tracelevel.c
3 *
4 * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 */
20
21#include <linux/ftrace_event.h>
22#include <linux/list.h>
23#include <linux/moduleparam.h>
24#include <linux/mutex.h>
25#include <linux/tracelevel.h>
26#include <linux/vmalloc.h>
27
28#include "trace.h"
29
30#define TAG KERN_ERR "tracelevel: "
31
32struct tracelevel_record {
33 struct list_head list;
34 char *name;
35 int level;
36};
37
38static LIST_HEAD(tracelevel_list);
39
40static bool started;
41static unsigned int tracelevel_level = TRACELEVEL_DEFAULT;
42
43static DEFINE_MUTEX(tracelevel_record_lock);
44
45/* tracelevel_set_event sets a single event if set = 1, or
46 * clears an event if set = 0.
47 */
48static int tracelevel_set_event(struct tracelevel_record *evt, bool set)
49{
50 if (trace_set_clr_event(NULL, evt->name, set) < 0) {
51 printk(TAG "failed to set event %s\n", evt->name);
52 return -EINVAL;
53 }
54 return 0;
55}
56
57/* Registers an event. If possible, it also sets it.
58 * If not, we'll set it in tracelevel_init.
59 */
60int __tracelevel_register(char *name, unsigned int level)
61{
62 struct tracelevel_record *evt = (struct tracelevel_record *)
63 vmalloc(sizeof(struct tracelevel_record));
64 if (!evt) {
65 printk(TAG "failed to allocate tracelevel_record for %s\n",
66 name);
67 return -ENOMEM;
68 }
69
70 evt->name = name;
71 evt->level = level;
72
73 mutex_lock(&tracelevel_record_lock);
74 list_add(&evt->list, &tracelevel_list);
75 mutex_unlock(&tracelevel_record_lock);
76
77 if (level >= tracelevel_level && started)
78 tracelevel_set_event(evt, 1);
79 return 0;
80}
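Assuming a driver wants its events controlled by this mechanism, registration at init time might look like the sketch below (the event names and levels are illustrative):

static int __init example_register_events(void)
{
	int ret;

	/* Low level: only enabled when the global level is lowered to 1 or below. */
	ret = __tracelevel_register("example_verbose_event", 1);
	if (ret < 0)
		return ret;

	/* Highest level: enabled at any global level. */
	return __tracelevel_register("example_critical_event", TRACELEVEL_MAX);
}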
81
82/* tracelevel_set_level sets the global level, clears events below
83 * that level, and enables events at or above it.
84 */
85int tracelevel_set_level(int level)
86{
87 struct tracelevel_record *evt = NULL;
88
89 if (level < 0 || level > TRACELEVEL_MAX)
90 return -EINVAL;
91 tracelevel_level = level;
92
93 mutex_lock(&tracelevel_record_lock);
94 list_for_each_entry(evt, &tracelevel_list, list) {
95 if (evt->level >= level)
96 tracelevel_set_event(evt, 1);
97 else
98 tracelevel_set_event(evt, 0);
99 }
100 mutex_unlock(&tracelevel_record_lock);
101 return 0;
102}
103
104static int param_set_level(const char *val, const struct kernel_param *kp)
105{
106 int level, ret;
107 ret = strict_strtol(val, 0, &level);
108 if (ret < 0)
109 return ret;
110 return tracelevel_set_level(level);
111}
112
113static int param_get_level(char *buffer, const struct kernel_param *kp)
114{
115 return param_get_int(buffer, kp);
116}
117
118static struct kernel_param_ops tracelevel_level_ops = {
119 .set = param_set_level,
120 .get = param_get_level
121};
122
123module_param_cb(level, &tracelevel_level_ops, &tracelevel_level, 0644);
124
125/* Turn on the tracing that has been registered thus far. */
126static int __init tracelevel_init(void)
127{
128 int ret;
129 started = true;
130
131	/* Ring buffer is initialized to 1 page until the user sets a tracer.
132	 * Since we're doing this manually, we need to ask for an expanded buffer.
133 */
134 ret = tracing_update_buffers();
135 if (ret < 0)
136 return ret;
137
138 return tracelevel_set_level(tracelevel_level);
139}
140
141/* Tracing mechanism is set up during fs_initcall. */
142fs_initcall_sync(tracelevel_init);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 24dc60d9fa1..5bbfac85866 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -78,6 +78,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
78 78
79#define KB 1024 79#define KB 1024
80#define MB (1024*KB) 80#define MB (1024*KB)
81#define KB_MASK (~(KB-1))
81/* 82/*
82 * fill in extended accounting fields 83 * fill in extended accounting fields
83 */ 84 */
@@ -95,14 +96,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
95 stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; 96 stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB;
96 mmput(mm); 97 mmput(mm);
97 } 98 }
98 stats->read_char = p->ioac.rchar; 99 stats->read_char = p->ioac.rchar & KB_MASK;
99 stats->write_char = p->ioac.wchar; 100 stats->write_char = p->ioac.wchar & KB_MASK;
100 stats->read_syscalls = p->ioac.syscr; 101 stats->read_syscalls = p->ioac.syscr & KB_MASK;
101 stats->write_syscalls = p->ioac.syscw; 102 stats->write_syscalls = p->ioac.syscw & KB_MASK;
102#ifdef CONFIG_TASK_IO_ACCOUNTING 103#ifdef CONFIG_TASK_IO_ACCOUNTING
103 stats->read_bytes = p->ioac.read_bytes; 104 stats->read_bytes = p->ioac.read_bytes & KB_MASK;
104 stats->write_bytes = p->ioac.write_bytes; 105 stats->write_bytes = p->ioac.write_bytes & KB_MASK;
105 stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; 106 stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes & KB_MASK;
106#else 107#else
107 stats->read_bytes = 0; 108 stats->read_bytes = 0;
108 stats->write_bytes = 0; 109 stats->write_bytes = 0;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3d0c56ad479..36491cd5b7d 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -200,6 +200,7 @@ static int is_softlockup(unsigned long touch_ts)
200} 200}
201 201
202#ifdef CONFIG_HARDLOCKUP_DETECTOR 202#ifdef CONFIG_HARDLOCKUP_DETECTOR
203
203static struct perf_event_attr wd_hw_attr = { 204static struct perf_event_attr wd_hw_attr = {
204 .type = PERF_TYPE_HARDWARE, 205 .type = PERF_TYPE_HARDWARE,
205 .config = PERF_COUNT_HW_CPU_CYCLES, 206 .config = PERF_COUNT_HW_CPU_CYCLES,
@@ -209,7 +210,7 @@ static struct perf_event_attr wd_hw_attr = {
209}; 210};
210 211
211/* Callback function for perf event subsystem */ 212/* Callback function for perf event subsystem */
212static void watchdog_overflow_callback(struct perf_event *event, int nmi, 213static void watchdog_overflow_callback(struct perf_event *event,
213 struct perf_sample_data *data, 214 struct perf_sample_data *data,
214 struct pt_regs *regs) 215 struct pt_regs *regs)
215{ 216{
@@ -368,10 +369,11 @@ static int watchdog_nmi_enable(int cpu)
368 if (event != NULL) 369 if (event != NULL)
369 goto out_enable; 370 goto out_enable;
370 371
371 /* Try to register using hardware perf events */
372 wd_attr = &wd_hw_attr; 372 wd_attr = &wd_hw_attr;
373 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); 373 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
374 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); 374
375 /* Try to register using hardware perf events */
376 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
375 if (!IS_ERR(event)) { 377 if (!IS_ERR(event)) {
376 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 378 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
377 goto out_save; 379 goto out_save;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0400553f0d0..1783aabc612 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -221,7 +221,7 @@ typedef unsigned long mayday_mask_t;
221 * per-CPU workqueues: 221 * per-CPU workqueues:
222 */ 222 */
223struct workqueue_struct { 223struct workqueue_struct {
224 unsigned int flags; /* I: WQ_* flags */ 224 unsigned int flags; /* W: WQ_* flags */
225 union { 225 union {
226 struct cpu_workqueue_struct __percpu *pcpu; 226 struct cpu_workqueue_struct __percpu *pcpu;
227 struct cpu_workqueue_struct *single; 227 struct cpu_workqueue_struct *single;
@@ -240,6 +240,7 @@ struct workqueue_struct {
240 mayday_mask_t mayday_mask; /* cpus requesting rescue */ 240 mayday_mask_t mayday_mask; /* cpus requesting rescue */
241 struct worker *rescuer; /* I: rescue worker */ 241 struct worker *rescuer; /* I: rescue worker */
242 242
243 int nr_drainers; /* W: drain in progress */
243 int saved_max_active; /* W: saved cwq max_active */ 244 int saved_max_active; /* W: saved cwq max_active */
244 const char *name; /* I: workqueue name */ 245 const char *name; /* I: workqueue name */
245#ifdef CONFIG_LOCKDEP 246#ifdef CONFIG_LOCKDEP
@@ -990,7 +991,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
990 debug_work_activate(work); 991 debug_work_activate(work);
991 992
992 /* if dying, only works from the same workqueue are allowed */ 993 /* if dying, only works from the same workqueue are allowed */
993 if (unlikely(wq->flags & WQ_DYING) && 994 if (unlikely(wq->flags & WQ_DRAINING) &&
994 WARN_ON_ONCE(!is_chained_work(wq))) 995 WARN_ON_ONCE(!is_chained_work(wq)))
995 return; 996 return;
996 997
@@ -2381,6 +2382,59 @@ out_unlock:
2381} 2382}
2382EXPORT_SYMBOL_GPL(flush_workqueue); 2383EXPORT_SYMBOL_GPL(flush_workqueue);
2383 2384
2385/**
2386 * drain_workqueue - drain a workqueue
2387 * @wq: workqueue to drain
2388 *
2389 * Wait until the workqueue becomes empty. While draining is in progress,
2390 * only chain queueing is allowed. IOW, only currently pending or running
2391 * work items on @wq can queue further work items on it. @wq is flushed
2392 * repeatedly until it becomes empty. The number of flushing is detemined
2393 * by the depth of chaining and should be relatively short. Whine if it
2394 * takes too long.
2395 */
2396void drain_workqueue(struct workqueue_struct *wq)
2397{
2398 unsigned int flush_cnt = 0;
2399 unsigned int cpu;
2400
2401 /*
2402 * __queue_work() needs to test whether there are drainers, is much
2403 * hotter than drain_workqueue() and already looks at @wq->flags.
2404 * Use WQ_DRAINING so that queue doesn't have to check nr_drainers.
2405 */
2406 spin_lock(&workqueue_lock);
2407 if (!wq->nr_drainers++)
2408 wq->flags |= WQ_DRAINING;
2409 spin_unlock(&workqueue_lock);
2410reflush:
2411 flush_workqueue(wq);
2412
2413 for_each_cwq_cpu(cpu, wq) {
2414 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2415 bool drained;
2416
2417 spin_lock_irq(&cwq->gcwq->lock);
2418 drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
2419 spin_unlock_irq(&cwq->gcwq->lock);
2420
2421 if (drained)
2422 continue;
2423
2424 if (++flush_cnt == 10 ||
2425 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2426 pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n",
2427 wq->name, flush_cnt);
2428 goto reflush;
2429 }
2430
2431 spin_lock(&workqueue_lock);
2432 if (!--wq->nr_drainers)
2433 wq->flags &= ~WQ_DRAINING;
2434 spin_unlock(&workqueue_lock);
2435}
2436EXPORT_SYMBOL_GPL(drain_workqueue);
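Beyond its use in destroy_workqueue(), the export lets a caller quiesce a queue it intends to keep using; a hypothetical example:

/* Wait until my_wq is empty (only chained work may requeue meanwhile),
 * then keep submitting to it afterwards instead of destroying it.
 */
static void example_quiesce(struct workqueue_struct *my_wq)
{
	drain_workqueue(my_wq);
}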
2437
2384static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, 2438static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2385 bool wait_executing) 2439 bool wait_executing)
2386{ 2440{
@@ -3009,34 +3063,10 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
3009 */ 3063 */
3010void destroy_workqueue(struct workqueue_struct *wq) 3064void destroy_workqueue(struct workqueue_struct *wq)
3011{ 3065{
3012 unsigned int flush_cnt = 0;
3013 unsigned int cpu; 3066 unsigned int cpu;
3014 3067
3015 /* 3068 /* drain it before proceeding with destruction */
3016 * Mark @wq dying and drain all pending works. Once WQ_DYING is 3069 drain_workqueue(wq);
3017 * set, only chain queueing is allowed. IOW, only currently
3018 * pending or running work items on @wq can queue further work
3019 * items on it. @wq is flushed repeatedly until it becomes empty.
3020 * The number of flushing is detemined by the depth of chaining and
3021 * should be relatively short. Whine if it takes too long.
3022 */
3023 wq->flags |= WQ_DYING;
3024reflush:
3025 flush_workqueue(wq);
3026
3027 for_each_cwq_cpu(cpu, wq) {
3028 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3029
3030 if (!cwq->nr_active && list_empty(&cwq->delayed_works))
3031 continue;
3032
3033 if (++flush_cnt == 10 ||
3034 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
3035 printk(KERN_WARNING "workqueue %s: flush on "
3036 "destruction isn't complete after %u tries\n",
3037 wq->name, flush_cnt);
3038 goto reflush;
3039 }
3040 3070
3041 /* 3071 /*
3042 * wq list is used to freeze wq, remove from list after 3072 * wq list is used to freeze wq, remove from list after