aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2011-08-04 03:09:27 -0400
committerIngo Molnar <mingo@elte.hu>2011-08-04 03:09:27 -0400
commitd7619fe39d9769b4d4545cc511c891deea18ae08 (patch)
tree0a902533414001075b2245825e145cc2e35ce985 /kernel
parent9ea71503a8ed9184d2d0b8ccc4d269d05f7940ae (diff)
parented8f37370d83e695c0a4fa5d5fc7a83ecb947526 (diff)
Merge branch 'linus' into core/urgent
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt3
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/async.c12
-rw-r--r--kernel/audit.c31
-rw-r--r--kernel/audit_tree.c8
-rw-r--r--kernel/auditsc.c2
-rw-r--r--kernel/cgroup.c18
-rw-r--r--kernel/compat.c7
-rw-r--r--kernel/configs.c4
-rw-r--r--kernel/cpuset.c10
-rw-r--r--kernel/debug/debug_core.c2
-rw-r--r--kernel/debug/gdbstub.c22
-rw-r--r--kernel/debug/kdb/kdb_bt.c5
-rw-r--r--kernel/debug/kdb/kdb_cmds4
-rw-r--r--kernel/debug/kdb/kdb_debugger.c21
-rw-r--r--kernel/debug/kdb/kdb_io.c36
-rw-r--r--kernel/debug/kdb/kdb_main.c4
-rw-r--r--kernel/debug/kdb/kdb_private.h3
-rw-r--r--kernel/delayacct.c2
-rw-r--r--kernel/events/Makefile2
-rw-r--r--kernel/events/core.c938
-rw-r--r--kernel/events/hw_breakpoint.c10
-rw-r--r--kernel/events/internal.h96
-rw-r--r--kernel/events/ring_buffer.c380
-rw-r--r--kernel/exit.c94
-rw-r--r--kernel/fork.c124
-rw-r--r--kernel/futex.c6
-rw-r--r--kernel/gcov/Kconfig2
-rw-r--r--kernel/irq/Kconfig4
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/devres.c2
-rw-r--r--kernel/irq/irqdomain.c180
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kprobes.c33
-rw-r--r--kernel/lockdep.c33
-rw-r--r--kernel/module.c80
-rw-r--r--kernel/notifier.c31
-rw-r--r--kernel/nsproxy.c4
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/params.c18
-rw-r--r--kernel/pid.c1
-rw-r--r--kernel/pm_qos_params.c6
-rw-r--r--kernel/power/Kconfig12
-rw-r--r--kernel/power/main.c5
-rw-r--r--kernel/power/suspend.c20
-rw-r--r--kernel/printk.c24
-rw-r--r--kernel/ptrace.c197
-rw-r--r--kernel/rcupdate.c2
-rw-r--r--kernel/rcutorture.c4
-rw-r--r--kernel/rcutree_trace.c2
-rw-r--r--kernel/resource.c21
-rw-r--r--kernel/rtmutex.c2
-rw-r--r--kernel/rwsem.c18
-rw-r--r--kernel/sched.c403
-rw-r--r--kernel/sched_autogroup.h1
-rw-r--r--kernel/sched_fair.c118
-rw-r--r--kernel/sched_features.h6
-rw-r--r--kernel/sched_rt.c26
-rw-r--r--kernel/signal.c442
-rw-r--r--kernel/stacktrace.c12
-rw-r--r--kernel/stop_machine.c80
-rw-r--r--kernel/sys.c32
-rw-r--r--kernel/sysctl.c11
-rw-r--r--kernel/taskstats.c2
-rw-r--r--kernel/time/timekeeping.c28
-rw-r--r--kernel/trace/ftrace.c157
-rw-r--r--kernel/trace/ring_buffer.c66
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/trace.c326
-rw-r--r--kernel/trace/trace.h63
-rw-r--r--kernel/trace/trace_entries.h3
-rw-r--r--kernel/trace/trace_events.c139
-rw-r--r--kernel/trace/trace_events_filter.c6
-rw-r--r--kernel/trace/trace_functions.c3
-rw-r--r--kernel/trace/trace_functions_graph.c225
-rw-r--r--kernel/trace/trace_irqsoff.c4
-rw-r--r--kernel/trace/trace_kprobe.c324
-rw-r--r--kernel/trace/trace_mmiotrace.c2
-rw-r--r--kernel/trace/trace_output.c11
-rw-r--r--kernel/trace/trace_sched_wakeup.c4
-rw-r--r--kernel/trace/trace_stack.c13
-rw-r--r--kernel/watchdog.c8
-rw-r--r--kernel/workqueue.c81
83 files changed, 3247 insertions, 1906 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf987b95b356..24e7cb0ba26a 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY
35 35
36config PREEMPT 36config PREEMPT
37 bool "Preemptible Kernel (Low-Latency Desktop)" 37 bool "Preemptible Kernel (Low-Latency Desktop)"
38 select PREEMPT_COUNT
38 help 39 help
39 This option reduces the latency of the kernel by making 40 This option reduces the latency of the kernel by making
40 all kernel code (that is not executing in a critical section) 41 all kernel code (that is not executing in a critical section)
@@ -52,3 +53,5 @@ config PREEMPT
52 53
53endchoice 54endchoice
54 55
56config PREEMPT_COUNT
57 bool \ No newline at end of file
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d64cfcc8b42..d06467fc8f7c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -125,11 +125,10 @@ targets += config_data.gz
125$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE 125$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
126 $(call if_changed,gzip) 126 $(call if_changed,gzip)
127 127
128quiet_cmd_ikconfiggz = IKCFG $@ 128 filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
129 cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
130targets += config_data.h 129targets += config_data.h
131$(obj)/config_data.h: $(obj)/config_data.gz FORCE 130$(obj)/config_data.h: $(obj)/config_data.gz FORCE
132 $(call if_changed,ikconfiggz) 131 $(call filechk,ikconfiggz)
133 132
134$(obj)/time.o: $(obj)/timeconst.h 133$(obj)/time.o: $(obj)/timeconst.h
135 134
diff --git a/kernel/async.c b/kernel/async.c
index cd9dbb913c77..d5fe7af0de2e 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,12 +49,13 @@ asynchronous and synchronous parts of the kernel.
49*/ 49*/
50 50
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/atomic.h>
53#include <linux/ktime.h>
52#include <linux/module.h> 54#include <linux/module.h>
53#include <linux/wait.h> 55#include <linux/wait.h>
54#include <linux/sched.h> 56#include <linux/sched.h>
55#include <linux/slab.h> 57#include <linux/slab.h>
56#include <linux/workqueue.h> 58#include <linux/workqueue.h>
57#include <asm/atomic.h>
58 59
59static async_cookie_t next_cookie = 1; 60static async_cookie_t next_cookie = 1;
60 61
@@ -128,7 +129,8 @@ static void async_run_entry_fn(struct work_struct *work)
128 129
129 /* 2) run (and print duration) */ 130 /* 2) run (and print duration) */
130 if (initcall_debug && system_state == SYSTEM_BOOTING) { 131 if (initcall_debug && system_state == SYSTEM_BOOTING) {
131 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, 132 printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
133 (long long)entry->cookie,
132 entry->func, task_pid_nr(current)); 134 entry->func, task_pid_nr(current));
133 calltime = ktime_get(); 135 calltime = ktime_get();
134 } 136 }
@@ -136,7 +138,7 @@ static void async_run_entry_fn(struct work_struct *work)
136 if (initcall_debug && system_state == SYSTEM_BOOTING) { 138 if (initcall_debug && system_state == SYSTEM_BOOTING) {
137 rettime = ktime_get(); 139 rettime = ktime_get();
138 delta = ktime_sub(rettime, calltime); 140 delta = ktime_sub(rettime, calltime);
139 printk("initcall %lli_%pF returned 0 after %lld usecs\n", 141 printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n",
140 (long long)entry->cookie, 142 (long long)entry->cookie,
141 entry->func, 143 entry->func,
142 (long long)ktime_to_ns(delta) >> 10); 144 (long long)ktime_to_ns(delta) >> 10);
@@ -270,7 +272,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
270 ktime_t starttime, delta, endtime; 272 ktime_t starttime, delta, endtime;
271 273
272 if (initcall_debug && system_state == SYSTEM_BOOTING) { 274 if (initcall_debug && system_state == SYSTEM_BOOTING) {
273 printk("async_waiting @ %i\n", task_pid_nr(current)); 275 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
274 starttime = ktime_get(); 276 starttime = ktime_get();
275 } 277 }
276 278
@@ -280,7 +282,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
280 endtime = ktime_get(); 282 endtime = ktime_get();
281 delta = ktime_sub(endtime, starttime); 283 delta = ktime_sub(endtime, starttime);
282 284
283 printk("async_continuing @ %i after %lli usec\n", 285 printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n",
284 task_pid_nr(current), 286 task_pid_nr(current),
285 (long long)ktime_to_ns(delta) >> 10); 287 (long long)ktime_to_ns(delta) >> 10);
286 } 288 }
diff --git a/kernel/audit.c b/kernel/audit.c
index 939500317066..0a1355ca3d79 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -43,7 +43,7 @@
43 43
44#include <linux/init.h> 44#include <linux/init.h>
45#include <asm/types.h> 45#include <asm/types.h>
46#include <asm/atomic.h> 46#include <linux/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/module.h> 48#include <linux/module.h>
49#include <linux/slab.h> 49#include <linux/slab.h>
@@ -55,6 +55,9 @@
55#include <net/sock.h> 55#include <net/sock.h>
56#include <net/netlink.h> 56#include <net/netlink.h>
57#include <linux/skbuff.h> 57#include <linux/skbuff.h>
58#ifdef CONFIG_SECURITY
59#include <linux/security.h>
60#endif
58#include <linux/netlink.h> 61#include <linux/netlink.h>
59#include <linux/freezer.h> 62#include <linux/freezer.h>
60#include <linux/tty.h> 63#include <linux/tty.h>
@@ -1502,6 +1505,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
1502 } 1505 }
1503} 1506}
1504 1507
1508#ifdef CONFIG_SECURITY
1509/**
1510 * audit_log_secctx - Converts and logs SELinux context
1511 * @ab: audit_buffer
1512 * @secid: security number
1513 *
1514 * This is a helper function that calls security_secid_to_secctx to convert
1515 * secid to secctx and then adds the (converted) SELinux context to the audit
1516 * log by calling audit_log_format, thus also preventing leak of internal secid
1517 * to userspace. If secid cannot be converted audit_panic is called.
1518 */
1519void audit_log_secctx(struct audit_buffer *ab, u32 secid)
1520{
1521 u32 len;
1522 char *secctx;
1523
1524 if (security_secid_to_secctx(secid, &secctx, &len)) {
1525 audit_panic("Cannot convert secid to context");
1526 } else {
1527 audit_log_format(ab, " obj=%s", secctx);
1528 security_release_secctx(secctx, len);
1529 }
1530}
1531EXPORT_SYMBOL(audit_log_secctx);
1532#endif
1533
1505EXPORT_SYMBOL(audit_log_start); 1534EXPORT_SYMBOL(audit_log_start);
1506EXPORT_SYMBOL(audit_log_end); 1535EXPORT_SYMBOL(audit_log_end);
1507EXPORT_SYMBOL(audit_log_format); 1536EXPORT_SYMBOL(audit_log_format);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e99dda04b126..5bf0790497e7 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -93,16 +93,10 @@ static inline void get_tree(struct audit_tree *tree)
93 atomic_inc(&tree->count); 93 atomic_inc(&tree->count);
94} 94}
95 95
96static void __put_tree(struct rcu_head *rcu)
97{
98 struct audit_tree *tree = container_of(rcu, struct audit_tree, head);
99 kfree(tree);
100}
101
102static inline void put_tree(struct audit_tree *tree) 96static inline void put_tree(struct audit_tree *tree)
103{ 97{
104 if (atomic_dec_and_test(&tree->count)) 98 if (atomic_dec_and_test(&tree->count))
105 call_rcu(&tree->head, __put_tree); 99 kfree_rcu(tree, head);
106} 100}
107 101
108/* to avoid bringing the entire thing in audit.h */ 102/* to avoid bringing the entire thing in audit.h */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 00d79df03e76..ce4b054acee5 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -44,7 +44,7 @@
44 44
45#include <linux/init.h> 45#include <linux/init.h>
46#include <asm/types.h> 46#include <asm/types.h>
47#include <asm/atomic.h> 47#include <linux/atomic.h>
48#include <linux/fs.h> 48#include <linux/fs.h>
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mm.h> 50#include <linux/mm.h>
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2731d115d725..1d2b6ceea95d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -27,9 +27,11 @@
27 */ 27 */
28 28
29#include <linux/cgroup.h> 29#include <linux/cgroup.h>
30#include <linux/cred.h>
30#include <linux/ctype.h> 31#include <linux/ctype.h>
31#include <linux/errno.h> 32#include <linux/errno.h>
32#include <linux/fs.h> 33#include <linux/fs.h>
34#include <linux/init_task.h>
33#include <linux/kernel.h> 35#include <linux/kernel.h>
34#include <linux/list.h> 36#include <linux/list.h>
35#include <linux/mm.h> 37#include <linux/mm.h>
@@ -59,7 +61,7 @@
59#include <linux/poll.h> 61#include <linux/poll.h>
60#include <linux/flex_array.h> /* used in cgroup_attach_proc */ 62#include <linux/flex_array.h> /* used in cgroup_attach_proc */
61 63
62#include <asm/atomic.h> 64#include <linux/atomic.h>
63 65
64static DEFINE_MUTEX(cgroup_mutex); 66static DEFINE_MUTEX(cgroup_mutex);
65 67
@@ -1514,6 +1516,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1514 struct cgroup *root_cgrp = &root->top_cgroup; 1516 struct cgroup *root_cgrp = &root->top_cgroup;
1515 struct inode *inode; 1517 struct inode *inode;
1516 struct cgroupfs_root *existing_root; 1518 struct cgroupfs_root *existing_root;
1519 const struct cred *cred;
1517 int i; 1520 int i;
1518 1521
1519 BUG_ON(sb->s_root != NULL); 1522 BUG_ON(sb->s_root != NULL);
@@ -1593,7 +1596,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1593 BUG_ON(!list_empty(&root_cgrp->children)); 1596 BUG_ON(!list_empty(&root_cgrp->children));
1594 BUG_ON(root->number_of_cgroups != 1); 1597 BUG_ON(root->number_of_cgroups != 1);
1595 1598
1599 cred = override_creds(&init_cred);
1596 cgroup_populate_dir(root_cgrp); 1600 cgroup_populate_dir(root_cgrp);
1601 revert_creds(cred);
1597 mutex_unlock(&cgroup_mutex); 1602 mutex_unlock(&cgroup_mutex);
1598 mutex_unlock(&inode->i_mutex); 1603 mutex_unlock(&inode->i_mutex);
1599 } else { 1604 } else {
@@ -1697,7 +1702,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1697{ 1702{
1698 char *start; 1703 char *start;
1699 struct dentry *dentry = rcu_dereference_check(cgrp->dentry, 1704 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1700 rcu_read_lock_held() ||
1701 cgroup_lock_is_held()); 1705 cgroup_lock_is_held());
1702 1706
1703 if (!dentry || cgrp == dummytop) { 1707 if (!dentry || cgrp == dummytop) {
@@ -1723,7 +1727,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1723 break; 1727 break;
1724 1728
1725 dentry = rcu_dereference_check(cgrp->dentry, 1729 dentry = rcu_dereference_check(cgrp->dentry,
1726 rcu_read_lock_held() ||
1727 cgroup_lock_is_held()); 1730 cgroup_lock_is_held());
1728 if (!cgrp->parent) 1731 if (!cgrp->parent)
1729 continue; 1732 continue;
@@ -3542,7 +3545,8 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3542 } 3545 }
3543 3546
3544 /* the process need read permission on control file */ 3547 /* the process need read permission on control file */
3545 ret = file_permission(cfile, MAY_READ); 3548 /* AV: shouldn't we check that it's been opened for read instead? */
3549 ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
3546 if (ret < 0) 3550 if (ret < 0)
3547 goto fail; 3551 goto fail;
3548 3552
@@ -4813,8 +4817,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
4813 * on this or this is under rcu_read_lock(). Once css->id is allocated, 4817 * on this or this is under rcu_read_lock(). Once css->id is allocated,
4814 * it's unchanged until freed. 4818 * it's unchanged until freed.
4815 */ 4819 */
4816 cssid = rcu_dereference_check(css->id, 4820 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
4817 rcu_read_lock_held() || atomic_read(&css->refcnt));
4818 4821
4819 if (cssid) 4822 if (cssid)
4820 return cssid->id; 4823 return cssid->id;
@@ -4826,8 +4829,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
4826{ 4829{
4827 struct css_id *cssid; 4830 struct css_id *cssid;
4828 4831
4829 cssid = rcu_dereference_check(css->id, 4832 cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
4830 rcu_read_lock_held() || atomic_read(&css->refcnt));
4831 4833
4832 if (cssid) 4834 if (cssid)
4833 return cssid->depth; 4835 return cssid->depth;
diff --git a/kernel/compat.c b/kernel/compat.c
index fc9eb093acd5..e2435ee9993a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -158,6 +158,7 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
158 __put_user(ts->tv_sec, &cts->tv_sec) || 158 __put_user(ts->tv_sec, &cts->tv_sec) ||
159 __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; 159 __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
160} 160}
161EXPORT_SYMBOL_GPL(put_compat_timespec);
161 162
162static long compat_nanosleep_restart(struct restart_block *restart) 163static long compat_nanosleep_restart(struct restart_block *restart)
163{ 164{
@@ -890,6 +891,7 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
890 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); 891 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
891 } 892 }
892} 893}
894EXPORT_SYMBOL_GPL(sigset_from_compat);
893 895
894asmlinkage long 896asmlinkage long
895compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, 897compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
@@ -991,11 +993,8 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
991 sigset_from_compat(&newset, &newset32); 993 sigset_from_compat(&newset, &newset32);
992 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 994 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
993 995
994 spin_lock_irq(&current->sighand->siglock);
995 current->saved_sigmask = current->blocked; 996 current->saved_sigmask = current->blocked;
996 current->blocked = newset; 997 set_current_blocked(&newset);
997 recalc_sigpending();
998 spin_unlock_irq(&current->sighand->siglock);
999 998
1000 current->state = TASK_INTERRUPTIBLE; 999 current->state = TASK_INTERRUPTIBLE;
1001 schedule(); 1000 schedule();
diff --git a/kernel/configs.c b/kernel/configs.c
index b4066b44a99d..42e8fa075eed 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -92,8 +92,8 @@ static void __exit ikconfig_cleanup(void)
92module_init(ikconfig_init); 92module_init(ikconfig_init);
93module_exit(ikconfig_cleanup); 93module_exit(ikconfig_cleanup);
94 94
95#endif /* CONFIG_IKCONFIG_PROC */
96
95MODULE_LICENSE("GPL"); 97MODULE_LICENSE("GPL");
96MODULE_AUTHOR("Randy Dunlap"); 98MODULE_AUTHOR("Randy Dunlap");
97MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel"); 99MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
98
99#endif /* CONFIG_IKCONFIG_PROC */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9c9b7545c810..10131fdaff70 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -55,7 +55,7 @@
55#include <linux/sort.h> 55#include <linux/sort.h>
56 56
57#include <asm/uaccess.h> 57#include <asm/uaccess.h>
58#include <asm/atomic.h> 58#include <linux/atomic.h>
59#include <linux/mutex.h> 59#include <linux/mutex.h>
60#include <linux/workqueue.h> 60#include <linux/workqueue.h>
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
@@ -2460,11 +2460,19 @@ static int cpuset_spread_node(int *rotor)
2460 2460
2461int cpuset_mem_spread_node(void) 2461int cpuset_mem_spread_node(void)
2462{ 2462{
2463 if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
2464 current->cpuset_mem_spread_rotor =
2465 node_random(&current->mems_allowed);
2466
2463 return cpuset_spread_node(&current->cpuset_mem_spread_rotor); 2467 return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
2464} 2468}
2465 2469
2466int cpuset_slab_spread_node(void) 2470int cpuset_slab_spread_node(void)
2467{ 2471{
2472 if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
2473 current->cpuset_slab_spread_rotor =
2474 node_random(&current->mems_allowed);
2475
2468 return cpuset_spread_node(&current->cpuset_slab_spread_rotor); 2476 return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
2469} 2477}
2470 2478
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index bad6786dee88..0d7c08784efb 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -51,7 +51,7 @@
51 51
52#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
53#include <asm/byteorder.h> 53#include <asm/byteorder.h>
54#include <asm/atomic.h> 54#include <linux/atomic.h>
55#include <asm/system.h> 55#include <asm/system.h>
56 56
57#include "debug_core.h" 57#include "debug_core.h"
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index a11db956dd62..34872482315e 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -42,6 +42,8 @@
42/* Our I/O buffers. */ 42/* Our I/O buffers. */
43static char remcom_in_buffer[BUFMAX]; 43static char remcom_in_buffer[BUFMAX];
44static char remcom_out_buffer[BUFMAX]; 44static char remcom_out_buffer[BUFMAX];
45static int gdbstub_use_prev_in_buf;
46static int gdbstub_prev_in_buf_pos;
45 47
46/* Storage for the registers, in GDB format. */ 48/* Storage for the registers, in GDB format. */
47static unsigned long gdb_regs[(NUMREGBYTES + 49static unsigned long gdb_regs[(NUMREGBYTES +
@@ -58,6 +60,13 @@ static int gdbstub_read_wait(void)
58 int ret = -1; 60 int ret = -1;
59 int i; 61 int i;
60 62
63 if (unlikely(gdbstub_use_prev_in_buf)) {
64 if (gdbstub_prev_in_buf_pos < gdbstub_use_prev_in_buf)
65 return remcom_in_buffer[gdbstub_prev_in_buf_pos++];
66 else
67 gdbstub_use_prev_in_buf = 0;
68 }
69
61 /* poll any additional I/O interfaces that are defined */ 70 /* poll any additional I/O interfaces that are defined */
62 while (ret < 0) 71 while (ret < 0)
63 for (i = 0; kdb_poll_funcs[i] != NULL; i++) { 72 for (i = 0; kdb_poll_funcs[i] != NULL; i++) {
@@ -109,7 +118,6 @@ static void get_packet(char *buffer)
109 buffer[count] = ch; 118 buffer[count] = ch;
110 count = count + 1; 119 count = count + 1;
111 } 120 }
112 buffer[count] = 0;
113 121
114 if (ch == '#') { 122 if (ch == '#') {
115 xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; 123 xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
@@ -124,6 +132,7 @@ static void get_packet(char *buffer)
124 if (dbg_io_ops->flush) 132 if (dbg_io_ops->flush)
125 dbg_io_ops->flush(); 133 dbg_io_ops->flush();
126 } 134 }
135 buffer[count] = 0;
127 } while (checksum != xmitcsum); 136 } while (checksum != xmitcsum);
128} 137}
129 138
@@ -1082,12 +1091,11 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
1082 case 'c': 1091 case 'c':
1083 strcpy(remcom_in_buffer, cmd); 1092 strcpy(remcom_in_buffer, cmd);
1084 return 0; 1093 return 0;
1085 case '?': 1094 case '$':
1086 gdb_cmd_status(ks); 1095 strcpy(remcom_in_buffer, cmd);
1087 break; 1096 gdbstub_use_prev_in_buf = strlen(remcom_in_buffer);
1088 case '\0': 1097 gdbstub_prev_in_buf_pos = 0;
1089 strcpy(remcom_out_buffer, ""); 1098 return 0;
1090 break;
1091 } 1099 }
1092 dbg_io_ops->write_char('+'); 1100 dbg_io_ops->write_char('+');
1093 put_packet(remcom_out_buffer); 1101 put_packet(remcom_out_buffer);
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 2f62fe85f16a..7179eac7b41c 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -112,9 +112,8 @@ kdb_bt(int argc, const char **argv)
112 unsigned long addr; 112 unsigned long addr;
113 long offset; 113 long offset;
114 114
115 kdbgetintenv("BTARGS", &argcount); /* Arguments to print */ 115 /* Prompt after each proc in bta */
116 kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each 116 kdbgetintenv("BTAPROMPT", &btaprompt);
117 * proc in bta */
118 117
119 if (strcmp(argv[0], "bta") == 0) { 118 if (strcmp(argv[0], "bta") == 0) {
120 struct task_struct *g, *p; 119 struct task_struct *g, *p;
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
index 56c88e4db309..9834ad303ab6 100644
--- a/kernel/debug/kdb/kdb_cmds
+++ b/kernel/debug/kdb/kdb_cmds
@@ -18,16 +18,12 @@ defcmd dumpcommon "" "Common kdb debugging"
18endefcmd 18endefcmd
19 19
20defcmd dumpall "" "First line debugging" 20defcmd dumpall "" "First line debugging"
21 set BTSYMARG 1
22 set BTARGS 9
23 pid R 21 pid R
24 -dumpcommon 22 -dumpcommon
25 -bta 23 -bta
26endefcmd 24endefcmd
27 25
28defcmd dumpcpu "" "Same as dumpall but only tasks on cpus" 26defcmd dumpcpu "" "Same as dumpall but only tasks on cpus"
29 set BTSYMARG 1
30 set BTARGS 9
31 pid R 27 pid R
32 -dumpcommon 28 -dumpcommon
33 -btc 29 -btc
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index dd0b1b7dd02c..d9ca9aa481ec 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -30,6 +30,8 @@ EXPORT_SYMBOL_GPL(kdb_poll_funcs);
30int kdb_poll_idx = 1; 30int kdb_poll_idx = 1;
31EXPORT_SYMBOL_GPL(kdb_poll_idx); 31EXPORT_SYMBOL_GPL(kdb_poll_idx);
32 32
33static struct kgdb_state *kdb_ks;
34
33int kdb_stub(struct kgdb_state *ks) 35int kdb_stub(struct kgdb_state *ks)
34{ 36{
35 int error = 0; 37 int error = 0;
@@ -39,6 +41,7 @@ int kdb_stub(struct kgdb_state *ks)
39 kdb_dbtrap_t db_result = KDB_DB_NOBPT; 41 kdb_dbtrap_t db_result = KDB_DB_NOBPT;
40 int i; 42 int i;
41 43
44 kdb_ks = ks;
42 if (KDB_STATE(REENTRY)) { 45 if (KDB_STATE(REENTRY)) {
43 reason = KDB_REASON_SWITCH; 46 reason = KDB_REASON_SWITCH;
44 KDB_STATE_CLEAR(REENTRY); 47 KDB_STATE_CLEAR(REENTRY);
@@ -123,20 +126,8 @@ int kdb_stub(struct kgdb_state *ks)
123 KDB_STATE_CLEAR(PAGER); 126 KDB_STATE_CLEAR(PAGER);
124 kdbnearsym_cleanup(); 127 kdbnearsym_cleanup();
125 if (error == KDB_CMD_KGDB) { 128 if (error == KDB_CMD_KGDB) {
126 if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) { 129 if (KDB_STATE(DOING_KGDB))
127 /*
128 * This inteface glue which allows kdb to transition in into
129 * the gdb stub. In order to do this the '?' or '' gdb serial
130 * packet response is processed here. And then control is
131 * passed to the gdbstub.
132 */
133 if (KDB_STATE(DOING_KGDB))
134 gdbstub_state(ks, "?");
135 else
136 gdbstub_state(ks, "");
137 KDB_STATE_CLEAR(DOING_KGDB); 130 KDB_STATE_CLEAR(DOING_KGDB);
138 KDB_STATE_CLEAR(DOING_KGDB2);
139 }
140 return DBG_PASS_EVENT; 131 return DBG_PASS_EVENT;
141 } 132 }
142 kdb_bp_install(ks->linux_regs); 133 kdb_bp_install(ks->linux_regs);
@@ -166,3 +157,7 @@ int kdb_stub(struct kgdb_state *ks)
166 return kgdb_info[ks->cpu].ret_state; 157 return kgdb_info[ks->cpu].ret_state;
167} 158}
168 159
160void kdb_gdb_state_pass(char *buf)
161{
162 gdbstub_state(kdb_ks, buf);
163}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 96fdaac46a80..4802eb5840e1 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -31,15 +31,21 @@ char kdb_prompt_str[CMD_BUFLEN];
31 31
32int kdb_trap_printk; 32int kdb_trap_printk;
33 33
34static void kgdb_transition_check(char *buffer) 34static int kgdb_transition_check(char *buffer)
35{ 35{
36 int slen = strlen(buffer); 36 if (buffer[0] != '+' && buffer[0] != '$') {
37 if (strncmp(buffer, "$?#3f", slen) != 0 &&
38 strncmp(buffer, "$qSupported#37", slen) != 0 &&
39 strncmp(buffer, "+$qSupported#37", slen) != 0) {
40 KDB_STATE_SET(KGDB_TRANS); 37 KDB_STATE_SET(KGDB_TRANS);
41 kdb_printf("%s", buffer); 38 kdb_printf("%s", buffer);
39 } else {
40 int slen = strlen(buffer);
41 if (slen > 3 && buffer[slen - 3] == '#') {
42 kdb_gdb_state_pass(buffer);
43 strcpy(buffer, "kgdb");
44 KDB_STATE_SET(DOING_KGDB);
45 return 1;
46 }
42 } 47 }
48 return 0;
43} 49}
44 50
45static int kdb_read_get_key(char *buffer, size_t bufsize) 51static int kdb_read_get_key(char *buffer, size_t bufsize)
@@ -251,6 +257,10 @@ poll_again:
251 case 13: /* enter */ 257 case 13: /* enter */
252 *lastchar++ = '\n'; 258 *lastchar++ = '\n';
253 *lastchar++ = '\0'; 259 *lastchar++ = '\0';
260 if (!KDB_STATE(KGDB_TRANS)) {
261 KDB_STATE_SET(KGDB_TRANS);
262 kdb_printf("%s", buffer);
263 }
254 kdb_printf("\n"); 264 kdb_printf("\n");
255 return buffer; 265 return buffer;
256 case 4: /* Del */ 266 case 4: /* Del */
@@ -382,22 +392,26 @@ poll_again:
382 * printed characters if we think that 392 * printed characters if we think that
383 * kgdb is connecting, until the check 393 * kgdb is connecting, until the check
384 * fails */ 394 * fails */
385 if (!KDB_STATE(KGDB_TRANS)) 395 if (!KDB_STATE(KGDB_TRANS)) {
386 kgdb_transition_check(buffer); 396 if (kgdb_transition_check(buffer))
387 else 397 return buffer;
398 } else {
388 kdb_printf("%c", key); 399 kdb_printf("%c", key);
400 }
389 } 401 }
390 /* Special escape to kgdb */ 402 /* Special escape to kgdb */
391 if (lastchar - buffer >= 5 && 403 if (lastchar - buffer >= 5 &&
392 strcmp(lastchar - 5, "$?#3f") == 0) { 404 strcmp(lastchar - 5, "$?#3f") == 0) {
405 kdb_gdb_state_pass(lastchar - 5);
393 strcpy(buffer, "kgdb"); 406 strcpy(buffer, "kgdb");
394 KDB_STATE_SET(DOING_KGDB); 407 KDB_STATE_SET(DOING_KGDB);
395 return buffer; 408 return buffer;
396 } 409 }
397 if (lastchar - buffer >= 14 && 410 if (lastchar - buffer >= 11 &&
398 strcmp(lastchar - 14, "$qSupported#37") == 0) { 411 strcmp(lastchar - 11, "$qSupported") == 0) {
412 kdb_gdb_state_pass(lastchar - 11);
399 strcpy(buffer, "kgdb"); 413 strcpy(buffer, "kgdb");
400 KDB_STATE_SET(DOING_KGDB2); 414 KDB_STATE_SET(DOING_KGDB);
401 return buffer; 415 return buffer;
402 } 416 }
403 } 417 }
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index be14779bcef6..63786e71a3cd 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -145,7 +145,6 @@ static char *__env[] = {
145#endif 145#endif
146 "RADIX=16", 146 "RADIX=16",
147 "MDCOUNT=8", /* lines of md output */ 147 "MDCOUNT=8", /* lines of md output */
148 "BTARGS=9", /* 9 possible args in bt */
149 KDB_PLATFORM_ENV, 148 KDB_PLATFORM_ENV,
150 "DTABCOUNT=30", 149 "DTABCOUNT=30",
151 "NOSECT=1", 150 "NOSECT=1",
@@ -172,6 +171,7 @@ static char *__env[] = {
172 (char *)0, 171 (char *)0,
173 (char *)0, 172 (char *)0,
174 (char *)0, 173 (char *)0,
174 (char *)0,
175}; 175};
176 176
177static const int __nenv = (sizeof(__env) / sizeof(char *)); 177static const int __nenv = (sizeof(__env) / sizeof(char *));
@@ -1386,7 +1386,7 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1386 } 1386 }
1387 1387
1388 if (result == KDB_CMD_KGDB) { 1388 if (result == KDB_CMD_KGDB) {
1389 if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2))) 1389 if (!KDB_STATE(DOING_KGDB))
1390 kdb_printf("Entering please attach debugger " 1390 kdb_printf("Entering please attach debugger "
1391 "or use $D#44+ or $3#33\n"); 1391 "or use $D#44+ or $3#33\n");
1392 break; 1392 break;
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 35d69ed1dfb5..e381d105b40b 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -21,7 +21,6 @@
21#define KDB_CMD_SS (-1003) 21#define KDB_CMD_SS (-1003)
22#define KDB_CMD_SSB (-1004) 22#define KDB_CMD_SSB (-1004)
23#define KDB_CMD_KGDB (-1005) 23#define KDB_CMD_KGDB (-1005)
24#define KDB_CMD_KGDB2 (-1006)
25 24
26/* Internal debug flags */ 25/* Internal debug flags */
27#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */ 26#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
@@ -146,7 +145,6 @@ extern int kdb_state;
146 * keyboard on this cpu */ 145 * keyboard on this cpu */
147#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */ 146#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */
148#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */ 147#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */
149#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */
150#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */ 148#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */
151#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch 149#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch
152 * specific use */ 150 * specific use */
@@ -218,6 +216,7 @@ extern void kdb_print_nameval(const char *name, unsigned long val);
218extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); 216extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
219extern void kdb_meminfo_proc_show(void); 217extern void kdb_meminfo_proc_show(void);
220extern char *kdb_getstr(char *, size_t, char *); 218extern char *kdb_getstr(char *, size_t, char *);
219extern void kdb_gdb_state_pass(char *buf);
221 220
222/* Defines for kdb_symbol_print */ 221/* Defines for kdb_symbol_print */
223#define KDB_SP_SPACEB 0x0001 /* Space before string */ 222#define KDB_SP_SPACEB 0x0001 /* Space before string */
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index ead9b610aa71..418b3f7053aa 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -19,8 +19,10 @@
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/sysctl.h> 20#include <linux/sysctl.h>
21#include <linux/delayacct.h> 21#include <linux/delayacct.h>
22#include <linux/module.h>
22 23
23int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ 24int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
25EXPORT_SYMBOL_GPL(delayacct_on);
24struct kmem_cache *delayacct_cache; 26struct kmem_cache *delayacct_cache;
25 27
26static int __init delayacct_setup_disable(char *str) 28static int __init delayacct_setup_disable(char *str)
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 1ce23d3d8394..89e5e8aa4c36 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg 2CFLAGS_REMOVE_core.o = -pg
3endif 3endif
4 4
5obj-y := core.o 5obj-y := core.o ring_buffer.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9efe7108ccaf..b8785e26ee1c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,6 +36,8 @@
36#include <linux/ftrace_event.h> 36#include <linux/ftrace_event.h>
37#include <linux/hw_breakpoint.h> 37#include <linux/hw_breakpoint.h>
38 38
39#include "internal.h"
40
39#include <asm/irq_regs.h> 41#include <asm/irq_regs.h>
40 42
41struct remote_function_call { 43struct remote_function_call {
@@ -200,6 +202,22 @@ __get_cpu_context(struct perf_event_context *ctx)
200 return this_cpu_ptr(ctx->pmu->pmu_cpu_context); 202 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
201} 203}
202 204
205static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
206 struct perf_event_context *ctx)
207{
208 raw_spin_lock(&cpuctx->ctx.lock);
209 if (ctx)
210 raw_spin_lock(&ctx->lock);
211}
212
213static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
214 struct perf_event_context *ctx)
215{
216 if (ctx)
217 raw_spin_unlock(&ctx->lock);
218 raw_spin_unlock(&cpuctx->ctx.lock);
219}
220
203#ifdef CONFIG_CGROUP_PERF 221#ifdef CONFIG_CGROUP_PERF
204 222
205/* 223/*
@@ -340,11 +358,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
340 rcu_read_lock(); 358 rcu_read_lock();
341 359
342 list_for_each_entry_rcu(pmu, &pmus, entry) { 360 list_for_each_entry_rcu(pmu, &pmus, entry) {
343
344 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 361 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
345 362
346 perf_pmu_disable(cpuctx->ctx.pmu);
347
348 /* 363 /*
349 * perf_cgroup_events says at least one 364 * perf_cgroup_events says at least one
350 * context on this CPU has cgroup events. 365 * context on this CPU has cgroup events.
@@ -353,6 +368,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
353 * events for a context. 368 * events for a context.
354 */ 369 */
355 if (cpuctx->ctx.nr_cgroups > 0) { 370 if (cpuctx->ctx.nr_cgroups > 0) {
371 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
372 perf_pmu_disable(cpuctx->ctx.pmu);
356 373
357 if (mode & PERF_CGROUP_SWOUT) { 374 if (mode & PERF_CGROUP_SWOUT) {
358 cpu_ctx_sched_out(cpuctx, EVENT_ALL); 375 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
@@ -372,9 +389,9 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
372 cpuctx->cgrp = perf_cgroup_from_task(task); 389 cpuctx->cgrp = perf_cgroup_from_task(task);
373 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); 390 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
374 } 391 }
392 perf_pmu_enable(cpuctx->ctx.pmu);
393 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
375 } 394 }
376
377 perf_pmu_enable(cpuctx->ctx.pmu);
378 } 395 }
379 396
380 rcu_read_unlock(); 397 rcu_read_unlock();
@@ -731,6 +748,7 @@ static u64 perf_event_time(struct perf_event *event)
731 748
732/* 749/*
733 * Update the total_time_enabled and total_time_running fields for a event. 750 * Update the total_time_enabled and total_time_running fields for a event.
751 * The caller of this function needs to hold the ctx->lock.
734 */ 752 */
735static void update_event_times(struct perf_event *event) 753static void update_event_times(struct perf_event *event)
736{ 754{
@@ -1105,6 +1123,10 @@ static int __perf_remove_from_context(void *info)
1105 raw_spin_lock(&ctx->lock); 1123 raw_spin_lock(&ctx->lock);
1106 event_sched_out(event, cpuctx, ctx); 1124 event_sched_out(event, cpuctx, ctx);
1107 list_del_event(event, ctx); 1125 list_del_event(event, ctx);
1126 if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1127 ctx->is_active = 0;
1128 cpuctx->task_ctx = NULL;
1129 }
1108 raw_spin_unlock(&ctx->lock); 1130 raw_spin_unlock(&ctx->lock);
1109 1131
1110 return 0; 1132 return 0;
@@ -1454,8 +1476,24 @@ static void add_event_to_ctx(struct perf_event *event,
1454 event->tstamp_stopped = tstamp; 1476 event->tstamp_stopped = tstamp;
1455} 1477}
1456 1478
1457static void perf_event_context_sched_in(struct perf_event_context *ctx, 1479static void task_ctx_sched_out(struct perf_event_context *ctx);
1458 struct task_struct *tsk); 1480static void
1481ctx_sched_in(struct perf_event_context *ctx,
1482 struct perf_cpu_context *cpuctx,
1483 enum event_type_t event_type,
1484 struct task_struct *task);
1485
1486static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
1487 struct perf_event_context *ctx,
1488 struct task_struct *task)
1489{
1490 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
1491 if (ctx)
1492 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1493 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1494 if (ctx)
1495 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1496}
1459 1497
1460/* 1498/*
1461 * Cross CPU call to install and enable a performance event 1499 * Cross CPU call to install and enable a performance event
@@ -1466,20 +1504,37 @@ static int __perf_install_in_context(void *info)
1466{ 1504{
1467 struct perf_event *event = info; 1505 struct perf_event *event = info;
1468 struct perf_event_context *ctx = event->ctx; 1506 struct perf_event_context *ctx = event->ctx;
1469 struct perf_event *leader = event->group_leader;
1470 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1507 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1471 int err; 1508 struct perf_event_context *task_ctx = cpuctx->task_ctx;
1509 struct task_struct *task = current;
1510
1511 perf_ctx_lock(cpuctx, task_ctx);
1512 perf_pmu_disable(cpuctx->ctx.pmu);
1472 1513
1473 /* 1514 /*
1474 * In case we're installing a new context to an already running task, 1515 * If there was an active task_ctx schedule it out.
1475 * could also happen before perf_event_task_sched_in() on architectures
1476 * which do context switches with IRQs enabled.
1477 */ 1516 */
1478 if (ctx->task && !cpuctx->task_ctx) 1517 if (task_ctx)
1479 perf_event_context_sched_in(ctx, ctx->task); 1518 task_ctx_sched_out(task_ctx);
1519
1520 /*
1521 * If the context we're installing events in is not the
1522 * active task_ctx, flip them.
1523 */
1524 if (ctx->task && task_ctx != ctx) {
1525 if (task_ctx)
1526 raw_spin_unlock(&task_ctx->lock);
1527 raw_spin_lock(&ctx->lock);
1528 task_ctx = ctx;
1529 }
1530
1531 if (task_ctx) {
1532 cpuctx->task_ctx = task_ctx;
1533 task = task_ctx->task;
1534 }
1535
1536 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
1480 1537
1481 raw_spin_lock(&ctx->lock);
1482 ctx->is_active = 1;
1483 update_context_time(ctx); 1538 update_context_time(ctx);
1484 /* 1539 /*
1485 * update cgrp time only if current cgrp 1540 * update cgrp time only if current cgrp
@@ -1490,43 +1545,13 @@ static int __perf_install_in_context(void *info)
1490 1545
1491 add_event_to_ctx(event, ctx); 1546 add_event_to_ctx(event, ctx);
1492 1547
1493 if (!event_filter_match(event))
1494 goto unlock;
1495
1496 /*
1497 * Don't put the event on if it is disabled or if
1498 * it is in a group and the group isn't on.
1499 */
1500 if (event->state != PERF_EVENT_STATE_INACTIVE ||
1501 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
1502 goto unlock;
1503
1504 /* 1548 /*
1505 * An exclusive event can't go on if there are already active 1549 * Schedule everything back in
1506 * hardware events, and no hardware event can go on if there
1507 * is already an exclusive event on.
1508 */ 1550 */
1509 if (!group_can_go_on(event, cpuctx, 1)) 1551 perf_event_sched_in(cpuctx, task_ctx, task);
1510 err = -EEXIST;
1511 else
1512 err = event_sched_in(event, cpuctx, ctx);
1513
1514 if (err) {
1515 /*
1516 * This event couldn't go on. If it is in a group
1517 * then we have to pull the whole group off.
1518 * If the event group is pinned then put it in error state.
1519 */
1520 if (leader != event)
1521 group_sched_out(leader, cpuctx, ctx);
1522 if (leader->attr.pinned) {
1523 update_group_times(leader);
1524 leader->state = PERF_EVENT_STATE_ERROR;
1525 }
1526 }
1527 1552
1528unlock: 1553 perf_pmu_enable(cpuctx->ctx.pmu);
1529 raw_spin_unlock(&ctx->lock); 1554 perf_ctx_unlock(cpuctx, task_ctx);
1530 1555
1531 return 0; 1556 return 0;
1532} 1557}
@@ -1739,7 +1764,7 @@ out:
1739 raw_spin_unlock_irq(&ctx->lock); 1764 raw_spin_unlock_irq(&ctx->lock);
1740} 1765}
1741 1766
1742static int perf_event_refresh(struct perf_event *event, int refresh) 1767int perf_event_refresh(struct perf_event *event, int refresh)
1743{ 1768{
1744 /* 1769 /*
1745 * not supported on inherited events 1770 * not supported on inherited events
@@ -1752,36 +1777,35 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1752 1777
1753 return 0; 1778 return 0;
1754} 1779}
1780EXPORT_SYMBOL_GPL(perf_event_refresh);
1755 1781
1756static void ctx_sched_out(struct perf_event_context *ctx, 1782static void ctx_sched_out(struct perf_event_context *ctx,
1757 struct perf_cpu_context *cpuctx, 1783 struct perf_cpu_context *cpuctx,
1758 enum event_type_t event_type) 1784 enum event_type_t event_type)
1759{ 1785{
1760 struct perf_event *event; 1786 struct perf_event *event;
1787 int is_active = ctx->is_active;
1761 1788
1762 raw_spin_lock(&ctx->lock); 1789 ctx->is_active &= ~event_type;
1763 perf_pmu_disable(ctx->pmu);
1764 ctx->is_active = 0;
1765 if (likely(!ctx->nr_events)) 1790 if (likely(!ctx->nr_events))
1766 goto out; 1791 return;
1792
1767 update_context_time(ctx); 1793 update_context_time(ctx);
1768 update_cgrp_time_from_cpuctx(cpuctx); 1794 update_cgrp_time_from_cpuctx(cpuctx);
1769
1770 if (!ctx->nr_active) 1795 if (!ctx->nr_active)
1771 goto out; 1796 return;
1772 1797
1773 if (event_type & EVENT_PINNED) { 1798 perf_pmu_disable(ctx->pmu);
1799 if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
1774 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 1800 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1775 group_sched_out(event, cpuctx, ctx); 1801 group_sched_out(event, cpuctx, ctx);
1776 } 1802 }
1777 1803
1778 if (event_type & EVENT_FLEXIBLE) { 1804 if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
1779 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 1805 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1780 group_sched_out(event, cpuctx, ctx); 1806 group_sched_out(event, cpuctx, ctx);
1781 } 1807 }
1782out:
1783 perf_pmu_enable(ctx->pmu); 1808 perf_pmu_enable(ctx->pmu);
1784 raw_spin_unlock(&ctx->lock);
1785} 1809}
1786 1810
1787/* 1811/*
@@ -1929,8 +1953,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1929 rcu_read_unlock(); 1953 rcu_read_unlock();
1930 1954
1931 if (do_switch) { 1955 if (do_switch) {
1956 raw_spin_lock(&ctx->lock);
1932 ctx_sched_out(ctx, cpuctx, EVENT_ALL); 1957 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1933 cpuctx->task_ctx = NULL; 1958 cpuctx->task_ctx = NULL;
1959 raw_spin_unlock(&ctx->lock);
1934 } 1960 }
1935} 1961}
1936 1962
@@ -1965,8 +1991,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
1965 perf_cgroup_sched_out(task); 1991 perf_cgroup_sched_out(task);
1966} 1992}
1967 1993
1968static void task_ctx_sched_out(struct perf_event_context *ctx, 1994static void task_ctx_sched_out(struct perf_event_context *ctx)
1969 enum event_type_t event_type)
1970{ 1995{
1971 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1996 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1972 1997
@@ -1976,7 +2001,7 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
1976 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 2001 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1977 return; 2002 return;
1978 2003
1979 ctx_sched_out(ctx, cpuctx, event_type); 2004 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1980 cpuctx->task_ctx = NULL; 2005 cpuctx->task_ctx = NULL;
1981} 2006}
1982 2007
@@ -2055,11 +2080,11 @@ ctx_sched_in(struct perf_event_context *ctx,
2055 struct task_struct *task) 2080 struct task_struct *task)
2056{ 2081{
2057 u64 now; 2082 u64 now;
2083 int is_active = ctx->is_active;
2058 2084
2059 raw_spin_lock(&ctx->lock); 2085 ctx->is_active |= event_type;
2060 ctx->is_active = 1;
2061 if (likely(!ctx->nr_events)) 2086 if (likely(!ctx->nr_events))
2062 goto out; 2087 return;
2063 2088
2064 now = perf_clock(); 2089 now = perf_clock();
2065 ctx->timestamp = now; 2090 ctx->timestamp = now;
@@ -2068,15 +2093,12 @@ ctx_sched_in(struct perf_event_context *ctx,
2068 * First go through the list and put on any pinned groups 2093 * First go through the list and put on any pinned groups
2069 * in order to give them the best chance of going on. 2094 * in order to give them the best chance of going on.
2070 */ 2095 */
2071 if (event_type & EVENT_PINNED) 2096 if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2072 ctx_pinned_sched_in(ctx, cpuctx); 2097 ctx_pinned_sched_in(ctx, cpuctx);
2073 2098
2074 /* Then walk through the lower prio flexible groups */ 2099 /* Then walk through the lower prio flexible groups */
2075 if (event_type & EVENT_FLEXIBLE) 2100 if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2076 ctx_flexible_sched_in(ctx, cpuctx); 2101 ctx_flexible_sched_in(ctx, cpuctx);
2077
2078out:
2079 raw_spin_unlock(&ctx->lock);
2080} 2102}
2081 2103
2082static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2104static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
@@ -2088,19 +2110,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2088 ctx_sched_in(ctx, cpuctx, event_type, task); 2110 ctx_sched_in(ctx, cpuctx, event_type, task);
2089} 2111}
2090 2112
2091static void task_ctx_sched_in(struct perf_event_context *ctx,
2092 enum event_type_t event_type)
2093{
2094 struct perf_cpu_context *cpuctx;
2095
2096 cpuctx = __get_cpu_context(ctx);
2097 if (cpuctx->task_ctx == ctx)
2098 return;
2099
2100 ctx_sched_in(ctx, cpuctx, event_type, NULL);
2101 cpuctx->task_ctx = ctx;
2102}
2103
2104static void perf_event_context_sched_in(struct perf_event_context *ctx, 2113static void perf_event_context_sched_in(struct perf_event_context *ctx,
2105 struct task_struct *task) 2114 struct task_struct *task)
2106{ 2115{
@@ -2110,6 +2119,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2110 if (cpuctx->task_ctx == ctx) 2119 if (cpuctx->task_ctx == ctx)
2111 return; 2120 return;
2112 2121
2122 perf_ctx_lock(cpuctx, ctx);
2113 perf_pmu_disable(ctx->pmu); 2123 perf_pmu_disable(ctx->pmu);
2114 /* 2124 /*
2115 * We want to keep the following priority order: 2125 * We want to keep the following priority order:
@@ -2118,18 +2128,18 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2118 */ 2128 */
2119 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2129 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2120 2130
2121 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); 2131 perf_event_sched_in(cpuctx, ctx, task);
2122 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2123 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2124 2132
2125 cpuctx->task_ctx = ctx; 2133 cpuctx->task_ctx = ctx;
2126 2134
2135 perf_pmu_enable(ctx->pmu);
2136 perf_ctx_unlock(cpuctx, ctx);
2137
2127 /* 2138 /*
2128 * Since these rotations are per-cpu, we need to ensure the 2139 * Since these rotations are per-cpu, we need to ensure the
2129 * cpu-context we got scheduled on is actually rotating. 2140 * cpu-context we got scheduled on is actually rotating.
2130 */ 2141 */
2131 perf_pmu_rotate_start(ctx->pmu); 2142 perf_pmu_rotate_start(ctx->pmu);
2132 perf_pmu_enable(ctx->pmu);
2133} 2143}
2134 2144
2135/* 2145/*
@@ -2269,7 +2279,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2269 u64 interrupts, now; 2279 u64 interrupts, now;
2270 s64 delta; 2280 s64 delta;
2271 2281
2272 raw_spin_lock(&ctx->lock);
2273 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 2282 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2274 if (event->state != PERF_EVENT_STATE_ACTIVE) 2283 if (event->state != PERF_EVENT_STATE_ACTIVE)
2275 continue; 2284 continue;
@@ -2301,7 +2310,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2301 if (delta > 0) 2310 if (delta > 0)
2302 perf_adjust_period(event, period, delta); 2311 perf_adjust_period(event, period, delta);
2303 } 2312 }
2304 raw_spin_unlock(&ctx->lock);
2305} 2313}
2306 2314
2307/* 2315/*
@@ -2309,16 +2317,12 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2309 */ 2317 */
2310static void rotate_ctx(struct perf_event_context *ctx) 2318static void rotate_ctx(struct perf_event_context *ctx)
2311{ 2319{
2312 raw_spin_lock(&ctx->lock);
2313
2314 /* 2320 /*
2315 * Rotate the first entry last of non-pinned groups. Rotation might be 2321 * Rotate the first entry last of non-pinned groups. Rotation might be
2316 * disabled by the inheritance code. 2322 * disabled by the inheritance code.
2317 */ 2323 */
2318 if (!ctx->rotate_disable) 2324 if (!ctx->rotate_disable)
2319 list_rotate_left(&ctx->flexible_groups); 2325 list_rotate_left(&ctx->flexible_groups);
2320
2321 raw_spin_unlock(&ctx->lock);
2322} 2326}
2323 2327
2324/* 2328/*
@@ -2345,6 +2349,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2345 rotate = 1; 2349 rotate = 1;
2346 } 2350 }
2347 2351
2352 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2348 perf_pmu_disable(cpuctx->ctx.pmu); 2353 perf_pmu_disable(cpuctx->ctx.pmu);
2349 perf_ctx_adjust_freq(&cpuctx->ctx, interval); 2354 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2350 if (ctx) 2355 if (ctx)
@@ -2355,21 +2360,20 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2355 2360
2356 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2361 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2357 if (ctx) 2362 if (ctx)
2358 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 2363 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2359 2364
2360 rotate_ctx(&cpuctx->ctx); 2365 rotate_ctx(&cpuctx->ctx);
2361 if (ctx) 2366 if (ctx)
2362 rotate_ctx(ctx); 2367 rotate_ctx(ctx);
2363 2368
2364 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); 2369 perf_event_sched_in(cpuctx, ctx, current);
2365 if (ctx)
2366 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
2367 2370
2368done: 2371done:
2369 if (remove) 2372 if (remove)
2370 list_del_init(&cpuctx->rotation_list); 2373 list_del_init(&cpuctx->rotation_list);
2371 2374
2372 perf_pmu_enable(cpuctx->ctx.pmu); 2375 perf_pmu_enable(cpuctx->ctx.pmu);
2376 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2373} 2377}
2374 2378
2375void perf_event_task_tick(void) 2379void perf_event_task_tick(void)
@@ -2424,9 +2428,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2424 * in. 2428 * in.
2425 */ 2429 */
2426 perf_cgroup_sched_out(current); 2430 perf_cgroup_sched_out(current);
2427 task_ctx_sched_out(ctx, EVENT_ALL);
2428 2431
2429 raw_spin_lock(&ctx->lock); 2432 raw_spin_lock(&ctx->lock);
2433 task_ctx_sched_out(ctx);
2430 2434
2431 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 2435 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2432 ret = event_enable_on_exec(event, ctx); 2436 ret = event_enable_on_exec(event, ctx);
@@ -2835,16 +2839,12 @@ retry:
2835 unclone_ctx(ctx); 2839 unclone_ctx(ctx);
2836 ++ctx->pin_count; 2840 ++ctx->pin_count;
2837 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2841 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2838 } 2842 } else {
2839
2840 if (!ctx) {
2841 ctx = alloc_perf_context(pmu, task); 2843 ctx = alloc_perf_context(pmu, task);
2842 err = -ENOMEM; 2844 err = -ENOMEM;
2843 if (!ctx) 2845 if (!ctx)
2844 goto errout; 2846 goto errout;
2845 2847
2846 get_ctx(ctx);
2847
2848 err = 0; 2848 err = 0;
2849 mutex_lock(&task->perf_event_mutex); 2849 mutex_lock(&task->perf_event_mutex);
2850 /* 2850 /*
@@ -2856,14 +2856,14 @@ retry:
2856 else if (task->perf_event_ctxp[ctxn]) 2856 else if (task->perf_event_ctxp[ctxn])
2857 err = -EAGAIN; 2857 err = -EAGAIN;
2858 else { 2858 else {
2859 get_ctx(ctx);
2859 ++ctx->pin_count; 2860 ++ctx->pin_count;
2860 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 2861 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2861 } 2862 }
2862 mutex_unlock(&task->perf_event_mutex); 2863 mutex_unlock(&task->perf_event_mutex);
2863 2864
2864 if (unlikely(err)) { 2865 if (unlikely(err)) {
2865 put_task_struct(task); 2866 put_ctx(ctx);
2866 kfree(ctx);
2867 2867
2868 if (err == -EAGAIN) 2868 if (err == -EAGAIN)
2869 goto retry; 2869 goto retry;
@@ -2890,7 +2890,7 @@ static void free_event_rcu(struct rcu_head *head)
2890 kfree(event); 2890 kfree(event);
2891} 2891}
2892 2892
2893static void perf_buffer_put(struct perf_buffer *buffer); 2893static void ring_buffer_put(struct ring_buffer *rb);
2894 2894
2895static void free_event(struct perf_event *event) 2895static void free_event(struct perf_event *event)
2896{ 2896{
@@ -2913,9 +2913,9 @@ static void free_event(struct perf_event *event)
2913 } 2913 }
2914 } 2914 }
2915 2915
2916 if (event->buffer) { 2916 if (event->rb) {
2917 perf_buffer_put(event->buffer); 2917 ring_buffer_put(event->rb);
2918 event->buffer = NULL; 2918 event->rb = NULL;
2919 } 2919 }
2920 2920
2921 if (is_cgroup_event(event)) 2921 if (is_cgroup_event(event))
@@ -2934,12 +2934,6 @@ int perf_event_release_kernel(struct perf_event *event)
2934{ 2934{
2935 struct perf_event_context *ctx = event->ctx; 2935 struct perf_event_context *ctx = event->ctx;
2936 2936
2937 /*
2938 * Remove from the PMU, can't get re-enabled since we got
2939 * here because the last ref went.
2940 */
2941 perf_event_disable(event);
2942
2943 WARN_ON_ONCE(ctx->parent_ctx); 2937 WARN_ON_ONCE(ctx->parent_ctx);
2944 /* 2938 /*
2945 * There are two ways this annotation is useful: 2939 * There are two ways this annotation is useful:
@@ -2956,8 +2950,8 @@ int perf_event_release_kernel(struct perf_event *event)
2956 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); 2950 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
2957 raw_spin_lock_irq(&ctx->lock); 2951 raw_spin_lock_irq(&ctx->lock);
2958 perf_group_detach(event); 2952 perf_group_detach(event);
2959 list_del_event(event, ctx);
2960 raw_spin_unlock_irq(&ctx->lock); 2953 raw_spin_unlock_irq(&ctx->lock);
2954 perf_remove_from_context(event);
2961 mutex_unlock(&ctx->mutex); 2955 mutex_unlock(&ctx->mutex);
2962 2956
2963 free_event(event); 2957 free_event(event);
@@ -3149,13 +3143,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3149static unsigned int perf_poll(struct file *file, poll_table *wait) 3143static unsigned int perf_poll(struct file *file, poll_table *wait)
3150{ 3144{
3151 struct perf_event *event = file->private_data; 3145 struct perf_event *event = file->private_data;
3152 struct perf_buffer *buffer; 3146 struct ring_buffer *rb;
3153 unsigned int events = POLL_HUP; 3147 unsigned int events = POLL_HUP;
3154 3148
3155 rcu_read_lock(); 3149 rcu_read_lock();
3156 buffer = rcu_dereference(event->buffer); 3150 rb = rcu_dereference(event->rb);
3157 if (buffer) 3151 if (rb)
3158 events = atomic_xchg(&buffer->poll, 0); 3152 events = atomic_xchg(&rb->poll, 0);
3159 rcu_read_unlock(); 3153 rcu_read_unlock();
3160 3154
3161 poll_wait(file, &event->waitq, wait); 3155 poll_wait(file, &event->waitq, wait);
@@ -3358,6 +3352,18 @@ static int perf_event_index(struct perf_event *event)
3358 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; 3352 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
3359} 3353}
3360 3354
3355static void calc_timer_values(struct perf_event *event,
3356 u64 *running,
3357 u64 *enabled)
3358{
3359 u64 now, ctx_time;
3360
3361 now = perf_clock();
3362 ctx_time = event->shadow_ctx_time + now;
3363 *enabled = ctx_time - event->tstamp_enabled;
3364 *running = ctx_time - event->tstamp_running;
3365}
3366
3361/* 3367/*
3362 * Callers need to ensure there can be no nesting of this function, otherwise 3368 * Callers need to ensure there can be no nesting of this function, otherwise
3363 * the seqlock logic goes bad. We can not serialize this because the arch 3369 * the seqlock logic goes bad. We can not serialize this because the arch
@@ -3366,14 +3372,25 @@ static int perf_event_index(struct perf_event *event)
3366void perf_event_update_userpage(struct perf_event *event) 3372void perf_event_update_userpage(struct perf_event *event)
3367{ 3373{
3368 struct perf_event_mmap_page *userpg; 3374 struct perf_event_mmap_page *userpg;
3369 struct perf_buffer *buffer; 3375 struct ring_buffer *rb;
3376 u64 enabled, running;
3370 3377
3371 rcu_read_lock(); 3378 rcu_read_lock();
3372 buffer = rcu_dereference(event->buffer); 3379 /*
3373 if (!buffer) 3380 * compute total_time_enabled, total_time_running
3381 * based on snapshot values taken when the event
3382 * was last scheduled in.
3383 *
3384 * we cannot simply called update_context_time()
3385 * because of locking issue as we can be called in
3386 * NMI context
3387 */
3388 calc_timer_values(event, &enabled, &running);
3389 rb = rcu_dereference(event->rb);
3390 if (!rb)
3374 goto unlock; 3391 goto unlock;
3375 3392
3376 userpg = buffer->user_page; 3393 userpg = rb->user_page;
3377 3394
3378 /* 3395 /*
3379 * Disable preemption so as to not let the corresponding user-space 3396 * Disable preemption so as to not let the corresponding user-space
@@ -3387,10 +3404,10 @@ void perf_event_update_userpage(struct perf_event *event)
3387 if (event->state == PERF_EVENT_STATE_ACTIVE) 3404 if (event->state == PERF_EVENT_STATE_ACTIVE)
3388 userpg->offset -= local64_read(&event->hw.prev_count); 3405 userpg->offset -= local64_read(&event->hw.prev_count);
3389 3406
3390 userpg->time_enabled = event->total_time_enabled + 3407 userpg->time_enabled = enabled +
3391 atomic64_read(&event->child_total_time_enabled); 3408 atomic64_read(&event->child_total_time_enabled);
3392 3409
3393 userpg->time_running = event->total_time_running + 3410 userpg->time_running = running +
3394 atomic64_read(&event->child_total_time_running); 3411 atomic64_read(&event->child_total_time_running);
3395 3412
3396 barrier(); 3413 barrier();
@@ -3400,220 +3417,10 @@ unlock:
3400 rcu_read_unlock(); 3417 rcu_read_unlock();
3401} 3418}
3402 3419
3403static unsigned long perf_data_size(struct perf_buffer *buffer);
3404
3405static void
3406perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
3407{
3408 long max_size = perf_data_size(buffer);
3409
3410 if (watermark)
3411 buffer->watermark = min(max_size, watermark);
3412
3413 if (!buffer->watermark)
3414 buffer->watermark = max_size / 2;
3415
3416 if (flags & PERF_BUFFER_WRITABLE)
3417 buffer->writable = 1;
3418
3419 atomic_set(&buffer->refcount, 1);
3420}
3421
3422#ifndef CONFIG_PERF_USE_VMALLOC
3423
3424/*
3425 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
3426 */
3427
3428static struct page *
3429perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
3430{
3431 if (pgoff > buffer->nr_pages)
3432 return NULL;
3433
3434 if (pgoff == 0)
3435 return virt_to_page(buffer->user_page);
3436
3437 return virt_to_page(buffer->data_pages[pgoff - 1]);
3438}
3439
3440static void *perf_mmap_alloc_page(int cpu)
3441{
3442 struct page *page;
3443 int node;
3444
3445 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
3446 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
3447 if (!page)
3448 return NULL;
3449
3450 return page_address(page);
3451}
3452
3453static struct perf_buffer *
3454perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
3455{
3456 struct perf_buffer *buffer;
3457 unsigned long size;
3458 int i;
3459
3460 size = sizeof(struct perf_buffer);
3461 size += nr_pages * sizeof(void *);
3462
3463 buffer = kzalloc(size, GFP_KERNEL);
3464 if (!buffer)
3465 goto fail;
3466
3467 buffer->user_page = perf_mmap_alloc_page(cpu);
3468 if (!buffer->user_page)
3469 goto fail_user_page;
3470
3471 for (i = 0; i < nr_pages; i++) {
3472 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
3473 if (!buffer->data_pages[i])
3474 goto fail_data_pages;
3475 }
3476
3477 buffer->nr_pages = nr_pages;
3478
3479 perf_buffer_init(buffer, watermark, flags);
3480
3481 return buffer;
3482
3483fail_data_pages:
3484 for (i--; i >= 0; i--)
3485 free_page((unsigned long)buffer->data_pages[i]);
3486
3487 free_page((unsigned long)buffer->user_page);
3488
3489fail_user_page:
3490 kfree(buffer);
3491
3492fail:
3493 return NULL;
3494}
3495
3496static void perf_mmap_free_page(unsigned long addr)
3497{
3498 struct page *page = virt_to_page((void *)addr);
3499
3500 page->mapping = NULL;
3501 __free_page(page);
3502}
3503
3504static void perf_buffer_free(struct perf_buffer *buffer)
3505{
3506 int i;
3507
3508 perf_mmap_free_page((unsigned long)buffer->user_page);
3509 for (i = 0; i < buffer->nr_pages; i++)
3510 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
3511 kfree(buffer);
3512}
3513
3514static inline int page_order(struct perf_buffer *buffer)
3515{
3516 return 0;
3517}
3518
3519#else
3520
3521/*
3522 * Back perf_mmap() with vmalloc memory.
3523 *
3524 * Required for architectures that have d-cache aliasing issues.
3525 */
3526
3527static inline int page_order(struct perf_buffer *buffer)
3528{
3529 return buffer->page_order;
3530}
3531
3532static struct page *
3533perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
3534{
3535 if (pgoff > (1UL << page_order(buffer)))
3536 return NULL;
3537
3538 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
3539}
3540
3541static void perf_mmap_unmark_page(void *addr)
3542{
3543 struct page *page = vmalloc_to_page(addr);
3544
3545 page->mapping = NULL;
3546}
3547
3548static void perf_buffer_free_work(struct work_struct *work)
3549{
3550 struct perf_buffer *buffer;
3551 void *base;
3552 int i, nr;
3553
3554 buffer = container_of(work, struct perf_buffer, work);
3555 nr = 1 << page_order(buffer);
3556
3557 base = buffer->user_page;
3558 for (i = 0; i < nr + 1; i++)
3559 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
3560
3561 vfree(base);
3562 kfree(buffer);
3563}
3564
3565static void perf_buffer_free(struct perf_buffer *buffer)
3566{
3567 schedule_work(&buffer->work);
3568}
3569
3570static struct perf_buffer *
3571perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
3572{
3573 struct perf_buffer *buffer;
3574 unsigned long size;
3575 void *all_buf;
3576
3577 size = sizeof(struct perf_buffer);
3578 size += sizeof(void *);
3579
3580 buffer = kzalloc(size, GFP_KERNEL);
3581 if (!buffer)
3582 goto fail;
3583
3584 INIT_WORK(&buffer->work, perf_buffer_free_work);
3585
3586 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
3587 if (!all_buf)
3588 goto fail_all_buf;
3589
3590 buffer->user_page = all_buf;
3591 buffer->data_pages[0] = all_buf + PAGE_SIZE;
3592 buffer->page_order = ilog2(nr_pages);
3593 buffer->nr_pages = 1;
3594
3595 perf_buffer_init(buffer, watermark, flags);
3596
3597 return buffer;
3598
3599fail_all_buf:
3600 kfree(buffer);
3601
3602fail:
3603 return NULL;
3604}
3605
3606#endif
3607
3608static unsigned long perf_data_size(struct perf_buffer *buffer)
3609{
3610 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
3611}
3612
3613static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 3420static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3614{ 3421{
3615 struct perf_event *event = vma->vm_file->private_data; 3422 struct perf_event *event = vma->vm_file->private_data;
3616 struct perf_buffer *buffer; 3423 struct ring_buffer *rb;
3617 int ret = VM_FAULT_SIGBUS; 3424 int ret = VM_FAULT_SIGBUS;
3618 3425
3619 if (vmf->flags & FAULT_FLAG_MKWRITE) { 3426 if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -3623,14 +3430,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3623 } 3430 }
3624 3431
3625 rcu_read_lock(); 3432 rcu_read_lock();
3626 buffer = rcu_dereference(event->buffer); 3433 rb = rcu_dereference(event->rb);
3627 if (!buffer) 3434 if (!rb)
3628 goto unlock; 3435 goto unlock;
3629 3436
3630 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) 3437 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3631 goto unlock; 3438 goto unlock;
3632 3439
3633 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); 3440 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
3634 if (!vmf->page) 3441 if (!vmf->page)
3635 goto unlock; 3442 goto unlock;
3636 3443
@@ -3645,35 +3452,35 @@ unlock:
3645 return ret; 3452 return ret;
3646} 3453}
3647 3454
3648static void perf_buffer_free_rcu(struct rcu_head *rcu_head) 3455static void rb_free_rcu(struct rcu_head *rcu_head)
3649{ 3456{
3650 struct perf_buffer *buffer; 3457 struct ring_buffer *rb;
3651 3458
3652 buffer = container_of(rcu_head, struct perf_buffer, rcu_head); 3459 rb = container_of(rcu_head, struct ring_buffer, rcu_head);
3653 perf_buffer_free(buffer); 3460 rb_free(rb);
3654} 3461}
3655 3462
3656static struct perf_buffer *perf_buffer_get(struct perf_event *event) 3463static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3657{ 3464{
3658 struct perf_buffer *buffer; 3465 struct ring_buffer *rb;
3659 3466
3660 rcu_read_lock(); 3467 rcu_read_lock();
3661 buffer = rcu_dereference(event->buffer); 3468 rb = rcu_dereference(event->rb);
3662 if (buffer) { 3469 if (rb) {
3663 if (!atomic_inc_not_zero(&buffer->refcount)) 3470 if (!atomic_inc_not_zero(&rb->refcount))
3664 buffer = NULL; 3471 rb = NULL;
3665 } 3472 }
3666 rcu_read_unlock(); 3473 rcu_read_unlock();
3667 3474
3668 return buffer; 3475 return rb;
3669} 3476}
3670 3477
3671static void perf_buffer_put(struct perf_buffer *buffer) 3478static void ring_buffer_put(struct ring_buffer *rb)
3672{ 3479{
3673 if (!atomic_dec_and_test(&buffer->refcount)) 3480 if (!atomic_dec_and_test(&rb->refcount))
3674 return; 3481 return;
3675 3482
3676 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); 3483 call_rcu(&rb->rcu_head, rb_free_rcu);
3677} 3484}
3678 3485
3679static void perf_mmap_open(struct vm_area_struct *vma) 3486static void perf_mmap_open(struct vm_area_struct *vma)
@@ -3688,16 +3495,16 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3688 struct perf_event *event = vma->vm_file->private_data; 3495 struct perf_event *event = vma->vm_file->private_data;
3689 3496
3690 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 3497 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
3691 unsigned long size = perf_data_size(event->buffer); 3498 unsigned long size = perf_data_size(event->rb);
3692 struct user_struct *user = event->mmap_user; 3499 struct user_struct *user = event->mmap_user;
3693 struct perf_buffer *buffer = event->buffer; 3500 struct ring_buffer *rb = event->rb;
3694 3501
3695 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3502 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3696 vma->vm_mm->locked_vm -= event->mmap_locked; 3503 vma->vm_mm->locked_vm -= event->mmap_locked;
3697 rcu_assign_pointer(event->buffer, NULL); 3504 rcu_assign_pointer(event->rb, NULL);
3698 mutex_unlock(&event->mmap_mutex); 3505 mutex_unlock(&event->mmap_mutex);
3699 3506
3700 perf_buffer_put(buffer); 3507 ring_buffer_put(rb);
3701 free_uid(user); 3508 free_uid(user);
3702 } 3509 }
3703} 3510}
@@ -3715,7 +3522,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3715 unsigned long user_locked, user_lock_limit; 3522 unsigned long user_locked, user_lock_limit;
3716 struct user_struct *user = current_user(); 3523 struct user_struct *user = current_user();
3717 unsigned long locked, lock_limit; 3524 unsigned long locked, lock_limit;
3718 struct perf_buffer *buffer; 3525 struct ring_buffer *rb;
3719 unsigned long vma_size; 3526 unsigned long vma_size;
3720 unsigned long nr_pages; 3527 unsigned long nr_pages;
3721 long user_extra, extra; 3528 long user_extra, extra;
@@ -3724,7 +3531,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3724 /* 3531 /*
3725 * Don't allow mmap() of inherited per-task counters. This would 3532 * Don't allow mmap() of inherited per-task counters. This would
3726 * create a performance issue due to all children writing to the 3533 * create a performance issue due to all children writing to the
3727 * same buffer. 3534 * same rb.
3728 */ 3535 */
3729 if (event->cpu == -1 && event->attr.inherit) 3536 if (event->cpu == -1 && event->attr.inherit)
3730 return -EINVAL; 3537 return -EINVAL;
@@ -3736,7 +3543,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3736 nr_pages = (vma_size / PAGE_SIZE) - 1; 3543 nr_pages = (vma_size / PAGE_SIZE) - 1;
3737 3544
3738 /* 3545 /*
3739 * If we have buffer pages ensure they're a power-of-two number, so we 3546 * If we have rb pages ensure they're a power-of-two number, so we
3740 * can do bitmasks instead of modulo. 3547 * can do bitmasks instead of modulo.
3741 */ 3548 */
3742 if (nr_pages != 0 && !is_power_of_2(nr_pages)) 3549 if (nr_pages != 0 && !is_power_of_2(nr_pages))
@@ -3750,9 +3557,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3750 3557
3751 WARN_ON_ONCE(event->ctx->parent_ctx); 3558 WARN_ON_ONCE(event->ctx->parent_ctx);
3752 mutex_lock(&event->mmap_mutex); 3559 mutex_lock(&event->mmap_mutex);
3753 if (event->buffer) { 3560 if (event->rb) {
3754 if (event->buffer->nr_pages == nr_pages) 3561 if (event->rb->nr_pages == nr_pages)
3755 atomic_inc(&event->buffer->refcount); 3562 atomic_inc(&event->rb->refcount);
3756 else 3563 else
3757 ret = -EINVAL; 3564 ret = -EINVAL;
3758 goto unlock; 3565 goto unlock;
@@ -3782,18 +3589,20 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3782 goto unlock; 3589 goto unlock;
3783 } 3590 }
3784 3591
3785 WARN_ON(event->buffer); 3592 WARN_ON(event->rb);
3786 3593
3787 if (vma->vm_flags & VM_WRITE) 3594 if (vma->vm_flags & VM_WRITE)
3788 flags |= PERF_BUFFER_WRITABLE; 3595 flags |= RING_BUFFER_WRITABLE;
3789 3596
3790 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, 3597 rb = rb_alloc(nr_pages,
3791 event->cpu, flags); 3598 event->attr.watermark ? event->attr.wakeup_watermark : 0,
3792 if (!buffer) { 3599 event->cpu, flags);
3600
3601 if (!rb) {
3793 ret = -ENOMEM; 3602 ret = -ENOMEM;
3794 goto unlock; 3603 goto unlock;
3795 } 3604 }
3796 rcu_assign_pointer(event->buffer, buffer); 3605 rcu_assign_pointer(event->rb, rb);
3797 3606
3798 atomic_long_add(user_extra, &user->locked_vm); 3607 atomic_long_add(user_extra, &user->locked_vm);
3799 event->mmap_locked = extra; 3608 event->mmap_locked = extra;
@@ -3892,117 +3701,6 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3892} 3701}
3893EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); 3702EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
3894 3703
3895/*
3896 * Output
3897 */
3898static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
3899 unsigned long offset, unsigned long head)
3900{
3901 unsigned long mask;
3902
3903 if (!buffer->writable)
3904 return true;
3905
3906 mask = perf_data_size(buffer) - 1;
3907
3908 offset = (offset - tail) & mask;
3909 head = (head - tail) & mask;
3910
3911 if ((int)(head - offset) < 0)
3912 return false;
3913
3914 return true;
3915}
3916
3917static void perf_output_wakeup(struct perf_output_handle *handle)
3918{
3919 atomic_set(&handle->buffer->poll, POLL_IN);
3920
3921 if (handle->nmi) {
3922 handle->event->pending_wakeup = 1;
3923 irq_work_queue(&handle->event->pending);
3924 } else
3925 perf_event_wakeup(handle->event);
3926}
3927
3928/*
3929 * We need to ensure a later event_id doesn't publish a head when a former
3930 * event isn't done writing. However since we need to deal with NMIs we
3931 * cannot fully serialize things.
3932 *
3933 * We only publish the head (and generate a wakeup) when the outer-most
3934 * event completes.
3935 */
3936static void perf_output_get_handle(struct perf_output_handle *handle)
3937{
3938 struct perf_buffer *buffer = handle->buffer;
3939
3940 preempt_disable();
3941 local_inc(&buffer->nest);
3942 handle->wakeup = local_read(&buffer->wakeup);
3943}
3944
3945static void perf_output_put_handle(struct perf_output_handle *handle)
3946{
3947 struct perf_buffer *buffer = handle->buffer;
3948 unsigned long head;
3949
3950again:
3951 head = local_read(&buffer->head);
3952
3953 /*
3954 * IRQ/NMI can happen here, which means we can miss a head update.
3955 */
3956
3957 if (!local_dec_and_test(&buffer->nest))
3958 goto out;
3959
3960 /*
3961 * Publish the known good head. Rely on the full barrier implied
3962 * by atomic_dec_and_test() order the buffer->head read and this
3963 * write.
3964 */
3965 buffer->user_page->data_head = head;
3966
3967 /*
3968 * Now check if we missed an update, rely on the (compiler)
3969 * barrier in atomic_dec_and_test() to re-read buffer->head.
3970 */
3971 if (unlikely(head != local_read(&buffer->head))) {
3972 local_inc(&buffer->nest);
3973 goto again;
3974 }
3975
3976 if (handle->wakeup != local_read(&buffer->wakeup))
3977 perf_output_wakeup(handle);
3978
3979out:
3980 preempt_enable();
3981}
3982
3983__always_inline void perf_output_copy(struct perf_output_handle *handle,
3984 const void *buf, unsigned int len)
3985{
3986 do {
3987 unsigned long size = min_t(unsigned long, handle->size, len);
3988
3989 memcpy(handle->addr, buf, size);
3990
3991 len -= size;
3992 handle->addr += size;
3993 buf += size;
3994 handle->size -= size;
3995 if (!handle->size) {
3996 struct perf_buffer *buffer = handle->buffer;
3997
3998 handle->page++;
3999 handle->page &= buffer->nr_pages - 1;
4000 handle->addr = buffer->data_pages[handle->page];
4001 handle->size = PAGE_SIZE << page_order(buffer);
4002 }
4003 } while (len);
4004}
4005
4006static void __perf_event_header__init_id(struct perf_event_header *header, 3704static void __perf_event_header__init_id(struct perf_event_header *header,
4007 struct perf_sample_data *data, 3705 struct perf_sample_data *data,
4008 struct perf_event *event) 3706 struct perf_event *event)
@@ -4033,9 +3731,9 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
4033 } 3731 }
4034} 3732}
4035 3733
4036static void perf_event_header__init_id(struct perf_event_header *header, 3734void perf_event_header__init_id(struct perf_event_header *header,
4037 struct perf_sample_data *data, 3735 struct perf_sample_data *data,
4038 struct perf_event *event) 3736 struct perf_event *event)
4039{ 3737{
4040 if (event->attr.sample_id_all) 3738 if (event->attr.sample_id_all)
4041 __perf_event_header__init_id(header, data, event); 3739 __perf_event_header__init_id(header, data, event);
@@ -4062,121 +3760,14 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4062 perf_output_put(handle, data->cpu_entry); 3760 perf_output_put(handle, data->cpu_entry);
4063} 3761}
4064 3762
4065static void perf_event__output_id_sample(struct perf_event *event, 3763void perf_event__output_id_sample(struct perf_event *event,
4066 struct perf_output_handle *handle, 3764 struct perf_output_handle *handle,
4067 struct perf_sample_data *sample) 3765 struct perf_sample_data *sample)
4068{ 3766{
4069 if (event->attr.sample_id_all) 3767 if (event->attr.sample_id_all)
4070 __perf_event__output_id_sample(handle, sample); 3768 __perf_event__output_id_sample(handle, sample);
4071} 3769}
4072 3770
4073int perf_output_begin(struct perf_output_handle *handle,
4074 struct perf_event *event, unsigned int size,
4075 int nmi, int sample)
4076{
4077 struct perf_buffer *buffer;
4078 unsigned long tail, offset, head;
4079 int have_lost;
4080 struct perf_sample_data sample_data;
4081 struct {
4082 struct perf_event_header header;
4083 u64 id;
4084 u64 lost;
4085 } lost_event;
4086
4087 rcu_read_lock();
4088 /*
4089 * For inherited events we send all the output towards the parent.
4090 */
4091 if (event->parent)
4092 event = event->parent;
4093
4094 buffer = rcu_dereference(event->buffer);
4095 if (!buffer)
4096 goto out;
4097
4098 handle->buffer = buffer;
4099 handle->event = event;
4100 handle->nmi = nmi;
4101 handle->sample = sample;
4102
4103 if (!buffer->nr_pages)
4104 goto out;
4105
4106 have_lost = local_read(&buffer->lost);
4107 if (have_lost) {
4108 lost_event.header.size = sizeof(lost_event);
4109 perf_event_header__init_id(&lost_event.header, &sample_data,
4110 event);
4111 size += lost_event.header.size;
4112 }
4113
4114 perf_output_get_handle(handle);
4115
4116 do {
4117 /*
4118 * Userspace could choose to issue a mb() before updating the
4119 * tail pointer. So that all reads will be completed before the
4120 * write is issued.
4121 */
4122 tail = ACCESS_ONCE(buffer->user_page->data_tail);
4123 smp_rmb();
4124 offset = head = local_read(&buffer->head);
4125 head += size;
4126 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
4127 goto fail;
4128 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
4129
4130 if (head - local_read(&buffer->wakeup) > buffer->watermark)
4131 local_add(buffer->watermark, &buffer->wakeup);
4132
4133 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
4134 handle->page &= buffer->nr_pages - 1;
4135 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
4136 handle->addr = buffer->data_pages[handle->page];
4137 handle->addr += handle->size;
4138 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
4139
4140 if (have_lost) {
4141 lost_event.header.type = PERF_RECORD_LOST;
4142 lost_event.header.misc = 0;
4143 lost_event.id = event->id;
4144 lost_event.lost = local_xchg(&buffer->lost, 0);
4145
4146 perf_output_put(handle, lost_event);
4147 perf_event__output_id_sample(event, handle, &sample_data);
4148 }
4149
4150 return 0;
4151
4152fail:
4153 local_inc(&buffer->lost);
4154 perf_output_put_handle(handle);
4155out:
4156 rcu_read_unlock();
4157
4158 return -ENOSPC;
4159}
4160
4161void perf_output_end(struct perf_output_handle *handle)
4162{
4163 struct perf_event *event = handle->event;
4164 struct perf_buffer *buffer = handle->buffer;
4165
4166 int wakeup_events = event->attr.wakeup_events;
4167
4168 if (handle->sample && wakeup_events) {
4169 int events = local_inc_return(&buffer->events);
4170 if (events >= wakeup_events) {
4171 local_sub(wakeup_events, &buffer->events);
4172 local_inc(&buffer->wakeup);
4173 }
4174 }
4175
4176 perf_output_put_handle(handle);
4177 rcu_read_unlock();
4178}
4179
4180static void perf_output_read_one(struct perf_output_handle *handle, 3771static void perf_output_read_one(struct perf_output_handle *handle,
4181 struct perf_event *event, 3772 struct perf_event *event,
4182 u64 enabled, u64 running) 3773 u64 enabled, u64 running)
@@ -4197,7 +3788,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
4197 if (read_format & PERF_FORMAT_ID) 3788 if (read_format & PERF_FORMAT_ID)
4198 values[n++] = primary_event_id(event); 3789 values[n++] = primary_event_id(event);
4199 3790
4200 perf_output_copy(handle, values, n * sizeof(u64)); 3791 __output_copy(handle, values, n * sizeof(u64));
4201} 3792}
4202 3793
4203/* 3794/*
@@ -4227,7 +3818,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4227 if (read_format & PERF_FORMAT_ID) 3818 if (read_format & PERF_FORMAT_ID)
4228 values[n++] = primary_event_id(leader); 3819 values[n++] = primary_event_id(leader);
4229 3820
4230 perf_output_copy(handle, values, n * sizeof(u64)); 3821 __output_copy(handle, values, n * sizeof(u64));
4231 3822
4232 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 3823 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4233 n = 0; 3824 n = 0;
@@ -4239,7 +3830,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4239 if (read_format & PERF_FORMAT_ID) 3830 if (read_format & PERF_FORMAT_ID)
4240 values[n++] = primary_event_id(sub); 3831 values[n++] = primary_event_id(sub);
4241 3832
4242 perf_output_copy(handle, values, n * sizeof(u64)); 3833 __output_copy(handle, values, n * sizeof(u64));
4243 } 3834 }
4244} 3835}
4245 3836
@@ -4249,7 +3840,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4249static void perf_output_read(struct perf_output_handle *handle, 3840static void perf_output_read(struct perf_output_handle *handle,
4250 struct perf_event *event) 3841 struct perf_event *event)
4251{ 3842{
4252 u64 enabled = 0, running = 0, now, ctx_time; 3843 u64 enabled = 0, running = 0;
4253 u64 read_format = event->attr.read_format; 3844 u64 read_format = event->attr.read_format;
4254 3845
4255 /* 3846 /*
@@ -4261,12 +3852,8 @@ static void perf_output_read(struct perf_output_handle *handle,
4261 * because of locking issue as we are called in 3852 * because of locking issue as we are called in
4262 * NMI context 3853 * NMI context
4263 */ 3854 */
4264 if (read_format & PERF_FORMAT_TOTAL_TIMES) { 3855 if (read_format & PERF_FORMAT_TOTAL_TIMES)
4265 now = perf_clock(); 3856 calc_timer_values(event, &enabled, &running);
4266 ctx_time = event->shadow_ctx_time + now;
4267 enabled = ctx_time - event->tstamp_enabled;
4268 running = ctx_time - event->tstamp_running;
4269 }
4270 3857
4271 if (event->attr.read_format & PERF_FORMAT_GROUP) 3858 if (event->attr.read_format & PERF_FORMAT_GROUP)
4272 perf_output_read_group(handle, event, enabled, running); 3859 perf_output_read_group(handle, event, enabled, running);
@@ -4319,7 +3906,7 @@ void perf_output_sample(struct perf_output_handle *handle,
4319 3906
4320 size *= sizeof(u64); 3907 size *= sizeof(u64);
4321 3908
4322 perf_output_copy(handle, data->callchain, size); 3909 __output_copy(handle, data->callchain, size);
4323 } else { 3910 } else {
4324 u64 nr = 0; 3911 u64 nr = 0;
4325 perf_output_put(handle, nr); 3912 perf_output_put(handle, nr);
@@ -4329,8 +3916,8 @@ void perf_output_sample(struct perf_output_handle *handle,
4329 if (sample_type & PERF_SAMPLE_RAW) { 3916 if (sample_type & PERF_SAMPLE_RAW) {
4330 if (data->raw) { 3917 if (data->raw) {
4331 perf_output_put(handle, data->raw->size); 3918 perf_output_put(handle, data->raw->size);
4332 perf_output_copy(handle, data->raw->data, 3919 __output_copy(handle, data->raw->data,
4333 data->raw->size); 3920 data->raw->size);
4334 } else { 3921 } else {
4335 struct { 3922 struct {
4336 u32 size; 3923 u32 size;
@@ -4342,6 +3929,20 @@ void perf_output_sample(struct perf_output_handle *handle,
4342 perf_output_put(handle, raw); 3929 perf_output_put(handle, raw);
4343 } 3930 }
4344 } 3931 }
3932
3933 if (!event->attr.watermark) {
3934 int wakeup_events = event->attr.wakeup_events;
3935
3936 if (wakeup_events) {
3937 struct ring_buffer *rb = handle->rb;
3938 int events = local_inc_return(&rb->events);
3939
3940 if (events >= wakeup_events) {
3941 local_sub(wakeup_events, &rb->events);
3942 local_inc(&rb->wakeup);
3943 }
3944 }
3945 }
4345} 3946}
4346 3947
4347void perf_prepare_sample(struct perf_event_header *header, 3948void perf_prepare_sample(struct perf_event_header *header,
@@ -4386,7 +3987,7 @@ void perf_prepare_sample(struct perf_event_header *header,
4386 } 3987 }
4387} 3988}
4388 3989
4389static void perf_event_output(struct perf_event *event, int nmi, 3990static void perf_event_output(struct perf_event *event,
4390 struct perf_sample_data *data, 3991 struct perf_sample_data *data,
4391 struct pt_regs *regs) 3992 struct pt_regs *regs)
4392{ 3993{
@@ -4398,7 +3999,7 @@ static void perf_event_output(struct perf_event *event, int nmi,
4398 3999
4399 perf_prepare_sample(&header, data, event, regs); 4000 perf_prepare_sample(&header, data, event, regs);
4400 4001
4401 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 4002 if (perf_output_begin(&handle, event, header.size))
4402 goto exit; 4003 goto exit;
4403 4004
4404 perf_output_sample(&handle, &header, data, event); 4005 perf_output_sample(&handle, &header, data, event);
@@ -4438,7 +4039,7 @@ perf_event_read_event(struct perf_event *event,
4438 int ret; 4039 int ret;
4439 4040
4440 perf_event_header__init_id(&read_event.header, &sample, event); 4041 perf_event_header__init_id(&read_event.header, &sample, event);
4441 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); 4042 ret = perf_output_begin(&handle, event, read_event.header.size);
4442 if (ret) 4043 if (ret)
4443 return; 4044 return;
4444 4045
@@ -4481,7 +4082,7 @@ static void perf_event_task_output(struct perf_event *event,
4481 perf_event_header__init_id(&task_event->event_id.header, &sample, event); 4082 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4482 4083
4483 ret = perf_output_begin(&handle, event, 4084 ret = perf_output_begin(&handle, event,
4484 task_event->event_id.header.size, 0, 0); 4085 task_event->event_id.header.size);
4485 if (ret) 4086 if (ret)
4486 goto out; 4087 goto out;
4487 4088
@@ -4618,7 +4219,7 @@ static void perf_event_comm_output(struct perf_event *event,
4618 4219
4619 perf_event_header__init_id(&comm_event->event_id.header, &sample, event); 4220 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4620 ret = perf_output_begin(&handle, event, 4221 ret = perf_output_begin(&handle, event,
4621 comm_event->event_id.header.size, 0, 0); 4222 comm_event->event_id.header.size);
4622 4223
4623 if (ret) 4224 if (ret)
4624 goto out; 4225 goto out;
@@ -4627,7 +4228,7 @@ static void perf_event_comm_output(struct perf_event *event,
4627 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 4228 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
4628 4229
4629 perf_output_put(&handle, comm_event->event_id); 4230 perf_output_put(&handle, comm_event->event_id);
4630 perf_output_copy(&handle, comm_event->comm, 4231 __output_copy(&handle, comm_event->comm,
4631 comm_event->comm_size); 4232 comm_event->comm_size);
4632 4233
4633 perf_event__output_id_sample(event, &handle, &sample); 4234 perf_event__output_id_sample(event, &handle, &sample);
@@ -4765,7 +4366,7 @@ static void perf_event_mmap_output(struct perf_event *event,
4765 4366
4766 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 4367 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4767 ret = perf_output_begin(&handle, event, 4368 ret = perf_output_begin(&handle, event,
4768 mmap_event->event_id.header.size, 0, 0); 4369 mmap_event->event_id.header.size);
4769 if (ret) 4370 if (ret)
4770 goto out; 4371 goto out;
4771 4372
@@ -4773,7 +4374,7 @@ static void perf_event_mmap_output(struct perf_event *event,
4773 mmap_event->event_id.tid = perf_event_tid(event, current); 4374 mmap_event->event_id.tid = perf_event_tid(event, current);
4774 4375
4775 perf_output_put(&handle, mmap_event->event_id); 4376 perf_output_put(&handle, mmap_event->event_id);
4776 perf_output_copy(&handle, mmap_event->file_name, 4377 __output_copy(&handle, mmap_event->file_name,
4777 mmap_event->file_size); 4378 mmap_event->file_size);
4778 4379
4779 perf_event__output_id_sample(event, &handle, &sample); 4380 perf_event__output_id_sample(event, &handle, &sample);
@@ -4829,7 +4430,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4829 4430
4830 if (file) { 4431 if (file) {
4831 /* 4432 /*
4832 * d_path works from the end of the buffer backwards, so we 4433 * d_path works from the end of the rb backwards, so we
4833 * need to add enough zero bytes after the string to handle 4434 * need to add enough zero bytes after the string to handle
4834 * the 64bit alignment we do later. 4435 * the 64bit alignment we do later.
4835 */ 4436 */
@@ -4960,7 +4561,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4960 perf_event_header__init_id(&throttle_event.header, &sample, event); 4561 perf_event_header__init_id(&throttle_event.header, &sample, event);
4961 4562
4962 ret = perf_output_begin(&handle, event, 4563 ret = perf_output_begin(&handle, event,
4963 throttle_event.header.size, 1, 0); 4564 throttle_event.header.size);
4964 if (ret) 4565 if (ret)
4965 return; 4566 return;
4966 4567
@@ -4973,7 +4574,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4973 * Generic event overflow handling, sampling. 4574 * Generic event overflow handling, sampling.
4974 */ 4575 */
4975 4576
4976static int __perf_event_overflow(struct perf_event *event, int nmi, 4577static int __perf_event_overflow(struct perf_event *event,
4977 int throttle, struct perf_sample_data *data, 4578 int throttle, struct perf_sample_data *data,
4978 struct pt_regs *regs) 4579 struct pt_regs *regs)
4979{ 4580{
@@ -5016,34 +4617,28 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
5016 if (events && atomic_dec_and_test(&event->event_limit)) { 4617 if (events && atomic_dec_and_test(&event->event_limit)) {
5017 ret = 1; 4618 ret = 1;
5018 event->pending_kill = POLL_HUP; 4619 event->pending_kill = POLL_HUP;
5019 if (nmi) { 4620 event->pending_disable = 1;
5020 event->pending_disable = 1; 4621 irq_work_queue(&event->pending);
5021 irq_work_queue(&event->pending);
5022 } else
5023 perf_event_disable(event);
5024 } 4622 }
5025 4623
5026 if (event->overflow_handler) 4624 if (event->overflow_handler)
5027 event->overflow_handler(event, nmi, data, regs); 4625 event->overflow_handler(event, data, regs);
5028 else 4626 else
5029 perf_event_output(event, nmi, data, regs); 4627 perf_event_output(event, data, regs);
5030 4628
5031 if (event->fasync && event->pending_kill) { 4629 if (event->fasync && event->pending_kill) {
5032 if (nmi) { 4630 event->pending_wakeup = 1;
5033 event->pending_wakeup = 1; 4631 irq_work_queue(&event->pending);
5034 irq_work_queue(&event->pending);
5035 } else
5036 perf_event_wakeup(event);
5037 } 4632 }
5038 4633
5039 return ret; 4634 return ret;
5040} 4635}
5041 4636
5042int perf_event_overflow(struct perf_event *event, int nmi, 4637int perf_event_overflow(struct perf_event *event,
5043 struct perf_sample_data *data, 4638 struct perf_sample_data *data,
5044 struct pt_regs *regs) 4639 struct pt_regs *regs)
5045{ 4640{
5046 return __perf_event_overflow(event, nmi, 1, data, regs); 4641 return __perf_event_overflow(event, 1, data, regs);
5047} 4642}
5048 4643
5049/* 4644/*
@@ -5092,7 +4687,7 @@ again:
5092} 4687}
5093 4688
5094static void perf_swevent_overflow(struct perf_event *event, u64 overflow, 4689static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5095 int nmi, struct perf_sample_data *data, 4690 struct perf_sample_data *data,
5096 struct pt_regs *regs) 4691 struct pt_regs *regs)
5097{ 4692{
5098 struct hw_perf_event *hwc = &event->hw; 4693 struct hw_perf_event *hwc = &event->hw;
@@ -5106,7 +4701,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5106 return; 4701 return;
5107 4702
5108 for (; overflow; overflow--) { 4703 for (; overflow; overflow--) {
5109 if (__perf_event_overflow(event, nmi, throttle, 4704 if (__perf_event_overflow(event, throttle,
5110 data, regs)) { 4705 data, regs)) {
5111 /* 4706 /*
5112 * We inhibit the overflow from happening when 4707 * We inhibit the overflow from happening when
@@ -5119,7 +4714,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5119} 4714}
5120 4715
5121static void perf_swevent_event(struct perf_event *event, u64 nr, 4716static void perf_swevent_event(struct perf_event *event, u64 nr,
5122 int nmi, struct perf_sample_data *data, 4717 struct perf_sample_data *data,
5123 struct pt_regs *regs) 4718 struct pt_regs *regs)
5124{ 4719{
5125 struct hw_perf_event *hwc = &event->hw; 4720 struct hw_perf_event *hwc = &event->hw;
@@ -5133,12 +4728,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
5133 return; 4728 return;
5134 4729
5135 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4730 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5136 return perf_swevent_overflow(event, 1, nmi, data, regs); 4731 return perf_swevent_overflow(event, 1, data, regs);
5137 4732
5138 if (local64_add_negative(nr, &hwc->period_left)) 4733 if (local64_add_negative(nr, &hwc->period_left))
5139 return; 4734 return;
5140 4735
5141 perf_swevent_overflow(event, 0, nmi, data, regs); 4736 perf_swevent_overflow(event, 0, data, regs);
5142} 4737}
5143 4738
5144static int perf_exclude_event(struct perf_event *event, 4739static int perf_exclude_event(struct perf_event *event,
@@ -5226,7 +4821,7 @@ find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
5226} 4821}
5227 4822
5228static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 4823static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5229 u64 nr, int nmi, 4824 u64 nr,
5230 struct perf_sample_data *data, 4825 struct perf_sample_data *data,
5231 struct pt_regs *regs) 4826 struct pt_regs *regs)
5232{ 4827{
@@ -5242,7 +4837,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5242 4837
5243 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4838 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5244 if (perf_swevent_match(event, type, event_id, data, regs)) 4839 if (perf_swevent_match(event, type, event_id, data, regs))
5245 perf_swevent_event(event, nr, nmi, data, regs); 4840 perf_swevent_event(event, nr, data, regs);
5246 } 4841 }
5247end: 4842end:
5248 rcu_read_unlock(); 4843 rcu_read_unlock();
@@ -5263,8 +4858,7 @@ inline void perf_swevent_put_recursion_context(int rctx)
5263 put_recursion_context(swhash->recursion, rctx); 4858 put_recursion_context(swhash->recursion, rctx);
5264} 4859}
5265 4860
5266void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4861void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5267 struct pt_regs *regs, u64 addr)
5268{ 4862{
5269 struct perf_sample_data data; 4863 struct perf_sample_data data;
5270 int rctx; 4864 int rctx;
@@ -5276,7 +4870,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
5276 4870
5277 perf_sample_data_init(&data, addr); 4871 perf_sample_data_init(&data, addr);
5278 4872
5279 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4873 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
5280 4874
5281 perf_swevent_put_recursion_context(rctx); 4875 perf_swevent_put_recursion_context(rctx);
5282 preempt_enable_notrace(); 4876 preempt_enable_notrace();
@@ -5524,7 +5118,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5524 5118
5525 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5119 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5526 if (perf_tp_event_match(event, &data, regs)) 5120 if (perf_tp_event_match(event, &data, regs))
5527 perf_swevent_event(event, count, 1, &data, regs); 5121 perf_swevent_event(event, count, &data, regs);
5528 } 5122 }
5529 5123
5530 perf_swevent_put_recursion_context(rctx); 5124 perf_swevent_put_recursion_context(rctx);
@@ -5617,7 +5211,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
5617 perf_sample_data_init(&sample, bp->attr.bp_addr); 5211 perf_sample_data_init(&sample, bp->attr.bp_addr);
5618 5212
5619 if (!bp->hw.state && !perf_exclude_event(bp, regs)) 5213 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5620 perf_swevent_event(bp, 1, 1, &sample, regs); 5214 perf_swevent_event(bp, 1, &sample, regs);
5621} 5215}
5622#endif 5216#endif
5623 5217
@@ -5646,7 +5240,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5646 5240
5647 if (regs && !perf_exclude_event(event, regs)) { 5241 if (regs && !perf_exclude_event(event, regs)) {
5648 if (!(event->attr.exclude_idle && current->pid == 0)) 5242 if (!(event->attr.exclude_idle && current->pid == 0))
5649 if (perf_event_overflow(event, 0, &data, regs)) 5243 if (perf_event_overflow(event, &data, regs))
5650 ret = HRTIMER_NORESTART; 5244 ret = HRTIMER_NORESTART;
5651 } 5245 }
5652 5246
@@ -5986,6 +5580,7 @@ free_dev:
5986} 5580}
5987 5581
5988static struct lock_class_key cpuctx_mutex; 5582static struct lock_class_key cpuctx_mutex;
5583static struct lock_class_key cpuctx_lock;
5989 5584
5990int perf_pmu_register(struct pmu *pmu, char *name, int type) 5585int perf_pmu_register(struct pmu *pmu, char *name, int type)
5991{ 5586{
@@ -6036,6 +5631,7 @@ skip_type:
6036 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 5631 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6037 __perf_event_init_context(&cpuctx->ctx); 5632 __perf_event_init_context(&cpuctx->ctx);
6038 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); 5633 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
5634 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6039 cpuctx->ctx.type = cpu_context; 5635 cpuctx->ctx.type = cpu_context;
6040 cpuctx->ctx.pmu = pmu; 5636 cpuctx->ctx.pmu = pmu;
6041 cpuctx->jiffies_interval = 1; 5637 cpuctx->jiffies_interval = 1;
@@ -6150,7 +5746,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6150 struct task_struct *task, 5746 struct task_struct *task,
6151 struct perf_event *group_leader, 5747 struct perf_event *group_leader,
6152 struct perf_event *parent_event, 5748 struct perf_event *parent_event,
6153 perf_overflow_handler_t overflow_handler) 5749 perf_overflow_handler_t overflow_handler,
5750 void *context)
6154{ 5751{
6155 struct pmu *pmu; 5752 struct pmu *pmu;
6156 struct perf_event *event; 5753 struct perf_event *event;
@@ -6208,10 +5805,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6208#endif 5805#endif
6209 } 5806 }
6210 5807
6211 if (!overflow_handler && parent_event) 5808 if (!overflow_handler && parent_event) {
6212 overflow_handler = parent_event->overflow_handler; 5809 overflow_handler = parent_event->overflow_handler;
5810 context = parent_event->overflow_handler_context;
5811 }
6213 5812
6214 event->overflow_handler = overflow_handler; 5813 event->overflow_handler = overflow_handler;
5814 event->overflow_handler_context = context;
6215 5815
6216 if (attr->disabled) 5816 if (attr->disabled)
6217 event->state = PERF_EVENT_STATE_OFF; 5817 event->state = PERF_EVENT_STATE_OFF;
@@ -6326,13 +5926,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6326 if (ret) 5926 if (ret)
6327 return -EFAULT; 5927 return -EFAULT;
6328 5928
6329 /*
6330 * If the type exists, the corresponding creation will verify
6331 * the attr->config.
6332 */
6333 if (attr->type >= PERF_TYPE_MAX)
6334 return -EINVAL;
6335
6336 if (attr->__reserved_1) 5929 if (attr->__reserved_1)
6337 return -EINVAL; 5930 return -EINVAL;
6338 5931
@@ -6354,7 +5947,7 @@ err_size:
6354static int 5947static int
6355perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 5948perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6356{ 5949{
6357 struct perf_buffer *buffer = NULL, *old_buffer = NULL; 5950 struct ring_buffer *rb = NULL, *old_rb = NULL;
6358 int ret = -EINVAL; 5951 int ret = -EINVAL;
6359 5952
6360 if (!output_event) 5953 if (!output_event)
@@ -6371,7 +5964,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6371 goto out; 5964 goto out;
6372 5965
6373 /* 5966 /*
6374 * If its not a per-cpu buffer, it must be the same task. 5967 * If its not a per-cpu rb, it must be the same task.
6375 */ 5968 */
6376 if (output_event->cpu == -1 && output_event->ctx != event->ctx) 5969 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
6377 goto out; 5970 goto out;
@@ -6383,20 +5976,20 @@ set:
6383 goto unlock; 5976 goto unlock;
6384 5977
6385 if (output_event) { 5978 if (output_event) {
6386 /* get the buffer we want to redirect to */ 5979 /* get the rb we want to redirect to */
6387 buffer = perf_buffer_get(output_event); 5980 rb = ring_buffer_get(output_event);
6388 if (!buffer) 5981 if (!rb)
6389 goto unlock; 5982 goto unlock;
6390 } 5983 }
6391 5984
6392 old_buffer = event->buffer; 5985 old_rb = event->rb;
6393 rcu_assign_pointer(event->buffer, buffer); 5986 rcu_assign_pointer(event->rb, rb);
6394 ret = 0; 5987 ret = 0;
6395unlock: 5988unlock:
6396 mutex_unlock(&event->mmap_mutex); 5989 mutex_unlock(&event->mmap_mutex);
6397 5990
6398 if (old_buffer) 5991 if (old_rb)
6399 perf_buffer_put(old_buffer); 5992 ring_buffer_put(old_rb);
6400out: 5993out:
6401 return ret; 5994 return ret;
6402} 5995}
@@ -6478,7 +6071,8 @@ SYSCALL_DEFINE5(perf_event_open,
6478 } 6071 }
6479 } 6072 }
6480 6073
6481 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); 6074 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
6075 NULL, NULL);
6482 if (IS_ERR(event)) { 6076 if (IS_ERR(event)) {
6483 err = PTR_ERR(event); 6077 err = PTR_ERR(event);
6484 goto err_task; 6078 goto err_task;
@@ -6663,7 +6257,8 @@ err_fd:
6663struct perf_event * 6257struct perf_event *
6664perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 6258perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6665 struct task_struct *task, 6259 struct task_struct *task,
6666 perf_overflow_handler_t overflow_handler) 6260 perf_overflow_handler_t overflow_handler,
6261 void *context)
6667{ 6262{
6668 struct perf_event_context *ctx; 6263 struct perf_event_context *ctx;
6669 struct perf_event *event; 6264 struct perf_event *event;
@@ -6673,7 +6268,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6673 * Get the target context (task or percpu): 6268 * Get the target context (task or percpu):
6674 */ 6269 */
6675 6270
6676 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); 6271 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
6272 overflow_handler, context);
6677 if (IS_ERR(event)) { 6273 if (IS_ERR(event)) {
6678 err = PTR_ERR(event); 6274 err = PTR_ERR(event);
6679 goto err; 6275 goto err;
@@ -6780,7 +6376,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6780 * our context. 6376 * our context.
6781 */ 6377 */
6782 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); 6378 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
6783 task_ctx_sched_out(child_ctx, EVENT_ALL);
6784 6379
6785 /* 6380 /*
6786 * Take the context lock here so that if find_get_context is 6381 * Take the context lock here so that if find_get_context is
@@ -6788,6 +6383,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6788 * incremented the context's refcount before we do put_ctx below. 6383 * incremented the context's refcount before we do put_ctx below.
6789 */ 6384 */
6790 raw_spin_lock(&child_ctx->lock); 6385 raw_spin_lock(&child_ctx->lock);
6386 task_ctx_sched_out(child_ctx);
6791 child->perf_event_ctxp[ctxn] = NULL; 6387 child->perf_event_ctxp[ctxn] = NULL;
6792 /* 6388 /*
6793 * If this context is a clone; unclone it so it can't get 6389 * If this context is a clone; unclone it so it can't get
@@ -6957,7 +6553,7 @@ inherit_event(struct perf_event *parent_event,
6957 parent_event->cpu, 6553 parent_event->cpu,
6958 child, 6554 child,
6959 group_leader, parent_event, 6555 group_leader, parent_event,
6960 NULL); 6556 NULL, NULL);
6961 if (IS_ERR(child_event)) 6557 if (IS_ERR(child_event))
6962 return child_event; 6558 return child_event;
6963 get_ctx(child_ctx); 6559 get_ctx(child_ctx);
@@ -6984,6 +6580,8 @@ inherit_event(struct perf_event *parent_event,
6984 6580
6985 child_event->ctx = child_ctx; 6581 child_event->ctx = child_ctx;
6986 child_event->overflow_handler = parent_event->overflow_handler; 6582 child_event->overflow_handler = parent_event->overflow_handler;
6583 child_event->overflow_handler_context
6584 = parent_event->overflow_handler_context;
6987 6585
6988 /* 6586 /*
6989 * Precalculate sample_data sizes 6587 * Precalculate sample_data sizes
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 086adf25a55e..b7971d6f38bf 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -431,9 +431,11 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
431struct perf_event * 431struct perf_event *
432register_user_hw_breakpoint(struct perf_event_attr *attr, 432register_user_hw_breakpoint(struct perf_event_attr *attr,
433 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
434 void *context,
434 struct task_struct *tsk) 435 struct task_struct *tsk)
435{ 436{
436 return perf_event_create_kernel_counter(attr, -1, tsk, triggered); 437 return perf_event_create_kernel_counter(attr, -1, tsk, triggered,
438 context);
437} 439}
438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 440EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
439 441
@@ -502,7 +504,8 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
502 */ 504 */
503struct perf_event * __percpu * 505struct perf_event * __percpu *
504register_wide_hw_breakpoint(struct perf_event_attr *attr, 506register_wide_hw_breakpoint(struct perf_event_attr *attr,
505 perf_overflow_handler_t triggered) 507 perf_overflow_handler_t triggered,
508 void *context)
506{ 509{
507 struct perf_event * __percpu *cpu_events, **pevent, *bp; 510 struct perf_event * __percpu *cpu_events, **pevent, *bp;
508 long err; 511 long err;
@@ -515,7 +518,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
515 get_online_cpus(); 518 get_online_cpus();
516 for_each_online_cpu(cpu) { 519 for_each_online_cpu(cpu) {
517 pevent = per_cpu_ptr(cpu_events, cpu); 520 pevent = per_cpu_ptr(cpu_events, cpu);
518 bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); 521 bp = perf_event_create_kernel_counter(attr, cpu, NULL,
522 triggered, context);
519 523
520 *pevent = bp; 524 *pevent = bp;
521 525
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
new file mode 100644
index 000000000000..09097dd8116c
--- /dev/null
+++ b/kernel/events/internal.h
@@ -0,0 +1,96 @@
1#ifndef _KERNEL_EVENTS_INTERNAL_H
2#define _KERNEL_EVENTS_INTERNAL_H
3
4#define RING_BUFFER_WRITABLE 0x01
5
6struct ring_buffer {
7 atomic_t refcount;
8 struct rcu_head rcu_head;
9#ifdef CONFIG_PERF_USE_VMALLOC
10 struct work_struct work;
11 int page_order; /* allocation order */
12#endif
13 int nr_pages; /* nr of data pages */
14 int writable; /* are we writable */
15
16 atomic_t poll; /* POLL_ for wakeups */
17
18 local_t head; /* write position */
19 local_t nest; /* nested writers */
20 local_t events; /* event limit */
21 local_t wakeup; /* wakeup stamp */
22 local_t lost; /* nr records lost */
23
24 long watermark; /* wakeup watermark */
25
26 struct perf_event_mmap_page *user_page;
27 void *data_pages[0];
28};
29
30extern void rb_free(struct ring_buffer *rb);
31extern struct ring_buffer *
32rb_alloc(int nr_pages, long watermark, int cpu, int flags);
33extern void perf_event_wakeup(struct perf_event *event);
34
35extern void
36perf_event_header__init_id(struct perf_event_header *header,
37 struct perf_sample_data *data,
38 struct perf_event *event);
39extern void
40perf_event__output_id_sample(struct perf_event *event,
41 struct perf_output_handle *handle,
42 struct perf_sample_data *sample);
43
44extern struct page *
45perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
46
47#ifdef CONFIG_PERF_USE_VMALLOC
48/*
49 * Back perf_mmap() with vmalloc memory.
50 *
51 * Required for architectures that have d-cache aliasing issues.
52 */
53
54static inline int page_order(struct ring_buffer *rb)
55{
56 return rb->page_order;
57}
58
59#else
60
61static inline int page_order(struct ring_buffer *rb)
62{
63 return 0;
64}
65#endif
66
67static unsigned long perf_data_size(struct ring_buffer *rb)
68{
69 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
70}
71
72static inline void
73__output_copy(struct perf_output_handle *handle,
74 const void *buf, unsigned int len)
75{
76 do {
77 unsigned long size = min_t(unsigned long, handle->size, len);
78
79 memcpy(handle->addr, buf, size);
80
81 len -= size;
82 handle->addr += size;
83 buf += size;
84 handle->size -= size;
85 if (!handle->size) {
86 struct ring_buffer *rb = handle->rb;
87
88 handle->page++;
89 handle->page &= rb->nr_pages - 1;
90 handle->addr = rb->data_pages[handle->page];
91 handle->size = PAGE_SIZE << page_order(rb);
92 }
93 } while (len);
94}
95
96#endif /* _KERNEL_EVENTS_INTERNAL_H */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
new file mode 100644
index 000000000000..a2a29205cc0f
--- /dev/null
+++ b/kernel/events/ring_buffer.c
@@ -0,0 +1,380 @@
1/*
2 * Performance events ring-buffer code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/perf_event.h>
13#include <linux/vmalloc.h>
14#include <linux/slab.h>
15
16#include "internal.h"
17
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head)
20{
21 unsigned long mask;
22
23 if (!rb->writable)
24 return true;
25
26 mask = perf_data_size(rb) - 1;
27
28 offset = (offset - tail) & mask;
29 head = (head - tail) & mask;
30
31 if ((int)(head - offset) < 0)
32 return false;
33
34 return true;
35}
36
37static void perf_output_wakeup(struct perf_output_handle *handle)
38{
39 atomic_set(&handle->rb->poll, POLL_IN);
40
41 handle->event->pending_wakeup = 1;
42 irq_work_queue(&handle->event->pending);
43}
44
45/*
46 * We need to ensure a later event_id doesn't publish a head when a former
47 * event isn't done writing. However since we need to deal with NMIs we
48 * cannot fully serialize things.
49 *
50 * We only publish the head (and generate a wakeup) when the outer-most
51 * event completes.
52 */
53static void perf_output_get_handle(struct perf_output_handle *handle)
54{
55 struct ring_buffer *rb = handle->rb;
56
57 preempt_disable();
58 local_inc(&rb->nest);
59 handle->wakeup = local_read(&rb->wakeup);
60}
61
62static void perf_output_put_handle(struct perf_output_handle *handle)
63{
64 struct ring_buffer *rb = handle->rb;
65 unsigned long head;
66
67again:
68 head = local_read(&rb->head);
69
70 /*
71 * IRQ/NMI can happen here, which means we can miss a head update.
72 */
73
74 if (!local_dec_and_test(&rb->nest))
75 goto out;
76
77 /*
78 * Publish the known good head. Rely on the full barrier implied
79 * by atomic_dec_and_test() order the rb->head read and this
80 * write.
81 */
82 rb->user_page->data_head = head;
83
84 /*
85 * Now check if we missed an update, rely on the (compiler)
86 * barrier in atomic_dec_and_test() to re-read rb->head.
87 */
88 if (unlikely(head != local_read(&rb->head))) {
89 local_inc(&rb->nest);
90 goto again;
91 }
92
93 if (handle->wakeup != local_read(&rb->wakeup))
94 perf_output_wakeup(handle);
95
96out:
97 preempt_enable();
98}
99
100int perf_output_begin(struct perf_output_handle *handle,
101 struct perf_event *event, unsigned int size)
102{
103 struct ring_buffer *rb;
104 unsigned long tail, offset, head;
105 int have_lost;
106 struct perf_sample_data sample_data;
107 struct {
108 struct perf_event_header header;
109 u64 id;
110 u64 lost;
111 } lost_event;
112
113 rcu_read_lock();
114 /*
115 * For inherited events we send all the output towards the parent.
116 */
117 if (event->parent)
118 event = event->parent;
119
120 rb = rcu_dereference(event->rb);
121 if (!rb)
122 goto out;
123
124 handle->rb = rb;
125 handle->event = event;
126
127 if (!rb->nr_pages)
128 goto out;
129
130 have_lost = local_read(&rb->lost);
131 if (have_lost) {
132 lost_event.header.size = sizeof(lost_event);
133 perf_event_header__init_id(&lost_event.header, &sample_data,
134 event);
135 size += lost_event.header.size;
136 }
137
138 perf_output_get_handle(handle);
139
140 do {
141 /*
142 * Userspace could choose to issue a mb() before updating the
143 * tail pointer. So that all reads will be completed before the
144 * write is issued.
145 */
146 tail = ACCESS_ONCE(rb->user_page->data_tail);
147 smp_rmb();
148 offset = head = local_read(&rb->head);
149 head += size;
150 if (unlikely(!perf_output_space(rb, tail, offset, head)))
151 goto fail;
152 } while (local_cmpxchg(&rb->head, offset, head) != offset);
153
154 if (head - local_read(&rb->wakeup) > rb->watermark)
155 local_add(rb->watermark, &rb->wakeup);
156
157 handle->page = offset >> (PAGE_SHIFT + page_order(rb));
158 handle->page &= rb->nr_pages - 1;
159 handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
160 handle->addr = rb->data_pages[handle->page];
161 handle->addr += handle->size;
162 handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
163
164 if (have_lost) {
165 lost_event.header.type = PERF_RECORD_LOST;
166 lost_event.header.misc = 0;
167 lost_event.id = event->id;
168 lost_event.lost = local_xchg(&rb->lost, 0);
169
170 perf_output_put(handle, lost_event);
171 perf_event__output_id_sample(event, handle, &sample_data);
172 }
173
174 return 0;
175
176fail:
177 local_inc(&rb->lost);
178 perf_output_put_handle(handle);
179out:
180 rcu_read_unlock();
181
182 return -ENOSPC;
183}
184
185void perf_output_copy(struct perf_output_handle *handle,
186 const void *buf, unsigned int len)
187{
188 __output_copy(handle, buf, len);
189}
190
191void perf_output_end(struct perf_output_handle *handle)
192{
193 perf_output_put_handle(handle);
194 rcu_read_unlock();
195}
196
197static void
198ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
199{
200 long max_size = perf_data_size(rb);
201
202 if (watermark)
203 rb->watermark = min(max_size, watermark);
204
205 if (!rb->watermark)
206 rb->watermark = max_size / 2;
207
208 if (flags & RING_BUFFER_WRITABLE)
209 rb->writable = 1;
210
211 atomic_set(&rb->refcount, 1);
212}
213
214#ifndef CONFIG_PERF_USE_VMALLOC
215
216/*
217 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
218 */
219
220struct page *
221perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
222{
223 if (pgoff > rb->nr_pages)
224 return NULL;
225
226 if (pgoff == 0)
227 return virt_to_page(rb->user_page);
228
229 return virt_to_page(rb->data_pages[pgoff - 1]);
230}
231
232static void *perf_mmap_alloc_page(int cpu)
233{
234 struct page *page;
235 int node;
236
237 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
238 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
239 if (!page)
240 return NULL;
241
242 return page_address(page);
243}
244
245struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
246{
247 struct ring_buffer *rb;
248 unsigned long size;
249 int i;
250
251 size = sizeof(struct ring_buffer);
252 size += nr_pages * sizeof(void *);
253
254 rb = kzalloc(size, GFP_KERNEL);
255 if (!rb)
256 goto fail;
257
258 rb->user_page = perf_mmap_alloc_page(cpu);
259 if (!rb->user_page)
260 goto fail_user_page;
261
262 for (i = 0; i < nr_pages; i++) {
263 rb->data_pages[i] = perf_mmap_alloc_page(cpu);
264 if (!rb->data_pages[i])
265 goto fail_data_pages;
266 }
267
268 rb->nr_pages = nr_pages;
269
270 ring_buffer_init(rb, watermark, flags);
271
272 return rb;
273
274fail_data_pages:
275 for (i--; i >= 0; i--)
276 free_page((unsigned long)rb->data_pages[i]);
277
278 free_page((unsigned long)rb->user_page);
279
280fail_user_page:
281 kfree(rb);
282
283fail:
284 return NULL;
285}
286
287static void perf_mmap_free_page(unsigned long addr)
288{
289 struct page *page = virt_to_page((void *)addr);
290
291 page->mapping = NULL;
292 __free_page(page);
293}
294
295void rb_free(struct ring_buffer *rb)
296{
297 int i;
298
299 perf_mmap_free_page((unsigned long)rb->user_page);
300 for (i = 0; i < rb->nr_pages; i++)
301 perf_mmap_free_page((unsigned long)rb->data_pages[i]);
302 kfree(rb);
303}
304
305#else
306
307struct page *
308perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
309{
310 if (pgoff > (1UL << page_order(rb)))
311 return NULL;
312
313 return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
314}
315
316static void perf_mmap_unmark_page(void *addr)
317{
318 struct page *page = vmalloc_to_page(addr);
319
320 page->mapping = NULL;
321}
322
323static void rb_free_work(struct work_struct *work)
324{
325 struct ring_buffer *rb;
326 void *base;
327 int i, nr;
328
329 rb = container_of(work, struct ring_buffer, work);
330 nr = 1 << page_order(rb);
331
332 base = rb->user_page;
333 for (i = 0; i < nr + 1; i++)
334 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
335
336 vfree(base);
337 kfree(rb);
338}
339
340void rb_free(struct ring_buffer *rb)
341{
342 schedule_work(&rb->work);
343}
344
345struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
346{
347 struct ring_buffer *rb;
348 unsigned long size;
349 void *all_buf;
350
351 size = sizeof(struct ring_buffer);
352 size += sizeof(void *);
353
354 rb = kzalloc(size, GFP_KERNEL);
355 if (!rb)
356 goto fail;
357
358 INIT_WORK(&rb->work, rb_free_work);
359
360 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
361 if (!all_buf)
362 goto fail_all_buf;
363
364 rb->user_page = all_buf;
365 rb->data_pages[0] = all_buf + PAGE_SIZE;
366 rb->page_order = ilog2(nr_pages);
367 rb->nr_pages = 1;
368
369 ring_buffer_init(rb, watermark, flags);
370
371 return rb;
372
373fail_all_buf:
374 kfree(rb);
375
376fail:
377 return NULL;
378}
379
380#endif
diff --git a/kernel/exit.c b/kernel/exit.c
index f2b321bae440..2913b3509d42 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -85,7 +85,6 @@ static void __exit_signal(struct task_struct *tsk)
85 struct tty_struct *uninitialized_var(tty); 85 struct tty_struct *uninitialized_var(tty);
86 86
87 sighand = rcu_dereference_check(tsk->sighand, 87 sighand = rcu_dereference_check(tsk->sighand,
88 rcu_read_lock_held() ||
89 lockdep_tasklist_lock_is_held()); 88 lockdep_tasklist_lock_is_held());
90 spin_lock(&sighand->siglock); 89 spin_lock(&sighand->siglock);
91 90
@@ -169,7 +168,6 @@ void release_task(struct task_struct * p)
169 struct task_struct *leader; 168 struct task_struct *leader;
170 int zap_leader; 169 int zap_leader;
171repeat: 170repeat:
172 tracehook_prepare_release_task(p);
173 /* don't need to get the RCU readlock here - the process is dead and 171 /* don't need to get the RCU readlock here - the process is dead and
174 * can't be modifying its own credentials. But shut RCU-lockdep up */ 172 * can't be modifying its own credentials. But shut RCU-lockdep up */
175 rcu_read_lock(); 173 rcu_read_lock();
@@ -179,7 +177,7 @@ repeat:
179 proc_flush_task(p); 177 proc_flush_task(p);
180 178
181 write_lock_irq(&tasklist_lock); 179 write_lock_irq(&tasklist_lock);
182 tracehook_finish_release_task(p); 180 ptrace_release_task(p);
183 __exit_signal(p); 181 __exit_signal(p);
184 182
185 /* 183 /*
@@ -190,22 +188,12 @@ repeat:
190 zap_leader = 0; 188 zap_leader = 0;
191 leader = p->group_leader; 189 leader = p->group_leader;
192 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { 190 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
193 BUG_ON(task_detached(leader));
194 do_notify_parent(leader, leader->exit_signal);
195 /* 191 /*
196 * If we were the last child thread and the leader has 192 * If we were the last child thread and the leader has
197 * exited already, and the leader's parent ignores SIGCHLD, 193 * exited already, and the leader's parent ignores SIGCHLD,
198 * then we are the one who should release the leader. 194 * then we are the one who should release the leader.
199 *
200 * do_notify_parent() will have marked it self-reaping in
201 * that case.
202 */
203 zap_leader = task_detached(leader);
204
205 /*
206 * This maintains the invariant that release_task()
207 * only runs on a task in EXIT_DEAD, just for sanity.
208 */ 195 */
196 zap_leader = do_notify_parent(leader, leader->exit_signal);
209 if (zap_leader) 197 if (zap_leader)
210 leader->exit_state = EXIT_DEAD; 198 leader->exit_state = EXIT_DEAD;
211 } 199 }
@@ -277,18 +265,16 @@ int is_current_pgrp_orphaned(void)
277 return retval; 265 return retval;
278} 266}
279 267
280static int has_stopped_jobs(struct pid *pgrp) 268static bool has_stopped_jobs(struct pid *pgrp)
281{ 269{
282 int retval = 0;
283 struct task_struct *p; 270 struct task_struct *p;
284 271
285 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 272 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
286 if (!task_is_stopped(p)) 273 if (p->signal->flags & SIGNAL_STOP_STOPPED)
287 continue; 274 return true;
288 retval = 1;
289 break;
290 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 275 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
291 return retval; 276
277 return false;
292} 278}
293 279
294/* 280/*
@@ -751,7 +737,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
751{ 737{
752 list_move_tail(&p->sibling, &p->real_parent->children); 738 list_move_tail(&p->sibling, &p->real_parent->children);
753 739
754 if (task_detached(p)) 740 if (p->exit_state == EXIT_DEAD)
755 return; 741 return;
756 /* 742 /*
757 * If this is a threaded reparent there is no need to 743 * If this is a threaded reparent there is no need to
@@ -764,10 +750,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
764 p->exit_signal = SIGCHLD; 750 p->exit_signal = SIGCHLD;
765 751
766 /* If it has exited notify the new parent about this child's death. */ 752 /* If it has exited notify the new parent about this child's death. */
767 if (!task_ptrace(p) && 753 if (!p->ptrace &&
768 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 754 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
769 do_notify_parent(p, p->exit_signal); 755 if (do_notify_parent(p, p->exit_signal)) {
770 if (task_detached(p)) {
771 p->exit_state = EXIT_DEAD; 756 p->exit_state = EXIT_DEAD;
772 list_move_tail(&p->sibling, dead); 757 list_move_tail(&p->sibling, dead);
773 } 758 }
@@ -794,7 +779,7 @@ static void forget_original_parent(struct task_struct *father)
794 do { 779 do {
795 t->real_parent = reaper; 780 t->real_parent = reaper;
796 if (t->parent == father) { 781 if (t->parent == father) {
797 BUG_ON(task_ptrace(t)); 782 BUG_ON(t->ptrace);
798 t->parent = t->real_parent; 783 t->parent = t->real_parent;
799 } 784 }
800 if (t->pdeath_signal) 785 if (t->pdeath_signal)
@@ -819,8 +804,7 @@ static void forget_original_parent(struct task_struct *father)
819 */ 804 */
820static void exit_notify(struct task_struct *tsk, int group_dead) 805static void exit_notify(struct task_struct *tsk, int group_dead)
821{ 806{
822 int signal; 807 bool autoreap;
823 void *cookie;
824 808
825 /* 809 /*
826 * This does two things: 810 * This does two things:
@@ -851,26 +835,33 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
851 * we have changed execution domain as these two values started 835 * we have changed execution domain as these two values started
852 * the same after a fork. 836 * the same after a fork.
853 */ 837 */
854 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && 838 if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
855 (tsk->parent_exec_id != tsk->real_parent->self_exec_id || 839 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
856 tsk->self_exec_id != tsk->parent_exec_id)) 840 tsk->self_exec_id != tsk->parent_exec_id))
857 tsk->exit_signal = SIGCHLD; 841 tsk->exit_signal = SIGCHLD;
858 842
859 signal = tracehook_notify_death(tsk, &cookie, group_dead); 843 if (unlikely(tsk->ptrace)) {
860 if (signal >= 0) 844 int sig = thread_group_leader(tsk) &&
861 signal = do_notify_parent(tsk, signal); 845 thread_group_empty(tsk) &&
846 !ptrace_reparented(tsk) ?
847 tsk->exit_signal : SIGCHLD;
848 autoreap = do_notify_parent(tsk, sig);
849 } else if (thread_group_leader(tsk)) {
850 autoreap = thread_group_empty(tsk) &&
851 do_notify_parent(tsk, tsk->exit_signal);
852 } else {
853 autoreap = true;
854 }
862 855
863 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; 856 tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
864 857
865 /* mt-exec, de_thread() is waiting for group leader */ 858 /* mt-exec, de_thread() is waiting for group leader */
866 if (unlikely(tsk->signal->notify_count < 0)) 859 if (unlikely(tsk->signal->notify_count < 0))
867 wake_up_process(tsk->signal->group_exit_task); 860 wake_up_process(tsk->signal->group_exit_task);
868 write_unlock_irq(&tasklist_lock); 861 write_unlock_irq(&tasklist_lock);
869 862
870 tracehook_report_death(tsk, signal, cookie, group_dead);
871
872 /* If the process is dead, release it - nobody will wait for it */ 863 /* If the process is dead, release it - nobody will wait for it */
873 if (signal == DEATH_REAP) 864 if (autoreap)
874 release_task(tsk); 865 release_task(tsk);
875} 866}
876 867
@@ -906,7 +897,6 @@ NORET_TYPE void do_exit(long code)
906 897
907 profile_task_exit(tsk); 898 profile_task_exit(tsk);
908 899
909 WARN_ON(atomic_read(&tsk->fs_excl));
910 WARN_ON(blk_needs_flush_plug(tsk)); 900 WARN_ON(blk_needs_flush_plug(tsk));
911 901
912 if (unlikely(in_interrupt())) 902 if (unlikely(in_interrupt()))
@@ -923,7 +913,7 @@ NORET_TYPE void do_exit(long code)
923 */ 913 */
924 set_fs(USER_DS); 914 set_fs(USER_DS);
925 915
926 tracehook_report_exit(&code); 916 ptrace_event(PTRACE_EVENT_EXIT, code);
927 917
928 validate_creds_for_do_exit(tsk); 918 validate_creds_for_do_exit(tsk);
929 919
@@ -990,6 +980,7 @@ NORET_TYPE void do_exit(long code)
990 trace_sched_process_exit(tsk); 980 trace_sched_process_exit(tsk);
991 981
992 exit_sem(tsk); 982 exit_sem(tsk);
983 exit_shm(tsk);
993 exit_files(tsk); 984 exit_files(tsk);
994 exit_fs(tsk); 985 exit_fs(tsk);
995 check_stack_usage(); 986 check_stack_usage();
@@ -1235,9 +1226,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1235 traced = ptrace_reparented(p); 1226 traced = ptrace_reparented(p);
1236 /* 1227 /*
1237 * It can be ptraced but not reparented, check 1228 * It can be ptraced but not reparented, check
1238 * !task_detached() to filter out sub-threads. 1229 * thread_group_leader() to filter out sub-threads.
1239 */ 1230 */
1240 if (likely(!traced) && likely(!task_detached(p))) { 1231 if (likely(!traced) && thread_group_leader(p)) {
1241 struct signal_struct *psig; 1232 struct signal_struct *psig;
1242 struct signal_struct *sig; 1233 struct signal_struct *sig;
1243 unsigned long maxrss; 1234 unsigned long maxrss;
@@ -1345,16 +1336,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1345 /* We dropped tasklist, ptracer could die and untrace */ 1336 /* We dropped tasklist, ptracer could die and untrace */
1346 ptrace_unlink(p); 1337 ptrace_unlink(p);
1347 /* 1338 /*
1348 * If this is not a detached task, notify the parent. 1339 * If this is not a sub-thread, notify the parent.
1349 * If it's still not detached after that, don't release 1340 * If parent wants a zombie, don't release it now.
1350 * it now.
1351 */ 1341 */
1352 if (!task_detached(p)) { 1342 if (thread_group_leader(p) &&
1353 do_notify_parent(p, p->exit_signal); 1343 !do_notify_parent(p, p->exit_signal)) {
1354 if (!task_detached(p)) { 1344 p->exit_state = EXIT_ZOMBIE;
1355 p->exit_state = EXIT_ZOMBIE; 1345 p = NULL;
1356 p = NULL;
1357 }
1358 } 1346 }
1359 write_unlock_irq(&tasklist_lock); 1347 write_unlock_irq(&tasklist_lock);
1360 } 1348 }
@@ -1367,7 +1355,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1367static int *task_stopped_code(struct task_struct *p, bool ptrace) 1355static int *task_stopped_code(struct task_struct *p, bool ptrace)
1368{ 1356{
1369 if (ptrace) { 1357 if (ptrace) {
1370 if (task_is_stopped_or_traced(p)) 1358 if (task_is_stopped_or_traced(p) &&
1359 !(p->jobctl & JOBCTL_LISTENING))
1371 return &p->exit_code; 1360 return &p->exit_code;
1372 } else { 1361 } else {
1373 if (p->signal->flags & SIGNAL_STOP_STOPPED) 1362 if (p->signal->flags & SIGNAL_STOP_STOPPED)
@@ -1563,7 +1552,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1563 * Notification and reaping will be cascaded to the real 1552 * Notification and reaping will be cascaded to the real
1564 * parent when the ptracer detaches. 1553 * parent when the ptracer detaches.
1565 */ 1554 */
1566 if (likely(!ptrace) && unlikely(task_ptrace(p))) { 1555 if (likely(!ptrace) && unlikely(p->ptrace)) {
1567 /* it will become visible, clear notask_error */ 1556 /* it will become visible, clear notask_error */
1568 wo->notask_error = 0; 1557 wo->notask_error = 0;
1569 return 0; 1558 return 0;
@@ -1606,8 +1595,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1606 * own children, it should create a separate process which 1595 * own children, it should create a separate process which
1607 * takes the role of real parent. 1596 * takes the role of real parent.
1608 */ 1597 */
1609 if (likely(!ptrace) && task_ptrace(p) && 1598 if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
1610 same_thread_group(p->parent, p->real_parent))
1611 return 0; 1599 return 0;
1612 1600
1613 /* 1601 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 0276c30401a0..e7ceaca89609 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,7 +37,6 @@
37#include <linux/swap.h> 37#include <linux/swap.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/jiffies.h> 39#include <linux/jiffies.h>
40#include <linux/tracehook.h>
41#include <linux/futex.h> 40#include <linux/futex.h>
42#include <linux/compat.h> 41#include <linux/compat.h>
43#include <linux/kthread.h> 42#include <linux/kthread.h>
@@ -81,7 +80,7 @@
81 * Protected counters by write_lock_irq(&tasklist_lock) 80 * Protected counters by write_lock_irq(&tasklist_lock)
82 */ 81 */
83unsigned long total_forks; /* Handle normal Linux uptimes. */ 82unsigned long total_forks; /* Handle normal Linux uptimes. */
84int nr_threads; /* The idle threads do not count.. */ 83int nr_threads; /* The idle threads do not count.. */
85 84
86int max_threads; /* tunable limit on nr_threads */ 85int max_threads; /* tunable limit on nr_threads */
87 86
@@ -233,7 +232,7 @@ void __init fork_init(unsigned long mempages)
233 /* 232 /*
234 * we need to allow at least 20 threads to boot a system 233 * we need to allow at least 20 threads to boot a system
235 */ 234 */
236 if(max_threads < 20) 235 if (max_threads < 20)
237 max_threads = 20; 236 max_threads = 20;
238 237
239 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; 238 init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
@@ -269,7 +268,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
269 return NULL; 268 return NULL;
270 } 269 }
271 270
272 err = arch_dup_task_struct(tsk, orig); 271 err = arch_dup_task_struct(tsk, orig);
273 if (err) 272 if (err)
274 goto out; 273 goto out;
275 274
@@ -289,9 +288,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
289 tsk->stack_canary = get_random_int(); 288 tsk->stack_canary = get_random_int();
290#endif 289#endif
291 290
292 /* One for us, one for whoever does the "release_task()" (usually parent) */ 291 /*
293 atomic_set(&tsk->usage,2); 292 * One for us, one for whoever does the "release_task()" (usually
294 atomic_set(&tsk->fs_excl, 0); 293 * parent)
294 */
295 atomic_set(&tsk->usage, 2);
295#ifdef CONFIG_BLK_DEV_IO_TRACE 296#ifdef CONFIG_BLK_DEV_IO_TRACE
296 tsk->btrace_seq = 0; 297 tsk->btrace_seq = 0;
297#endif 298#endif
@@ -439,7 +440,7 @@ fail_nomem:
439 goto out; 440 goto out;
440} 441}
441 442
442static inline int mm_alloc_pgd(struct mm_struct * mm) 443static inline int mm_alloc_pgd(struct mm_struct *mm)
443{ 444{
444 mm->pgd = pgd_alloc(mm); 445 mm->pgd = pgd_alloc(mm);
445 if (unlikely(!mm->pgd)) 446 if (unlikely(!mm->pgd))
@@ -447,7 +448,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm)
447 return 0; 448 return 0;
448} 449}
449 450
450static inline void mm_free_pgd(struct mm_struct * mm) 451static inline void mm_free_pgd(struct mm_struct *mm)
451{ 452{
452 pgd_free(mm, mm->pgd); 453 pgd_free(mm, mm->pgd);
453} 454}
@@ -484,7 +485,7 @@ static void mm_init_aio(struct mm_struct *mm)
484#endif 485#endif
485} 486}
486 487
487static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 488static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
488{ 489{
489 atomic_set(&mm->mm_users, 1); 490 atomic_set(&mm->mm_users, 1);
490 atomic_set(&mm->mm_count, 1); 491 atomic_set(&mm->mm_count, 1);
@@ -515,9 +516,9 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
515/* 516/*
516 * Allocate and initialize an mm_struct. 517 * Allocate and initialize an mm_struct.
517 */ 518 */
518struct mm_struct * mm_alloc(void) 519struct mm_struct *mm_alloc(void)
519{ 520{
520 struct mm_struct * mm; 521 struct mm_struct *mm;
521 522
522 mm = allocate_mm(); 523 mm = allocate_mm();
523 if (!mm) 524 if (!mm)
@@ -585,7 +586,7 @@ void added_exe_file_vma(struct mm_struct *mm)
585void removed_exe_file_vma(struct mm_struct *mm) 586void removed_exe_file_vma(struct mm_struct *mm)
586{ 587{
587 mm->num_exe_file_vmas--; 588 mm->num_exe_file_vmas--;
588 if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ 589 if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
589 fput(mm->exe_file); 590 fput(mm->exe_file);
590 mm->exe_file = NULL; 591 mm->exe_file = NULL;
591 } 592 }
@@ -777,9 +778,9 @@ fail_nocontext:
777 return NULL; 778 return NULL;
778} 779}
779 780
780static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) 781static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
781{ 782{
782 struct mm_struct * mm, *oldmm; 783 struct mm_struct *mm, *oldmm;
783 int retval; 784 int retval;
784 785
785 tsk->min_flt = tsk->maj_flt = 0; 786 tsk->min_flt = tsk->maj_flt = 0;
@@ -846,7 +847,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
846 return 0; 847 return 0;
847} 848}
848 849
849static int copy_files(unsigned long clone_flags, struct task_struct * tsk) 850static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
850{ 851{
851 struct files_struct *oldf, *newf; 852 struct files_struct *oldf, *newf;
852 int error = 0; 853 int error = 0;
@@ -1013,7 +1014,7 @@ static void rt_mutex_init_task(struct task_struct *p)
1013{ 1014{
1014 raw_spin_lock_init(&p->pi_lock); 1015 raw_spin_lock_init(&p->pi_lock);
1015#ifdef CONFIG_RT_MUTEXES 1016#ifdef CONFIG_RT_MUTEXES
1016 plist_head_init_raw(&p->pi_waiters, &p->pi_lock); 1017 plist_head_init(&p->pi_waiters);
1017 p->pi_blocked_on = NULL; 1018 p->pi_blocked_on = NULL;
1018#endif 1019#endif
1019} 1020}
@@ -1168,13 +1169,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1168 cgroup_fork(p); 1169 cgroup_fork(p);
1169#ifdef CONFIG_NUMA 1170#ifdef CONFIG_NUMA
1170 p->mempolicy = mpol_dup(p->mempolicy); 1171 p->mempolicy = mpol_dup(p->mempolicy);
1171 if (IS_ERR(p->mempolicy)) { 1172 if (IS_ERR(p->mempolicy)) {
1172 retval = PTR_ERR(p->mempolicy); 1173 retval = PTR_ERR(p->mempolicy);
1173 p->mempolicy = NULL; 1174 p->mempolicy = NULL;
1174 goto bad_fork_cleanup_cgroup; 1175 goto bad_fork_cleanup_cgroup;
1175 } 1176 }
1176 mpol_fix_fork_child_flag(p); 1177 mpol_fix_fork_child_flag(p);
1177#endif 1178#endif
1179#ifdef CONFIG_CPUSETS
1180 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
1181 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
1182#endif
1178#ifdef CONFIG_TRACE_IRQFLAGS 1183#ifdef CONFIG_TRACE_IRQFLAGS
1179 p->irq_events = 0; 1184 p->irq_events = 0;
1180#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 1185#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
@@ -1214,25 +1219,33 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1214 retval = perf_event_init_task(p); 1219 retval = perf_event_init_task(p);
1215 if (retval) 1220 if (retval)
1216 goto bad_fork_cleanup_policy; 1221 goto bad_fork_cleanup_policy;
1217 1222 retval = audit_alloc(p);
1218 if ((retval = audit_alloc(p))) 1223 if (retval)
1219 goto bad_fork_cleanup_policy; 1224 goto bad_fork_cleanup_policy;
1220 /* copy all the process information */ 1225 /* copy all the process information */
1221 if ((retval = copy_semundo(clone_flags, p))) 1226 retval = copy_semundo(clone_flags, p);
1227 if (retval)
1222 goto bad_fork_cleanup_audit; 1228 goto bad_fork_cleanup_audit;
1223 if ((retval = copy_files(clone_flags, p))) 1229 retval = copy_files(clone_flags, p);
1230 if (retval)
1224 goto bad_fork_cleanup_semundo; 1231 goto bad_fork_cleanup_semundo;
1225 if ((retval = copy_fs(clone_flags, p))) 1232 retval = copy_fs(clone_flags, p);
1233 if (retval)
1226 goto bad_fork_cleanup_files; 1234 goto bad_fork_cleanup_files;
1227 if ((retval = copy_sighand(clone_flags, p))) 1235 retval = copy_sighand(clone_flags, p);
1236 if (retval)
1228 goto bad_fork_cleanup_fs; 1237 goto bad_fork_cleanup_fs;
1229 if ((retval = copy_signal(clone_flags, p))) 1238 retval = copy_signal(clone_flags, p);
1239 if (retval)
1230 goto bad_fork_cleanup_sighand; 1240 goto bad_fork_cleanup_sighand;
1231 if ((retval = copy_mm(clone_flags, p))) 1241 retval = copy_mm(clone_flags, p);
1242 if (retval)
1232 goto bad_fork_cleanup_signal; 1243 goto bad_fork_cleanup_signal;
1233 if ((retval = copy_namespaces(clone_flags, p))) 1244 retval = copy_namespaces(clone_flags, p);
1245 if (retval)
1234 goto bad_fork_cleanup_mm; 1246 goto bad_fork_cleanup_mm;
1235 if ((retval = copy_io(clone_flags, p))) 1247 retval = copy_io(clone_flags, p);
1248 if (retval)
1236 goto bad_fork_cleanup_namespaces; 1249 goto bad_fork_cleanup_namespaces;
1237 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); 1250 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
1238 if (retval) 1251 if (retval)
@@ -1254,7 +1267,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1254 /* 1267 /*
1255 * Clear TID on mm_release()? 1268 * Clear TID on mm_release()?
1256 */ 1269 */
1257 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; 1270 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
1258#ifdef CONFIG_BLOCK 1271#ifdef CONFIG_BLOCK
1259 p->plug = NULL; 1272 p->plug = NULL;
1260#endif 1273#endif
@@ -1322,7 +1335,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1322 * it's process group. 1335 * it's process group.
1323 * A fatal signal pending means that current will exit, so the new 1336 * A fatal signal pending means that current will exit, so the new
1324 * thread can't slip out of an OOM kill (or normal SIGKILL). 1337 * thread can't slip out of an OOM kill (or normal SIGKILL).
1325 */ 1338 */
1326 recalc_sigpending(); 1339 recalc_sigpending();
1327 if (signal_pending(current)) { 1340 if (signal_pending(current)) {
1328 spin_unlock(&current->sighand->siglock); 1341 spin_unlock(&current->sighand->siglock);
@@ -1340,7 +1353,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1340 } 1353 }
1341 1354
1342 if (likely(p->pid)) { 1355 if (likely(p->pid)) {
1343 tracehook_finish_clone(p, clone_flags, trace); 1356 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1344 1357
1345 if (thread_group_leader(p)) { 1358 if (thread_group_leader(p)) {
1346 if (is_child_reaper(pid)) 1359 if (is_child_reaper(pid))
@@ -1481,10 +1494,22 @@ long do_fork(unsigned long clone_flags,
1481 } 1494 }
1482 1495
1483 /* 1496 /*
1484 * When called from kernel_thread, don't do user tracing stuff. 1497 * Determine whether and which event to report to ptracer. When
1498 * called from kernel_thread or CLONE_UNTRACED is explicitly
1499 * requested, no event is reported; otherwise, report if the event
1500 * for the type of forking is enabled.
1485 */ 1501 */
1486 if (likely(user_mode(regs))) 1502 if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) {
1487 trace = tracehook_prepare_clone(clone_flags); 1503 if (clone_flags & CLONE_VFORK)
1504 trace = PTRACE_EVENT_VFORK;
1505 else if ((clone_flags & CSIGNAL) != SIGCHLD)
1506 trace = PTRACE_EVENT_CLONE;
1507 else
1508 trace = PTRACE_EVENT_FORK;
1509
1510 if (likely(!ptrace_event_enabled(current, trace)))
1511 trace = 0;
1512 }
1488 1513
1489 p = copy_process(clone_flags, stack_start, regs, stack_size, 1514 p = copy_process(clone_flags, stack_start, regs, stack_size,
1490 child_tidptr, NULL, trace); 1515 child_tidptr, NULL, trace);
@@ -1508,26 +1533,26 @@ long do_fork(unsigned long clone_flags,
1508 } 1533 }
1509 1534
1510 audit_finish_fork(p); 1535 audit_finish_fork(p);
1511 tracehook_report_clone(regs, clone_flags, nr, p);
1512 1536
1513 /* 1537 /*
1514 * We set PF_STARTING at creation in case tracing wants to 1538 * We set PF_STARTING at creation in case tracing wants to
1515 * use this to distinguish a fully live task from one that 1539 * use this to distinguish a fully live task from one that
1516 * hasn't gotten to tracehook_report_clone() yet. Now we 1540 * hasn't finished SIGSTOP raising yet. Now we clear it
1517 * clear it and set the child going. 1541 * and set the child going.
1518 */ 1542 */
1519 p->flags &= ~PF_STARTING; 1543 p->flags &= ~PF_STARTING;
1520 1544
1521 wake_up_new_task(p); 1545 wake_up_new_task(p);
1522 1546
1523 tracehook_report_clone_complete(trace, regs, 1547 /* forking complete and child started to run, tell ptracer */
1524 clone_flags, nr, p); 1548 if (unlikely(trace))
1549 ptrace_event(trace, nr);
1525 1550
1526 if (clone_flags & CLONE_VFORK) { 1551 if (clone_flags & CLONE_VFORK) {
1527 freezer_do_not_count(); 1552 freezer_do_not_count();
1528 wait_for_completion(&vfork); 1553 wait_for_completion(&vfork);
1529 freezer_count(); 1554 freezer_count();
1530 tracehook_report_vfork_done(p, nr); 1555 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
1531 } 1556 }
1532 } else { 1557 } else {
1533 nr = PTR_ERR(p); 1558 nr = PTR_ERR(p);
@@ -1574,6 +1599,7 @@ void __init proc_caches_init(void)
1574 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); 1599 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1575 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); 1600 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1576 mmap_init(); 1601 mmap_init();
1602 nsproxy_cache_init();
1577} 1603}
1578 1604
1579/* 1605/*
@@ -1670,12 +1696,14 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1670 */ 1696 */
1671 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) 1697 if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
1672 do_sysvsem = 1; 1698 do_sysvsem = 1;
1673 if ((err = unshare_fs(unshare_flags, &new_fs))) 1699 err = unshare_fs(unshare_flags, &new_fs);
1700 if (err)
1674 goto bad_unshare_out; 1701 goto bad_unshare_out;
1675 if ((err = unshare_fd(unshare_flags, &new_fd))) 1702 err = unshare_fd(unshare_flags, &new_fd);
1703 if (err)
1676 goto bad_unshare_cleanup_fs; 1704 goto bad_unshare_cleanup_fs;
1677 if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, 1705 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs);
1678 new_fs))) 1706 if (err)
1679 goto bad_unshare_cleanup_fd; 1707 goto bad_unshare_cleanup_fd;
1680 1708
1681 if (new_fs || new_fd || do_sysvsem || new_nsproxy) { 1709 if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
diff --git a/kernel/futex.c b/kernel/futex.c
index 70bb54bd2468..11cbe052b2e8 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -384,8 +384,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
384 int ret; 384 int ret;
385 385
386 down_read(&mm->mmap_sem); 386 down_read(&mm->mmap_sem);
387 ret = get_user_pages(current, mm, (unsigned long)uaddr, 387 ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
388 1, 1, 0, NULL, NULL); 388 FAULT_FLAG_WRITE);
389 up_read(&mm->mmap_sem); 389 up_read(&mm->mmap_sem);
390 390
391 return ret < 0 ? ret : 0; 391 return ret < 0 ? ret : 0;
@@ -2727,7 +2727,7 @@ static int __init futex_init(void)
2727 futex_cmpxchg_enabled = 1; 2727 futex_cmpxchg_enabled = 1;
2728 2728
2729 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2729 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
2730 plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); 2730 plist_head_init(&futex_queues[i].chain);
2731 spin_lock_init(&futex_queues[i].lock); 2731 spin_lock_init(&futex_queues[i].lock);
2732 } 2732 }
2733 2733
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 5bf924d80b5c..a92028196cc1 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -3,7 +3,7 @@ menu "GCOV-based kernel profiling"
3config GCOV_KERNEL 3config GCOV_KERNEL
4 bool "Enable gcov-based kernel profiling" 4 bool "Enable gcov-based kernel profiling"
5 depends on DEBUG_FS 5 depends on DEBUG_FS
6 select CONSTRUCTORS 6 select CONSTRUCTORS if !UML
7 default n 7 default n
8 ---help--- 8 ---help---
9 This option enables gcov-based code profiling (e.g. for code coverage 9 This option enables gcov-based code profiling (e.g. for code coverage
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d1d051b38e0b..5a38bf4de641 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -52,6 +52,10 @@ config IRQ_EDGE_EOI_HANDLER
52config GENERIC_IRQ_CHIP 52config GENERIC_IRQ_CHIP
53 bool 53 bool
54 54
55# Generic irq_domain hw <--> linux irq number translation
56config IRQ_DOMAIN
57 bool
58
55# Support forced irq threading 59# Support forced irq threading
56config IRQ_FORCED_THREADING 60config IRQ_FORCED_THREADING
57 bool 61 bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 73290056cfb6..fff17381f0af 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -2,6 +2,7 @@
2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o 2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o 3obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
4obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 4obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
5obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
5obj-$(CONFIG_PROC_FS) += proc.o 6obj-$(CONFIG_PROC_FS) += proc.o
6obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 7obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
7obj-$(CONFIG_PM_SLEEP) += pm.o 8obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 1ef4ffcdfa55..bd8e788d71e0 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -87,8 +87,8 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
87{ 87{
88 struct irq_devres match_data = { irq, dev_id }; 88 struct irq_devres match_data = { irq, dev_id };
89 89
90 free_irq(irq, dev_id);
91 WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, 90 WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match,
92 &match_data)); 91 &match_data));
92 free_irq(irq, dev_id);
93} 93}
94EXPORT_SYMBOL(devm_free_irq); 94EXPORT_SYMBOL(devm_free_irq);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
new file mode 100644
index 000000000000..d5828da3fd38
--- /dev/null
+++ b/kernel/irq/irqdomain.c
@@ -0,0 +1,180 @@
1#include <linux/irq.h>
2#include <linux/irqdomain.h>
3#include <linux/module.h>
4#include <linux/mutex.h>
5#include <linux/of.h>
6#include <linux/of_address.h>
7#include <linux/slab.h>
8
9static LIST_HEAD(irq_domain_list);
10static DEFINE_MUTEX(irq_domain_mutex);
11
12/**
13 * irq_domain_add() - Register an irq_domain
14 * @domain: ptr to initialized irq_domain structure
15 *
16 * Registers an irq_domain structure. The irq_domain must at a minimum be
17 * initialized with an ops structure pointer, and either a ->to_irq hook or
18 * a valid irq_base value. Everything else is optional.
19 */
20void irq_domain_add(struct irq_domain *domain)
21{
22 struct irq_data *d;
23 int hwirq;
24
25 /*
26 * This assumes that the irq_domain owner has already allocated
27 * the irq_descs. This block will be removed when support for dynamic
28 * allocation of irq_descs is added to irq_domain.
29 */
30 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
31 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
32 if (d || d->domain) {
33 /* things are broken; just report, don't clean up */
34 WARN(1, "error: irq_desc already assigned to a domain");
35 return;
36 }
37 d->domain = domain;
38 d->hwirq = hwirq;
39 }
40
41 mutex_lock(&irq_domain_mutex);
42 list_add(&domain->list, &irq_domain_list);
43 mutex_unlock(&irq_domain_mutex);
44}
45
46/**
47 * irq_domain_del() - Unregister an irq_domain
48 * @domain: ptr to registered irq_domain.
49 */
50void irq_domain_del(struct irq_domain *domain)
51{
52 struct irq_data *d;
53 int hwirq;
54
55 mutex_lock(&irq_domain_mutex);
56 list_del(&domain->list);
57 mutex_unlock(&irq_domain_mutex);
58
59 /* Clear the irq_domain assignments */
60 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
61 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
62 d->domain = NULL;
63 }
64}
65
66#if defined(CONFIG_OF_IRQ)
67/**
68 * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec
69 *
70 * Used by the device tree interrupt mapping code to translate a device tree
71 * interrupt specifier to a valid linux irq number. Returns either a valid
72 * linux IRQ number or 0.
73 *
74 * When the caller no longer need the irq number returned by this function it
75 * should arrange to call irq_dispose_mapping().
76 */
77unsigned int irq_create_of_mapping(struct device_node *controller,
78 const u32 *intspec, unsigned int intsize)
79{
80 struct irq_domain *domain;
81 unsigned long hwirq;
82 unsigned int irq, type;
83 int rc = -EINVAL;
84
85 /* Find a domain which can translate the irq spec */
86 mutex_lock(&irq_domain_mutex);
87 list_for_each_entry(domain, &irq_domain_list, list) {
88 if (!domain->ops->dt_translate)
89 continue;
90 rc = domain->ops->dt_translate(domain, controller,
91 intspec, intsize, &hwirq, &type);
92 if (rc == 0)
93 break;
94 }
95 mutex_unlock(&irq_domain_mutex);
96
97 if (rc != 0)
98 return 0;
99
100 irq = irq_domain_to_irq(domain, hwirq);
101 if (type != IRQ_TYPE_NONE)
102 irq_set_irq_type(irq, type);
103 pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n",
104 controller->full_name, (int)hwirq, irq, type);
105 return irq;
106}
107EXPORT_SYMBOL_GPL(irq_create_of_mapping);
108
109/**
110 * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping()
111 * @irq: linux irq number to be discarded
112 *
113 * Calling this function indicates the caller no longer needs a reference to
114 * the linux irq number returned by a prior call to irq_create_of_mapping().
115 */
116void irq_dispose_mapping(unsigned int irq)
117{
118 /*
119 * nothing yet; will be filled when support for dynamic allocation of
120 * irq_descs is added to irq_domain
121 */
122}
123EXPORT_SYMBOL_GPL(irq_dispose_mapping);
124
125int irq_domain_simple_dt_translate(struct irq_domain *d,
126 struct device_node *controller,
127 const u32 *intspec, unsigned int intsize,
128 unsigned long *out_hwirq, unsigned int *out_type)
129{
130 if (d->of_node != controller)
131 return -EINVAL;
132 if (intsize < 1)
133 return -EINVAL;
134
135 *out_hwirq = intspec[0];
136 *out_type = IRQ_TYPE_NONE;
137 if (intsize > 1)
138 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
139 return 0;
140}
141
142struct irq_domain_ops irq_domain_simple_ops = {
143 .dt_translate = irq_domain_simple_dt_translate,
144};
145EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
146
147/**
148 * irq_domain_create_simple() - Set up a 'simple' translation range
149 */
150void irq_domain_add_simple(struct device_node *controller, int irq_base)
151{
152 struct irq_domain *domain;
153
154 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
155 if (!domain) {
156 WARN_ON(1);
157 return;
158 }
159
160 domain->irq_base = irq_base;
161 domain->of_node = of_node_get(controller);
162 domain->ops = &irq_domain_simple_ops;
163 irq_domain_add(domain);
164}
165EXPORT_SYMBOL_GPL(irq_domain_add_simple);
166
167void irq_domain_generate_simple(const struct of_device_id *match,
168 u64 phys_base, unsigned int irq_start)
169{
170 struct device_node *node;
171 pr_info("looking for phys_base=%llx, irq_start=%i\n",
172 (unsigned long long) phys_base, (int) irq_start);
173 node = of_find_matching_node_by_address(NULL, match, phys_base);
174 if (node)
175 irq_domain_add_simple(node, irq_start);
176 else
177 pr_info("no node found\n");
178}
179EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
180#endif /* CONFIG_OF_IRQ */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 8d814cbc8109..296fbc84d659 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1095,7 +1095,7 @@ size_t crash_get_memory_size(void)
1095 size_t size = 0; 1095 size_t size = 0;
1096 mutex_lock(&kexec_mutex); 1096 mutex_lock(&kexec_mutex);
1097 if (crashk_res.end != crashk_res.start) 1097 if (crashk_res.end != crashk_res.start)
1098 size = crashk_res.end - crashk_res.start + 1; 1098 size = resource_size(&crashk_res);
1099 mutex_unlock(&kexec_mutex); 1099 mutex_unlock(&kexec_mutex);
1100 return size; 1100 return size;
1101} 1101}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 77981813a1e7..b30fd54eb985 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1255,19 +1255,29 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
1255/* 1255/*
1256 * If we have a symbol_name argument, look it up and add the offset field 1256 * If we have a symbol_name argument, look it up and add the offset field
1257 * to it. This way, we can specify a relative address to a symbol. 1257 * to it. This way, we can specify a relative address to a symbol.
1258 * This returns encoded errors if it fails to look up symbol or invalid
1259 * combination of parameters.
1258 */ 1260 */
1259static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) 1261static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
1260{ 1262{
1261 kprobe_opcode_t *addr = p->addr; 1263 kprobe_opcode_t *addr = p->addr;
1264
1265 if ((p->symbol_name && p->addr) ||
1266 (!p->symbol_name && !p->addr))
1267 goto invalid;
1268
1262 if (p->symbol_name) { 1269 if (p->symbol_name) {
1263 if (addr)
1264 return NULL;
1265 kprobe_lookup_name(p->symbol_name, addr); 1270 kprobe_lookup_name(p->symbol_name, addr);
1271 if (!addr)
1272 return ERR_PTR(-ENOENT);
1266 } 1273 }
1267 1274
1268 if (!addr) 1275 addr = (kprobe_opcode_t *)(((char *)addr) + p->offset);
1269 return NULL; 1276 if (addr)
1270 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 1277 return addr;
1278
1279invalid:
1280 return ERR_PTR(-EINVAL);
1271} 1281}
1272 1282
1273/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1283/* Check passed kprobe is valid and return kprobe in kprobe_table. */
@@ -1311,8 +1321,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1311 kprobe_opcode_t *addr; 1321 kprobe_opcode_t *addr;
1312 1322
1313 addr = kprobe_addr(p); 1323 addr = kprobe_addr(p);
1314 if (!addr) 1324 if (IS_ERR(addr))
1315 return -EINVAL; 1325 return PTR_ERR(addr);
1316 p->addr = addr; 1326 p->addr = addr;
1317 1327
1318 ret = check_kprobe_rereg(p); 1328 ret = check_kprobe_rereg(p);
@@ -1335,6 +1345,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1335 */ 1345 */
1336 probed_mod = __module_text_address((unsigned long) p->addr); 1346 probed_mod = __module_text_address((unsigned long) p->addr);
1337 if (probed_mod) { 1347 if (probed_mod) {
1348 /* Return -ENOENT if fail. */
1349 ret = -ENOENT;
1338 /* 1350 /*
1339 * We must hold a refcount of the probed module while updating 1351 * We must hold a refcount of the probed module while updating
1340 * its code to prohibit unexpected unloading. 1352 * its code to prohibit unexpected unloading.
@@ -1351,6 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p)
1351 module_put(probed_mod); 1363 module_put(probed_mod);
1352 goto fail_with_jump_label; 1364 goto fail_with_jump_label;
1353 } 1365 }
1366 /* ret will be updated by following code */
1354 } 1367 }
1355 preempt_enable(); 1368 preempt_enable();
1356 jump_label_unlock(); 1369 jump_label_unlock();
@@ -1399,7 +1412,7 @@ out:
1399fail_with_jump_label: 1412fail_with_jump_label:
1400 preempt_enable(); 1413 preempt_enable();
1401 jump_label_unlock(); 1414 jump_label_unlock();
1402 return -EINVAL; 1415 return ret;
1403} 1416}
1404EXPORT_SYMBOL_GPL(register_kprobe); 1417EXPORT_SYMBOL_GPL(register_kprobe);
1405 1418
@@ -1686,8 +1699,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1686 1699
1687 if (kretprobe_blacklist_size) { 1700 if (kretprobe_blacklist_size) {
1688 addr = kprobe_addr(&rp->kp); 1701 addr = kprobe_addr(&rp->kp);
1689 if (!addr) 1702 if (IS_ERR(addr))
1690 return -EINVAL; 1703 return PTR_ERR(addr);
1691 1704
1692 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { 1705 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
1693 if (kretprobe_blacklist[i].addr == addr) 1706 if (kretprobe_blacklist[i].addr == addr)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 298c9276dfdb..3956f5149e25 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2468,6 +2468,9 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2468 2468
2469 BUG_ON(usage_bit >= LOCK_USAGE_STATES); 2469 BUG_ON(usage_bit >= LOCK_USAGE_STATES);
2470 2470
2471 if (hlock_class(hlock)->key == &__lockdep_no_validate__)
2472 continue;
2473
2471 if (!mark_lock(curr, hlock, usage_bit)) 2474 if (!mark_lock(curr, hlock, usage_bit))
2472 return 0; 2475 return 0;
2473 } 2476 }
@@ -2478,15 +2481,10 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
2478/* 2481/*
2479 * Hardirqs will be enabled: 2482 * Hardirqs will be enabled:
2480 */ 2483 */
2481void trace_hardirqs_on_caller(unsigned long ip) 2484static void __trace_hardirqs_on_caller(unsigned long ip)
2482{ 2485{
2483 struct task_struct *curr = current; 2486 struct task_struct *curr = current;
2484 2487
2485 time_hardirqs_on(CALLER_ADDR0, ip);
2486
2487 if (unlikely(!debug_locks || current->lockdep_recursion))
2488 return;
2489
2490 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) 2488 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2491 return; 2489 return;
2492 2490
@@ -2502,8 +2500,6 @@ void trace_hardirqs_on_caller(unsigned long ip)
2502 /* we'll do an OFF -> ON transition: */ 2500 /* we'll do an OFF -> ON transition: */
2503 curr->hardirqs_enabled = 1; 2501 curr->hardirqs_enabled = 1;
2504 2502
2505 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2506 return;
2507 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) 2503 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
2508 return; 2504 return;
2509 /* 2505 /*
@@ -2525,6 +2521,21 @@ void trace_hardirqs_on_caller(unsigned long ip)
2525 curr->hardirq_enable_event = ++curr->irq_events; 2521 curr->hardirq_enable_event = ++curr->irq_events;
2526 debug_atomic_inc(hardirqs_on_events); 2522 debug_atomic_inc(hardirqs_on_events);
2527} 2523}
2524
2525void trace_hardirqs_on_caller(unsigned long ip)
2526{
2527 time_hardirqs_on(CALLER_ADDR0, ip);
2528
2529 if (unlikely(!debug_locks || current->lockdep_recursion))
2530 return;
2531
2532 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2533 return;
2534
2535 current->lockdep_recursion = 1;
2536 __trace_hardirqs_on_caller(ip);
2537 current->lockdep_recursion = 0;
2538}
2528EXPORT_SYMBOL(trace_hardirqs_on_caller); 2539EXPORT_SYMBOL(trace_hardirqs_on_caller);
2529 2540
2530void trace_hardirqs_on(void) 2541void trace_hardirqs_on(void)
@@ -2574,7 +2585,7 @@ void trace_softirqs_on(unsigned long ip)
2574{ 2585{
2575 struct task_struct *curr = current; 2586 struct task_struct *curr = current;
2576 2587
2577 if (unlikely(!debug_locks)) 2588 if (unlikely(!debug_locks || current->lockdep_recursion))
2578 return; 2589 return;
2579 2590
2580 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2591 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
@@ -2585,6 +2596,7 @@ void trace_softirqs_on(unsigned long ip)
2585 return; 2596 return;
2586 } 2597 }
2587 2598
2599 current->lockdep_recursion = 1;
2588 /* 2600 /*
2589 * We'll do an OFF -> ON transition: 2601 * We'll do an OFF -> ON transition:
2590 */ 2602 */
@@ -2599,6 +2611,7 @@ void trace_softirqs_on(unsigned long ip)
2599 */ 2611 */
2600 if (curr->hardirqs_enabled) 2612 if (curr->hardirqs_enabled)
2601 mark_held_locks(curr, SOFTIRQ); 2613 mark_held_locks(curr, SOFTIRQ);
2614 current->lockdep_recursion = 0;
2602} 2615}
2603 2616
2604/* 2617/*
@@ -2608,7 +2621,7 @@ void trace_softirqs_off(unsigned long ip)
2608{ 2621{
2609 struct task_struct *curr = current; 2622 struct task_struct *curr = current;
2610 2623
2611 if (unlikely(!debug_locks)) 2624 if (unlikely(!debug_locks || current->lockdep_recursion))
2612 return; 2625 return;
2613 2626
2614 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2627 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
diff --git a/kernel/module.c b/kernel/module.c
index 795bdc7f5c3f..04379f92f843 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -545,9 +545,9 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \
545 mod->field = kstrdup(s, GFP_KERNEL); \ 545 mod->field = kstrdup(s, GFP_KERNEL); \
546} \ 546} \
547static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ 547static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
548 struct module *mod, char *buffer) \ 548 struct module_kobject *mk, char *buffer) \
549{ \ 549{ \
550 return sprintf(buffer, "%s\n", mod->field); \ 550 return sprintf(buffer, "%s\n", mk->mod->field); \
551} \ 551} \
552static int modinfo_##field##_exists(struct module *mod) \ 552static int modinfo_##field##_exists(struct module *mod) \
553{ \ 553{ \
@@ -902,9 +902,9 @@ void symbol_put_addr(void *addr)
902EXPORT_SYMBOL_GPL(symbol_put_addr); 902EXPORT_SYMBOL_GPL(symbol_put_addr);
903 903
904static ssize_t show_refcnt(struct module_attribute *mattr, 904static ssize_t show_refcnt(struct module_attribute *mattr,
905 struct module *mod, char *buffer) 905 struct module_kobject *mk, char *buffer)
906{ 906{
907 return sprintf(buffer, "%u\n", module_refcount(mod)); 907 return sprintf(buffer, "%u\n", module_refcount(mk->mod));
908} 908}
909 909
910static struct module_attribute refcnt = { 910static struct module_attribute refcnt = {
@@ -952,11 +952,11 @@ static inline int module_unload_init(struct module *mod)
952#endif /* CONFIG_MODULE_UNLOAD */ 952#endif /* CONFIG_MODULE_UNLOAD */
953 953
954static ssize_t show_initstate(struct module_attribute *mattr, 954static ssize_t show_initstate(struct module_attribute *mattr,
955 struct module *mod, char *buffer) 955 struct module_kobject *mk, char *buffer)
956{ 956{
957 const char *state = "unknown"; 957 const char *state = "unknown";
958 958
959 switch (mod->state) { 959 switch (mk->mod->state) {
960 case MODULE_STATE_LIVE: 960 case MODULE_STATE_LIVE:
961 state = "live"; 961 state = "live";
962 break; 962 break;
@@ -975,10 +975,27 @@ static struct module_attribute initstate = {
975 .show = show_initstate, 975 .show = show_initstate,
976}; 976};
977 977
978static ssize_t store_uevent(struct module_attribute *mattr,
979 struct module_kobject *mk,
980 const char *buffer, size_t count)
981{
982 enum kobject_action action;
983
984 if (kobject_action_type(buffer, count, &action) == 0)
985 kobject_uevent(&mk->kobj, action);
986 return count;
987}
988
989struct module_attribute module_uevent = {
990 .attr = { .name = "uevent", .mode = 0200 },
991 .store = store_uevent,
992};
993
978static struct module_attribute *modinfo_attrs[] = { 994static struct module_attribute *modinfo_attrs[] = {
979 &modinfo_version, 995 &modinfo_version,
980 &modinfo_srcversion, 996 &modinfo_srcversion,
981 &initstate, 997 &initstate,
998 &module_uevent,
982#ifdef CONFIG_MODULE_UNLOAD 999#ifdef CONFIG_MODULE_UNLOAD
983 &refcnt, 1000 &refcnt,
984#endif 1001#endif
@@ -1187,7 +1204,7 @@ struct module_sect_attrs
1187}; 1204};
1188 1205
1189static ssize_t module_sect_show(struct module_attribute *mattr, 1206static ssize_t module_sect_show(struct module_attribute *mattr,
1190 struct module *mod, char *buf) 1207 struct module_kobject *mk, char *buf)
1191{ 1208{
1192 struct module_sect_attr *sattr = 1209 struct module_sect_attr *sattr =
1193 container_of(mattr, struct module_sect_attr, mattr); 1210 container_of(mattr, struct module_sect_attr, mattr);
@@ -1697,6 +1714,15 @@ static void unset_module_core_ro_nx(struct module *mod) { }
1697static void unset_module_init_ro_nx(struct module *mod) { } 1714static void unset_module_init_ro_nx(struct module *mod) { }
1698#endif 1715#endif
1699 1716
1717void __weak module_free(struct module *mod, void *module_region)
1718{
1719 vfree(module_region);
1720}
1721
1722void __weak module_arch_cleanup(struct module *mod)
1723{
1724}
1725
1700/* Free a module, remove from lists, etc. */ 1726/* Free a module, remove from lists, etc. */
1701static void free_module(struct module *mod) 1727static void free_module(struct module *mod)
1702{ 1728{
@@ -1851,6 +1877,26 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1851 return ret; 1877 return ret;
1852} 1878}
1853 1879
1880int __weak apply_relocate(Elf_Shdr *sechdrs,
1881 const char *strtab,
1882 unsigned int symindex,
1883 unsigned int relsec,
1884 struct module *me)
1885{
1886 pr_err("module %s: REL relocation unsupported\n", me->name);
1887 return -ENOEXEC;
1888}
1889
1890int __weak apply_relocate_add(Elf_Shdr *sechdrs,
1891 const char *strtab,
1892 unsigned int symindex,
1893 unsigned int relsec,
1894 struct module *me)
1895{
1896 pr_err("module %s: RELA relocation unsupported\n", me->name);
1897 return -ENOEXEC;
1898}
1899
1854static int apply_relocations(struct module *mod, const struct load_info *info) 1900static int apply_relocations(struct module *mod, const struct load_info *info)
1855{ 1901{
1856 unsigned int i; 1902 unsigned int i;
@@ -2235,6 +2281,11 @@ static void dynamic_debug_remove(struct _ddebug *debug)
2235 ddebug_remove_module(debug->modname); 2281 ddebug_remove_module(debug->modname);
2236} 2282}
2237 2283
2284void * __weak module_alloc(unsigned long size)
2285{
2286 return size == 0 ? NULL : vmalloc_exec(size);
2287}
2288
2238static void *module_alloc_update_bounds(unsigned long size) 2289static void *module_alloc_update_bounds(unsigned long size)
2239{ 2290{
2240 void *ret = module_alloc(size); 2291 void *ret = module_alloc(size);
@@ -2645,6 +2696,14 @@ static void flush_module_icache(const struct module *mod)
2645 set_fs(old_fs); 2696 set_fs(old_fs);
2646} 2697}
2647 2698
2699int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
2700 Elf_Shdr *sechdrs,
2701 char *secstrings,
2702 struct module *mod)
2703{
2704 return 0;
2705}
2706
2648static struct module *layout_and_allocate(struct load_info *info) 2707static struct module *layout_and_allocate(struct load_info *info)
2649{ 2708{
2650 /* Module within temporary copy. */ 2709 /* Module within temporary copy. */
@@ -2716,6 +2775,13 @@ static void module_deallocate(struct module *mod, struct load_info *info)
2716 module_free(mod, mod->module_core); 2775 module_free(mod, mod->module_core);
2717} 2776}
2718 2777
2778int __weak module_finalize(const Elf_Ehdr *hdr,
2779 const Elf_Shdr *sechdrs,
2780 struct module *me)
2781{
2782 return 0;
2783}
2784
2719static int post_relocation(struct module *mod, const struct load_info *info) 2785static int post_relocation(struct module *mod, const struct load_info *info)
2720{ 2786{
2721 /* Sort exception table now relocations are done. */ 2787 /* Sort exception table now relocations are done. */
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2488ba7eb568..8d7b435806c9 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -525,37 +525,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)
525} 525}
526EXPORT_SYMBOL_GPL(srcu_init_notifier_head); 526EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
527 527
528/**
529 * register_reboot_notifier - Register function to be called at reboot time
530 * @nb: Info about notifier function to be called
531 *
532 * Registers a function with the list of functions
533 * to be called at reboot time.
534 *
535 * Currently always returns zero, as blocking_notifier_chain_register()
536 * always returns zero.
537 */
538int register_reboot_notifier(struct notifier_block *nb)
539{
540 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
541}
542EXPORT_SYMBOL(register_reboot_notifier);
543
544/**
545 * unregister_reboot_notifier - Unregister previously registered reboot notifier
546 * @nb: Hook to be unregistered
547 *
548 * Unregisters a previously registered reboot
549 * notifier function.
550 *
551 * Returns zero on success, or %-ENOENT on failure.
552 */
553int unregister_reboot_notifier(struct notifier_block *nb)
554{
555 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
556}
557EXPORT_SYMBOL(unregister_reboot_notifier);
558
559static ATOMIC_NOTIFIER_HEAD(die_chain); 528static ATOMIC_NOTIFIER_HEAD(die_chain);
560 529
561int notrace __kprobes notify_die(enum die_val val, const char *str, 530int notrace __kprobes notify_die(enum die_val val, const char *str,
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index d6a00f3de15d..9aeab4b98c64 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -271,10 +271,8 @@ out:
271 return err; 271 return err;
272} 272}
273 273
274static int __init nsproxy_cache_init(void) 274int __init nsproxy_cache_init(void)
275{ 275{
276 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); 276 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
277 return 0; 277 return 0;
278} 278}
279
280module_init(nsproxy_cache_init);
diff --git a/kernel/panic.c b/kernel/panic.c
index 69231670eb95..d7bb6974efb5 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -119,6 +119,8 @@ NORET_TYPE void panic(const char * fmt, ...)
119 } 119 }
120 mdelay(PANIC_TIMER_STEP); 120 mdelay(PANIC_TIMER_STEP);
121 } 121 }
122 }
123 if (panic_timeout != 0) {
122 /* 124 /*
123 * This will not be a clean reboot, with everything 125 * This will not be a clean reboot, with everything
124 * shutting down. But if there is a chance of 126 * shutting down. But if there is a chance of
diff --git a/kernel/params.c b/kernel/params.c
index ed72e1330862..22df3e0d142a 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -225,8 +225,8 @@ int parse_args(const char *name,
225 int ret; \ 225 int ret; \
226 \ 226 \
227 ret = strtolfn(val, 0, &l); \ 227 ret = strtolfn(val, 0, &l); \
228 if (ret == -EINVAL || ((type)l != l)) \ 228 if (ret < 0 || ((type)l != l)) \
229 return -EINVAL; \ 229 return ret < 0 ? ret : -EINVAL; \
230 *((type *)kp->arg) = l; \ 230 *((type *)kp->arg) = l; \
231 return 0; \ 231 return 0; \
232 } \ 232 } \
@@ -511,7 +511,7 @@ struct module_param_attrs
511#define to_param_attr(n) container_of(n, struct param_attribute, mattr) 511#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
512 512
513static ssize_t param_attr_show(struct module_attribute *mattr, 513static ssize_t param_attr_show(struct module_attribute *mattr,
514 struct module *mod, char *buf) 514 struct module_kobject *mk, char *buf)
515{ 515{
516 int count; 516 int count;
517 struct param_attribute *attribute = to_param_attr(mattr); 517 struct param_attribute *attribute = to_param_attr(mattr);
@@ -531,7 +531,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
531 531
532/* sysfs always hands a nul-terminated string in buf. We rely on that. */ 532/* sysfs always hands a nul-terminated string in buf. We rely on that. */
533static ssize_t param_attr_store(struct module_attribute *mattr, 533static ssize_t param_attr_store(struct module_attribute *mattr,
534 struct module *owner, 534 struct module_kobject *km,
535 const char *buf, size_t len) 535 const char *buf, size_t len)
536{ 536{
537 int err; 537 int err;
@@ -730,6 +730,10 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
730 mk->kobj.kset = module_kset; 730 mk->kobj.kset = module_kset;
731 err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, 731 err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
732 "%s", name); 732 "%s", name);
733#ifdef CONFIG_MODULES
734 if (!err)
735 err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
736#endif
733 if (err) { 737 if (err) {
734 kobject_put(&mk->kobj); 738 kobject_put(&mk->kobj);
735 printk(KERN_ERR 739 printk(KERN_ERR
@@ -807,7 +811,7 @@ static void __init param_sysfs_builtin(void)
807} 811}
808 812
809ssize_t __modver_version_show(struct module_attribute *mattr, 813ssize_t __modver_version_show(struct module_attribute *mattr,
810 struct module *mod, char *buf) 814 struct module_kobject *mk, char *buf)
811{ 815{
812 struct module_version_attribute *vattr = 816 struct module_version_attribute *vattr =
813 container_of(mattr, struct module_version_attribute, mattr); 817 container_of(mattr, struct module_version_attribute, mattr);
@@ -852,7 +856,7 @@ static ssize_t module_attr_show(struct kobject *kobj,
852 if (!attribute->show) 856 if (!attribute->show)
853 return -EIO; 857 return -EIO;
854 858
855 ret = attribute->show(attribute, mk->mod, buf); 859 ret = attribute->show(attribute, mk, buf);
856 860
857 return ret; 861 return ret;
858} 862}
@@ -871,7 +875,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
871 if (!attribute->store) 875 if (!attribute->store)
872 return -EIO; 876 return -EIO;
873 877
874 ret = attribute->store(attribute, mk->mod, buf, len); 878 ret = attribute->store(attribute, mk, buf, len);
875 879
876 return ret; 880 return ret;
877} 881}
diff --git a/kernel/pid.c b/kernel/pid.c
index 57a8346a270e..e432057f3b21 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -405,7 +405,6 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
405 if (pid) { 405 if (pid) {
406 struct hlist_node *first; 406 struct hlist_node *first;
407 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), 407 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
408 rcu_read_lock_held() ||
409 lockdep_tasklist_lock_is_held()); 408 lockdep_tasklist_lock_is_held());
410 if (first) 409 if (first)
411 result = hlist_entry(first, struct task_struct, pids[(type)].node); 410 result = hlist_entry(first, struct task_struct, pids[(type)].node);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 6824ca7d4d0c..37f05d0f0793 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -74,7 +74,7 @@ static DEFINE_SPINLOCK(pm_qos_lock);
74static struct pm_qos_object null_pm_qos; 74static struct pm_qos_object null_pm_qos;
75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
76static struct pm_qos_object cpu_dma_pm_qos = { 76static struct pm_qos_object cpu_dma_pm_qos = {
77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), 77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests),
78 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
79 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, 80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
@@ -84,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
84 84
85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
86static struct pm_qos_object network_lat_pm_qos = { 86static struct pm_qos_object network_lat_pm_qos = {
87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), 87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests),
88 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
89 .name = "network_latency", 89 .name = "network_latency",
90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, 90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
@@ -95,7 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = {
95 95
96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
97static struct pm_qos_object network_throughput_pm_qos = { 97static struct pm_qos_object network_throughput_pm_qos = {
98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), 98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests),
99 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput", 100 .name = "network_throughput",
101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, 101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 87f4d24b55b0..b1914cb9095c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -193,8 +193,8 @@ config APM_EMULATION
193 notification of APM "events" (e.g. battery status change). 193 notification of APM "events" (e.g. battery status change).
194 194
195 In order to use APM, you will need supporting software. For location 195 In order to use APM, you will need supporting software. For location
196 and more information, read <file:Documentation/power/pm.txt> and the 196 and more information, read <file:Documentation/power/apm-acpi.txt>
197 Battery Powered Linux mini-HOWTO, available from 197 and the Battery Powered Linux mini-HOWTO, available from
198 <http://www.tldp.org/docs.html#howto>. 198 <http://www.tldp.org/docs.html#howto>.
199 199
200 This driver does not spin down disk drives (see the hdparm(8) 200 This driver does not spin down disk drives (see the hdparm(8)
@@ -224,6 +224,10 @@ config PM_OPP
224 implementations a ready to use framework to manage OPPs. 224 implementations a ready to use framework to manage OPPs.
225 For more information, read <file:Documentation/power/opp.txt> 225 For more information, read <file:Documentation/power/opp.txt>
226 226
227config PM_RUNTIME_CLK 227config PM_CLK
228 def_bool y 228 def_bool y
229 depends on PM_RUNTIME && HAVE_CLK 229 depends on PM && HAVE_CLK
230
231config PM_GENERIC_DOMAINS
232 bool
233 depends on PM
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 2981af4ce7cb..6c601f871964 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -37,8 +37,9 @@ EXPORT_SYMBOL_GPL(unregister_pm_notifier);
37 37
38int pm_notifier_call_chain(unsigned long val) 38int pm_notifier_call_chain(unsigned long val)
39{ 39{
40 return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) 40 int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL);
41 == NOTIFY_BAD) ? -EINVAL : 0; 41
42 return notifier_to_errno(ret);
42} 43}
43 44
44/* If set, devices may be suspended and resumed asynchronously. */ 45/* If set, devices may be suspended and resumed asynchronously. */
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 1c41ba215419..b6b71ad2208f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -44,6 +44,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops)
44 suspend_ops = ops; 44 suspend_ops = ops;
45 mutex_unlock(&pm_mutex); 45 mutex_unlock(&pm_mutex);
46} 46}
47EXPORT_SYMBOL_GPL(suspend_set_ops);
47 48
48bool valid_state(suspend_state_t state) 49bool valid_state(suspend_state_t state)
49{ 50{
@@ -65,6 +66,7 @@ int suspend_valid_only_mem(suspend_state_t state)
65{ 66{
66 return state == PM_SUSPEND_MEM; 67 return state == PM_SUSPEND_MEM;
67} 68}
69EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
68 70
69static int suspend_test(int level) 71static int suspend_test(int level)
70{ 72{
@@ -126,12 +128,13 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
126} 128}
127 129
128/** 130/**
129 * suspend_enter - enter the desired system sleep state. 131 * suspend_enter - enter the desired system sleep state.
130 * @state: state to enter 132 * @state: State to enter
133 * @wakeup: Returns information that suspend should not be entered again.
131 * 134 *
132 * This function should be called after devices have been suspended. 135 * This function should be called after devices have been suspended.
133 */ 136 */
134static int suspend_enter(suspend_state_t state) 137static int suspend_enter(suspend_state_t state, bool *wakeup)
135{ 138{
136 int error; 139 int error;
137 140
@@ -165,7 +168,8 @@ static int suspend_enter(suspend_state_t state)
165 168
166 error = syscore_suspend(); 169 error = syscore_suspend();
167 if (!error) { 170 if (!error) {
168 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { 171 *wakeup = pm_wakeup_pending();
172 if (!(suspend_test(TEST_CORE) || *wakeup)) {
169 error = suspend_ops->enter(state); 173 error = suspend_ops->enter(state);
170 events_check_enabled = false; 174 events_check_enabled = false;
171 } 175 }
@@ -199,6 +203,7 @@ static int suspend_enter(suspend_state_t state)
199int suspend_devices_and_enter(suspend_state_t state) 203int suspend_devices_and_enter(suspend_state_t state)
200{ 204{
201 int error; 205 int error;
206 bool wakeup = false;
202 207
203 if (!suspend_ops) 208 if (!suspend_ops)
204 return -ENOSYS; 209 return -ENOSYS;
@@ -220,7 +225,10 @@ int suspend_devices_and_enter(suspend_state_t state)
220 if (suspend_test(TEST_DEVICES)) 225 if (suspend_test(TEST_DEVICES))
221 goto Recover_platform; 226 goto Recover_platform;
222 227
223 error = suspend_enter(state); 228 do {
229 error = suspend_enter(state, &wakeup);
230 } while (!error && !wakeup
231 && suspend_ops->suspend_again && suspend_ops->suspend_again());
224 232
225 Resume_devices: 233 Resume_devices:
226 suspend_test_start(); 234 suspend_test_start();
diff --git a/kernel/printk.c b/kernel/printk.c
index 35185392173f..37dff3429adb 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -782,7 +782,7 @@ static inline int can_use_console(unsigned int cpu)
782static int console_trylock_for_printk(unsigned int cpu) 782static int console_trylock_for_printk(unsigned int cpu)
783 __releases(&logbuf_lock) 783 __releases(&logbuf_lock)
784{ 784{
785 int retval = 0; 785 int retval = 0, wake = 0;
786 786
787 if (console_trylock()) { 787 if (console_trylock()) {
788 retval = 1; 788 retval = 1;
@@ -795,12 +795,14 @@ static int console_trylock_for_printk(unsigned int cpu)
795 */ 795 */
796 if (!can_use_console(cpu)) { 796 if (!can_use_console(cpu)) {
797 console_locked = 0; 797 console_locked = 0;
798 up(&console_sem); 798 wake = 1;
799 retval = 0; 799 retval = 0;
800 } 800 }
801 } 801 }
802 printk_cpu = UINT_MAX; 802 printk_cpu = UINT_MAX;
803 spin_unlock(&logbuf_lock); 803 spin_unlock(&logbuf_lock);
804 if (wake)
805 up(&console_sem);
804 return retval; 806 return retval;
805} 807}
806static const char recursion_bug_msg [] = 808static const char recursion_bug_msg [] =
@@ -1242,7 +1244,7 @@ void console_unlock(void)
1242{ 1244{
1243 unsigned long flags; 1245 unsigned long flags;
1244 unsigned _con_start, _log_end; 1246 unsigned _con_start, _log_end;
1245 unsigned wake_klogd = 0; 1247 unsigned wake_klogd = 0, retry = 0;
1246 1248
1247 if (console_suspended) { 1249 if (console_suspended) {
1248 up(&console_sem); 1250 up(&console_sem);
@@ -1251,6 +1253,7 @@ void console_unlock(void)
1251 1253
1252 console_may_schedule = 0; 1254 console_may_schedule = 0;
1253 1255
1256again:
1254 for ( ; ; ) { 1257 for ( ; ; ) {
1255 spin_lock_irqsave(&logbuf_lock, flags); 1258 spin_lock_irqsave(&logbuf_lock, flags);
1256 wake_klogd |= log_start - log_end; 1259 wake_klogd |= log_start - log_end;
@@ -1271,8 +1274,23 @@ void console_unlock(void)
1271 if (unlikely(exclusive_console)) 1274 if (unlikely(exclusive_console))
1272 exclusive_console = NULL; 1275 exclusive_console = NULL;
1273 1276
1277 spin_unlock(&logbuf_lock);
1278
1274 up(&console_sem); 1279 up(&console_sem);
1280
1281 /*
1282 * Someone could have filled up the buffer again, so re-check if there's
1283 * something to flush. In case we cannot trylock the console_sem again,
1284 * there's a new owner and the console_unlock() from them will do the
1285 * flush, no worries.
1286 */
1287 spin_lock(&logbuf_lock);
1288 if (con_start != log_end)
1289 retry = 1;
1275 spin_unlock_irqrestore(&logbuf_lock, flags); 1290 spin_unlock_irqrestore(&logbuf_lock, flags);
1291 if (retry && console_trylock())
1292 goto again;
1293
1276 if (wake_klogd) 1294 if (wake_klogd)
1277 wake_up_klogd(); 1295 wake_up_klogd();
1278} 1296}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2df115790cd9..9de3ecfd20f9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -23,8 +23,15 @@
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/regset.h> 24#include <linux/regset.h>
25#include <linux/hw_breakpoint.h> 25#include <linux/hw_breakpoint.h>
26#include <linux/cn_proc.h>
26 27
27 28
29static int ptrace_trapping_sleep_fn(void *flags)
30{
31 schedule();
32 return 0;
33}
34
28/* 35/*
29 * ptrace a task: make the debugger its new parent and 36 * ptrace a task: make the debugger its new parent and
30 * move it to the ptrace list. 37 * move it to the ptrace list.
@@ -77,13 +84,20 @@ void __ptrace_unlink(struct task_struct *child)
77 spin_lock(&child->sighand->siglock); 84 spin_lock(&child->sighand->siglock);
78 85
79 /* 86 /*
80 * Reinstate GROUP_STOP_PENDING if group stop is in effect and 87 * Clear all pending traps and TRAPPING. TRAPPING should be
88 * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly.
89 */
90 task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK);
91 task_clear_jobctl_trapping(child);
92
93 /*
94 * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and
81 * @child isn't dead. 95 * @child isn't dead.
82 */ 96 */
83 if (!(child->flags & PF_EXITING) && 97 if (!(child->flags & PF_EXITING) &&
84 (child->signal->flags & SIGNAL_STOP_STOPPED || 98 (child->signal->flags & SIGNAL_STOP_STOPPED ||
85 child->signal->group_stop_count)) 99 child->signal->group_stop_count))
86 child->group_stop |= GROUP_STOP_PENDING; 100 child->jobctl |= JOBCTL_STOP_PENDING;
87 101
88 /* 102 /*
89 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick 103 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
@@ -91,16 +105,30 @@ void __ptrace_unlink(struct task_struct *child)
91 * is in TASK_TRACED; otherwise, we might unduly disrupt 105 * is in TASK_TRACED; otherwise, we might unduly disrupt
92 * TASK_KILLABLE sleeps. 106 * TASK_KILLABLE sleeps.
93 */ 107 */
94 if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child)) 108 if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
95 signal_wake_up(child, task_is_traced(child)); 109 signal_wake_up(child, task_is_traced(child));
96 110
97 spin_unlock(&child->sighand->siglock); 111 spin_unlock(&child->sighand->siglock);
98} 112}
99 113
100/* 114/**
101 * Check that we have indeed attached to the thing.. 115 * ptrace_check_attach - check whether ptracee is ready for ptrace operation
116 * @child: ptracee to check for
117 * @ignore_state: don't check whether @child is currently %TASK_TRACED
118 *
119 * Check whether @child is being ptraced by %current and ready for further
120 * ptrace operations. If @ignore_state is %false, @child also should be in
121 * %TASK_TRACED state and on return the child is guaranteed to be traced
122 * and not executing. If @ignore_state is %true, @child can be in any
123 * state.
124 *
125 * CONTEXT:
126 * Grabs and releases tasklist_lock and @child->sighand->siglock.
127 *
128 * RETURNS:
129 * 0 on success, -ESRCH if %child is not ready.
102 */ 130 */
103int ptrace_check_attach(struct task_struct *child, int kill) 131int ptrace_check_attach(struct task_struct *child, bool ignore_state)
104{ 132{
105 int ret = -ESRCH; 133 int ret = -ESRCH;
106 134
@@ -119,13 +147,14 @@ int ptrace_check_attach(struct task_struct *child, int kill)
119 */ 147 */
120 spin_lock_irq(&child->sighand->siglock); 148 spin_lock_irq(&child->sighand->siglock);
121 WARN_ON_ONCE(task_is_stopped(child)); 149 WARN_ON_ONCE(task_is_stopped(child));
122 if (task_is_traced(child) || kill) 150 if (ignore_state || (task_is_traced(child) &&
151 !(child->jobctl & JOBCTL_LISTENING)))
123 ret = 0; 152 ret = 0;
124 spin_unlock_irq(&child->sighand->siglock); 153 spin_unlock_irq(&child->sighand->siglock);
125 } 154 }
126 read_unlock(&tasklist_lock); 155 read_unlock(&tasklist_lock);
127 156
128 if (!ret && !kill) 157 if (!ret && !ignore_state)
129 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; 158 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
130 159
131 /* All systems go.. */ 160 /* All systems go.. */
@@ -182,11 +211,28 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
182 return !err; 211 return !err;
183} 212}
184 213
185static int ptrace_attach(struct task_struct *task) 214static int ptrace_attach(struct task_struct *task, long request,
215 unsigned long flags)
186{ 216{
187 bool wait_trap = false; 217 bool seize = (request == PTRACE_SEIZE);
188 int retval; 218 int retval;
189 219
220 /*
221 * SEIZE will enable new ptrace behaviors which will be implemented
222 * gradually. SEIZE_DEVEL is used to prevent applications
223 * expecting full SEIZE behaviors trapping on kernel commits which
224 * are still in the process of implementing them.
225 *
226 * Only test programs for new ptrace behaviors being implemented
227 * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
228 *
229 * Once SEIZE behaviors are completely implemented, this flag and
230 * the following test will be removed.
231 */
232 retval = -EIO;
233 if (seize && !(flags & PTRACE_SEIZE_DEVEL))
234 goto out;
235
190 audit_ptrace(task); 236 audit_ptrace(task);
191 237
192 retval = -EPERM; 238 retval = -EPERM;
@@ -218,16 +264,21 @@ static int ptrace_attach(struct task_struct *task)
218 goto unlock_tasklist; 264 goto unlock_tasklist;
219 265
220 task->ptrace = PT_PTRACED; 266 task->ptrace = PT_PTRACED;
267 if (seize)
268 task->ptrace |= PT_SEIZED;
221 if (task_ns_capable(task, CAP_SYS_PTRACE)) 269 if (task_ns_capable(task, CAP_SYS_PTRACE))
222 task->ptrace |= PT_PTRACE_CAP; 270 task->ptrace |= PT_PTRACE_CAP;
223 271
224 __ptrace_link(task, current); 272 __ptrace_link(task, current);
225 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); 273
274 /* SEIZE doesn't trap tracee on attach */
275 if (!seize)
276 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
226 277
227 spin_lock(&task->sighand->siglock); 278 spin_lock(&task->sighand->siglock);
228 279
229 /* 280 /*
230 * If the task is already STOPPED, set GROUP_STOP_PENDING and 281 * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
231 * TRAPPING, and kick it so that it transits to TRACED. TRAPPING 282 * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
232 * will be cleared if the child completes the transition or any 283 * will be cleared if the child completes the transition or any
233 * event which clears the group stop states happens. We'll wait 284 * event which clears the group stop states happens. We'll wait
@@ -243,11 +294,9 @@ static int ptrace_attach(struct task_struct *task)
243 * The following task_is_stopped() test is safe as both transitions 294 * The following task_is_stopped() test is safe as both transitions
244 * in and out of STOPPED are protected by siglock. 295 * in and out of STOPPED are protected by siglock.
245 */ 296 */
246 if (task_is_stopped(task)) { 297 if (task_is_stopped(task) &&
247 task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING; 298 task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
248 signal_wake_up(task, 1); 299 signal_wake_up(task, 1);
249 wait_trap = true;
250 }
251 300
252 spin_unlock(&task->sighand->siglock); 301 spin_unlock(&task->sighand->siglock);
253 302
@@ -257,9 +306,12 @@ unlock_tasklist:
257unlock_creds: 306unlock_creds:
258 mutex_unlock(&task->signal->cred_guard_mutex); 307 mutex_unlock(&task->signal->cred_guard_mutex);
259out: 308out:
260 if (wait_trap) 309 if (!retval) {
261 wait_event(current->signal->wait_chldexit, 310 wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
262 !(task->group_stop & GROUP_STOP_TRAPPING)); 311 ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE);
312 proc_ptrace_connector(task, PTRACE_ATTACH);
313 }
314
263 return retval; 315 return retval;
264} 316}
265 317
@@ -322,25 +374,27 @@ static int ignoring_children(struct sighand_struct *sigh)
322 */ 374 */
323static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) 375static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
324{ 376{
377 bool dead;
378
325 __ptrace_unlink(p); 379 __ptrace_unlink(p);
326 380
327 if (p->exit_state == EXIT_ZOMBIE) { 381 if (p->exit_state != EXIT_ZOMBIE)
328 if (!task_detached(p) && thread_group_empty(p)) { 382 return false;
329 if (!same_thread_group(p->real_parent, tracer)) 383
330 do_notify_parent(p, p->exit_signal); 384 dead = !thread_group_leader(p);
331 else if (ignoring_children(tracer->sighand)) { 385
332 __wake_up_parent(p, tracer); 386 if (!dead && thread_group_empty(p)) {
333 p->exit_signal = -1; 387 if (!same_thread_group(p->real_parent, tracer))
334 } 388 dead = do_notify_parent(p, p->exit_signal);
335 } 389 else if (ignoring_children(tracer->sighand)) {
336 if (task_detached(p)) { 390 __wake_up_parent(p, tracer);
337 /* Mark it as in the process of being reaped. */ 391 dead = true;
338 p->exit_state = EXIT_DEAD;
339 return true;
340 } 392 }
341 } 393 }
342 394 /* Mark it as in the process of being reaped. */
343 return false; 395 if (dead)
396 p->exit_state = EXIT_DEAD;
397 return dead;
344} 398}
345 399
346static int ptrace_detach(struct task_struct *child, unsigned int data) 400static int ptrace_detach(struct task_struct *child, unsigned int data)
@@ -365,6 +419,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
365 } 419 }
366 write_unlock_irq(&tasklist_lock); 420 write_unlock_irq(&tasklist_lock);
367 421
422 proc_ptrace_connector(child, PTRACE_DETACH);
368 if (unlikely(dead)) 423 if (unlikely(dead))
369 release_task(child); 424 release_task(child);
370 425
@@ -611,10 +666,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
611int ptrace_request(struct task_struct *child, long request, 666int ptrace_request(struct task_struct *child, long request,
612 unsigned long addr, unsigned long data) 667 unsigned long addr, unsigned long data)
613{ 668{
669 bool seized = child->ptrace & PT_SEIZED;
614 int ret = -EIO; 670 int ret = -EIO;
615 siginfo_t siginfo; 671 siginfo_t siginfo, *si;
616 void __user *datavp = (void __user *) data; 672 void __user *datavp = (void __user *) data;
617 unsigned long __user *datalp = datavp; 673 unsigned long __user *datalp = datavp;
674 unsigned long flags;
618 675
619 switch (request) { 676 switch (request) {
620 case PTRACE_PEEKTEXT: 677 case PTRACE_PEEKTEXT:
@@ -647,6 +704,62 @@ int ptrace_request(struct task_struct *child, long request,
647 ret = ptrace_setsiginfo(child, &siginfo); 704 ret = ptrace_setsiginfo(child, &siginfo);
648 break; 705 break;
649 706
707 case PTRACE_INTERRUPT:
708 /*
709 * Stop tracee without any side-effect on signal or job
710 * control. At least one trap is guaranteed to happen
711 * after this request. If @child is already trapped, the
712 * current trap is not disturbed and another trap will
713 * happen after the current trap is ended with PTRACE_CONT.
714 *
715 * The actual trap might not be PTRACE_EVENT_STOP trap but
716 * the pending condition is cleared regardless.
717 */
718 if (unlikely(!seized || !lock_task_sighand(child, &flags)))
719 break;
720
721 /*
722 * INTERRUPT doesn't disturb existing trap sans one
723 * exception. If ptracer issued LISTEN for the current
724 * STOP, this INTERRUPT should clear LISTEN and re-trap
725 * tracee into STOP.
726 */
727 if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
728 signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
729
730 unlock_task_sighand(child, &flags);
731 ret = 0;
732 break;
733
734 case PTRACE_LISTEN:
735 /*
736 * Listen for events. Tracee must be in STOP. It's not
737 * resumed per-se but is not considered to be in TRACED by
738 * wait(2) or ptrace(2). If an async event (e.g. group
739 * stop state change) happens, tracee will enter STOP trap
740 * again. Alternatively, ptracer can issue INTERRUPT to
741 * finish listening and re-trap tracee into STOP.
742 */
743 if (unlikely(!seized || !lock_task_sighand(child, &flags)))
744 break;
745
746 si = child->last_siginfo;
747 if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP))
748 break;
749
750 child->jobctl |= JOBCTL_LISTENING;
751
752 /*
753 * If NOTIFY is set, it means event happened between start
754 * of this trap and now. Trigger re-trap immediately.
755 */
756 if (child->jobctl & JOBCTL_TRAP_NOTIFY)
757 signal_wake_up(child, true);
758
759 unlock_task_sighand(child, &flags);
760 ret = 0;
761 break;
762
650 case PTRACE_DETACH: /* detach a process that was attached. */ 763 case PTRACE_DETACH: /* detach a process that was attached. */
651 ret = ptrace_detach(child, data); 764 ret = ptrace_detach(child, data);
652 break; 765 break;
@@ -761,8 +874,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
761 goto out; 874 goto out;
762 } 875 }
763 876
764 if (request == PTRACE_ATTACH) { 877 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
765 ret = ptrace_attach(child); 878 ret = ptrace_attach(child, request, data);
766 /* 879 /*
767 * Some architectures need to do book-keeping after 880 * Some architectures need to do book-keeping after
768 * a ptrace attach. 881 * a ptrace attach.
@@ -772,7 +885,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
772 goto out_put_task_struct; 885 goto out_put_task_struct;
773 } 886 }
774 887
775 ret = ptrace_check_attach(child, request == PTRACE_KILL); 888 ret = ptrace_check_attach(child, request == PTRACE_KILL ||
889 request == PTRACE_INTERRUPT);
776 if (ret < 0) 890 if (ret < 0)
777 goto out_put_task_struct; 891 goto out_put_task_struct;
778 892
@@ -903,8 +1017,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
903 goto out; 1017 goto out;
904 } 1018 }
905 1019
906 if (request == PTRACE_ATTACH) { 1020 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
907 ret = ptrace_attach(child); 1021 ret = ptrace_attach(child, request, data);
908 /* 1022 /*
909 * Some architectures need to do book-keeping after 1023 * Some architectures need to do book-keeping after
910 * a ptrace attach. 1024 * a ptrace attach.
@@ -914,7 +1028,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
914 goto out_put_task_struct; 1028 goto out_put_task_struct;
915 } 1029 }
916 1030
917 ret = ptrace_check_attach(child, request == PTRACE_KILL); 1031 ret = ptrace_check_attach(child, request == PTRACE_KILL ||
1032 request == PTRACE_INTERRUPT);
918 if (!ret) 1033 if (!ret)
919 ret = compat_arch_ptrace(child, request, addr, data); 1034 ret = compat_arch_ptrace(child, request, addr, data);
920 1035
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 7784bd216b6a..ddddb320be61 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -37,7 +37,7 @@
37#include <linux/smp.h> 37#include <linux/smp.h>
38#include <linux/interrupt.h> 38#include <linux/interrupt.h>
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <asm/atomic.h> 40#include <linux/atomic.h>
41#include <linux/bitops.h> 41#include <linux/bitops.h>
42#include <linux/percpu.h> 42#include <linux/percpu.h>
43#include <linux/notifier.h> 43#include <linux/notifier.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 2e138db03382..98f51b13bb7e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -33,7 +33,7 @@
33#include <linux/rcupdate.h> 33#include <linux/rcupdate.h>
34#include <linux/interrupt.h> 34#include <linux/interrupt.h>
35#include <linux/sched.h> 35#include <linux/sched.h>
36#include <asm/atomic.h> 36#include <linux/atomic.h>
37#include <linux/bitops.h> 37#include <linux/bitops.h>
38#include <linux/completion.h> 38#include <linux/completion.h>
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
@@ -941,7 +941,6 @@ static void rcu_torture_timer(unsigned long unused)
941 idx = cur_ops->readlock(); 941 idx = cur_ops->readlock();
942 completed = cur_ops->completed(); 942 completed = cur_ops->completed();
943 p = rcu_dereference_check(rcu_torture_current, 943 p = rcu_dereference_check(rcu_torture_current,
944 rcu_read_lock_held() ||
945 rcu_read_lock_bh_held() || 944 rcu_read_lock_bh_held() ||
946 rcu_read_lock_sched_held() || 945 rcu_read_lock_sched_held() ||
947 srcu_read_lock_held(&srcu_ctl)); 946 srcu_read_lock_held(&srcu_ctl));
@@ -1002,7 +1001,6 @@ rcu_torture_reader(void *arg)
1002 idx = cur_ops->readlock(); 1001 idx = cur_ops->readlock();
1003 completed = cur_ops->completed(); 1002 completed = cur_ops->completed();
1004 p = rcu_dereference_check(rcu_torture_current, 1003 p = rcu_dereference_check(rcu_torture_current,
1005 rcu_read_lock_held() ||
1006 rcu_read_lock_bh_held() || 1004 rcu_read_lock_bh_held() ||
1007 rcu_read_lock_sched_held() || 1005 rcu_read_lock_sched_held() ||
1008 srcu_read_lock_held(&srcu_ctl)); 1006 srcu_read_lock_held(&srcu_ctl));
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4e144876dc68..3b0c0986afc0 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -31,7 +31,7 @@
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/interrupt.h> 32#include <linux/interrupt.h>
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <asm/atomic.h> 34#include <linux/atomic.h>
35#include <linux/bitops.h> 35#include <linux/bitops.h>
36#include <linux/module.h> 36#include <linux/module.h>
37#include <linux/completion.h> 37#include <linux/completion.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index 3ff40178dce7..3b3cedc52592 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -553,6 +553,27 @@ int allocate_resource(struct resource *root, struct resource *new,
553 553
554EXPORT_SYMBOL(allocate_resource); 554EXPORT_SYMBOL(allocate_resource);
555 555
556/**
557 * lookup_resource - find an existing resource by a resource start address
558 * @root: root resource descriptor
559 * @start: resource start address
560 *
561 * Returns a pointer to the resource if found, NULL otherwise
562 */
563struct resource *lookup_resource(struct resource *root, resource_size_t start)
564{
565 struct resource *res;
566
567 read_lock(&resource_lock);
568 for (res = root->child; res; res = res->sibling) {
569 if (res->start == start)
570 break;
571 }
572 read_unlock(&resource_lock);
573
574 return res;
575}
576
556/* 577/*
557 * Insert a resource into the resource tree. If successful, return NULL, 578 * Insert a resource into the resource tree. If successful, return NULL,
558 * otherwise return the conflicting resource (compare to __request_resource()) 579 * otherwise return the conflicting resource (compare to __request_resource())
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index ab449117aaf2..255e1662acdb 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -890,7 +890,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name)
890{ 890{
891 lock->owner = NULL; 891 lock->owner = NULL;
892 raw_spin_lock_init(&lock->wait_lock); 892 raw_spin_lock_init(&lock->wait_lock);
893 plist_head_init_raw(&lock->wait_list, &lock->wait_lock); 893 plist_head_init(&lock->wait_list);
894 894
895 debug_rt_mutex_init(lock, name); 895 debug_rt_mutex_init(lock, name);
896} 896}
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index cae050b05f5e..9f48f3d82e9b 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -11,7 +11,7 @@
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12 12
13#include <asm/system.h> 13#include <asm/system.h>
14#include <asm/atomic.h> 14#include <linux/atomic.h>
15 15
16/* 16/*
17 * lock for reading 17 * lock for reading
@@ -117,15 +117,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
117 117
118EXPORT_SYMBOL(down_read_nested); 118EXPORT_SYMBOL(down_read_nested);
119 119
120void down_read_non_owner(struct rw_semaphore *sem)
121{
122 might_sleep();
123
124 __down_read(sem);
125}
126
127EXPORT_SYMBOL(down_read_non_owner);
128
129void down_write_nested(struct rw_semaphore *sem, int subclass) 120void down_write_nested(struct rw_semaphore *sem, int subclass)
130{ 121{
131 might_sleep(); 122 might_sleep();
@@ -136,13 +127,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
136 127
137EXPORT_SYMBOL(down_write_nested); 128EXPORT_SYMBOL(down_write_nested);
138 129
139void up_read_non_owner(struct rw_semaphore *sem)
140{
141 __up_read(sem);
142}
143
144EXPORT_SYMBOL(up_read_non_owner);
145
146#endif 130#endif
147 131
148 132
diff --git a/kernel/sched.c b/kernel/sched.c
index 31e92aee6242..ccacdbdecf45 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,6 +75,9 @@
75#include <asm/tlb.h> 75#include <asm/tlb.h>
76#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
77#include <asm/mutex.h> 77#include <asm/mutex.h>
78#ifdef CONFIG_PARAVIRT
79#include <asm/paravirt.h>
80#endif
78 81
79#include "sched_cpupri.h" 82#include "sched_cpupri.h"
80#include "workqueue_sched.h" 83#include "workqueue_sched.h"
@@ -124,7 +127,7 @@
124 127
125static inline int rt_policy(int policy) 128static inline int rt_policy(int policy)
126{ 129{
127 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 130 if (policy == SCHED_FIFO || policy == SCHED_RR)
128 return 1; 131 return 1;
129 return 0; 132 return 0;
130} 133}
@@ -422,6 +425,7 @@ struct rt_rq {
422 */ 425 */
423struct root_domain { 426struct root_domain {
424 atomic_t refcount; 427 atomic_t refcount;
428 atomic_t rto_count;
425 struct rcu_head rcu; 429 struct rcu_head rcu;
426 cpumask_var_t span; 430 cpumask_var_t span;
427 cpumask_var_t online; 431 cpumask_var_t online;
@@ -431,7 +435,6 @@ struct root_domain {
431 * one runnable RT task. 435 * one runnable RT task.
432 */ 436 */
433 cpumask_var_t rto_mask; 437 cpumask_var_t rto_mask;
434 atomic_t rto_count;
435 struct cpupri cpupri; 438 struct cpupri cpupri;
436}; 439};
437 440
@@ -528,6 +531,12 @@ struct rq {
528#ifdef CONFIG_IRQ_TIME_ACCOUNTING 531#ifdef CONFIG_IRQ_TIME_ACCOUNTING
529 u64 prev_irq_time; 532 u64 prev_irq_time;
530#endif 533#endif
534#ifdef CONFIG_PARAVIRT
535 u64 prev_steal_time;
536#endif
537#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
538 u64 prev_steal_time_rq;
539#endif
531 540
532 /* calc_load related fields */ 541 /* calc_load related fields */
533 unsigned long calc_load_update; 542 unsigned long calc_load_update;
@@ -581,7 +590,6 @@ static inline int cpu_of(struct rq *rq)
581 590
582#define rcu_dereference_check_sched_domain(p) \ 591#define rcu_dereference_check_sched_domain(p) \
583 rcu_dereference_check((p), \ 592 rcu_dereference_check((p), \
584 rcu_read_lock_held() || \
585 lockdep_is_held(&sched_domains_mutex)) 593 lockdep_is_held(&sched_domains_mutex))
586 594
587/* 595/*
@@ -1568,38 +1576,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1568 return rq->avg_load_per_task; 1576 return rq->avg_load_per_task;
1569} 1577}
1570 1578
1571#ifdef CONFIG_FAIR_GROUP_SCHED
1572
1573/*
1574 * Compute the cpu's hierarchical load factor for each task group.
1575 * This needs to be done in a top-down fashion because the load of a child
1576 * group is a fraction of its parents load.
1577 */
1578static int tg_load_down(struct task_group *tg, void *data)
1579{
1580 unsigned long load;
1581 long cpu = (long)data;
1582
1583 if (!tg->parent) {
1584 load = cpu_rq(cpu)->load.weight;
1585 } else {
1586 load = tg->parent->cfs_rq[cpu]->h_load;
1587 load *= tg->se[cpu]->load.weight;
1588 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1589 }
1590
1591 tg->cfs_rq[cpu]->h_load = load;
1592
1593 return 0;
1594}
1595
1596static void update_h_load(long cpu)
1597{
1598 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1599}
1600
1601#endif
1602
1603#ifdef CONFIG_PREEMPT 1579#ifdef CONFIG_PREEMPT
1604 1580
1605static void double_rq_lock(struct rq *rq1, struct rq *rq2); 1581static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -1953,10 +1929,28 @@ void account_system_vtime(struct task_struct *curr)
1953} 1929}
1954EXPORT_SYMBOL_GPL(account_system_vtime); 1930EXPORT_SYMBOL_GPL(account_system_vtime);
1955 1931
1956static void update_rq_clock_task(struct rq *rq, s64 delta) 1932#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1933
1934#ifdef CONFIG_PARAVIRT
1935static inline u64 steal_ticks(u64 steal)
1957{ 1936{
1958 s64 irq_delta; 1937 if (unlikely(steal > NSEC_PER_SEC))
1938 return div_u64(steal, TICK_NSEC);
1939
1940 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
1941}
1942#endif
1959 1943
1944static void update_rq_clock_task(struct rq *rq, s64 delta)
1945{
1946/*
1947 * In theory, the compile should just see 0 here, and optimize out the call
1948 * to sched_rt_avg_update. But I don't trust it...
1949 */
1950#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
1951 s64 steal = 0, irq_delta = 0;
1952#endif
1953#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1960 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 1954 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1961 1955
1962 /* 1956 /*
@@ -1979,12 +1973,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
1979 1973
1980 rq->prev_irq_time += irq_delta; 1974 rq->prev_irq_time += irq_delta;
1981 delta -= irq_delta; 1975 delta -= irq_delta;
1976#endif
1977#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
1978 if (static_branch((&paravirt_steal_rq_enabled))) {
1979 u64 st;
1980
1981 steal = paravirt_steal_clock(cpu_of(rq));
1982 steal -= rq->prev_steal_time_rq;
1983
1984 if (unlikely(steal > delta))
1985 steal = delta;
1986
1987 st = steal_ticks(steal);
1988 steal = st * TICK_NSEC;
1989
1990 rq->prev_steal_time_rq += steal;
1991
1992 delta -= steal;
1993 }
1994#endif
1995
1982 rq->clock_task += delta; 1996 rq->clock_task += delta;
1983 1997
1984 if (irq_delta && sched_feat(NONIRQ_POWER)) 1998#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
1985 sched_rt_avg_update(rq, irq_delta); 1999 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
2000 sched_rt_avg_update(rq, irq_delta + steal);
2001#endif
1986} 2002}
1987 2003
2004#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1988static int irqtime_account_hi_update(void) 2005static int irqtime_account_hi_update(void)
1989{ 2006{
1990 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2007 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -2019,12 +2036,7 @@ static int irqtime_account_si_update(void)
2019 2036
2020#define sched_clock_irqtime (0) 2037#define sched_clock_irqtime (0)
2021 2038
2022static void update_rq_clock_task(struct rq *rq, s64 delta) 2039#endif
2023{
2024 rq->clock_task += delta;
2025}
2026
2027#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2028 2040
2029#include "sched_idletask.c" 2041#include "sched_idletask.c"
2030#include "sched_fair.c" 2042#include "sched_fair.c"
@@ -2220,7 +2232,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2220 2232
2221 if (task_cpu(p) != new_cpu) { 2233 if (task_cpu(p) != new_cpu) {
2222 p->se.nr_migrations++; 2234 p->se.nr_migrations++;
2223 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); 2235 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
2224 } 2236 }
2225 2237
2226 __set_task_cpu(p, new_cpu); 2238 __set_task_cpu(p, new_cpu);
@@ -2497,7 +2509,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2497 if (p->sched_class->task_woken) 2509 if (p->sched_class->task_woken)
2498 p->sched_class->task_woken(rq, p); 2510 p->sched_class->task_woken(rq, p);
2499 2511
2500 if (unlikely(rq->idle_stamp)) { 2512 if (rq->idle_stamp) {
2501 u64 delta = rq->clock - rq->idle_stamp; 2513 u64 delta = rq->clock - rq->idle_stamp;
2502 u64 max = 2*sysctl_sched_migration_cost; 2514 u64 max = 2*sysctl_sched_migration_cost;
2503 2515
@@ -2886,7 +2898,7 @@ void sched_fork(struct task_struct *p)
2886#if defined(CONFIG_SMP) 2898#if defined(CONFIG_SMP)
2887 p->on_cpu = 0; 2899 p->on_cpu = 0;
2888#endif 2900#endif
2889#ifdef CONFIG_PREEMPT 2901#ifdef CONFIG_PREEMPT_COUNT
2890 /* Want to start with kernel preemption disabled. */ 2902 /* Want to start with kernel preemption disabled. */
2891 task_thread_info(p)->preempt_count = 1; 2903 task_thread_info(p)->preempt_count = 1;
2892#endif 2904#endif
@@ -3877,6 +3889,25 @@ void account_idle_time(cputime_t cputime)
3877 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 3889 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3878} 3890}
3879 3891
3892static __always_inline bool steal_account_process_tick(void)
3893{
3894#ifdef CONFIG_PARAVIRT
3895 if (static_branch(&paravirt_steal_enabled)) {
3896 u64 steal, st = 0;
3897
3898 steal = paravirt_steal_clock(smp_processor_id());
3899 steal -= this_rq()->prev_steal_time;
3900
3901 st = steal_ticks(steal);
3902 this_rq()->prev_steal_time += st * TICK_NSEC;
3903
3904 account_steal_time(st);
3905 return st;
3906 }
3907#endif
3908 return false;
3909}
3910
3880#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3911#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3881 3912
3882#ifdef CONFIG_IRQ_TIME_ACCOUNTING 3913#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3908,6 +3939,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3908 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 3939 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3909 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3940 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3910 3941
3942 if (steal_account_process_tick())
3943 return;
3944
3911 if (irqtime_account_hi_update()) { 3945 if (irqtime_account_hi_update()) {
3912 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3946 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3913 } else if (irqtime_account_si_update()) { 3947 } else if (irqtime_account_si_update()) {
@@ -3961,6 +3995,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
3961 return; 3995 return;
3962 } 3996 }
3963 3997
3998 if (steal_account_process_tick())
3999 return;
4000
3964 if (user_tick) 4001 if (user_tick)
3965 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 4002 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3966 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 4003 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -4338,11 +4375,8 @@ EXPORT_SYMBOL(schedule);
4338 4375
4339static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 4376static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4340{ 4377{
4341 bool ret = false;
4342
4343 rcu_read_lock();
4344 if (lock->owner != owner) 4378 if (lock->owner != owner)
4345 goto fail; 4379 return false;
4346 4380
4347 /* 4381 /*
4348 * Ensure we emit the owner->on_cpu, dereference _after_ checking 4382 * Ensure we emit the owner->on_cpu, dereference _after_ checking
@@ -4352,11 +4386,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4352 */ 4386 */
4353 barrier(); 4387 barrier();
4354 4388
4355 ret = owner->on_cpu; 4389 return owner->on_cpu;
4356fail:
4357 rcu_read_unlock();
4358
4359 return ret;
4360} 4390}
4361 4391
4362/* 4392/*
@@ -4368,21 +4398,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4368 if (!sched_feat(OWNER_SPIN)) 4398 if (!sched_feat(OWNER_SPIN))
4369 return 0; 4399 return 0;
4370 4400
4401 rcu_read_lock();
4371 while (owner_running(lock, owner)) { 4402 while (owner_running(lock, owner)) {
4372 if (need_resched()) 4403 if (need_resched())
4373 return 0; 4404 break;
4374 4405
4375 arch_mutex_cpu_relax(); 4406 arch_mutex_cpu_relax();
4376 } 4407 }
4408 rcu_read_unlock();
4377 4409
4378 /* 4410 /*
4379 * If the owner changed to another task there is likely 4411 * We break out the loop above on need_resched() and when the
4380 * heavy contention, stop spinning. 4412 * owner changed, which is a sign for heavy contention. Return
4413 * success only when lock->owner is NULL.
4381 */ 4414 */
4382 if (lock->owner) 4415 return lock->owner == NULL;
4383 return 0;
4384
4385 return 1;
4386} 4416}
4387#endif 4417#endif
4388 4418
@@ -6589,7 +6619,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6589 break; 6619 break;
6590 } 6620 }
6591 6621
6592 if (!group->cpu_power) { 6622 if (!group->sgp->power) {
6593 printk(KERN_CONT "\n"); 6623 printk(KERN_CONT "\n");
6594 printk(KERN_ERR "ERROR: domain->cpu_power not " 6624 printk(KERN_ERR "ERROR: domain->cpu_power not "
6595 "set\n"); 6625 "set\n");
@@ -6613,9 +6643,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6613 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6643 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6614 6644
6615 printk(KERN_CONT " %s", str); 6645 printk(KERN_CONT " %s", str);
6616 if (group->cpu_power != SCHED_POWER_SCALE) { 6646 if (group->sgp->power != SCHED_POWER_SCALE) {
6617 printk(KERN_CONT " (cpu_power = %d)", 6647 printk(KERN_CONT " (cpu_power = %d)",
6618 group->cpu_power); 6648 group->sgp->power);
6619 } 6649 }
6620 6650
6621 group = group->next; 6651 group = group->next;
@@ -6806,11 +6836,39 @@ static struct root_domain *alloc_rootdomain(void)
6806 return rd; 6836 return rd;
6807} 6837}
6808 6838
6839static void free_sched_groups(struct sched_group *sg, int free_sgp)
6840{
6841 struct sched_group *tmp, *first;
6842
6843 if (!sg)
6844 return;
6845
6846 first = sg;
6847 do {
6848 tmp = sg->next;
6849
6850 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
6851 kfree(sg->sgp);
6852
6853 kfree(sg);
6854 sg = tmp;
6855 } while (sg != first);
6856}
6857
6809static void free_sched_domain(struct rcu_head *rcu) 6858static void free_sched_domain(struct rcu_head *rcu)
6810{ 6859{
6811 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 6860 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6812 if (atomic_dec_and_test(&sd->groups->ref)) 6861
6862 /*
6863 * If its an overlapping domain it has private groups, iterate and
6864 * nuke them all.
6865 */
6866 if (sd->flags & SD_OVERLAP) {
6867 free_sched_groups(sd->groups, 1);
6868 } else if (atomic_dec_and_test(&sd->groups->ref)) {
6869 kfree(sd->groups->sgp);
6813 kfree(sd->groups); 6870 kfree(sd->groups);
6871 }
6814 kfree(sd); 6872 kfree(sd);
6815} 6873}
6816 6874
@@ -6977,6 +7035,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6977struct sd_data { 7035struct sd_data {
6978 struct sched_domain **__percpu sd; 7036 struct sched_domain **__percpu sd;
6979 struct sched_group **__percpu sg; 7037 struct sched_group **__percpu sg;
7038 struct sched_group_power **__percpu sgp;
6980}; 7039};
6981 7040
6982struct s_data { 7041struct s_data {
@@ -6996,15 +7055,73 @@ struct sched_domain_topology_level;
6996typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); 7055typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6997typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 7056typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6998 7057
7058#define SDTL_OVERLAP 0x01
7059
6999struct sched_domain_topology_level { 7060struct sched_domain_topology_level {
7000 sched_domain_init_f init; 7061 sched_domain_init_f init;
7001 sched_domain_mask_f mask; 7062 sched_domain_mask_f mask;
7063 int flags;
7002 struct sd_data data; 7064 struct sd_data data;
7003}; 7065};
7004 7066
7005/* 7067static int
7006 * Assumes the sched_domain tree is fully constructed 7068build_overlap_sched_groups(struct sched_domain *sd, int cpu)
7007 */ 7069{
7070 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
7071 const struct cpumask *span = sched_domain_span(sd);
7072 struct cpumask *covered = sched_domains_tmpmask;
7073 struct sd_data *sdd = sd->private;
7074 struct sched_domain *child;
7075 int i;
7076
7077 cpumask_clear(covered);
7078
7079 for_each_cpu(i, span) {
7080 struct cpumask *sg_span;
7081
7082 if (cpumask_test_cpu(i, covered))
7083 continue;
7084
7085 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7086 GFP_KERNEL, cpu_to_node(i));
7087
7088 if (!sg)
7089 goto fail;
7090
7091 sg_span = sched_group_cpus(sg);
7092
7093 child = *per_cpu_ptr(sdd->sd, i);
7094 if (child->child) {
7095 child = child->child;
7096 cpumask_copy(sg_span, sched_domain_span(child));
7097 } else
7098 cpumask_set_cpu(i, sg_span);
7099
7100 cpumask_or(covered, covered, sg_span);
7101
7102 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
7103 atomic_inc(&sg->sgp->ref);
7104
7105 if (cpumask_test_cpu(cpu, sg_span))
7106 groups = sg;
7107
7108 if (!first)
7109 first = sg;
7110 if (last)
7111 last->next = sg;
7112 last = sg;
7113 last->next = first;
7114 }
7115 sd->groups = groups;
7116
7117 return 0;
7118
7119fail:
7120 free_sched_groups(first, 0);
7121
7122 return -ENOMEM;
7123}
7124
7008static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 7125static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
7009{ 7126{
7010 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 7127 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
@@ -7013,24 +7130,24 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
7013 if (child) 7130 if (child)
7014 cpu = cpumask_first(sched_domain_span(child)); 7131 cpu = cpumask_first(sched_domain_span(child));
7015 7132
7016 if (sg) 7133 if (sg) {
7017 *sg = *per_cpu_ptr(sdd->sg, cpu); 7134 *sg = *per_cpu_ptr(sdd->sg, cpu);
7135 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
7136 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
7137 }
7018 7138
7019 return cpu; 7139 return cpu;
7020} 7140}
7021 7141
7022/* 7142/*
7023 * build_sched_groups takes the cpumask we wish to span, and a pointer
7024 * to a function which identifies what group(along with sched group) a CPU
7025 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
7026 * (due to the fact that we keep track of groups covered with a struct cpumask).
7027 *
7028 * build_sched_groups will build a circular linked list of the groups 7143 * build_sched_groups will build a circular linked list of the groups
7029 * covered by the given span, and will set each group's ->cpumask correctly, 7144 * covered by the given span, and will set each group's ->cpumask correctly,
7030 * and ->cpu_power to 0. 7145 * and ->cpu_power to 0.
7146 *
7147 * Assumes the sched_domain tree is fully constructed
7031 */ 7148 */
7032static void 7149static int
7033build_sched_groups(struct sched_domain *sd) 7150build_sched_groups(struct sched_domain *sd, int cpu)
7034{ 7151{
7035 struct sched_group *first = NULL, *last = NULL; 7152 struct sched_group *first = NULL, *last = NULL;
7036 struct sd_data *sdd = sd->private; 7153 struct sd_data *sdd = sd->private;
@@ -7038,6 +7155,12 @@ build_sched_groups(struct sched_domain *sd)
7038 struct cpumask *covered; 7155 struct cpumask *covered;
7039 int i; 7156 int i;
7040 7157
7158 get_group(cpu, sdd, &sd->groups);
7159 atomic_inc(&sd->groups->ref);
7160
7161 if (cpu != cpumask_first(sched_domain_span(sd)))
7162 return 0;
7163
7041 lockdep_assert_held(&sched_domains_mutex); 7164 lockdep_assert_held(&sched_domains_mutex);
7042 covered = sched_domains_tmpmask; 7165 covered = sched_domains_tmpmask;
7043 7166
@@ -7052,7 +7175,7 @@ build_sched_groups(struct sched_domain *sd)
7052 continue; 7175 continue;
7053 7176
7054 cpumask_clear(sched_group_cpus(sg)); 7177 cpumask_clear(sched_group_cpus(sg));
7055 sg->cpu_power = 0; 7178 sg->sgp->power = 0;
7056 7179
7057 for_each_cpu(j, span) { 7180 for_each_cpu(j, span) {
7058 if (get_group(j, sdd, NULL) != group) 7181 if (get_group(j, sdd, NULL) != group)
@@ -7069,6 +7192,8 @@ build_sched_groups(struct sched_domain *sd)
7069 last = sg; 7192 last = sg;
7070 } 7193 }
7071 last->next = first; 7194 last->next = first;
7195
7196 return 0;
7072} 7197}
7073 7198
7074/* 7199/*
@@ -7083,12 +7208,17 @@ build_sched_groups(struct sched_domain *sd)
7083 */ 7208 */
7084static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7209static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7085{ 7210{
7086 WARN_ON(!sd || !sd->groups); 7211 struct sched_group *sg = sd->groups;
7087 7212
7088 if (cpu != group_first_cpu(sd->groups)) 7213 WARN_ON(!sd || !sg);
7089 return; 7214
7215 do {
7216 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
7217 sg = sg->next;
7218 } while (sg != sd->groups);
7090 7219
7091 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7220 if (cpu != group_first_cpu(sg))
7221 return;
7092 7222
7093 update_group_power(sd, cpu); 7223 update_group_power(sd, cpu);
7094} 7224}
@@ -7209,15 +7339,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7209static void claim_allocations(int cpu, struct sched_domain *sd) 7339static void claim_allocations(int cpu, struct sched_domain *sd)
7210{ 7340{
7211 struct sd_data *sdd = sd->private; 7341 struct sd_data *sdd = sd->private;
7212 struct sched_group *sg = sd->groups;
7213 7342
7214 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 7343 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7215 *per_cpu_ptr(sdd->sd, cpu) = NULL; 7344 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7216 7345
7217 if (cpu == cpumask_first(sched_group_cpus(sg))) { 7346 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
7218 WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
7219 *per_cpu_ptr(sdd->sg, cpu) = NULL; 7347 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7220 } 7348
7349 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
7350 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
7221} 7351}
7222 7352
7223#ifdef CONFIG_SCHED_SMT 7353#ifdef CONFIG_SCHED_SMT
@@ -7242,7 +7372,7 @@ static struct sched_domain_topology_level default_topology[] = {
7242#endif 7372#endif
7243 { sd_init_CPU, cpu_cpu_mask, }, 7373 { sd_init_CPU, cpu_cpu_mask, },
7244#ifdef CONFIG_NUMA 7374#ifdef CONFIG_NUMA
7245 { sd_init_NODE, cpu_node_mask, }, 7375 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
7246 { sd_init_ALLNODES, cpu_allnodes_mask, }, 7376 { sd_init_ALLNODES, cpu_allnodes_mask, },
7247#endif 7377#endif
7248 { NULL, }, 7378 { NULL, },
@@ -7266,9 +7396,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
7266 if (!sdd->sg) 7396 if (!sdd->sg)
7267 return -ENOMEM; 7397 return -ENOMEM;
7268 7398
7399 sdd->sgp = alloc_percpu(struct sched_group_power *);
7400 if (!sdd->sgp)
7401 return -ENOMEM;
7402
7269 for_each_cpu(j, cpu_map) { 7403 for_each_cpu(j, cpu_map) {
7270 struct sched_domain *sd; 7404 struct sched_domain *sd;
7271 struct sched_group *sg; 7405 struct sched_group *sg;
7406 struct sched_group_power *sgp;
7272 7407
7273 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 7408 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7274 GFP_KERNEL, cpu_to_node(j)); 7409 GFP_KERNEL, cpu_to_node(j));
@@ -7283,6 +7418,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
7283 return -ENOMEM; 7418 return -ENOMEM;
7284 7419
7285 *per_cpu_ptr(sdd->sg, j) = sg; 7420 *per_cpu_ptr(sdd->sg, j) = sg;
7421
7422 sgp = kzalloc_node(sizeof(struct sched_group_power),
7423 GFP_KERNEL, cpu_to_node(j));
7424 if (!sgp)
7425 return -ENOMEM;
7426
7427 *per_cpu_ptr(sdd->sgp, j) = sgp;
7286 } 7428 }
7287 } 7429 }
7288 7430
@@ -7298,11 +7440,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
7298 struct sd_data *sdd = &tl->data; 7440 struct sd_data *sdd = &tl->data;
7299 7441
7300 for_each_cpu(j, cpu_map) { 7442 for_each_cpu(j, cpu_map) {
7301 kfree(*per_cpu_ptr(sdd->sd, j)); 7443 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
7444 if (sd && (sd->flags & SD_OVERLAP))
7445 free_sched_groups(sd->groups, 0);
7302 kfree(*per_cpu_ptr(sdd->sg, j)); 7446 kfree(*per_cpu_ptr(sdd->sg, j));
7447 kfree(*per_cpu_ptr(sdd->sgp, j));
7303 } 7448 }
7304 free_percpu(sdd->sd); 7449 free_percpu(sdd->sd);
7305 free_percpu(sdd->sg); 7450 free_percpu(sdd->sg);
7451 free_percpu(sdd->sgp);
7306 } 7452 }
7307} 7453}
7308 7454
@@ -7348,8 +7494,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
7348 struct sched_domain_topology_level *tl; 7494 struct sched_domain_topology_level *tl;
7349 7495
7350 sd = NULL; 7496 sd = NULL;
7351 for (tl = sched_domain_topology; tl->init; tl++) 7497 for (tl = sched_domain_topology; tl->init; tl++) {
7352 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); 7498 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7499 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
7500 sd->flags |= SD_OVERLAP;
7501 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
7502 break;
7503 }
7353 7504
7354 while (sd->child) 7505 while (sd->child)
7355 sd = sd->child; 7506 sd = sd->child;
@@ -7361,13 +7512,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
7361 for_each_cpu(i, cpu_map) { 7512 for_each_cpu(i, cpu_map) {
7362 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 7513 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7363 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 7514 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7364 get_group(i, sd->private, &sd->groups); 7515 if (sd->flags & SD_OVERLAP) {
7365 atomic_inc(&sd->groups->ref); 7516 if (build_overlap_sched_groups(sd, i))
7366 7517 goto error;
7367 if (i != cpumask_first(sched_domain_span(sd))) 7518 } else {
7368 continue; 7519 if (build_sched_groups(sd, i))
7369 7520 goto error;
7370 build_sched_groups(sd); 7521 }
7371 } 7522 }
7372 } 7523 }
7373 7524
@@ -7777,17 +7928,10 @@ int in_sched_functions(unsigned long addr)
7777 && addr < (unsigned long)__sched_text_end); 7928 && addr < (unsigned long)__sched_text_end);
7778} 7929}
7779 7930
7780static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 7931static void init_cfs_rq(struct cfs_rq *cfs_rq)
7781{ 7932{
7782 cfs_rq->tasks_timeline = RB_ROOT; 7933 cfs_rq->tasks_timeline = RB_ROOT;
7783 INIT_LIST_HEAD(&cfs_rq->tasks); 7934 INIT_LIST_HEAD(&cfs_rq->tasks);
7784#ifdef CONFIG_FAIR_GROUP_SCHED
7785 cfs_rq->rq = rq;
7786 /* allow initial update_cfs_load() to truncate */
7787#ifdef CONFIG_SMP
7788 cfs_rq->load_stamp = 1;
7789#endif
7790#endif
7791 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7935 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7792#ifndef CONFIG_64BIT 7936#ifndef CONFIG_64BIT
7793 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 7937 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
@@ -7807,27 +7951,18 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7807 /* delimiter for bitsearch: */ 7951 /* delimiter for bitsearch: */
7808 __set_bit(MAX_RT_PRIO, array->bitmap); 7952 __set_bit(MAX_RT_PRIO, array->bitmap);
7809 7953
7810#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 7954#if defined CONFIG_SMP
7811 rt_rq->highest_prio.curr = MAX_RT_PRIO; 7955 rt_rq->highest_prio.curr = MAX_RT_PRIO;
7812#ifdef CONFIG_SMP
7813 rt_rq->highest_prio.next = MAX_RT_PRIO; 7956 rt_rq->highest_prio.next = MAX_RT_PRIO;
7814#endif
7815#endif
7816#ifdef CONFIG_SMP
7817 rt_rq->rt_nr_migratory = 0; 7957 rt_rq->rt_nr_migratory = 0;
7818 rt_rq->overloaded = 0; 7958 rt_rq->overloaded = 0;
7819 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); 7959 plist_head_init(&rt_rq->pushable_tasks);
7820#endif 7960#endif
7821 7961
7822 rt_rq->rt_time = 0; 7962 rt_rq->rt_time = 0;
7823 rt_rq->rt_throttled = 0; 7963 rt_rq->rt_throttled = 0;
7824 rt_rq->rt_runtime = 0; 7964 rt_rq->rt_runtime = 0;
7825 raw_spin_lock_init(&rt_rq->rt_runtime_lock); 7965 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
7826
7827#ifdef CONFIG_RT_GROUP_SCHED
7828 rt_rq->rt_nr_boosted = 0;
7829 rt_rq->rq = rq;
7830#endif
7831} 7966}
7832 7967
7833#ifdef CONFIG_FAIR_GROUP_SCHED 7968#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7836,11 +7971,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7836 struct sched_entity *parent) 7971 struct sched_entity *parent)
7837{ 7972{
7838 struct rq *rq = cpu_rq(cpu); 7973 struct rq *rq = cpu_rq(cpu);
7839 tg->cfs_rq[cpu] = cfs_rq; 7974
7840 init_cfs_rq(cfs_rq, rq);
7841 cfs_rq->tg = tg; 7975 cfs_rq->tg = tg;
7976 cfs_rq->rq = rq;
7977#ifdef CONFIG_SMP
7978 /* allow initial update_cfs_load() to truncate */
7979 cfs_rq->load_stamp = 1;
7980#endif
7842 7981
7982 tg->cfs_rq[cpu] = cfs_rq;
7843 tg->se[cpu] = se; 7983 tg->se[cpu] = se;
7984
7844 /* se could be NULL for root_task_group */ 7985 /* se could be NULL for root_task_group */
7845 if (!se) 7986 if (!se)
7846 return; 7987 return;
@@ -7863,12 +8004,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7863{ 8004{
7864 struct rq *rq = cpu_rq(cpu); 8005 struct rq *rq = cpu_rq(cpu);
7865 8006
7866 tg->rt_rq[cpu] = rt_rq; 8007 rt_rq->highest_prio.curr = MAX_RT_PRIO;
7867 init_rt_rq(rt_rq, rq); 8008 rt_rq->rt_nr_boosted = 0;
8009 rt_rq->rq = rq;
7868 rt_rq->tg = tg; 8010 rt_rq->tg = tg;
7869 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7870 8011
8012 tg->rt_rq[cpu] = rt_rq;
7871 tg->rt_se[cpu] = rt_se; 8013 tg->rt_se[cpu] = rt_se;
8014
7872 if (!rt_se) 8015 if (!rt_se)
7873 return; 8016 return;
7874 8017
@@ -7950,7 +8093,7 @@ void __init sched_init(void)
7950 rq->nr_running = 0; 8093 rq->nr_running = 0;
7951 rq->calc_load_active = 0; 8094 rq->calc_load_active = 0;
7952 rq->calc_load_update = jiffies + LOAD_FREQ; 8095 rq->calc_load_update = jiffies + LOAD_FREQ;
7953 init_cfs_rq(&rq->cfs, rq); 8096 init_cfs_rq(&rq->cfs);
7954 init_rt_rq(&rq->rt, rq); 8097 init_rt_rq(&rq->rt, rq);
7955#ifdef CONFIG_FAIR_GROUP_SCHED 8098#ifdef CONFIG_FAIR_GROUP_SCHED
7956 root_task_group.shares = root_task_group_load; 8099 root_task_group.shares = root_task_group_load;
@@ -8021,7 +8164,7 @@ void __init sched_init(void)
8021#endif 8164#endif
8022 8165
8023#ifdef CONFIG_RT_MUTEXES 8166#ifdef CONFIG_RT_MUTEXES
8024 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); 8167 plist_head_init(&init_task.pi_waiters);
8025#endif 8168#endif
8026 8169
8027 /* 8170 /*
@@ -8064,7 +8207,7 @@ void __init sched_init(void)
8064 scheduler_running = 1; 8207 scheduler_running = 1;
8065} 8208}
8066 8209
8067#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 8210#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
8068static inline int preempt_count_equals(int preempt_offset) 8211static inline int preempt_count_equals(int preempt_offset)
8069{ 8212{
8070 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8213 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
@@ -8074,7 +8217,6 @@ static inline int preempt_count_equals(int preempt_offset)
8074 8217
8075void __might_sleep(const char *file, int line, int preempt_offset) 8218void __might_sleep(const char *file, int line, int preempt_offset)
8076{ 8219{
8077#ifdef in_atomic
8078 static unsigned long prev_jiffy; /* ratelimiting */ 8220 static unsigned long prev_jiffy; /* ratelimiting */
8079 8221
8080 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 8222 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
@@ -8096,7 +8238,6 @@ void __might_sleep(const char *file, int line, int preempt_offset)
8096 if (irqs_disabled()) 8238 if (irqs_disabled())
8097 print_irqtrace_events(current); 8239 print_irqtrace_events(current);
8098 dump_stack(); 8240 dump_stack();
8099#endif
8100} 8241}
8101EXPORT_SYMBOL(__might_sleep); 8242EXPORT_SYMBOL(__might_sleep);
8102#endif 8243#endif
@@ -8255,6 +8396,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8255 if (!se) 8396 if (!se)
8256 goto err_free_rq; 8397 goto err_free_rq;
8257 8398
8399 init_cfs_rq(cfs_rq);
8258 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 8400 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8259 } 8401 }
8260 8402
@@ -8282,7 +8424,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8282 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); 8424 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8283 raw_spin_unlock_irqrestore(&rq->lock, flags); 8425 raw_spin_unlock_irqrestore(&rq->lock, flags);
8284} 8426}
8285#else /* !CONFG_FAIR_GROUP_SCHED */ 8427#else /* !CONFIG_FAIR_GROUP_SCHED */
8286static inline void free_fair_sched_group(struct task_group *tg) 8428static inline void free_fair_sched_group(struct task_group *tg)
8287{ 8429{
8288} 8430}
@@ -8303,7 +8445,8 @@ static void free_rt_sched_group(struct task_group *tg)
8303{ 8445{
8304 int i; 8446 int i;
8305 8447
8306 destroy_rt_bandwidth(&tg->rt_bandwidth); 8448 if (tg->rt_se)
8449 destroy_rt_bandwidth(&tg->rt_bandwidth);
8307 8450
8308 for_each_possible_cpu(i) { 8451 for_each_possible_cpu(i) {
8309 if (tg->rt_rq) 8452 if (tg->rt_rq)
@@ -8344,6 +8487,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8344 if (!rt_se) 8487 if (!rt_se)
8345 goto err_free_rq; 8488 goto err_free_rq;
8346 8489
8490 init_rt_rq(rt_rq, cpu_rq(i));
8491 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8347 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); 8492 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8348 } 8493 }
8349 8494
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 05577055cfca..c2f0e7248dca 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -13,6 +13,7 @@ struct autogroup {
13 int nice; 13 int nice;
14}; 14};
15 15
16static inline bool task_group_is_autogroup(struct task_group *tg);
16static inline struct task_group * 17static inline struct task_group *
17autogroup_task_group(struct task_struct *p, struct task_group *tg); 18autogroup_task_group(struct task_struct *p, struct task_group *tg);
18 19
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 433491c2dc8f..bc8ee9993814 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -135,14 +135,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
135 return grp->my_q; 135 return grp->my_q;
136} 136}
137 137
138/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
139 * another cpu ('this_cpu')
140 */
141static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
142{
143 return cfs_rq->tg->cfs_rq[this_cpu];
144}
145
146static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 138static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
147{ 139{
148 if (!cfs_rq->on_list) { 140 if (!cfs_rq->on_list) {
@@ -271,11 +263,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
271 return NULL; 263 return NULL;
272} 264}
273 265
274static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
275{
276 return &cpu_rq(this_cpu)->cfs;
277}
278
279static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 266static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
280{ 267{
281} 268}
@@ -334,11 +321,6 @@ static inline int entity_before(struct sched_entity *a,
334 return (s64)(a->vruntime - b->vruntime) < 0; 321 return (s64)(a->vruntime - b->vruntime) < 0;
335} 322}
336 323
337static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
338{
339 return se->vruntime - cfs_rq->min_vruntime;
340}
341
342static void update_min_vruntime(struct cfs_rq *cfs_rq) 324static void update_min_vruntime(struct cfs_rq *cfs_rq)
343{ 325{
344 u64 vruntime = cfs_rq->min_vruntime; 326 u64 vruntime = cfs_rq->min_vruntime;
@@ -372,7 +354,6 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
372 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; 354 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
373 struct rb_node *parent = NULL; 355 struct rb_node *parent = NULL;
374 struct sched_entity *entry; 356 struct sched_entity *entry;
375 s64 key = entity_key(cfs_rq, se);
376 int leftmost = 1; 357 int leftmost = 1;
377 358
378 /* 359 /*
@@ -385,7 +366,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
385 * We dont care about collisions. Nodes with 366 * We dont care about collisions. Nodes with
386 * the same key stay together. 367 * the same key stay together.
387 */ 368 */
388 if (key < entity_key(cfs_rq, entry)) { 369 if (entity_before(se, entry)) {
389 link = &parent->rb_left; 370 link = &parent->rb_left;
390 } else { 371 } else {
391 link = &parent->rb_right; 372 link = &parent->rb_right;
@@ -1336,7 +1317,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1336 } 1317 }
1337 1318
1338 for_each_sched_entity(se) { 1319 for_each_sched_entity(se) {
1339 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1320 cfs_rq = cfs_rq_of(se);
1340 1321
1341 update_cfs_load(cfs_rq, 0); 1322 update_cfs_load(cfs_rq, 0);
1342 update_cfs_shares(cfs_rq); 1323 update_cfs_shares(cfs_rq);
@@ -1370,13 +1351,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1370 */ 1351 */
1371 if (task_sleep && parent_entity(se)) 1352 if (task_sleep && parent_entity(se))
1372 set_next_buddy(parent_entity(se)); 1353 set_next_buddy(parent_entity(se));
1354
1355 /* avoid re-evaluating load for this entity */
1356 se = parent_entity(se);
1373 break; 1357 break;
1374 } 1358 }
1375 flags |= DEQUEUE_SLEEP; 1359 flags |= DEQUEUE_SLEEP;
1376 } 1360 }
1377 1361
1378 for_each_sched_entity(se) { 1362 for_each_sched_entity(se) {
1379 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1363 cfs_rq = cfs_rq_of(se);
1380 1364
1381 update_cfs_load(cfs_rq, 0); 1365 update_cfs_load(cfs_rq, 0);
1382 update_cfs_shares(cfs_rq); 1366 update_cfs_shares(cfs_rq);
@@ -1481,7 +1465,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1481 * effect of the currently running task from the load 1465 * effect of the currently running task from the load
1482 * of the current CPU: 1466 * of the current CPU:
1483 */ 1467 */
1484 rcu_read_lock();
1485 if (sync) { 1468 if (sync) {
1486 tg = task_group(current); 1469 tg = task_group(current);
1487 weight = current->se.load.weight; 1470 weight = current->se.load.weight;
@@ -1517,7 +1500,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1517 balanced = this_eff_load <= prev_eff_load; 1500 balanced = this_eff_load <= prev_eff_load;
1518 } else 1501 } else
1519 balanced = true; 1502 balanced = true;
1520 rcu_read_unlock();
1521 1503
1522 /* 1504 /*
1523 * If the currently running task will sleep within 1505 * If the currently running task will sleep within
@@ -1585,7 +1567,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1585 } 1567 }
1586 1568
1587 /* Adjust by relative CPU power of the group */ 1569 /* Adjust by relative CPU power of the group */
1588 avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power; 1570 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
1589 1571
1590 if (local_group) { 1572 if (local_group) {
1591 this_load = avg_load; 1573 this_load = avg_load;
@@ -1921,8 +1903,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1921 if (!sched_feat(WAKEUP_PREEMPT)) 1903 if (!sched_feat(WAKEUP_PREEMPT))
1922 return; 1904 return;
1923 1905
1924 update_curr(cfs_rq);
1925 find_matching_se(&se, &pse); 1906 find_matching_se(&se, &pse);
1907 update_curr(cfs_rq_of(se));
1926 BUG_ON(!pse); 1908 BUG_ON(!pse);
1927 if (wakeup_preempt_entity(se, pse) == 1) { 1909 if (wakeup_preempt_entity(se, pse) == 1) {
1928 /* 1910 /*
@@ -2231,11 +2213,43 @@ static void update_shares(int cpu)
2231 struct rq *rq = cpu_rq(cpu); 2213 struct rq *rq = cpu_rq(cpu);
2232 2214
2233 rcu_read_lock(); 2215 rcu_read_lock();
2216 /*
2217 * Iterates the task_group tree in a bottom up fashion, see
2218 * list_add_leaf_cfs_rq() for details.
2219 */
2234 for_each_leaf_cfs_rq(rq, cfs_rq) 2220 for_each_leaf_cfs_rq(rq, cfs_rq)
2235 update_shares_cpu(cfs_rq->tg, cpu); 2221 update_shares_cpu(cfs_rq->tg, cpu);
2236 rcu_read_unlock(); 2222 rcu_read_unlock();
2237} 2223}
2238 2224
2225/*
2226 * Compute the cpu's hierarchical load factor for each task group.
2227 * This needs to be done in a top-down fashion because the load of a child
2228 * group is a fraction of its parents load.
2229 */
2230static int tg_load_down(struct task_group *tg, void *data)
2231{
2232 unsigned long load;
2233 long cpu = (long)data;
2234
2235 if (!tg->parent) {
2236 load = cpu_rq(cpu)->load.weight;
2237 } else {
2238 load = tg->parent->cfs_rq[cpu]->h_load;
2239 load *= tg->se[cpu]->load.weight;
2240 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
2241 }
2242
2243 tg->cfs_rq[cpu]->h_load = load;
2244
2245 return 0;
2246}
2247
2248static void update_h_load(long cpu)
2249{
2250 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
2251}
2252
2239static unsigned long 2253static unsigned long
2240load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2254load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2241 unsigned long max_load_move, 2255 unsigned long max_load_move,
@@ -2243,14 +2257,12 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2243 int *all_pinned) 2257 int *all_pinned)
2244{ 2258{
2245 long rem_load_move = max_load_move; 2259 long rem_load_move = max_load_move;
2246 int busiest_cpu = cpu_of(busiest); 2260 struct cfs_rq *busiest_cfs_rq;
2247 struct task_group *tg;
2248 2261
2249 rcu_read_lock(); 2262 rcu_read_lock();
2250 update_h_load(busiest_cpu); 2263 update_h_load(cpu_of(busiest));
2251 2264
2252 list_for_each_entry_rcu(tg, &task_groups, list) { 2265 for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
2253 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
2254 unsigned long busiest_h_load = busiest_cfs_rq->h_load; 2266 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
2255 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 2267 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
2256 u64 rem_load, moved_load; 2268 u64 rem_load, moved_load;
@@ -2631,7 +2643,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2631 power >>= SCHED_POWER_SHIFT; 2643 power >>= SCHED_POWER_SHIFT;
2632 } 2644 }
2633 2645
2634 sdg->cpu_power_orig = power; 2646 sdg->sgp->power_orig = power;
2635 2647
2636 if (sched_feat(ARCH_POWER)) 2648 if (sched_feat(ARCH_POWER))
2637 power *= arch_scale_freq_power(sd, cpu); 2649 power *= arch_scale_freq_power(sd, cpu);
@@ -2647,7 +2659,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2647 power = 1; 2659 power = 1;
2648 2660
2649 cpu_rq(cpu)->cpu_power = power; 2661 cpu_rq(cpu)->cpu_power = power;
2650 sdg->cpu_power = power; 2662 sdg->sgp->power = power;
2651} 2663}
2652 2664
2653static void update_group_power(struct sched_domain *sd, int cpu) 2665static void update_group_power(struct sched_domain *sd, int cpu)
@@ -2665,11 +2677,11 @@ static void update_group_power(struct sched_domain *sd, int cpu)
2665 2677
2666 group = child->groups; 2678 group = child->groups;
2667 do { 2679 do {
2668 power += group->cpu_power; 2680 power += group->sgp->power;
2669 group = group->next; 2681 group = group->next;
2670 } while (group != child->groups); 2682 } while (group != child->groups);
2671 2683
2672 sdg->cpu_power = power; 2684 sdg->sgp->power = power;
2673} 2685}
2674 2686
2675/* 2687/*
@@ -2691,7 +2703,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2691 /* 2703 /*
2692 * If ~90% of the cpu_power is still there, we're good. 2704 * If ~90% of the cpu_power is still there, we're good.
2693 */ 2705 */
2694 if (group->cpu_power * 32 > group->cpu_power_orig * 29) 2706 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
2695 return 1; 2707 return 1;
2696 2708
2697 return 0; 2709 return 0;
@@ -2771,7 +2783,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2771 } 2783 }
2772 2784
2773 /* Adjust by relative CPU power of the group */ 2785 /* Adjust by relative CPU power of the group */
2774 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power; 2786 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
2775 2787
2776 /* 2788 /*
2777 * Consider the group unbalanced when the imbalance is larger 2789 * Consider the group unbalanced when the imbalance is larger
@@ -2788,7 +2800,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2788 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) 2800 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
2789 sgs->group_imb = 1; 2801 sgs->group_imb = 1;
2790 2802
2791 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, 2803 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
2792 SCHED_POWER_SCALE); 2804 SCHED_POWER_SCALE);
2793 if (!sgs->group_capacity) 2805 if (!sgs->group_capacity)
2794 sgs->group_capacity = fix_small_capacity(sd, group); 2806 sgs->group_capacity = fix_small_capacity(sd, group);
@@ -2877,7 +2889,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2877 return; 2889 return;
2878 2890
2879 sds->total_load += sgs.group_load; 2891 sds->total_load += sgs.group_load;
2880 sds->total_pwr += sg->cpu_power; 2892 sds->total_pwr += sg->sgp->power;
2881 2893
2882 /* 2894 /*
2883 * In case the child domain prefers tasks go to siblings 2895 * In case the child domain prefers tasks go to siblings
@@ -2962,7 +2974,7 @@ static int check_asym_packing(struct sched_domain *sd,
2962 if (this_cpu > busiest_cpu) 2974 if (this_cpu > busiest_cpu)
2963 return 0; 2975 return 0;
2964 2976
2965 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, 2977 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
2966 SCHED_POWER_SCALE); 2978 SCHED_POWER_SCALE);
2967 return 1; 2979 return 1;
2968} 2980}
@@ -2993,7 +3005,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2993 3005
2994 scaled_busy_load_per_task = sds->busiest_load_per_task 3006 scaled_busy_load_per_task = sds->busiest_load_per_task
2995 * SCHED_POWER_SCALE; 3007 * SCHED_POWER_SCALE;
2996 scaled_busy_load_per_task /= sds->busiest->cpu_power; 3008 scaled_busy_load_per_task /= sds->busiest->sgp->power;
2997 3009
2998 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 3010 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2999 (scaled_busy_load_per_task * imbn)) { 3011 (scaled_busy_load_per_task * imbn)) {
@@ -3007,28 +3019,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3007 * moving them. 3019 * moving them.
3008 */ 3020 */
3009 3021
3010 pwr_now += sds->busiest->cpu_power * 3022 pwr_now += sds->busiest->sgp->power *
3011 min(sds->busiest_load_per_task, sds->max_load); 3023 min(sds->busiest_load_per_task, sds->max_load);
3012 pwr_now += sds->this->cpu_power * 3024 pwr_now += sds->this->sgp->power *
3013 min(sds->this_load_per_task, sds->this_load); 3025 min(sds->this_load_per_task, sds->this_load);
3014 pwr_now /= SCHED_POWER_SCALE; 3026 pwr_now /= SCHED_POWER_SCALE;
3015 3027
3016 /* Amount of load we'd subtract */ 3028 /* Amount of load we'd subtract */
3017 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / 3029 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
3018 sds->busiest->cpu_power; 3030 sds->busiest->sgp->power;
3019 if (sds->max_load > tmp) 3031 if (sds->max_load > tmp)
3020 pwr_move += sds->busiest->cpu_power * 3032 pwr_move += sds->busiest->sgp->power *
3021 min(sds->busiest_load_per_task, sds->max_load - tmp); 3033 min(sds->busiest_load_per_task, sds->max_load - tmp);
3022 3034
3023 /* Amount of load we'd add */ 3035 /* Amount of load we'd add */
3024 if (sds->max_load * sds->busiest->cpu_power < 3036 if (sds->max_load * sds->busiest->sgp->power <
3025 sds->busiest_load_per_task * SCHED_POWER_SCALE) 3037 sds->busiest_load_per_task * SCHED_POWER_SCALE)
3026 tmp = (sds->max_load * sds->busiest->cpu_power) / 3038 tmp = (sds->max_load * sds->busiest->sgp->power) /
3027 sds->this->cpu_power; 3039 sds->this->sgp->power;
3028 else 3040 else
3029 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / 3041 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
3030 sds->this->cpu_power; 3042 sds->this->sgp->power;
3031 pwr_move += sds->this->cpu_power * 3043 pwr_move += sds->this->sgp->power *
3032 min(sds->this_load_per_task, sds->this_load + tmp); 3044 min(sds->this_load_per_task, sds->this_load + tmp);
3033 pwr_move /= SCHED_POWER_SCALE; 3045 pwr_move /= SCHED_POWER_SCALE;
3034 3046
@@ -3074,7 +3086,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3074 3086
3075 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); 3087 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
3076 3088
3077 load_above_capacity /= sds->busiest->cpu_power; 3089 load_above_capacity /= sds->busiest->sgp->power;
3078 } 3090 }
3079 3091
3080 /* 3092 /*
@@ -3090,8 +3102,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3090 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 3102 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
3091 3103
3092 /* How much load to actually move to equalise the imbalance */ 3104 /* How much load to actually move to equalise the imbalance */
3093 *imbalance = min(max_pull * sds->busiest->cpu_power, 3105 *imbalance = min(max_pull * sds->busiest->sgp->power,
3094 (sds->avg_load - sds->this_load) * sds->this->cpu_power) 3106 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
3095 / SCHED_POWER_SCALE; 3107 / SCHED_POWER_SCALE;
3096 3108
3097 /* 3109 /*
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index be40f7371ee1..2e74677cb040 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,12 +61,14 @@ SCHED_FEAT(LB_BIAS, 1)
61SCHED_FEAT(OWNER_SPIN, 1) 61SCHED_FEAT(OWNER_SPIN, 1)
62 62
63/* 63/*
64 * Decrement CPU power based on irq activity 64 * Decrement CPU power based on time not spent running tasks
65 */ 65 */
66SCHED_FEAT(NONIRQ_POWER, 1) 66SCHED_FEAT(NONTASK_POWER, 1)
67 67
68/* 68/*
69 * Queue remote wakeups on the target CPU and process them 69 * Queue remote wakeups on the target CPU and process them
70 * using the scheduler IPI. Reduces rq->lock contention/bounces. 70 * using the scheduler IPI. Reduces rq->lock contention/bounces.
71 */ 71 */
72SCHED_FEAT(TTWU_QUEUE, 1) 72SCHED_FEAT(TTWU_QUEUE, 1)
73
74SCHED_FEAT(FORCE_SD_OVERLAP, 0)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 10d018212bab..97540f0c9e47 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -185,11 +185,23 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
185 185
186typedef struct task_group *rt_rq_iter_t; 186typedef struct task_group *rt_rq_iter_t;
187 187
188#define for_each_rt_rq(rt_rq, iter, rq) \ 188static inline struct task_group *next_task_group(struct task_group *tg)
189 for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \ 189{
190 (&iter->list != &task_groups) && \ 190 do {
191 (rt_rq = iter->rt_rq[cpu_of(rq)]); \ 191 tg = list_entry_rcu(tg->list.next,
192 iter = list_entry_rcu(iter->list.next, typeof(*iter), list)) 192 typeof(struct task_group), list);
193 } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
194
195 if (&tg->list == &task_groups)
196 tg = NULL;
197
198 return tg;
199}
200
201#define for_each_rt_rq(rt_rq, iter, rq) \
202 for (iter = container_of(&task_groups, typeof(*iter), list); \
203 (iter = next_task_group(iter)) && \
204 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
193 205
194static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) 206static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
195{ 207{
@@ -1126,7 +1138,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1126 1138
1127 rt_rq = &rq->rt; 1139 rt_rq = &rq->rt;
1128 1140
1129 if (unlikely(!rt_rq->rt_nr_running)) 1141 if (!rt_rq->rt_nr_running)
1130 return NULL; 1142 return NULL;
1131 1143
1132 if (rt_rq_throttled(rt_rq)) 1144 if (rt_rq_throttled(rt_rq))
@@ -1548,7 +1560,7 @@ skip:
1548static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) 1560static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1549{ 1561{
1550 /* Try to pull RT tasks here if we lower this rq's prio */ 1562 /* Try to pull RT tasks here if we lower this rq's prio */
1551 if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) 1563 if (rq->rt.highest_prio.curr > prev->prio)
1552 pull_rt_task(rq); 1564 pull_rt_task(rq);
1553} 1565}
1554 1566
diff --git a/kernel/signal.c b/kernel/signal.c
index 415d85d6f6c6..291c9700be75 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -87,7 +87,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
87 /* 87 /*
88 * Tracers may want to know about even ignored signals. 88 * Tracers may want to know about even ignored signals.
89 */ 89 */
90 return !tracehook_consider_ignored_signal(t, sig); 90 return !t->ptrace;
91} 91}
92 92
93/* 93/*
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
124 124
125static int recalc_sigpending_tsk(struct task_struct *t) 125static int recalc_sigpending_tsk(struct task_struct *t)
126{ 126{
127 if ((t->group_stop & GROUP_STOP_PENDING) || 127 if ((t->jobctl & JOBCTL_PENDING_MASK) ||
128 PENDING(&t->pending, &t->blocked) || 128 PENDING(&t->pending, &t->blocked) ||
129 PENDING(&t->signal->shared_pending, &t->blocked)) { 129 PENDING(&t->signal->shared_pending, &t->blocked)) {
130 set_tsk_thread_flag(t, TIF_SIGPENDING); 130 set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -150,9 +150,7 @@ void recalc_sigpending_and_wake(struct task_struct *t)
150 150
151void recalc_sigpending(void) 151void recalc_sigpending(void)
152{ 152{
153 if (unlikely(tracehook_force_sigpending())) 153 if (!recalc_sigpending_tsk(current) && !freezing(current))
154 set_thread_flag(TIF_SIGPENDING);
155 else if (!recalc_sigpending_tsk(current) && !freezing(current))
156 clear_thread_flag(TIF_SIGPENDING); 154 clear_thread_flag(TIF_SIGPENDING);
157 155
158} 156}
@@ -224,47 +222,93 @@ static inline void print_dropped_signal(int sig)
224} 222}
225 223
226/** 224/**
227 * task_clear_group_stop_trapping - clear group stop trapping bit 225 * task_set_jobctl_pending - set jobctl pending bits
228 * @task: target task 226 * @task: target task
227 * @mask: pending bits to set
229 * 228 *
230 * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it 229 * Clear @mask from @task->jobctl. @mask must be subset of
231 * and wake up the ptracer. Note that we don't need any further locking. 230 * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
232 * @task->siglock guarantees that @task->parent points to the ptracer. 231 * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is
232 * cleared. If @task is already being killed or exiting, this function
233 * becomes noop.
234 *
235 * CONTEXT:
236 * Must be called with @task->sighand->siglock held.
237 *
238 * RETURNS:
239 * %true if @mask is set, %false if made noop because @task was dying.
240 */
241bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
242{
243 BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
244 JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
245 BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
246
247 if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
248 return false;
249
250 if (mask & JOBCTL_STOP_SIGMASK)
251 task->jobctl &= ~JOBCTL_STOP_SIGMASK;
252
253 task->jobctl |= mask;
254 return true;
255}
256
257/**
258 * task_clear_jobctl_trapping - clear jobctl trapping bit
259 * @task: target task
260 *
261 * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED.
262 * Clear it and wake up the ptracer. Note that we don't need any further
263 * locking. @task->siglock guarantees that @task->parent points to the
264 * ptracer.
233 * 265 *
234 * CONTEXT: 266 * CONTEXT:
235 * Must be called with @task->sighand->siglock held. 267 * Must be called with @task->sighand->siglock held.
236 */ 268 */
237static void task_clear_group_stop_trapping(struct task_struct *task) 269void task_clear_jobctl_trapping(struct task_struct *task)
238{ 270{
239 if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) { 271 if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
240 task->group_stop &= ~GROUP_STOP_TRAPPING; 272 task->jobctl &= ~JOBCTL_TRAPPING;
241 __wake_up_sync_key(&task->parent->signal->wait_chldexit, 273 wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
242 TASK_UNINTERRUPTIBLE, 1, task);
243 } 274 }
244} 275}
245 276
246/** 277/**
247 * task_clear_group_stop_pending - clear pending group stop 278 * task_clear_jobctl_pending - clear jobctl pending bits
248 * @task: target task 279 * @task: target task
280 * @mask: pending bits to clear
249 * 281 *
250 * Clear group stop states for @task. 282 * Clear @mask from @task->jobctl. @mask must be subset of
283 * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other
284 * STOP bits are cleared together.
285 *
286 * If clearing of @mask leaves no stop or trap pending, this function calls
287 * task_clear_jobctl_trapping().
251 * 288 *
252 * CONTEXT: 289 * CONTEXT:
253 * Must be called with @task->sighand->siglock held. 290 * Must be called with @task->sighand->siglock held.
254 */ 291 */
255void task_clear_group_stop_pending(struct task_struct *task) 292void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
256{ 293{
257 task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME | 294 BUG_ON(mask & ~JOBCTL_PENDING_MASK);
258 GROUP_STOP_DEQUEUED); 295
296 if (mask & JOBCTL_STOP_PENDING)
297 mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;
298
299 task->jobctl &= ~mask;
300
301 if (!(task->jobctl & JOBCTL_PENDING_MASK))
302 task_clear_jobctl_trapping(task);
259} 303}
260 304
261/** 305/**
262 * task_participate_group_stop - participate in a group stop 306 * task_participate_group_stop - participate in a group stop
263 * @task: task participating in a group stop 307 * @task: task participating in a group stop
264 * 308 *
265 * @task has GROUP_STOP_PENDING set and is participating in a group stop. 309 * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
266 * Group stop states are cleared and the group stop count is consumed if 310 * Group stop states are cleared and the group stop count is consumed if
267 * %GROUP_STOP_CONSUME was set. If the consumption completes the group 311 * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group
268 * stop, the appropriate %SIGNAL_* flags are set. 312 * stop, the appropriate %SIGNAL_* flags are set.
269 * 313 *
270 * CONTEXT: 314 * CONTEXT:
@@ -277,11 +321,11 @@ void task_clear_group_stop_pending(struct task_struct *task)
277static bool task_participate_group_stop(struct task_struct *task) 321static bool task_participate_group_stop(struct task_struct *task)
278{ 322{
279 struct signal_struct *sig = task->signal; 323 struct signal_struct *sig = task->signal;
280 bool consume = task->group_stop & GROUP_STOP_CONSUME; 324 bool consume = task->jobctl & JOBCTL_STOP_CONSUME;
281 325
282 WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING)); 326 WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));
283 327
284 task_clear_group_stop_pending(task); 328 task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);
285 329
286 if (!consume) 330 if (!consume)
287 return false; 331 return false;
@@ -449,7 +493,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
449 return 1; 493 return 1;
450 if (handler != SIG_IGN && handler != SIG_DFL) 494 if (handler != SIG_IGN && handler != SIG_DFL)
451 return 0; 495 return 0;
452 return !tracehook_consider_fatal_signal(tsk, sig); 496 /* if ptraced, let the tracer determine */
497 return !tsk->ptrace;
453} 498}
454 499
455/* 500/*
@@ -604,7 +649,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
604 * is to alert stop-signal processing code when another 649 * is to alert stop-signal processing code when another
605 * processor has come along and cleared the flag. 650 * processor has come along and cleared the flag.
606 */ 651 */
607 current->group_stop |= GROUP_STOP_DEQUEUED; 652 current->jobctl |= JOBCTL_STOP_DEQUEUED;
608 } 653 }
609 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { 654 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
610 /* 655 /*
@@ -773,6 +818,32 @@ static int check_kill_permission(int sig, struct siginfo *info,
773 return security_task_kill(t, info, sig, 0); 818 return security_task_kill(t, info, sig, 0);
774} 819}
775 820
821/**
822 * ptrace_trap_notify - schedule trap to notify ptracer
823 * @t: tracee wanting to notify tracer
824 *
825 * This function schedules sticky ptrace trap which is cleared on the next
826 * TRAP_STOP to notify ptracer of an event. @t must have been seized by
827 * ptracer.
828 *
829 * If @t is running, STOP trap will be taken. If trapped for STOP and
830 * ptracer is listening for events, tracee is woken up so that it can
831 * re-trap for the new event. If trapped otherwise, STOP trap will be
832 * eventually taken without returning to userland after the existing traps
833 * are finished by PTRACE_CONT.
834 *
835 * CONTEXT:
836 * Must be called with @task->sighand->siglock held.
837 */
838static void ptrace_trap_notify(struct task_struct *t)
839{
840 WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
841 assert_spin_locked(&t->sighand->siglock);
842
843 task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
844 signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
845}
846
776/* 847/*
777 * Handle magic process-wide effects of stop/continue signals. Unlike 848 * Handle magic process-wide effects of stop/continue signals. Unlike
778 * the signal actions, these happen immediately at signal-generation 849 * the signal actions, these happen immediately at signal-generation
@@ -809,9 +880,12 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
809 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); 880 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
810 t = p; 881 t = p;
811 do { 882 do {
812 task_clear_group_stop_pending(t); 883 task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
813 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); 884 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
814 wake_up_state(t, __TASK_STOPPED); 885 if (likely(!(t->ptrace & PT_SEIZED)))
886 wake_up_state(t, __TASK_STOPPED);
887 else
888 ptrace_trap_notify(t);
815 } while_each_thread(p, t); 889 } while_each_thread(p, t);
816 890
817 /* 891 /*
@@ -908,8 +982,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
908 if (sig_fatal(p, sig) && 982 if (sig_fatal(p, sig) &&
909 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && 983 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
910 !sigismember(&t->real_blocked, sig) && 984 !sigismember(&t->real_blocked, sig) &&
911 (sig == SIGKILL || 985 (sig == SIGKILL || !t->ptrace)) {
912 !tracehook_consider_fatal_signal(t, sig))) {
913 /* 986 /*
914 * This signal will be fatal to the whole group. 987 * This signal will be fatal to the whole group.
915 */ 988 */
@@ -925,7 +998,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
925 signal->group_stop_count = 0; 998 signal->group_stop_count = 0;
926 t = p; 999 t = p;
927 do { 1000 do {
928 task_clear_group_stop_pending(t); 1001 task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
929 sigaddset(&t->pending.signal, SIGKILL); 1002 sigaddset(&t->pending.signal, SIGKILL);
930 signal_wake_up(t, 1); 1003 signal_wake_up(t, 1);
931 } while_each_thread(p, t); 1004 } while_each_thread(p, t);
@@ -1160,7 +1233,7 @@ int zap_other_threads(struct task_struct *p)
1160 p->signal->group_stop_count = 0; 1233 p->signal->group_stop_count = 0;
1161 1234
1162 while_each_thread(p, t) { 1235 while_each_thread(p, t) {
1163 task_clear_group_stop_pending(t); 1236 task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
1164 count++; 1237 count++;
1165 1238
1166 /* Don't bother with already dead threads */ 1239 /* Don't bother with already dead threads */
@@ -1511,22 +1584,22 @@ ret:
1511 * Let a parent know about the death of a child. 1584 * Let a parent know about the death of a child.
1512 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1585 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1513 * 1586 *
1514 * Returns -1 if our parent ignored us and so we've switched to 1587 * Returns true if our parent ignored us and so we've switched to
1515 * self-reaping, or else @sig. 1588 * self-reaping.
1516 */ 1589 */
1517int do_notify_parent(struct task_struct *tsk, int sig) 1590bool do_notify_parent(struct task_struct *tsk, int sig)
1518{ 1591{
1519 struct siginfo info; 1592 struct siginfo info;
1520 unsigned long flags; 1593 unsigned long flags;
1521 struct sighand_struct *psig; 1594 struct sighand_struct *psig;
1522 int ret = sig; 1595 bool autoreap = false;
1523 1596
1524 BUG_ON(sig == -1); 1597 BUG_ON(sig == -1);
1525 1598
1526 /* do_notify_parent_cldstop should have been called instead. */ 1599 /* do_notify_parent_cldstop should have been called instead. */
1527 BUG_ON(task_is_stopped_or_traced(tsk)); 1600 BUG_ON(task_is_stopped_or_traced(tsk));
1528 1601
1529 BUG_ON(!task_ptrace(tsk) && 1602 BUG_ON(!tsk->ptrace &&
1530 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1603 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1531 1604
1532 info.si_signo = sig; 1605 info.si_signo = sig;
@@ -1565,7 +1638,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1565 1638
1566 psig = tsk->parent->sighand; 1639 psig = tsk->parent->sighand;
1567 spin_lock_irqsave(&psig->siglock, flags); 1640 spin_lock_irqsave(&psig->siglock, flags);
1568 if (!task_ptrace(tsk) && sig == SIGCHLD && 1641 if (!tsk->ptrace && sig == SIGCHLD &&
1569 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || 1642 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
1570 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { 1643 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
1571 /* 1644 /*
@@ -1583,16 +1656,16 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1583 * is implementation-defined: we do (if you don't want 1656 * is implementation-defined: we do (if you don't want
1584 * it, just use SIG_IGN instead). 1657 * it, just use SIG_IGN instead).
1585 */ 1658 */
1586 ret = tsk->exit_signal = -1; 1659 autoreap = true;
1587 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) 1660 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
1588 sig = -1; 1661 sig = 0;
1589 } 1662 }
1590 if (valid_signal(sig) && sig > 0) 1663 if (valid_signal(sig) && sig)
1591 __group_send_sig_info(sig, &info, tsk->parent); 1664 __group_send_sig_info(sig, &info, tsk->parent);
1592 __wake_up_parent(tsk, tsk->parent); 1665 __wake_up_parent(tsk, tsk->parent);
1593 spin_unlock_irqrestore(&psig->siglock, flags); 1666 spin_unlock_irqrestore(&psig->siglock, flags);
1594 1667
1595 return ret; 1668 return autoreap;
1596} 1669}
1597 1670
1598/** 1671/**
@@ -1665,7 +1738,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1665 1738
1666static inline int may_ptrace_stop(void) 1739static inline int may_ptrace_stop(void)
1667{ 1740{
1668 if (!likely(task_ptrace(current))) 1741 if (!likely(current->ptrace))
1669 return 0; 1742 return 0;
1670 /* 1743 /*
1671 * Are we in the middle of do_coredump? 1744 * Are we in the middle of do_coredump?
@@ -1694,15 +1767,6 @@ static int sigkill_pending(struct task_struct *tsk)
1694} 1767}
1695 1768
1696/* 1769/*
1697 * Test whether the target task of the usual cldstop notification - the
1698 * real_parent of @child - is in the same group as the ptracer.
1699 */
1700static bool real_parent_is_ptracer(struct task_struct *child)
1701{
1702 return same_thread_group(child->parent, child->real_parent);
1703}
1704
1705/*
1706 * This must be called with current->sighand->siglock held. 1770 * This must be called with current->sighand->siglock held.
1707 * 1771 *
1708 * This should be the path for all ptrace stops. 1772 * This should be the path for all ptrace stops.
@@ -1739,31 +1803,34 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1739 } 1803 }
1740 1804
1741 /* 1805 /*
1742 * If @why is CLD_STOPPED, we're trapping to participate in a group 1806 * We're committing to trapping. TRACED should be visible before
1743 * stop. Do the bookkeeping. Note that if SIGCONT was delievered 1807 * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
1744 * while siglock was released for the arch hook, PENDING could be 1808 * Also, transition to TRACED and updates to ->jobctl should be
1745 * clear now. We act as if SIGCONT is received after TASK_TRACED 1809 * atomic with respect to siglock and should be done after the arch
1746 * is entered - ignore it. 1810 * hook as siglock is released and regrabbed across it.
1747 */ 1811 */
1748 if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING)) 1812 set_current_state(TASK_TRACED);
1749 gstop_done = task_participate_group_stop(current);
1750 1813
1751 current->last_siginfo = info; 1814 current->last_siginfo = info;
1752 current->exit_code = exit_code; 1815 current->exit_code = exit_code;
1753 1816
1754 /* 1817 /*
1755 * TRACED should be visible before TRAPPING is cleared; otherwise, 1818 * If @why is CLD_STOPPED, we're trapping to participate in a group
1756 * the tracer might fail do_wait(). 1819 * stop. Do the bookkeeping. Note that if SIGCONT was delievered
1820 * across siglock relocks since INTERRUPT was scheduled, PENDING
1821 * could be clear now. We act as if SIGCONT is received after
1822 * TASK_TRACED is entered - ignore it.
1757 */ 1823 */
1758 set_current_state(TASK_TRACED); 1824 if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
1825 gstop_done = task_participate_group_stop(current);
1759 1826
1760 /* 1827 /* any trap clears pending STOP trap, STOP trap clears NOTIFY */
1761 * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and 1828 task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
1762 * transition to TASK_TRACED should be atomic with respect to 1829 if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
1763 * siglock. This hsould be done after the arch hook as siglock is 1830 task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);
1764 * released and regrabbed across it. 1831
1765 */ 1832 /* entering a trap, clear TRAPPING */
1766 task_clear_group_stop_trapping(current); 1833 task_clear_jobctl_trapping(current);
1767 1834
1768 spin_unlock_irq(&current->sighand->siglock); 1835 spin_unlock_irq(&current->sighand->siglock);
1769 read_lock(&tasklist_lock); 1836 read_lock(&tasklist_lock);
@@ -1779,7 +1846,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1779 * separately unless they're gonna be duplicates. 1846 * separately unless they're gonna be duplicates.
1780 */ 1847 */
1781 do_notify_parent_cldstop(current, true, why); 1848 do_notify_parent_cldstop(current, true, why);
1782 if (gstop_done && !real_parent_is_ptracer(current)) 1849 if (gstop_done && ptrace_reparented(current))
1783 do_notify_parent_cldstop(current, false, why); 1850 do_notify_parent_cldstop(current, false, why);
1784 1851
1785 /* 1852 /*
@@ -1799,9 +1866,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1799 * 1866 *
1800 * If @gstop_done, the ptracer went away between group stop 1867 * If @gstop_done, the ptracer went away between group stop
1801 * completion and here. During detach, it would have set 1868 * completion and here. During detach, it would have set
1802 * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED 1869 * JOBCTL_STOP_PENDING on us and we'll re-enter
1803 * in do_signal_stop() on return, so notifying the real 1870 * TASK_STOPPED in do_signal_stop() on return, so notifying
1804 * parent of the group stop completion is enough. 1871 * the real parent of the group stop completion is enough.
1805 */ 1872 */
1806 if (gstop_done) 1873 if (gstop_done)
1807 do_notify_parent_cldstop(current, false, why); 1874 do_notify_parent_cldstop(current, false, why);
@@ -1827,6 +1894,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1827 spin_lock_irq(&current->sighand->siglock); 1894 spin_lock_irq(&current->sighand->siglock);
1828 current->last_siginfo = NULL; 1895 current->last_siginfo = NULL;
1829 1896
1897 /* LISTENING can be set only during STOP traps, clear it */
1898 current->jobctl &= ~JOBCTL_LISTENING;
1899
1830 /* 1900 /*
1831 * Queued signals ignored us while we were stopped for tracing. 1901 * Queued signals ignored us while we were stopped for tracing.
1832 * So check for any that we should take before resuming user mode. 1902 * So check for any that we should take before resuming user mode.
@@ -1835,44 +1905,66 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1835 recalc_sigpending_tsk(current); 1905 recalc_sigpending_tsk(current);
1836} 1906}
1837 1907
1838void ptrace_notify(int exit_code) 1908static void ptrace_do_notify(int signr, int exit_code, int why)
1839{ 1909{
1840 siginfo_t info; 1910 siginfo_t info;
1841 1911
1842 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1843
1844 memset(&info, 0, sizeof info); 1912 memset(&info, 0, sizeof info);
1845 info.si_signo = SIGTRAP; 1913 info.si_signo = signr;
1846 info.si_code = exit_code; 1914 info.si_code = exit_code;
1847 info.si_pid = task_pid_vnr(current); 1915 info.si_pid = task_pid_vnr(current);
1848 info.si_uid = current_uid(); 1916 info.si_uid = current_uid();
1849 1917
1850 /* Let the debugger run. */ 1918 /* Let the debugger run. */
1919 ptrace_stop(exit_code, why, 1, &info);
1920}
1921
1922void ptrace_notify(int exit_code)
1923{
1924 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1925
1851 spin_lock_irq(&current->sighand->siglock); 1926 spin_lock_irq(&current->sighand->siglock);
1852 ptrace_stop(exit_code, CLD_TRAPPED, 1, &info); 1927 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
1853 spin_unlock_irq(&current->sighand->siglock); 1928 spin_unlock_irq(&current->sighand->siglock);
1854} 1929}
1855 1930
1856/* 1931/**
1857 * This performs the stopping for SIGSTOP and other stop signals. 1932 * do_signal_stop - handle group stop for SIGSTOP and other stop signals
1858 * We have to stop all threads in the thread group. 1933 * @signr: signr causing group stop if initiating
1859 * Returns non-zero if we've actually stopped and released the siglock. 1934 *
1860 * Returns zero if we didn't stop and still hold the siglock. 1935 * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
1936 * and participate in it. If already set, participate in the existing
1937 * group stop. If participated in a group stop (and thus slept), %true is
1938 * returned with siglock released.
1939 *
1940 * If ptraced, this function doesn't handle stop itself. Instead,
1941 * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
1942 * untouched. The caller must ensure that INTERRUPT trap handling takes
1943 * places afterwards.
1944 *
1945 * CONTEXT:
1946 * Must be called with @current->sighand->siglock held, which is released
1947 * on %true return.
1948 *
1949 * RETURNS:
1950 * %false if group stop is already cancelled or ptrace trap is scheduled.
1951 * %true if participated in group stop.
1861 */ 1952 */
1862static int do_signal_stop(int signr) 1953static bool do_signal_stop(int signr)
1954 __releases(&current->sighand->siglock)
1863{ 1955{
1864 struct signal_struct *sig = current->signal; 1956 struct signal_struct *sig = current->signal;
1865 1957
1866 if (!(current->group_stop & GROUP_STOP_PENDING)) { 1958 if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
1867 unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME; 1959 unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
1868 struct task_struct *t; 1960 struct task_struct *t;
1869 1961
1870 /* signr will be recorded in task->group_stop for retries */ 1962 /* signr will be recorded in task->jobctl for retries */
1871 WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK); 1963 WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);
1872 1964
1873 if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) || 1965 if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
1874 unlikely(signal_group_exit(sig))) 1966 unlikely(signal_group_exit(sig)))
1875 return 0; 1967 return false;
1876 /* 1968 /*
1877 * There is no group stop already in progress. We must 1969 * There is no group stop already in progress. We must
1878 * initiate one now. 1970 * initiate one now.
@@ -1895,28 +1987,32 @@ static int do_signal_stop(int signr)
1895 if (!(sig->flags & SIGNAL_STOP_STOPPED)) 1987 if (!(sig->flags & SIGNAL_STOP_STOPPED))
1896 sig->group_exit_code = signr; 1988 sig->group_exit_code = signr;
1897 else 1989 else
1898 WARN_ON_ONCE(!task_ptrace(current)); 1990 WARN_ON_ONCE(!current->ptrace);
1991
1992 sig->group_stop_count = 0;
1993
1994 if (task_set_jobctl_pending(current, signr | gstop))
1995 sig->group_stop_count++;
1899 1996
1900 current->group_stop &= ~GROUP_STOP_SIGMASK;
1901 current->group_stop |= signr | gstop;
1902 sig->group_stop_count = 1;
1903 for (t = next_thread(current); t != current; 1997 for (t = next_thread(current); t != current;
1904 t = next_thread(t)) { 1998 t = next_thread(t)) {
1905 t->group_stop &= ~GROUP_STOP_SIGMASK;
1906 /* 1999 /*
1907 * Setting state to TASK_STOPPED for a group 2000 * Setting state to TASK_STOPPED for a group
1908 * stop is always done with the siglock held, 2001 * stop is always done with the siglock held,
1909 * so this check has no races. 2002 * so this check has no races.
1910 */ 2003 */
1911 if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) { 2004 if (!task_is_stopped(t) &&
1912 t->group_stop |= signr | gstop; 2005 task_set_jobctl_pending(t, signr | gstop)) {
1913 sig->group_stop_count++; 2006 sig->group_stop_count++;
1914 signal_wake_up(t, 0); 2007 if (likely(!(t->ptrace & PT_SEIZED)))
2008 signal_wake_up(t, 0);
2009 else
2010 ptrace_trap_notify(t);
1915 } 2011 }
1916 } 2012 }
1917 } 2013 }
1918retry: 2014
1919 if (likely(!task_ptrace(current))) { 2015 if (likely(!current->ptrace)) {
1920 int notify = 0; 2016 int notify = 0;
1921 2017
1922 /* 2018 /*
@@ -1947,43 +2043,65 @@ retry:
1947 2043
1948 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2044 /* Now we don't run again until woken by SIGCONT or SIGKILL */
1949 schedule(); 2045 schedule();
1950 2046 return true;
1951 spin_lock_irq(&current->sighand->siglock);
1952 } else { 2047 } else {
1953 ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK, 2048 /*
1954 CLD_STOPPED, 0, NULL); 2049 * While ptraced, group stop is handled by STOP trap.
1955 current->exit_code = 0; 2050 * Schedule it and let the caller deal with it.
2051 */
2052 task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
2053 return false;
1956 } 2054 }
2055}
1957 2056
1958 /* 2057/**
1959 * GROUP_STOP_PENDING could be set if another group stop has 2058 * do_jobctl_trap - take care of ptrace jobctl traps
1960 * started since being woken up or ptrace wants us to transit 2059 *
1961 * between TASK_STOPPED and TRACED. Retry group stop. 2060 * When PT_SEIZED, it's used for both group stop and explicit
1962 */ 2061 * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with
1963 if (current->group_stop & GROUP_STOP_PENDING) { 2062 * accompanying siginfo. If stopped, lower eight bits of exit_code contain
1964 WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK)); 2063 * the stop signal; otherwise, %SIGTRAP.
1965 goto retry; 2064 *
2065 * When !PT_SEIZED, it's used only for group stop trap with stop signal
2066 * number as exit_code and no siginfo.
2067 *
2068 * CONTEXT:
2069 * Must be called with @current->sighand->siglock held, which may be
2070 * released and re-acquired before returning with intervening sleep.
2071 */
2072static void do_jobctl_trap(void)
2073{
2074 struct signal_struct *signal = current->signal;
2075 int signr = current->jobctl & JOBCTL_STOP_SIGMASK;
2076
2077 if (current->ptrace & PT_SEIZED) {
2078 if (!signal->group_stop_count &&
2079 !(signal->flags & SIGNAL_STOP_STOPPED))
2080 signr = SIGTRAP;
2081 WARN_ON_ONCE(!signr);
2082 ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
2083 CLD_STOPPED);
2084 } else {
2085 WARN_ON_ONCE(!signr);
2086 ptrace_stop(signr, CLD_STOPPED, 0, NULL);
2087 current->exit_code = 0;
1966 } 2088 }
1967
1968 /* PTRACE_ATTACH might have raced with task killing, clear trapping */
1969 task_clear_group_stop_trapping(current);
1970
1971 spin_unlock_irq(&current->sighand->siglock);
1972
1973 tracehook_finish_jctl();
1974
1975 return 1;
1976} 2089}
1977 2090
1978static int ptrace_signal(int signr, siginfo_t *info, 2091static int ptrace_signal(int signr, siginfo_t *info,
1979 struct pt_regs *regs, void *cookie) 2092 struct pt_regs *regs, void *cookie)
1980{ 2093{
1981 if (!task_ptrace(current))
1982 return signr;
1983
1984 ptrace_signal_deliver(regs, cookie); 2094 ptrace_signal_deliver(regs, cookie);
1985 2095 /*
1986 /* Let the debugger run. */ 2096 * We do not check sig_kernel_stop(signr) but set this marker
2097 * unconditionally because we do not know whether debugger will
2098 * change signr. This flag has no meaning unless we are going
2099 * to stop after return from ptrace_stop(). In this case it will
2100 * be checked in do_signal_stop(), we should only stop if it was
2101 * not cleared by SIGCONT while we were sleeping. See also the
2102 * comment in dequeue_signal().
2103 */
2104 current->jobctl |= JOBCTL_STOP_DEQUEUED;
1987 ptrace_stop(signr, CLD_TRAPPED, 0, info); 2105 ptrace_stop(signr, CLD_TRAPPED, 0, info);
1988 2106
1989 /* We're back. Did the debugger cancel the sig? */ 2107 /* We're back. Did the debugger cancel the sig? */
@@ -2039,7 +2157,6 @@ relock:
2039 * the CLD_ si_code into SIGNAL_CLD_MASK bits. 2157 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
2040 */ 2158 */
2041 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { 2159 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
2042 struct task_struct *leader;
2043 int why; 2160 int why;
2044 2161
2045 if (signal->flags & SIGNAL_CLD_CONTINUED) 2162 if (signal->flags & SIGNAL_CLD_CONTINUED)
@@ -2060,13 +2177,11 @@ relock:
2060 * a duplicate. 2177 * a duplicate.
2061 */ 2178 */
2062 read_lock(&tasklist_lock); 2179 read_lock(&tasklist_lock);
2063
2064 do_notify_parent_cldstop(current, false, why); 2180 do_notify_parent_cldstop(current, false, why);
2065 2181
2066 leader = current->group_leader; 2182 if (ptrace_reparented(current->group_leader))
2067 if (task_ptrace(leader) && !real_parent_is_ptracer(leader)) 2183 do_notify_parent_cldstop(current->group_leader,
2068 do_notify_parent_cldstop(leader, true, why); 2184 true, why);
2069
2070 read_unlock(&tasklist_lock); 2185 read_unlock(&tasklist_lock);
2071 2186
2072 goto relock; 2187 goto relock;
@@ -2074,37 +2189,31 @@ relock:
2074 2189
2075 for (;;) { 2190 for (;;) {
2076 struct k_sigaction *ka; 2191 struct k_sigaction *ka;
2077 /* 2192
2078 * Tracing can induce an artificial signal and choose sigaction. 2193 if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
2079 * The return value in @signr determines the default action, 2194 do_signal_stop(0))
2080 * but @info->si_signo is the signal number we will report.
2081 */
2082 signr = tracehook_get_signal(current, regs, info, return_ka);
2083 if (unlikely(signr < 0))
2084 goto relock; 2195 goto relock;
2085 if (unlikely(signr != 0))
2086 ka = return_ka;
2087 else {
2088 if (unlikely(current->group_stop &
2089 GROUP_STOP_PENDING) && do_signal_stop(0))
2090 goto relock;
2091 2196
2092 signr = dequeue_signal(current, &current->blocked, 2197 if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
2093 info); 2198 do_jobctl_trap();
2199 spin_unlock_irq(&sighand->siglock);
2200 goto relock;
2201 }
2094 2202
2095 if (!signr) 2203 signr = dequeue_signal(current, &current->blocked, info);
2096 break; /* will return 0 */
2097 2204
2098 if (signr != SIGKILL) { 2205 if (!signr)
2099 signr = ptrace_signal(signr, info, 2206 break; /* will return 0 */
2100 regs, cookie);
2101 if (!signr)
2102 continue;
2103 }
2104 2207
2105 ka = &sighand->action[signr-1]; 2208 if (unlikely(current->ptrace) && signr != SIGKILL) {
2209 signr = ptrace_signal(signr, info,
2210 regs, cookie);
2211 if (!signr)
2212 continue;
2106 } 2213 }
2107 2214
2215 ka = &sighand->action[signr-1];
2216
2108 /* Trace actually delivered signals. */ 2217 /* Trace actually delivered signals. */
2109 trace_signal_deliver(signr, info, ka); 2218 trace_signal_deliver(signr, info, ka);
2110 2219
@@ -2260,7 +2369,7 @@ void exit_signals(struct task_struct *tsk)
2260 signotset(&unblocked); 2369 signotset(&unblocked);
2261 retarget_shared_pending(tsk, &unblocked); 2370 retarget_shared_pending(tsk, &unblocked);
2262 2371
2263 if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) && 2372 if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
2264 task_participate_group_stop(tsk)) 2373 task_participate_group_stop(tsk))
2265 group_stop = CLD_STOPPED; 2374 group_stop = CLD_STOPPED;
2266out: 2375out:
@@ -2993,15 +3102,11 @@ SYSCALL_DEFINE0(sgetmask)
2993 3102
2994SYSCALL_DEFINE1(ssetmask, int, newmask) 3103SYSCALL_DEFINE1(ssetmask, int, newmask)
2995{ 3104{
2996 int old; 3105 int old = current->blocked.sig[0];
2997 3106 sigset_t newset;
2998 spin_lock_irq(&current->sighand->siglock);
2999 old = current->blocked.sig[0];
3000 3107
3001 siginitset(&current->blocked, newmask & ~(sigmask(SIGKILL)| 3108 siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP)));
3002 sigmask(SIGSTOP))); 3109 set_current_blocked(&newset);
3003 recalc_sigpending();
3004 spin_unlock_irq(&current->sighand->siglock);
3005 3110
3006 return old; 3111 return old;
3007} 3112}
@@ -3058,11 +3163,8 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
3058 return -EFAULT; 3163 return -EFAULT;
3059 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 3164 sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3060 3165
3061 spin_lock_irq(&current->sighand->siglock);
3062 current->saved_sigmask = current->blocked; 3166 current->saved_sigmask = current->blocked;
3063 current->blocked = newset; 3167 set_current_blocked(&newset);
3064 recalc_sigpending();
3065 spin_unlock_irq(&current->sighand->siglock);
3066 3168
3067 current->state = TASK_INTERRUPTIBLE; 3169 current->state = TASK_INTERRUPTIBLE;
3068 schedule(); 3170 schedule();
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index eb212f8f8bc8..d20c6983aad9 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -26,12 +26,18 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
26EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
27 27
28/* 28/*
29 * Architectures that do not implement save_stack_trace_tsk get this 29 * Architectures that do not implement save_stack_trace_tsk or
30 * weak alias and a once-per-bootup warning (whenever this facility 30 * save_stack_trace_regs get this weak alias and a once-per-bootup warning
31 * is utilized - for example by procfs): 31 * (whenever this facility is utilized - for example by procfs):
32 */ 32 */
33__weak void 33__weak void
34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) 34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
35{ 35{
36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n"); 36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n");
37} 37}
38
39__weak void
40save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
41{
42 WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n");
43}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index e3516b29076c..ba5070ce5765 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -19,7 +19,7 @@
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21 21
22#include <asm/atomic.h> 22#include <linux/atomic.h>
23 23
24/* 24/*
25 * Structure to determine completion condition and record errors. May 25 * Structure to determine completion condition and record errors. May
@@ -136,10 +136,11 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
136static DEFINE_MUTEX(stop_cpus_mutex); 136static DEFINE_MUTEX(stop_cpus_mutex);
137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); 137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
138 138
139int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) 139static void queue_stop_cpus_work(const struct cpumask *cpumask,
140 cpu_stop_fn_t fn, void *arg,
141 struct cpu_stop_done *done)
140{ 142{
141 struct cpu_stop_work *work; 143 struct cpu_stop_work *work;
142 struct cpu_stop_done done;
143 unsigned int cpu; 144 unsigned int cpu;
144 145
145 /* initialize works and done */ 146 /* initialize works and done */
@@ -147,9 +148,8 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
147 work = &per_cpu(stop_cpus_work, cpu); 148 work = &per_cpu(stop_cpus_work, cpu);
148 work->fn = fn; 149 work->fn = fn;
149 work->arg = arg; 150 work->arg = arg;
150 work->done = &done; 151 work->done = done;
151 } 152 }
152 cpu_stop_init_done(&done, cpumask_weight(cpumask));
153 153
154 /* 154 /*
155 * Disable preemption while queueing to avoid getting 155 * Disable preemption while queueing to avoid getting
@@ -161,7 +161,15 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), 161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
162 &per_cpu(stop_cpus_work, cpu)); 162 &per_cpu(stop_cpus_work, cpu));
163 preempt_enable(); 163 preempt_enable();
164}
164 165
166static int __stop_cpus(const struct cpumask *cpumask,
167 cpu_stop_fn_t fn, void *arg)
168{
169 struct cpu_stop_done done;
170
171 cpu_stop_init_done(&done, cpumask_weight(cpumask));
172 queue_stop_cpus_work(cpumask, fn, arg, &done);
165 wait_for_completion(&done.completion); 173 wait_for_completion(&done.completion);
166 return done.executed ? done.ret : -ENOENT; 174 return done.executed ? done.ret : -ENOENT;
167} 175}
@@ -431,8 +439,15 @@ static int stop_machine_cpu_stop(void *data)
431 struct stop_machine_data *smdata = data; 439 struct stop_machine_data *smdata = data;
432 enum stopmachine_state curstate = STOPMACHINE_NONE; 440 enum stopmachine_state curstate = STOPMACHINE_NONE;
433 int cpu = smp_processor_id(), err = 0; 441 int cpu = smp_processor_id(), err = 0;
442 unsigned long flags;
434 bool is_active; 443 bool is_active;
435 444
445 /*
446 * When called from stop_machine_from_inactive_cpu(), irq might
447 * already be disabled. Save the state and restore it on exit.
448 */
449 local_save_flags(flags);
450
436 if (!smdata->active_cpus) 451 if (!smdata->active_cpus)
437 is_active = cpu == cpumask_first(cpu_online_mask); 452 is_active = cpu == cpumask_first(cpu_online_mask);
438 else 453 else
@@ -460,7 +475,7 @@ static int stop_machine_cpu_stop(void *data)
460 } 475 }
461 } while (curstate != STOPMACHINE_EXIT); 476 } while (curstate != STOPMACHINE_EXIT);
462 477
463 local_irq_enable(); 478 local_irq_restore(flags);
464 return err; 479 return err;
465} 480}
466 481
@@ -487,4 +502,57 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
487} 502}
488EXPORT_SYMBOL_GPL(stop_machine); 503EXPORT_SYMBOL_GPL(stop_machine);
489 504
505/**
506 * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU
507 * @fn: the function to run
508 * @data: the data ptr for the @fn()
509 * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
510 *
511 * This is identical to stop_machine() but can be called from a CPU which
512 * is not active. The local CPU is in the process of hotplug (so no other
513 * CPU hotplug can start) and not marked active and doesn't have enough
514 * context to sleep.
515 *
516 * This function provides stop_machine() functionality for such state by
517 * using busy-wait for synchronization and executing @fn directly for local
518 * CPU.
519 *
520 * CONTEXT:
521 * Local CPU is inactive. Temporarily stops all active CPUs.
522 *
523 * RETURNS:
524 * 0 if all executions of @fn returned 0, any non zero return value if any
525 * returned non zero.
526 */
527int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
528 const struct cpumask *cpus)
529{
530 struct stop_machine_data smdata = { .fn = fn, .data = data,
531 .active_cpus = cpus };
532 struct cpu_stop_done done;
533 int ret;
534
535 /* Local CPU must be inactive and CPU hotplug in progress. */
536 BUG_ON(cpu_active(raw_smp_processor_id()));
537 smdata.num_threads = num_active_cpus() + 1; /* +1 for local */
538
539 /* No proper task established and can't sleep - busy wait for lock. */
540 while (!mutex_trylock(&stop_cpus_mutex))
541 cpu_relax();
542
543 /* Schedule work on other CPUs and execute directly for local CPU */
544 set_state(&smdata, STOPMACHINE_PREPARE);
545 cpu_stop_init_done(&done, num_active_cpus());
546 queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata,
547 &done);
548 ret = stop_machine_cpu_stop(&smdata);
549
550 /* Busy wait for completion. */
551 while (!completion_done(&done.completion))
552 cpu_relax();
553
554 mutex_unlock(&stop_cpus_mutex);
555 return ret ?: done.ret;
556}
557
490#endif /* CONFIG_STOP_MACHINE */ 558#endif /* CONFIG_STOP_MACHINE */
diff --git a/kernel/sys.c b/kernel/sys.c
index e4128b278f23..a101ba36c444 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
11#include <linux/notifier.h>
12#include <linux/reboot.h> 11#include <linux/reboot.h>
13#include <linux/prctl.h> 12#include <linux/prctl.h>
14#include <linux/highuid.h> 13#include <linux/highuid.h>
@@ -320,6 +319,37 @@ void kernel_restart_prepare(char *cmd)
320} 319}
321 320
322/** 321/**
322 * register_reboot_notifier - Register function to be called at reboot time
323 * @nb: Info about notifier function to be called
324 *
325 * Registers a function with the list of functions
326 * to be called at reboot time.
327 *
328 * Currently always returns zero, as blocking_notifier_chain_register()
329 * always returns zero.
330 */
331int register_reboot_notifier(struct notifier_block *nb)
332{
333 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
334}
335EXPORT_SYMBOL(register_reboot_notifier);
336
337/**
338 * unregister_reboot_notifier - Unregister previously registered reboot notifier
339 * @nb: Hook to be unregistered
340 *
341 * Unregisters a previously registered reboot
342 * notifier function.
343 *
344 * Returns zero on success, or %-ENOENT on failure.
345 */
346int unregister_reboot_notifier(struct notifier_block *nb)
347{
348 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
349}
350EXPORT_SYMBOL(unregister_reboot_notifier);
351
352/**
323 * kernel_restart - reboot the system 353 * kernel_restart - reboot the system
324 * @cmd: pointer to buffer containing command to execute for restart 354 * @cmd: pointer to buffer containing command to execute for restart
325 * or %NULL 355 * or %NULL
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f175d98bd355..11d65b531e50 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1590,16 +1590,11 @@ void sysctl_head_get(struct ctl_table_header *head)
1590 spin_unlock(&sysctl_lock); 1590 spin_unlock(&sysctl_lock);
1591} 1591}
1592 1592
1593static void free_head(struct rcu_head *rcu)
1594{
1595 kfree(container_of(rcu, struct ctl_table_header, rcu));
1596}
1597
1598void sysctl_head_put(struct ctl_table_header *head) 1593void sysctl_head_put(struct ctl_table_header *head)
1599{ 1594{
1600 spin_lock(&sysctl_lock); 1595 spin_lock(&sysctl_lock);
1601 if (!--head->count) 1596 if (!--head->count)
1602 call_rcu(&head->rcu, free_head); 1597 kfree_rcu(head, rcu);
1603 spin_unlock(&sysctl_lock); 1598 spin_unlock(&sysctl_lock);
1604} 1599}
1605 1600
@@ -1971,10 +1966,10 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1971 start_unregistering(header); 1966 start_unregistering(header);
1972 if (!--header->parent->count) { 1967 if (!--header->parent->count) {
1973 WARN_ON(1); 1968 WARN_ON(1);
1974 call_rcu(&header->parent->rcu, free_head); 1969 kfree_rcu(header->parent, rcu);
1975 } 1970 }
1976 if (!--header->count) 1971 if (!--header->count)
1977 call_rcu(&header->rcu, free_head); 1972 kfree_rcu(header, rcu);
1978 spin_unlock(&sysctl_lock); 1973 spin_unlock(&sysctl_lock);
1979} 1974}
1980 1975
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index fc0f22005417..d1db2880d1cf 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -28,7 +28,7 @@
28#include <linux/fs.h> 28#include <linux/fs.h>
29#include <linux/file.h> 29#include <linux/file.h>
30#include <net/genetlink.h> 30#include <net/genetlink.h>
31#include <asm/atomic.h> 31#include <linux/atomic.h>
32 32
33/* 33/*
34 * Maximum length of a cpumask that can be specified in 34 * Maximum length of a cpumask that can be specified in
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 342408cf68dd..2b021b0e8507 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -604,6 +604,12 @@ static struct timespec timekeeping_suspend_time;
604 */ 604 */
605static void __timekeeping_inject_sleeptime(struct timespec *delta) 605static void __timekeeping_inject_sleeptime(struct timespec *delta)
606{ 606{
607 if (!timespec_valid(delta)) {
608 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
609 "sleep delta value!\n");
610 return;
611 }
612
607 xtime = timespec_add(xtime, *delta); 613 xtime = timespec_add(xtime, *delta);
608 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); 614 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
609 total_sleep_time = timespec_add(total_sleep_time, *delta); 615 total_sleep_time = timespec_add(total_sleep_time, *delta);
@@ -686,12 +692,34 @@ static void timekeeping_resume(void)
686static int timekeeping_suspend(void) 692static int timekeeping_suspend(void)
687{ 693{
688 unsigned long flags; 694 unsigned long flags;
695 struct timespec delta, delta_delta;
696 static struct timespec old_delta;
689 697
690 read_persistent_clock(&timekeeping_suspend_time); 698 read_persistent_clock(&timekeeping_suspend_time);
691 699
692 write_seqlock_irqsave(&xtime_lock, flags); 700 write_seqlock_irqsave(&xtime_lock, flags);
693 timekeeping_forward_now(); 701 timekeeping_forward_now();
694 timekeeping_suspended = 1; 702 timekeeping_suspended = 1;
703
704 /*
705 * To avoid drift caused by repeated suspend/resumes,
706 * which each can add ~1 second drift error,
707 * try to compensate so the difference in system time
708 * and persistent_clock time stays close to constant.
709 */
710 delta = timespec_sub(xtime, timekeeping_suspend_time);
711 delta_delta = timespec_sub(delta, old_delta);
712 if (abs(delta_delta.tv_sec) >= 2) {
713 /*
714 * if delta_delta is too large, assume time correction
715 * has occured and set old_delta to the current delta.
716 */
717 old_delta = delta;
718 } else {
719 /* Otherwise try to adjust old_system to compensate */
720 timekeeping_suspend_time =
721 timespec_add(timekeeping_suspend_time, delta_delta);
722 }
695 write_sequnlock_irqrestore(&xtime_lock, flags); 723 write_sequnlock_irqrestore(&xtime_lock, flags);
696 724
697 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 725 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 908038f57440..c3e4575e7829 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -32,7 +32,6 @@
32 32
33#include <trace/events/sched.h> 33#include <trace/events/sched.h>
34 34
35#include <asm/ftrace.h>
36#include <asm/setup.h> 35#include <asm/setup.h>
37 36
38#include "trace_output.h" 37#include "trace_output.h"
@@ -82,14 +81,14 @@ static int ftrace_disabled __read_mostly;
82 81
83static DEFINE_MUTEX(ftrace_lock); 82static DEFINE_MUTEX(ftrace_lock);
84 83
85static struct ftrace_ops ftrace_list_end __read_mostly = 84static struct ftrace_ops ftrace_list_end __read_mostly = {
86{
87 .func = ftrace_stub, 85 .func = ftrace_stub,
88}; 86};
89 87
90static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; 88static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
91static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 89static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
92ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 90ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
91static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;
93ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 92ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
94ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 93ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
95static struct ftrace_ops global_ops; 94static struct ftrace_ops global_ops;
@@ -148,9 +147,11 @@ void clear_ftrace_function(void)
148{ 147{
149 ftrace_trace_function = ftrace_stub; 148 ftrace_trace_function = ftrace_stub;
150 __ftrace_trace_function = ftrace_stub; 149 __ftrace_trace_function = ftrace_stub;
150 __ftrace_trace_function_delay = ftrace_stub;
151 ftrace_pid_function = ftrace_stub; 151 ftrace_pid_function = ftrace_stub;
152} 152}
153 153
154#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
154#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 155#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
155/* 156/*
156 * For those archs that do not test ftrace_trace_stop in their 157 * For those archs that do not test ftrace_trace_stop in their
@@ -210,7 +211,12 @@ static void update_ftrace_function(void)
210#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 211#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
211 ftrace_trace_function = func; 212 ftrace_trace_function = func;
212#else 213#else
214#ifdef CONFIG_DYNAMIC_FTRACE
215 /* do not update till all functions have been modified */
216 __ftrace_trace_function_delay = func;
217#else
213 __ftrace_trace_function = func; 218 __ftrace_trace_function = func;
219#endif
214 ftrace_trace_function = ftrace_test_stop_func; 220 ftrace_trace_function = ftrace_test_stop_func;
215#endif 221#endif
216} 222}
@@ -785,8 +791,7 @@ static void unregister_ftrace_profiler(void)
785 unregister_ftrace_graph(); 791 unregister_ftrace_graph();
786} 792}
787#else 793#else
788static struct ftrace_ops ftrace_profile_ops __read_mostly = 794static struct ftrace_ops ftrace_profile_ops __read_mostly = {
789{
790 .func = function_profile_call, 795 .func = function_profile_call,
791}; 796};
792 797
@@ -806,19 +811,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
806 size_t cnt, loff_t *ppos) 811 size_t cnt, loff_t *ppos)
807{ 812{
808 unsigned long val; 813 unsigned long val;
809 char buf[64]; /* big enough to hold a number */
810 int ret; 814 int ret;
811 815
812 if (cnt >= sizeof(buf)) 816 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
813 return -EINVAL; 817 if (ret)
814
815 if (copy_from_user(&buf, ubuf, cnt))
816 return -EFAULT;
817
818 buf[cnt] = 0;
819
820 ret = strict_strtoul(buf, 10, &val);
821 if (ret < 0)
822 return ret; 818 return ret;
823 819
824 val = !!val; 820 val = !!val;
@@ -1182,8 +1178,14 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1182 return NULL; 1178 return NULL;
1183} 1179}
1184 1180
1181static void
1182ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash);
1183static void
1184ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash);
1185
1185static int 1186static int
1186ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) 1187ftrace_hash_move(struct ftrace_ops *ops, int enable,
1188 struct ftrace_hash **dst, struct ftrace_hash *src)
1187{ 1189{
1188 struct ftrace_func_entry *entry; 1190 struct ftrace_func_entry *entry;
1189 struct hlist_node *tp, *tn; 1191 struct hlist_node *tp, *tn;
@@ -1193,9 +1195,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1193 unsigned long key; 1195 unsigned long key;
1194 int size = src->count; 1196 int size = src->count;
1195 int bits = 0; 1197 int bits = 0;
1198 int ret;
1196 int i; 1199 int i;
1197 1200
1198 /* 1201 /*
1202 * Remove the current set, update the hash and add
1203 * them back.
1204 */
1205 ftrace_hash_rec_disable(ops, enable);
1206
1207 /*
1199 * If the new source is empty, just free dst and assign it 1208 * If the new source is empty, just free dst and assign it
1200 * the empty_hash. 1209 * the empty_hash.
1201 */ 1210 */
@@ -1215,9 +1224,10 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1215 if (bits > FTRACE_HASH_MAX_BITS) 1224 if (bits > FTRACE_HASH_MAX_BITS)
1216 bits = FTRACE_HASH_MAX_BITS; 1225 bits = FTRACE_HASH_MAX_BITS;
1217 1226
1227 ret = -ENOMEM;
1218 new_hash = alloc_ftrace_hash(bits); 1228 new_hash = alloc_ftrace_hash(bits);
1219 if (!new_hash) 1229 if (!new_hash)
1220 return -ENOMEM; 1230 goto out;
1221 1231
1222 size = 1 << src->size_bits; 1232 size = 1 << src->size_bits;
1223 for (i = 0; i < size; i++) { 1233 for (i = 0; i < size; i++) {
@@ -1236,7 +1246,16 @@ ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
1236 rcu_assign_pointer(*dst, new_hash); 1246 rcu_assign_pointer(*dst, new_hash);
1237 free_ftrace_hash_rcu(old_hash); 1247 free_ftrace_hash_rcu(old_hash);
1238 1248
1239 return 0; 1249 ret = 0;
1250 out:
1251 /*
1252 * Enable regardless of ret:
1253 * On success, we enable the new hash.
1254 * On failure, we re-enable the original hash.
1255 */
1256 ftrace_hash_rec_enable(ops, enable);
1257
1258 return ret;
1240} 1259}
1241 1260
1242/* 1261/*
@@ -1596,6 +1615,12 @@ static int __ftrace_modify_code(void *data)
1596{ 1615{
1597 int *command = data; 1616 int *command = data;
1598 1617
1618 /*
1619 * Do not call function tracer while we update the code.
1620 * We are in stop machine, no worrying about races.
1621 */
1622 function_trace_stop++;
1623
1599 if (*command & FTRACE_ENABLE_CALLS) 1624 if (*command & FTRACE_ENABLE_CALLS)
1600 ftrace_replace_code(1); 1625 ftrace_replace_code(1);
1601 else if (*command & FTRACE_DISABLE_CALLS) 1626 else if (*command & FTRACE_DISABLE_CALLS)
@@ -1609,6 +1634,18 @@ static int __ftrace_modify_code(void *data)
1609 else if (*command & FTRACE_STOP_FUNC_RET) 1634 else if (*command & FTRACE_STOP_FUNC_RET)
1610 ftrace_disable_ftrace_graph_caller(); 1635 ftrace_disable_ftrace_graph_caller();
1611 1636
1637#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1638 /*
1639 * For archs that call ftrace_test_stop_func(), we must
1640 * wait till after we update all the function callers
1641 * before we update the callback. This keeps different
1642 * ops that record different functions from corrupting
1643 * each other.
1644 */
1645 __ftrace_trace_function = __ftrace_trace_function_delay;
1646#endif
1647 function_trace_stop--;
1648
1612 return 0; 1649 return 0;
1613} 1650}
1614 1651
@@ -1744,10 +1781,36 @@ static cycle_t ftrace_update_time;
1744static unsigned long ftrace_update_cnt; 1781static unsigned long ftrace_update_cnt;
1745unsigned long ftrace_update_tot_cnt; 1782unsigned long ftrace_update_tot_cnt;
1746 1783
1784static int ops_traces_mod(struct ftrace_ops *ops)
1785{
1786 struct ftrace_hash *hash;
1787
1788 hash = ops->filter_hash;
1789 return !!(!hash || !hash->count);
1790}
1791
1747static int ftrace_update_code(struct module *mod) 1792static int ftrace_update_code(struct module *mod)
1748{ 1793{
1749 struct dyn_ftrace *p; 1794 struct dyn_ftrace *p;
1750 cycle_t start, stop; 1795 cycle_t start, stop;
1796 unsigned long ref = 0;
1797
1798 /*
1799 * When adding a module, we need to check if tracers are
1800 * currently enabled and if they are set to trace all functions.
1801 * If they are, we need to enable the module functions as well
1802 * as update the reference counts for those function records.
1803 */
1804 if (mod) {
1805 struct ftrace_ops *ops;
1806
1807 for (ops = ftrace_ops_list;
1808 ops != &ftrace_list_end; ops = ops->next) {
1809 if (ops->flags & FTRACE_OPS_FL_ENABLED &&
1810 ops_traces_mod(ops))
1811 ref++;
1812 }
1813 }
1751 1814
1752 start = ftrace_now(raw_smp_processor_id()); 1815 start = ftrace_now(raw_smp_processor_id());
1753 ftrace_update_cnt = 0; 1816 ftrace_update_cnt = 0;
@@ -1760,7 +1823,7 @@ static int ftrace_update_code(struct module *mod)
1760 1823
1761 p = ftrace_new_addrs; 1824 p = ftrace_new_addrs;
1762 ftrace_new_addrs = p->newlist; 1825 ftrace_new_addrs = p->newlist;
1763 p->flags = 0L; 1826 p->flags = ref;
1764 1827
1765 /* 1828 /*
1766 * Do the initial record conversion from mcount jump 1829 * Do the initial record conversion from mcount jump
@@ -1783,7 +1846,7 @@ static int ftrace_update_code(struct module *mod)
1783 * conversion puts the module to the correct state, thus 1846 * conversion puts the module to the correct state, thus
1784 * passing the ftrace_make_call check. 1847 * passing the ftrace_make_call check.
1785 */ 1848 */
1786 if (ftrace_start_up) { 1849 if (ftrace_start_up && ref) {
1787 int failed = __ftrace_replace_code(p, 1); 1850 int failed = __ftrace_replace_code(p, 1);
1788 if (failed) { 1851 if (failed) {
1789 ftrace_bug(failed, p->ip); 1852 ftrace_bug(failed, p->ip);
@@ -2407,10 +2470,9 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
2407 */ 2470 */
2408 2471
2409static int 2472static int
2410ftrace_mod_callback(char *func, char *cmd, char *param, int enable) 2473ftrace_mod_callback(struct ftrace_hash *hash,
2474 char *func, char *cmd, char *param, int enable)
2411{ 2475{
2412 struct ftrace_ops *ops = &global_ops;
2413 struct ftrace_hash *hash;
2414 char *mod; 2476 char *mod;
2415 int ret = -EINVAL; 2477 int ret = -EINVAL;
2416 2478
@@ -2430,11 +2492,6 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
2430 if (!strlen(mod)) 2492 if (!strlen(mod))
2431 return ret; 2493 return ret;
2432 2494
2433 if (enable)
2434 hash = ops->filter_hash;
2435 else
2436 hash = ops->notrace_hash;
2437
2438 ret = ftrace_match_module_records(hash, func, mod); 2495 ret = ftrace_match_module_records(hash, func, mod);
2439 if (!ret) 2496 if (!ret)
2440 ret = -EINVAL; 2497 ret = -EINVAL;
@@ -2760,7 +2817,7 @@ static int ftrace_process_regex(struct ftrace_hash *hash,
2760 mutex_lock(&ftrace_cmd_mutex); 2817 mutex_lock(&ftrace_cmd_mutex);
2761 list_for_each_entry(p, &ftrace_commands, list) { 2818 list_for_each_entry(p, &ftrace_commands, list) {
2762 if (strcmp(p->name, command) == 0) { 2819 if (strcmp(p->name, command) == 0) {
2763 ret = p->func(func, command, next, enable); 2820 ret = p->func(hash, func, command, next, enable);
2764 goto out_unlock; 2821 goto out_unlock;
2765 } 2822 }
2766 } 2823 }
@@ -2857,7 +2914,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
2857 ftrace_match_records(hash, buf, len); 2914 ftrace_match_records(hash, buf, len);
2858 2915
2859 mutex_lock(&ftrace_lock); 2916 mutex_lock(&ftrace_lock);
2860 ret = ftrace_hash_move(orig_hash, hash); 2917 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
2918 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
2919 && ftrace_enabled)
2920 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2921
2861 mutex_unlock(&ftrace_lock); 2922 mutex_unlock(&ftrace_lock);
2862 2923
2863 mutex_unlock(&ftrace_regex_lock); 2924 mutex_unlock(&ftrace_regex_lock);
@@ -3040,18 +3101,12 @@ ftrace_regex_release(struct inode *inode, struct file *file)
3040 orig_hash = &iter->ops->notrace_hash; 3101 orig_hash = &iter->ops->notrace_hash;
3041 3102
3042 mutex_lock(&ftrace_lock); 3103 mutex_lock(&ftrace_lock);
3043 /* 3104 ret = ftrace_hash_move(iter->ops, filter_hash,
3044 * Remove the current set, update the hash and add 3105 orig_hash, iter->hash);
3045 * them back. 3106 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
3046 */ 3107 && ftrace_enabled)
3047 ftrace_hash_rec_disable(iter->ops, filter_hash); 3108 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
3048 ret = ftrace_hash_move(orig_hash, iter->hash); 3109
3049 if (!ret) {
3050 ftrace_hash_rec_enable(iter->ops, filter_hash);
3051 if (iter->ops->flags & FTRACE_OPS_FL_ENABLED
3052 && ftrace_enabled)
3053 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
3054 }
3055 mutex_unlock(&ftrace_lock); 3110 mutex_unlock(&ftrace_lock);
3056 } 3111 }
3057 free_ftrace_hash(iter->hash); 3112 free_ftrace_hash(iter->hash);
@@ -3330,7 +3385,7 @@ static int ftrace_process_locs(struct module *mod,
3330{ 3385{
3331 unsigned long *p; 3386 unsigned long *p;
3332 unsigned long addr; 3387 unsigned long addr;
3333 unsigned long flags; 3388 unsigned long flags = 0; /* Shut up gcc */
3334 3389
3335 mutex_lock(&ftrace_lock); 3390 mutex_lock(&ftrace_lock);
3336 p = start; 3391 p = start;
@@ -3348,12 +3403,18 @@ static int ftrace_process_locs(struct module *mod,
3348 } 3403 }
3349 3404
3350 /* 3405 /*
3351 * Disable interrupts to prevent interrupts from executing 3406 * We only need to disable interrupts on start up
3352 * code that is being modified. 3407 * because we are modifying code that an interrupt
3408 * may execute, and the modification is not atomic.
3409 * But for modules, nothing runs the code we modify
3410 * until we are finished with it, and there's no
3411 * reason to cause large interrupt latencies while we do it.
3353 */ 3412 */
3354 local_irq_save(flags); 3413 if (!mod)
3414 local_irq_save(flags);
3355 ftrace_update_code(mod); 3415 ftrace_update_code(mod);
3356 local_irq_restore(flags); 3416 if (!mod)
3417 local_irq_restore(flags);
3357 mutex_unlock(&ftrace_lock); 3418 mutex_unlock(&ftrace_lock);
3358 3419
3359 return 0; 3420 return 0;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b0c7aa407943..731201bf4acc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -997,15 +997,21 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
997 unsigned nr_pages) 997 unsigned nr_pages)
998{ 998{
999 struct buffer_page *bpage, *tmp; 999 struct buffer_page *bpage, *tmp;
1000 unsigned long addr;
1001 LIST_HEAD(pages); 1000 LIST_HEAD(pages);
1002 unsigned i; 1001 unsigned i;
1003 1002
1004 WARN_ON(!nr_pages); 1003 WARN_ON(!nr_pages);
1005 1004
1006 for (i = 0; i < nr_pages; i++) { 1005 for (i = 0; i < nr_pages; i++) {
1006 struct page *page;
1007 /*
1008 * __GFP_NORETRY flag makes sure that the allocation fails
1009 * gracefully without invoking oom-killer and the system is
1010 * not destabilized.
1011 */
1007 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1012 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1008 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 1013 GFP_KERNEL | __GFP_NORETRY,
1014 cpu_to_node(cpu_buffer->cpu));
1009 if (!bpage) 1015 if (!bpage)
1010 goto free_pages; 1016 goto free_pages;
1011 1017
@@ -1013,10 +1019,11 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
1013 1019
1014 list_add(&bpage->list, &pages); 1020 list_add(&bpage->list, &pages);
1015 1021
1016 addr = __get_free_page(GFP_KERNEL); 1022 page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
1017 if (!addr) 1023 GFP_KERNEL | __GFP_NORETRY, 0);
1024 if (!page)
1018 goto free_pages; 1025 goto free_pages;
1019 bpage->page = (void *)addr; 1026 bpage->page = page_address(page);
1020 rb_init_page(bpage->page); 1027 rb_init_page(bpage->page);
1021 } 1028 }
1022 1029
@@ -1045,7 +1052,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1045{ 1052{
1046 struct ring_buffer_per_cpu *cpu_buffer; 1053 struct ring_buffer_per_cpu *cpu_buffer;
1047 struct buffer_page *bpage; 1054 struct buffer_page *bpage;
1048 unsigned long addr; 1055 struct page *page;
1049 int ret; 1056 int ret;
1050 1057
1051 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), 1058 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
@@ -1067,10 +1074,10 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1067 rb_check_bpage(cpu_buffer, bpage); 1074 rb_check_bpage(cpu_buffer, bpage);
1068 1075
1069 cpu_buffer->reader_page = bpage; 1076 cpu_buffer->reader_page = bpage;
1070 addr = __get_free_page(GFP_KERNEL); 1077 page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
1071 if (!addr) 1078 if (!page)
1072 goto fail_free_reader; 1079 goto fail_free_reader;
1073 bpage->page = (void *)addr; 1080 bpage->page = page_address(page);
1074 rb_init_page(bpage->page); 1081 rb_init_page(bpage->page);
1075 1082
1076 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1083 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
@@ -1314,7 +1321,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1314 unsigned nr_pages, rm_pages, new_pages; 1321 unsigned nr_pages, rm_pages, new_pages;
1315 struct buffer_page *bpage, *tmp; 1322 struct buffer_page *bpage, *tmp;
1316 unsigned long buffer_size; 1323 unsigned long buffer_size;
1317 unsigned long addr;
1318 LIST_HEAD(pages); 1324 LIST_HEAD(pages);
1319 int i, cpu; 1325 int i, cpu;
1320 1326
@@ -1375,16 +1381,24 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1375 1381
1376 for_each_buffer_cpu(buffer, cpu) { 1382 for_each_buffer_cpu(buffer, cpu) {
1377 for (i = 0; i < new_pages; i++) { 1383 for (i = 0; i < new_pages; i++) {
1384 struct page *page;
1385 /*
1386 * __GFP_NORETRY flag makes sure that the allocation
1387 * fails gracefully without invoking oom-killer and
1388 * the system is not destabilized.
1389 */
1378 bpage = kzalloc_node(ALIGN(sizeof(*bpage), 1390 bpage = kzalloc_node(ALIGN(sizeof(*bpage),
1379 cache_line_size()), 1391 cache_line_size()),
1380 GFP_KERNEL, cpu_to_node(cpu)); 1392 GFP_KERNEL | __GFP_NORETRY,
1393 cpu_to_node(cpu));
1381 if (!bpage) 1394 if (!bpage)
1382 goto free_pages; 1395 goto free_pages;
1383 list_add(&bpage->list, &pages); 1396 list_add(&bpage->list, &pages);
1384 addr = __get_free_page(GFP_KERNEL); 1397 page = alloc_pages_node(cpu_to_node(cpu),
1385 if (!addr) 1398 GFP_KERNEL | __GFP_NORETRY, 0);
1399 if (!page)
1386 goto free_pages; 1400 goto free_pages;
1387 bpage->page = (void *)addr; 1401 bpage->page = page_address(page);
1388 rb_init_page(bpage->page); 1402 rb_init_page(bpage->page);
1389 } 1403 }
1390 } 1404 }
@@ -3730,16 +3744,17 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
3730 * Returns: 3744 * Returns:
3731 * The page allocated, or NULL on error. 3745 * The page allocated, or NULL on error.
3732 */ 3746 */
3733void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) 3747void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
3734{ 3748{
3735 struct buffer_data_page *bpage; 3749 struct buffer_data_page *bpage;
3736 unsigned long addr; 3750 struct page *page;
3737 3751
3738 addr = __get_free_page(GFP_KERNEL); 3752 page = alloc_pages_node(cpu_to_node(cpu),
3739 if (!addr) 3753 GFP_KERNEL | __GFP_NORETRY, 0);
3754 if (!page)
3740 return NULL; 3755 return NULL;
3741 3756
3742 bpage = (void *)addr; 3757 bpage = page_address(page);
3743 3758
3744 rb_init_page(bpage); 3759 rb_init_page(bpage);
3745 3760
@@ -3978,20 +3993,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
3978 size_t cnt, loff_t *ppos) 3993 size_t cnt, loff_t *ppos)
3979{ 3994{
3980 unsigned long *p = filp->private_data; 3995 unsigned long *p = filp->private_data;
3981 char buf[64];
3982 unsigned long val; 3996 unsigned long val;
3983 int ret; 3997 int ret;
3984 3998
3985 if (cnt >= sizeof(buf)) 3999 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3986 return -EINVAL; 4000 if (ret)
3987
3988 if (copy_from_user(&buf, ubuf, cnt))
3989 return -EFAULT;
3990
3991 buf[cnt] = 0;
3992
3993 ret = strict_strtoul(buf, 10, &val);
3994 if (ret < 0)
3995 return ret; 4001 return ret;
3996 4002
3997 if (val) 4003 if (val)
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 302f8a614635..a5457d577b98 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -106,7 +106,7 @@ static enum event_status read_page(int cpu)
106 int inc; 106 int inc;
107 int i; 107 int i;
108 108
109 bpage = ring_buffer_alloc_read_page(buffer); 109 bpage = ring_buffer_alloc_read_page(buffer, cpu);
110 if (!bpage) 110 if (!bpage)
111 return EVENT_DROPPED; 111 return EVENT_DROPPED;
112 112
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ee9c921d7f21..e5df02c69b1d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -343,26 +343,27 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
343static int trace_stop_count; 343static int trace_stop_count;
344static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
345 345
346static void wakeup_work_handler(struct work_struct *work)
347{
348 wake_up(&trace_wait);
349}
350
351static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler);
352
346/** 353/**
347 * trace_wake_up - wake up tasks waiting for trace input 354 * trace_wake_up - wake up tasks waiting for trace input
348 * 355 *
349 * Simply wakes up any task that is blocked on the trace_wait 356 * Schedules a delayed work to wake up any task that is blocked on the
350 * queue. These is used with trace_poll for tasks polling the trace. 357 * trace_wait queue. These is used with trace_poll for tasks polling the
358 * trace.
351 */ 359 */
352void trace_wake_up(void) 360void trace_wake_up(void)
353{ 361{
354 int cpu; 362 const unsigned long delay = msecs_to_jiffies(2);
355 363
356 if (trace_flags & TRACE_ITER_BLOCK) 364 if (trace_flags & TRACE_ITER_BLOCK)
357 return; 365 return;
358 /* 366 schedule_delayed_work(&wakeup_work, delay);
359 * The runqueue_is_locked() can fail, but this is the best we
360 * have for now:
361 */
362 cpu = get_cpu();
363 if (!runqueue_is_locked(cpu))
364 wake_up(&trace_wait);
365 put_cpu();
366} 367}
367 368
368static int __init set_buf_size(char *str) 369static int __init set_buf_size(char *str)
@@ -424,6 +425,7 @@ static const char *trace_options[] = {
424 "graph-time", 425 "graph-time",
425 "record-cmd", 426 "record-cmd",
426 "overwrite", 427 "overwrite",
428 "disable_on_free",
427 NULL 429 NULL
428}; 430};
429 431
@@ -1191,6 +1193,18 @@ void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
1191} 1193}
1192EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); 1194EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
1193 1195
1196void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
1197 struct ring_buffer_event *event,
1198 unsigned long flags, int pc,
1199 struct pt_regs *regs)
1200{
1201 ring_buffer_unlock_commit(buffer, event);
1202
1203 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
1204 ftrace_trace_userstack(buffer, flags, pc);
1205}
1206EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs);
1207
1194void trace_current_buffer_discard_commit(struct ring_buffer *buffer, 1208void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1195 struct ring_buffer_event *event) 1209 struct ring_buffer_event *event)
1196{ 1210{
@@ -1234,30 +1248,103 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1234} 1248}
1235 1249
1236#ifdef CONFIG_STACKTRACE 1250#ifdef CONFIG_STACKTRACE
1251
1252#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
1253struct ftrace_stack {
1254 unsigned long calls[FTRACE_STACK_MAX_ENTRIES];
1255};
1256
1257static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack);
1258static DEFINE_PER_CPU(int, ftrace_stack_reserve);
1259
1237static void __ftrace_trace_stack(struct ring_buffer *buffer, 1260static void __ftrace_trace_stack(struct ring_buffer *buffer,
1238 unsigned long flags, 1261 unsigned long flags,
1239 int skip, int pc) 1262 int skip, int pc, struct pt_regs *regs)
1240{ 1263{
1241 struct ftrace_event_call *call = &event_kernel_stack; 1264 struct ftrace_event_call *call = &event_kernel_stack;
1242 struct ring_buffer_event *event; 1265 struct ring_buffer_event *event;
1243 struct stack_entry *entry; 1266 struct stack_entry *entry;
1244 struct stack_trace trace; 1267 struct stack_trace trace;
1268 int use_stack;
1269 int size = FTRACE_STACK_ENTRIES;
1270
1271 trace.nr_entries = 0;
1272 trace.skip = skip;
1273
1274 /*
1275 * Since events can happen in NMIs there's no safe way to
1276 * use the per cpu ftrace_stacks. We reserve it and if an interrupt
1277 * or NMI comes in, it will just have to use the default
1278 * FTRACE_STACK_SIZE.
1279 */
1280 preempt_disable_notrace();
1281
1282 use_stack = ++__get_cpu_var(ftrace_stack_reserve);
1283 /*
1284 * We don't need any atomic variables, just a barrier.
1285 * If an interrupt comes in, we don't care, because it would
1286 * have exited and put the counter back to what we want.
1287 * We just need a barrier to keep gcc from moving things
1288 * around.
1289 */
1290 barrier();
1291 if (use_stack == 1) {
1292 trace.entries = &__get_cpu_var(ftrace_stack).calls[0];
1293 trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
1294
1295 if (regs)
1296 save_stack_trace_regs(regs, &trace);
1297 else
1298 save_stack_trace(&trace);
1299
1300 if (trace.nr_entries > size)
1301 size = trace.nr_entries;
1302 } else
1303 /* From now on, use_stack is a boolean */
1304 use_stack = 0;
1305
1306 size *= sizeof(unsigned long);
1245 1307
1246 event = trace_buffer_lock_reserve(buffer, TRACE_STACK, 1308 event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
1247 sizeof(*entry), flags, pc); 1309 sizeof(*entry) + size, flags, pc);
1248 if (!event) 1310 if (!event)
1249 return; 1311 goto out;
1250 entry = ring_buffer_event_data(event); 1312 entry = ring_buffer_event_data(event);
1251 memset(&entry->caller, 0, sizeof(entry->caller));
1252 1313
1253 trace.nr_entries = 0; 1314 memset(&entry->caller, 0, size);
1254 trace.max_entries = FTRACE_STACK_ENTRIES; 1315
1255 trace.skip = skip; 1316 if (use_stack)
1256 trace.entries = entry->caller; 1317 memcpy(&entry->caller, trace.entries,
1318 trace.nr_entries * sizeof(unsigned long));
1319 else {
1320 trace.max_entries = FTRACE_STACK_ENTRIES;
1321 trace.entries = entry->caller;
1322 if (regs)
1323 save_stack_trace_regs(regs, &trace);
1324 else
1325 save_stack_trace(&trace);
1326 }
1327
1328 entry->size = trace.nr_entries;
1257 1329
1258 save_stack_trace(&trace);
1259 if (!filter_check_discard(call, entry, buffer, event)) 1330 if (!filter_check_discard(call, entry, buffer, event))
1260 ring_buffer_unlock_commit(buffer, event); 1331 ring_buffer_unlock_commit(buffer, event);
1332
1333 out:
1334 /* Again, don't let gcc optimize things here */
1335 barrier();
1336 __get_cpu_var(ftrace_stack_reserve)--;
1337 preempt_enable_notrace();
1338
1339}
1340
1341void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
1342 int skip, int pc, struct pt_regs *regs)
1343{
1344 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1345 return;
1346
1347 __ftrace_trace_stack(buffer, flags, skip, pc, regs);
1261} 1348}
1262 1349
1263void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, 1350void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
@@ -1266,13 +1353,13 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1266 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 1353 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1267 return; 1354 return;
1268 1355
1269 __ftrace_trace_stack(buffer, flags, skip, pc); 1356 __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
1270} 1357}
1271 1358
1272void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 1359void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1273 int pc) 1360 int pc)
1274{ 1361{
1275 __ftrace_trace_stack(tr->buffer, flags, skip, pc); 1362 __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL);
1276} 1363}
1277 1364
1278/** 1365/**
@@ -1288,7 +1375,7 @@ void trace_dump_stack(void)
1288 local_save_flags(flags); 1375 local_save_flags(flags);
1289 1376
1290 /* skipping 3 traces, seems to get us at the caller of this function */ 1377 /* skipping 3 traces, seems to get us at the caller of this function */
1291 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); 1378 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL);
1292} 1379}
1293 1380
1294static DEFINE_PER_CPU(int, user_stack_count); 1381static DEFINE_PER_CPU(int, user_stack_count);
@@ -1536,7 +1623,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1536 1623
1537 ftrace_enable_cpu(); 1624 ftrace_enable_cpu();
1538 1625
1539 return event ? ring_buffer_event_data(event) : NULL; 1626 if (event) {
1627 iter->ent_size = ring_buffer_event_length(event);
1628 return ring_buffer_event_data(event);
1629 }
1630 iter->ent_size = 0;
1631 return NULL;
1540} 1632}
1541 1633
1542static struct trace_entry * 1634static struct trace_entry *
@@ -2051,6 +2143,9 @@ void trace_default_header(struct seq_file *m)
2051{ 2143{
2052 struct trace_iterator *iter = m->private; 2144 struct trace_iterator *iter = m->private;
2053 2145
2146 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
2147 return;
2148
2054 if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2149 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2055 /* print nothing if the buffers are empty */ 2150 /* print nothing if the buffers are empty */
2056 if (trace_empty(iter)) 2151 if (trace_empty(iter))
@@ -2701,20 +2796,11 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2701 size_t cnt, loff_t *ppos) 2796 size_t cnt, loff_t *ppos)
2702{ 2797{
2703 struct trace_array *tr = filp->private_data; 2798 struct trace_array *tr = filp->private_data;
2704 char buf[64];
2705 unsigned long val; 2799 unsigned long val;
2706 int ret; 2800 int ret;
2707 2801
2708 if (cnt >= sizeof(buf)) 2802 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2709 return -EINVAL; 2803 if (ret)
2710
2711 if (copy_from_user(&buf, ubuf, cnt))
2712 return -EFAULT;
2713
2714 buf[cnt] = 0;
2715
2716 ret = strict_strtoul(buf, 10, &val);
2717 if (ret < 0)
2718 return ret; 2804 return ret;
2719 2805
2720 val = !!val; 2806 val = !!val;
@@ -2767,7 +2853,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
2767 return t->init(tr); 2853 return t->init(tr);
2768} 2854}
2769 2855
2770static int tracing_resize_ring_buffer(unsigned long size) 2856static int __tracing_resize_ring_buffer(unsigned long size)
2771{ 2857{
2772 int ret; 2858 int ret;
2773 2859
@@ -2819,6 +2905,41 @@ static int tracing_resize_ring_buffer(unsigned long size)
2819 return ret; 2905 return ret;
2820} 2906}
2821 2907
2908static ssize_t tracing_resize_ring_buffer(unsigned long size)
2909{
2910 int cpu, ret = size;
2911
2912 mutex_lock(&trace_types_lock);
2913
2914 tracing_stop();
2915
2916 /* disable all cpu buffers */
2917 for_each_tracing_cpu(cpu) {
2918 if (global_trace.data[cpu])
2919 atomic_inc(&global_trace.data[cpu]->disabled);
2920 if (max_tr.data[cpu])
2921 atomic_inc(&max_tr.data[cpu]->disabled);
2922 }
2923
2924 if (size != global_trace.entries)
2925 ret = __tracing_resize_ring_buffer(size);
2926
2927 if (ret < 0)
2928 ret = -ENOMEM;
2929
2930 for_each_tracing_cpu(cpu) {
2931 if (global_trace.data[cpu])
2932 atomic_dec(&global_trace.data[cpu]->disabled);
2933 if (max_tr.data[cpu])
2934 atomic_dec(&max_tr.data[cpu]->disabled);
2935 }
2936
2937 tracing_start();
2938 mutex_unlock(&trace_types_lock);
2939
2940 return ret;
2941}
2942
2822 2943
2823/** 2944/**
2824 * tracing_update_buffers - used by tracing facility to expand ring buffers 2945 * tracing_update_buffers - used by tracing facility to expand ring buffers
@@ -2836,7 +2957,7 @@ int tracing_update_buffers(void)
2836 2957
2837 mutex_lock(&trace_types_lock); 2958 mutex_lock(&trace_types_lock);
2838 if (!ring_buffer_expanded) 2959 if (!ring_buffer_expanded)
2839 ret = tracing_resize_ring_buffer(trace_buf_size); 2960 ret = __tracing_resize_ring_buffer(trace_buf_size);
2840 mutex_unlock(&trace_types_lock); 2961 mutex_unlock(&trace_types_lock);
2841 2962
2842 return ret; 2963 return ret;
@@ -2860,7 +2981,7 @@ static int tracing_set_tracer(const char *buf)
2860 mutex_lock(&trace_types_lock); 2981 mutex_lock(&trace_types_lock);
2861 2982
2862 if (!ring_buffer_expanded) { 2983 if (!ring_buffer_expanded) {
2863 ret = tracing_resize_ring_buffer(trace_buf_size); 2984 ret = __tracing_resize_ring_buffer(trace_buf_size);
2864 if (ret < 0) 2985 if (ret < 0)
2865 goto out; 2986 goto out;
2866 ret = 0; 2987 ret = 0;
@@ -2966,20 +3087,11 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
2966 size_t cnt, loff_t *ppos) 3087 size_t cnt, loff_t *ppos)
2967{ 3088{
2968 unsigned long *ptr = filp->private_data; 3089 unsigned long *ptr = filp->private_data;
2969 char buf[64];
2970 unsigned long val; 3090 unsigned long val;
2971 int ret; 3091 int ret;
2972 3092
2973 if (cnt >= sizeof(buf)) 3093 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2974 return -EINVAL; 3094 if (ret)
2975
2976 if (copy_from_user(&buf, ubuf, cnt))
2977 return -EFAULT;
2978
2979 buf[cnt] = 0;
2980
2981 ret = strict_strtoul(buf, 10, &val);
2982 if (ret < 0)
2983 return ret; 3095 return ret;
2984 3096
2985 *ptr = val * 1000; 3097 *ptr = val * 1000;
@@ -3434,67 +3546,54 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3434 size_t cnt, loff_t *ppos) 3546 size_t cnt, loff_t *ppos)
3435{ 3547{
3436 unsigned long val; 3548 unsigned long val;
3437 char buf[64]; 3549 int ret;
3438 int ret, cpu;
3439
3440 if (cnt >= sizeof(buf))
3441 return -EINVAL;
3442
3443 if (copy_from_user(&buf, ubuf, cnt))
3444 return -EFAULT;
3445
3446 buf[cnt] = 0;
3447 3550
3448 ret = strict_strtoul(buf, 10, &val); 3551 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3449 if (ret < 0) 3552 if (ret)
3450 return ret; 3553 return ret;
3451 3554
3452 /* must have at least 1 entry */ 3555 /* must have at least 1 entry */
3453 if (!val) 3556 if (!val)
3454 return -EINVAL; 3557 return -EINVAL;
3455 3558
3456 mutex_lock(&trace_types_lock);
3457
3458 tracing_stop();
3459
3460 /* disable all cpu buffers */
3461 for_each_tracing_cpu(cpu) {
3462 if (global_trace.data[cpu])
3463 atomic_inc(&global_trace.data[cpu]->disabled);
3464 if (max_tr.data[cpu])
3465 atomic_inc(&max_tr.data[cpu]->disabled);
3466 }
3467
3468 /* value is in KB */ 3559 /* value is in KB */
3469 val <<= 10; 3560 val <<= 10;
3470 3561
3471 if (val != global_trace.entries) { 3562 ret = tracing_resize_ring_buffer(val);
3472 ret = tracing_resize_ring_buffer(val); 3563 if (ret < 0)
3473 if (ret < 0) { 3564 return ret;
3474 cnt = ret;
3475 goto out;
3476 }
3477 }
3478 3565
3479 *ppos += cnt; 3566 *ppos += cnt;
3480 3567
3481 /* If check pages failed, return ENOMEM */ 3568 return cnt;
3482 if (tracing_disabled) 3569}
3483 cnt = -ENOMEM;
3484 out:
3485 for_each_tracing_cpu(cpu) {
3486 if (global_trace.data[cpu])
3487 atomic_dec(&global_trace.data[cpu]->disabled);
3488 if (max_tr.data[cpu])
3489 atomic_dec(&max_tr.data[cpu]->disabled);
3490 }
3491 3570
3492 tracing_start(); 3571static ssize_t
3493 mutex_unlock(&trace_types_lock); 3572tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3573 size_t cnt, loff_t *ppos)
3574{
3575 /*
3576 * There is no need to read what the user has written, this function
3577 * is just to make sure that there is no error when "echo" is used
3578 */
3579
3580 *ppos += cnt;
3494 3581
3495 return cnt; 3582 return cnt;
3496} 3583}
3497 3584
3585static int
3586tracing_free_buffer_release(struct inode *inode, struct file *filp)
3587{
3588 /* disable tracing ? */
3589 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
3590 tracing_off();
3591 /* resize the ring buffer to 0 */
3592 tracing_resize_ring_buffer(0);
3593
3594 return 0;
3595}
3596
3498static int mark_printk(const char *fmt, ...) 3597static int mark_printk(const char *fmt, ...)
3499{ 3598{
3500 int ret; 3599 int ret;
@@ -3640,6 +3739,11 @@ static const struct file_operations tracing_entries_fops = {
3640 .llseek = generic_file_llseek, 3739 .llseek = generic_file_llseek,
3641}; 3740};
3642 3741
3742static const struct file_operations tracing_free_buffer_fops = {
3743 .write = tracing_free_buffer_write,
3744 .release = tracing_free_buffer_release,
3745};
3746
3643static const struct file_operations tracing_mark_fops = { 3747static const struct file_operations tracing_mark_fops = {
3644 .open = tracing_open_generic, 3748 .open = tracing_open_generic,
3645 .write = tracing_mark_write, 3749 .write = tracing_mark_write,
@@ -3696,7 +3800,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3696 return 0; 3800 return 0;
3697 3801
3698 if (!info->spare) 3802 if (!info->spare)
3699 info->spare = ring_buffer_alloc_read_page(info->tr->buffer); 3803 info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu);
3700 if (!info->spare) 3804 if (!info->spare)
3701 return -ENOMEM; 3805 return -ENOMEM;
3702 3806
@@ -3853,7 +3957,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3853 3957
3854 ref->ref = 1; 3958 ref->ref = 1;
3855 ref->buffer = info->tr->buffer; 3959 ref->buffer = info->tr->buffer;
3856 ref->page = ring_buffer_alloc_read_page(ref->buffer); 3960 ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu);
3857 if (!ref->page) { 3961 if (!ref->page) {
3858 kfree(ref); 3962 kfree(ref);
3859 break; 3963 break;
@@ -3862,8 +3966,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3862 r = ring_buffer_read_page(ref->buffer, &ref->page, 3966 r = ring_buffer_read_page(ref->buffer, &ref->page,
3863 len, info->cpu, 1); 3967 len, info->cpu, 1);
3864 if (r < 0) { 3968 if (r < 0) {
3865 ring_buffer_free_read_page(ref->buffer, 3969 ring_buffer_free_read_page(ref->buffer, ref->page);
3866 ref->page);
3867 kfree(ref); 3970 kfree(ref);
3868 break; 3971 break;
3869 } 3972 }
@@ -4099,19 +4202,10 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
4099{ 4202{
4100 struct trace_option_dentry *topt = filp->private_data; 4203 struct trace_option_dentry *topt = filp->private_data;
4101 unsigned long val; 4204 unsigned long val;
4102 char buf[64];
4103 int ret; 4205 int ret;
4104 4206
4105 if (cnt >= sizeof(buf)) 4207 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4106 return -EINVAL; 4208 if (ret)
4107
4108 if (copy_from_user(&buf, ubuf, cnt))
4109 return -EFAULT;
4110
4111 buf[cnt] = 0;
4112
4113 ret = strict_strtoul(buf, 10, &val);
4114 if (ret < 0)
4115 return ret; 4209 return ret;
4116 4210
4117 if (val != 0 && val != 1) 4211 if (val != 0 && val != 1)
@@ -4159,20 +4253,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4159 loff_t *ppos) 4253 loff_t *ppos)
4160{ 4254{
4161 long index = (long)filp->private_data; 4255 long index = (long)filp->private_data;
4162 char buf[64];
4163 unsigned long val; 4256 unsigned long val;
4164 int ret; 4257 int ret;
4165 4258
4166 if (cnt >= sizeof(buf)) 4259 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4167 return -EINVAL; 4260 if (ret)
4168
4169 if (copy_from_user(&buf, ubuf, cnt))
4170 return -EFAULT;
4171
4172 buf[cnt] = 0;
4173
4174 ret = strict_strtoul(buf, 10, &val);
4175 if (ret < 0)
4176 return ret; 4261 return ret;
4177 4262
4178 if (val != 0 && val != 1) 4263 if (val != 0 && val != 1)
@@ -4365,6 +4450,9 @@ static __init int tracer_init_debugfs(void)
4365 trace_create_file("buffer_size_kb", 0644, d_tracer, 4450 trace_create_file("buffer_size_kb", 0644, d_tracer,
4366 &global_trace, &tracing_entries_fops); 4451 &global_trace, &tracing_entries_fops);
4367 4452
4453 trace_create_file("free_buffer", 0644, d_tracer,
4454 &global_trace, &tracing_free_buffer_fops);
4455
4368 trace_create_file("trace_marker", 0220, d_tracer, 4456 trace_create_file("trace_marker", 0220, d_tracer,
4369 NULL, &tracing_mark_fops); 4457 NULL, &tracing_mark_fops);
4370 4458
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 229f8591f61d..616846bcfee5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -2,7 +2,7 @@
2#define _LINUX_KERNEL_TRACE_H 2#define _LINUX_KERNEL_TRACE_H
3 3
4#include <linux/fs.h> 4#include <linux/fs.h>
5#include <asm/atomic.h> 5#include <linux/atomic.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/clocksource.h> 7#include <linux/clocksource.h>
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
@@ -278,6 +278,29 @@ struct tracer {
278}; 278};
279 279
280 280
281/* Only current can touch trace_recursion */
282#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
283#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
284
285/* Ring buffer has the 10 LSB bits to count */
286#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
287
288/* for function tracing recursion */
289#define TRACE_INTERNAL_BIT (1<<11)
290#define TRACE_GLOBAL_BIT (1<<12)
291/*
292 * Abuse of the trace_recursion.
293 * As we need a way to maintain state if we are tracing the function
294 * graph in irq because we want to trace a particular function that
295 * was called in irq context but we have irq tracing off. Since this
296 * can only be modified by current, we can reuse trace_recursion.
297 */
298#define TRACE_IRQ_BIT (1<<13)
299
300#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
301#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
302#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
303
281#define TRACE_PIPE_ALL_CPU -1 304#define TRACE_PIPE_ALL_CPU -1
282 305
283int tracer_init(struct tracer *t, struct trace_array *tr); 306int tracer_init(struct tracer *t, struct trace_array *tr);
@@ -389,6 +412,9 @@ void update_max_tr_single(struct trace_array *tr,
389void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, 412void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
390 int skip, int pc); 413 int skip, int pc);
391 414
415void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
416 int skip, int pc, struct pt_regs *regs);
417
392void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, 418void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
393 int pc); 419 int pc);
394 420
@@ -400,6 +426,12 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer,
400{ 426{
401} 427}
402 428
429static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer,
430 unsigned long flags, int skip,
431 int pc, struct pt_regs *regs)
432{
433}
434
403static inline void ftrace_trace_userstack(struct ring_buffer *buffer, 435static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
404 unsigned long flags, int pc) 436 unsigned long flags, int pc)
405{ 437{
@@ -507,8 +539,18 @@ static inline int ftrace_graph_addr(unsigned long addr)
507 return 1; 539 return 1;
508 540
509 for (i = 0; i < ftrace_graph_count; i++) { 541 for (i = 0; i < ftrace_graph_count; i++) {
510 if (addr == ftrace_graph_funcs[i]) 542 if (addr == ftrace_graph_funcs[i]) {
543 /*
544 * If no irqs are to be traced, but a set_graph_function
545 * is set, and called by an interrupt handler, we still
546 * want to trace it.
547 */
548 if (in_irq())
549 trace_recursion_set(TRACE_IRQ_BIT);
550 else
551 trace_recursion_clear(TRACE_IRQ_BIT);
511 return 1; 552 return 1;
553 }
512 } 554 }
513 555
514 return 0; 556 return 0;
@@ -609,6 +651,7 @@ enum trace_iterator_flags {
609 TRACE_ITER_GRAPH_TIME = 0x80000, 651 TRACE_ITER_GRAPH_TIME = 0x80000,
610 TRACE_ITER_RECORD_CMD = 0x100000, 652 TRACE_ITER_RECORD_CMD = 0x100000,
611 TRACE_ITER_OVERWRITE = 0x200000, 653 TRACE_ITER_OVERWRITE = 0x200000,
654 TRACE_ITER_STOP_ON_FREE = 0x400000,
612}; 655};
613 656
614/* 657/*
@@ -677,6 +720,7 @@ struct event_subsystem {
677 struct dentry *entry; 720 struct dentry *entry;
678 struct event_filter *filter; 721 struct event_filter *filter;
679 int nr_events; 722 int nr_events;
723 int ref_count;
680}; 724};
681 725
682#define FILTER_PRED_INVALID ((unsigned short)-1) 726#define FILTER_PRED_INVALID ((unsigned short)-1)
@@ -784,19 +828,4 @@ extern const char *__stop___trace_bprintk_fmt[];
784 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 828 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
785#include "trace_entries.h" 829#include "trace_entries.h"
786 830
787/* Only current can touch trace_recursion */
788#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
789#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
790
791/* Ring buffer has the 10 LSB bits to count */
792#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
793
794/* for function tracing recursion */
795#define TRACE_INTERNAL_BIT (1<<11)
796#define TRACE_GLOBAL_BIT (1<<12)
797
798#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
799#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
800#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
801
802#endif /* _LINUX_KERNEL_TRACE_H */ 831#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e32744c84d94..93365907f219 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -161,7 +161,8 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
161 TRACE_STACK, 161 TRACE_STACK,
162 162
163 F_STRUCT( 163 F_STRUCT(
164 __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) 164 __field( int, size )
165 __dynamic_array(unsigned long, caller )
165 ), 166 ),
166 167
167 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" 168 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 686ec399f2a8..581876f9f387 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -244,6 +244,35 @@ static void ftrace_clear_events(void)
244 mutex_unlock(&event_mutex); 244 mutex_unlock(&event_mutex);
245} 245}
246 246
247static void __put_system(struct event_subsystem *system)
248{
249 struct event_filter *filter = system->filter;
250
251 WARN_ON_ONCE(system->ref_count == 0);
252 if (--system->ref_count)
253 return;
254
255 if (filter) {
256 kfree(filter->filter_string);
257 kfree(filter);
258 }
259 kfree(system->name);
260 kfree(system);
261}
262
263static void __get_system(struct event_subsystem *system)
264{
265 WARN_ON_ONCE(system->ref_count == 0);
266 system->ref_count++;
267}
268
269static void put_system(struct event_subsystem *system)
270{
271 mutex_lock(&event_mutex);
272 __put_system(system);
273 mutex_unlock(&event_mutex);
274}
275
247/* 276/*
248 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 277 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
249 */ 278 */
@@ -486,20 +515,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
486 loff_t *ppos) 515 loff_t *ppos)
487{ 516{
488 struct ftrace_event_call *call = filp->private_data; 517 struct ftrace_event_call *call = filp->private_data;
489 char buf[64];
490 unsigned long val; 518 unsigned long val;
491 int ret; 519 int ret;
492 520
493 if (cnt >= sizeof(buf)) 521 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
494 return -EINVAL; 522 if (ret)
495
496 if (copy_from_user(&buf, ubuf, cnt))
497 return -EFAULT;
498
499 buf[cnt] = 0;
500
501 ret = strict_strtoul(buf, 10, &val);
502 if (ret < 0)
503 return ret; 523 return ret;
504 524
505 ret = tracing_update_buffers(); 525 ret = tracing_update_buffers();
@@ -528,7 +548,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
528 loff_t *ppos) 548 loff_t *ppos)
529{ 549{
530 const char set_to_char[4] = { '?', '0', '1', 'X' }; 550 const char set_to_char[4] = { '?', '0', '1', 'X' };
531 const char *system = filp->private_data; 551 struct event_subsystem *system = filp->private_data;
532 struct ftrace_event_call *call; 552 struct ftrace_event_call *call;
533 char buf[2]; 553 char buf[2];
534 int set = 0; 554 int set = 0;
@@ -539,7 +559,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
539 if (!call->name || !call->class || !call->class->reg) 559 if (!call->name || !call->class || !call->class->reg)
540 continue; 560 continue;
541 561
542 if (system && strcmp(call->class->system, system) != 0) 562 if (system && strcmp(call->class->system, system->name) != 0)
543 continue; 563 continue;
544 564
545 /* 565 /*
@@ -569,21 +589,13 @@ static ssize_t
569system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 589system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
570 loff_t *ppos) 590 loff_t *ppos)
571{ 591{
572 const char *system = filp->private_data; 592 struct event_subsystem *system = filp->private_data;
593 const char *name = NULL;
573 unsigned long val; 594 unsigned long val;
574 char buf[64];
575 ssize_t ret; 595 ssize_t ret;
576 596
577 if (cnt >= sizeof(buf)) 597 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
578 return -EINVAL; 598 if (ret)
579
580 if (copy_from_user(&buf, ubuf, cnt))
581 return -EFAULT;
582
583 buf[cnt] = 0;
584
585 ret = strict_strtoul(buf, 10, &val);
586 if (ret < 0)
587 return ret; 599 return ret;
588 600
589 ret = tracing_update_buffers(); 601 ret = tracing_update_buffers();
@@ -593,7 +605,14 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
593 if (val != 0 && val != 1) 605 if (val != 0 && val != 1)
594 return -EINVAL; 606 return -EINVAL;
595 607
596 ret = __ftrace_set_clr_event(NULL, system, NULL, val); 608 /*
609 * Opening of "enable" adds a ref count to system,
610 * so the name is safe to use.
611 */
612 if (system)
613 name = system->name;
614
615 ret = __ftrace_set_clr_event(NULL, name, NULL, val);
597 if (ret) 616 if (ret)
598 goto out; 617 goto out;
599 618
@@ -826,6 +845,52 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
826 return cnt; 845 return cnt;
827} 846}
828 847
848static LIST_HEAD(event_subsystems);
849
850static int subsystem_open(struct inode *inode, struct file *filp)
851{
852 struct event_subsystem *system = NULL;
853 int ret;
854
855 if (!inode->i_private)
856 goto skip_search;
857
858 /* Make sure the system still exists */
859 mutex_lock(&event_mutex);
860 list_for_each_entry(system, &event_subsystems, list) {
861 if (system == inode->i_private) {
862 /* Don't open systems with no events */
863 if (!system->nr_events) {
864 system = NULL;
865 break;
866 }
867 __get_system(system);
868 break;
869 }
870 }
871 mutex_unlock(&event_mutex);
872
873 if (system != inode->i_private)
874 return -ENODEV;
875
876 skip_search:
877 ret = tracing_open_generic(inode, filp);
878 if (ret < 0 && system)
879 put_system(system);
880
881 return ret;
882}
883
884static int subsystem_release(struct inode *inode, struct file *file)
885{
886 struct event_subsystem *system = inode->i_private;
887
888 if (system)
889 put_system(system);
890
891 return 0;
892}
893
829static ssize_t 894static ssize_t
830subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 895subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
831 loff_t *ppos) 896 loff_t *ppos)
@@ -963,17 +1028,19 @@ static const struct file_operations ftrace_event_filter_fops = {
963}; 1028};
964 1029
965static const struct file_operations ftrace_subsystem_filter_fops = { 1030static const struct file_operations ftrace_subsystem_filter_fops = {
966 .open = tracing_open_generic, 1031 .open = subsystem_open,
967 .read = subsystem_filter_read, 1032 .read = subsystem_filter_read,
968 .write = subsystem_filter_write, 1033 .write = subsystem_filter_write,
969 .llseek = default_llseek, 1034 .llseek = default_llseek,
1035 .release = subsystem_release,
970}; 1036};
971 1037
972static const struct file_operations ftrace_system_enable_fops = { 1038static const struct file_operations ftrace_system_enable_fops = {
973 .open = tracing_open_generic, 1039 .open = subsystem_open,
974 .read = system_enable_read, 1040 .read = system_enable_read,
975 .write = system_enable_write, 1041 .write = system_enable_write,
976 .llseek = default_llseek, 1042 .llseek = default_llseek,
1043 .release = subsystem_release,
977}; 1044};
978 1045
979static const struct file_operations ftrace_show_header_fops = { 1046static const struct file_operations ftrace_show_header_fops = {
@@ -1002,8 +1069,6 @@ static struct dentry *event_trace_events_dir(void)
1002 return d_events; 1069 return d_events;
1003} 1070}
1004 1071
1005static LIST_HEAD(event_subsystems);
1006
1007static struct dentry * 1072static struct dentry *
1008event_subsystem_dir(const char *name, struct dentry *d_events) 1073event_subsystem_dir(const char *name, struct dentry *d_events)
1009{ 1074{
@@ -1013,6 +1078,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1013 /* First see if we did not already create this dir */ 1078 /* First see if we did not already create this dir */
1014 list_for_each_entry(system, &event_subsystems, list) { 1079 list_for_each_entry(system, &event_subsystems, list) {
1015 if (strcmp(system->name, name) == 0) { 1080 if (strcmp(system->name, name) == 0) {
1081 __get_system(system);
1016 system->nr_events++; 1082 system->nr_events++;
1017 return system->entry; 1083 return system->entry;
1018 } 1084 }
@@ -1035,6 +1101,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1035 } 1101 }
1036 1102
1037 system->nr_events = 1; 1103 system->nr_events = 1;
1104 system->ref_count = 1;
1038 system->name = kstrdup(name, GFP_KERNEL); 1105 system->name = kstrdup(name, GFP_KERNEL);
1039 if (!system->name) { 1106 if (!system->name) {
1040 debugfs_remove(system->entry); 1107 debugfs_remove(system->entry);
@@ -1062,8 +1129,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1062 "'%s/filter' entry\n", name); 1129 "'%s/filter' entry\n", name);
1063 } 1130 }
1064 1131
1065 trace_create_file("enable", 0644, system->entry, 1132 trace_create_file("enable", 0644, system->entry, system,
1066 (void *)system->name,
1067 &ftrace_system_enable_fops); 1133 &ftrace_system_enable_fops);
1068 1134
1069 return system->entry; 1135 return system->entry;
@@ -1184,16 +1250,9 @@ static void remove_subsystem_dir(const char *name)
1184 list_for_each_entry(system, &event_subsystems, list) { 1250 list_for_each_entry(system, &event_subsystems, list) {
1185 if (strcmp(system->name, name) == 0) { 1251 if (strcmp(system->name, name) == 0) {
1186 if (!--system->nr_events) { 1252 if (!--system->nr_events) {
1187 struct event_filter *filter = system->filter;
1188
1189 debugfs_remove_recursive(system->entry); 1253 debugfs_remove_recursive(system->entry);
1190 list_del(&system->list); 1254 list_del(&system->list);
1191 if (filter) { 1255 __put_system(system);
1192 kfree(filter->filter_string);
1193 kfree(filter);
1194 }
1195 kfree(system->name);
1196 kfree(system);
1197 } 1256 }
1198 break; 1257 break;
1199 } 1258 }
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8008ddcfbf20..256764ecccd6 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1886,6 +1886,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1886 1886
1887 mutex_lock(&event_mutex); 1887 mutex_lock(&event_mutex);
1888 1888
1889 /* Make sure the system still has events */
1890 if (!system->nr_events) {
1891 err = -ENODEV;
1892 goto out_unlock;
1893 }
1894
1889 if (!strcmp(strstrip(filter_string), "0")) { 1895 if (!strcmp(strstrip(filter_string), "0")) {
1890 filter_free_subsystem_preds(system); 1896 filter_free_subsystem_preds(system);
1891 remove_filter_string(system->filter); 1897 remove_filter_string(system->filter);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8d0e1cc4e974..c7b0c6a7db09 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -324,7 +324,8 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
324} 324}
325 325
326static int 326static int
327ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) 327ftrace_trace_onoff_callback(struct ftrace_hash *hash,
328 char *glob, char *cmd, char *param, int enable)
328{ 329{
329 struct ftrace_probe_ops *ops; 330 struct ftrace_probe_ops *ops;
330 void *count = (void *)-1; 331 void *count = (void *)-1;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 962cdb24ed81..a7d2a4c653d8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -74,6 +74,20 @@ static struct tracer_flags tracer_flags = {
74 74
75static struct trace_array *graph_array; 75static struct trace_array *graph_array;
76 76
77/*
78 * DURATION column is being also used to display IRQ signs,
79 * following values are used by print_graph_irq and others
80 * to fill in space into DURATION column.
81 */
82enum {
83 DURATION_FILL_FULL = -1,
84 DURATION_FILL_START = -2,
85 DURATION_FILL_END = -3,
86};
87
88static enum print_line_t
89print_graph_duration(unsigned long long duration, struct trace_seq *s,
90 u32 flags);
77 91
78/* Add a function return address to the trace stack on thread info.*/ 92/* Add a function return address to the trace stack on thread info.*/
79int 93int
@@ -213,7 +227,7 @@ int __trace_graph_entry(struct trace_array *tr,
213 227
214static inline int ftrace_graph_ignore_irqs(void) 228static inline int ftrace_graph_ignore_irqs(void)
215{ 229{
216 if (!ftrace_graph_skip_irqs) 230 if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT))
217 return 0; 231 return 0;
218 232
219 return in_irq(); 233 return in_irq();
@@ -577,32 +591,6 @@ get_return_for_leaf(struct trace_iterator *iter,
577 return next; 591 return next;
578} 592}
579 593
580/* Signal a overhead of time execution to the output */
581static int
582print_graph_overhead(unsigned long long duration, struct trace_seq *s,
583 u32 flags)
584{
585 /* If duration disappear, we don't need anything */
586 if (!(flags & TRACE_GRAPH_PRINT_DURATION))
587 return 1;
588
589 /* Non nested entry or return */
590 if (duration == -1)
591 return trace_seq_printf(s, " ");
592
593 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
594 /* Duration exceeded 100 msecs */
595 if (duration > 100000ULL)
596 return trace_seq_printf(s, "! ");
597
598 /* Duration exceeded 10 msecs */
599 if (duration > 10000ULL)
600 return trace_seq_printf(s, "+ ");
601 }
602
603 return trace_seq_printf(s, " ");
604}
605
606static int print_graph_abs_time(u64 t, struct trace_seq *s) 594static int print_graph_abs_time(u64 t, struct trace_seq *s)
607{ 595{
608 unsigned long usecs_rem; 596 unsigned long usecs_rem;
@@ -625,34 +613,36 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
625 addr >= (unsigned long)__irqentry_text_end) 613 addr >= (unsigned long)__irqentry_text_end)
626 return TRACE_TYPE_UNHANDLED; 614 return TRACE_TYPE_UNHANDLED;
627 615
628 /* Absolute time */ 616 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
629 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 617 /* Absolute time */
630 ret = print_graph_abs_time(iter->ts, s); 618 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
631 if (!ret) 619 ret = print_graph_abs_time(iter->ts, s);
632 return TRACE_TYPE_PARTIAL_LINE; 620 if (!ret)
633 } 621 return TRACE_TYPE_PARTIAL_LINE;
622 }
634 623
635 /* Cpu */ 624 /* Cpu */
636 if (flags & TRACE_GRAPH_PRINT_CPU) { 625 if (flags & TRACE_GRAPH_PRINT_CPU) {
637 ret = print_graph_cpu(s, cpu); 626 ret = print_graph_cpu(s, cpu);
638 if (ret == TRACE_TYPE_PARTIAL_LINE) 627 if (ret == TRACE_TYPE_PARTIAL_LINE)
639 return TRACE_TYPE_PARTIAL_LINE; 628 return TRACE_TYPE_PARTIAL_LINE;
640 } 629 }
641 630
642 /* Proc */ 631 /* Proc */
643 if (flags & TRACE_GRAPH_PRINT_PROC) { 632 if (flags & TRACE_GRAPH_PRINT_PROC) {
644 ret = print_graph_proc(s, pid); 633 ret = print_graph_proc(s, pid);
645 if (ret == TRACE_TYPE_PARTIAL_LINE) 634 if (ret == TRACE_TYPE_PARTIAL_LINE)
646 return TRACE_TYPE_PARTIAL_LINE; 635 return TRACE_TYPE_PARTIAL_LINE;
647 ret = trace_seq_printf(s, " | "); 636 ret = trace_seq_printf(s, " | ");
648 if (!ret) 637 if (!ret)
649 return TRACE_TYPE_PARTIAL_LINE; 638 return TRACE_TYPE_PARTIAL_LINE;
639 }
650 } 640 }
651 641
652 /* No overhead */ 642 /* No overhead */
653 ret = print_graph_overhead(-1, s, flags); 643 ret = print_graph_duration(DURATION_FILL_START, s, flags);
654 if (!ret) 644 if (ret != TRACE_TYPE_HANDLED)
655 return TRACE_TYPE_PARTIAL_LINE; 645 return ret;
656 646
657 if (type == TRACE_GRAPH_ENT) 647 if (type == TRACE_GRAPH_ENT)
658 ret = trace_seq_printf(s, "==========>"); 648 ret = trace_seq_printf(s, "==========>");
@@ -662,9 +652,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
662 if (!ret) 652 if (!ret)
663 return TRACE_TYPE_PARTIAL_LINE; 653 return TRACE_TYPE_PARTIAL_LINE;
664 654
665 /* Don't close the duration column if haven't one */ 655 ret = print_graph_duration(DURATION_FILL_END, s, flags);
666 if (flags & TRACE_GRAPH_PRINT_DURATION) 656 if (ret != TRACE_TYPE_HANDLED)
667 trace_seq_printf(s, " |"); 657 return ret;
658
668 ret = trace_seq_printf(s, "\n"); 659 ret = trace_seq_printf(s, "\n");
669 660
670 if (!ret) 661 if (!ret)
@@ -716,9 +707,49 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
716} 707}
717 708
718static enum print_line_t 709static enum print_line_t
719print_graph_duration(unsigned long long duration, struct trace_seq *s) 710print_graph_duration(unsigned long long duration, struct trace_seq *s,
711 u32 flags)
720{ 712{
721 int ret; 713 int ret = -1;
714
715 if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
716 !(trace_flags & TRACE_ITER_CONTEXT_INFO))
717 return TRACE_TYPE_HANDLED;
718
719 /* No real adata, just filling the column with spaces */
720 switch (duration) {
721 case DURATION_FILL_FULL:
722 ret = trace_seq_printf(s, " | ");
723 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
724 case DURATION_FILL_START:
725 ret = trace_seq_printf(s, " ");
726 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
727 case DURATION_FILL_END:
728 ret = trace_seq_printf(s, " |");
729 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
730 }
731
732 /* Signal a overhead of time execution to the output */
733 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
734 /* Duration exceeded 100 msecs */
735 if (duration > 100000ULL)
736 ret = trace_seq_printf(s, "! ");
737 /* Duration exceeded 10 msecs */
738 else if (duration > 10000ULL)
739 ret = trace_seq_printf(s, "+ ");
740 }
741
742 /*
743 * The -1 means we either did not exceed the duration tresholds
744 * or we dont want to print out the overhead. Either way we need
745 * to fill out the space.
746 */
747 if (ret == -1)
748 ret = trace_seq_printf(s, " ");
749
750 /* Catching here any failure happenned above */
751 if (!ret)
752 return TRACE_TYPE_PARTIAL_LINE;
722 753
723 ret = trace_print_graph_duration(duration, s); 754 ret = trace_print_graph_duration(duration, s);
724 if (ret != TRACE_TYPE_HANDLED) 755 if (ret != TRACE_TYPE_HANDLED)
@@ -767,18 +798,11 @@ print_graph_entry_leaf(struct trace_iterator *iter,
767 cpu_data->enter_funcs[call->depth] = 0; 798 cpu_data->enter_funcs[call->depth] = 0;
768 } 799 }
769 800
770 /* Overhead */ 801 /* Overhead and duration */
771 ret = print_graph_overhead(duration, s, flags); 802 ret = print_graph_duration(duration, s, flags);
772 if (!ret) 803 if (ret == TRACE_TYPE_PARTIAL_LINE)
773 return TRACE_TYPE_PARTIAL_LINE; 804 return TRACE_TYPE_PARTIAL_LINE;
774 805
775 /* Duration */
776 if (flags & TRACE_GRAPH_PRINT_DURATION) {
777 ret = print_graph_duration(duration, s);
778 if (ret == TRACE_TYPE_PARTIAL_LINE)
779 return TRACE_TYPE_PARTIAL_LINE;
780 }
781
782 /* Function */ 806 /* Function */
783 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 807 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
784 ret = trace_seq_printf(s, " "); 808 ret = trace_seq_printf(s, " ");
@@ -815,17 +839,10 @@ print_graph_entry_nested(struct trace_iterator *iter,
815 cpu_data->enter_funcs[call->depth] = call->func; 839 cpu_data->enter_funcs[call->depth] = call->func;
816 } 840 }
817 841
818 /* No overhead */
819 ret = print_graph_overhead(-1, s, flags);
820 if (!ret)
821 return TRACE_TYPE_PARTIAL_LINE;
822
823 /* No time */ 842 /* No time */
824 if (flags & TRACE_GRAPH_PRINT_DURATION) { 843 ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
825 ret = trace_seq_printf(s, " | "); 844 if (ret != TRACE_TYPE_HANDLED)
826 if (!ret) 845 return ret;
827 return TRACE_TYPE_PARTIAL_LINE;
828 }
829 846
830 /* Function */ 847 /* Function */
831 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 848 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
@@ -865,6 +882,9 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
865 return TRACE_TYPE_PARTIAL_LINE; 882 return TRACE_TYPE_PARTIAL_LINE;
866 } 883 }
867 884
885 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
886 return 0;
887
868 /* Absolute time */ 888 /* Absolute time */
869 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { 889 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
870 ret = print_graph_abs_time(iter->ts, s); 890 ret = print_graph_abs_time(iter->ts, s);
@@ -1078,18 +1098,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1078 if (print_graph_prologue(iter, s, 0, 0, flags)) 1098 if (print_graph_prologue(iter, s, 0, 0, flags))
1079 return TRACE_TYPE_PARTIAL_LINE; 1099 return TRACE_TYPE_PARTIAL_LINE;
1080 1100
1081 /* Overhead */ 1101 /* Overhead and duration */
1082 ret = print_graph_overhead(duration, s, flags); 1102 ret = print_graph_duration(duration, s, flags);
1083 if (!ret) 1103 if (ret == TRACE_TYPE_PARTIAL_LINE)
1084 return TRACE_TYPE_PARTIAL_LINE; 1104 return TRACE_TYPE_PARTIAL_LINE;
1085 1105
1086 /* Duration */
1087 if (flags & TRACE_GRAPH_PRINT_DURATION) {
1088 ret = print_graph_duration(duration, s);
1089 if (ret == TRACE_TYPE_PARTIAL_LINE)
1090 return TRACE_TYPE_PARTIAL_LINE;
1091 }
1092
1093 /* Closing brace */ 1106 /* Closing brace */
1094 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 1107 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
1095 ret = trace_seq_printf(s, " "); 1108 ret = trace_seq_printf(s, " ");
@@ -1146,17 +1159,10 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1146 if (print_graph_prologue(iter, s, 0, 0, flags)) 1159 if (print_graph_prologue(iter, s, 0, 0, flags))
1147 return TRACE_TYPE_PARTIAL_LINE; 1160 return TRACE_TYPE_PARTIAL_LINE;
1148 1161
1149 /* No overhead */
1150 ret = print_graph_overhead(-1, s, flags);
1151 if (!ret)
1152 return TRACE_TYPE_PARTIAL_LINE;
1153
1154 /* No time */ 1162 /* No time */
1155 if (flags & TRACE_GRAPH_PRINT_DURATION) { 1163 ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
1156 ret = trace_seq_printf(s, " | "); 1164 if (ret != TRACE_TYPE_HANDLED)
1157 if (!ret) 1165 return ret;
1158 return TRACE_TYPE_PARTIAL_LINE;
1159 }
1160 1166
1161 /* Indentation */ 1167 /* Indentation */
1162 if (depth > 0) 1168 if (depth > 0)
@@ -1207,7 +1213,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1207 1213
1208 1214
1209enum print_line_t 1215enum print_line_t
1210__print_graph_function_flags(struct trace_iterator *iter, u32 flags) 1216print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1211{ 1217{
1212 struct ftrace_graph_ent_entry *field; 1218 struct ftrace_graph_ent_entry *field;
1213 struct fgraph_data *data = iter->private; 1219 struct fgraph_data *data = iter->private;
@@ -1270,18 +1276,7 @@ __print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1270static enum print_line_t 1276static enum print_line_t
1271print_graph_function(struct trace_iterator *iter) 1277print_graph_function(struct trace_iterator *iter)
1272{ 1278{
1273 return __print_graph_function_flags(iter, tracer_flags.val); 1279 return print_graph_function_flags(iter, tracer_flags.val);
1274}
1275
1276enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
1277 u32 flags)
1278{
1279 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1280 flags |= TRACE_GRAPH_PRINT_DURATION;
1281 else
1282 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1283
1284 return __print_graph_function_flags(iter, flags);
1285} 1280}
1286 1281
1287static enum print_line_t 1282static enum print_line_t
@@ -1309,8 +1304,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
1309 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); 1304 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
1310 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); 1305 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
1311 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); 1306 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
1312 seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); 1307 seq_printf(s, "#%.*s||| / \n", size, spaces);
1313 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1314} 1308}
1315 1309
1316static void __print_graph_headers_flags(struct seq_file *s, u32 flags) 1310static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
@@ -1329,7 +1323,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1329 if (flags & TRACE_GRAPH_PRINT_PROC) 1323 if (flags & TRACE_GRAPH_PRINT_PROC)
1330 seq_printf(s, " TASK/PID "); 1324 seq_printf(s, " TASK/PID ");
1331 if (lat) 1325 if (lat)
1332 seq_printf(s, "|||||"); 1326 seq_printf(s, "||||");
1333 if (flags & TRACE_GRAPH_PRINT_DURATION) 1327 if (flags & TRACE_GRAPH_PRINT_DURATION)
1334 seq_printf(s, " DURATION "); 1328 seq_printf(s, " DURATION ");
1335 seq_printf(s, " FUNCTION CALLS\n"); 1329 seq_printf(s, " FUNCTION CALLS\n");
@@ -1343,7 +1337,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1343 if (flags & TRACE_GRAPH_PRINT_PROC) 1337 if (flags & TRACE_GRAPH_PRINT_PROC)
1344 seq_printf(s, " | | "); 1338 seq_printf(s, " | | ");
1345 if (lat) 1339 if (lat)
1346 seq_printf(s, "|||||"); 1340 seq_printf(s, "||||");
1347 if (flags & TRACE_GRAPH_PRINT_DURATION) 1341 if (flags & TRACE_GRAPH_PRINT_DURATION)
1348 seq_printf(s, " | | "); 1342 seq_printf(s, " | | ");
1349 seq_printf(s, " | | | |\n"); 1343 seq_printf(s, " | | | |\n");
@@ -1358,15 +1352,16 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
1358{ 1352{
1359 struct trace_iterator *iter = s->private; 1353 struct trace_iterator *iter = s->private;
1360 1354
1355 if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
1356 return;
1357
1361 if (trace_flags & TRACE_ITER_LATENCY_FMT) { 1358 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
1362 /* print nothing if the buffers are empty */ 1359 /* print nothing if the buffers are empty */
1363 if (trace_empty(iter)) 1360 if (trace_empty(iter))
1364 return; 1361 return;
1365 1362
1366 print_trace_header(s, iter); 1363 print_trace_header(s, iter);
1367 flags |= TRACE_GRAPH_PRINT_DURATION; 1364 }
1368 } else
1369 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1370 1365
1371 __print_graph_headers_flags(s, flags); 1366 __print_graph_headers_flags(s, flags);
1372} 1367}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index c77424be284d..667aa8cc0cfc 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -226,7 +226,9 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
226} 226}
227 227
228#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ 228#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
229 TRACE_GRAPH_PRINT_PROC) 229 TRACE_GRAPH_PRINT_PROC | \
230 TRACE_GRAPH_PRINT_ABS_TIME | \
231 TRACE_GRAPH_PRINT_DURATION)
230 232
231static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) 233static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
232{ 234{
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 27d13b36b8be..5fb3697bf0e5 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -343,6 +343,14 @@ DEFINE_BASIC_FETCH_FUNCS(deref)
343DEFINE_FETCH_deref(string) 343DEFINE_FETCH_deref(string)
344DEFINE_FETCH_deref(string_size) 344DEFINE_FETCH_deref(string_size)
345 345
346static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
347{
348 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
349 update_deref_fetch_param(data->orig.data);
350 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
351 update_symbol_cache(data->orig.data);
352}
353
346static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) 354static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
347{ 355{
348 if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) 356 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
@@ -377,6 +385,19 @@ DEFINE_BASIC_FETCH_FUNCS(bitfield)
377#define fetch_bitfield_string_size NULL 385#define fetch_bitfield_string_size NULL
378 386
379static __kprobes void 387static __kprobes void
388update_bitfield_fetch_param(struct bitfield_fetch_param *data)
389{
390 /*
391 * Don't check the bitfield itself, because this must be the
392 * last fetch function.
393 */
394 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
395 update_deref_fetch_param(data->orig.data);
396 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
397 update_symbol_cache(data->orig.data);
398}
399
400static __kprobes void
380free_bitfield_fetch_param(struct bitfield_fetch_param *data) 401free_bitfield_fetch_param(struct bitfield_fetch_param *data)
381{ 402{
382 /* 403 /*
@@ -389,6 +410,7 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data)
389 free_symbol_cache(data->orig.data); 410 free_symbol_cache(data->orig.data);
390 kfree(data); 411 kfree(data);
391} 412}
413
392/* Default (unsigned long) fetch type */ 414/* Default (unsigned long) fetch type */
393#define __DEFAULT_FETCH_TYPE(t) u##t 415#define __DEFAULT_FETCH_TYPE(t) u##t
394#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 416#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
@@ -536,6 +558,7 @@ struct probe_arg {
536/* Flags for trace_probe */ 558/* Flags for trace_probe */
537#define TP_FLAG_TRACE 1 559#define TP_FLAG_TRACE 1
538#define TP_FLAG_PROFILE 2 560#define TP_FLAG_PROFILE 2
561#define TP_FLAG_REGISTERED 4
539 562
540struct trace_probe { 563struct trace_probe {
541 struct list_head list; 564 struct list_head list;
@@ -555,16 +578,49 @@ struct trace_probe {
555 (sizeof(struct probe_arg) * (n))) 578 (sizeof(struct probe_arg) * (n)))
556 579
557 580
558static __kprobes int probe_is_return(struct trace_probe *tp) 581static __kprobes int trace_probe_is_return(struct trace_probe *tp)
559{ 582{
560 return tp->rp.handler != NULL; 583 return tp->rp.handler != NULL;
561} 584}
562 585
563static __kprobes const char *probe_symbol(struct trace_probe *tp) 586static __kprobes const char *trace_probe_symbol(struct trace_probe *tp)
564{ 587{
565 return tp->symbol ? tp->symbol : "unknown"; 588 return tp->symbol ? tp->symbol : "unknown";
566} 589}
567 590
591static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp)
592{
593 return tp->rp.kp.offset;
594}
595
596static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp)
597{
598 return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE));
599}
600
601static __kprobes bool trace_probe_is_registered(struct trace_probe *tp)
602{
603 return !!(tp->flags & TP_FLAG_REGISTERED);
604}
605
606static __kprobes bool trace_probe_has_gone(struct trace_probe *tp)
607{
608 return !!(kprobe_gone(&tp->rp.kp));
609}
610
611static __kprobes bool trace_probe_within_module(struct trace_probe *tp,
612 struct module *mod)
613{
614 int len = strlen(mod->name);
615 const char *name = trace_probe_symbol(tp);
616 return strncmp(mod->name, name, len) == 0 && name[len] == ':';
617}
618
619static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp)
620{
621 return !!strchr(trace_probe_symbol(tp), ':');
622}
623
568static int register_probe_event(struct trace_probe *tp); 624static int register_probe_event(struct trace_probe *tp);
569static void unregister_probe_event(struct trace_probe *tp); 625static void unregister_probe_event(struct trace_probe *tp);
570 626
@@ -646,6 +702,16 @@ error:
646 return ERR_PTR(ret); 702 return ERR_PTR(ret);
647} 703}
648 704
705static void update_probe_arg(struct probe_arg *arg)
706{
707 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
708 update_bitfield_fetch_param(arg->fetch.data);
709 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
710 update_deref_fetch_param(arg->fetch.data);
711 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
712 update_symbol_cache(arg->fetch.data);
713}
714
649static void free_probe_arg(struct probe_arg *arg) 715static void free_probe_arg(struct probe_arg *arg)
650{ 716{
651 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) 717 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
@@ -671,7 +737,7 @@ static void free_trace_probe(struct trace_probe *tp)
671 kfree(tp); 737 kfree(tp);
672} 738}
673 739
674static struct trace_probe *find_probe_event(const char *event, 740static struct trace_probe *find_trace_probe(const char *event,
675 const char *group) 741 const char *group)
676{ 742{
677 struct trace_probe *tp; 743 struct trace_probe *tp;
@@ -683,13 +749,96 @@ static struct trace_probe *find_probe_event(const char *event,
683 return NULL; 749 return NULL;
684} 750}
685 751
752/* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
753static int enable_trace_probe(struct trace_probe *tp, int flag)
754{
755 int ret = 0;
756
757 tp->flags |= flag;
758 if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) &&
759 !trace_probe_has_gone(tp)) {
760 if (trace_probe_is_return(tp))
761 ret = enable_kretprobe(&tp->rp);
762 else
763 ret = enable_kprobe(&tp->rp.kp);
764 }
765
766 return ret;
767}
768
769/* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */
770static void disable_trace_probe(struct trace_probe *tp, int flag)
771{
772 tp->flags &= ~flag;
773 if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) {
774 if (trace_probe_is_return(tp))
775 disable_kretprobe(&tp->rp);
776 else
777 disable_kprobe(&tp->rp.kp);
778 }
779}
780
781/* Internal register function - just handle k*probes and flags */
782static int __register_trace_probe(struct trace_probe *tp)
783{
784 int i, ret;
785
786 if (trace_probe_is_registered(tp))
787 return -EINVAL;
788
789 for (i = 0; i < tp->nr_args; i++)
790 update_probe_arg(&tp->args[i]);
791
792 /* Set/clear disabled flag according to tp->flag */
793 if (trace_probe_is_enabled(tp))
794 tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED;
795 else
796 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
797
798 if (trace_probe_is_return(tp))
799 ret = register_kretprobe(&tp->rp);
800 else
801 ret = register_kprobe(&tp->rp.kp);
802
803 if (ret == 0)
804 tp->flags |= TP_FLAG_REGISTERED;
805 else {
806 pr_warning("Could not insert probe at %s+%lu: %d\n",
807 trace_probe_symbol(tp), trace_probe_offset(tp), ret);
808 if (ret == -ENOENT && trace_probe_is_on_module(tp)) {
809 pr_warning("This probe might be able to register after"
810 "target module is loaded. Continue.\n");
811 ret = 0;
812 } else if (ret == -EILSEQ) {
813 pr_warning("Probing address(0x%p) is not an "
814 "instruction boundary.\n",
815 tp->rp.kp.addr);
816 ret = -EINVAL;
817 }
818 }
819
820 return ret;
821}
822
823/* Internal unregister function - just handle k*probes and flags */
824static void __unregister_trace_probe(struct trace_probe *tp)
825{
826 if (trace_probe_is_registered(tp)) {
827 if (trace_probe_is_return(tp))
828 unregister_kretprobe(&tp->rp);
829 else
830 unregister_kprobe(&tp->rp.kp);
831 tp->flags &= ~TP_FLAG_REGISTERED;
832 /* Cleanup kprobe for reuse */
833 if (tp->rp.kp.symbol_name)
834 tp->rp.kp.addr = NULL;
835 }
836}
837
686/* Unregister a trace_probe and probe_event: call with locking probe_lock */ 838/* Unregister a trace_probe and probe_event: call with locking probe_lock */
687static void unregister_trace_probe(struct trace_probe *tp) 839static void unregister_trace_probe(struct trace_probe *tp)
688{ 840{
689 if (probe_is_return(tp)) 841 __unregister_trace_probe(tp);
690 unregister_kretprobe(&tp->rp);
691 else
692 unregister_kprobe(&tp->rp.kp);
693 list_del(&tp->list); 842 list_del(&tp->list);
694 unregister_probe_event(tp); 843 unregister_probe_event(tp);
695} 844}
@@ -702,41 +851,65 @@ static int register_trace_probe(struct trace_probe *tp)
702 851
703 mutex_lock(&probe_lock); 852 mutex_lock(&probe_lock);
704 853
705 /* register as an event */ 854 /* Delete old (same name) event if exist */
706 old_tp = find_probe_event(tp->call.name, tp->call.class->system); 855 old_tp = find_trace_probe(tp->call.name, tp->call.class->system);
707 if (old_tp) { 856 if (old_tp) {
708 /* delete old event */
709 unregister_trace_probe(old_tp); 857 unregister_trace_probe(old_tp);
710 free_trace_probe(old_tp); 858 free_trace_probe(old_tp);
711 } 859 }
860
861 /* Register new event */
712 ret = register_probe_event(tp); 862 ret = register_probe_event(tp);
713 if (ret) { 863 if (ret) {
714 pr_warning("Failed to register probe event(%d)\n", ret); 864 pr_warning("Failed to register probe event(%d)\n", ret);
715 goto end; 865 goto end;
716 } 866 }
717 867
718 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; 868 /* Register k*probe */
719 if (probe_is_return(tp)) 869 ret = __register_trace_probe(tp);
720 ret = register_kretprobe(&tp->rp); 870 if (ret < 0)
721 else
722 ret = register_kprobe(&tp->rp.kp);
723
724 if (ret) {
725 pr_warning("Could not insert probe(%d)\n", ret);
726 if (ret == -EILSEQ) {
727 pr_warning("Probing address(0x%p) is not an "
728 "instruction boundary.\n",
729 tp->rp.kp.addr);
730 ret = -EINVAL;
731 }
732 unregister_probe_event(tp); 871 unregister_probe_event(tp);
733 } else 872 else
734 list_add_tail(&tp->list, &probe_list); 873 list_add_tail(&tp->list, &probe_list);
874
735end: 875end:
736 mutex_unlock(&probe_lock); 876 mutex_unlock(&probe_lock);
737 return ret; 877 return ret;
738} 878}
739 879
880/* Module notifier call back, checking event on the module */
881static int trace_probe_module_callback(struct notifier_block *nb,
882 unsigned long val, void *data)
883{
884 struct module *mod = data;
885 struct trace_probe *tp;
886 int ret;
887
888 if (val != MODULE_STATE_COMING)
889 return NOTIFY_DONE;
890
891 /* Update probes on coming module */
892 mutex_lock(&probe_lock);
893 list_for_each_entry(tp, &probe_list, list) {
894 if (trace_probe_within_module(tp, mod)) {
895 __unregister_trace_probe(tp);
896 ret = __register_trace_probe(tp);
897 if (ret)
898 pr_warning("Failed to re-register probe %s on"
899 "%s: %d\n",
900 tp->call.name, mod->name, ret);
901 }
902 }
903 mutex_unlock(&probe_lock);
904
905 return NOTIFY_DONE;
906}
907
908static struct notifier_block trace_probe_module_nb = {
909 .notifier_call = trace_probe_module_callback,
910 .priority = 1 /* Invoked after kprobe module callback */
911};
912
740/* Split symbol and offset. */ 913/* Split symbol and offset. */
741static int split_symbol_offset(char *symbol, unsigned long *offset) 914static int split_symbol_offset(char *symbol, unsigned long *offset)
742{ 915{
@@ -962,8 +1135,8 @@ static int create_trace_probe(int argc, char **argv)
962{ 1135{
963 /* 1136 /*
964 * Argument syntax: 1137 * Argument syntax:
965 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] 1138 * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS]
966 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] 1139 * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS]
967 * Fetch args: 1140 * Fetch args:
968 * $retval : fetch return value 1141 * $retval : fetch return value
969 * $stack : fetch stack address 1142 * $stack : fetch stack address
@@ -1025,7 +1198,7 @@ static int create_trace_probe(int argc, char **argv)
1025 return -EINVAL; 1198 return -EINVAL;
1026 } 1199 }
1027 mutex_lock(&probe_lock); 1200 mutex_lock(&probe_lock);
1028 tp = find_probe_event(event, group); 1201 tp = find_trace_probe(event, group);
1029 if (!tp) { 1202 if (!tp) {
1030 mutex_unlock(&probe_lock); 1203 mutex_unlock(&probe_lock);
1031 pr_info("Event %s/%s doesn't exist.\n", group, event); 1204 pr_info("Event %s/%s doesn't exist.\n", group, event);
@@ -1144,7 +1317,7 @@ error:
1144 return ret; 1317 return ret;
1145} 1318}
1146 1319
1147static void cleanup_all_probes(void) 1320static void release_all_trace_probes(void)
1148{ 1321{
1149 struct trace_probe *tp; 1322 struct trace_probe *tp;
1150 1323
@@ -1158,7 +1331,6 @@ static void cleanup_all_probes(void)
1158 mutex_unlock(&probe_lock); 1331 mutex_unlock(&probe_lock);
1159} 1332}
1160 1333
1161
1162/* Probes listing interfaces */ 1334/* Probes listing interfaces */
1163static void *probes_seq_start(struct seq_file *m, loff_t *pos) 1335static void *probes_seq_start(struct seq_file *m, loff_t *pos)
1164{ 1336{
@@ -1181,15 +1353,16 @@ static int probes_seq_show(struct seq_file *m, void *v)
1181 struct trace_probe *tp = v; 1353 struct trace_probe *tp = v;
1182 int i; 1354 int i;
1183 1355
1184 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 1356 seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p');
1185 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); 1357 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
1186 1358
1187 if (!tp->symbol) 1359 if (!tp->symbol)
1188 seq_printf(m, " 0x%p", tp->rp.kp.addr); 1360 seq_printf(m, " 0x%p", tp->rp.kp.addr);
1189 else if (tp->rp.kp.offset) 1361 else if (tp->rp.kp.offset)
1190 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); 1362 seq_printf(m, " %s+%u", trace_probe_symbol(tp),
1363 tp->rp.kp.offset);
1191 else 1364 else
1192 seq_printf(m, " %s", probe_symbol(tp)); 1365 seq_printf(m, " %s", trace_probe_symbol(tp));
1193 1366
1194 for (i = 0; i < tp->nr_args; i++) 1367 for (i = 0; i < tp->nr_args; i++)
1195 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); 1368 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
@@ -1209,7 +1382,7 @@ static int probes_open(struct inode *inode, struct file *file)
1209{ 1382{
1210 if ((file->f_mode & FMODE_WRITE) && 1383 if ((file->f_mode & FMODE_WRITE) &&
1211 (file->f_flags & O_TRUNC)) 1384 (file->f_flags & O_TRUNC))
1212 cleanup_all_probes(); 1385 release_all_trace_probes();
1213 1386
1214 return seq_open(file, &probes_seq_op); 1387 return seq_open(file, &probes_seq_op);
1215} 1388}
@@ -1397,7 +1570,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1397 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1570 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1398 1571
1399 if (!filter_current_check_discard(buffer, call, entry, event)) 1572 if (!filter_current_check_discard(buffer, call, entry, event))
1400 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1573 trace_nowake_buffer_unlock_commit_regs(buffer, event,
1574 irq_flags, pc, regs);
1401} 1575}
1402 1576
1403/* Kretprobe handler */ 1577/* Kretprobe handler */
@@ -1429,7 +1603,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1429 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1603 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1430 1604
1431 if (!filter_current_check_discard(buffer, call, entry, event)) 1605 if (!filter_current_check_discard(buffer, call, entry, event))
1432 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1606 trace_nowake_buffer_unlock_commit_regs(buffer, event,
1607 irq_flags, pc, regs);
1433} 1608}
1434 1609
1435/* Event entry printers */ 1610/* Event entry printers */
@@ -1511,30 +1686,6 @@ partial:
1511 return TRACE_TYPE_PARTIAL_LINE; 1686 return TRACE_TYPE_PARTIAL_LINE;
1512} 1687}
1513 1688
1514static int probe_event_enable(struct ftrace_event_call *call)
1515{
1516 struct trace_probe *tp = (struct trace_probe *)call->data;
1517
1518 tp->flags |= TP_FLAG_TRACE;
1519 if (probe_is_return(tp))
1520 return enable_kretprobe(&tp->rp);
1521 else
1522 return enable_kprobe(&tp->rp.kp);
1523}
1524
1525static void probe_event_disable(struct ftrace_event_call *call)
1526{
1527 struct trace_probe *tp = (struct trace_probe *)call->data;
1528
1529 tp->flags &= ~TP_FLAG_TRACE;
1530 if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
1531 if (probe_is_return(tp))
1532 disable_kretprobe(&tp->rp);
1533 else
1534 disable_kprobe(&tp->rp.kp);
1535 }
1536}
1537
1538#undef DEFINE_FIELD 1689#undef DEFINE_FIELD
1539#define DEFINE_FIELD(type, item, name, is_signed) \ 1690#define DEFINE_FIELD(type, item, name, is_signed) \
1540 do { \ 1691 do { \
@@ -1596,7 +1747,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1596 1747
1597 const char *fmt, *arg; 1748 const char *fmt, *arg;
1598 1749
1599 if (!probe_is_return(tp)) { 1750 if (!trace_probe_is_return(tp)) {
1600 fmt = "(%lx)"; 1751 fmt = "(%lx)";
1601 arg = "REC->" FIELD_STRING_IP; 1752 arg = "REC->" FIELD_STRING_IP;
1602 } else { 1753 } else {
@@ -1713,49 +1864,25 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1713 head = this_cpu_ptr(call->perf_events); 1864 head = this_cpu_ptr(call->perf_events);
1714 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1865 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
1715} 1866}
1716
1717static int probe_perf_enable(struct ftrace_event_call *call)
1718{
1719 struct trace_probe *tp = (struct trace_probe *)call->data;
1720
1721 tp->flags |= TP_FLAG_PROFILE;
1722
1723 if (probe_is_return(tp))
1724 return enable_kretprobe(&tp->rp);
1725 else
1726 return enable_kprobe(&tp->rp.kp);
1727}
1728
1729static void probe_perf_disable(struct ftrace_event_call *call)
1730{
1731 struct trace_probe *tp = (struct trace_probe *)call->data;
1732
1733 tp->flags &= ~TP_FLAG_PROFILE;
1734
1735 if (!(tp->flags & TP_FLAG_TRACE)) {
1736 if (probe_is_return(tp))
1737 disable_kretprobe(&tp->rp);
1738 else
1739 disable_kprobe(&tp->rp.kp);
1740 }
1741}
1742#endif /* CONFIG_PERF_EVENTS */ 1867#endif /* CONFIG_PERF_EVENTS */
1743 1868
1744static __kprobes 1869static __kprobes
1745int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) 1870int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1746{ 1871{
1872 struct trace_probe *tp = (struct trace_probe *)event->data;
1873
1747 switch (type) { 1874 switch (type) {
1748 case TRACE_REG_REGISTER: 1875 case TRACE_REG_REGISTER:
1749 return probe_event_enable(event); 1876 return enable_trace_probe(tp, TP_FLAG_TRACE);
1750 case TRACE_REG_UNREGISTER: 1877 case TRACE_REG_UNREGISTER:
1751 probe_event_disable(event); 1878 disable_trace_probe(tp, TP_FLAG_TRACE);
1752 return 0; 1879 return 0;
1753 1880
1754#ifdef CONFIG_PERF_EVENTS 1881#ifdef CONFIG_PERF_EVENTS
1755 case TRACE_REG_PERF_REGISTER: 1882 case TRACE_REG_PERF_REGISTER:
1756 return probe_perf_enable(event); 1883 return enable_trace_probe(tp, TP_FLAG_PROFILE);
1757 case TRACE_REG_PERF_UNREGISTER: 1884 case TRACE_REG_PERF_UNREGISTER:
1758 probe_perf_disable(event); 1885 disable_trace_probe(tp, TP_FLAG_PROFILE);
1759 return 0; 1886 return 0;
1760#endif 1887#endif
1761 } 1888 }
@@ -1805,7 +1932,7 @@ static int register_probe_event(struct trace_probe *tp)
1805 1932
1806 /* Initialize ftrace_event_call */ 1933 /* Initialize ftrace_event_call */
1807 INIT_LIST_HEAD(&call->class->fields); 1934 INIT_LIST_HEAD(&call->class->fields);
1808 if (probe_is_return(tp)) { 1935 if (trace_probe_is_return(tp)) {
1809 call->event.funcs = &kretprobe_funcs; 1936 call->event.funcs = &kretprobe_funcs;
1810 call->class->define_fields = kretprobe_event_define_fields; 1937 call->class->define_fields = kretprobe_event_define_fields;
1811 } else { 1938 } else {
@@ -1844,6 +1971,9 @@ static __init int init_kprobe_trace(void)
1844 struct dentry *d_tracer; 1971 struct dentry *d_tracer;
1845 struct dentry *entry; 1972 struct dentry *entry;
1846 1973
1974 if (register_module_notifier(&trace_probe_module_nb))
1975 return -EINVAL;
1976
1847 d_tracer = tracing_init_dentry(); 1977 d_tracer = tracing_init_dentry();
1848 if (!d_tracer) 1978 if (!d_tracer)
1849 return 0; 1979 return 0;
@@ -1897,12 +2027,12 @@ static __init int kprobe_trace_self_tests_init(void)
1897 warn++; 2027 warn++;
1898 } else { 2028 } else {
1899 /* Enable trace point */ 2029 /* Enable trace point */
1900 tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); 2030 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
1901 if (WARN_ON_ONCE(tp == NULL)) { 2031 if (WARN_ON_ONCE(tp == NULL)) {
1902 pr_warning("error on getting new probe.\n"); 2032 pr_warning("error on getting new probe.\n");
1903 warn++; 2033 warn++;
1904 } else 2034 } else
1905 probe_event_enable(&tp->call); 2035 enable_trace_probe(tp, TP_FLAG_TRACE);
1906 } 2036 }
1907 2037
1908 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 2038 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
@@ -1912,12 +2042,12 @@ static __init int kprobe_trace_self_tests_init(void)
1912 warn++; 2042 warn++;
1913 } else { 2043 } else {
1914 /* Enable trace point */ 2044 /* Enable trace point */
1915 tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); 2045 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
1916 if (WARN_ON_ONCE(tp == NULL)) { 2046 if (WARN_ON_ONCE(tp == NULL)) {
1917 pr_warning("error on getting new probe.\n"); 2047 pr_warning("error on getting new probe.\n");
1918 warn++; 2048 warn++;
1919 } else 2049 } else
1920 probe_event_enable(&tp->call); 2050 enable_trace_probe(tp, TP_FLAG_TRACE);
1921 } 2051 }
1922 2052
1923 if (warn) 2053 if (warn)
@@ -1938,7 +2068,7 @@ static __init int kprobe_trace_self_tests_init(void)
1938 } 2068 }
1939 2069
1940end: 2070end:
1941 cleanup_all_probes(); 2071 release_all_trace_probes();
1942 if (warn) 2072 if (warn)
1943 pr_cont("NG: Some tests are failed. Please check them.\n"); 2073 pr_cont("NG: Some tests are failed. Please check them.\n");
1944 else 2074 else
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 017fa376505d..fd3c8aae55e5 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -12,7 +12,7 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/time.h> 13#include <linux/time.h>
14 14
15#include <asm/atomic.h> 15#include <linux/atomic.h>
16 16
17#include "trace.h" 17#include "trace.h"
18#include "trace_output.h" 18#include "trace_output.h"
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e37de492a9e1..51999309a6cf 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1107,19 +1107,20 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1107{ 1107{
1108 struct stack_entry *field; 1108 struct stack_entry *field;
1109 struct trace_seq *s = &iter->seq; 1109 struct trace_seq *s = &iter->seq;
1110 int i; 1110 unsigned long *p;
1111 unsigned long *end;
1111 1112
1112 trace_assign_type(field, iter->ent); 1113 trace_assign_type(field, iter->ent);
1114 end = (unsigned long *)((long)iter->ent + iter->ent_size);
1113 1115
1114 if (!trace_seq_puts(s, "<stack trace>\n")) 1116 if (!trace_seq_puts(s, "<stack trace>\n"))
1115 goto partial; 1117 goto partial;
1116 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 1118
1117 if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) 1119 for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
1118 break;
1119 if (!trace_seq_puts(s, " => ")) 1120 if (!trace_seq_puts(s, " => "))
1120 goto partial; 1121 goto partial;
1121 1122
1122 if (!seq_print_ip_sym(s, field->caller[i], flags)) 1123 if (!seq_print_ip_sym(s, *p, flags))
1123 goto partial; 1124 goto partial;
1124 if (!trace_seq_puts(s, "\n")) 1125 if (!trace_seq_puts(s, "\n"))
1125 goto partial; 1126 goto partial;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index f029dd4fd2ca..e4a70c0c71b6 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -227,7 +227,9 @@ static void wakeup_trace_close(struct trace_iterator *iter)
227 graph_trace_close(iter); 227 graph_trace_close(iter);
228} 228}
229 229
230#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) 230#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \
231 TRACE_GRAPH_PRINT_ABS_TIME | \
232 TRACE_GRAPH_PRINT_DURATION)
231 233
232static enum print_line_t wakeup_print_line(struct trace_iterator *iter) 234static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
233{ 235{
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index b0b53b8e4c25..77575b386d97 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -156,20 +156,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
156{ 156{
157 long *ptr = filp->private_data; 157 long *ptr = filp->private_data;
158 unsigned long val, flags; 158 unsigned long val, flags;
159 char buf[64];
160 int ret; 159 int ret;
161 int cpu; 160 int cpu;
162 161
163 if (count >= sizeof(buf)) 162 ret = kstrtoul_from_user(ubuf, count, 10, &val);
164 return -EINVAL; 163 if (ret)
165
166 if (copy_from_user(&buf, ubuf, count))
167 return -EFAULT;
168
169 buf[count] = 0;
170
171 ret = strict_strtoul(buf, 10, &val);
172 if (ret < 0)
173 return ret; 164 return ret;
174 165
175 local_irq_save(flags); 166 local_irq_save(flags);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3d0c56ad4792..36491cd5b7d4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -200,6 +200,7 @@ static int is_softlockup(unsigned long touch_ts)
200} 200}
201 201
202#ifdef CONFIG_HARDLOCKUP_DETECTOR 202#ifdef CONFIG_HARDLOCKUP_DETECTOR
203
203static struct perf_event_attr wd_hw_attr = { 204static struct perf_event_attr wd_hw_attr = {
204 .type = PERF_TYPE_HARDWARE, 205 .type = PERF_TYPE_HARDWARE,
205 .config = PERF_COUNT_HW_CPU_CYCLES, 206 .config = PERF_COUNT_HW_CPU_CYCLES,
@@ -209,7 +210,7 @@ static struct perf_event_attr wd_hw_attr = {
209}; 210};
210 211
211/* Callback function for perf event subsystem */ 212/* Callback function for perf event subsystem */
212static void watchdog_overflow_callback(struct perf_event *event, int nmi, 213static void watchdog_overflow_callback(struct perf_event *event,
213 struct perf_sample_data *data, 214 struct perf_sample_data *data,
214 struct pt_regs *regs) 215 struct pt_regs *regs)
215{ 216{
@@ -368,10 +369,11 @@ static int watchdog_nmi_enable(int cpu)
368 if (event != NULL) 369 if (event != NULL)
369 goto out_enable; 370 goto out_enable;
370 371
371 /* Try to register using hardware perf events */
372 wd_attr = &wd_hw_attr; 372 wd_attr = &wd_hw_attr;
373 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); 373 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
374 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); 374
375 /* Try to register using hardware perf events */
376 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
375 if (!IS_ERR(event)) { 377 if (!IS_ERR(event)) {
376 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 378 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
377 goto out_save; 379 goto out_save;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0400553f0d04..25fb1b0e53fa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -221,7 +221,7 @@ typedef unsigned long mayday_mask_t;
221 * per-CPU workqueues: 221 * per-CPU workqueues:
222 */ 222 */
223struct workqueue_struct { 223struct workqueue_struct {
224 unsigned int flags; /* I: WQ_* flags */ 224 unsigned int flags; /* W: WQ_* flags */
225 union { 225 union {
226 struct cpu_workqueue_struct __percpu *pcpu; 226 struct cpu_workqueue_struct __percpu *pcpu;
227 struct cpu_workqueue_struct *single; 227 struct cpu_workqueue_struct *single;
@@ -240,6 +240,7 @@ struct workqueue_struct {
240 mayday_mask_t mayday_mask; /* cpus requesting rescue */ 240 mayday_mask_t mayday_mask; /* cpus requesting rescue */
241 struct worker *rescuer; /* I: rescue worker */ 241 struct worker *rescuer; /* I: rescue worker */
242 242
243 int nr_drainers; /* W: drain in progress */
243 int saved_max_active; /* W: saved cwq max_active */ 244 int saved_max_active; /* W: saved cwq max_active */
244 const char *name; /* I: workqueue name */ 245 const char *name; /* I: workqueue name */
245#ifdef CONFIG_LOCKDEP 246#ifdef CONFIG_LOCKDEP
@@ -990,7 +991,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
990 debug_work_activate(work); 991 debug_work_activate(work);
991 992
992 /* if dying, only works from the same workqueue are allowed */ 993 /* if dying, only works from the same workqueue are allowed */
993 if (unlikely(wq->flags & WQ_DYING) && 994 if (unlikely(wq->flags & WQ_DRAINING) &&
994 WARN_ON_ONCE(!is_chained_work(wq))) 995 WARN_ON_ONCE(!is_chained_work(wq)))
995 return; 996 return;
996 997
@@ -2381,6 +2382,54 @@ out_unlock:
2381} 2382}
2382EXPORT_SYMBOL_GPL(flush_workqueue); 2383EXPORT_SYMBOL_GPL(flush_workqueue);
2383 2384
2385/**
2386 * drain_workqueue - drain a workqueue
2387 * @wq: workqueue to drain
2388 *
2389 * Wait until the workqueue becomes empty. While draining is in progress,
2390 * only chain queueing is allowed. IOW, only currently pending or running
2391 * work items on @wq can queue further work items on it. @wq is flushed
2392 * repeatedly until it becomes empty. The number of flushing is detemined
2393 * by the depth of chaining and should be relatively short. Whine if it
2394 * takes too long.
2395 */
2396void drain_workqueue(struct workqueue_struct *wq)
2397{
2398 unsigned int flush_cnt = 0;
2399 unsigned int cpu;
2400
2401 /*
2402 * __queue_work() needs to test whether there are drainers, is much
2403 * hotter than drain_workqueue() and already looks at @wq->flags.
2404 * Use WQ_DRAINING so that queue doesn't have to check nr_drainers.
2405 */
2406 spin_lock(&workqueue_lock);
2407 if (!wq->nr_drainers++)
2408 wq->flags |= WQ_DRAINING;
2409 spin_unlock(&workqueue_lock);
2410reflush:
2411 flush_workqueue(wq);
2412
2413 for_each_cwq_cpu(cpu, wq) {
2414 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2415
2416 if (!cwq->nr_active && list_empty(&cwq->delayed_works))
2417 continue;
2418
2419 if (++flush_cnt == 10 ||
2420 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2421 pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n",
2422 wq->name, flush_cnt);
2423 goto reflush;
2424 }
2425
2426 spin_lock(&workqueue_lock);
2427 if (!--wq->nr_drainers)
2428 wq->flags &= ~WQ_DRAINING;
2429 spin_unlock(&workqueue_lock);
2430}
2431EXPORT_SYMBOL_GPL(drain_workqueue);
2432
2384static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, 2433static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2385 bool wait_executing) 2434 bool wait_executing)
2386{ 2435{
@@ -3009,34 +3058,10 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
3009 */ 3058 */
3010void destroy_workqueue(struct workqueue_struct *wq) 3059void destroy_workqueue(struct workqueue_struct *wq)
3011{ 3060{
3012 unsigned int flush_cnt = 0;
3013 unsigned int cpu; 3061 unsigned int cpu;
3014 3062
3015 /* 3063 /* drain it before proceeding with destruction */
3016 * Mark @wq dying and drain all pending works. Once WQ_DYING is 3064 drain_workqueue(wq);
3017 * set, only chain queueing is allowed. IOW, only currently
3018 * pending or running work items on @wq can queue further work
3019 * items on it. @wq is flushed repeatedly until it becomes empty.
3020 * The number of flushing is detemined by the depth of chaining and
3021 * should be relatively short. Whine if it takes too long.
3022 */
3023 wq->flags |= WQ_DYING;
3024reflush:
3025 flush_workqueue(wq);
3026
3027 for_each_cwq_cpu(cpu, wq) {
3028 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3029
3030 if (!cwq->nr_active && list_empty(&cwq->delayed_works))
3031 continue;
3032
3033 if (++flush_cnt == 10 ||
3034 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
3035 printk(KERN_WARNING "workqueue %s: flush on "
3036 "destruction isn't complete after %u tries\n",
3037 wq->name, flush_cnt);
3038 goto reflush;
3039 }
3040 3065
3041 /* 3066 /*
3042 * wq list is used to freeze wq, remove from list after 3067 * wq list is used to freeze wq, remove from list after