Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 5
-rw-r--r--  kernel/cgroup.c | 15
-rw-r--r--  kernel/compat.c | 21
-rw-r--r--  kernel/cpuset.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 2
-rw-r--r--  kernel/exit.c | 4
-rw-r--r--  kernel/fork.c | 2
-rw-r--r--  kernel/futex.c | 66
-rw-r--r--  kernel/futex_compat.c | 2
-rw-r--r--  kernel/gcov/fs.c | 244
-rw-r--r--  kernel/groups.c | 5
-rw-r--r--  kernel/hrtimer.c | 16
-rw-r--r--  kernel/hung_task.c | 4
-rw-r--r--  kernel/hw_breakpoint.c | 74
-rw-r--r--  kernel/irq_work.c | 164
-rw-r--r--  kernel/jump_label.c | 429
-rw-r--r--  kernel/kfifo.c | 2
-rw-r--r--  kernel/kprobes.c | 26
-rw-r--r--  kernel/lockdep.c | 51
-rw-r--r--  kernel/module.c | 10
-rw-r--r--  kernel/mutex.c | 23
-rw-r--r--  kernel/perf_event.c | 2628
-rw-r--r--  kernel/pid.c | 3
-rw-r--r--  kernel/pm_qos_params.c | 4
-rw-r--r--  kernel/power/hibernate.c | 1
-rw-r--r--  kernel/power/snapshot.c | 86
-rw-r--r--  kernel/power/swap.c | 6
-rw-r--r--  kernel/printk.c | 4
-rw-r--r--  kernel/rcupdate.c | 8
-rw-r--r--  kernel/rcutiny.c | 33
-rw-r--r--  kernel/rcutiny_plugin.h | 582
-rw-r--r--  kernel/rcutorture.c | 17
-rw-r--r--  kernel/rcutree.c | 92
-rw-r--r--  kernel/rcutree.h | 20
-rw-r--r--  kernel/rcutree_plugin.h | 47
-rw-r--r--  kernel/rcutree_trace.c | 12
-rw-r--r--  kernel/sched.c | 323
-rw-r--r--  kernel/sched_fair.c | 94
-rw-r--r--  kernel/sched_features.h | 5
-rw-r--r--  kernel/sched_rt.c | 40
-rw-r--r--  kernel/sched_stoptask.c | 108
-rw-r--r--  kernel/signal.c | 8
-rw-r--r--  kernel/smp.c | 17
-rw-r--r--  kernel/softirq.c | 64
-rw-r--r--  kernel/srcu.c | 2
-rw-r--r--  kernel/stop_machine.c | 8
-rw-r--r--  kernel/sys.c | 2
-rw-r--r--  kernel/sysctl.c | 7
-rw-r--r--  kernel/sysctl_check.c | 9
-rw-r--r--  kernel/test_kprobes.c | 12
-rw-r--r--  kernel/timer.c | 7
-rw-r--r--  kernel/trace/Kconfig | 5
-rw-r--r--  kernel/trace/ftrace.c | 142
-rw-r--r--  kernel/trace/ring_buffer.c | 25
-rw-r--r--  kernel/trace/trace.c | 2
-rw-r--r--  kernel/trace/trace.h | 4
-rw-r--r--  kernel/trace/trace_event_perf.c | 31
-rw-r--r--  kernel/trace/trace_events.c | 55
-rw-r--r--  kernel/trace/trace_functions_graph.c | 209
-rw-r--r--  kernel/trace/trace_irqsoff.c | 152
-rw-r--r--  kernel/trace/trace_kprobe.c | 43
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 256
-rw-r--r--  kernel/trace/trace_workqueue.c | 10
-rw-r--r--  kernel/tracepoint.c | 14
-rw-r--r--  kernel/watchdog.c | 60
-rw-r--r--  kernel/workqueue.c | 27
66 files changed, 4624 insertions(+), 1829 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b72d1a74be0..e2c9d52cfe9e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o range.o 13 async.o range.o jump_label.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o 14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
15obj-y += groups.o 15obj-y += groups.o
16 16
@@ -23,6 +23,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg
23CFLAGS_REMOVE_cgroup-debug.o = -pg 23CFLAGS_REMOVE_cgroup-debug.o = -pg
24CFLAGS_REMOVE_sched_clock.o = -pg 24CFLAGS_REMOVE_sched_clock.o = -pg
25CFLAGS_REMOVE_perf_event.o = -pg 25CFLAGS_REMOVE_perf_event.o = -pg
26CFLAGS_REMOVE_irq_work.o = -pg
26endif 27endif
27 28
28obj-$(CONFIG_FREEZER) += freezer.o 29obj-$(CONFIG_FREEZER) += freezer.o
@@ -86,6 +87,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o
86obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o 87obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
87obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 88obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
88obj-$(CONFIG_TINY_RCU) += rcutiny.o 89obj-$(CONFIG_TINY_RCU) += rcutiny.o
90obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
89obj-$(CONFIG_RELAY) += relay.o 91obj-$(CONFIG_RELAY) += relay.o
90obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 92obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
91obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 93obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -100,6 +102,7 @@ obj-$(CONFIG_TRACING) += trace/
100obj-$(CONFIG_X86_DS) += trace/ 102obj-$(CONFIG_X86_DS) += trace/
101obj-$(CONFIG_RING_BUFFER) += trace/ 103obj-$(CONFIG_RING_BUFFER) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o 104obj-$(CONFIG_SMP) += sched_cpupri.o
105obj-$(CONFIG_IRQ_WORK) += irq_work.o
103obj-$(CONFIG_PERF_EVENTS) += perf_event.o 106obj-$(CONFIG_PERF_EVENTS) += perf_event.o
104obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 107obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
105obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 108obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 192f88c5b0f9..291ba3d04bea 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,7 +138,7 @@ struct css_id {
138 * is called after synchronize_rcu(). But for safe use, css_is_removed() 138 * is called after synchronize_rcu(). But for safe use, css_is_removed()
139 * css_tryget() should be used for avoiding race. 139 * css_tryget() should be used for avoiding race.
140 */ 140 */
141 struct cgroup_subsys_state *css; 141 struct cgroup_subsys_state __rcu *css;
142 /* 142 /*
143 * ID of this css. 143 * ID of this css.
144 */ 144 */
@@ -1791,19 +1791,20 @@ out:
1791} 1791}
1792 1792
1793/** 1793/**
1794 * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup 1794 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
1795 * @from: attach to all cgroups of a given task
1795 * @tsk: the task to be attached 1796 * @tsk: the task to be attached
1796 */ 1797 */
1797int cgroup_attach_task_current_cg(struct task_struct *tsk) 1798int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1798{ 1799{
1799 struct cgroupfs_root *root; 1800 struct cgroupfs_root *root;
1800 struct cgroup *cur_cg;
1801 int retval = 0; 1801 int retval = 0;
1802 1802
1803 cgroup_lock(); 1803 cgroup_lock();
1804 for_each_active_root(root) { 1804 for_each_active_root(root) {
1805 cur_cg = task_cgroup_from_root(current, root); 1805 struct cgroup *from_cg = task_cgroup_from_root(from, root);
1806 retval = cgroup_attach_task(cur_cg, tsk); 1806
1807 retval = cgroup_attach_task(from_cg, tsk);
1807 if (retval) 1808 if (retval)
1808 break; 1809 break;
1809 } 1810 }
@@ -1811,7 +1812,7 @@ int cgroup_attach_task_current_cg(struct task_struct *tsk)
1811 1812
1812 return retval; 1813 return retval;
1813} 1814}
1814EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg); 1815EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
1815 1816
1816/* 1817/*
1817 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex 1818 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
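[Note on the cgroup.c hunk above: cgroup_attach_task_current_cg() is generalized into cgroup_attach_task_all(), which takes an explicit source task instead of implicitly using current. A minimal sketch of how the old single-argument behaviour can still be expressed on top of the new interface; the wrapper below is illustrative and not part of this diff:]

	/*
	 * Hypothetical wrapper, assuming only the semantics visible above:
	 * attaching @tsk to the cgroups of the current task is the
	 * from == current special case of cgroup_attach_task_all().
	 */
	static inline int cgroup_attach_task_current_cg(struct task_struct *tsk)
	{
		return cgroup_attach_task_all(current, tsk);
	}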
diff --git a/kernel/compat.c b/kernel/compat.c
index e167efce8423..c9e2ec0b34a8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -1126,3 +1126,24 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1126 1126
1127 return 0; 1127 return 0;
1128} 1128}
1129
1130/*
1131 * Allocate user-space memory for the duration of a single system call,
1132 * in order to marshall parameters inside a compat thunk.
1133 */
1134void __user *compat_alloc_user_space(unsigned long len)
1135{
1136 void __user *ptr;
1137
1138 /* If len would occupy more than half of the entire compat space... */
1139 if (unlikely(len > (((compat_uptr_t)~0) >> 1)))
1140 return NULL;
1141
1142 ptr = arch_compat_alloc_user_space(len);
1143
1144 if (unlikely(!access_ok(VERIFY_WRITE, ptr, len)))
1145 return NULL;
1146
1147 return ptr;
1148}
1149EXPORT_SYMBOL_GPL(compat_alloc_user_space);
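[Note on the compat.c hunk above: compat_alloc_user_space() reserves a scratch area in user space for the duration of one system call, so a compat thunk can build a native-format structure there and hand it to the regular syscall path. A minimal sketch of that marshalling pattern; struct foo_compat, struct foo and sys_foo() are made up purely for illustration:]

	/*
	 * Hypothetical compat thunk: widen a 32-bit structure into the
	 * scratch area so the native handler only ever sees the 64-bit
	 * layout. Names here are illustrative, not from this patch.
	 */
	struct foo_compat { compat_ulong_t value; };
	struct foo        { unsigned long  value; };

	asmlinkage long compat_sys_foo(struct foo_compat __user *ufoo32)
	{
		struct foo __user *ufoo;
		compat_ulong_t value;

		if (get_user(value, &ufoo32->value))
			return -EFAULT;

		ufoo = compat_alloc_user_space(sizeof(*ufoo));
		if (ufoo == NULL ||
		    put_user((unsigned long)value, &ufoo->value))
			return -EFAULT;

		return sys_foo(ufoo);	/* native path, hypothetical handler */
	}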
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b23c0979bbe7..51b143e2a07a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1397 if (tsk->flags & PF_THREAD_BOUND) 1397 if (tsk->flags & PF_THREAD_BOUND)
1398 return -EINVAL; 1398 return -EINVAL;
1399 1399
1400 ret = security_task_setscheduler(tsk, 0, NULL); 1400 ret = security_task_setscheduler(tsk);
1401 if (ret) 1401 if (ret)
1402 return ret; 1402 return ret;
1403 if (threadgroup) { 1403 if (threadgroup) {
@@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1405 1405
1406 rcu_read_lock(); 1406 rcu_read_lock();
1407 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1407 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1408 ret = security_task_setscheduler(c, 0, NULL); 1408 ret = security_task_setscheduler(c);
1409 if (ret) { 1409 if (ret) {
1410 rcu_read_unlock(); 1410 rcu_read_unlock();
1411 return ret; 1411 return ret;
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 75bd9b3ebbb7..20059ef4459a 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -274,7 +274,6 @@ static int kdb_bp(int argc, const char **argv)
274 int i, bpno; 274 int i, bpno;
275 kdb_bp_t *bp, *bp_check; 275 kdb_bp_t *bp, *bp_check;
276 int diag; 276 int diag;
277 int free;
278 char *symname = NULL; 277 char *symname = NULL;
279 long offset = 0ul; 278 long offset = 0ul;
280 int nextarg; 279 int nextarg;
@@ -305,7 +304,6 @@ static int kdb_bp(int argc, const char **argv)
305 /* 304 /*
306 * Find an empty bp structure to allocate 305 * Find an empty bp structure to allocate
307 */ 306 */
308 free = KDB_MAXBPT;
309 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { 307 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
310 if (bp->bp_free) 308 if (bp->bp_free)
311 break; 309 break;
diff --git a/kernel/exit.c b/kernel/exit.c
index 03120229db28..e2bdf37f9fde 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -149,9 +149,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
149{ 149{
150 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 150 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
151 151
152#ifdef CONFIG_PERF_EVENTS 152 perf_event_delayed_put(tsk);
153 WARN_ON_ONCE(tsk->perf_event_ctxp);
154#endif
155 trace_sched_process_free(tsk); 153 trace_sched_process_free(tsk);
156 put_task_struct(tsk); 154 put_task_struct(tsk);
157} 155}
diff --git a/kernel/fork.c b/kernel/fork.c
index b7e9d60a675d..c445f8cc408d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -356,10 +356,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
356 if (IS_ERR(pol)) 356 if (IS_ERR(pol))
357 goto fail_nomem_policy; 357 goto fail_nomem_policy;
358 vma_set_policy(tmp, pol); 358 vma_set_policy(tmp, pol);
359 tmp->vm_mm = mm;
359 if (anon_vma_fork(tmp, mpnt)) 360 if (anon_vma_fork(tmp, mpnt))
360 goto fail_nomem_anon_vma_fork; 361 goto fail_nomem_anon_vma_fork;
361 tmp->vm_flags &= ~VM_LOCKED; 362 tmp->vm_flags &= ~VM_LOCKED;
362 tmp->vm_mm = mm;
363 tmp->vm_next = tmp->vm_prev = NULL; 363 tmp->vm_next = tmp->vm_prev = NULL;
364 file = tmp->vm_file; 364 file = tmp->vm_file;
365 if (file) { 365 if (file) {
diff --git a/kernel/futex.c b/kernel/futex.c
index 6a3a5fa1526d..a118bf160e0b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -91,6 +91,7 @@ struct futex_pi_state {
91 91
92/** 92/**
93 * struct futex_q - The hashed futex queue entry, one per waiting task 93 * struct futex_q - The hashed futex queue entry, one per waiting task
94 * @list: priority-sorted list of tasks waiting on this futex
94 * @task: the task waiting on the futex 95 * @task: the task waiting on the futex
95 * @lock_ptr: the hash bucket lock 96 * @lock_ptr: the hash bucket lock
96 * @key: the key the futex is hashed on 97 * @key: the key the futex is hashed on
@@ -104,7 +105,7 @@ struct futex_pi_state {
104 * 105 *
105 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 106 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
106 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 107 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
107 * The order of wakup is always to make the first condition true, then 108 * The order of wakeup is always to make the first condition true, then
108 * the second. 109 * the second.
109 * 110 *
110 * PI futexes are typically woken before they are removed from the hash list via 111 * PI futexes are typically woken before they are removed from the hash list via
@@ -295,7 +296,7 @@ void put_futex_key(int fshared, union futex_key *key)
295 * Slow path to fixup the fault we just took in the atomic write 296 * Slow path to fixup the fault we just took in the atomic write
296 * access to @uaddr. 297 * access to @uaddr.
297 * 298 *
298 * We have no generic implementation of a non destructive write to the 299 * We have no generic implementation of a non-destructive write to the
299 * user address. We know that we faulted in the atomic pagefault 300 * user address. We know that we faulted in the atomic pagefault
300 * disabled section so we can as well avoid the #PF overhead by 301 * disabled section so we can as well avoid the #PF overhead by
301 * calling get_user_pages() right away. 302 * calling get_user_pages() right away.
@@ -515,7 +516,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
515 */ 516 */
516 pi_state = this->pi_state; 517 pi_state = this->pi_state;
517 /* 518 /*
518 * Userspace might have messed up non PI and PI futexes 519 * Userspace might have messed up non-PI and PI futexes
519 */ 520 */
520 if (unlikely(!pi_state)) 521 if (unlikely(!pi_state))
521 return -EINVAL; 522 return -EINVAL;
@@ -736,8 +737,8 @@ static void wake_futex(struct futex_q *q)
736 737
737 /* 738 /*
738 * We set q->lock_ptr = NULL _before_ we wake up the task. If 739 * We set q->lock_ptr = NULL _before_ we wake up the task. If
739 * a non futex wake up happens on another CPU then the task 740 * a non-futex wake up happens on another CPU then the task
740 * might exit and p would dereference a non existing task 741 * might exit and p would dereference a non-existing task
741 * struct. Prevent this by holding a reference on p across the 742 * struct. Prevent this by holding a reference on p across the
742 * wake up. 743 * wake up.
743 */ 744 */
@@ -1131,11 +1132,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1131 1132
1132/** 1133/**
1133 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1134 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1134 * uaddr1: source futex user address 1135 * @uaddr1: source futex user address
1135 * uaddr2: target futex user address 1136 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
1136 * nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1137 * @uaddr2: target futex user address
1137 * nr_requeue: number of waiters to requeue (0-INT_MAX) 1138 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1138 * requeue_pi: if we are attempting to requeue from a non-pi futex to a 1139 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1140 * @cmpval: @uaddr1 expected value (or %NULL)
1141 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1139 * pi futex (pi to pi requeue is not supported) 1142 * pi futex (pi to pi requeue is not supported)
1140 * 1143 *
1141 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1144 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
@@ -1360,10 +1363,10 @@ out:
1360 1363
1361/* The key must be already stored in q->key. */ 1364/* The key must be already stored in q->key. */
1362static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) 1365static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1366 __acquires(&hb->lock)
1363{ 1367{
1364 struct futex_hash_bucket *hb; 1368 struct futex_hash_bucket *hb;
1365 1369
1366 get_futex_key_refs(&q->key);
1367 hb = hash_futex(&q->key); 1370 hb = hash_futex(&q->key);
1368 q->lock_ptr = &hb->lock; 1371 q->lock_ptr = &hb->lock;
1369 1372
@@ -1373,9 +1376,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1373 1376
1374static inline void 1377static inline void
1375queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1378queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1379 __releases(&hb->lock)
1376{ 1380{
1377 spin_unlock(&hb->lock); 1381 spin_unlock(&hb->lock);
1378 drop_futex_key_refs(&q->key);
1379} 1382}
1380 1383
1381/** 1384/**
@@ -1391,6 +1394,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1391 * an example). 1394 * an example).
1392 */ 1395 */
1393static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1396static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1397 __releases(&hb->lock)
1394{ 1398{
1395 int prio; 1399 int prio;
1396 1400
@@ -1471,6 +1475,7 @@ retry:
1471 * and dropped here. 1475 * and dropped here.
1472 */ 1476 */
1473static void unqueue_me_pi(struct futex_q *q) 1477static void unqueue_me_pi(struct futex_q *q)
1478 __releases(q->lock_ptr)
1474{ 1479{
1475 WARN_ON(plist_node_empty(&q->list)); 1480 WARN_ON(plist_node_empty(&q->list));
1476 plist_del(&q->list, &q->list.plist); 1481 plist_del(&q->list, &q->list.plist);
@@ -1480,8 +1485,6 @@ static void unqueue_me_pi(struct futex_q *q)
1480 q->pi_state = NULL; 1485 q->pi_state = NULL;
1481 1486
1482 spin_unlock(q->lock_ptr); 1487 spin_unlock(q->lock_ptr);
1483
1484 drop_futex_key_refs(&q->key);
1485} 1488}
1486 1489
1487/* 1490/*
@@ -1812,7 +1815,10 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1812 } 1815 }
1813 1816
1814retry: 1817retry:
1815 /* Prepare to wait on uaddr. */ 1818 /*
1819 * Prepare to wait on uaddr. On success, holds hb lock and increments
1820 * q.key refs.
1821 */
1816 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1822 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
1817 if (ret) 1823 if (ret)
1818 goto out; 1824 goto out;
@@ -1822,28 +1828,27 @@ retry:
1822 1828
1823 /* If we were woken (and unqueued), we succeeded, whatever. */ 1829 /* If we were woken (and unqueued), we succeeded, whatever. */
1824 ret = 0; 1830 ret = 0;
1831 /* unqueue_me() drops q.key ref */
1825 if (!unqueue_me(&q)) 1832 if (!unqueue_me(&q))
1826 goto out_put_key; 1833 goto out;
1827 ret = -ETIMEDOUT; 1834 ret = -ETIMEDOUT;
1828 if (to && !to->task) 1835 if (to && !to->task)
1829 goto out_put_key; 1836 goto out;
1830 1837
1831 /* 1838 /*
1832 * We expect signal_pending(current), but we might be the 1839 * We expect signal_pending(current), but we might be the
1833 * victim of a spurious wakeup as well. 1840 * victim of a spurious wakeup as well.
1834 */ 1841 */
1835 if (!signal_pending(current)) { 1842 if (!signal_pending(current))
1836 put_futex_key(fshared, &q.key);
1837 goto retry; 1843 goto retry;
1838 }
1839 1844
1840 ret = -ERESTARTSYS; 1845 ret = -ERESTARTSYS;
1841 if (!abs_time) 1846 if (!abs_time)
1842 goto out_put_key; 1847 goto out;
1843 1848
1844 restart = &current_thread_info()->restart_block; 1849 restart = &current_thread_info()->restart_block;
1845 restart->fn = futex_wait_restart; 1850 restart->fn = futex_wait_restart;
1846 restart->futex.uaddr = (u32 *)uaddr; 1851 restart->futex.uaddr = uaddr;
1847 restart->futex.val = val; 1852 restart->futex.val = val;
1848 restart->futex.time = abs_time->tv64; 1853 restart->futex.time = abs_time->tv64;
1849 restart->futex.bitset = bitset; 1854 restart->futex.bitset = bitset;
@@ -1856,8 +1861,6 @@ retry:
1856 1861
1857 ret = -ERESTART_RESTARTBLOCK; 1862 ret = -ERESTART_RESTARTBLOCK;
1858 1863
1859out_put_key:
1860 put_futex_key(fshared, &q.key);
1861out: 1864out:
1862 if (to) { 1865 if (to) {
1863 hrtimer_cancel(&to->timer); 1866 hrtimer_cancel(&to->timer);
@@ -1869,7 +1872,7 @@ out:
1869 1872
1870static long futex_wait_restart(struct restart_block *restart) 1873static long futex_wait_restart(struct restart_block *restart)
1871{ 1874{
1872 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1875 u32 __user *uaddr = restart->futex.uaddr;
1873 int fshared = 0; 1876 int fshared = 0;
1874 ktime_t t, *tp = NULL; 1877 ktime_t t, *tp = NULL;
1875 1878
@@ -2236,7 +2239,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2236 q.rt_waiter = &rt_waiter; 2239 q.rt_waiter = &rt_waiter;
2237 q.requeue_pi_key = &key2; 2240 q.requeue_pi_key = &key2;
2238 2241
2239 /* Prepare to wait on uaddr. */ 2242 /*
2243 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2244 * count.
2245 */
2240 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2246 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2241 if (ret) 2247 if (ret)
2242 goto out_key2; 2248 goto out_key2;
@@ -2254,7 +2260,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2254 * In order for us to be here, we know our q.key == key2, and since 2260 * In order for us to be here, we know our q.key == key2, and since
2255 * we took the hb->lock above, we also know that futex_requeue() has 2261 * we took the hb->lock above, we also know that futex_requeue() has
2256 * completed and we no longer have to concern ourselves with a wakeup 2262 * completed and we no longer have to concern ourselves with a wakeup
2257 * race with the atomic proxy lock acquition by the requeue code. 2263 * race with the atomic proxy lock acquisition by the requeue code. The
2264 * futex_requeue dropped our key1 reference and incremented our key2
2265 * reference count.
2258 */ 2266 */
2259 2267
2260 /* Check if the requeue code acquired the second futex for us. */ 2268 /* Check if the requeue code acquired the second futex for us. */
@@ -2458,7 +2466,7 @@ retry:
2458 */ 2466 */
2459static inline int fetch_robust_entry(struct robust_list __user **entry, 2467static inline int fetch_robust_entry(struct robust_list __user **entry,
2460 struct robust_list __user * __user *head, 2468 struct robust_list __user * __user *head,
2461 int *pi) 2469 unsigned int *pi)
2462{ 2470{
2463 unsigned long uentry; 2471 unsigned long uentry;
2464 2472
@@ -2647,7 +2655,7 @@ static int __init futex_init(void)
2647 * of the complex code paths. Also we want to prevent 2655 * of the complex code paths. Also we want to prevent
2648 * registration of robust lists in that case. NULL is 2656 * registration of robust lists in that case. NULL is
2649 * guaranteed to fault and we get -EFAULT on functional 2657 * guaranteed to fault and we get -EFAULT on functional
2650 * implementation, the non functional ones will return 2658 * implementation, the non-functional ones will return
2651 * -ENOSYS. 2659 * -ENOSYS.
2652 */ 2660 */
2653 curval = cmpxchg_futex_value_locked(NULL, 0, 0); 2661 curval = cmpxchg_futex_value_locked(NULL, 0, 0);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d49afb2395e5..06da4dfc339b 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -19,7 +19,7 @@
19 */ 19 */
20static inline int 20static inline int
21fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, 21fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
22 compat_uptr_t __user *head, int *pi) 22 compat_uptr_t __user *head, unsigned int *pi)
23{ 23{
24 if (get_user(*uentry, head)) 24 if (get_user(*uentry, head))
25 return -EFAULT; 25 return -EFAULT;
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index ef3c3f88a7a3..f83972b16564 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -33,10 +33,11 @@
33 * @children: child nodes 33 * @children: child nodes
34 * @all: list head for list of all nodes 34 * @all: list head for list of all nodes
35 * @parent: parent node 35 * @parent: parent node
36 * @info: associated profiling data structure if not a directory 36 * @loaded_info: array of pointers to profiling data sets for loaded object
37 * @ghost: when an object file containing profiling data is unloaded we keep a 37 * files.
38 * copy of the profiling data here to allow collecting coverage data 38 * @num_loaded: number of profiling data sets for loaded object files.
39 * for cleanup code. Such a node is called a "ghost". 39 * @unloaded_info: accumulated copy of profiling data sets for unloaded
40 * object files. Used only when gcov_persist=1.
40 * @dentry: main debugfs entry, either a directory or data file 41 * @dentry: main debugfs entry, either a directory or data file
41 * @links: associated symbolic links 42 * @links: associated symbolic links
42 * @name: data file basename 43 * @name: data file basename
@@ -51,10 +52,11 @@ struct gcov_node {
51 struct list_head children; 52 struct list_head children;
52 struct list_head all; 53 struct list_head all;
53 struct gcov_node *parent; 54 struct gcov_node *parent;
54 struct gcov_info *info; 55 struct gcov_info **loaded_info;
55 struct gcov_info *ghost; 56 struct gcov_info *unloaded_info;
56 struct dentry *dentry; 57 struct dentry *dentry;
57 struct dentry **links; 58 struct dentry **links;
59 int num_loaded;
58 char name[0]; 60 char name[0];
59}; 61};
60 62
@@ -136,16 +138,37 @@ static const struct seq_operations gcov_seq_ops = {
136}; 138};
137 139
138/* 140/*
139 * Return the profiling data set for a given node. This can either be the 141 * Return a profiling data set associated with the given node. This is
140 * original profiling data structure or a duplicate (also called "ghost") 142 * either a data set for a loaded object file or a data set copy in case
141 * in case the associated object file has been unloaded. 143 * all associated object files have been unloaded.
142 */ 144 */
143static struct gcov_info *get_node_info(struct gcov_node *node) 145static struct gcov_info *get_node_info(struct gcov_node *node)
144{ 146{
145 if (node->info) 147 if (node->num_loaded > 0)
146 return node->info; 148 return node->loaded_info[0];
147 149
148 return node->ghost; 150 return node->unloaded_info;
151}
152
153/*
154 * Return a newly allocated profiling data set which contains the sum of
155 * all profiling data associated with the given node.
156 */
157static struct gcov_info *get_accumulated_info(struct gcov_node *node)
158{
159 struct gcov_info *info;
160 int i = 0;
161
162 if (node->unloaded_info)
163 info = gcov_info_dup(node->unloaded_info);
164 else
165 info = gcov_info_dup(node->loaded_info[i++]);
166 if (!info)
167 return NULL;
168 for (; i < node->num_loaded; i++)
169 gcov_info_add(info, node->loaded_info[i]);
170
171 return info;
149} 172}
150 173
151/* 174/*
@@ -163,9 +186,10 @@ static int gcov_seq_open(struct inode *inode, struct file *file)
163 mutex_lock(&node_lock); 186 mutex_lock(&node_lock);
164 /* 187 /*
165 * Read from a profiling data copy to minimize reference tracking 188 * Read from a profiling data copy to minimize reference tracking
166 * complexity and concurrent access. 189 * complexity and concurrent access and to keep accumulating multiple
190 * profiling data sets associated with one node simple.
167 */ 191 */
168 info = gcov_info_dup(get_node_info(node)); 192 info = get_accumulated_info(node);
169 if (!info) 193 if (!info)
170 goto out_unlock; 194 goto out_unlock;
171 iter = gcov_iter_new(info); 195 iter = gcov_iter_new(info);
@@ -225,12 +249,25 @@ static struct gcov_node *get_node_by_name(const char *name)
225 return NULL; 249 return NULL;
226} 250}
227 251
252/*
253 * Reset all profiling data associated with the specified node.
254 */
255static void reset_node(struct gcov_node *node)
256{
257 int i;
258
259 if (node->unloaded_info)
260 gcov_info_reset(node->unloaded_info);
261 for (i = 0; i < node->num_loaded; i++)
262 gcov_info_reset(node->loaded_info[i]);
263}
264
228static void remove_node(struct gcov_node *node); 265static void remove_node(struct gcov_node *node);
229 266
230/* 267/*
231 * write() implementation for gcov data files. Reset profiling data for the 268 * write() implementation for gcov data files. Reset profiling data for the
232 * associated file. If the object file has been unloaded (i.e. this is 269 * corresponding file. If all associated object files have been unloaded,
233 * a "ghost" node), remove the debug fs node as well. 270 * remove the debug fs node as well.
234 */ 271 */
235static ssize_t gcov_seq_write(struct file *file, const char __user *addr, 272static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
236 size_t len, loff_t *pos) 273 size_t len, loff_t *pos)
@@ -245,10 +282,10 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
245 node = get_node_by_name(info->filename); 282 node = get_node_by_name(info->filename);
246 if (node) { 283 if (node) {
247 /* Reset counts or remove node for unloaded modules. */ 284 /* Reset counts or remove node for unloaded modules. */
248 if (node->ghost) 285 if (node->num_loaded == 0)
249 remove_node(node); 286 remove_node(node);
250 else 287 else
251 gcov_info_reset(node->info); 288 reset_node(node);
252 } 289 }
253 /* Reset counts for open file. */ 290 /* Reset counts for open file. */
254 gcov_info_reset(info); 291 gcov_info_reset(info);
@@ -378,7 +415,10 @@ static void init_node(struct gcov_node *node, struct gcov_info *info,
378 INIT_LIST_HEAD(&node->list); 415 INIT_LIST_HEAD(&node->list);
379 INIT_LIST_HEAD(&node->children); 416 INIT_LIST_HEAD(&node->children);
380 INIT_LIST_HEAD(&node->all); 417 INIT_LIST_HEAD(&node->all);
381 node->info = info; 418 if (node->loaded_info) {
419 node->loaded_info[0] = info;
420 node->num_loaded = 1;
421 }
382 node->parent = parent; 422 node->parent = parent;
383 if (name) 423 if (name)
384 strcpy(node->name, name); 424 strcpy(node->name, name);
@@ -394,9 +434,13 @@ static struct gcov_node *new_node(struct gcov_node *parent,
394 struct gcov_node *node; 434 struct gcov_node *node;
395 435
396 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); 436 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
397 if (!node) { 437 if (!node)
398 pr_warning("out of memory\n"); 438 goto err_nomem;
399 return NULL; 439 if (info) {
440 node->loaded_info = kcalloc(1, sizeof(struct gcov_info *),
441 GFP_KERNEL);
442 if (!node->loaded_info)
443 goto err_nomem;
400 } 444 }
401 init_node(node, info, name, parent); 445 init_node(node, info, name, parent);
402 /* Differentiate between gcov data file nodes and directory nodes. */ 446 /* Differentiate between gcov data file nodes and directory nodes. */
@@ -416,6 +460,11 @@ static struct gcov_node *new_node(struct gcov_node *parent,
416 list_add(&node->all, &all_head); 460 list_add(&node->all, &all_head);
417 461
418 return node; 462 return node;
463
464err_nomem:
465 kfree(node);
466 pr_warning("out of memory\n");
467 return NULL;
419} 468}
420 469
421/* Remove symbolic links associated with node. */ 470/* Remove symbolic links associated with node. */
@@ -441,8 +490,9 @@ static void release_node(struct gcov_node *node)
441 list_del(&node->all); 490 list_del(&node->all);
442 debugfs_remove(node->dentry); 491 debugfs_remove(node->dentry);
443 remove_links(node); 492 remove_links(node);
444 if (node->ghost) 493 kfree(node->loaded_info);
445 gcov_info_free(node->ghost); 494 if (node->unloaded_info)
495 gcov_info_free(node->unloaded_info);
446 kfree(node); 496 kfree(node);
447} 497}
448 498
@@ -477,7 +527,7 @@ static struct gcov_node *get_child_by_name(struct gcov_node *parent,
477 527
478/* 528/*
479 * write() implementation for reset file. Reset all profiling data to zero 529 * write() implementation for reset file. Reset all profiling data to zero
480 * and remove ghost nodes. 530 * and remove nodes for which all associated object files are unloaded.
481 */ 531 */
482static ssize_t reset_write(struct file *file, const char __user *addr, 532static ssize_t reset_write(struct file *file, const char __user *addr,
483 size_t len, loff_t *pos) 533 size_t len, loff_t *pos)
@@ -487,8 +537,8 @@ static ssize_t reset_write(struct file *file, const char __user *addr,
487 mutex_lock(&node_lock); 537 mutex_lock(&node_lock);
488restart: 538restart:
489 list_for_each_entry(node, &all_head, all) { 539 list_for_each_entry(node, &all_head, all) {
490 if (node->info) 540 if (node->num_loaded > 0)
491 gcov_info_reset(node->info); 541 reset_node(node);
492 else if (list_empty(&node->children)) { 542 else if (list_empty(&node->children)) {
493 remove_node(node); 543 remove_node(node);
494 /* Several nodes may have gone - restart loop. */ 544 /* Several nodes may have gone - restart loop. */
@@ -564,37 +614,115 @@ err_remove:
564} 614}
565 615
566/* 616/*
567 * The profiling data set associated with this node is being unloaded. Store a 617 * Associate a profiling data set with an existing node. Needs to be called
568 * copy of the profiling data and turn this node into a "ghost". 618 * with node_lock held.
569 */ 619 */
570static int ghost_node(struct gcov_node *node) 620static void add_info(struct gcov_node *node, struct gcov_info *info)
571{ 621{
572 node->ghost = gcov_info_dup(node->info); 622 struct gcov_info **loaded_info;
573 if (!node->ghost) { 623 int num = node->num_loaded;
574 pr_warning("could not save data for '%s' (out of memory)\n", 624
575 node->info->filename); 625 /*
576 return -ENOMEM; 626 * Prepare new array. This is done first to simplify cleanup in
627 * case the new data set is incompatible, the node only contains
628 * unloaded data sets and there's not enough memory for the array.
629 */
630 loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);
631 if (!loaded_info) {
632 pr_warning("could not add '%s' (out of memory)\n",
633 info->filename);
634 return;
635 }
636 memcpy(loaded_info, node->loaded_info,
637 num * sizeof(struct gcov_info *));
638 loaded_info[num] = info;
639 /* Check if the new data set is compatible. */
640 if (num == 0) {
641 /*
642 * A module was unloaded, modified and reloaded. The new
643 * data set replaces the copy of the last one.
644 */
645 if (!gcov_info_is_compatible(node->unloaded_info, info)) {
646 pr_warning("discarding saved data for %s "
647 "(incompatible version)\n", info->filename);
648 gcov_info_free(node->unloaded_info);
649 node->unloaded_info = NULL;
650 }
651 } else {
652 /*
653 * Two different versions of the same object file are loaded.
654 * The initial one takes precedence.
655 */
656 if (!gcov_info_is_compatible(node->loaded_info[0], info)) {
657 pr_warning("could not add '%s' (incompatible "
658 "version)\n", info->filename);
659 kfree(loaded_info);
660 return;
661 }
577 } 662 }
578 node->info = NULL; 663 /* Overwrite previous array. */
664 kfree(node->loaded_info);
665 node->loaded_info = loaded_info;
666 node->num_loaded = num + 1;
667}
579 668
580 return 0; 669/*
670 * Return the index of a profiling data set associated with a node.
671 */
672static int get_info_index(struct gcov_node *node, struct gcov_info *info)
673{
674 int i;
675
676 for (i = 0; i < node->num_loaded; i++) {
677 if (node->loaded_info[i] == info)
678 return i;
679 }
680 return -ENOENT;
581} 681}
582 682
583/* 683/*
584 * Profiling data for this node has been loaded again. Add profiling data 684 * Save the data of a profiling data set which is being unloaded.
585 * from previous instantiation and turn this node into a regular node.
586 */ 685 */
587static void revive_node(struct gcov_node *node, struct gcov_info *info) 686static void save_info(struct gcov_node *node, struct gcov_info *info)
588{ 687{
589 if (gcov_info_is_compatible(node->ghost, info)) 688 if (node->unloaded_info)
590 gcov_info_add(info, node->ghost); 689 gcov_info_add(node->unloaded_info, info);
591 else { 690 else {
592 pr_warning("discarding saved data for '%s' (version changed)\n", 691 node->unloaded_info = gcov_info_dup(info);
692 if (!node->unloaded_info) {
693 pr_warning("could not save data for '%s' "
694 "(out of memory)\n", info->filename);
695 }
696 }
697}
698
699/*
700 * Disassociate a profiling data set from a node. Needs to be called with
701 * node_lock held.
702 */
703static void remove_info(struct gcov_node *node, struct gcov_info *info)
704{
705 int i;
706
707 i = get_info_index(node, info);
708 if (i < 0) {
709 pr_warning("could not remove '%s' (not found)\n",
593 info->filename); 710 info->filename);
711 return;
594 } 712 }
595 gcov_info_free(node->ghost); 713 if (gcov_persist)
596 node->ghost = NULL; 714 save_info(node, info);
597 node->info = info; 715 /* Shrink array. */
716 node->loaded_info[i] = node->loaded_info[node->num_loaded - 1];
717 node->num_loaded--;
718 if (node->num_loaded > 0)
719 return;
720 /* Last loaded data set was removed. */
721 kfree(node->loaded_info);
722 node->loaded_info = NULL;
723 node->num_loaded = 0;
724 if (!node->unloaded_info)
725 remove_node(node);
598} 726}
599 727
600/* 728/*
@@ -609,30 +737,18 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
609 node = get_node_by_name(info->filename); 737 node = get_node_by_name(info->filename);
610 switch (action) { 738 switch (action) {
611 case GCOV_ADD: 739 case GCOV_ADD:
612 /* Add new node or revive ghost. */ 740 if (node)
613 if (!node) { 741 add_info(node, info);
742 else
614 add_node(info); 743 add_node(info);
615 break;
616 }
617 if (gcov_persist)
618 revive_node(node, info);
619 else {
620 pr_warning("could not add '%s' (already exists)\n",
621 info->filename);
622 }
623 break; 744 break;
624 case GCOV_REMOVE: 745 case GCOV_REMOVE:
625 /* Remove node or turn into ghost. */ 746 if (node)
626 if (!node) { 747 remove_info(node, info);
748 else {
627 pr_warning("could not remove '%s' (not found)\n", 749 pr_warning("could not remove '%s' (not found)\n",
628 info->filename); 750 info->filename);
629 break;
630 } 751 }
631 if (gcov_persist) {
632 if (!ghost_node(node))
633 break;
634 }
635 remove_node(node);
636 break; 752 break;
637 } 753 }
638 mutex_unlock(&node_lock); 754 mutex_unlock(&node_lock);
diff --git a/kernel/groups.c b/kernel/groups.c
index 53b1916c9492..253dc0f35cf4 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -143,10 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp)
143 right = group_info->ngroups; 143 right = group_info->ngroups;
144 while (left < right) { 144 while (left < right) {
145 unsigned int mid = (left+right)/2; 145 unsigned int mid = (left+right)/2;
146 int cmp = grp - GROUP_AT(group_info, mid); 146 if (grp > GROUP_AT(group_info, mid))
147 if (cmp > 0)
148 left = mid + 1; 147 left = mid + 1;
149 else if (cmp < 0) 148 else if (grp < GROUP_AT(group_info, mid))
150 right = mid; 149 right = mid;
151 else 150 else
152 return 1; 151 return 1;
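[Note on the groups.c hunk above: the subtraction-based comparison is dropped because gid_t is unsigned, so for IDs that differ by more than 2^31 the truncated difference can come out with the wrong sign and steer the binary search the wrong way. A small stand-alone illustration of that wraparound, in plain userspace C rather than kernel code:]

	#include <stdio.h>

	/* Stand-in for the kernel's unsigned gid_t. */
	typedef unsigned int demo_gid_t;

	int main(void)
	{
		demo_gid_t grp = 0;
		demo_gid_t mid = 0x90000000u;	/* a very large group ID */

		/* Old style: unsigned subtraction truncated into an int. */
		int cmp = grp - mid;	/* 0 - 0x90000000 == 0x70000000 unsigned */

		printf("cmp > 0   : %d  (wrongly suggests grp > mid)\n", cmp > 0);
		printf("grp < mid : %d  (direct comparison is correct)\n", grp < mid);
		return 0;
	}

Comparing the unsigned values directly, as the new code does, avoids the intermediate signed conversion entirely.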
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ce669174f355..72206cf5c6cf 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -931,6 +931,7 @@ static inline int
931remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) 931remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
932{ 932{
933 if (hrtimer_is_queued(timer)) { 933 if (hrtimer_is_queued(timer)) {
934 unsigned long state;
934 int reprogram; 935 int reprogram;
935 936
936 /* 937 /*
@@ -944,8 +945,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
944 debug_deactivate(timer); 945 debug_deactivate(timer);
945 timer_stats_hrtimer_clear_start_info(timer); 946 timer_stats_hrtimer_clear_start_info(timer);
946 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 947 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
947 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 948 /*
948 reprogram); 949 * We must preserve the CALLBACK state flag here,
950 * otherwise we could move the timer base in
951 * switch_hrtimer_base.
952 */
953 state = timer->state & HRTIMER_STATE_CALLBACK;
954 __remove_hrtimer(timer, base, state, reprogram);
949 return 1; 955 return 1;
950 } 956 }
951 return 0; 957 return 0;
@@ -1091,11 +1097,10 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
1091 */ 1097 */
1092ktime_t hrtimer_get_remaining(const struct hrtimer *timer) 1098ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
1093{ 1099{
1094 struct hrtimer_clock_base *base;
1095 unsigned long flags; 1100 unsigned long flags;
1096 ktime_t rem; 1101 ktime_t rem;
1097 1102
1098 base = lock_hrtimer_base(timer, &flags); 1103 lock_hrtimer_base(timer, &flags);
1099 rem = hrtimer_expires_remaining(timer); 1104 rem = hrtimer_expires_remaining(timer);
1100 unlock_hrtimer_base(timer, &flags); 1105 unlock_hrtimer_base(timer, &flags);
1101 1106
@@ -1232,6 +1237,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1232 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); 1237 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1233 enqueue_hrtimer(timer, base); 1238 enqueue_hrtimer(timer, base);
1234 } 1239 }
1240
1241 WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
1242
1235 timer->state &= ~HRTIMER_STATE_CALLBACK; 1243 timer->state &= ~HRTIMER_STATE_CALLBACK;
1236} 1244}
1237 1245
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 0c642d51aac2..53ead174da2f 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
99 " disables this message.\n"); 99 " disables this message.\n");
100 sched_show_task(t); 100 sched_show_task(t);
101 __debug_show_held_locks(t); 101 debug_show_held_locks(t);
102 102
103 touch_nmi_watchdog(); 103 touch_nmi_watchdog();
104 104
@@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
111 * periodically exit the critical section and enter a new one. 111 * periodically exit the critical section and enter a new one.
112 * 112 *
113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order 113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
114 * exit the grace period. For classic RCU, a reschedule is required. 114 * to exit the grace period. For classic RCU, a reschedule is required.
115 */ 115 */
116static void rcu_lock_break(struct task_struct *g, struct task_struct *t) 116static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
117{ 117{
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index d71a987fd2bf..2c9120f0afca 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -113,12 +113,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
113 */ 113 */
114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) 114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
115{ 115{
116 struct perf_event_context *ctx = bp->ctx; 116 struct task_struct *tsk = bp->hw.bp_target;
117 struct perf_event *iter; 117 struct perf_event *iter;
118 int count = 0; 118 int count = 0;
119 119
120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) { 120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
121 if (iter->ctx == ctx && find_slot_idx(iter) == type) 121 if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
122 count += hw_breakpoint_weight(iter); 122 count += hw_breakpoint_weight(iter);
123 } 123 }
124 124
@@ -134,7 +134,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
134 enum bp_type_idx type) 134 enum bp_type_idx type)
135{ 135{
136 int cpu = bp->cpu; 136 int cpu = bp->cpu;
137 struct task_struct *tsk = bp->ctx->task; 137 struct task_struct *tsk = bp->hw.bp_target;
138 138
139 if (cpu >= 0) { 139 if (cpu >= 0) {
140 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); 140 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -213,7 +213,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
213 int weight) 213 int weight)
214{ 214{
215 int cpu = bp->cpu; 215 int cpu = bp->cpu;
216 struct task_struct *tsk = bp->ctx->task; 216 struct task_struct *tsk = bp->hw.bp_target;
217 217
218 /* Pinned counter cpu profiling */ 218 /* Pinned counter cpu profiling */
219 if (!tsk) { 219 if (!tsk) {
@@ -433,7 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
433 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
434 struct task_struct *tsk) 434 struct task_struct *tsk)
435{ 435{
436 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); 436 return perf_event_create_kernel_counter(attr, -1, tsk, triggered);
437} 437}
438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
439 439
@@ -515,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
515 get_online_cpus(); 515 get_online_cpus();
516 for_each_online_cpu(cpu) { 516 for_each_online_cpu(cpu) {
517 pevent = per_cpu_ptr(cpu_events, cpu); 517 pevent = per_cpu_ptr(cpu_events, cpu);
518 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); 518 bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered);
519 519
520 *pevent = bp; 520 *pevent = bp;
521 521
@@ -565,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
565 .priority = 0x7fffffff 565 .priority = 0x7fffffff
566}; 566};
567 567
568static void bp_perf_event_destroy(struct perf_event *event)
569{
570 release_bp_slot(event);
571}
572
573static int hw_breakpoint_event_init(struct perf_event *bp)
574{
575 int err;
576
577 if (bp->attr.type != PERF_TYPE_BREAKPOINT)
578 return -ENOENT;
579
580 err = register_perf_hw_breakpoint(bp);
581 if (err)
582 return err;
583
584 bp->destroy = bp_perf_event_destroy;
585
586 return 0;
587}
588
589static int hw_breakpoint_add(struct perf_event *bp, int flags)
590{
591 if (!(flags & PERF_EF_START))
592 bp->hw.state = PERF_HES_STOPPED;
593
594 return arch_install_hw_breakpoint(bp);
595}
596
597static void hw_breakpoint_del(struct perf_event *bp, int flags)
598{
599 arch_uninstall_hw_breakpoint(bp);
600}
601
602static void hw_breakpoint_start(struct perf_event *bp, int flags)
603{
604 bp->hw.state = 0;
605}
606
607static void hw_breakpoint_stop(struct perf_event *bp, int flags)
608{
609 bp->hw.state = PERF_HES_STOPPED;
610}
611
612static struct pmu perf_breakpoint = {
613 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
614
615 .event_init = hw_breakpoint_event_init,
616 .add = hw_breakpoint_add,
617 .del = hw_breakpoint_del,
618 .start = hw_breakpoint_start,
619 .stop = hw_breakpoint_stop,
620 .read = hw_breakpoint_pmu_read,
621};
622
568static int __init init_hw_breakpoint(void) 623static int __init init_hw_breakpoint(void)
569{ 624{
570 unsigned int **task_bp_pinned; 625 unsigned int **task_bp_pinned;
@@ -586,6 +641,8 @@ static int __init init_hw_breakpoint(void)
586 641
587 constraints_initialized = 1; 642 constraints_initialized = 1;
588 643
644 perf_pmu_register(&perf_breakpoint);
645
589 return register_die_notifier(&hw_breakpoint_exceptions_nb); 646 return register_die_notifier(&hw_breakpoint_exceptions_nb);
590 647
591 err_alloc: 648 err_alloc:
@@ -601,8 +658,3 @@ static int __init init_hw_breakpoint(void)
601core_initcall(init_hw_breakpoint); 658core_initcall(init_hw_breakpoint);
602 659
603 660
604struct pmu perf_ops_bp = {
605 .enable = arch_install_hw_breakpoint,
606 .disable = arch_uninstall_hw_breakpoint,
607 .read = hw_breakpoint_pmu_read,
608};
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
new file mode 100644
index 000000000000..f16763ff8481
--- /dev/null
+++ b/kernel/irq_work.c
@@ -0,0 +1,164 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
3 *
4 * Provides a framework for enqueueing and running callbacks from hardirq
5 * context. The enqueueing is NMI-safe.
6 */
7
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/irq_work.h>
11#include <linux/hardirq.h>
12
13/*
14 * An entry can be in one of four states:
15 *
16 * free NULL, 0 -> {claimed} : free to be used
17 * claimed NULL, 3 -> {pending} : claimed to be enqueued
18 * pending next, 3 -> {busy} : queued, pending callback
19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
20 *
21 * We use the lower two bits of the next pointer to keep PENDING and BUSY
22 * flags.
23 */
24
25#define IRQ_WORK_PENDING 1UL
26#define IRQ_WORK_BUSY 2UL
27#define IRQ_WORK_FLAGS 3UL
28
29static inline bool irq_work_is_set(struct irq_work *entry, int flags)
30{
31 return (unsigned long)entry->next & flags;
32}
33
34static inline struct irq_work *irq_work_next(struct irq_work *entry)
35{
36 unsigned long next = (unsigned long)entry->next;
37 next &= ~IRQ_WORK_FLAGS;
38 return (struct irq_work *)next;
39}
40
41static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
42{
43 unsigned long next = (unsigned long)entry;
44 next |= flags;
45 return (struct irq_work *)next;
46}
47
48static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
49
50/*
51 * Claim the entry so that no one else will poke at it.
52 */
53static bool irq_work_claim(struct irq_work *entry)
54{
55 struct irq_work *next, *nflags;
56
57 do {
58 next = entry->next;
59 if ((unsigned long)next & IRQ_WORK_PENDING)
60 return false;
61 nflags = next_flags(next, IRQ_WORK_FLAGS);
62 } while (cmpxchg(&entry->next, next, nflags) != next);
63
64 return true;
65}
66
67
68void __weak arch_irq_work_raise(void)
69{
70 /*
71 * Lame architectures will get the timer tick callback
72 */
73}
74
75/*
76 * Queue the entry and raise the IPI if needed.
77 */
78static void __irq_work_queue(struct irq_work *entry)
79{
80 struct irq_work **head, *next;
81
82 head = &get_cpu_var(irq_work_list);
83
84 do {
85 next = *head;
86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (cmpxchg(head, next, entry) != next);
89
90 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry))
92 arch_irq_work_raise();
93
94 put_cpu_var(irq_work_list);
95}
96
97/*
98 * Enqueue the irq_work @entry, returns true on success, failure when the
99 * @entry was already enqueued by someone else.
100 *
101 * Can be re-enqueued while the callback is still in progress.
102 */
103bool irq_work_queue(struct irq_work *entry)
104{
105 if (!irq_work_claim(entry)) {
106 /*
107 * Already enqueued, can't do!
108 */
109 return false;
110 }
111
112 __irq_work_queue(entry);
113 return true;
114}
115EXPORT_SYMBOL_GPL(irq_work_queue);
116
117/*
118 * Run the irq_work entries on this cpu. Requires to be ran from hardirq
119 * context with local IRQs disabled.
120 */
121void irq_work_run(void)
122{
123 struct irq_work *list, **head;
124
125 head = &__get_cpu_var(irq_work_list);
126 if (*head == NULL)
127 return;
128
129 BUG_ON(!in_irq());
130 BUG_ON(!irqs_disabled());
131
132 list = xchg(head, NULL);
133 while (list != NULL) {
134 struct irq_work *entry = list;
135
136 list = irq_work_next(list);
137
138 /*
139 * Clear the PENDING bit, after this point the @entry
140 * can be re-used.
141 */
142 entry->next = next_flags(NULL, IRQ_WORK_BUSY);
143 entry->func(entry);
144 /*
145 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile.
147 */
148 cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL);
149 }
150}
151EXPORT_SYMBOL_GPL(irq_work_run);
152
153/*
154 * Synchronize against the irq_work @entry, ensures the entry is not
155 * currently in use.
156 */
157void irq_work_sync(struct irq_work *entry)
158{
159 WARN_ON_ONCE(irqs_disabled());
160
161 while (irq_work_is_set(entry, IRQ_WORK_BUSY))
162 cpu_relax();
163}
164EXPORT_SYMBOL_GPL(irq_work_sync);
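[Note on the new irq_work.c above: the state machine encodes PENDING and BUSY in the two low bits of the ->next pointer, relying on struct irq_work being at least 4-byte aligned, so a single cmpxchg() on ->next can atomically claim, queue or release an entry; that is what keeps the enqueue path NMI-safe. A self-contained userspace sketch of the low-bit pointer-tagging trick, illustrative only and not the kernel code:]

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define TAG_PENDING 1UL
	#define TAG_BUSY    2UL
	#define TAG_MASK    3UL

	struct node {
		struct node *next;	/* low two bits carry the flags */
		int payload;
	};

	static struct node *pack(struct node *next, unsigned long flags)
	{
		return (struct node *)((uintptr_t)next | flags);
	}

	static struct node *unpack(struct node *tagged)
	{
		return (struct node *)((uintptr_t)tagged & ~TAG_MASK);
	}

	static unsigned long tag_of(struct node *tagged)
	{
		return (uintptr_t)tagged & TAG_MASK;
	}

	int main(void)
	{
		struct node a = { .next = NULL, .payload = 1 };
		struct node b = { .next = NULL, .payload = 2 };

		/* "claim": no successor yet, PENDING|BUSY set in the low bits */
		a.next = pack(NULL, TAG_PENDING | TAG_BUSY);
		/* "enqueue": link a in front of b, keeping the flag bits */
		a.next = pack(&b, tag_of(a.next));

		assert(unpack(a.next) == &b);
		assert(tag_of(a.next) == (TAG_PENDING | TAG_BUSY));
		printf("next=%p flags=%lu\n", (void *)unpack(a.next), tag_of(a.next));
		return 0;
	}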
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
new file mode 100644
index 000000000000..7be868bf25c6
--- /dev/null
+++ b/kernel/jump_label.c
@@ -0,0 +1,429 @@
1/*
2 * jump label support
3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 *
6 */
7#include <linux/jump_label.h>
8#include <linux/memory.h>
9#include <linux/uaccess.h>
10#include <linux/module.h>
11#include <linux/list.h>
12#include <linux/jhash.h>
13#include <linux/slab.h>
14#include <linux/sort.h>
15#include <linux/err.h>
16
17#ifdef HAVE_JUMP_LABEL
18
19#define JUMP_LABEL_HASH_BITS 6
20#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
21static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
22
23/* mutex to protect coming/going of the the jump_label table */
24static DEFINE_MUTEX(jump_label_mutex);
25
26struct jump_label_entry {
27 struct hlist_node hlist;
28 struct jump_entry *table;
29 int nr_entries;
30 /* hang modules off here */
31 struct hlist_head modules;
32 unsigned long key;
33};
34
35struct jump_label_module_entry {
36 struct hlist_node hlist;
37 struct jump_entry *table;
38 int nr_entries;
39 struct module *mod;
40};
41
42static int jump_label_cmp(const void *a, const void *b)
43{
44 const struct jump_entry *jea = a;
45 const struct jump_entry *jeb = b;
46
47 if (jea->key < jeb->key)
48 return -1;
49
50 if (jea->key > jeb->key)
51 return 1;
52
53 return 0;
54}
55
56static void
57sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
58{
59 unsigned long size;
60
61 size = (((unsigned long)stop - (unsigned long)start)
62 / sizeof(struct jump_entry));
63 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
64}
65
66static struct jump_label_entry *get_jump_label_entry(jump_label_t key)
67{
68 struct hlist_head *head;
69 struct hlist_node *node;
70 struct jump_label_entry *e;
71 u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
72
73 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
74 hlist_for_each_entry(e, node, head, hlist) {
75 if (key == e->key)
76 return e;
77 }
78 return NULL;
79}
80
81static struct jump_label_entry *
82add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
83{
84 struct hlist_head *head;
85 struct jump_label_entry *e;
86 u32 hash;
87
88 e = get_jump_label_entry(key);
89 if (e)
90 return ERR_PTR(-EEXIST);
91
92 e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
93 if (!e)
94 return ERR_PTR(-ENOMEM);
95
96 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
97 head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
98 e->key = key;
99 e->table = table;
100 e->nr_entries = nr_entries;
101 INIT_HLIST_HEAD(&(e->modules));
102 hlist_add_head(&e->hlist, head);
103 return e;
104}
105
106static int
107build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop)
108{
109 struct jump_entry *iter, *iter_begin;
110 struct jump_label_entry *entry;
111 int count;
112
113 sort_jump_label_entries(start, stop);
114 iter = start;
115 while (iter < stop) {
116 entry = get_jump_label_entry(iter->key);
117 if (!entry) {
118 iter_begin = iter;
119 count = 0;
120 while ((iter < stop) &&
121 (iter->key == iter_begin->key)) {
122 iter++;
123 count++;
124 }
125 entry = add_jump_label_entry(iter_begin->key,
126 count, iter_begin);
127 if (IS_ERR(entry))
128 return PTR_ERR(entry);
129 } else {
130 WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
131 return -1;
132 }
133 }
134 return 0;
135}
136
137/***
138 * jump_label_update - update jump label text
139 * @key - key value associated with a a jump label
140 * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
141 *
142 * Will enable/disable the jump for jump label @key, depending on the
143 * value of @type.
144 *
145 */
146
147void jump_label_update(unsigned long key, enum jump_label_type type)
148{
149 struct jump_entry *iter;
150 struct jump_label_entry *entry;
151 struct hlist_node *module_node;
152 struct jump_label_module_entry *e_module;
153 int count;
154
155 mutex_lock(&jump_label_mutex);
156 entry = get_jump_label_entry((jump_label_t)key);
157 if (entry) {
158 count = entry->nr_entries;
159 iter = entry->table;
160 while (count--) {
161 if (kernel_text_address(iter->code))
162 arch_jump_label_transform(iter, type);
163 iter++;
164 }
165 /* eanble/disable jump labels in modules */
166 hlist_for_each_entry(e_module, module_node, &(entry->modules),
167 hlist) {
168 count = e_module->nr_entries;
169 iter = e_module->table;
170 while (count--) {
171 if (kernel_text_address(iter->code))
172 arch_jump_label_transform(iter, type);
173 iter++;
174 }
175 }
176 }
177 mutex_unlock(&jump_label_mutex);
178}
179
180static int addr_conflict(struct jump_entry *entry, void *start, void *end)
181{
182 if (entry->code <= (unsigned long)end &&
183 entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start)
184 return 1;
185
186 return 0;
187}
188
189#ifdef CONFIG_MODULES
190
191static int module_conflict(void *start, void *end)
192{
193 struct hlist_head *head;
194 struct hlist_node *node, *node_next, *module_node, *module_node_next;
195 struct jump_label_entry *e;
196 struct jump_label_module_entry *e_module;
197 struct jump_entry *iter;
198 int i, count;
199 int conflict = 0;
200
201 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
202 head = &jump_label_table[i];
203 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
204 hlist_for_each_entry_safe(e_module, module_node,
205 module_node_next,
206 &(e->modules), hlist) {
207 count = e_module->nr_entries;
208 iter = e_module->table;
209 while (count--) {
210 if (addr_conflict(iter, start, end)) {
211 conflict = 1;
212 goto out;
213 }
214 iter++;
215 }
216 }
217 }
218 }
219out:
220 return conflict;
221}
222
223#endif
224
225/**
226 * jump_label_text_reserved - check if addr range is reserved
227 * @start: start text addr
228 * @end: end text addr
229 *
230 * Checks whether the text addresses between @start and @end
231 * overlap any of the jump label patch addresses. Code
232 * that wants to modify kernel text should first verify that
233 * it does not overlap with any of the jump label addresses.
234 *
235 * Returns 1 if there is an overlap, 0 otherwise.
236 */
237int jump_label_text_reserved(void *start, void *end)
238{
239 struct jump_entry *iter;
240 struct jump_entry *iter_start = __start___jump_table;
241	struct jump_entry *iter_stop = __stop___jump_table;
242 int conflict = 0;
243
244 mutex_lock(&jump_label_mutex);
245 iter = iter_start;
246 while (iter < iter_stop) {
247 if (addr_conflict(iter, start, end)) {
248 conflict = 1;
249 goto out;
250 }
251 iter++;
252 }
253
254 /* now check modules */
255#ifdef CONFIG_MODULES
256 conflict = module_conflict(start, end);
257#endif
258out:
259 mutex_unlock(&jump_label_mutex);
260 return conflict;
261}
262
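The kprobes hunk later in this patch calls this check from register_kprobe(); a minimal sketch of the intended calling pattern, where patch_kernel_text() and its error handling are hypothetical and only jump_label_text_reserved() comes from this file:

	/*
	 * Hypothetical text patcher (not part of this file): reject any range
	 * that covers a jump label NOP before modifying it.
	 */
	static int patch_kernel_text(void *addr, size_t len)
	{
		if (jump_label_text_reserved(addr, addr + len - 1))
			return -EBUSY;	/* overlaps a jump label patch site */
		/* ... perform the actual text modification here ... */
		return 0;
	}
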
263static __init int init_jump_label(void)
264{
265 int ret;
266 struct jump_entry *iter_start = __start___jump_table;
267 struct jump_entry *iter_stop = __stop___jump_table;
268 struct jump_entry *iter;
269
270 mutex_lock(&jump_label_mutex);
271 ret = build_jump_label_hashtable(__start___jump_table,
272 __stop___jump_table);
273 iter = iter_start;
274 while (iter < iter_stop) {
275 arch_jump_label_text_poke_early(iter->code);
276 iter++;
277 }
278 mutex_unlock(&jump_label_mutex);
279 return ret;
280}
281early_initcall(init_jump_label);
282
283#ifdef CONFIG_MODULES
284
285static struct jump_label_module_entry *
286add_jump_label_module_entry(struct jump_label_entry *entry,
287 struct jump_entry *iter_begin,
288 int count, struct module *mod)
289{
290 struct jump_label_module_entry *e;
291
292 e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL);
293 if (!e)
294 return ERR_PTR(-ENOMEM);
295 e->mod = mod;
296 e->nr_entries = count;
297 e->table = iter_begin;
298 hlist_add_head(&e->hlist, &entry->modules);
299 return e;
300}
301
302static int add_jump_label_module(struct module *mod)
303{
304 struct jump_entry *iter, *iter_begin;
305 struct jump_label_entry *entry;
306 struct jump_label_module_entry *module_entry;
307 int count;
308
309 /* if the module doesn't have jump label entries, just return */
310 if (!mod->num_jump_entries)
311 return 0;
312
313 sort_jump_label_entries(mod->jump_entries,
314 mod->jump_entries + mod->num_jump_entries);
315 iter = mod->jump_entries;
316 while (iter < mod->jump_entries + mod->num_jump_entries) {
317 entry = get_jump_label_entry(iter->key);
318 iter_begin = iter;
319 count = 0;
320 while ((iter < mod->jump_entries + mod->num_jump_entries) &&
321 (iter->key == iter_begin->key)) {
322 iter++;
323 count++;
324 }
325 if (!entry) {
326 entry = add_jump_label_entry(iter_begin->key, 0, NULL);
327 if (IS_ERR(entry))
328 return PTR_ERR(entry);
329 }
330 module_entry = add_jump_label_module_entry(entry, iter_begin,
331 count, mod);
332 if (IS_ERR(module_entry))
333 return PTR_ERR(module_entry);
334 }
335 return 0;
336}
337
338static void remove_jump_label_module(struct module *mod)
339{
340 struct hlist_head *head;
341 struct hlist_node *node, *node_next, *module_node, *module_node_next;
342 struct jump_label_entry *e;
343 struct jump_label_module_entry *e_module;
344 int i;
345
346 /* if the module doesn't have jump label entries, just return */
347 if (!mod->num_jump_entries)
348 return;
349
350 for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
351 head = &jump_label_table[i];
352 hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
353 hlist_for_each_entry_safe(e_module, module_node,
354 module_node_next,
355 &(e->modules), hlist) {
356 if (e_module->mod == mod) {
357 hlist_del(&e_module->hlist);
358 kfree(e_module);
359 }
360 }
361 if (hlist_empty(&e->modules) && (e->nr_entries == 0)) {
362 hlist_del(&e->hlist);
363 kfree(e);
364 }
365 }
366 }
367}
368
369static int
370jump_label_module_notify(struct notifier_block *self, unsigned long val,
371 void *data)
372{
373 struct module *mod = data;
374 int ret = 0;
375
376 switch (val) {
377 case MODULE_STATE_COMING:
378 mutex_lock(&jump_label_mutex);
379 ret = add_jump_label_module(mod);
380 if (ret)
381 remove_jump_label_module(mod);
382 mutex_unlock(&jump_label_mutex);
383 break;
384 case MODULE_STATE_GOING:
385 mutex_lock(&jump_label_mutex);
386 remove_jump_label_module(mod);
387 mutex_unlock(&jump_label_mutex);
388 break;
389 }
390 return ret;
391}
392
393/**
394 * jump_label_apply_nops - patch module jump labels with arch_get_jump_label_nop()
395 * @mod: module to patch
396 *
397 * Allow for run-time selection of the optimal nops. Before the module
398 * loads, patch these with arch_get_jump_label_nop(), which is provided by
399 * the arch-specific jump label code.
400 */
401void jump_label_apply_nops(struct module *mod)
402{
403 struct jump_entry *iter;
404
405 /* if the module doesn't have jump label entries, just return */
406 if (!mod->num_jump_entries)
407 return;
408
409 iter = mod->jump_entries;
410 while (iter < mod->jump_entries + mod->num_jump_entries) {
411 arch_jump_label_text_poke_early(iter->code);
412 iter++;
413 }
414}
415
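Per the comment above, the arch module loader is expected to call this while the module text is still writable; a minimal sketch of such a call site, where prepare_module_text() is hypothetical and only jump_label_apply_nops() comes from this file:

	/*
	 * Hypothetical loader hook (not part of this file): run before the
	 * module starts executing, while its text can still be rewritten.
	 */
	static void prepare_module_text(struct module *mod)
	{
		jump_label_apply_nops(mod);	/* replace generic NOPs with the optimal ones */
	}
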
416struct notifier_block jump_label_module_nb = {
417 .notifier_call = jump_label_module_notify,
418 .priority = 0,
419};
420
421static __init int init_jump_label_module(void)
422{
423 return register_module_notifier(&jump_label_module_nb);
424}
425early_initcall(init_jump_label_module);
426
427#endif /* CONFIG_MODULES */
428
429#endif
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 6b5580c57644..01a0700e873f 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -365,8 +365,6 @@ static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
365 n = setup_sgl_buf(sgl, fifo->data + off, nents, l); 365 n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
366 n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); 366 n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
367 367
368 if (n)
369 sg_mark_end(sgl + n - 1);
370 return n; 368 return n;
371} 369}
372 370
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 282035f3ae96..ec4210c6501e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,6 +47,7 @@
47#include <linux/memory.h> 47#include <linux/memory.h>
48#include <linux/ftrace.h> 48#include <linux/ftrace.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/jump_label.h>
50 51
51#include <asm-generic/sections.h> 52#include <asm-generic/sections.h>
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
@@ -399,7 +400,7 @@ static inline int kprobe_optready(struct kprobe *p)
399 * Return an optimized kprobe whose optimizing code replaces 400 * Return an optimized kprobe whose optimizing code replaces
400 * instructions including addr (exclude breakpoint). 401 * instructions including addr (exclude breakpoint).
401 */ 402 */
402struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) 403static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
403{ 404{
404 int i; 405 int i;
405 struct kprobe *p = NULL; 406 struct kprobe *p = NULL;
@@ -831,6 +832,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
831 832
832void __kprobes kretprobe_hash_lock(struct task_struct *tsk, 833void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
833 struct hlist_head **head, unsigned long *flags) 834 struct hlist_head **head, unsigned long *flags)
835__acquires(hlist_lock)
834{ 836{
835 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 837 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
836 spinlock_t *hlist_lock; 838 spinlock_t *hlist_lock;
@@ -842,6 +844,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
842 844
843static void __kprobes kretprobe_table_lock(unsigned long hash, 845static void __kprobes kretprobe_table_lock(unsigned long hash,
844 unsigned long *flags) 846 unsigned long *flags)
847__acquires(hlist_lock)
845{ 848{
846 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 849 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
847 spin_lock_irqsave(hlist_lock, *flags); 850 spin_lock_irqsave(hlist_lock, *flags);
@@ -849,6 +852,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash,
849 852
850void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 853void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
851 unsigned long *flags) 854 unsigned long *flags)
855__releases(hlist_lock)
852{ 856{
853 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 857 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
854 spinlock_t *hlist_lock; 858 spinlock_t *hlist_lock;
@@ -857,7 +861,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
857 spin_unlock_irqrestore(hlist_lock, *flags); 861 spin_unlock_irqrestore(hlist_lock, *flags);
858} 862}
859 863
860void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) 864static void __kprobes kretprobe_table_unlock(unsigned long hash,
865 unsigned long *flags)
866__releases(hlist_lock)
861{ 867{
862 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 868 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
863 spin_unlock_irqrestore(hlist_lock, *flags); 869 spin_unlock_irqrestore(hlist_lock, *flags);
@@ -1141,7 +1147,8 @@ int __kprobes register_kprobe(struct kprobe *p)
1141 preempt_disable(); 1147 preempt_disable();
1142 if (!kernel_text_address((unsigned long) p->addr) || 1148 if (!kernel_text_address((unsigned long) p->addr) ||
1143 in_kprobes_functions((unsigned long) p->addr) || 1149 in_kprobes_functions((unsigned long) p->addr) ||
1144 ftrace_text_reserved(p->addr, p->addr)) { 1150 ftrace_text_reserved(p->addr, p->addr) ||
1151 jump_label_text_reserved(p->addr, p->addr)) {
1145 preempt_enable(); 1152 preempt_enable();
1146 return -EINVAL; 1153 return -EINVAL;
1147 } 1154 }
@@ -1339,18 +1346,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
1339 if (num <= 0) 1346 if (num <= 0)
1340 return -EINVAL; 1347 return -EINVAL;
1341 for (i = 0; i < num; i++) { 1348 for (i = 0; i < num; i++) {
1342 unsigned long addr; 1349 unsigned long addr, offset;
1343 jp = jps[i]; 1350 jp = jps[i];
1344 addr = arch_deref_entry_point(jp->entry); 1351 addr = arch_deref_entry_point(jp->entry);
1345 1352
1346 if (!kernel_text_address(addr)) 1353 /* Verify probepoint is a function entry point */
1347 ret = -EINVAL; 1354 if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
1348 else { 1355 offset == 0) {
1349 /* Todo: Verify probepoint is a function entry point */
1350 jp->kp.pre_handler = setjmp_pre_handler; 1356 jp->kp.pre_handler = setjmp_pre_handler;
1351 jp->kp.break_handler = longjmp_break_handler; 1357 jp->kp.break_handler = longjmp_break_handler;
1352 ret = register_kprobe(&jp->kp); 1358 ret = register_kprobe(&jp->kp);
1353 } 1359 } else
1360 ret = -EINVAL;
1361
1354 if (ret < 0) { 1362 if (ret < 0) {
1355 if (i > 0) 1363 if (i > 0)
1356 unregister_jprobes(jps, i); 1364 unregister_jprobes(jps, i);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f2852a510232..42ba65dff7d9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
639 } 639 }
640#endif 640#endif
641 641
642 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
643 debug_locks_off();
644 printk(KERN_ERR
645 "BUG: looking up invalid subclass: %u\n", subclass);
646 printk(KERN_ERR
647 "turning off the locking correctness validator.\n");
648 dump_stack();
649 return NULL;
650 }
651
642 /* 652 /*
643 * Static locks do not have their class-keys yet - for them the key 653 * Static locks do not have their class-keys yet - for them the key
644 * is the lock object itself: 654 * is the lock object itself:
@@ -774,7 +784,9 @@ out_unlock_set:
774 raw_local_irq_restore(flags); 784 raw_local_irq_restore(flags);
775 785
776 if (!subclass || force) 786 if (!subclass || force)
777 lock->class_cache = class; 787 lock->class_cache[0] = class;
788 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
789 lock->class_cache[subclass] = class;
778 790
779 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) 791 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
780 return NULL; 792 return NULL;
@@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2679void lockdep_init_map(struct lockdep_map *lock, const char *name, 2691void lockdep_init_map(struct lockdep_map *lock, const char *name,
2680 struct lock_class_key *key, int subclass) 2692 struct lock_class_key *key, int subclass)
2681{ 2693{
2682 lock->class_cache = NULL; 2694 int i;
2695
2696 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
2697 lock->class_cache[i] = NULL;
2698
2683#ifdef CONFIG_LOCK_STAT 2699#ifdef CONFIG_LOCK_STAT
2684 lock->cpu = raw_smp_processor_id(); 2700 lock->cpu = raw_smp_processor_id();
2685#endif 2701#endif
@@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2739 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2755 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2740 return 0; 2756 return 0;
2741 2757
2742 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
2743 debug_locks_off();
2744 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
2745 printk("turning off the locking correctness validator.\n");
2746 dump_stack();
2747 return 0;
2748 }
2749
2750 if (lock->key == &__lockdep_no_validate__) 2758 if (lock->key == &__lockdep_no_validate__)
2751 check = 1; 2759 check = 1;
2752 2760
2753 if (!subclass) 2761 if (subclass < NR_LOCKDEP_CACHING_CLASSES)
2754 class = lock->class_cache; 2762 class = lock->class_cache[subclass];
2755 /* 2763 /*
2756 * Not cached yet or subclass? 2764 * Not cached?
2757 */ 2765 */
2758 if (unlikely(!class)) { 2766 if (unlikely(!class)) {
2759 class = register_lock_class(lock, subclass, 0); 2767 class = register_lock_class(lock, subclass, 0);
@@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
2918 return 1; 2926 return 1;
2919 2927
2920 if (hlock->references) { 2928 if (hlock->references) {
2921 struct lock_class *class = lock->class_cache; 2929 struct lock_class *class = lock->class_cache[0];
2922 2930
2923 if (!class) 2931 if (!class)
2924 class = look_up_lock_class(lock, 0); 2932 class = look_up_lock_class(lock, 0);
@@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
3559 if (list_empty(head)) 3567 if (list_empty(head))
3560 continue; 3568 continue;
3561 list_for_each_entry_safe(class, next, head, hash_entry) { 3569 list_for_each_entry_safe(class, next, head, hash_entry) {
3562 if (unlikely(class == lock->class_cache)) { 3570 int match = 0;
3571
3572 for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
3573 match |= class == lock->class_cache[j];
3574
3575 if (unlikely(match)) {
3563 if (debug_locks_off_graph_unlock()) 3576 if (debug_locks_off_graph_unlock())
3564 WARN_ON(1); 3577 WARN_ON(1);
3565 goto out_restore; 3578 goto out_restore;
@@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks);
3775 * Careful: only use this function if you are sure that 3788 * Careful: only use this function if you are sure that
3776 * the task cannot run in parallel! 3789 * the task cannot run in parallel!
3777 */ 3790 */
3778void __debug_show_held_locks(struct task_struct *task) 3791void debug_show_held_locks(struct task_struct *task)
3779{ 3792{
3780 if (unlikely(!debug_locks)) { 3793 if (unlikely(!debug_locks)) {
3781 printk("INFO: lockdep is turned off.\n"); 3794 printk("INFO: lockdep is turned off.\n");
@@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task)
3783 } 3796 }
3784 lockdep_print_held_locks(task); 3797 lockdep_print_held_locks(task);
3785} 3798}
3786EXPORT_SYMBOL_GPL(__debug_show_held_locks);
3787
3788void debug_show_held_locks(struct task_struct *task)
3789{
3790 __debug_show_held_locks(task);
3791}
3792EXPORT_SYMBOL_GPL(debug_show_held_locks); 3799EXPORT_SYMBOL_GPL(debug_show_held_locks);
3793 3800
3794void lockdep_sys_exit(void) 3801void lockdep_sys_exit(void)
diff --git a/kernel/module.c b/kernel/module.c
index d0b5f8db11b4..2df46301a7a4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -55,6 +55,7 @@
55#include <linux/async.h> 55#include <linux/async.h>
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
58#include <linux/jump_label.h>
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 61#include <trace/events/module.h>
@@ -1537,6 +1538,7 @@ static int __unlink_module(void *_mod)
1537{ 1538{
1538 struct module *mod = _mod; 1539 struct module *mod = _mod;
1539 list_del(&mod->list); 1540 list_del(&mod->list);
1541 module_bug_cleanup(mod);
1540 return 0; 1542 return 0;
1541} 1543}
1542 1544
@@ -2308,6 +2310,11 @@ static void find_module_sections(struct module *mod, struct load_info *info)
2308 sizeof(*mod->tracepoints), 2310 sizeof(*mod->tracepoints),
2309 &mod->num_tracepoints); 2311 &mod->num_tracepoints);
2310#endif 2312#endif
2313#ifdef HAVE_JUMP_LABEL
2314 mod->jump_entries = section_objs(info, "__jump_table",
2315 sizeof(*mod->jump_entries),
2316 &mod->num_jump_entries);
2317#endif
2311#ifdef CONFIG_EVENT_TRACING 2318#ifdef CONFIG_EVENT_TRACING
2312 mod->trace_events = section_objs(info, "_ftrace_events", 2319 mod->trace_events = section_objs(info, "_ftrace_events",
2313 sizeof(*mod->trace_events), 2320 sizeof(*mod->trace_events),
@@ -2625,6 +2632,7 @@ static struct module *load_module(void __user *umod,
2625 if (err < 0) 2632 if (err < 0)
2626 goto ddebug; 2633 goto ddebug;
2627 2634
2635 module_bug_finalize(info.hdr, info.sechdrs, mod);
2628 list_add_rcu(&mod->list, &modules); 2636 list_add_rcu(&mod->list, &modules);
2629 mutex_unlock(&module_mutex); 2637 mutex_unlock(&module_mutex);
2630 2638
@@ -2650,6 +2658,8 @@ static struct module *load_module(void __user *umod,
2650 mutex_lock(&module_mutex); 2658 mutex_lock(&module_mutex);
2651 /* Unlink carefully: kallsyms could be walking list. */ 2659 /* Unlink carefully: kallsyms could be walking list. */
2652 list_del_rcu(&mod->list); 2660 list_del_rcu(&mod->list);
2661 module_bug_cleanup(mod);
2662
2653 ddebug: 2663 ddebug:
2654 if (!mod->taints) 2664 if (!mod->taints)
2655 dynamic_debug_remove(info.debug); 2665 dynamic_debug_remove(info.debug);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 4c0b7b3e6d2e..200407c1502f 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -36,15 +36,6 @@
36# include <asm/mutex.h> 36# include <asm/mutex.h>
37#endif 37#endif
38 38
39/***
40 * mutex_init - initialize the mutex
41 * @lock: the mutex to be initialized
42 * @key: the lock_class_key for the class; used by mutex lock debugging
43 *
44 * Initialize the mutex to unlocked state.
45 *
46 * It is not allowed to initialize an already locked mutex.
47 */
48void 39void
49__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) 40__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
50{ 41{
@@ -68,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init);
68static __used noinline void __sched 59static __used noinline void __sched
69__mutex_lock_slowpath(atomic_t *lock_count); 60__mutex_lock_slowpath(atomic_t *lock_count);
70 61
71/*** 62/**
72 * mutex_lock - acquire the mutex 63 * mutex_lock - acquire the mutex
73 * @lock: the mutex to be acquired 64 * @lock: the mutex to be acquired
74 * 65 *
@@ -105,7 +96,7 @@ EXPORT_SYMBOL(mutex_lock);
105 96
106static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 97static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
107 98
108/*** 99/**
109 * mutex_unlock - release the mutex 100 * mutex_unlock - release the mutex
110 * @lock: the mutex to be released 101 * @lock: the mutex to be released
111 * 102 *
@@ -364,8 +355,8 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count);
364static noinline int __sched 355static noinline int __sched
365__mutex_lock_interruptible_slowpath(atomic_t *lock_count); 356__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
366 357
367/*** 358/**
368 * mutex_lock_interruptible - acquire the mutex, interruptable 359 * mutex_lock_interruptible - acquire the mutex, interruptible
369 * @lock: the mutex to be acquired 360 * @lock: the mutex to be acquired
370 * 361 *
371 * Lock the mutex like mutex_lock(), and return 0 if the mutex has 362 * Lock the mutex like mutex_lock(), and return 0 if the mutex has
@@ -456,15 +447,15 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
456 return prev == 1; 447 return prev == 1;
457} 448}
458 449
459/*** 450/**
460 * mutex_trylock - try acquire the mutex, without waiting 451 * mutex_trylock - try to acquire the mutex, without waiting
461 * @lock: the mutex to be acquired 452 * @lock: the mutex to be acquired
462 * 453 *
463 * Try to acquire the mutex atomically. Returns 1 if the mutex 454 * Try to acquire the mutex atomically. Returns 1 if the mutex
464 * has been acquired successfully, and 0 on contention. 455 * has been acquired successfully, and 0 on contention.
465 * 456 *
466 * NOTE: this function follows the spin_trylock() convention, so 457 * NOTE: this function follows the spin_trylock() convention, so
467 * it is negated to the down_trylock() return values! Be careful 458 * it is negated from the down_trylock() return values! Be careful
468 * about this when converting semaphore users to mutexes. 459 * about this when converting semaphore users to mutexes.
469 * 460 *
470 * This function must not be used in interrupt context. The 461 * This function must not be used in interrupt context. The
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 403d1804b198..f309e8014c78 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -31,24 +31,18 @@
31#include <linux/kernel_stat.h> 31#include <linux/kernel_stat.h>
32#include <linux/perf_event.h> 32#include <linux/perf_event.h>
33#include <linux/ftrace_event.h> 33#include <linux/ftrace_event.h>
34#include <linux/hw_breakpoint.h>
35 34
36#include <asm/irq_regs.h> 35#include <asm/irq_regs.h>
37 36
38/* 37atomic_t perf_task_events __read_mostly;
39 * Each CPU has a list of per CPU events:
40 */
41static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
42
43int perf_max_events __read_mostly = 1;
44static int perf_reserved_percpu __read_mostly;
45static int perf_overcommit __read_mostly = 1;
46
47static atomic_t nr_events __read_mostly;
48static atomic_t nr_mmap_events __read_mostly; 38static atomic_t nr_mmap_events __read_mostly;
49static atomic_t nr_comm_events __read_mostly; 39static atomic_t nr_comm_events __read_mostly;
50static atomic_t nr_task_events __read_mostly; 40static atomic_t nr_task_events __read_mostly;
51 41
42static LIST_HEAD(pmus);
43static DEFINE_MUTEX(pmus_lock);
44static struct srcu_struct pmus_srcu;
45
52/* 46/*
53 * perf event paranoia level: 47 * perf event paranoia level:
54 * -1 - not paranoid at all 48 * -1 - not paranoid at all
@@ -67,36 +61,43 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
67 61
68static atomic64_t perf_event_id; 62static atomic64_t perf_event_id;
69 63
70/* 64void __weak perf_event_print_debug(void) { }
71 * Lock for (sysadmin-configurable) event reservations:
72 */
73static DEFINE_SPINLOCK(perf_resource_lock);
74 65
75/* 66extern __weak const char *perf_pmu_name(void)
76 * Architecture provided APIs - weak aliases:
77 */
78extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
79{ 67{
80 return NULL; 68 return "pmu";
81} 69}
82 70
83void __weak hw_perf_disable(void) { barrier(); } 71void perf_pmu_disable(struct pmu *pmu)
84void __weak hw_perf_enable(void) { barrier(); } 72{
85 73 int *count = this_cpu_ptr(pmu->pmu_disable_count);
86void __weak perf_event_print_debug(void) { } 74 if (!(*count)++)
87 75 pmu->pmu_disable(pmu);
88static DEFINE_PER_CPU(int, perf_disable_count); 76}
89 77
90void perf_disable(void) 78void perf_pmu_enable(struct pmu *pmu)
91{ 79{
92 if (!__get_cpu_var(perf_disable_count)++) 80 int *count = this_cpu_ptr(pmu->pmu_disable_count);
93 hw_perf_disable(); 81 if (!--(*count))
82 pmu->pmu_enable(pmu);
94} 83}
95 84
96void perf_enable(void) 85static DEFINE_PER_CPU(struct list_head, rotation_list);
86
87/*
88 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
89 * because they're strictly cpu affine and rotate_start is called with IRQs
90 * disabled, while rotate_context is called from IRQ context.
91 */
92static void perf_pmu_rotate_start(struct pmu *pmu)
97{ 93{
98 if (!--__get_cpu_var(perf_disable_count)) 94 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
99 hw_perf_enable(); 95 struct list_head *head = &__get_cpu_var(rotation_list);
96
97 WARN_ON(!irqs_disabled());
98
99 if (list_empty(&cpuctx->rotation_list))
100 list_add(&cpuctx->rotation_list, head);
100} 101}
101 102
102static void get_ctx(struct perf_event_context *ctx) 103static void get_ctx(struct perf_event_context *ctx)
@@ -151,13 +152,13 @@ static u64 primary_event_id(struct perf_event *event)
151 * the context could get moved to another task. 152 * the context could get moved to another task.
152 */ 153 */
153static struct perf_event_context * 154static struct perf_event_context *
154perf_lock_task_context(struct task_struct *task, unsigned long *flags) 155perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
155{ 156{
156 struct perf_event_context *ctx; 157 struct perf_event_context *ctx;
157 158
158 rcu_read_lock(); 159 rcu_read_lock();
159 retry: 160retry:
160 ctx = rcu_dereference(task->perf_event_ctxp); 161 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
161 if (ctx) { 162 if (ctx) {
162 /* 163 /*
163 * If this context is a clone of another, it might 164 * If this context is a clone of another, it might
@@ -170,7 +171,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
170 * can't get swapped on us any more. 171 * can't get swapped on us any more.
171 */ 172 */
172 raw_spin_lock_irqsave(&ctx->lock, *flags); 173 raw_spin_lock_irqsave(&ctx->lock, *flags);
173 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 174 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
174 raw_spin_unlock_irqrestore(&ctx->lock, *flags); 175 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
175 goto retry; 176 goto retry;
176 } 177 }
@@ -189,12 +190,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
189 * can't get swapped to another task. This also increments its 190 * can't get swapped to another task. This also increments its
190 * reference count so that the context can't get freed. 191 * reference count so that the context can't get freed.
191 */ 192 */
192static struct perf_event_context *perf_pin_task_context(struct task_struct *task) 193static struct perf_event_context *
194perf_pin_task_context(struct task_struct *task, int ctxn)
193{ 195{
194 struct perf_event_context *ctx; 196 struct perf_event_context *ctx;
195 unsigned long flags; 197 unsigned long flags;
196 198
197 ctx = perf_lock_task_context(task, &flags); 199 ctx = perf_lock_task_context(task, ctxn, &flags);
198 if (ctx) { 200 if (ctx) {
199 ++ctx->pin_count; 201 ++ctx->pin_count;
200 raw_spin_unlock_irqrestore(&ctx->lock, flags); 202 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -302,6 +304,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
302 } 304 }
303 305
304 list_add_rcu(&event->event_entry, &ctx->event_list); 306 list_add_rcu(&event->event_entry, &ctx->event_list);
307 if (!ctx->nr_events)
308 perf_pmu_rotate_start(ctx->pmu);
305 ctx->nr_events++; 309 ctx->nr_events++;
306 if (event->attr.inherit_stat) 310 if (event->attr.inherit_stat)
307 ctx->nr_stat++; 311 ctx->nr_stat++;
@@ -311,7 +315,12 @@ static void perf_group_attach(struct perf_event *event)
311{ 315{
312 struct perf_event *group_leader = event->group_leader; 316 struct perf_event *group_leader = event->group_leader;
313 317
314 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); 318 /*
319 * We can have double attach due to group movement in perf_event_open.
320 */
321 if (event->attach_state & PERF_ATTACH_GROUP)
322 return;
323
315 event->attach_state |= PERF_ATTACH_GROUP; 324 event->attach_state |= PERF_ATTACH_GROUP;
316 325
317 if (group_leader == event) 326 if (group_leader == event)
@@ -402,21 +411,40 @@ static void perf_group_detach(struct perf_event *event)
402 } 411 }
403} 412}
404 413
405static void 414static inline int
406event_sched_out(struct perf_event *event, 415event_filter_match(struct perf_event *event)
416{
417 return event->cpu == -1 || event->cpu == smp_processor_id();
418}
419
420static int
421__event_sched_out(struct perf_event *event,
407 struct perf_cpu_context *cpuctx, 422 struct perf_cpu_context *cpuctx,
408 struct perf_event_context *ctx) 423 struct perf_event_context *ctx)
409{ 424{
425 u64 delta;
426 /*
427 * An event which could not be activated because of
428 * filter mismatch still needs to have its timings
429	 * maintained, otherwise bogus information is returned
430 * via read() for time_enabled, time_running:
431 */
432 if (event->state == PERF_EVENT_STATE_INACTIVE
433 && !event_filter_match(event)) {
434 delta = ctx->time - event->tstamp_stopped;
435 event->tstamp_running += delta;
436 event->tstamp_stopped = ctx->time;
437 }
438
410 if (event->state != PERF_EVENT_STATE_ACTIVE) 439 if (event->state != PERF_EVENT_STATE_ACTIVE)
411 return; 440 return 0;
412 441
413 event->state = PERF_EVENT_STATE_INACTIVE; 442 event->state = PERF_EVENT_STATE_INACTIVE;
414 if (event->pending_disable) { 443 if (event->pending_disable) {
415 event->pending_disable = 0; 444 event->pending_disable = 0;
416 event->state = PERF_EVENT_STATE_OFF; 445 event->state = PERF_EVENT_STATE_OFF;
417 } 446 }
418 event->tstamp_stopped = ctx->time; 447 event->pmu->del(event, 0);
419 event->pmu->disable(event);
420 event->oncpu = -1; 448 event->oncpu = -1;
421 449
422 if (!is_software_event(event)) 450 if (!is_software_event(event))
@@ -424,6 +452,19 @@ event_sched_out(struct perf_event *event,
424 ctx->nr_active--; 452 ctx->nr_active--;
425 if (event->attr.exclusive || !cpuctx->active_oncpu) 453 if (event->attr.exclusive || !cpuctx->active_oncpu)
426 cpuctx->exclusive = 0; 454 cpuctx->exclusive = 0;
455 return 1;
456}
457
458static void
459event_sched_out(struct perf_event *event,
460 struct perf_cpu_context *cpuctx,
461 struct perf_event_context *ctx)
462{
463 int ret;
464
465 ret = __event_sched_out(event, cpuctx, ctx);
466 if (ret)
467 event->tstamp_stopped = ctx->time;
427} 468}
428 469
429static void 470static void
@@ -432,9 +473,7 @@ group_sched_out(struct perf_event *group_event,
432 struct perf_event_context *ctx) 473 struct perf_event_context *ctx)
433{ 474{
434 struct perf_event *event; 475 struct perf_event *event;
435 476 int state = group_event->state;
436 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
437 return;
438 477
439 event_sched_out(group_event, cpuctx, ctx); 478 event_sched_out(group_event, cpuctx, ctx);
440 479
@@ -444,10 +483,16 @@ group_sched_out(struct perf_event *group_event,
444 list_for_each_entry(event, &group_event->sibling_list, group_entry) 483 list_for_each_entry(event, &group_event->sibling_list, group_entry)
445 event_sched_out(event, cpuctx, ctx); 484 event_sched_out(event, cpuctx, ctx);
446 485
447 if (group_event->attr.exclusive) 486 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
448 cpuctx->exclusive = 0; 487 cpuctx->exclusive = 0;
449} 488}
450 489
490static inline struct perf_cpu_context *
491__get_cpu_context(struct perf_event_context *ctx)
492{
493 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
494}
495
451/* 496/*
452 * Cross CPU call to remove a performance event 497 * Cross CPU call to remove a performance event
453 * 498 *
@@ -456,9 +501,9 @@ group_sched_out(struct perf_event *group_event,
456 */ 501 */
457static void __perf_event_remove_from_context(void *info) 502static void __perf_event_remove_from_context(void *info)
458{ 503{
459 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
460 struct perf_event *event = info; 504 struct perf_event *event = info;
461 struct perf_event_context *ctx = event->ctx; 505 struct perf_event_context *ctx = event->ctx;
506 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
462 507
463 /* 508 /*
464 * If this is a task context, we need to check whether it is 509 * If this is a task context, we need to check whether it is
@@ -469,27 +514,11 @@ static void __perf_event_remove_from_context(void *info)
469 return; 514 return;
470 515
471 raw_spin_lock(&ctx->lock); 516 raw_spin_lock(&ctx->lock);
472 /*
473 * Protect the list operation against NMI by disabling the
474 * events on a global level.
475 */
476 perf_disable();
477 517
478 event_sched_out(event, cpuctx, ctx); 518 event_sched_out(event, cpuctx, ctx);
479 519
480 list_del_event(event, ctx); 520 list_del_event(event, ctx);
481 521
482 if (!ctx->task) {
483 /*
484 * Allow more per task events with respect to the
485 * reservation:
486 */
487 cpuctx->max_pertask =
488 min(perf_max_events - ctx->nr_events,
489 perf_max_events - perf_reserved_percpu);
490 }
491
492 perf_enable();
493 raw_spin_unlock(&ctx->lock); 522 raw_spin_unlock(&ctx->lock);
494} 523}
495 524
@@ -554,8 +583,8 @@ retry:
554static void __perf_event_disable(void *info) 583static void __perf_event_disable(void *info)
555{ 584{
556 struct perf_event *event = info; 585 struct perf_event *event = info;
557 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
558 struct perf_event_context *ctx = event->ctx; 586 struct perf_event_context *ctx = event->ctx;
587 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
559 588
560 /* 589 /*
561 * If this is a per-task event, need to check whether this 590 * If this is a per-task event, need to check whether this
@@ -610,7 +639,7 @@ void perf_event_disable(struct perf_event *event)
610 return; 639 return;
611 } 640 }
612 641
613 retry: 642retry:
614 task_oncpu_function_call(task, __perf_event_disable, event); 643 task_oncpu_function_call(task, __perf_event_disable, event);
615 644
616 raw_spin_lock_irq(&ctx->lock); 645 raw_spin_lock_irq(&ctx->lock);
@@ -635,7 +664,7 @@ void perf_event_disable(struct perf_event *event)
635} 664}
636 665
637static int 666static int
638event_sched_in(struct perf_event *event, 667__event_sched_in(struct perf_event *event,
639 struct perf_cpu_context *cpuctx, 668 struct perf_cpu_context *cpuctx,
640 struct perf_event_context *ctx) 669 struct perf_event_context *ctx)
641{ 670{
@@ -649,14 +678,12 @@ event_sched_in(struct perf_event *event,
649 */ 678 */
650 smp_wmb(); 679 smp_wmb();
651 680
652 if (event->pmu->enable(event)) { 681 if (event->pmu->add(event, PERF_EF_START)) {
653 event->state = PERF_EVENT_STATE_INACTIVE; 682 event->state = PERF_EVENT_STATE_INACTIVE;
654 event->oncpu = -1; 683 event->oncpu = -1;
655 return -EAGAIN; 684 return -EAGAIN;
656 } 685 }
657 686
658 event->tstamp_running += ctx->time - event->tstamp_stopped;
659
660 if (!is_software_event(event)) 687 if (!is_software_event(event))
661 cpuctx->active_oncpu++; 688 cpuctx->active_oncpu++;
662 ctx->nr_active++; 689 ctx->nr_active++;
@@ -667,28 +694,56 @@ event_sched_in(struct perf_event *event,
667 return 0; 694 return 0;
668} 695}
669 696
697static inline int
698event_sched_in(struct perf_event *event,
699 struct perf_cpu_context *cpuctx,
700 struct perf_event_context *ctx)
701{
702 int ret = __event_sched_in(event, cpuctx, ctx);
703 if (ret)
704 return ret;
705 event->tstamp_running += ctx->time - event->tstamp_stopped;
706 return 0;
707}
708
709static void
710group_commit_event_sched_in(struct perf_event *group_event,
711 struct perf_cpu_context *cpuctx,
712 struct perf_event_context *ctx)
713{
714 struct perf_event *event;
715 u64 now = ctx->time;
716
717 group_event->tstamp_running += now - group_event->tstamp_stopped;
718 /*
719 * Schedule in siblings as one group (if any):
720 */
721 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
722 event->tstamp_running += now - event->tstamp_stopped;
723 }
724}
725
670static int 726static int
671group_sched_in(struct perf_event *group_event, 727group_sched_in(struct perf_event *group_event,
672 struct perf_cpu_context *cpuctx, 728 struct perf_cpu_context *cpuctx,
673 struct perf_event_context *ctx) 729 struct perf_event_context *ctx)
674{ 730{
675 struct perf_event *event, *partial_group = NULL; 731 struct perf_event *event, *partial_group = NULL;
676 const struct pmu *pmu = group_event->pmu; 732 struct pmu *pmu = group_event->pmu;
677 bool txn = false;
678 733
679 if (group_event->state == PERF_EVENT_STATE_OFF) 734 if (group_event->state == PERF_EVENT_STATE_OFF)
680 return 0; 735 return 0;
681 736
682 /* Check if group transaction availabe */ 737 pmu->start_txn(pmu);
683 if (pmu->start_txn)
684 txn = true;
685
686 if (txn)
687 pmu->start_txn(pmu);
688 738
689 if (event_sched_in(group_event, cpuctx, ctx)) { 739 /*
690 if (txn) 740 * use __event_sched_in() to delay updating tstamp_running
691 pmu->cancel_txn(pmu); 741 * until the transaction is committed. In case of failure
742 * we will keep an unmodified tstamp_running which is a
743 * requirement to get correct timing information
744 */
745 if (__event_sched_in(group_event, cpuctx, ctx)) {
746 pmu->cancel_txn(pmu);
692 return -EAGAIN; 747 return -EAGAIN;
693 } 748 }
694 749
@@ -696,29 +751,33 @@ group_sched_in(struct perf_event *group_event,
696 * Schedule in siblings as one group (if any): 751 * Schedule in siblings as one group (if any):
697 */ 752 */
698 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 753 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
699 if (event_sched_in(event, cpuctx, ctx)) { 754 if (__event_sched_in(event, cpuctx, ctx)) {
700 partial_group = event; 755 partial_group = event;
701 goto group_error; 756 goto group_error;
702 } 757 }
703 } 758 }
704 759
705 if (!txn || !pmu->commit_txn(pmu)) 760 if (!pmu->commit_txn(pmu)) {
761 /* commit tstamp_running */
762 group_commit_event_sched_in(group_event, cpuctx, ctx);
706 return 0; 763 return 0;
707 764 }
708group_error: 765group_error:
709 /* 766 /*
710 * Groups can be scheduled in as one unit only, so undo any 767 * Groups can be scheduled in as one unit only, so undo any
711 * partial group before returning: 768 * partial group before returning:
769 *
770 * use __event_sched_out() to avoid updating tstamp_stopped
771 * because the event never actually ran
712 */ 772 */
713 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 773 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
714 if (event == partial_group) 774 if (event == partial_group)
715 break; 775 break;
716 event_sched_out(event, cpuctx, ctx); 776 __event_sched_out(event, cpuctx, ctx);
717 } 777 }
718 event_sched_out(group_event, cpuctx, ctx); 778 __event_sched_out(group_event, cpuctx, ctx);
719 779
720 if (txn) 780 pmu->cancel_txn(pmu);
721 pmu->cancel_txn(pmu);
722 781
723 return -EAGAIN; 782 return -EAGAIN;
724} 783}
@@ -771,10 +830,10 @@ static void add_event_to_ctx(struct perf_event *event,
771 */ 830 */
772static void __perf_install_in_context(void *info) 831static void __perf_install_in_context(void *info)
773{ 832{
774 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
775 struct perf_event *event = info; 833 struct perf_event *event = info;
776 struct perf_event_context *ctx = event->ctx; 834 struct perf_event_context *ctx = event->ctx;
777 struct perf_event *leader = event->group_leader; 835 struct perf_event *leader = event->group_leader;
836 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
778 int err; 837 int err;
779 838
780 /* 839 /*
@@ -794,12 +853,6 @@ static void __perf_install_in_context(void *info)
794 ctx->is_active = 1; 853 ctx->is_active = 1;
795 update_context_time(ctx); 854 update_context_time(ctx);
796 855
797 /*
798 * Protect the list operation against NMI by disabling the
799 * events on a global level. NOP for non NMI based events.
800 */
801 perf_disable();
802
803 add_event_to_ctx(event, ctx); 856 add_event_to_ctx(event, ctx);
804 857
805 if (event->cpu != -1 && event->cpu != smp_processor_id()) 858 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -837,12 +890,7 @@ static void __perf_install_in_context(void *info)
837 } 890 }
838 } 891 }
839 892
840 if (!err && !ctx->task && cpuctx->max_pertask) 893unlock:
841 cpuctx->max_pertask--;
842
843 unlock:
844 perf_enable();
845
846 raw_spin_unlock(&ctx->lock); 894 raw_spin_unlock(&ctx->lock);
847} 895}
848 896
@@ -865,6 +913,8 @@ perf_install_in_context(struct perf_event_context *ctx,
865{ 913{
866 struct task_struct *task = ctx->task; 914 struct task_struct *task = ctx->task;
867 915
916 event->ctx = ctx;
917
868 if (!task) { 918 if (!task) {
869 /* 919 /*
870 * Per cpu events are installed via an smp call and 920 * Per cpu events are installed via an smp call and
@@ -913,10 +963,12 @@ static void __perf_event_mark_enabled(struct perf_event *event,
913 963
914 event->state = PERF_EVENT_STATE_INACTIVE; 964 event->state = PERF_EVENT_STATE_INACTIVE;
915 event->tstamp_enabled = ctx->time - event->total_time_enabled; 965 event->tstamp_enabled = ctx->time - event->total_time_enabled;
916 list_for_each_entry(sub, &event->sibling_list, group_entry) 966 list_for_each_entry(sub, &event->sibling_list, group_entry) {
917 if (sub->state >= PERF_EVENT_STATE_INACTIVE) 967 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
918 sub->tstamp_enabled = 968 sub->tstamp_enabled =
919 ctx->time - sub->total_time_enabled; 969 ctx->time - sub->total_time_enabled;
970 }
971 }
920} 972}
921 973
922/* 974/*
@@ -925,9 +977,9 @@ static void __perf_event_mark_enabled(struct perf_event *event,
925static void __perf_event_enable(void *info) 977static void __perf_event_enable(void *info)
926{ 978{
927 struct perf_event *event = info; 979 struct perf_event *event = info;
928 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
929 struct perf_event_context *ctx = event->ctx; 980 struct perf_event_context *ctx = event->ctx;
930 struct perf_event *leader = event->group_leader; 981 struct perf_event *leader = event->group_leader;
982 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
931 int err; 983 int err;
932 984
933 /* 985 /*
@@ -961,12 +1013,10 @@ static void __perf_event_enable(void *info)
961 if (!group_can_go_on(event, cpuctx, 1)) { 1013 if (!group_can_go_on(event, cpuctx, 1)) {
962 err = -EEXIST; 1014 err = -EEXIST;
963 } else { 1015 } else {
964 perf_disable();
965 if (event == leader) 1016 if (event == leader)
966 err = group_sched_in(event, cpuctx, ctx); 1017 err = group_sched_in(event, cpuctx, ctx);
967 else 1018 else
968 err = event_sched_in(event, cpuctx, ctx); 1019 err = event_sched_in(event, cpuctx, ctx);
969 perf_enable();
970 } 1020 }
971 1021
972 if (err) { 1022 if (err) {
@@ -982,7 +1032,7 @@ static void __perf_event_enable(void *info)
982 } 1032 }
983 } 1033 }
984 1034
985 unlock: 1035unlock:
986 raw_spin_unlock(&ctx->lock); 1036 raw_spin_unlock(&ctx->lock);
987} 1037}
988 1038
@@ -1023,7 +1073,7 @@ void perf_event_enable(struct perf_event *event)
1023 if (event->state == PERF_EVENT_STATE_ERROR) 1073 if (event->state == PERF_EVENT_STATE_ERROR)
1024 event->state = PERF_EVENT_STATE_OFF; 1074 event->state = PERF_EVENT_STATE_OFF;
1025 1075
1026 retry: 1076retry:
1027 raw_spin_unlock_irq(&ctx->lock); 1077 raw_spin_unlock_irq(&ctx->lock);
1028 task_oncpu_function_call(task, __perf_event_enable, event); 1078 task_oncpu_function_call(task, __perf_event_enable, event);
1029 1079
@@ -1043,7 +1093,7 @@ void perf_event_enable(struct perf_event *event)
1043 if (event->state == PERF_EVENT_STATE_OFF) 1093 if (event->state == PERF_EVENT_STATE_OFF)
1044 __perf_event_mark_enabled(event, ctx); 1094 __perf_event_mark_enabled(event, ctx);
1045 1095
1046 out: 1096out:
1047 raw_spin_unlock_irq(&ctx->lock); 1097 raw_spin_unlock_irq(&ctx->lock);
1048} 1098}
1049 1099
@@ -1074,26 +1124,26 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1074 struct perf_event *event; 1124 struct perf_event *event;
1075 1125
1076 raw_spin_lock(&ctx->lock); 1126 raw_spin_lock(&ctx->lock);
1127 perf_pmu_disable(ctx->pmu);
1077 ctx->is_active = 0; 1128 ctx->is_active = 0;
1078 if (likely(!ctx->nr_events)) 1129 if (likely(!ctx->nr_events))
1079 goto out; 1130 goto out;
1080 update_context_time(ctx); 1131 update_context_time(ctx);
1081 1132
1082 perf_disable();
1083 if (!ctx->nr_active) 1133 if (!ctx->nr_active)
1084 goto out_enable; 1134 goto out;
1085 1135
1086 if (event_type & EVENT_PINNED) 1136 if (event_type & EVENT_PINNED) {
1087 list_for_each_entry(event, &ctx->pinned_groups, group_entry) 1137 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1088 group_sched_out(event, cpuctx, ctx); 1138 group_sched_out(event, cpuctx, ctx);
1139 }
1089 1140
1090 if (event_type & EVENT_FLEXIBLE) 1141 if (event_type & EVENT_FLEXIBLE) {
1091 list_for_each_entry(event, &ctx->flexible_groups, group_entry) 1142 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1092 group_sched_out(event, cpuctx, ctx); 1143 group_sched_out(event, cpuctx, ctx);
1093 1144 }
1094 out_enable: 1145out:
1095 perf_enable(); 1146 perf_pmu_enable(ctx->pmu);
1096 out:
1097 raw_spin_unlock(&ctx->lock); 1147 raw_spin_unlock(&ctx->lock);
1098} 1148}
1099 1149
@@ -1191,34 +1241,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1191 } 1241 }
1192} 1242}
1193 1243
1194/* 1244void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1195 * Called from scheduler to remove the events of the current task, 1245 struct task_struct *next)
1196 * with interrupts disabled.
1197 *
1198 * We stop each event and update the event value in event->count.
1199 *
1200 * This does not protect us against NMI, but disable()
1201 * sets the disabled bit in the control field of event _before_
1202 * accessing the event control register. If a NMI hits, then it will
1203 * not restart the event.
1204 */
1205void perf_event_task_sched_out(struct task_struct *task,
1206 struct task_struct *next)
1207{ 1246{
1208 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1247 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1209 struct perf_event_context *ctx = task->perf_event_ctxp;
1210 struct perf_event_context *next_ctx; 1248 struct perf_event_context *next_ctx;
1211 struct perf_event_context *parent; 1249 struct perf_event_context *parent;
1250 struct perf_cpu_context *cpuctx;
1212 int do_switch = 1; 1251 int do_switch = 1;
1213 1252
1214 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); 1253 if (likely(!ctx))
1254 return;
1215 1255
1216 if (likely(!ctx || !cpuctx->task_ctx)) 1256 cpuctx = __get_cpu_context(ctx);
1257 if (!cpuctx->task_ctx)
1217 return; 1258 return;
1218 1259
1219 rcu_read_lock(); 1260 rcu_read_lock();
1220 parent = rcu_dereference(ctx->parent_ctx); 1261 parent = rcu_dereference(ctx->parent_ctx);
1221 next_ctx = next->perf_event_ctxp; 1262 next_ctx = next->perf_event_ctxp[ctxn];
1222 if (parent && next_ctx && 1263 if (parent && next_ctx &&
1223 rcu_dereference(next_ctx->parent_ctx) == parent) { 1264 rcu_dereference(next_ctx->parent_ctx) == parent) {
1224 /* 1265 /*
@@ -1237,8 +1278,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1237 * XXX do we need a memory barrier of sorts 1278 * XXX do we need a memory barrier of sorts
1238 * wrt to rcu_dereference() of perf_event_ctxp 1279 * wrt to rcu_dereference() of perf_event_ctxp
1239 */ 1280 */
1240 task->perf_event_ctxp = next_ctx; 1281 task->perf_event_ctxp[ctxn] = next_ctx;
1241 next->perf_event_ctxp = ctx; 1282 next->perf_event_ctxp[ctxn] = ctx;
1242 ctx->task = next; 1283 ctx->task = next;
1243 next_ctx->task = task; 1284 next_ctx->task = task;
1244 do_switch = 0; 1285 do_switch = 0;
@@ -1256,10 +1297,35 @@ void perf_event_task_sched_out(struct task_struct *task,
1256 } 1297 }
1257} 1298}
1258 1299
1300#define for_each_task_context_nr(ctxn) \
1301 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
1302
1303/*
1304 * Called from scheduler to remove the events of the current task,
1305 * with interrupts disabled.
1306 *
1307 * We stop each event and update the event value in event->count.
1308 *
1309 * This does not protect us against NMI, but disable()
1310 * sets the disabled bit in the control field of event _before_
1311 * accessing the event control register. If a NMI hits, then it will
1312 * not restart the event.
1313 */
1314void __perf_event_task_sched_out(struct task_struct *task,
1315 struct task_struct *next)
1316{
1317 int ctxn;
1318
1319 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1320
1321 for_each_task_context_nr(ctxn)
1322 perf_event_context_sched_out(task, ctxn, next);
1323}
1324
1259static void task_ctx_sched_out(struct perf_event_context *ctx, 1325static void task_ctx_sched_out(struct perf_event_context *ctx,
1260 enum event_type_t event_type) 1326 enum event_type_t event_type)
1261{ 1327{
1262 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1328 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1263 1329
1264 if (!cpuctx->task_ctx) 1330 if (!cpuctx->task_ctx)
1265 return; 1331 return;
@@ -1274,14 +1340,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
1274/* 1340/*
1275 * Called with IRQs disabled 1341 * Called with IRQs disabled
1276 */ 1342 */
1277static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1278{
1279 task_ctx_sched_out(ctx, EVENT_ALL);
1280}
1281
1282/*
1283 * Called with IRQs disabled
1284 */
1285static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, 1343static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1286 enum event_type_t event_type) 1344 enum event_type_t event_type)
1287{ 1345{
@@ -1332,9 +1390,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1332 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1390 if (event->cpu != -1 && event->cpu != smp_processor_id())
1333 continue; 1391 continue;
1334 1392
1335 if (group_can_go_on(event, cpuctx, can_add_hw)) 1393 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1336 if (group_sched_in(event, cpuctx, ctx)) 1394 if (group_sched_in(event, cpuctx, ctx))
1337 can_add_hw = 0; 1395 can_add_hw = 0;
1396 }
1338 } 1397 }
1339} 1398}
1340 1399
@@ -1350,8 +1409,6 @@ ctx_sched_in(struct perf_event_context *ctx,
1350 1409
1351 ctx->timestamp = perf_clock(); 1410 ctx->timestamp = perf_clock();
1352 1411
1353 perf_disable();
1354
1355 /* 1412 /*
1356 * First go through the list and put on any pinned groups 1413 * First go through the list and put on any pinned groups
1357 * in order to give them the best chance of going on. 1414 * in order to give them the best chance of going on.
@@ -1363,8 +1420,7 @@ ctx_sched_in(struct perf_event_context *ctx,
1363 if (event_type & EVENT_FLEXIBLE) 1420 if (event_type & EVENT_FLEXIBLE)
1364 ctx_flexible_sched_in(ctx, cpuctx); 1421 ctx_flexible_sched_in(ctx, cpuctx);
1365 1422
1366 perf_enable(); 1423out:
1367 out:
1368 raw_spin_unlock(&ctx->lock); 1424 raw_spin_unlock(&ctx->lock);
1369} 1425}
1370 1426
@@ -1376,43 +1432,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1376 ctx_sched_in(ctx, cpuctx, event_type); 1432 ctx_sched_in(ctx, cpuctx, event_type);
1377} 1433}
1378 1434
1379static void task_ctx_sched_in(struct task_struct *task, 1435static void task_ctx_sched_in(struct perf_event_context *ctx,
1380 enum event_type_t event_type) 1436 enum event_type_t event_type)
1381{ 1437{
1382 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1438 struct perf_cpu_context *cpuctx;
1383 struct perf_event_context *ctx = task->perf_event_ctxp;
1384 1439
1385 if (likely(!ctx)) 1440 cpuctx = __get_cpu_context(ctx);
1386 return;
1387 if (cpuctx->task_ctx == ctx) 1441 if (cpuctx->task_ctx == ctx)
1388 return; 1442 return;
1443
1389 ctx_sched_in(ctx, cpuctx, event_type); 1444 ctx_sched_in(ctx, cpuctx, event_type);
1390 cpuctx->task_ctx = ctx; 1445 cpuctx->task_ctx = ctx;
1391} 1446}
1392/*
1393 * Called from scheduler to add the events of the current task
1394 * with interrupts disabled.
1395 *
1396 * We restore the event value and then enable it.
1397 *
1398 * This does not protect us against NMI, but enable()
1399 * sets the enabled bit in the control field of event _before_
1400 * accessing the event control register. If a NMI hits, then it will
1401 * keep the event running.
1402 */
1403void perf_event_task_sched_in(struct task_struct *task)
1404{
1405 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1406 struct perf_event_context *ctx = task->perf_event_ctxp;
1407 1447
1408 if (likely(!ctx)) 1448void perf_event_context_sched_in(struct perf_event_context *ctx)
1409 return; 1449{
1450 struct perf_cpu_context *cpuctx;
1410 1451
1452 cpuctx = __get_cpu_context(ctx);
1411 if (cpuctx->task_ctx == ctx) 1453 if (cpuctx->task_ctx == ctx)
1412 return; 1454 return;
1413 1455
1414 perf_disable(); 1456 perf_pmu_disable(ctx->pmu);
1415
1416 /* 1457 /*
1417 * We want to keep the following priority order: 1458 * We want to keep the following priority order:
1418 * cpu pinned (that don't need to move), task pinned, 1459 * cpu pinned (that don't need to move), task pinned,
@@ -1426,7 +1467,37 @@ void perf_event_task_sched_in(struct task_struct *task)
1426 1467
1427 cpuctx->task_ctx = ctx; 1468 cpuctx->task_ctx = ctx;
1428 1469
1429 perf_enable(); 1470 /*
1471 * Since these rotations are per-cpu, we need to ensure the
1472 * cpu-context we got scheduled on is actually rotating.
1473 */
1474 perf_pmu_rotate_start(ctx->pmu);
1475 perf_pmu_enable(ctx->pmu);
1476}
1477
1478/*
1479 * Called from scheduler to add the events of the current task
1480 * with interrupts disabled.
1481 *
1482 * We restore the event value and then enable it.
1483 *
1484 * This does not protect us against NMI, but enable()
1485 * sets the enabled bit in the control field of event _before_
1486 * accessing the event control register. If a NMI hits, then it will
1487 * keep the event running.
1488 */
1489void __perf_event_task_sched_in(struct task_struct *task)
1490{
1491 struct perf_event_context *ctx;
1492 int ctxn;
1493
1494 for_each_task_context_nr(ctxn) {
1495 ctx = task->perf_event_ctxp[ctxn];
1496 if (likely(!ctx))
1497 continue;
1498
1499 perf_event_context_sched_in(ctx);
1500 }
1430} 1501}
1431 1502
1432#define MAX_INTERRUPTS (~0ULL) 1503#define MAX_INTERRUPTS (~0ULL)
@@ -1506,22 +1577,6 @@ do { \
1506 return div64_u64(dividend, divisor); 1577 return div64_u64(dividend, divisor);
1507} 1578}
1508 1579
1509static void perf_event_stop(struct perf_event *event)
1510{
1511 if (!event->pmu->stop)
1512 return event->pmu->disable(event);
1513
1514 return event->pmu->stop(event);
1515}
1516
1517static int perf_event_start(struct perf_event *event)
1518{
1519 if (!event->pmu->start)
1520 return event->pmu->enable(event);
1521
1522 return event->pmu->start(event);
1523}
1524
1525static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 1580static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1526{ 1581{
1527 struct hw_perf_event *hwc = &event->hw; 1582 struct hw_perf_event *hwc = &event->hw;
@@ -1541,15 +1596,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1541 hwc->sample_period = sample_period; 1596 hwc->sample_period = sample_period;
1542 1597
1543 if (local64_read(&hwc->period_left) > 8*sample_period) { 1598 if (local64_read(&hwc->period_left) > 8*sample_period) {
1544 perf_disable(); 1599 event->pmu->stop(event, PERF_EF_UPDATE);
1545 perf_event_stop(event);
1546 local64_set(&hwc->period_left, 0); 1600 local64_set(&hwc->period_left, 0);
1547 perf_event_start(event); 1601 event->pmu->start(event, PERF_EF_RELOAD);
1548 perf_enable();
1549 } 1602 }
1550} 1603}
1551 1604
1552static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1605static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1553{ 1606{
1554 struct perf_event *event; 1607 struct perf_event *event;
1555 struct hw_perf_event *hwc; 1608 struct hw_perf_event *hwc;
@@ -1574,23 +1627,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1574 */ 1627 */
1575 if (interrupts == MAX_INTERRUPTS) { 1628 if (interrupts == MAX_INTERRUPTS) {
1576 perf_log_throttle(event, 1); 1629 perf_log_throttle(event, 1);
1577 perf_disable(); 1630 event->pmu->start(event, 0);
1578 event->pmu->unthrottle(event);
1579 perf_enable();
1580 } 1631 }
1581 1632
1582 if (!event->attr.freq || !event->attr.sample_freq) 1633 if (!event->attr.freq || !event->attr.sample_freq)
1583 continue; 1634 continue;
1584 1635
1585 perf_disable();
1586 event->pmu->read(event); 1636 event->pmu->read(event);
1587 now = local64_read(&event->count); 1637 now = local64_read(&event->count);
1588 delta = now - hwc->freq_count_stamp; 1638 delta = now - hwc->freq_count_stamp;
1589 hwc->freq_count_stamp = now; 1639 hwc->freq_count_stamp = now;
1590 1640
1591 if (delta > 0) 1641 if (delta > 0)
1592 perf_adjust_period(event, TICK_NSEC, delta); 1642 perf_adjust_period(event, period, delta);
1593 perf_enable();
1594 } 1643 }
1595 raw_spin_unlock(&ctx->lock); 1644 raw_spin_unlock(&ctx->lock);
1596} 1645}
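With stop()/start() now taking flags, the removed perf_event_stop()/perf_event_start() wrappers and the global perf_disable()/perf_enable() bracket become unnecessary: PERF_EF_UPDATE asks stop() to fold the hardware count into the event, and PERF_EF_RELOAD asks start() to reprogram the counter from the reset period. A stand-alone sketch of that protocol with made-up flag values; only the flag names come from the patch:

#include <stdio.h>

/* illustrative flag values; the real ones live in <linux/perf_event.h> */
#define EF_UPDATE  0x1  /* stop: fold the current hardware count into event->count */
#define EF_RELOAD  0x2  /* start: reprogram the counter from the saved period */

struct event { long long count, period_left, sample_period; int running; };

static void stop(struct event *e, int flags)
{
        e->running = 0;
        if (flags & EF_UPDATE)
                e->count += 42;         /* stand-in for reading the hardware counter */
}

static void start(struct event *e, int flags)
{
        if (flags & EF_RELOAD)
                e->period_left = e->sample_period;
        e->running = 1;
}

int main(void)
{
        struct event e = { .sample_period = 1000, .period_left = 9000 };

        /* the perf_adjust_period() pattern above: accumulated slack is discarded */
        stop(&e, EF_UPDATE);
        e.period_left = 0;
        start(&e, EF_RELOAD);
        printf("count=%lld period_left=%lld\n", e.count, e.period_left);
        return 0;
}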
@@ -1608,32 +1657,38 @@ static void rotate_ctx(struct perf_event_context *ctx)
1608 raw_spin_unlock(&ctx->lock); 1657 raw_spin_unlock(&ctx->lock);
1609} 1658}
1610 1659
1611void perf_event_task_tick(struct task_struct *curr) 1660/*
1661 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
1662 * because they're strictly cpu affine and rotate_start is called with IRQs
1663 * disabled, while rotate_context is called from IRQ context.
1664 */
1665static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1612{ 1666{
1613 struct perf_cpu_context *cpuctx; 1667 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
1614 struct perf_event_context *ctx; 1668 struct perf_event_context *ctx = NULL;
1615 int rotate = 0; 1669 int rotate = 0, remove = 1;
1616
1617 if (!atomic_read(&nr_events))
1618 return;
1619 1670
1620 cpuctx = &__get_cpu_var(perf_cpu_context); 1671 if (cpuctx->ctx.nr_events) {
1621 if (cpuctx->ctx.nr_events && 1672 remove = 0;
1622 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 1673 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1623 rotate = 1; 1674 rotate = 1;
1675 }
1624 1676
1625 ctx = curr->perf_event_ctxp; 1677 ctx = cpuctx->task_ctx;
1626 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) 1678 if (ctx && ctx->nr_events) {
1627 rotate = 1; 1679 remove = 0;
1680 if (ctx->nr_events != ctx->nr_active)
1681 rotate = 1;
1682 }
1628 1683
1629 perf_ctx_adjust_freq(&cpuctx->ctx); 1684 perf_pmu_disable(cpuctx->ctx.pmu);
1685 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
1630 if (ctx) 1686 if (ctx)
1631 perf_ctx_adjust_freq(ctx); 1687 perf_ctx_adjust_freq(ctx, interval);
1632 1688
1633 if (!rotate) 1689 if (!rotate)
1634 return; 1690 goto done;
1635 1691
1636 perf_disable();
1637 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 1692 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1638 if (ctx) 1693 if (ctx)
1639 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 1694 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1644,8 +1699,27 @@ void perf_event_task_tick(struct task_struct *curr)
1644 1699
1645 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 1700 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1646 if (ctx) 1701 if (ctx)
1647 task_ctx_sched_in(curr, EVENT_FLEXIBLE); 1702 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1648 perf_enable(); 1703
1704done:
1705 if (remove)
1706 list_del_init(&cpuctx->rotation_list);
1707
1708 perf_pmu_enable(cpuctx->ctx.pmu);
1709}
1710
1711void perf_event_task_tick(void)
1712{
1713 struct list_head *head = &__get_cpu_var(rotation_list);
1714 struct perf_cpu_context *cpuctx, *tmp;
1715
1716 WARN_ON(!irqs_disabled());
1717
1718 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
1719 if (cpuctx->jiffies_interval == 1 ||
1720 !(jiffies % cpuctx->jiffies_interval))
1721 perf_rotate_context(cpuctx);
1722 }
1649} 1723}
1650 1724
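perf_event_task_tick() above walks the per-cpu rotation_list and only rotates a context when jiffies reaches a multiple of its jiffies_interval, so slower PMUs can opt out of rotating on every tick. A small sketch of that gating; the two intervals chosen here are arbitrary:

#include <stdio.h>

struct cpuctx { const char *name; unsigned int jiffies_interval; };

/* rotate the flexible events of this context; here just a trace */
static void rotate(struct cpuctx *c) { printf("rotate %s\n", c->name); }

static void tick(unsigned long jiffies, struct cpuctx *v, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                /* interval == 1 means "every tick"; otherwise only on multiples */
                if (v[i].jiffies_interval == 1 ||
                    !(jiffies % v[i].jiffies_interval))
                        rotate(&v[i]);
        }
}

int main(void)
{
        struct cpuctx v[] = { { "cpu", 1 }, { "slow pmu", 4 } };
        unsigned long j;

        for (j = 1; j <= 8; j++)
                tick(j, v, 2);
        return 0;
}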
1651static int event_enable_on_exec(struct perf_event *event, 1725static int event_enable_on_exec(struct perf_event *event,
@@ -1667,20 +1741,18 @@ static int event_enable_on_exec(struct perf_event *event,
1667 * Enable all of a task's events that have been marked enable-on-exec. 1741 * Enable all of a task's events that have been marked enable-on-exec.
1668 * This expects task == current. 1742 * This expects task == current.
1669 */ 1743 */
1670static void perf_event_enable_on_exec(struct task_struct *task) 1744static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1671{ 1745{
1672 struct perf_event_context *ctx;
1673 struct perf_event *event; 1746 struct perf_event *event;
1674 unsigned long flags; 1747 unsigned long flags;
1675 int enabled = 0; 1748 int enabled = 0;
1676 int ret; 1749 int ret;
1677 1750
1678 local_irq_save(flags); 1751 local_irq_save(flags);
1679 ctx = task->perf_event_ctxp;
1680 if (!ctx || !ctx->nr_events) 1752 if (!ctx || !ctx->nr_events)
1681 goto out; 1753 goto out;
1682 1754
1683 __perf_event_task_sched_out(ctx); 1755 task_ctx_sched_out(ctx, EVENT_ALL);
1684 1756
1685 raw_spin_lock(&ctx->lock); 1757 raw_spin_lock(&ctx->lock);
1686 1758
@@ -1704,8 +1776,8 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1704 1776
1705 raw_spin_unlock(&ctx->lock); 1777 raw_spin_unlock(&ctx->lock);
1706 1778
1707 perf_event_task_sched_in(task); 1779 perf_event_context_sched_in(ctx);
1708 out: 1780out:
1709 local_irq_restore(flags); 1781 local_irq_restore(flags);
1710} 1782}
1711 1783
@@ -1714,9 +1786,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1714 */ 1786 */
1715static void __perf_event_read(void *info) 1787static void __perf_event_read(void *info)
1716{ 1788{
1717 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1718 struct perf_event *event = info; 1789 struct perf_event *event = info;
1719 struct perf_event_context *ctx = event->ctx; 1790 struct perf_event_context *ctx = event->ctx;
1791 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1720 1792
1721 /* 1793 /*
1722 * If this is a task context, we need to check whether it is 1794 * If this is a task context, we need to check whether it is
@@ -1755,7 +1827,13 @@ static u64 perf_event_read(struct perf_event *event)
1755 unsigned long flags; 1827 unsigned long flags;
1756 1828
1757 raw_spin_lock_irqsave(&ctx->lock, flags); 1829 raw_spin_lock_irqsave(&ctx->lock, flags);
1758 update_context_time(ctx); 1830 /*
1831 * may read while context is not active
1832 * (e.g., thread is blocked), in that case
1833 * we cannot update context time
1834 */
1835 if (ctx->is_active)
1836 update_context_time(ctx);
1759 update_event_times(event); 1837 update_event_times(event);
1760 raw_spin_unlock_irqrestore(&ctx->lock, flags); 1838 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1761 } 1839 }
@@ -1764,11 +1842,219 @@ static u64 perf_event_read(struct perf_event *event)
1764} 1842}
1765 1843
1766/* 1844/*
1767 * Initialize the perf_event context in a task_struct: 1845 * Callchain support
1768 */ 1846 */
1847
1848struct callchain_cpus_entries {
1849 struct rcu_head rcu_head;
1850 struct perf_callchain_entry *cpu_entries[0];
1851};
1852
1853static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
1854static atomic_t nr_callchain_events;
1855static DEFINE_MUTEX(callchain_mutex);
1856struct callchain_cpus_entries *callchain_cpus_entries;
1857
1858
1859__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
1860 struct pt_regs *regs)
1861{
1862}
1863
1864__weak void perf_callchain_user(struct perf_callchain_entry *entry,
1865 struct pt_regs *regs)
1866{
1867}
1868
1869static void release_callchain_buffers_rcu(struct rcu_head *head)
1870{
1871 struct callchain_cpus_entries *entries;
1872 int cpu;
1873
1874 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
1875
1876 for_each_possible_cpu(cpu)
1877 kfree(entries->cpu_entries[cpu]);
1878
1879 kfree(entries);
1880}
1881
1882static void release_callchain_buffers(void)
1883{
1884 struct callchain_cpus_entries *entries;
1885
1886 entries = callchain_cpus_entries;
1887 rcu_assign_pointer(callchain_cpus_entries, NULL);
1888 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
1889}
1890
1891static int alloc_callchain_buffers(void)
1892{
1893 int cpu;
1894 int size;
1895 struct callchain_cpus_entries *entries;
1896
1897 /*
1898 * We can't use the percpu allocation API for data that can be
1899 * accessed from NMI. Use a temporary manual per cpu allocation
1900 * until that gets sorted out.
1901 */
1902 size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
1903 num_possible_cpus();
1904
1905 entries = kzalloc(size, GFP_KERNEL);
1906 if (!entries)
1907 return -ENOMEM;
1908
1909 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
1910
1911 for_each_possible_cpu(cpu) {
1912 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
1913 cpu_to_node(cpu));
1914 if (!entries->cpu_entries[cpu])
1915 goto fail;
1916 }
1917
1918 rcu_assign_pointer(callchain_cpus_entries, entries);
1919
1920 return 0;
1921
1922fail:
1923 for_each_possible_cpu(cpu)
1924 kfree(entries->cpu_entries[cpu]);
1925 kfree(entries);
1926
1927 return -ENOMEM;
1928}
1929
1930static int get_callchain_buffers(void)
1931{
1932 int err = 0;
1933 int count;
1934
1935 mutex_lock(&callchain_mutex);
1936
1937 count = atomic_inc_return(&nr_callchain_events);
1938 if (WARN_ON_ONCE(count < 1)) {
1939 err = -EINVAL;
1940 goto exit;
1941 }
1942
1943 if (count > 1) {
1944 /* If the allocation failed, give up */
1945 if (!callchain_cpus_entries)
1946 err = -ENOMEM;
1947 goto exit;
1948 }
1949
1950 err = alloc_callchain_buffers();
1951 if (err)
1952 release_callchain_buffers();
1953exit:
1954 mutex_unlock(&callchain_mutex);
1955
1956 return err;
1957}
1958
1959static void put_callchain_buffers(void)
1960{
1961 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
1962 release_callchain_buffers();
1963 mutex_unlock(&callchain_mutex);
1964 }
1965}
1966
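get_callchain_buffers()/put_callchain_buffers() above implement a refcounted, lazily allocated singleton guarded by a mutex, with the buffers published through an RCU pointer so NMI-time readers never block. A user-space sketch of the refcount half of that pattern (the RCU-deferred free is omitted); the helper names here are illustrative:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int refcount;
static void *buffers;                   /* published pointer; NULL means "not allocated" */

static int get_buffers(void)
{
        int err = 0;

        pthread_mutex_lock(&lock);
        if (refcount == 0) {
                buffers = malloc(4096); /* first user allocates */
                if (!buffers)
                        err = -1;
        } else if (!buffers) {
                err = -1;               /* an earlier allocation failed */
        }
        if (!err)
                refcount++;
        pthread_mutex_unlock(&lock);
        return err;
}

static void put_buffers(void)
{
        pthread_mutex_lock(&lock);
        if (--refcount == 0) {          /* last user frees */
                free(buffers);
                buffers = NULL;
        }
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        if (get_buffers() == 0) {
                printf("buffers live\n");
                put_buffers();
        }
        return 0;
}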
1967static int get_recursion_context(int *recursion)
1968{
1969 int rctx;
1970
1971 if (in_nmi())
1972 rctx = 3;
1973 else if (in_irq())
1974 rctx = 2;
1975 else if (in_softirq())
1976 rctx = 1;
1977 else
1978 rctx = 0;
1979
1980 if (recursion[rctx])
1981 return -1;
1982
1983 recursion[rctx]++;
1984 barrier();
1985
1986 return rctx;
1987}
1988
1989static inline void put_recursion_context(int *recursion, int rctx)
1990{
1991 barrier();
1992 recursion[rctx]--;
1993}
1994
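The recursion counters above keep one flag per nesting level (task, softirq, hardirq, NMI, derived from in_softirq()/in_irq()/in_nmi()), so a handler that interrupts a different level can still run while re-entry within the same level is refused. A sketch of that bookkeeping; the per-CPU placement and barrier() ordering of the real code are left out:

#include <stdio.h>

/* one slot per nesting level: 0 task, 1 softirq, 2 hardirq, 3 NMI */
#define NR_LEVELS 4

static int recursion[NR_LEVELS];

static int get_recursion_context(int level)
{
        if (recursion[level])
                return -1;              /* already inside a handler at this level */
        recursion[level]++;
        return level;
}

static void put_recursion_context(int rctx)
{
        recursion[rctx]--;
}

int main(void)
{
        int a = get_recursion_context(2);       /* "hardirq" enters */
        int b = get_recursion_context(2);       /* nested attempt is rejected */

        printf("first=%d nested=%d\n", a, b);
        if (a >= 0)
                put_recursion_context(a);
        return 0;
}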
1995static struct perf_callchain_entry *get_callchain_entry(int *rctx)
1996{
1997 int cpu;
1998 struct callchain_cpus_entries *entries;
1999
2000 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2001 if (*rctx == -1)
2002 return NULL;
2003
2004 entries = rcu_dereference(callchain_cpus_entries);
2005 if (!entries)
2006 return NULL;
2007
2008 cpu = smp_processor_id();
2009
2010 return &entries->cpu_entries[cpu][*rctx];
2011}
2012
1769static void 2013static void
1770__perf_event_init_context(struct perf_event_context *ctx, 2014put_callchain_entry(int rctx)
1771 struct task_struct *task) 2015{
2016 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2017}
2018
2019static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2020{
2021 int rctx;
2022 struct perf_callchain_entry *entry;
2023
2024
2025 entry = get_callchain_entry(&rctx);
2026 if (rctx == -1)
2027 return NULL;
2028
2029 if (!entry)
2030 goto exit_put;
2031
2032 entry->nr = 0;
2033
2034 if (!user_mode(regs)) {
2035 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2036 perf_callchain_kernel(entry, regs);
2037 if (current->mm)
2038 regs = task_pt_regs(current);
2039 else
2040 regs = NULL;
2041 }
2042
2043 if (regs) {
2044 perf_callchain_store(entry, PERF_CONTEXT_USER);
2045 perf_callchain_user(entry, regs);
2046 }
2047
2048exit_put:
2049 put_callchain_entry(rctx);
2050
2051 return entry;
2052}
2053
2054/*
2055 * Initialize the perf_event context in a task_struct:
2056 */
2057static void __perf_event_init_context(struct perf_event_context *ctx)
1772{ 2058{
1773 raw_spin_lock_init(&ctx->lock); 2059 raw_spin_lock_init(&ctx->lock);
1774 mutex_init(&ctx->mutex); 2060 mutex_init(&ctx->mutex);
@@ -1776,45 +2062,38 @@ __perf_event_init_context(struct perf_event_context *ctx,
1776 INIT_LIST_HEAD(&ctx->flexible_groups); 2062 INIT_LIST_HEAD(&ctx->flexible_groups);
1777 INIT_LIST_HEAD(&ctx->event_list); 2063 INIT_LIST_HEAD(&ctx->event_list);
1778 atomic_set(&ctx->refcount, 1); 2064 atomic_set(&ctx->refcount, 1);
1779 ctx->task = task;
1780} 2065}
1781 2066
1782static struct perf_event_context *find_get_context(pid_t pid, int cpu) 2067static struct perf_event_context *
2068alloc_perf_context(struct pmu *pmu, struct task_struct *task)
1783{ 2069{
1784 struct perf_event_context *ctx; 2070 struct perf_event_context *ctx;
1785 struct perf_cpu_context *cpuctx;
1786 struct task_struct *task;
1787 unsigned long flags;
1788 int err;
1789 2071
1790 if (pid == -1 && cpu != -1) { 2072 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1791 /* Must be root to operate on a CPU event: */ 2073 if (!ctx)
1792 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 2074 return NULL;
1793 return ERR_PTR(-EACCES);
1794
1795 if (cpu < 0 || cpu >= nr_cpumask_bits)
1796 return ERR_PTR(-EINVAL);
1797 2075
1798 /* 2076 __perf_event_init_context(ctx);
1799 * We could be clever and allow to attach a event to an 2077 if (task) {
1800 * offline CPU and activate it when the CPU comes up, but 2078 ctx->task = task;
1801 * that's for later. 2079 get_task_struct(task);
1802 */ 2080 }
1803 if (!cpu_online(cpu)) 2081 ctx->pmu = pmu;
1804 return ERR_PTR(-ENODEV);
1805 2082
1806 cpuctx = &per_cpu(perf_cpu_context, cpu); 2083 return ctx;
1807 ctx = &cpuctx->ctx; 2084}
1808 get_ctx(ctx);
1809 2085
1810 return ctx; 2086static struct task_struct *
1811 } 2087find_lively_task_by_vpid(pid_t vpid)
2088{
2089 struct task_struct *task;
2090 int err;
1812 2091
1813 rcu_read_lock(); 2092 rcu_read_lock();
1814 if (!pid) 2093 if (!vpid)
1815 task = current; 2094 task = current;
1816 else 2095 else
1817 task = find_task_by_vpid(pid); 2096 task = find_task_by_vpid(vpid);
1818 if (task) 2097 if (task)
1819 get_task_struct(task); 2098 get_task_struct(task);
1820 rcu_read_unlock(); 2099 rcu_read_unlock();
@@ -1834,36 +2113,78 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1834 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2113 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1835 goto errout; 2114 goto errout;
1836 2115
1837 retry: 2116 return task;
1838 ctx = perf_lock_task_context(task, &flags); 2117errout:
2118 put_task_struct(task);
2119 return ERR_PTR(err);
2120
2121}
2122
2123static struct perf_event_context *
2124find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2125{
2126 struct perf_event_context *ctx;
2127 struct perf_cpu_context *cpuctx;
2128 unsigned long flags;
2129 int ctxn, err;
2130
2131 if (!task && cpu != -1) {
2132 /* Must be root to operate on a CPU event: */
2133 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2134 return ERR_PTR(-EACCES);
2135
2136 if (cpu < 0 || cpu >= nr_cpumask_bits)
2137 return ERR_PTR(-EINVAL);
2138
2139 /*
2140 * We could be clever and allow to attach a event to an
2141 * offline CPU and activate it when the CPU comes up, but
2142 * that's for later.
2143 */
2144 if (!cpu_online(cpu))
2145 return ERR_PTR(-ENODEV);
2146
2147 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2148 ctx = &cpuctx->ctx;
2149 get_ctx(ctx);
2150
2151 return ctx;
2152 }
2153
2154 err = -EINVAL;
2155 ctxn = pmu->task_ctx_nr;
2156 if (ctxn < 0)
2157 goto errout;
2158
2159retry:
2160 ctx = perf_lock_task_context(task, ctxn, &flags);
1839 if (ctx) { 2161 if (ctx) {
1840 unclone_ctx(ctx); 2162 unclone_ctx(ctx);
1841 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2163 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1842 } 2164 }
1843 2165
1844 if (!ctx) { 2166 if (!ctx) {
1845 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); 2167 ctx = alloc_perf_context(pmu, task);
1846 err = -ENOMEM; 2168 err = -ENOMEM;
1847 if (!ctx) 2169 if (!ctx)
1848 goto errout; 2170 goto errout;
1849 __perf_event_init_context(ctx, task); 2171
1850 get_ctx(ctx); 2172 get_ctx(ctx);
1851 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { 2173
2174 if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
1852 /* 2175 /*
1853 * We raced with some other task; use 2176 * We raced with some other task; use
1854 * the context they set. 2177 * the context they set.
1855 */ 2178 */
2179 put_task_struct(task);
1856 kfree(ctx); 2180 kfree(ctx);
1857 goto retry; 2181 goto retry;
1858 } 2182 }
1859 get_task_struct(task);
1860 } 2183 }
1861 2184
1862 put_task_struct(task);
1863 return ctx; 2185 return ctx;
1864 2186
1865 errout: 2187errout:
1866 put_task_struct(task);
1867 return ERR_PTR(err); 2188 return ERR_PTR(err);
1868} 2189}
1869 2190
@@ -1880,21 +2201,23 @@ static void free_event_rcu(struct rcu_head *head)
1880 kfree(event); 2201 kfree(event);
1881} 2202}
1882 2203
1883static void perf_pending_sync(struct perf_event *event);
1884static void perf_buffer_put(struct perf_buffer *buffer); 2204static void perf_buffer_put(struct perf_buffer *buffer);
1885 2205
1886static void free_event(struct perf_event *event) 2206static void free_event(struct perf_event *event)
1887{ 2207{
1888 perf_pending_sync(event); 2208 irq_work_sync(&event->pending);
1889 2209
1890 if (!event->parent) { 2210 if (!event->parent) {
1891 atomic_dec(&nr_events); 2211 if (event->attach_state & PERF_ATTACH_TASK)
2212 jump_label_dec(&perf_task_events);
1892 if (event->attr.mmap || event->attr.mmap_data) 2213 if (event->attr.mmap || event->attr.mmap_data)
1893 atomic_dec(&nr_mmap_events); 2214 atomic_dec(&nr_mmap_events);
1894 if (event->attr.comm) 2215 if (event->attr.comm)
1895 atomic_dec(&nr_comm_events); 2216 atomic_dec(&nr_comm_events);
1896 if (event->attr.task) 2217 if (event->attr.task)
1897 atomic_dec(&nr_task_events); 2218 atomic_dec(&nr_task_events);
2219 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2220 put_callchain_buffers();
1898 } 2221 }
1899 2222
1900 if (event->buffer) { 2223 if (event->buffer) {
@@ -1905,7 +2228,9 @@ static void free_event(struct perf_event *event)
1905 if (event->destroy) 2228 if (event->destroy)
1906 event->destroy(event); 2229 event->destroy(event);
1907 2230
1908 put_ctx(event->ctx); 2231 if (event->ctx)
2232 put_ctx(event->ctx);
2233
1909 call_rcu(&event->rcu_head, free_event_rcu); 2234 call_rcu(&event->rcu_head, free_event_rcu);
1910} 2235}
1911 2236
@@ -2184,15 +2509,13 @@ static void perf_event_for_each(struct perf_event *event,
2184static int perf_event_period(struct perf_event *event, u64 __user *arg) 2509static int perf_event_period(struct perf_event *event, u64 __user *arg)
2185{ 2510{
2186 struct perf_event_context *ctx = event->ctx; 2511 struct perf_event_context *ctx = event->ctx;
2187 unsigned long size;
2188 int ret = 0; 2512 int ret = 0;
2189 u64 value; 2513 u64 value;
2190 2514
2191 if (!event->attr.sample_period) 2515 if (!event->attr.sample_period)
2192 return -EINVAL; 2516 return -EINVAL;
2193 2517
2194 size = copy_from_user(&value, arg, sizeof(value)); 2518 if (copy_from_user(&value, arg, sizeof(value)))
2195 if (size != sizeof(value))
2196 return -EFAULT; 2519 return -EFAULT;
2197 2520
2198 if (!value) 2521 if (!value)
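The perf_event_period() change relies on copy_from_user() returning the number of bytes it could not copy (zero on success), so comparing the return value against sizeof(value) was the wrong idiom and "non-zero means fault" is the correct one. A small user-space stand-in with the same return contract:

#include <errno.h>
#include <stdio.h>
#include <string.h>

/*
 * Stand-in with the same contract as copy_from_user(): returns the number
 * of bytes it failed to copy, i.e. 0 on complete success.
 */
static unsigned long fake_copy(void *dst, const void *src, unsigned long n)
{
        if (!src)
                return n;               /* nothing copied at all */
        memcpy(dst, src, n);
        return 0;
}

static int read_u64(unsigned long long *out, const void *user_ptr)
{
        /* correct idiom: any non-zero return means a fault */
        if (fake_copy(out, user_ptr, sizeof(*out)))
                return -EFAULT;
        return 0;
}

int main(void)
{
        unsigned long long v = 0, src = 123;

        printf("ok=%d v=%llu\n", read_u64(&v, &src), v);
        printf("fault=%d\n", read_u64(&v, NULL));
        return 0;
}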
@@ -2326,6 +2649,9 @@ int perf_event_task_disable(void)
2326 2649
2327static int perf_event_index(struct perf_event *event) 2650static int perf_event_index(struct perf_event *event)
2328{ 2651{
2652 if (event->hw.state & PERF_HES_STOPPED)
2653 return 0;
2654
2329 if (event->state != PERF_EVENT_STATE_ACTIVE) 2655 if (event->state != PERF_EVENT_STATE_ACTIVE)
2330 return 0; 2656 return 0;
2331 2657
@@ -2829,16 +3155,7 @@ void perf_event_wakeup(struct perf_event *event)
2829 } 3155 }
2830} 3156}
2831 3157
2832/* 3158static void perf_pending_event(struct irq_work *entry)
2833 * Pending wakeups
2834 *
2835 * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
2836 *
2837 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2838 * single linked list and use cmpxchg() to add entries lockless.
2839 */
2840
2841static void perf_pending_event(struct perf_pending_entry *entry)
2842{ 3159{
2843 struct perf_event *event = container_of(entry, 3160 struct perf_event *event = container_of(entry,
2844 struct perf_event, pending); 3161 struct perf_event, pending);
@@ -2854,99 +3171,6 @@ static void perf_pending_event(struct perf_pending_entry *entry)
2854 } 3171 }
2855} 3172}
2856 3173
2857#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2858
2859static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2860 PENDING_TAIL,
2861};
2862
2863static void perf_pending_queue(struct perf_pending_entry *entry,
2864 void (*func)(struct perf_pending_entry *))
2865{
2866 struct perf_pending_entry **head;
2867
2868 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2869 return;
2870
2871 entry->func = func;
2872
2873 head = &get_cpu_var(perf_pending_head);
2874
2875 do {
2876 entry->next = *head;
2877 } while (cmpxchg(head, entry->next, entry) != entry->next);
2878
2879 set_perf_event_pending();
2880
2881 put_cpu_var(perf_pending_head);
2882}
2883
2884static int __perf_pending_run(void)
2885{
2886 struct perf_pending_entry *list;
2887 int nr = 0;
2888
2889 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2890 while (list != PENDING_TAIL) {
2891 void (*func)(struct perf_pending_entry *);
2892 struct perf_pending_entry *entry = list;
2893
2894 list = list->next;
2895
2896 func = entry->func;
2897 entry->next = NULL;
2898 /*
2899 * Ensure we observe the unqueue before we issue the wakeup,
2900 * so that we won't be waiting forever.
2901 * -- see perf_not_pending().
2902 */
2903 smp_wmb();
2904
2905 func(entry);
2906 nr++;
2907 }
2908
2909 return nr;
2910}
2911
2912static inline int perf_not_pending(struct perf_event *event)
2913{
2914 /*
2915 * If we flush on whatever cpu we run, there is a chance we don't
2916 * need to wait.
2917 */
2918 get_cpu();
2919 __perf_pending_run();
2920 put_cpu();
2921
2922 /*
2923 * Ensure we see the proper queue state before going to sleep
2924 * so that we do not miss the wakeup. -- see perf_pending_handle()
2925 */
2926 smp_rmb();
2927 return event->pending.next == NULL;
2928}
2929
2930static void perf_pending_sync(struct perf_event *event)
2931{
2932 wait_event(event->waitq, perf_not_pending(event));
2933}
2934
2935void perf_event_do_pending(void)
2936{
2937 __perf_pending_run();
2938}
2939
2940/*
2941 * Callchain support -- arch specific
2942 */
2943
2944__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2945{
2946 return NULL;
2947}
2948
2949
2950/* 3174/*
2951 * We assume there is only KVM supporting the callbacks. 3175 * We assume there is only KVM supporting the callbacks.
2952 * Later on, we might change it to a list if there is 3176 * Later on, we might change it to a list if there is
@@ -2996,8 +3220,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2996 3220
2997 if (handle->nmi) { 3221 if (handle->nmi) {
2998 handle->event->pending_wakeup = 1; 3222 handle->event->pending_wakeup = 1;
2999 perf_pending_queue(&handle->event->pending, 3223 irq_work_queue(&handle->event->pending);
3000 perf_pending_event);
3001 } else 3224 } else
3002 perf_event_wakeup(handle->event); 3225 perf_event_wakeup(handle->event);
3003} 3226}
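irq_work_queue()/irq_work_sync() replace the open-coded NMI-safe pending list that the earlier hunk removes: a per-CPU singly linked stack where an entry claims itself with cmpxchg() on its next pointer and the drainer grabs the whole list with xchg(). A user-space sketch of that removed pattern, which is what irq_work now provides generically; it runs single-threaded here, so the atomics only illustrate the shape:

#include <stdatomic.h>
#include <stdio.h>

struct entry {
        _Atomic(struct entry *) next;
        void (*func)(struct entry *);
};

/* sentinel tail: an empty list is TAIL, never NULL (NULL means "not queued") */
#define TAIL ((struct entry *)-1)

static _Atomic(struct entry *) head;

static void queue(struct entry *e, void (*func)(struct entry *))
{
        struct entry *expected = NULL;
        struct entry *old;

        /* first queuer wins; a second queue attempt is a no-op */
        if (!atomic_compare_exchange_strong(&e->next, &expected, TAIL))
                return;
        e->func = func;

        /* lock-free push onto the pending stack */
        old = atomic_load(&head);
        do {
                atomic_store(&e->next, old);
        } while (!atomic_compare_exchange_weak(&head, &old, e));
}

static void run_pending(void)
{
        /* grab the whole list at once, leaving it empty (== TAIL) */
        struct entry *list = atomic_exchange(&head, TAIL);

        while (list != TAIL) {
                struct entry *e = list;

                list = atomic_load(&e->next);
                atomic_store(&e->next, NULL);   /* mark "not queued" again */
                e->func(e);
        }
}

static void hello(struct entry *e) { (void)e; printf("pending work ran\n"); }

int main(void)
{
        struct entry e = { .next = NULL };

        atomic_store(&head, TAIL);      /* empty list */
        queue(&e, hello);
        queue(&e, hello);               /* ignored: already queued */
        run_pending();
        return 0;
}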
@@ -3053,7 +3276,7 @@ again:
3053 if (handle->wakeup != local_read(&buffer->wakeup)) 3276 if (handle->wakeup != local_read(&buffer->wakeup))
3054 perf_output_wakeup(handle); 3277 perf_output_wakeup(handle);
3055 3278
3056 out: 3279out:
3057 preempt_enable(); 3280 preempt_enable();
3058} 3281}
3059 3282
@@ -3441,14 +3664,20 @@ static void perf_event_output(struct perf_event *event, int nmi,
3441 struct perf_output_handle handle; 3664 struct perf_output_handle handle;
3442 struct perf_event_header header; 3665 struct perf_event_header header;
3443 3666
3667 /* protect the callchain buffers */
3668 rcu_read_lock();
3669
3444 perf_prepare_sample(&header, data, event, regs); 3670 perf_prepare_sample(&header, data, event, regs);
3445 3671
3446 if (perf_output_begin(&handle, event, header.size, nmi, 1)) 3672 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3447 return; 3673 goto exit;
3448 3674
3449 perf_output_sample(&handle, &header, data, event); 3675 perf_output_sample(&handle, &header, data, event);
3450 3676
3451 perf_output_end(&handle); 3677 perf_output_end(&handle);
3678
3679exit:
3680 rcu_read_unlock();
3452} 3681}
3453 3682
3454/* 3683/*
@@ -3562,16 +3791,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3562static void perf_event_task_event(struct perf_task_event *task_event) 3791static void perf_event_task_event(struct perf_task_event *task_event)
3563{ 3792{
3564 struct perf_cpu_context *cpuctx; 3793 struct perf_cpu_context *cpuctx;
3565 struct perf_event_context *ctx = task_event->task_ctx; 3794 struct perf_event_context *ctx;
3795 struct pmu *pmu;
3796 int ctxn;
3566 3797
3567 rcu_read_lock(); 3798 rcu_read_lock();
3568 cpuctx = &get_cpu_var(perf_cpu_context); 3799 list_for_each_entry_rcu(pmu, &pmus, entry) {
3569 perf_event_task_ctx(&cpuctx->ctx, task_event); 3800 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3570 if (!ctx) 3801 perf_event_task_ctx(&cpuctx->ctx, task_event);
3571 ctx = rcu_dereference(current->perf_event_ctxp); 3802
3572 if (ctx) 3803 ctx = task_event->task_ctx;
3573 perf_event_task_ctx(ctx, task_event); 3804 if (!ctx) {
3574 put_cpu_var(perf_cpu_context); 3805 ctxn = pmu->task_ctx_nr;
3806 if (ctxn < 0)
3807 goto next;
3808 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
3809 }
3810 if (ctx)
3811 perf_event_task_ctx(ctx, task_event);
3812next:
3813 put_cpu_ptr(pmu->pmu_cpu_context);
3814 }
3575 rcu_read_unlock(); 3815 rcu_read_unlock();
3576} 3816}
3577 3817
@@ -3676,8 +3916,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3676{ 3916{
3677 struct perf_cpu_context *cpuctx; 3917 struct perf_cpu_context *cpuctx;
3678 struct perf_event_context *ctx; 3918 struct perf_event_context *ctx;
3679 unsigned int size;
3680 char comm[TASK_COMM_LEN]; 3919 char comm[TASK_COMM_LEN];
3920 unsigned int size;
3921 struct pmu *pmu;
3922 int ctxn;
3681 3923
3682 memset(comm, 0, sizeof(comm)); 3924 memset(comm, 0, sizeof(comm));
3683 strlcpy(comm, comm_event->task->comm, sizeof(comm)); 3925 strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -3689,21 +3931,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3689 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3931 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3690 3932
3691 rcu_read_lock(); 3933 rcu_read_lock();
3692 cpuctx = &get_cpu_var(perf_cpu_context); 3934 list_for_each_entry_rcu(pmu, &pmus, entry) {
3693 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3935 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3694 ctx = rcu_dereference(current->perf_event_ctxp); 3936 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3695 if (ctx) 3937
3696 perf_event_comm_ctx(ctx, comm_event); 3938 ctxn = pmu->task_ctx_nr;
3697 put_cpu_var(perf_cpu_context); 3939 if (ctxn < 0)
3940 goto next;
3941
3942 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
3943 if (ctx)
3944 perf_event_comm_ctx(ctx, comm_event);
3945next:
3946 put_cpu_ptr(pmu->pmu_cpu_context);
3947 }
3698 rcu_read_unlock(); 3948 rcu_read_unlock();
3699} 3949}
3700 3950
3701void perf_event_comm(struct task_struct *task) 3951void perf_event_comm(struct task_struct *task)
3702{ 3952{
3703 struct perf_comm_event comm_event; 3953 struct perf_comm_event comm_event;
3954 struct perf_event_context *ctx;
3955 int ctxn;
3704 3956
3705 if (task->perf_event_ctxp) 3957 for_each_task_context_nr(ctxn) {
3706 perf_event_enable_on_exec(task); 3958 ctx = task->perf_event_ctxp[ctxn];
3959 if (!ctx)
3960 continue;
3961
3962 perf_event_enable_on_exec(ctx);
3963 }
3707 3964
3708 if (!atomic_read(&nr_comm_events)) 3965 if (!atomic_read(&nr_comm_events))
3709 return; 3966 return;
@@ -3805,6 +4062,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3805 char tmp[16]; 4062 char tmp[16];
3806 char *buf = NULL; 4063 char *buf = NULL;
3807 const char *name; 4064 const char *name;
4065 struct pmu *pmu;
4066 int ctxn;
3808 4067
3809 memset(tmp, 0, sizeof(tmp)); 4068 memset(tmp, 0, sizeof(tmp));
3810 4069
@@ -3857,12 +4116,23 @@ got_name:
3857 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 4116 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3858 4117
3859 rcu_read_lock(); 4118 rcu_read_lock();
3860 cpuctx = &get_cpu_var(perf_cpu_context); 4119 list_for_each_entry_rcu(pmu, &pmus, entry) {
3861 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); 4120 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3862 ctx = rcu_dereference(current->perf_event_ctxp); 4121 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
3863 if (ctx) 4122 vma->vm_flags & VM_EXEC);
3864 perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); 4123
3865 put_cpu_var(perf_cpu_context); 4124 ctxn = pmu->task_ctx_nr;
4125 if (ctxn < 0)
4126 goto next;
4127
4128 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4129 if (ctx) {
4130 perf_event_mmap_ctx(ctx, mmap_event,
4131 vma->vm_flags & VM_EXEC);
4132 }
4133next:
4134 put_cpu_ptr(pmu->pmu_cpu_context);
4135 }
3866 rcu_read_unlock(); 4136 rcu_read_unlock();
3867 4137
3868 kfree(buf); 4138 kfree(buf);
@@ -3944,8 +4214,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3944 struct hw_perf_event *hwc = &event->hw; 4214 struct hw_perf_event *hwc = &event->hw;
3945 int ret = 0; 4215 int ret = 0;
3946 4216
3947 throttle = (throttle && event->pmu->unthrottle != NULL);
3948
3949 if (!throttle) { 4217 if (!throttle) {
3950 hwc->interrupts++; 4218 hwc->interrupts++;
3951 } else { 4219 } else {
@@ -3988,8 +4256,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3988 event->pending_kill = POLL_HUP; 4256 event->pending_kill = POLL_HUP;
3989 if (nmi) { 4257 if (nmi) {
3990 event->pending_disable = 1; 4258 event->pending_disable = 1;
3991 perf_pending_queue(&event->pending, 4259 irq_work_queue(&event->pending);
3992 perf_pending_event);
3993 } else 4260 } else
3994 perf_event_disable(event); 4261 perf_event_disable(event);
3995 } 4262 }
@@ -4013,6 +4280,17 @@ int perf_event_overflow(struct perf_event *event, int nmi,
4013 * Generic software event infrastructure 4280 * Generic software event infrastructure
4014 */ 4281 */
4015 4282
4283struct swevent_htable {
4284 struct swevent_hlist *swevent_hlist;
4285 struct mutex hlist_mutex;
4286 int hlist_refcount;
4287
4288 /* Recursion avoidance in each contexts */
4289 int recursion[PERF_NR_CONTEXTS];
4290};
4291
4292static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
4293
4016/* 4294/*
4017 * We directly increment event->count and keep a second value in 4295 * We directly increment event->count and keep a second value in
4018 * event->hw.period_left to count intervals. This period event 4296 * event->hw.period_left to count intervals. This period event
@@ -4070,7 +4348,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4070 } 4348 }
4071} 4349}
4072 4350
4073static void perf_swevent_add(struct perf_event *event, u64 nr, 4351static void perf_swevent_event(struct perf_event *event, u64 nr,
4074 int nmi, struct perf_sample_data *data, 4352 int nmi, struct perf_sample_data *data,
4075 struct pt_regs *regs) 4353 struct pt_regs *regs)
4076{ 4354{
@@ -4096,6 +4374,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4096static int perf_exclude_event(struct perf_event *event, 4374static int perf_exclude_event(struct perf_event *event,
4097 struct pt_regs *regs) 4375 struct pt_regs *regs)
4098{ 4376{
4377 if (event->hw.state & PERF_HES_STOPPED)
4378 return 0;
4379
4099 if (regs) { 4380 if (regs) {
4100 if (event->attr.exclude_user && user_mode(regs)) 4381 if (event->attr.exclude_user && user_mode(regs))
4101 return 1; 4382 return 1;
@@ -4142,11 +4423,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4142 4423
4143/* For the read side: events when they trigger */ 4424/* For the read side: events when they trigger */
4144static inline struct hlist_head * 4425static inline struct hlist_head *
4145find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) 4426find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
4146{ 4427{
4147 struct swevent_hlist *hlist; 4428 struct swevent_hlist *hlist;
4148 4429
4149 hlist = rcu_dereference(ctx->swevent_hlist); 4430 hlist = rcu_dereference(swhash->swevent_hlist);
4150 if (!hlist) 4431 if (!hlist)
4151 return NULL; 4432 return NULL;
4152 4433
@@ -4155,7 +4436,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4155 4436
4156/* For the event head insertion and removal in the hlist */ 4437/* For the event head insertion and removal in the hlist */
4157static inline struct hlist_head * 4438static inline struct hlist_head *
4158find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) 4439find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
4159{ 4440{
4160 struct swevent_hlist *hlist; 4441 struct swevent_hlist *hlist;
4161 u32 event_id = event->attr.config; 4442 u32 event_id = event->attr.config;
@@ -4166,7 +4447,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4166 * and release. Which makes the protected version suitable here. 4447 * and release. Which makes the protected version suitable here.
4167 * The context lock guarantees that. 4448 * The context lock guarantees that.
4168 */ 4449 */
4169 hlist = rcu_dereference_protected(ctx->swevent_hlist, 4450 hlist = rcu_dereference_protected(swhash->swevent_hlist,
4170 lockdep_is_held(&event->ctx->lock)); 4451 lockdep_is_held(&event->ctx->lock));
4171 if (!hlist) 4452 if (!hlist)
4172 return NULL; 4453 return NULL;
@@ -4179,23 +4460,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4179 struct perf_sample_data *data, 4460 struct perf_sample_data *data,
4180 struct pt_regs *regs) 4461 struct pt_regs *regs)
4181{ 4462{
4182 struct perf_cpu_context *cpuctx; 4463 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4183 struct perf_event *event; 4464 struct perf_event *event;
4184 struct hlist_node *node; 4465 struct hlist_node *node;
4185 struct hlist_head *head; 4466 struct hlist_head *head;
4186 4467
4187 cpuctx = &__get_cpu_var(perf_cpu_context);
4188
4189 rcu_read_lock(); 4468 rcu_read_lock();
4190 4469 head = find_swevent_head_rcu(swhash, type, event_id);
4191 head = find_swevent_head_rcu(cpuctx, type, event_id);
4192
4193 if (!head) 4470 if (!head)
4194 goto end; 4471 goto end;
4195 4472
4196 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4473 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4197 if (perf_swevent_match(event, type, event_id, data, regs)) 4474 if (perf_swevent_match(event, type, event_id, data, regs))
4198 perf_swevent_add(event, nr, nmi, data, regs); 4475 perf_swevent_event(event, nr, nmi, data, regs);
4199 } 4476 }
4200end: 4477end:
4201 rcu_read_unlock(); 4478 rcu_read_unlock();
@@ -4203,33 +4480,17 @@ end:
4203 4480
4204int perf_swevent_get_recursion_context(void) 4481int perf_swevent_get_recursion_context(void)
4205{ 4482{
4206 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4483 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4207 int rctx;
4208
4209 if (in_nmi())
4210 rctx = 3;
4211 else if (in_irq())
4212 rctx = 2;
4213 else if (in_softirq())
4214 rctx = 1;
4215 else
4216 rctx = 0;
4217 4484
4218 if (cpuctx->recursion[rctx]) 4485 return get_recursion_context(swhash->recursion);
4219 return -1;
4220
4221 cpuctx->recursion[rctx]++;
4222 barrier();
4223
4224 return rctx;
4225} 4486}
4226EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4487EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4227 4488
4228void inline perf_swevent_put_recursion_context(int rctx) 4489void inline perf_swevent_put_recursion_context(int rctx)
4229{ 4490{
4230 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4491 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4231 barrier(); 4492
4232 cpuctx->recursion[rctx]--; 4493 put_recursion_context(swhash->recursion, rctx);
4233} 4494}
4234 4495
4235void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4496void __perf_sw_event(u32 event_id, u64 nr, int nmi,
@@ -4255,20 +4516,20 @@ static void perf_swevent_read(struct perf_event *event)
4255{ 4516{
4256} 4517}
4257 4518
4258static int perf_swevent_enable(struct perf_event *event) 4519static int perf_swevent_add(struct perf_event *event, int flags)
4259{ 4520{
4521 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4260 struct hw_perf_event *hwc = &event->hw; 4522 struct hw_perf_event *hwc = &event->hw;
4261 struct perf_cpu_context *cpuctx;
4262 struct hlist_head *head; 4523 struct hlist_head *head;
4263 4524
4264 cpuctx = &__get_cpu_var(perf_cpu_context);
4265
4266 if (hwc->sample_period) { 4525 if (hwc->sample_period) {
4267 hwc->last_period = hwc->sample_period; 4526 hwc->last_period = hwc->sample_period;
4268 perf_swevent_set_period(event); 4527 perf_swevent_set_period(event);
4269 } 4528 }
4270 4529
4271 head = find_swevent_head(cpuctx, event); 4530 hwc->state = !(flags & PERF_EF_START);
4531
4532 head = find_swevent_head(swhash, event);
4272 if (WARN_ON_ONCE(!head)) 4533 if (WARN_ON_ONCE(!head))
4273 return -EINVAL; 4534 return -EINVAL;
4274 4535
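perf_swevent_add() shows the new split between presence and counting: add() leaves the event in PERF_HES_STOPPED unless PERF_EF_START is passed, start()/stop() only toggle hw.state, and the delivery paths drop samples for stopped events. A sketch of that state machine with illustrative constant values:

#include <stdio.h>

#define EF_START     0x1        /* illustrative values for the ->add() flags */
#define HES_STOPPED  0x1        /* and for the hw.state bits */

struct event { unsigned int state; long long count; };

/* ->add(): make the event present; only start counting if asked to */
static int add(struct event *e, int flags)
{
        e->state = (flags & EF_START) ? 0 : HES_STOPPED;
        return 0;
}

static void start(struct event *e, int flags) { (void)flags; e->state = 0; }
static void stop(struct event *e, int flags)  { (void)flags; e->state = HES_STOPPED; }

/* delivery path: a stopped event is present but must not count */
static void hit(struct event *e)
{
        if (e->state & HES_STOPPED)
                return;
        e->count++;
}

int main(void)
{
        struct event e = { 0, 0 };

        add(&e, 0);             /* added but not started */
        hit(&e);                /* dropped */
        start(&e, 0);
        hit(&e);                /* counted */
        stop(&e, 0);
        printf("count=%lld\n", e.count);
        return 0;
}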
@@ -4277,202 +4538,27 @@ static int perf_swevent_enable(struct perf_event *event)
4277 return 0; 4538 return 0;
4278} 4539}
4279 4540
4280static void perf_swevent_disable(struct perf_event *event) 4541static void perf_swevent_del(struct perf_event *event, int flags)
4281{ 4542{
4282 hlist_del_rcu(&event->hlist_entry); 4543 hlist_del_rcu(&event->hlist_entry);
4283} 4544}
4284 4545
4285static void perf_swevent_void(struct perf_event *event) 4546static void perf_swevent_start(struct perf_event *event, int flags)
4286{
4287}
4288
4289static int perf_swevent_int(struct perf_event *event)
4290{
4291 return 0;
4292}
4293
4294static const struct pmu perf_ops_generic = {
4295 .enable = perf_swevent_enable,
4296 .disable = perf_swevent_disable,
4297 .start = perf_swevent_int,
4298 .stop = perf_swevent_void,
4299 .read = perf_swevent_read,
4300 .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
4301};
4302
4303/*
4304 * hrtimer based swevent callback
4305 */
4306
4307static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4308{
4309 enum hrtimer_restart ret = HRTIMER_RESTART;
4310 struct perf_sample_data data;
4311 struct pt_regs *regs;
4312 struct perf_event *event;
4313 u64 period;
4314
4315 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4316 event->pmu->read(event);
4317
4318 perf_sample_data_init(&data, 0);
4319 data.period = event->hw.last_period;
4320 regs = get_irq_regs();
4321
4322 if (regs && !perf_exclude_event(event, regs)) {
4323 if (!(event->attr.exclude_idle && current->pid == 0))
4324 if (perf_event_overflow(event, 0, &data, regs))
4325 ret = HRTIMER_NORESTART;
4326 }
4327
4328 period = max_t(u64, 10000, event->hw.sample_period);
4329 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4330
4331 return ret;
4332}
4333
4334static void perf_swevent_start_hrtimer(struct perf_event *event)
4335{
4336 struct hw_perf_event *hwc = &event->hw;
4337
4338 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4339 hwc->hrtimer.function = perf_swevent_hrtimer;
4340 if (hwc->sample_period) {
4341 u64 period;
4342
4343 if (hwc->remaining) {
4344 if (hwc->remaining < 0)
4345 period = 10000;
4346 else
4347 period = hwc->remaining;
4348 hwc->remaining = 0;
4349 } else {
4350 period = max_t(u64, 10000, hwc->sample_period);
4351 }
4352 __hrtimer_start_range_ns(&hwc->hrtimer,
4353 ns_to_ktime(period), 0,
4354 HRTIMER_MODE_REL, 0);
4355 }
4356}
4357
4358static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4359{
4360 struct hw_perf_event *hwc = &event->hw;
4361
4362 if (hwc->sample_period) {
4363 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4364 hwc->remaining = ktime_to_ns(remaining);
4365
4366 hrtimer_cancel(&hwc->hrtimer);
4367 }
4368}
4369
4370/*
4371 * Software event: cpu wall time clock
4372 */
4373
4374static void cpu_clock_perf_event_update(struct perf_event *event)
4375{
4376 int cpu = raw_smp_processor_id();
4377 s64 prev;
4378 u64 now;
4379
4380 now = cpu_clock(cpu);
4381 prev = local64_xchg(&event->hw.prev_count, now);
4382 local64_add(now - prev, &event->count);
4383}
4384
4385static int cpu_clock_perf_event_enable(struct perf_event *event)
4386{
4387 struct hw_perf_event *hwc = &event->hw;
4388 int cpu = raw_smp_processor_id();
4389
4390 local64_set(&hwc->prev_count, cpu_clock(cpu));
4391 perf_swevent_start_hrtimer(event);
4392
4393 return 0;
4394}
4395
4396static void cpu_clock_perf_event_disable(struct perf_event *event)
4397{ 4547{
4398 perf_swevent_cancel_hrtimer(event); 4548 event->hw.state = 0;
4399 cpu_clock_perf_event_update(event);
4400}
4401
4402static void cpu_clock_perf_event_read(struct perf_event *event)
4403{
4404 cpu_clock_perf_event_update(event);
4405}
4406
4407static const struct pmu perf_ops_cpu_clock = {
4408 .enable = cpu_clock_perf_event_enable,
4409 .disable = cpu_clock_perf_event_disable,
4410 .read = cpu_clock_perf_event_read,
4411};
4412
4413/*
4414 * Software event: task time clock
4415 */
4416
4417static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4418{
4419 u64 prev;
4420 s64 delta;
4421
4422 prev = local64_xchg(&event->hw.prev_count, now);
4423 delta = now - prev;
4424 local64_add(delta, &event->count);
4425}
4426
4427static int task_clock_perf_event_enable(struct perf_event *event)
4428{
4429 struct hw_perf_event *hwc = &event->hw;
4430 u64 now;
4431
4432 now = event->ctx->time;
4433
4434 local64_set(&hwc->prev_count, now);
4435
4436 perf_swevent_start_hrtimer(event);
4437
4438 return 0;
4439} 4549}
4440 4550
4441static void task_clock_perf_event_disable(struct perf_event *event) 4551static void perf_swevent_stop(struct perf_event *event, int flags)
4442{ 4552{
4443 perf_swevent_cancel_hrtimer(event); 4553 event->hw.state = PERF_HES_STOPPED;
4444 task_clock_perf_event_update(event, event->ctx->time);
4445
4446} 4554}
4447 4555
4448static void task_clock_perf_event_read(struct perf_event *event)
4449{
4450 u64 time;
4451
4452 if (!in_nmi()) {
4453 update_context_time(event->ctx);
4454 time = event->ctx->time;
4455 } else {
4456 u64 now = perf_clock();
4457 u64 delta = now - event->ctx->timestamp;
4458 time = event->ctx->time + delta;
4459 }
4460
4461 task_clock_perf_event_update(event, time);
4462}
4463
4464static const struct pmu perf_ops_task_clock = {
4465 .enable = task_clock_perf_event_enable,
4466 .disable = task_clock_perf_event_disable,
4467 .read = task_clock_perf_event_read,
4468};
4469
4470/* Deref the hlist from the update side */ 4556/* Deref the hlist from the update side */
4471static inline struct swevent_hlist * 4557static inline struct swevent_hlist *
4472swevent_hlist_deref(struct perf_cpu_context *cpuctx) 4558swevent_hlist_deref(struct swevent_htable *swhash)
4473{ 4559{
4474 return rcu_dereference_protected(cpuctx->swevent_hlist, 4560 return rcu_dereference_protected(swhash->swevent_hlist,
4475 lockdep_is_held(&cpuctx->hlist_mutex)); 4561 lockdep_is_held(&swhash->hlist_mutex));
4476} 4562}
4477 4563
4478static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) 4564static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
@@ -4483,27 +4569,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4483 kfree(hlist); 4569 kfree(hlist);
4484} 4570}
4485 4571
4486static void swevent_hlist_release(struct perf_cpu_context *cpuctx) 4572static void swevent_hlist_release(struct swevent_htable *swhash)
4487{ 4573{
4488 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); 4574 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
4489 4575
4490 if (!hlist) 4576 if (!hlist)
4491 return; 4577 return;
4492 4578
4493 rcu_assign_pointer(cpuctx->swevent_hlist, NULL); 4579 rcu_assign_pointer(swhash->swevent_hlist, NULL);
4494 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 4580 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4495} 4581}
4496 4582
4497static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) 4583static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4498{ 4584{
4499 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4585 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4500 4586
4501 mutex_lock(&cpuctx->hlist_mutex); 4587 mutex_lock(&swhash->hlist_mutex);
4502 4588
4503 if (!--cpuctx->hlist_refcount) 4589 if (!--swhash->hlist_refcount)
4504 swevent_hlist_release(cpuctx); 4590 swevent_hlist_release(swhash);
4505 4591
4506 mutex_unlock(&cpuctx->hlist_mutex); 4592 mutex_unlock(&swhash->hlist_mutex);
4507} 4593}
4508 4594
4509static void swevent_hlist_put(struct perf_event *event) 4595static void swevent_hlist_put(struct perf_event *event)
@@ -4521,12 +4607,12 @@ static void swevent_hlist_put(struct perf_event *event)
4521 4607
4522static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) 4608static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4523{ 4609{
4524 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 4610 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
4525 int err = 0; 4611 int err = 0;
4526 4612
4527 mutex_lock(&cpuctx->hlist_mutex); 4613 mutex_lock(&swhash->hlist_mutex);
4528 4614
4529 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { 4615 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
4530 struct swevent_hlist *hlist; 4616 struct swevent_hlist *hlist;
4531 4617
4532 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 4618 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4534,11 +4620,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4534 err = -ENOMEM; 4620 err = -ENOMEM;
4535 goto exit; 4621 goto exit;
4536 } 4622 }
4537 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 4623 rcu_assign_pointer(swhash->swevent_hlist, hlist);
4538 } 4624 }
4539 cpuctx->hlist_refcount++; 4625 swhash->hlist_refcount++;
4540 exit: 4626exit:
4541 mutex_unlock(&cpuctx->hlist_mutex); 4627 mutex_unlock(&swhash->hlist_mutex);
4542 4628
4543 return err; 4629 return err;
4544} 4630}
@@ -4562,7 +4648,7 @@ static int swevent_hlist_get(struct perf_event *event)
4562 put_online_cpus(); 4648 put_online_cpus();
4563 4649
4564 return 0; 4650 return 0;
4565 fail: 4651fail:
4566 for_each_possible_cpu(cpu) { 4652 for_each_possible_cpu(cpu) {
4567 if (cpu == failed_cpu) 4653 if (cpu == failed_cpu)
4568 break; 4654 break;
@@ -4573,17 +4659,64 @@ static int swevent_hlist_get(struct perf_event *event)
4573 return err; 4659 return err;
4574} 4660}
4575 4661
4576#ifdef CONFIG_EVENT_TRACING 4662atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4663
4664static void sw_perf_event_destroy(struct perf_event *event)
4665{
4666 u64 event_id = event->attr.config;
4667
4668 WARN_ON(event->parent);
4577 4669
4578static const struct pmu perf_ops_tracepoint = { 4670 jump_label_dec(&perf_swevent_enabled[event_id]);
4579 .enable = perf_trace_enable, 4671 swevent_hlist_put(event);
4580 .disable = perf_trace_disable, 4672}
4581 .start = perf_swevent_int, 4673
4582 .stop = perf_swevent_void, 4674static int perf_swevent_init(struct perf_event *event)
4675{
4676 int event_id = event->attr.config;
4677
4678 if (event->attr.type != PERF_TYPE_SOFTWARE)
4679 return -ENOENT;
4680
4681 switch (event_id) {
4682 case PERF_COUNT_SW_CPU_CLOCK:
4683 case PERF_COUNT_SW_TASK_CLOCK:
4684 return -ENOENT;
4685
4686 default:
4687 break;
4688 }
4689
4690 if (event_id > PERF_COUNT_SW_MAX)
4691 return -ENOENT;
4692
4693 if (!event->parent) {
4694 int err;
4695
4696 err = swevent_hlist_get(event);
4697 if (err)
4698 return err;
4699
4700 jump_label_inc(&perf_swevent_enabled[event_id]);
4701 event->destroy = sw_perf_event_destroy;
4702 }
4703
4704 return 0;
4705}
4706
4707static struct pmu perf_swevent = {
4708 .task_ctx_nr = perf_sw_context,
4709
4710 .event_init = perf_swevent_init,
4711 .add = perf_swevent_add,
4712 .del = perf_swevent_del,
4713 .start = perf_swevent_start,
4714 .stop = perf_swevent_stop,
4583 .read = perf_swevent_read, 4715 .read = perf_swevent_read,
4584 .unthrottle = perf_swevent_void,
4585}; 4716};
4586 4717
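perf_swevent_init() claims only PERF_TYPE_SOFTWARE events (returning -ENOENT otherwise so the core can try another PMU) and brackets the lifetime of each event id with jump_label_inc()/jump_label_dec(), keeping the swevent fast path cheap while nothing of that id is enabled. A user-space analogue that uses a plain atomic counter instead of a patched jump:

#include <stdatomic.h>
#include <stdio.h>

/* analogue of perf_swevent_enabled[]: one gate per software event id */
static atomic_int enabled[8];

static void event_create(int id)  { atomic_fetch_add(&enabled[id], 1); }
static void event_destroy(int id) { atomic_fetch_sub(&enabled[id], 1); }

/*
 * The hot path pays only a load-and-test while no event of this id exists;
 * the real code turns even that into a patched jump via jump_label_inc/dec.
 */
static void maybe_count(int id)
{
        if (!atomic_load(&enabled[id]))
                return;
        printf("sw event %d counted\n", id);
}

int main(void)
{
        maybe_count(3);         /* silent: nothing enabled */
        event_create(3);
        maybe_count(3);
        event_destroy(3);
        maybe_count(3);         /* silent again */
        return 0;
}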
4718#ifdef CONFIG_EVENT_TRACING
4719
4587static int perf_tp_filter_match(struct perf_event *event, 4720static int perf_tp_filter_match(struct perf_event *event,
4588 struct perf_sample_data *data) 4721 struct perf_sample_data *data)
4589{ 4722{
@@ -4627,7 +4760,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4627 4760
4628 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4761 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4629 if (perf_tp_event_match(event, &data, regs)) 4762 if (perf_tp_event_match(event, &data, regs))
4630 perf_swevent_add(event, count, 1, &data, regs); 4763 perf_swevent_event(event, count, 1, &data, regs);
4631 } 4764 }
4632 4765
4633 perf_swevent_put_recursion_context(rctx); 4766 perf_swevent_put_recursion_context(rctx);
@@ -4639,10 +4772,13 @@ static void tp_perf_event_destroy(struct perf_event *event)
4639 perf_trace_destroy(event); 4772 perf_trace_destroy(event);
4640} 4773}
4641 4774
4642static const struct pmu *tp_perf_event_init(struct perf_event *event) 4775static int perf_tp_event_init(struct perf_event *event)
4643{ 4776{
4644 int err; 4777 int err;
4645 4778
4779 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4780 return -ENOENT;
4781
4646 /* 4782 /*
4647 * Raw tracepoint data is a severe data leak, only allow root to 4783 * Raw tracepoint data is a severe data leak, only allow root to
4648 * have these. 4784 * have these.
@@ -4650,15 +4786,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4650 if ((event->attr.sample_type & PERF_SAMPLE_RAW) && 4786 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4651 perf_paranoid_tracepoint_raw() && 4787 perf_paranoid_tracepoint_raw() &&
4652 !capable(CAP_SYS_ADMIN)) 4788 !capable(CAP_SYS_ADMIN))
4653 return ERR_PTR(-EPERM); 4789 return -EPERM;
4654 4790
4655 err = perf_trace_init(event); 4791 err = perf_trace_init(event);
4656 if (err) 4792 if (err)
4657 return NULL; 4793 return err;
4658 4794
4659 event->destroy = tp_perf_event_destroy; 4795 event->destroy = tp_perf_event_destroy;
4660 4796
4661 return &perf_ops_tracepoint; 4797 return 0;
4798}
4799
4800static struct pmu perf_tracepoint = {
4801 .task_ctx_nr = perf_sw_context,
4802
4803 .event_init = perf_tp_event_init,
4804 .add = perf_trace_add,
4805 .del = perf_trace_del,
4806 .start = perf_swevent_start,
4807 .stop = perf_swevent_stop,
4808 .read = perf_swevent_read,
4809};
4810
4811static inline void perf_tp_register(void)
4812{
4813 perf_pmu_register(&perf_tracepoint);
4662} 4814}
4663 4815
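perf_pmu_register() adds the tracepoint PMU to the global list, and every ->event_init() follows the same convention: return -ENOENT for event types it does not own so the core can keep walking the list until some PMU claims the event. The core-side loop is not part of this hunk, so the sketch below only assumes its general shape:

#include <errno.h>
#include <stdio.h>

struct event;

struct pmu {
        const char *name;
        int (*event_init)(struct event *ev);
};

struct event { int type; const struct pmu *pmu; };

/* each init claims only its own event type; -ENOENT means "not mine" */
static int tp_init(struct event *ev) { return ev->type == 2 ? 0 : -ENOENT; }
static int sw_init(struct event *ev) { return ev->type == 1 ? 0 : -ENOENT; }

static const struct pmu pmus[] = {
        { "tracepoint", tp_init },
        { "software",   sw_init },
};

static int init_event(struct event *ev)
{
        unsigned int i;
        int err;

        for (i = 0; i < sizeof(pmus) / sizeof(pmus[0]); i++) {
                err = pmus[i].event_init(ev);
                if (!err) {
                        ev->pmu = &pmus[i];
                        return 0;
                }
                if (err != -ENOENT)
                        return err;     /* the pmu claimed it but failed */
        }
        return -ENOENT;
}

int main(void)
{
        struct event ev = { .type = 1 };

        if (!init_event(&ev))
                printf("event handled by %s pmu\n", ev.pmu->name);
        return 0;
}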
4664static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4816static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4686,9 +4838,8 @@ static void perf_event_free_filter(struct perf_event *event)
4686 4838
4687#else 4839#else
4688 4840
4689static const struct pmu *tp_perf_event_init(struct perf_event *event) 4841static inline void perf_tp_register(void)
4690{ 4842{
4691 return NULL;
4692} 4843}
4693 4844
4694static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4845static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4703,105 +4854,389 @@ static void perf_event_free_filter(struct perf_event *event)
4703#endif /* CONFIG_EVENT_TRACING */ 4854#endif /* CONFIG_EVENT_TRACING */
4704 4855
4705#ifdef CONFIG_HAVE_HW_BREAKPOINT 4856#ifdef CONFIG_HAVE_HW_BREAKPOINT
4706static void bp_perf_event_destroy(struct perf_event *event) 4857void perf_bp_event(struct perf_event *bp, void *data)
4707{ 4858{
4708 release_bp_slot(event); 4859 struct perf_sample_data sample;
4860 struct pt_regs *regs = data;
4861
4862 perf_sample_data_init(&sample, bp->attr.bp_addr);
4863
4864 if (!bp->hw.state && !perf_exclude_event(bp, regs))
4865 perf_swevent_event(bp, 1, 1, &sample, regs);
4709} 4866}
4867#endif
4868
4869/*
4870 * hrtimer based swevent callback
4871 */
4710 4872
4711static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4873static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4712{ 4874{
4713 int err; 4875 enum hrtimer_restart ret = HRTIMER_RESTART;
4876 struct perf_sample_data data;
4877 struct pt_regs *regs;
4878 struct perf_event *event;
4879 u64 period;
4714 4880
4715 err = register_perf_hw_breakpoint(bp); 4881 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4716 if (err) 4882 event->pmu->read(event);
4717 return ERR_PTR(err);
4718 4883
4719 bp->destroy = bp_perf_event_destroy; 4884 perf_sample_data_init(&data, 0);
4885 data.period = event->hw.last_period;
4886 regs = get_irq_regs();
4720 4887
4721 return &perf_ops_bp; 4888 if (regs && !perf_exclude_event(event, regs)) {
4889 if (!(event->attr.exclude_idle && current->pid == 0))
4890 if (perf_event_overflow(event, 0, &data, regs))
4891 ret = HRTIMER_NORESTART;
4892 }
4893
4894 period = max_t(u64, 10000, event->hw.sample_period);
4895 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4896
4897 return ret;
4722} 4898}
4723 4899
4724void perf_bp_event(struct perf_event *bp, void *data) 4900static void perf_swevent_start_hrtimer(struct perf_event *event)
4725{ 4901{
4726 struct perf_sample_data sample; 4902 struct hw_perf_event *hwc = &event->hw;
4727 struct pt_regs *regs = data;
4728 4903
4729 perf_sample_data_init(&sample, bp->attr.bp_addr); 4904 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4905 hwc->hrtimer.function = perf_swevent_hrtimer;
4906 if (hwc->sample_period) {
4907 s64 period = local64_read(&hwc->period_left);
4908
4909 if (period) {
4910 if (period < 0)
4911 period = 10000;
4912
4913 local64_set(&hwc->period_left, 0);
4914 } else {
4915 period = max_t(u64, 10000, hwc->sample_period);
4916 }
4917 __hrtimer_start_range_ns(&hwc->hrtimer,
4918 ns_to_ktime(period), 0,
4919 HRTIMER_MODE_REL_PINNED, 0);
4920 }
4921}
4730 4922
4731 if (!perf_exclude_event(bp, regs)) 4923static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4732 perf_swevent_add(bp, 1, 1, &sample, regs); 4924{
4925 struct hw_perf_event *hwc = &event->hw;
4926
4927 if (hwc->sample_period) {
4928 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4929 local64_set(&hwc->period_left, ktime_to_ns(remaining));
4930
4931 hrtimer_cancel(&hwc->hrtimer);
4932 }
4733} 4933}
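perf_swevent_start_hrtimer()/perf_swevent_cancel_hrtimer() now park the unexpired part of the sampling period in hwc->period_left (the old hwc->remaining field is gone), resume from it on restart, and clamp everything to the 10 us floor. A sketch of just that arithmetic:

#include <stdio.h>

#define MIN_PERIOD_NS 10000LL           /* the 10us floor used above */

struct hwc { long long sample_period, period_left; };

/* stop: remember how much of the current period had not yet expired */
static void timer_stop(struct hwc *h, long long remaining_ns)
{
        h->period_left = remaining_ns;
}

/* (re)start: resume the leftover period if there is one, else a full one */
static long long timer_start(struct hwc *h)
{
        long long period = h->period_left;

        if (period) {
                if (period < 0)
                        period = MIN_PERIOD_NS;
                h->period_left = 0;
        } else {
                period = h->sample_period;
                if (period < MIN_PERIOD_NS)
                        period = MIN_PERIOD_NS;
        }
        return period;                  /* what would be armed on the hrtimer */
}

int main(void)
{
        struct hwc h = { .sample_period = 1000000 };

        printf("first arm: %lld ns\n", timer_start(&h));
        timer_stop(&h, 250000);         /* stopped with 250us still to run */
        printf("resume arm: %lld ns\n", timer_start(&h));
        return 0;
}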
4734#else 4934
4735static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4935/*
4936 * Software event: cpu wall time clock
4937 */
4938
4939static void cpu_clock_event_update(struct perf_event *event)
4736{ 4940{
4737 return NULL; 4941 s64 prev;
4942 u64 now;
4943
4944 now = local_clock();
4945 prev = local64_xchg(&event->hw.prev_count, now);
4946 local64_add(now - prev, &event->count);
4738} 4947}
4739 4948
4740void perf_bp_event(struct perf_event *bp, void *regs) 4949static void cpu_clock_event_start(struct perf_event *event, int flags)
4741{ 4950{
4951 local64_set(&event->hw.prev_count, local_clock());
4952 perf_swevent_start_hrtimer(event);
4742} 4953}
4743#endif
4744 4954
4745atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4955static void cpu_clock_event_stop(struct perf_event *event, int flags)
4956{
4957 perf_swevent_cancel_hrtimer(event);
4958 cpu_clock_event_update(event);
4959}
4746 4960
4747static void sw_perf_event_destroy(struct perf_event *event) 4961static int cpu_clock_event_add(struct perf_event *event, int flags)
4748{ 4962{
4749 u64 event_id = event->attr.config; 4963 if (flags & PERF_EF_START)
4964 cpu_clock_event_start(event, flags);
4750 4965
4751 WARN_ON(event->parent); 4966 return 0;
4967}
4752 4968
4753 atomic_dec(&perf_swevent_enabled[event_id]); 4969static void cpu_clock_event_del(struct perf_event *event, int flags)
4754 swevent_hlist_put(event); 4970{
4971 cpu_clock_event_stop(event, flags);
4755} 4972}
4756 4973
4757static const struct pmu *sw_perf_event_init(struct perf_event *event) 4974static void cpu_clock_event_read(struct perf_event *event)
4758{ 4975{
4759 const struct pmu *pmu = NULL; 4976 cpu_clock_event_update(event);
4760 u64 event_id = event->attr.config; 4977}
4978
4979static int cpu_clock_event_init(struct perf_event *event)
4980{
4981 if (event->attr.type != PERF_TYPE_SOFTWARE)
4982 return -ENOENT;
4983
4984 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
4985 return -ENOENT;
4761 4986
4987 return 0;
4988}
4989
4990static struct pmu perf_cpu_clock = {
4991 .task_ctx_nr = perf_sw_context,
4992
4993 .event_init = cpu_clock_event_init,
4994 .add = cpu_clock_event_add,
4995 .del = cpu_clock_event_del,
4996 .start = cpu_clock_event_start,
4997 .stop = cpu_clock_event_stop,
4998 .read = cpu_clock_event_read,
4999};
5000
5001/*
5002 * Software event: task time clock
5003 */
5004
5005static void task_clock_event_update(struct perf_event *event, u64 now)
5006{
5007 u64 prev;
5008 s64 delta;
5009
5010 prev = local64_xchg(&event->hw.prev_count, now);
5011 delta = now - prev;
5012 local64_add(delta, &event->count);
5013}
5014
5015static void task_clock_event_start(struct perf_event *event, int flags)
5016{
5017 local64_set(&event->hw.prev_count, event->ctx->time);
5018 perf_swevent_start_hrtimer(event);
5019}
5020
5021static void task_clock_event_stop(struct perf_event *event, int flags)
5022{
5023 perf_swevent_cancel_hrtimer(event);
5024 task_clock_event_update(event, event->ctx->time);
5025}
5026
5027static int task_clock_event_add(struct perf_event *event, int flags)
5028{
5029 if (flags & PERF_EF_START)
5030 task_clock_event_start(event, flags);
5031
5032 return 0;
5033}
5034
5035static void task_clock_event_del(struct perf_event *event, int flags)
5036{
5037 task_clock_event_stop(event, PERF_EF_UPDATE);
5038}
5039
5040static void task_clock_event_read(struct perf_event *event)
5041{
5042 u64 time;
5043
5044 if (!in_nmi()) {
5045 update_context_time(event->ctx);
5046 time = event->ctx->time;
5047 } else {
5048 u64 now = perf_clock();
5049 u64 delta = now - event->ctx->timestamp;
5050 time = event->ctx->time + delta;
5051 }
5052
5053 task_clock_event_update(event, time);
5054}
5055
5056static int task_clock_event_init(struct perf_event *event)
5057{
5058 if (event->attr.type != PERF_TYPE_SOFTWARE)
5059 return -ENOENT;
5060
5061 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5062 return -ENOENT;
5063
5064 return 0;
5065}
5066
5067static struct pmu perf_task_clock = {
5068 .task_ctx_nr = perf_sw_context,
5069
5070 .event_init = task_clock_event_init,
5071 .add = task_clock_event_add,
5072 .del = task_clock_event_del,
5073 .start = task_clock_event_start,
5074 .stop = task_clock_event_stop,
5075 .read = task_clock_event_read,
5076};
5077
5078static void perf_pmu_nop_void(struct pmu *pmu)
5079{
5080}
5081
5082static int perf_pmu_nop_int(struct pmu *pmu)
5083{
5084 return 0;
5085}
5086
5087static void perf_pmu_start_txn(struct pmu *pmu)
5088{
5089 perf_pmu_disable(pmu);
5090}
5091
5092static int perf_pmu_commit_txn(struct pmu *pmu)
5093{
5094 perf_pmu_enable(pmu);
5095 return 0;
5096}
5097
5098static void perf_pmu_cancel_txn(struct pmu *pmu)
5099{
5100 perf_pmu_enable(pmu);
5101}
5102
5103/*
5104 * Ensures all contexts with the same task_ctx_nr have the same
5105 * pmu_cpu_context too.
5106 */
5107static void *find_pmu_context(int ctxn)
5108{
5109 struct pmu *pmu;
5110
5111 if (ctxn < 0)
5112 return NULL;
5113
5114 list_for_each_entry(pmu, &pmus, entry) {
5115 if (pmu->task_ctx_nr == ctxn)
5116 return pmu->pmu_cpu_context;
5117 }
5118
5119 return NULL;
5120}
5121
5122static void free_pmu_context(void * __percpu cpu_context)
5123{
5124 struct pmu *pmu;
5125
5126 mutex_lock(&pmus_lock);
4762 /* 5127 /*
4763 * Software events (currently) can't in general distinguish 5128 * Like a real lame refcount.
4764 * between user, kernel and hypervisor events.
4765 * However, context switches and cpu migrations are considered
4766 * to be kernel events, and page faults are never hypervisor
4767 * events.
4768 */ 5129 */
4769 switch (event_id) { 5130 list_for_each_entry(pmu, &pmus, entry) {
4770 case PERF_COUNT_SW_CPU_CLOCK: 5131 if (pmu->pmu_cpu_context == cpu_context)
4771 pmu = &perf_ops_cpu_clock; 5132 goto out;
5133 }
4772 5134
4773 break; 5135 free_percpu(cpu_context);
4774 case PERF_COUNT_SW_TASK_CLOCK: 5136out:
4775 /* 5137 mutex_unlock(&pmus_lock);
4776 * If the user instantiates this as a per-cpu event, 5138}
4777 * use the cpu_clock event instead.
4778 */
4779 if (event->ctx->task)
4780 pmu = &perf_ops_task_clock;
4781 else
4782 pmu = &perf_ops_cpu_clock;
4783 5139
4784 break; 5140int perf_pmu_register(struct pmu *pmu)
4785 case PERF_COUNT_SW_PAGE_FAULTS: 5141{
4786 case PERF_COUNT_SW_PAGE_FAULTS_MIN: 5142 int cpu, ret;
4787 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 5143
4788 case PERF_COUNT_SW_CONTEXT_SWITCHES: 5144 mutex_lock(&pmus_lock);
4789 case PERF_COUNT_SW_CPU_MIGRATIONS: 5145 ret = -ENOMEM;
4790 case PERF_COUNT_SW_ALIGNMENT_FAULTS: 5146 pmu->pmu_disable_count = alloc_percpu(int);
4791 case PERF_COUNT_SW_EMULATION_FAULTS: 5147 if (!pmu->pmu_disable_count)
4792 if (!event->parent) { 5148 goto unlock;
4793 int err; 5149
4794 5150 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
4795 err = swevent_hlist_get(event); 5151 if (pmu->pmu_cpu_context)
4796 if (err) 5152 goto got_cpu_context;
4797 return ERR_PTR(err); 5153
5154 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5155 if (!pmu->pmu_cpu_context)
5156 goto free_pdc;
4798 5157
4799 atomic_inc(&perf_swevent_enabled[event_id]); 5158 for_each_possible_cpu(cpu) {
4800 event->destroy = sw_perf_event_destroy; 5159 struct perf_cpu_context *cpuctx;
5160
5161 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5162 __perf_event_init_context(&cpuctx->ctx);
5163 cpuctx->ctx.type = cpu_context;
5164 cpuctx->ctx.pmu = pmu;
5165 cpuctx->jiffies_interval = 1;
5166 INIT_LIST_HEAD(&cpuctx->rotation_list);
5167 }
5168
5169got_cpu_context:
5170 if (!pmu->start_txn) {
5171 if (pmu->pmu_enable) {
5172 /*
5173 * If we have pmu_enable/pmu_disable calls, install
5174 * transaction stubs that use that to try and batch
5175 * hardware accesses.
5176 */
5177 pmu->start_txn = perf_pmu_start_txn;
5178 pmu->commit_txn = perf_pmu_commit_txn;
5179 pmu->cancel_txn = perf_pmu_cancel_txn;
5180 } else {
5181 pmu->start_txn = perf_pmu_nop_void;
5182 pmu->commit_txn = perf_pmu_nop_int;
5183 pmu->cancel_txn = perf_pmu_nop_void;
5184 }
5185 }
5186
5187 if (!pmu->pmu_enable) {
5188 pmu->pmu_enable = perf_pmu_nop_void;
5189 pmu->pmu_disable = perf_pmu_nop_void;
5190 }
5191
5192 list_add_rcu(&pmu->entry, &pmus);
5193 ret = 0;
5194unlock:
5195 mutex_unlock(&pmus_lock);
5196
5197 return ret;
5198
5199free_pdc:
5200 free_percpu(pmu->pmu_disable_count);
5201 goto unlock;
5202}
5203
5204void perf_pmu_unregister(struct pmu *pmu)
5205{
5206 mutex_lock(&pmus_lock);
5207 list_del_rcu(&pmu->entry);
5208 mutex_unlock(&pmus_lock);
5209
5210 /*
5211 * We dereference the pmu list under both SRCU and regular RCU, so
5212 * synchronize against both of those.
5213 */
5214 synchronize_srcu(&pmus_srcu);
5215 synchronize_rcu();
5216
5217 free_percpu(pmu->pmu_disable_count);
5218 free_pmu_context(pmu->pmu_cpu_context);
5219}
5220
5221struct pmu *perf_init_event(struct perf_event *event)
5222{
5223 struct pmu *pmu = NULL;
5224 int idx;
5225
5226 idx = srcu_read_lock(&pmus_srcu);
5227 list_for_each_entry_rcu(pmu, &pmus, entry) {
5228 int ret = pmu->event_init(event);
5229 if (!ret)
5230 goto unlock;
5231
5232 if (ret != -ENOENT) {
5233 pmu = ERR_PTR(ret);
5234 goto unlock;
4801 } 5235 }
4802 pmu = &perf_ops_generic;
4803 break;
4804 } 5236 }
5237 pmu = ERR_PTR(-ENOENT);
5238unlock:
5239 srcu_read_unlock(&pmus_srcu, idx);
4805 5240
4806 return pmu; 5241 return pmu;
4807} 5242}
@@ -4810,20 +5245,18 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4810 * Allocate and initialize a event structure 5245 * Allocate and initialize a event structure
4811 */ 5246 */
4812static struct perf_event * 5247static struct perf_event *
4813perf_event_alloc(struct perf_event_attr *attr, 5248perf_event_alloc(struct perf_event_attr *attr, int cpu,
4814 int cpu, 5249 struct task_struct *task,
4815 struct perf_event_context *ctx, 5250 struct perf_event *group_leader,
4816 struct perf_event *group_leader, 5251 struct perf_event *parent_event,
4817 struct perf_event *parent_event, 5252 perf_overflow_handler_t overflow_handler)
4818 perf_overflow_handler_t overflow_handler, 5253{
4819 gfp_t gfpflags) 5254 struct pmu *pmu;
4820{
4821 const struct pmu *pmu;
4822 struct perf_event *event; 5255 struct perf_event *event;
4823 struct hw_perf_event *hwc; 5256 struct hw_perf_event *hwc;
4824 long err; 5257 long err;
4825 5258
4826 event = kzalloc(sizeof(*event), gfpflags); 5259 event = kzalloc(sizeof(*event), GFP_KERNEL);
4827 if (!event) 5260 if (!event)
4828 return ERR_PTR(-ENOMEM); 5261 return ERR_PTR(-ENOMEM);
4829 5262
@@ -4841,6 +5274,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4841 INIT_LIST_HEAD(&event->event_entry); 5274 INIT_LIST_HEAD(&event->event_entry);
4842 INIT_LIST_HEAD(&event->sibling_list); 5275 INIT_LIST_HEAD(&event->sibling_list);
4843 init_waitqueue_head(&event->waitq); 5276 init_waitqueue_head(&event->waitq);
5277 init_irq_work(&event->pending, perf_pending_event);
4844 5278
4845 mutex_init(&event->mmap_mutex); 5279 mutex_init(&event->mmap_mutex);
4846 5280
@@ -4848,7 +5282,6 @@ perf_event_alloc(struct perf_event_attr *attr,
4848 event->attr = *attr; 5282 event->attr = *attr;
4849 event->group_leader = group_leader; 5283 event->group_leader = group_leader;
4850 event->pmu = NULL; 5284 event->pmu = NULL;
4851 event->ctx = ctx;
4852 event->oncpu = -1; 5285 event->oncpu = -1;
4853 5286
4854 event->parent = parent_event; 5287 event->parent = parent_event;
@@ -4858,6 +5291,17 @@ perf_event_alloc(struct perf_event_attr *attr,
4858 5291
4859 event->state = PERF_EVENT_STATE_INACTIVE; 5292 event->state = PERF_EVENT_STATE_INACTIVE;
4860 5293
5294 if (task) {
5295 event->attach_state = PERF_ATTACH_TASK;
5296#ifdef CONFIG_HAVE_HW_BREAKPOINT
5297 /*
5298 * hw_breakpoint is a bit difficult here..
5299 */
5300 if (attr->type == PERF_TYPE_BREAKPOINT)
5301 event->hw.bp_target = task;
5302#endif
5303 }
5304
4861 if (!overflow_handler && parent_event) 5305 if (!overflow_handler && parent_event)
4862 overflow_handler = parent_event->overflow_handler; 5306 overflow_handler = parent_event->overflow_handler;
4863 5307
@@ -4882,29 +5326,8 @@ perf_event_alloc(struct perf_event_attr *attr,
4882 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 5326 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4883 goto done; 5327 goto done;
4884 5328
4885 switch (attr->type) { 5329 pmu = perf_init_event(event);
4886 case PERF_TYPE_RAW:
4887 case PERF_TYPE_HARDWARE:
4888 case PERF_TYPE_HW_CACHE:
4889 pmu = hw_perf_event_init(event);
4890 break;
4891
4892 case PERF_TYPE_SOFTWARE:
4893 pmu = sw_perf_event_init(event);
4894 break;
4895
4896 case PERF_TYPE_TRACEPOINT:
4897 pmu = tp_perf_event_init(event);
4898 break;
4899
4900 case PERF_TYPE_BREAKPOINT:
4901 pmu = bp_perf_event_init(event);
4902 break;
4903
4904 5330
4905 default:
4906 break;
4907 }
4908done: 5331done:
4909 err = 0; 5332 err = 0;
4910 if (!pmu) 5333 if (!pmu)
@@ -4922,13 +5345,21 @@ done:
4922 event->pmu = pmu; 5345 event->pmu = pmu;
4923 5346
4924 if (!event->parent) { 5347 if (!event->parent) {
4925 atomic_inc(&nr_events); 5348 if (event->attach_state & PERF_ATTACH_TASK)
5349 jump_label_inc(&perf_task_events);
4926 if (event->attr.mmap || event->attr.mmap_data) 5350 if (event->attr.mmap || event->attr.mmap_data)
4927 atomic_inc(&nr_mmap_events); 5351 atomic_inc(&nr_mmap_events);
4928 if (event->attr.comm) 5352 if (event->attr.comm)
4929 atomic_inc(&nr_comm_events); 5353 atomic_inc(&nr_comm_events);
4930 if (event->attr.task) 5354 if (event->attr.task)
4931 atomic_inc(&nr_task_events); 5355 atomic_inc(&nr_task_events);
5356 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
5357 err = get_callchain_buffers();
5358 if (err) {
5359 free_event(event);
5360 return ERR_PTR(err);
5361 }
5362 }
4932 } 5363 }
4933 5364
4934 return event; 5365 return event;
@@ -5076,12 +5507,16 @@ SYSCALL_DEFINE5(perf_event_open,
5076 struct perf_event_attr __user *, attr_uptr, 5507 struct perf_event_attr __user *, attr_uptr,
5077 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 5508 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
5078{ 5509{
5079 struct perf_event *event, *group_leader = NULL, *output_event = NULL; 5510 struct perf_event *group_leader = NULL, *output_event = NULL;
5511 struct perf_event *event, *sibling;
5080 struct perf_event_attr attr; 5512 struct perf_event_attr attr;
5081 struct perf_event_context *ctx; 5513 struct perf_event_context *ctx;
5082 struct file *event_file = NULL; 5514 struct file *event_file = NULL;
5083 struct file *group_file = NULL; 5515 struct file *group_file = NULL;
5516 struct task_struct *task = NULL;
5517 struct pmu *pmu;
5084 int event_fd; 5518 int event_fd;
5519 int move_group = 0;
5085 int fput_needed = 0; 5520 int fput_needed = 0;
5086 int err; 5521 int err;
5087 5522
@@ -5107,20 +5542,11 @@ SYSCALL_DEFINE5(perf_event_open,
5107 if (event_fd < 0) 5542 if (event_fd < 0)
5108 return event_fd; 5543 return event_fd;
5109 5544
5110 /*
5111 * Get the target context (task or percpu):
5112 */
5113 ctx = find_get_context(pid, cpu);
5114 if (IS_ERR(ctx)) {
5115 err = PTR_ERR(ctx);
5116 goto err_fd;
5117 }
5118
5119 if (group_fd != -1) { 5545 if (group_fd != -1) {
5120 group_leader = perf_fget_light(group_fd, &fput_needed); 5546 group_leader = perf_fget_light(group_fd, &fput_needed);
5121 if (IS_ERR(group_leader)) { 5547 if (IS_ERR(group_leader)) {
5122 err = PTR_ERR(group_leader); 5548 err = PTR_ERR(group_leader);
5123 goto err_put_context; 5549 goto err_fd;
5124 } 5550 }
5125 group_file = group_leader->filp; 5551 group_file = group_leader->filp;
5126 if (flags & PERF_FLAG_FD_OUTPUT) 5552 if (flags & PERF_FLAG_FD_OUTPUT)
@@ -5129,6 +5555,58 @@ SYSCALL_DEFINE5(perf_event_open,
5129 group_leader = NULL; 5555 group_leader = NULL;
5130 } 5556 }
5131 5557
5558 if (pid != -1) {
5559 task = find_lively_task_by_vpid(pid);
5560 if (IS_ERR(task)) {
5561 err = PTR_ERR(task);
5562 goto err_group_fd;
5563 }
5564 }
5565
5566 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
5567 if (IS_ERR(event)) {
5568 err = PTR_ERR(event);
5569 goto err_task;
5570 }
5571
5572 /*
5573 * Special case software events and allow them to be part of
5574 * any hardware group.
5575 */
5576 pmu = event->pmu;
5577
5578 if (group_leader &&
5579 (is_software_event(event) != is_software_event(group_leader))) {
5580 if (is_software_event(event)) {
5581 /*
5582 * If event and group_leader are not both a software
5583 * event, and event is, then group leader is not.
5584 *
5585 * Allow the addition of software events to !software
5586 * groups, this is safe because software events never
5587 * fail to schedule.
5588 */
5589 pmu = group_leader->pmu;
5590 } else if (is_software_event(group_leader) &&
5591 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
5592 /*
5593 * In case the group is a pure software group, and we
5594 * try to add a hardware event, move the whole group to
5595 * the hardware context.
5596 */
5597 move_group = 1;
5598 }
5599 }
5600
5601 /*
5602 * Get the target context (task or percpu):
5603 */
5604 ctx = find_get_context(pmu, task, cpu);
5605 if (IS_ERR(ctx)) {
5606 err = PTR_ERR(ctx);
5607 goto err_alloc;
5608 }
5609
5132 /* 5610 /*
5133 * Look up the group leader (we will attach this event to it): 5611 * Look up the group leader (we will attach this event to it):
5134 */ 5612 */
@@ -5140,42 +5618,66 @@ SYSCALL_DEFINE5(perf_event_open,
5140 * becoming part of another group-sibling): 5618 * becoming part of another group-sibling):
5141 */ 5619 */
5142 if (group_leader->group_leader != group_leader) 5620 if (group_leader->group_leader != group_leader)
5143 goto err_put_context; 5621 goto err_context;
5144 /* 5622 /*
5145 * Do not allow to attach to a group in a different 5623 * Do not allow to attach to a group in a different
5146 * task or CPU context: 5624 * task or CPU context:
5147 */ 5625 */
5148 if (group_leader->ctx != ctx) 5626 if (move_group) {
5149 goto err_put_context; 5627 if (group_leader->ctx->type != ctx->type)
5628 goto err_context;
5629 } else {
5630 if (group_leader->ctx != ctx)
5631 goto err_context;
5632 }
5633
5150 /* 5634 /*
5151 * Only a group leader can be exclusive or pinned 5635 * Only a group leader can be exclusive or pinned
5152 */ 5636 */
5153 if (attr.exclusive || attr.pinned) 5637 if (attr.exclusive || attr.pinned)
5154 goto err_put_context; 5638 goto err_context;
5155 }
5156
5157 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
5158 NULL, NULL, GFP_KERNEL);
5159 if (IS_ERR(event)) {
5160 err = PTR_ERR(event);
5161 goto err_put_context;
5162 } 5639 }
5163 5640
5164 if (output_event) { 5641 if (output_event) {
5165 err = perf_event_set_output(event, output_event); 5642 err = perf_event_set_output(event, output_event);
5166 if (err) 5643 if (err)
5167 goto err_free_put_context; 5644 goto err_context;
5168 } 5645 }
5169 5646
5170 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); 5647 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
5171 if (IS_ERR(event_file)) { 5648 if (IS_ERR(event_file)) {
5172 err = PTR_ERR(event_file); 5649 err = PTR_ERR(event_file);
5173 goto err_free_put_context; 5650 goto err_context;
5651 }
5652
5653 if (move_group) {
5654 struct perf_event_context *gctx = group_leader->ctx;
5655
5656 mutex_lock(&gctx->mutex);
5657 perf_event_remove_from_context(group_leader);
5658 list_for_each_entry(sibling, &group_leader->sibling_list,
5659 group_entry) {
5660 perf_event_remove_from_context(sibling);
5661 put_ctx(gctx);
5662 }
5663 mutex_unlock(&gctx->mutex);
5664 put_ctx(gctx);
5174 } 5665 }
5175 5666
5176 event->filp = event_file; 5667 event->filp = event_file;
5177 WARN_ON_ONCE(ctx->parent_ctx); 5668 WARN_ON_ONCE(ctx->parent_ctx);
5178 mutex_lock(&ctx->mutex); 5669 mutex_lock(&ctx->mutex);
5670
5671 if (move_group) {
5672 perf_install_in_context(ctx, group_leader, cpu);
5673 get_ctx(ctx);
5674 list_for_each_entry(sibling, &group_leader->sibling_list,
5675 group_entry) {
5676 perf_install_in_context(ctx, sibling, cpu);
5677 get_ctx(ctx);
5678 }
5679 }
5680
5179 perf_install_in_context(ctx, event, cpu); 5681 perf_install_in_context(ctx, event, cpu);
5180 ++ctx->generation; 5682 ++ctx->generation;
5181 mutex_unlock(&ctx->mutex); 5683 mutex_unlock(&ctx->mutex);
@@ -5196,11 +5698,15 @@ SYSCALL_DEFINE5(perf_event_open,
5196 fd_install(event_fd, event_file); 5698 fd_install(event_fd, event_file);
5197 return event_fd; 5699 return event_fd;
5198 5700
5199err_free_put_context: 5701err_context:
5702 put_ctx(ctx);
5703err_alloc:
5200 free_event(event); 5704 free_event(event);
5201err_put_context: 5705err_task:
5706 if (task)
5707 put_task_struct(task);
5708err_group_fd:
5202 fput_light(group_file, fput_needed); 5709 fput_light(group_file, fput_needed);
5203 put_ctx(ctx);
5204err_fd: 5710err_fd:
5205 put_unused_fd(event_fd); 5711 put_unused_fd(event_fd);
5206 return err; 5712 return err;
@@ -5211,32 +5717,31 @@ err_fd:
5211 * 5717 *
5212 * @attr: attributes of the counter to create 5718 * @attr: attributes of the counter to create
5213 * @cpu: cpu in which the counter is bound 5719 * @cpu: cpu in which the counter is bound
5214 * @pid: task to profile 5720 * @task: task to profile (NULL for percpu)
5215 */ 5721 */
5216struct perf_event * 5722struct perf_event *
5217perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 5723perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5218 pid_t pid, 5724 struct task_struct *task,
5219 perf_overflow_handler_t overflow_handler) 5725 perf_overflow_handler_t overflow_handler)
5220{ 5726{
5221 struct perf_event *event;
5222 struct perf_event_context *ctx; 5727 struct perf_event_context *ctx;
5728 struct perf_event *event;
5223 int err; 5729 int err;
5224 5730
5225 /* 5731 /*
5226 * Get the target context (task or percpu): 5732 * Get the target context (task or percpu):
5227 */ 5733 */
5228 5734
5229 ctx = find_get_context(pid, cpu); 5735 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
5230 if (IS_ERR(ctx)) {
5231 err = PTR_ERR(ctx);
5232 goto err_exit;
5233 }
5234
5235 event = perf_event_alloc(attr, cpu, ctx, NULL,
5236 NULL, overflow_handler, GFP_KERNEL);
5237 if (IS_ERR(event)) { 5736 if (IS_ERR(event)) {
5238 err = PTR_ERR(event); 5737 err = PTR_ERR(event);
5239 goto err_put_context; 5738 goto err;
5739 }
5740
5741 ctx = find_get_context(event->pmu, task, cpu);
5742 if (IS_ERR(ctx)) {
5743 err = PTR_ERR(ctx);
5744 goto err_free;
5240 } 5745 }
5241 5746
5242 event->filp = NULL; 5747 event->filp = NULL;
@@ -5254,112 +5759,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5254 5759
5255 return event; 5760 return event;
5256 5761
5257 err_put_context: 5762err_free:
5258 put_ctx(ctx); 5763 free_event(event);
5259 err_exit: 5764err:
5260 return ERR_PTR(err); 5765 return ERR_PTR(err);
5261} 5766}
5262EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 5767EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
5263 5768
5264/*
5265 * inherit a event from parent task to child task:
5266 */
5267static struct perf_event *
5268inherit_event(struct perf_event *parent_event,
5269 struct task_struct *parent,
5270 struct perf_event_context *parent_ctx,
5271 struct task_struct *child,
5272 struct perf_event *group_leader,
5273 struct perf_event_context *child_ctx)
5274{
5275 struct perf_event *child_event;
5276
5277 /*
5278 * Instead of creating recursive hierarchies of events,
5279 * we link inherited events back to the original parent,
5280 * which has a filp for sure, which we use as the reference
5281 * count:
5282 */
5283 if (parent_event->parent)
5284 parent_event = parent_event->parent;
5285
5286 child_event = perf_event_alloc(&parent_event->attr,
5287 parent_event->cpu, child_ctx,
5288 group_leader, parent_event,
5289 NULL, GFP_KERNEL);
5290 if (IS_ERR(child_event))
5291 return child_event;
5292 get_ctx(child_ctx);
5293
5294 /*
5295 * Make the child state follow the state of the parent event,
5296 * not its attr.disabled bit. We hold the parent's mutex,
5297 * so we won't race with perf_event_{en, dis}able_family.
5298 */
5299 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
5300 child_event->state = PERF_EVENT_STATE_INACTIVE;
5301 else
5302 child_event->state = PERF_EVENT_STATE_OFF;
5303
5304 if (parent_event->attr.freq) {
5305 u64 sample_period = parent_event->hw.sample_period;
5306 struct hw_perf_event *hwc = &child_event->hw;
5307
5308 hwc->sample_period = sample_period;
5309 hwc->last_period = sample_period;
5310
5311 local64_set(&hwc->period_left, sample_period);
5312 }
5313
5314 child_event->overflow_handler = parent_event->overflow_handler;
5315
5316 /*
5317 * Link it up in the child's context:
5318 */
5319 add_event_to_ctx(child_event, child_ctx);
5320
5321 /*
5322 * Get a reference to the parent filp - we will fput it
5323 * when the child event exits. This is safe to do because
5324 * we are in the parent and we know that the filp still
5325 * exists and has a nonzero count:
5326 */
5327 atomic_long_inc(&parent_event->filp->f_count);
5328
5329 /*
5330 * Link this into the parent event's child list
5331 */
5332 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5333 mutex_lock(&parent_event->child_mutex);
5334 list_add_tail(&child_event->child_list, &parent_event->child_list);
5335 mutex_unlock(&parent_event->child_mutex);
5336
5337 return child_event;
5338}
5339
5340static int inherit_group(struct perf_event *parent_event,
5341 struct task_struct *parent,
5342 struct perf_event_context *parent_ctx,
5343 struct task_struct *child,
5344 struct perf_event_context *child_ctx)
5345{
5346 struct perf_event *leader;
5347 struct perf_event *sub;
5348 struct perf_event *child_ctr;
5349
5350 leader = inherit_event(parent_event, parent, parent_ctx,
5351 child, NULL, child_ctx);
5352 if (IS_ERR(leader))
5353 return PTR_ERR(leader);
5354 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
5355 child_ctr = inherit_event(sub, parent, parent_ctx,
5356 child, leader, child_ctx);
5357 if (IS_ERR(child_ctr))
5358 return PTR_ERR(child_ctr);
5359 }
5360 return 0;
5361}
5362
5363static void sync_child_event(struct perf_event *child_event, 5769static void sync_child_event(struct perf_event *child_event,
5364 struct task_struct *child) 5770 struct task_struct *child)
5365{ 5771{
@@ -5416,16 +5822,13 @@ __perf_event_exit_task(struct perf_event *child_event,
5416 } 5822 }
5417} 5823}
5418 5824
5419/* 5825static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
5420 * When a child task exits, feed back event values to parent events.
5421 */
5422void perf_event_exit_task(struct task_struct *child)
5423{ 5826{
5424 struct perf_event *child_event, *tmp; 5827 struct perf_event *child_event, *tmp;
5425 struct perf_event_context *child_ctx; 5828 struct perf_event_context *child_ctx;
5426 unsigned long flags; 5829 unsigned long flags;
5427 5830
5428 if (likely(!child->perf_event_ctxp)) { 5831 if (likely(!child->perf_event_ctxp[ctxn])) {
5429 perf_event_task(child, NULL, 0); 5832 perf_event_task(child, NULL, 0);
5430 return; 5833 return;
5431 } 5834 }
@@ -5437,8 +5840,8 @@ void perf_event_exit_task(struct task_struct *child)
5437 * scheduled, so we are now safe from rescheduling changing 5840 * scheduled, so we are now safe from rescheduling changing
5438 * our context. 5841 * our context.
5439 */ 5842 */
5440 child_ctx = child->perf_event_ctxp; 5843 child_ctx = child->perf_event_ctxp[ctxn];
5441 __perf_event_task_sched_out(child_ctx); 5844 task_ctx_sched_out(child_ctx, EVENT_ALL);
5442 5845
5443 /* 5846 /*
5444 * Take the context lock here so that if find_get_context is 5847 * Take the context lock here so that if find_get_context is
@@ -5446,7 +5849,7 @@ void perf_event_exit_task(struct task_struct *child)
5446 * incremented the context's refcount before we do put_ctx below. 5849 * incremented the context's refcount before we do put_ctx below.
5447 */ 5850 */
5448 raw_spin_lock(&child_ctx->lock); 5851 raw_spin_lock(&child_ctx->lock);
5449 child->perf_event_ctxp = NULL; 5852 child->perf_event_ctxp[ctxn] = NULL;
5450 /* 5853 /*
5451 * If this context is a clone; unclone it so it can't get 5854 * If this context is a clone; unclone it so it can't get
5452 * swapped to another process while we're removing all 5855 * swapped to another process while we're removing all
@@ -5499,6 +5902,17 @@ again:
5499 put_ctx(child_ctx); 5902 put_ctx(child_ctx);
5500} 5903}
5501 5904
5905/*
5906 * When a child task exits, feed back event values to parent events.
5907 */
5908void perf_event_exit_task(struct task_struct *child)
5909{
5910 int ctxn;
5911
5912 for_each_task_context_nr(ctxn)
5913 perf_event_exit_task_context(child, ctxn);
5914}
5915
5502static void perf_free_event(struct perf_event *event, 5916static void perf_free_event(struct perf_event *event,
5503 struct perf_event_context *ctx) 5917 struct perf_event_context *ctx)
5504{ 5918{
@@ -5520,48 +5934,166 @@ static void perf_free_event(struct perf_event *event,
5520 5934
5521/* 5935/*
5522 * free an unexposed, unused context as created by inheritance by 5936 * free an unexposed, unused context as created by inheritance by
5523 * init_task below, used by fork() in case of fail. 5937 * perf_event_init_task below, used by fork() in case of fail.
5524 */ 5938 */
5525void perf_event_free_task(struct task_struct *task) 5939void perf_event_free_task(struct task_struct *task)
5526{ 5940{
5527 struct perf_event_context *ctx = task->perf_event_ctxp; 5941 struct perf_event_context *ctx;
5528 struct perf_event *event, *tmp; 5942 struct perf_event *event, *tmp;
5943 int ctxn;
5529 5944
5530 if (!ctx) 5945 for_each_task_context_nr(ctxn) {
5531 return; 5946 ctx = task->perf_event_ctxp[ctxn];
5947 if (!ctx)
5948 continue;
5532 5949
5533 mutex_lock(&ctx->mutex); 5950 mutex_lock(&ctx->mutex);
5534again: 5951again:
5535 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 5952 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
5536 perf_free_event(event, ctx); 5953 group_entry)
5954 perf_free_event(event, ctx);
5537 5955
5538 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, 5956 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5539 group_entry) 5957 group_entry)
5540 perf_free_event(event, ctx); 5958 perf_free_event(event, ctx);
5541 5959
5542 if (!list_empty(&ctx->pinned_groups) || 5960 if (!list_empty(&ctx->pinned_groups) ||
5543 !list_empty(&ctx->flexible_groups)) 5961 !list_empty(&ctx->flexible_groups))
5544 goto again; 5962 goto again;
5545 5963
5546 mutex_unlock(&ctx->mutex); 5964 mutex_unlock(&ctx->mutex);
5547 5965
5548 put_ctx(ctx); 5966 put_ctx(ctx);
5967 }
5968}
5969
5970void perf_event_delayed_put(struct task_struct *task)
5971{
5972 int ctxn;
5973
5974 for_each_task_context_nr(ctxn)
5975 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
5976}
5977
5978/*
5979 * inherit a event from parent task to child task:
5980 */
5981static struct perf_event *
5982inherit_event(struct perf_event *parent_event,
5983 struct task_struct *parent,
5984 struct perf_event_context *parent_ctx,
5985 struct task_struct *child,
5986 struct perf_event *group_leader,
5987 struct perf_event_context *child_ctx)
5988{
5989 struct perf_event *child_event;
5990 unsigned long flags;
5991
5992 /*
5993 * Instead of creating recursive hierarchies of events,
5994 * we link inherited events back to the original parent,
5995 * which has a filp for sure, which we use as the reference
5996 * count:
5997 */
5998 if (parent_event->parent)
5999 parent_event = parent_event->parent;
6000
6001 child_event = perf_event_alloc(&parent_event->attr,
6002 parent_event->cpu,
6003 child,
6004 group_leader, parent_event,
6005 NULL);
6006 if (IS_ERR(child_event))
6007 return child_event;
6008 get_ctx(child_ctx);
6009
6010 /*
6011 * Make the child state follow the state of the parent event,
6012 * not its attr.disabled bit. We hold the parent's mutex,
6013 * so we won't race with perf_event_{en, dis}able_family.
6014 */
6015 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
6016 child_event->state = PERF_EVENT_STATE_INACTIVE;
6017 else
6018 child_event->state = PERF_EVENT_STATE_OFF;
6019
6020 if (parent_event->attr.freq) {
6021 u64 sample_period = parent_event->hw.sample_period;
6022 struct hw_perf_event *hwc = &child_event->hw;
6023
6024 hwc->sample_period = sample_period;
6025 hwc->last_period = sample_period;
6026
6027 local64_set(&hwc->period_left, sample_period);
6028 }
6029
6030 child_event->ctx = child_ctx;
6031 child_event->overflow_handler = parent_event->overflow_handler;
6032
6033 /*
6034 * Link it up in the child's context:
6035 */
6036 raw_spin_lock_irqsave(&child_ctx->lock, flags);
6037 add_event_to_ctx(child_event, child_ctx);
6038 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
6039
6040 /*
6041 * Get a reference to the parent filp - we will fput it
6042 * when the child event exits. This is safe to do because
6043 * we are in the parent and we know that the filp still
6044 * exists and has a nonzero count:
6045 */
6046 atomic_long_inc(&parent_event->filp->f_count);
6047
6048 /*
6049 * Link this into the parent event's child list
6050 */
6051 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6052 mutex_lock(&parent_event->child_mutex);
6053 list_add_tail(&child_event->child_list, &parent_event->child_list);
6054 mutex_unlock(&parent_event->child_mutex);
6055
6056 return child_event;
6057}
6058
6059static int inherit_group(struct perf_event *parent_event,
6060 struct task_struct *parent,
6061 struct perf_event_context *parent_ctx,
6062 struct task_struct *child,
6063 struct perf_event_context *child_ctx)
6064{
6065 struct perf_event *leader;
6066 struct perf_event *sub;
6067 struct perf_event *child_ctr;
6068
6069 leader = inherit_event(parent_event, parent, parent_ctx,
6070 child, NULL, child_ctx);
6071 if (IS_ERR(leader))
6072 return PTR_ERR(leader);
6073 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
6074 child_ctr = inherit_event(sub, parent, parent_ctx,
6075 child, leader, child_ctx);
6076 if (IS_ERR(child_ctr))
6077 return PTR_ERR(child_ctr);
6078 }
6079 return 0;
5549} 6080}
5550 6081
5551static int 6082static int
5552inherit_task_group(struct perf_event *event, struct task_struct *parent, 6083inherit_task_group(struct perf_event *event, struct task_struct *parent,
5553 struct perf_event_context *parent_ctx, 6084 struct perf_event_context *parent_ctx,
5554 struct task_struct *child, 6085 struct task_struct *child, int ctxn,
5555 int *inherited_all) 6086 int *inherited_all)
5556{ 6087{
5557 int ret; 6088 int ret;
5558 struct perf_event_context *child_ctx = child->perf_event_ctxp; 6089 struct perf_event_context *child_ctx;
5559 6090
5560 if (!event->attr.inherit) { 6091 if (!event->attr.inherit) {
5561 *inherited_all = 0; 6092 *inherited_all = 0;
5562 return 0; 6093 return 0;
5563 } 6094 }
5564 6095
6096 child_ctx = child->perf_event_ctxp[ctxn];
5565 if (!child_ctx) { 6097 if (!child_ctx) {
5566 /* 6098 /*
5567 * This is executed from the parent task context, so 6099 * This is executed from the parent task context, so
@@ -5570,14 +6102,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5570 * child. 6102 * child.
5571 */ 6103 */
5572 6104
5573 child_ctx = kzalloc(sizeof(struct perf_event_context), 6105 child_ctx = alloc_perf_context(event->pmu, child);
5574 GFP_KERNEL);
5575 if (!child_ctx) 6106 if (!child_ctx)
5576 return -ENOMEM; 6107 return -ENOMEM;
5577 6108
5578 __perf_event_init_context(child_ctx, child); 6109 child->perf_event_ctxp[ctxn] = child_ctx;
5579 child->perf_event_ctxp = child_ctx;
5580 get_task_struct(child);
5581 } 6110 }
5582 6111
5583 ret = inherit_group(event, parent, parent_ctx, 6112 ret = inherit_group(event, parent, parent_ctx,
@@ -5589,11 +6118,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
5589 return ret; 6118 return ret;
5590} 6119}
5591 6120
5592
5593/* 6121/*
5594 * Initialize the perf_event context in task_struct 6122 * Initialize the perf_event context in task_struct
5595 */ 6123 */
5596int perf_event_init_task(struct task_struct *child) 6124int perf_event_init_context(struct task_struct *child, int ctxn)
5597{ 6125{
5598 struct perf_event_context *child_ctx, *parent_ctx; 6126 struct perf_event_context *child_ctx, *parent_ctx;
5599 struct perf_event_context *cloned_ctx; 6127 struct perf_event_context *cloned_ctx;
@@ -5602,19 +6130,19 @@ int perf_event_init_task(struct task_struct *child)
5602 int inherited_all = 1; 6130 int inherited_all = 1;
5603 int ret = 0; 6131 int ret = 0;
5604 6132
5605 child->perf_event_ctxp = NULL; 6133 child->perf_event_ctxp[ctxn] = NULL;
5606 6134
5607 mutex_init(&child->perf_event_mutex); 6135 mutex_init(&child->perf_event_mutex);
5608 INIT_LIST_HEAD(&child->perf_event_list); 6136 INIT_LIST_HEAD(&child->perf_event_list);
5609 6137
5610 if (likely(!parent->perf_event_ctxp)) 6138 if (likely(!parent->perf_event_ctxp[ctxn]))
5611 return 0; 6139 return 0;
5612 6140
5613 /* 6141 /*
5614 * If the parent's context is a clone, pin it so it won't get 6142 * If the parent's context is a clone, pin it so it won't get
5615 * swapped under us. 6143 * swapped under us.
5616 */ 6144 */
5617 parent_ctx = perf_pin_task_context(parent); 6145 parent_ctx = perf_pin_task_context(parent, ctxn);
5618 6146
5619 /* 6147 /*
5620 * No need to check if parent_ctx != NULL here; since we saw 6148 * No need to check if parent_ctx != NULL here; since we saw
@@ -5634,20 +6162,20 @@ int perf_event_init_task(struct task_struct *child)
5634 * the list, not manipulating it: 6162 * the list, not manipulating it:
5635 */ 6163 */
5636 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { 6164 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5637 ret = inherit_task_group(event, parent, parent_ctx, child, 6165 ret = inherit_task_group(event, parent, parent_ctx,
5638 &inherited_all); 6166 child, ctxn, &inherited_all);
5639 if (ret) 6167 if (ret)
5640 break; 6168 break;
5641 } 6169 }
5642 6170
5643 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 6171 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5644 ret = inherit_task_group(event, parent, parent_ctx, child, 6172 ret = inherit_task_group(event, parent, parent_ctx,
5645 &inherited_all); 6173 child, ctxn, &inherited_all);
5646 if (ret) 6174 if (ret)
5647 break; 6175 break;
5648 } 6176 }
5649 6177
5650 child_ctx = child->perf_event_ctxp; 6178 child_ctx = child->perf_event_ctxp[ctxn];
5651 6179
5652 if (child_ctx && inherited_all) { 6180 if (child_ctx && inherited_all) {
5653 /* 6181 /*
@@ -5676,63 +6204,98 @@ int perf_event_init_task(struct task_struct *child)
5676 return ret; 6204 return ret;
5677} 6205}
5678 6206
6207/*
6208 * Initialize the perf_event context in task_struct
6209 */
6210int perf_event_init_task(struct task_struct *child)
6211{
6212 int ctxn, ret;
6213
6214 for_each_task_context_nr(ctxn) {
6215 ret = perf_event_init_context(child, ctxn);
6216 if (ret)
6217 return ret;
6218 }
6219
6220 return 0;
6221}
6222
5679static void __init perf_event_init_all_cpus(void) 6223static void __init perf_event_init_all_cpus(void)
5680{ 6224{
6225 struct swevent_htable *swhash;
5681 int cpu; 6226 int cpu;
5682 struct perf_cpu_context *cpuctx;
5683 6227
5684 for_each_possible_cpu(cpu) { 6228 for_each_possible_cpu(cpu) {
5685 cpuctx = &per_cpu(perf_cpu_context, cpu); 6229 swhash = &per_cpu(swevent_htable, cpu);
5686 mutex_init(&cpuctx->hlist_mutex); 6230 mutex_init(&swhash->hlist_mutex);
5687 __perf_event_init_context(&cpuctx->ctx, NULL); 6231 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
5688 } 6232 }
5689} 6233}
5690 6234
5691static void __cpuinit perf_event_init_cpu(int cpu) 6235static void __cpuinit perf_event_init_cpu(int cpu)
5692{ 6236{
5693 struct perf_cpu_context *cpuctx; 6237 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5694 6238
5695 cpuctx = &per_cpu(perf_cpu_context, cpu); 6239 mutex_lock(&swhash->hlist_mutex);
5696 6240 if (swhash->hlist_refcount > 0) {
5697 spin_lock(&perf_resource_lock);
5698 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5699 spin_unlock(&perf_resource_lock);
5700
5701 mutex_lock(&cpuctx->hlist_mutex);
5702 if (cpuctx->hlist_refcount > 0) {
5703 struct swevent_hlist *hlist; 6241 struct swevent_hlist *hlist;
5704 6242
5705 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 6243 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
5706 WARN_ON_ONCE(!hlist); 6244 WARN_ON(!hlist);
5707 rcu_assign_pointer(cpuctx->swevent_hlist, hlist); 6245 rcu_assign_pointer(swhash->swevent_hlist, hlist);
5708 } 6246 }
5709 mutex_unlock(&cpuctx->hlist_mutex); 6247 mutex_unlock(&swhash->hlist_mutex);
5710} 6248}
5711 6249
5712#ifdef CONFIG_HOTPLUG_CPU 6250#ifdef CONFIG_HOTPLUG_CPU
5713static void __perf_event_exit_cpu(void *info) 6251static void perf_pmu_rotate_stop(struct pmu *pmu)
5714{ 6252{
5715 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 6253 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
5716 struct perf_event_context *ctx = &cpuctx->ctx; 6254
6255 WARN_ON(!irqs_disabled());
6256
6257 list_del_init(&cpuctx->rotation_list);
6258}
6259
6260static void __perf_event_exit_context(void *__info)
6261{
6262 struct perf_event_context *ctx = __info;
5717 struct perf_event *event, *tmp; 6263 struct perf_event *event, *tmp;
5718 6264
6265 perf_pmu_rotate_stop(ctx->pmu);
6266
5719 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 6267 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5720 __perf_event_remove_from_context(event); 6268 __perf_event_remove_from_context(event);
5721 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 6269 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5722 __perf_event_remove_from_context(event); 6270 __perf_event_remove_from_context(event);
5723} 6271}
6272
6273static void perf_event_exit_cpu_context(int cpu)
6274{
6275 struct perf_event_context *ctx;
6276 struct pmu *pmu;
6277 int idx;
6278
6279 idx = srcu_read_lock(&pmus_srcu);
6280 list_for_each_entry_rcu(pmu, &pmus, entry) {
6281 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
6282
6283 mutex_lock(&ctx->mutex);
6284 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
6285 mutex_unlock(&ctx->mutex);
6286 }
6287 srcu_read_unlock(&pmus_srcu, idx);
6288}
6289
5724static void perf_event_exit_cpu(int cpu) 6290static void perf_event_exit_cpu(int cpu)
5725{ 6291{
5726 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 6292 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5727 struct perf_event_context *ctx = &cpuctx->ctx;
5728 6293
5729 mutex_lock(&cpuctx->hlist_mutex); 6294 mutex_lock(&swhash->hlist_mutex);
5730 swevent_hlist_release(cpuctx); 6295 swevent_hlist_release(swhash);
5731 mutex_unlock(&cpuctx->hlist_mutex); 6296 mutex_unlock(&swhash->hlist_mutex);
5732 6297
5733 mutex_lock(&ctx->mutex); 6298 perf_event_exit_cpu_context(cpu);
5734 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5735 mutex_unlock(&ctx->mutex);
5736} 6299}
5737#else 6300#else
5738static inline void perf_event_exit_cpu(int cpu) { } 6301static inline void perf_event_exit_cpu(int cpu) { }
@@ -5743,15 +6306,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5743{ 6306{
5744 unsigned int cpu = (long)hcpu; 6307 unsigned int cpu = (long)hcpu;
5745 6308
5746 switch (action) { 6309 switch (action & ~CPU_TASKS_FROZEN) {
5747 6310
5748 case CPU_UP_PREPARE: 6311 case CPU_UP_PREPARE:
5749 case CPU_UP_PREPARE_FROZEN: 6312 case CPU_DOWN_FAILED:
5750 perf_event_init_cpu(cpu); 6313 perf_event_init_cpu(cpu);
5751 break; 6314 break;
5752 6315
6316 case CPU_UP_CANCELED:
5753 case CPU_DOWN_PREPARE: 6317 case CPU_DOWN_PREPARE:
5754 case CPU_DOWN_PREPARE_FROZEN:
5755 perf_event_exit_cpu(cpu); 6318 perf_event_exit_cpu(cpu);
5756 break; 6319 break;
5757 6320
@@ -5762,118 +6325,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5762 return NOTIFY_OK; 6325 return NOTIFY_OK;
5763} 6326}
5764 6327
5765/*
5766 * This has to have a higher priority than migration_notifier in sched.c.
5767 */
5768static struct notifier_block __cpuinitdata perf_cpu_nb = {
5769 .notifier_call = perf_cpu_notify,
5770 .priority = 20,
5771};
5772
5773void __init perf_event_init(void) 6328void __init perf_event_init(void)
5774{ 6329{
5775 perf_event_init_all_cpus(); 6330 perf_event_init_all_cpus();
5776 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 6331 init_srcu_struct(&pmus_srcu);
5777 (void *)(long)smp_processor_id()); 6332 perf_pmu_register(&perf_swevent);
5778 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 6333 perf_pmu_register(&perf_cpu_clock);
5779 (void *)(long)smp_processor_id()); 6334 perf_pmu_register(&perf_task_clock);
5780 register_cpu_notifier(&perf_cpu_nb); 6335 perf_tp_register();
5781} 6336 perf_cpu_notifier(perf_cpu_notify);
5782
5783static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5784 struct sysdev_class_attribute *attr,
5785 char *buf)
5786{
5787 return sprintf(buf, "%d\n", perf_reserved_percpu);
5788}
5789
5790static ssize_t
5791perf_set_reserve_percpu(struct sysdev_class *class,
5792 struct sysdev_class_attribute *attr,
5793 const char *buf,
5794 size_t count)
5795{
5796 struct perf_cpu_context *cpuctx;
5797 unsigned long val;
5798 int err, cpu, mpt;
5799
5800 err = strict_strtoul(buf, 10, &val);
5801 if (err)
5802 return err;
5803 if (val > perf_max_events)
5804 return -EINVAL;
5805
5806 spin_lock(&perf_resource_lock);
5807 perf_reserved_percpu = val;
5808 for_each_online_cpu(cpu) {
5809 cpuctx = &per_cpu(perf_cpu_context, cpu);
5810 raw_spin_lock_irq(&cpuctx->ctx.lock);
5811 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5812 perf_max_events - perf_reserved_percpu);
5813 cpuctx->max_pertask = mpt;
5814 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5815 }
5816 spin_unlock(&perf_resource_lock);
5817
5818 return count;
5819}
5820
5821static ssize_t perf_show_overcommit(struct sysdev_class *class,
5822 struct sysdev_class_attribute *attr,
5823 char *buf)
5824{
5825 return sprintf(buf, "%d\n", perf_overcommit);
5826}
5827
5828static ssize_t
5829perf_set_overcommit(struct sysdev_class *class,
5830 struct sysdev_class_attribute *attr,
5831 const char *buf, size_t count)
5832{
5833 unsigned long val;
5834 int err;
5835
5836 err = strict_strtoul(buf, 10, &val);
5837 if (err)
5838 return err;
5839 if (val > 1)
5840 return -EINVAL;
5841
5842 spin_lock(&perf_resource_lock);
5843 perf_overcommit = val;
5844 spin_unlock(&perf_resource_lock);
5845
5846 return count;
5847}
5848
5849static SYSDEV_CLASS_ATTR(
5850 reserve_percpu,
5851 0644,
5852 perf_show_reserve_percpu,
5853 perf_set_reserve_percpu
5854 );
5855
5856static SYSDEV_CLASS_ATTR(
5857 overcommit,
5858 0644,
5859 perf_show_overcommit,
5860 perf_set_overcommit
5861 );
5862
5863static struct attribute *perfclass_attrs[] = {
5864 &attr_reserve_percpu.attr,
5865 &attr_overcommit.attr,
5866 NULL
5867};
5868
5869static struct attribute_group perfclass_attr_group = {
5870 .attrs = perfclass_attrs,
5871 .name = "perf_events",
5872};
5873
5874static int __init perf_event_sysfs_init(void)
5875{
5876 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5877 &perfclass_attr_group);
5878} 6337}
5879device_initcall(perf_event_sysfs_init);
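For readers coming to the reworked pmu API cold, here is a minimal, hypothetical sketch (not part of this patch) of a software-style pmu written against the new struct pmu callbacks that perf_pmu_register() and perf_init_event() dispatch to above. It simply mirrors the perf_cpu_clock pattern from this file; the names example_pmu, example_event_*() and EXAMPLE_CONFIG are invented for illustration only.

#include <linux/perf_event.h>
#include <linux/sched.h>

#define EXAMPLE_CONFIG  0x1000  /* made-up attr.config, not a real PERF_COUNT_SW_* id */

static void example_event_update(struct perf_event *event)
{
        u64 now = local_clock();
        u64 prev = local64_xchg(&event->hw.prev_count, now);

        local64_add(now - prev, &event->count);
}

static void example_event_start(struct perf_event *event, int flags)
{
        local64_set(&event->hw.prev_count, local_clock());
}

static void example_event_stop(struct perf_event *event, int flags)
{
        example_event_update(event);
}

static int example_event_add(struct perf_event *event, int flags)
{
        /* ->add() may be asked to start the event right away. */
        if (flags & PERF_EF_START)
                example_event_start(event, flags);
        return 0;
}

static void example_event_del(struct perf_event *event, int flags)
{
        example_event_stop(event, PERF_EF_UPDATE);
}

static void example_event_read(struct perf_event *event)
{
        example_event_update(event);
}

static int example_event_init(struct perf_event *event)
{
        /* Claim only our made-up config; -ENOENT lets the core try the next pmu. */
        if (event->attr.type != PERF_TYPE_SOFTWARE ||
            event->attr.config != EXAMPLE_CONFIG)
                return -ENOENT;

        return 0;
}

static struct pmu example_pmu = {
        .task_ctx_nr = perf_sw_context,

        .event_init  = example_event_init,
        .add         = example_event_add,
        .del         = example_event_del,
        .start       = example_event_start,
        .stop        = example_event_stop,
        .read        = example_event_read,
};

/* Would be registered once at init time: perf_pmu_register(&example_pmu); */

The -ENOENT return from ->event_init() is what lets perf_init_event() keep walking the pmu list until some pmu accepts the event, which is how the old switch over attr->type in perf_event_alloc() gets replaced.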
diff --git a/kernel/pid.c b/kernel/pid.c
index d55c6fb8d087..39b65b69584f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
401 struct task_struct *result = NULL; 401 struct task_struct *result = NULL;
402 if (pid) { 402 if (pid) {
403 struct hlist_node *first; 403 struct hlist_node *first;
404 first = rcu_dereference_check(pid->tasks[type].first, 404 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
405 rcu_read_lock_held() || 405 rcu_read_lock_held() ||
406 lockdep_tasklist_lock_is_held()); 406 lockdep_tasklist_lock_is_held());
407 if (first) 407 if (first)
@@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task);
416 */ 416 */
417struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 417struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
418{ 418{
419 rcu_lockdep_assert(rcu_read_lock_held());
419 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 420 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
420} 421}
421 422
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index b7e4c362361b..645e541a45f6 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -389,10 +389,12 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
389 } else if (count == 11) { /* len('0x12345678/0') */ 389 } else if (count == 11) { /* len('0x12345678/0') */
390 if (copy_from_user(ascii_value, buf, 11)) 390 if (copy_from_user(ascii_value, buf, 11))
391 return -EFAULT; 391 return -EFAULT;
392 if (strlen(ascii_value) != 10)
393 return -EINVAL;
392 x = sscanf(ascii_value, "%x", &value); 394 x = sscanf(ascii_value, "%x", &value);
393 if (x != 1) 395 if (x != 1)
394 return -EINVAL; 396 return -EINVAL;
395 pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value); 397 pr_debug("%s, %d, 0x%x\n", ascii_value, x, value);
396 } else 398 } else
397 return -EINVAL; 399 return -EINVAL;
398 400
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index c77963938bca..8dc31e02ae12 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -338,7 +338,6 @@ int hibernation_snapshot(int platform_mode)
338 goto Close; 338 goto Close;
339 339
340 suspend_console(); 340 suspend_console();
341 hibernation_freeze_swap();
342 saved_mask = clear_gfp_allowed_mask(GFP_IOFS); 341 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
343 error = dpm_suspend_start(PMSG_FREEZE); 342 error = dpm_suspend_start(PMSG_FREEZE);
344 if (error) 343 if (error)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5e7edfb05e66..d3f795f01bbc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1086,7 +1086,6 @@ void swsusp_free(void)
1086 buffer = NULL; 1086 buffer = NULL;
1087 alloc_normal = 0; 1087 alloc_normal = 0;
1088 alloc_highmem = 0; 1088 alloc_highmem = 0;
1089 hibernation_thaw_swap();
1090} 1089}
1091 1090
1092/* Helper functions used for the shrinking of memory. */ 1091/* Helper functions used for the shrinking of memory. */
@@ -1122,9 +1121,19 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
1122 return nr_alloc; 1121 return nr_alloc;
1123} 1122}
1124 1123
1125static unsigned long preallocate_image_memory(unsigned long nr_pages) 1124static unsigned long preallocate_image_memory(unsigned long nr_pages,
1125 unsigned long avail_normal)
1126{ 1126{
1127 return preallocate_image_pages(nr_pages, GFP_IMAGE); 1127 unsigned long alloc;
1128
1129 if (avail_normal <= alloc_normal)
1130 return 0;
1131
1132 alloc = avail_normal - alloc_normal;
1133 if (nr_pages < alloc)
1134 alloc = nr_pages;
1135
1136 return preallocate_image_pages(alloc, GFP_IMAGE);
1128} 1137}
1129 1138
1130#ifdef CONFIG_HIGHMEM 1139#ifdef CONFIG_HIGHMEM
@@ -1170,15 +1179,22 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1170 */ 1179 */
1171static void free_unnecessary_pages(void) 1180static void free_unnecessary_pages(void)
1172{ 1181{
1173 unsigned long save_highmem, to_free_normal, to_free_highmem; 1182 unsigned long save, to_free_normal, to_free_highmem;
1174 1183
1175 to_free_normal = alloc_normal - count_data_pages(); 1184 save = count_data_pages();
1176 save_highmem = count_highmem_pages(); 1185 if (alloc_normal >= save) {
1177 if (alloc_highmem > save_highmem) { 1186 to_free_normal = alloc_normal - save;
1178 to_free_highmem = alloc_highmem - save_highmem; 1187 save = 0;
1188 } else {
1189 to_free_normal = 0;
1190 save -= alloc_normal;
1191 }
1192 save += count_highmem_pages();
1193 if (alloc_highmem >= save) {
1194 to_free_highmem = alloc_highmem - save;
1179 } else { 1195 } else {
1180 to_free_highmem = 0; 1196 to_free_highmem = 0;
1181 to_free_normal -= save_highmem - alloc_highmem; 1197 to_free_normal -= save - alloc_highmem;
1182 } 1198 }
1183 1199
1184 memory_bm_position_reset(&copy_bm); 1200 memory_bm_position_reset(&copy_bm);
@@ -1259,7 +1275,7 @@ int hibernate_preallocate_memory(void)
1259{ 1275{
1260 struct zone *zone; 1276 struct zone *zone;
1261 unsigned long saveable, size, max_size, count, highmem, pages = 0; 1277 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1262 unsigned long alloc, save_highmem, pages_highmem; 1278 unsigned long alloc, save_highmem, pages_highmem, avail_normal;
1263 struct timeval start, stop; 1279 struct timeval start, stop;
1264 int error; 1280 int error;
1265 1281
@@ -1296,6 +1312,7 @@ int hibernate_preallocate_memory(void)
1296 else 1312 else
1297 count += zone_page_state(zone, NR_FREE_PAGES); 1313 count += zone_page_state(zone, NR_FREE_PAGES);
1298 } 1314 }
1315 avail_normal = count;
1299 count += highmem; 1316 count += highmem;
1300 count -= totalreserve_pages; 1317 count -= totalreserve_pages;
1301 1318
@@ -1310,12 +1327,21 @@ int hibernate_preallocate_memory(void)
1310 */ 1327 */
1311 if (size >= saveable) { 1328 if (size >= saveable) {
1312 pages = preallocate_image_highmem(save_highmem); 1329 pages = preallocate_image_highmem(save_highmem);
1313 pages += preallocate_image_memory(saveable - pages); 1330 pages += preallocate_image_memory(saveable - pages, avail_normal);
1314 goto out; 1331 goto out;
1315 } 1332 }
1316 1333
1317 /* Estimate the minimum size of the image. */ 1334 /* Estimate the minimum size of the image. */
1318 pages = minimum_image_size(saveable); 1335 pages = minimum_image_size(saveable);
1336 /*
1337 * To avoid excessive pressure on the normal zone, leave room in it to
1338 * accommodate an image of the minimum size (unless it's already too
1339 * small, in which case don't preallocate pages from it at all).
1340 */
1341 if (avail_normal > pages)
1342 avail_normal -= pages;
1343 else
1344 avail_normal = 0;
1319 if (size < pages) 1345 if (size < pages)
1320 size = min_t(unsigned long, pages, max_size); 1346 size = min_t(unsigned long, pages, max_size);
1321 1347
@@ -1336,16 +1362,34 @@ int hibernate_preallocate_memory(void)
1336 */ 1362 */
1337 pages_highmem = preallocate_image_highmem(highmem / 2); 1363 pages_highmem = preallocate_image_highmem(highmem / 2);
1338 alloc = (count - max_size) - pages_highmem; 1364 alloc = (count - max_size) - pages_highmem;
1339 pages = preallocate_image_memory(alloc); 1365 pages = preallocate_image_memory(alloc, avail_normal);
1340 if (pages < alloc) 1366 if (pages < alloc) {
1341 goto err_out; 1367 /* We have exhausted non-highmem pages, try highmem. */
1342 size = max_size - size; 1368 alloc -= pages;
1343 alloc = size; 1369 pages += pages_highmem;
1344 size = preallocate_highmem_fraction(size, highmem, count); 1370 pages_highmem = preallocate_image_highmem(alloc);
1345 pages_highmem += size; 1371 if (pages_highmem < alloc)
1346 alloc -= size; 1372 goto err_out;
1347 pages += preallocate_image_memory(alloc); 1373 pages += pages_highmem;
1348 pages += pages_highmem; 1374 /*
1375 * size is the desired number of saveable pages to leave in
1376 * memory, so try to preallocate (all memory - size) pages.
1377 */
1378 alloc = (count - pages) - size;
1379 pages += preallocate_image_highmem(alloc);
1380 } else {
1381 /*
1382 * There are approximately max_size saveable pages at this point
1383 * and we want to reduce this number down to size.
1384 */
1385 alloc = max_size - size;
1386 size = preallocate_highmem_fraction(alloc, highmem, count);
1387 pages_highmem += size;
1388 alloc -= size;
1389 size = preallocate_image_memory(alloc, avail_normal);
1390 pages_highmem += preallocate_image_highmem(alloc - size);
1391 pages += pages_highmem + size;
1392 }
1349 1393
1350 /* 1394 /*
1351 * We only need as many page frames for the image as there are saveable 1395 * We only need as many page frames for the image as there are saveable
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 5d0059eed3e4..e6a5bdf61a37 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -136,10 +136,10 @@ sector_t alloc_swapdev_block(int swap)
136{ 136{
137 unsigned long offset; 137 unsigned long offset;
138 138
139 offset = swp_offset(get_swap_for_hibernation(swap)); 139 offset = swp_offset(get_swap_page_of_type(swap));
140 if (offset) { 140 if (offset) {
141 if (swsusp_extents_insert(offset)) 141 if (swsusp_extents_insert(offset))
142 swap_free_for_hibernation(swp_entry(swap, offset)); 142 swap_free(swp_entry(swap, offset));
143 else 143 else
144 return swapdev_block(swap, offset); 144 return swapdev_block(swap, offset);
145 } 145 }
@@ -163,7 +163,7 @@ void free_all_swap_pages(int swap)
163 ext = container_of(node, struct swsusp_extent, node); 163 ext = container_of(node, struct swsusp_extent, node);
164 rb_erase(node, &swsusp_extents); 164 rb_erase(node, &swsusp_extents);
165 for (offset = ext->start; offset <= ext->end; offset++) 165 for (offset = ext->start; offset <= ext->end; offset++)
166 swap_free_for_hibernation(swp_entry(swap, offset)); 166 swap_free(swp_entry(swap, offset));
167 167
168 kfree(ext); 168 kfree(ext);
169 } 169 }
diff --git a/kernel/printk.c b/kernel/printk.c
index 8fe465ac008a..2531017795f6 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -85,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress);
85 * provides serialisation for access to the entire console 85 * provides serialisation for access to the entire console
86 * driver system. 86 * driver system.
87 */ 87 */
88static DECLARE_MUTEX(console_sem); 88static DEFINE_SEMAPHORE(console_sem);
89struct console *console_drivers; 89struct console *console_drivers;
90EXPORT_SYMBOL_GPL(console_drivers); 90EXPORT_SYMBOL_GPL(console_drivers);
91 91
@@ -556,7 +556,7 @@ static void zap_locks(void)
556 /* If a crash is occurring, make sure we can't deadlock */ 556 /* If a crash is occurring, make sure we can't deadlock */
557 spin_lock_init(&logbuf_lock); 557 spin_lock_init(&logbuf_lock);
558 /* And make sure that we print immediately */ 558 /* And make sure that we print immediately */
559 init_MUTEX(&console_sem); 559 sema_init(&console_sem, 1);
560} 560}
561 561
562#if defined(CONFIG_PRINTK_TIME) 562#if defined(CONFIG_PRINTK_TIME)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 4d169835fb36..a23a57a976d1 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void)
73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); 73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
74 74
75/** 75/**
76 * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? 76 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
77 * 77 *
78 * Check for bottom half being disabled, which covers both the 78 * Check for bottom half being disabled, which covers both the
79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses 79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses
80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) 80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
81 * will show the situation. 81 * will show the situation. This is useful for debug checks in functions
82 * that require that they be called within an RCU read-side critical
83 * section.
82 * 84 *
83 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. 85 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
84 */ 86 */
@@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void)
86{ 88{
87 if (!debug_lockdep_rcu_enabled()) 89 if (!debug_lockdep_rcu_enabled())
88 return 1; 90 return 1;
89 return in_softirq(); 91 return in_softirq() || irqs_disabled();
90} 92}
91EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 93EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
92 94
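
The widened rcu_read_lock_bh_held() is normally consumed as a debug condition in rcu_dereference_check()-style accessors. A hedged sketch with an invented demo_ structure (not from the patch), assuming the caller runs with BH disabled or hard irqs off:

	#include <linux/rcupdate.h>

	struct demo_cfg {
		int value;
	};

	static struct demo_cfg __rcu *demo_cfg_ptr;

	/*
	 * Caller is assumed to hold off BH (e.g. via rcu_read_lock_bh()) or to
	 * have irqs disabled; with CONFIG_PROVE_RCU, lockdep now accepts both,
	 * matching the widened check above.
	 */
	static int demo_read_value(void)
	{
		struct demo_cfg *cfg;

		cfg = rcu_dereference_check(demo_cfg_ptr, rcu_read_lock_bh_held());
		return cfg ? cfg->value : 0;
	}

rcu_dereference_bh() exists for the common BH-reader case and avoids spelling out the condition by hand.
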
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 196ec02f8be0..d806735342ac 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active); 59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61 61
62/* Forward declarations for rcutiny_plugin.h. */
63static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
64static void __call_rcu(struct rcu_head *head,
65 void (*func)(struct rcu_head *rcu),
66 struct rcu_ctrlblk *rcp);
67
68#include "rcutiny_plugin.h"
69
62#ifdef CONFIG_NO_HZ 70#ifdef CONFIG_NO_HZ
63 71
64static long rcu_dynticks_nesting = 1; 72static long rcu_dynticks_nesting = 1;
@@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user)
140 rcu_sched_qs(cpu); 148 rcu_sched_qs(cpu);
141 else if (!in_softirq()) 149 else if (!in_softirq())
142 rcu_bh_qs(cpu); 150 rcu_bh_qs(cpu);
151 rcu_preempt_check_callbacks();
143} 152}
144 153
145/* 154/*
@@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
162 *rcp->donetail = NULL; 171 *rcp->donetail = NULL;
163 if (rcp->curtail == rcp->donetail) 172 if (rcp->curtail == rcp->donetail)
164 rcp->curtail = &rcp->rcucblist; 173 rcp->curtail = &rcp->rcucblist;
174 rcu_preempt_remove_callbacks(rcp);
165 rcp->donetail = &rcp->rcucblist; 175 rcp->donetail = &rcp->rcucblist;
166 local_irq_restore(flags); 176 local_irq_restore(flags);
167 177
@@ -182,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
182{ 192{
183 __rcu_process_callbacks(&rcu_sched_ctrlblk); 193 __rcu_process_callbacks(&rcu_sched_ctrlblk);
184 __rcu_process_callbacks(&rcu_bh_ctrlblk); 194 __rcu_process_callbacks(&rcu_bh_ctrlblk);
195 rcu_preempt_process_callbacks();
185} 196}
186 197
187/* 198/*
@@ -223,15 +234,15 @@ static void __call_rcu(struct rcu_head *head,
223} 234}
224 235
225/* 236/*
226 * Post an RCU callback to be invoked after the end of an RCU grace 237 * Post an RCU callback to be invoked after the end of an RCU-sched grace
227 * period. But since we have but one CPU, that would be after any 238 * period. But since we have but one CPU, that would be after any
228 * quiescent state. 239 * quiescent state.
229 */ 240 */
230void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 241void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
231{ 242{
232 __call_rcu(head, func, &rcu_sched_ctrlblk); 243 __call_rcu(head, func, &rcu_sched_ctrlblk);
233} 244}
234EXPORT_SYMBOL_GPL(call_rcu); 245EXPORT_SYMBOL_GPL(call_rcu_sched);
235 246
236/* 247/*
237 * Post an RCU bottom-half callback to be invoked after any subsequent 248 * Post an RCU bottom-half callback to be invoked after any subsequent
@@ -243,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
243} 254}
244EXPORT_SYMBOL_GPL(call_rcu_bh); 255EXPORT_SYMBOL_GPL(call_rcu_bh);
245 256
246void rcu_barrier(void)
247{
248 struct rcu_synchronize rcu;
249
250 init_rcu_head_on_stack(&rcu.head);
251 init_completion(&rcu.completion);
252 /* Will wake me after RCU finished. */
253 call_rcu(&rcu.head, wakeme_after_rcu);
254 /* Wait for it. */
255 wait_for_completion(&rcu.completion);
256 destroy_rcu_head_on_stack(&rcu.head);
257}
258EXPORT_SYMBOL_GPL(rcu_barrier);
259
260void rcu_barrier_bh(void) 257void rcu_barrier_bh(void)
261{ 258{
262 struct rcu_synchronize rcu; 259 struct rcu_synchronize rcu;
@@ -289,5 +286,3 @@ void __init rcu_init(void)
289{ 286{
290 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 287 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
291} 288}
292
293#include "rcutiny_plugin.h"
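
With the hunk above, rcutiny.c exports call_rcu_sched() directly, while TINY_PREEMPT_RCU supplies its own call_rcu() in rcutiny_plugin.h below. Usage is the same as call_rcu(); a minimal illustrative caller, with invented demo_* names:

	#include <linux/kernel.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct demo_node {
		int value;
		struct rcu_head rcu;
	};

	static void demo_node_free_rcu(struct rcu_head *head)
	{
		kfree(container_of(head, struct demo_node, rcu));
	}

	static void demo_node_retire(struct demo_node *p)
	{
		/* Defer the kfree() until all pre-existing readers are done. */
		call_rcu_sched(&p->rcu, demo_node_free_rcu);
	}
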
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index d223a92bc742..6ceca4f745ff 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -17,11 +17,587 @@
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 * 19 *
20 * Copyright IBM Corporation, 2009 20 * Copyright (c) 2010 Linaro
21 * 21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25#ifdef CONFIG_TINY_PREEMPT_RCU
26
27#include <linux/delay.h>
28
29/* Global control variables for preemptible RCU. */
30struct rcu_preempt_ctrlblk {
31 struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
32 struct rcu_head **nexttail;
33 /* Tasks blocked in a preemptible RCU */
34 /* read-side critical section while a */
35 /* preemptible-RCU grace period is in */
36 /* progress must wait for a later grace */
37 /* period. This pointer points to the */
38 /* ->next pointer of the last task that */
39 /* must wait for a later grace period, or */
40 /* to &->rcb.rcucblist if there is no */
41 /* such task. */
42 struct list_head blkd_tasks;
43 /* Tasks blocked in RCU read-side critical */
44 /* section. Tasks are placed at the head */
45 /* of this list and age towards the tail. */
46 struct list_head *gp_tasks;
47 /* Pointer to the first task blocking the */
48 /* current grace period, or NULL if there */
49 /* is no such task. */
50 struct list_head *exp_tasks;
51 /* Pointer to first task blocking the */
52 /* current expedited grace period, or NULL */
53 /* if there is no such task. If there */
54 /* is no current expedited grace period, */
55 /* then there cannot be any such task. */
56 u8 gpnum; /* Current grace period. */
57 u8 gpcpu; /* Last grace period blocked by the CPU. */
58 u8 completed; /* Last grace period completed. */
59 /* If all three are equal, RCU is idle. */
60};
61
62static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
63 .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
64 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
65 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
66 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
67};
68
69static int rcu_preempted_readers_exp(void);
70static void rcu_report_exp_done(void);
71
72/*
73 * Return true if the CPU has not yet responded to the current grace period.
74 */
75static int rcu_cpu_blocking_cur_gp(void)
76{
77 return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
78}
79
80/*
81 * Check for a running RCU reader. Because there is only one CPU,
82 * there can be but one running RCU reader at a time. ;-)
83 */
84static int rcu_preempt_running_reader(void)
85{
86 return current->rcu_read_lock_nesting;
87}
88
89/*
90 * Check for preempted RCU readers blocking any grace period.
91 * If the caller needs a reliable answer, it must disable hard irqs.
92 */
93static int rcu_preempt_blocked_readers_any(void)
94{
95 return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
96}
97
98/*
99 * Check for preempted RCU readers blocking the current grace period.
100 * If the caller needs a reliable answer, it must disable hard irqs.
101 */
102static int rcu_preempt_blocked_readers_cgp(void)
103{
104 return rcu_preempt_ctrlblk.gp_tasks != NULL;
105}
106
107/*
108 * Return true if another preemptible-RCU grace period is needed.
109 */
110static int rcu_preempt_needs_another_gp(void)
111{
112 return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
113}
114
115/*
116 * Return true if a preemptible-RCU grace period is in progress.
117 * The caller must disable hardirqs.
118 */
119static int rcu_preempt_gp_in_progress(void)
120{
121 return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
122}
123
124/*
125 * Record a preemptible-RCU quiescent state for the specified CPU. Note
126 * that this just means that the task currently running on the CPU is
127 * in a quiescent state. There might be any number of tasks blocked
128 * while in an RCU read-side critical section.
129 *
130 * Unlike the other rcu_*_qs() functions, callers to this function
131 * must disable irqs in order to protect the assignment to
132 * ->rcu_read_unlock_special.
133 *
134 * Because this is a single-CPU implementation, the only way a grace
135 * period can end is if the CPU is in a quiescent state. The reason is
136 * that a blocked preemptible-RCU reader can exit its critical section
137 * only if the CPU is running it at the time. Therefore, when the
138 * last task blocking the current grace period exits its RCU read-side
139 * critical section, neither the CPU nor blocked tasks will be stopping
140 * the current grace period. (In contrast, SMP implementations
141 * might have CPUs running in RCU read-side critical sections that
142 * block later grace periods -- but this is not possible given only
143 * one CPU.)
144 */
145static void rcu_preempt_cpu_qs(void)
146{
147 /* Record both CPU and task as having responded to current GP. */
148 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
149 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
150
151 /*
152 * If there is no GP, or if blocked readers are still blocking GP,
153 * then there is nothing more to do.
154 */
155 if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
156 return;
157
158 /* Advance callbacks. */
159 rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
160 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
161 rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
162
163 /* If there are no blocked readers, next GP is done instantly. */
164 if (!rcu_preempt_blocked_readers_any())
165 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
166
167 /* If there are done callbacks, make RCU_SOFTIRQ process them. */
168 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
169 raise_softirq(RCU_SOFTIRQ);
170}
171
172/*
173 * Start a new RCU grace period if warranted. Hard irqs must be disabled.
174 */
175static void rcu_preempt_start_gp(void)
176{
177 if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
178
179 /* Official start of GP. */
180 rcu_preempt_ctrlblk.gpnum++;
181
182 /* Any blocked RCU readers block new GP. */
183 if (rcu_preempt_blocked_readers_any())
184 rcu_preempt_ctrlblk.gp_tasks =
185 rcu_preempt_ctrlblk.blkd_tasks.next;
186
187 /* If there is no running reader, CPU is done with GP. */
188 if (!rcu_preempt_running_reader())
189 rcu_preempt_cpu_qs();
190 }
191}
192
193/*
194 * We have entered the scheduler, and the current task might soon be
195 * context-switched away from. If this task is in an RCU read-side
196 * critical section, we will no longer be able to rely on the CPU to
197 * record that fact, so we enqueue the task on the blkd_tasks list.
198 * If the task started after the current grace period began, as recorded
199 * by ->gpcpu, we enqueue at the beginning of the list. Otherwise
200 * before the element referenced by ->gp_tasks (or at the tail if
201 * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
202 * The task will dequeue itself when it exits the outermost enclosing
203 * RCU read-side critical section. Therefore, the current grace period
204 * cannot be permitted to complete until the ->gp_tasks pointer becomes
205 * NULL.
206 *
207 * Caller must disable preemption.
208 */
209void rcu_preempt_note_context_switch(void)
210{
211 struct task_struct *t = current;
212 unsigned long flags;
213
214 local_irq_save(flags); /* must exclude scheduler_tick(). */
215 if (rcu_preempt_running_reader() &&
216 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
217
218 /* Possibly blocking in an RCU read-side critical section. */
219 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
220
221 /*
222 * If this CPU has already checked in, then this task
223 * will hold up the next grace period rather than the
224 * current grace period. Queue the task accordingly.
225 * If the task is queued for the current grace period
226 * (i.e., this CPU has not yet passed through a quiescent
227 * state for the current grace period), then as long
228 * as that task remains queued, the current grace period
229 * cannot end.
230 */
231 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
232 if (rcu_cpu_blocking_cur_gp())
233 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
234 }
235
236 /*
237 * Either we were not in an RCU read-side critical section to
238 * begin with, or we have now recorded that critical section
239 * globally. Either way, we can now note a quiescent state
240 * for this CPU. Again, if we were in an RCU read-side critical
241 * section, and if that critical section was blocking the current
242 * grace period, then the fact that the task has been enqueued
243 * means that the current grace period continues to be blocked.
244 */
245 rcu_preempt_cpu_qs();
246 local_irq_restore(flags);
247}
248
249/*
250 * Tiny-preemptible RCU implementation for rcu_read_lock().
251 * Just increment ->rcu_read_lock_nesting, shared state will be updated
252 * if we block.
253 */
254void __rcu_read_lock(void)
255{
256 current->rcu_read_lock_nesting++;
257 barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
258}
259EXPORT_SYMBOL_GPL(__rcu_read_lock);
260
261/*
262 * Handle special cases during rcu_read_unlock(), such as needing to
263 * notify RCU core processing or task having blocked during the RCU
264 * read-side critical section.
265 */
266static void rcu_read_unlock_special(struct task_struct *t)
267{
268 int empty;
269 int empty_exp;
270 unsigned long flags;
271 struct list_head *np;
272 int special;
273
274 /*
275 * NMI handlers cannot block and cannot safely manipulate state.
276 * They therefore cannot possibly be special, so just leave.
277 */
278 if (in_nmi())
279 return;
280
281 local_irq_save(flags);
282
283 /*
284 * If RCU core is waiting for this CPU to exit critical section,
285 * let it know that we have done so.
286 */
287 special = t->rcu_read_unlock_special;
288 if (special & RCU_READ_UNLOCK_NEED_QS)
289 rcu_preempt_cpu_qs();
290
291 /* Hardware IRQ handlers cannot block. */
292 if (in_irq()) {
293 local_irq_restore(flags);
294 return;
295 }
296
297 /* Clean up if blocked during RCU read-side critical section. */
298 if (special & RCU_READ_UNLOCK_BLOCKED) {
299 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
300
301 /*
302 * Remove this task from the ->blkd_tasks list and adjust
303 * any pointers that might have been referencing it.
304 */
305 empty = !rcu_preempt_blocked_readers_cgp();
306 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
307 np = t->rcu_node_entry.next;
308 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
309 np = NULL;
310 list_del(&t->rcu_node_entry);
311 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
312 rcu_preempt_ctrlblk.gp_tasks = np;
313 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
314 rcu_preempt_ctrlblk.exp_tasks = np;
315 INIT_LIST_HEAD(&t->rcu_node_entry);
316
317 /*
318 * If this was the last task on the current list, and if
319 * we aren't waiting on the CPU, report the quiescent state
320 * and start a new grace period if needed.
321 */
322 if (!empty && !rcu_preempt_blocked_readers_cgp()) {
323 rcu_preempt_cpu_qs();
324 rcu_preempt_start_gp();
325 }
326
327 /*
328 * If this was the last task on the expedited lists,
329 * then we need to wake up the waiting task.
330 */
331 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
332 rcu_report_exp_done();
333 }
334 local_irq_restore(flags);
335}
336
337/*
338 * Tiny-preemptible RCU implementation for rcu_read_unlock().
339 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
340 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
341 * invoke rcu_read_unlock_special() to clean up after a context switch
342 * in an RCU read-side critical section and other special cases.
343 */
344void __rcu_read_unlock(void)
345{
346 struct task_struct *t = current;
347
348 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
349 --t->rcu_read_lock_nesting;
350 barrier(); /* decrement before load of ->rcu_read_unlock_special */
351 if (t->rcu_read_lock_nesting == 0 &&
352 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
353 rcu_read_unlock_special(t);
354#ifdef CONFIG_PROVE_LOCKING
355 WARN_ON_ONCE(t->rcu_read_lock_nesting < 0);
356#endif /* #ifdef CONFIG_PROVE_LOCKING */
357}
358EXPORT_SYMBOL_GPL(__rcu_read_unlock);
359
360/*
361 * Check for a quiescent state from the current CPU. When a task blocks,
362 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
363 * checked elsewhere. This is called from the scheduling-clock interrupt.
364 *
365 * Caller must disable hard irqs.
366 */
367static void rcu_preempt_check_callbacks(void)
368{
369 struct task_struct *t = current;
370
371 if (rcu_preempt_gp_in_progress() &&
372 (!rcu_preempt_running_reader() ||
373 !rcu_cpu_blocking_cur_gp()))
374 rcu_preempt_cpu_qs();
375 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
376 rcu_preempt_ctrlblk.rcb.donetail)
377 raise_softirq(RCU_SOFTIRQ);
378 if (rcu_preempt_gp_in_progress() &&
379 rcu_cpu_blocking_cur_gp() &&
380 rcu_preempt_running_reader())
381 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
382}
383
384/*
385 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
386 * update, so this is invoked from __rcu_process_callbacks() to
387 * handle that case. Of course, it is invoked for all flavors of
388 * RCU, but RCU callbacks can appear only on one of the lists, and
389 * neither ->nexttail nor ->donetail can possibly be NULL, so there
390 * is no need for an explicit check.
391 */
392static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
393{
394 if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
395 rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
396}
397
398/*
399 * Process callbacks for preemptible RCU.
400 */
401static void rcu_preempt_process_callbacks(void)
402{
403 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
404}
405
406/*
407 * Queue a preemptible-RCU callback for invocation after a grace period.
408 */
409void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
410{
411 unsigned long flags;
412
413 debug_rcu_head_queue(head);
414 head->func = func;
415 head->next = NULL;
416
417 local_irq_save(flags);
418 *rcu_preempt_ctrlblk.nexttail = head;
419 rcu_preempt_ctrlblk.nexttail = &head->next;
420 rcu_preempt_start_gp(); /* checks to see if GP needed. */
421 local_irq_restore(flags);
422}
423EXPORT_SYMBOL_GPL(call_rcu);
424
425void rcu_barrier(void)
426{
427 struct rcu_synchronize rcu;
428
429 init_rcu_head_on_stack(&rcu.head);
430 init_completion(&rcu.completion);
431 /* Will wake me after RCU finished. */
432 call_rcu(&rcu.head, wakeme_after_rcu);
433 /* Wait for it. */
434 wait_for_completion(&rcu.completion);
435 destroy_rcu_head_on_stack(&rcu.head);
436}
437EXPORT_SYMBOL_GPL(rcu_barrier);
438
439/*
440 * synchronize_rcu - wait until a grace period has elapsed.
441 *
442 * Control will return to the caller some time after a full grace
443 * period has elapsed, in other words after all currently executing RCU
444 * read-side critical sections have completed. RCU read-side critical
445 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
446 * and may be nested.
447 */
448void synchronize_rcu(void)
449{
450#ifdef CONFIG_DEBUG_LOCK_ALLOC
451 if (!rcu_scheduler_active)
452 return;
453#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
454
455 WARN_ON_ONCE(rcu_preempt_running_reader());
456 if (!rcu_preempt_blocked_readers_any())
457 return;
458
459 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
460 rcu_barrier();
461}
462EXPORT_SYMBOL_GPL(synchronize_rcu);
463
464static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
465static unsigned long sync_rcu_preempt_exp_count;
466static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
467
468/*
469 * Return non-zero if there are any tasks in RCU read-side critical
470 * sections blocking the current preemptible-RCU expedited grace period.
471 * If there is no preemptible-RCU expedited grace period currently in
472 * progress, returns zero unconditionally.
473 */
474static int rcu_preempted_readers_exp(void)
475{
476 return rcu_preempt_ctrlblk.exp_tasks != NULL;
477}
478
479/*
480 * Report the exit from RCU read-side critical section for the last task
481 * that queued itself during or before the current expedited preemptible-RCU
482 * grace period.
483 */
484static void rcu_report_exp_done(void)
485{
486 wake_up(&sync_rcu_preempt_exp_wq);
487}
488
489/*
490 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
491 * is to rely on the fact that there is but one CPU, and that it is
492 * illegal for a task to invoke synchronize_rcu_expedited() while in a
493 * preemptible-RCU read-side critical section. Therefore, any such
494 * critical sections must correspond to blocked tasks, which must therefore
495 * be on the ->blkd_tasks list. So just record the current head of the
496 * list in the ->exp_tasks pointer, and wait for all tasks including and
497 * after the task pointed to by ->exp_tasks to drain.
498 */
499void synchronize_rcu_expedited(void)
500{
501 unsigned long flags;
502 struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
503 unsigned long snap;
504
505 barrier(); /* ensure prior action seen before grace period. */
506
507 WARN_ON_ONCE(rcu_preempt_running_reader());
508
509 /*
510 * Acquire lock so that there is only one preemptible RCU grace
511 * period in flight. Of course, if someone does the expedited
512 * grace period for us while we are acquiring the lock, just leave.
513 */
514 snap = sync_rcu_preempt_exp_count + 1;
515 mutex_lock(&sync_rcu_preempt_exp_mutex);
516 if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
517 goto unlock_mb_ret; /* Others did our work for us. */
518
519 local_irq_save(flags);
520
521 /*
522 * All RCU readers have to already be on blkd_tasks because
523 * we cannot legally be executing in an RCU read-side critical
524 * section.
525 */
526
527 /* Snapshot current head of ->blkd_tasks list. */
528 rpcp->exp_tasks = rpcp->blkd_tasks.next;
529 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
530 rpcp->exp_tasks = NULL;
531 local_irq_restore(flags);
532
533 /* Wait for tail of ->blkd_tasks list to drain. */
534 if (rcu_preempted_readers_exp())
535 wait_event(sync_rcu_preempt_exp_wq,
536 !rcu_preempted_readers_exp());
537
538 /* Clean up and exit. */
539 barrier(); /* ensure expedited GP seen before counter increment. */
540 sync_rcu_preempt_exp_count++;
541unlock_mb_ret:
542 mutex_unlock(&sync_rcu_preempt_exp_mutex);
543 barrier(); /* ensure subsequent action seen after grace period. */
544}
545EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
546
547/*
548 * Does preemptible RCU need the CPU to stay out of dynticks mode?
549 */
550int rcu_preempt_needs_cpu(void)
551{
552 if (!rcu_preempt_running_reader())
553 rcu_preempt_cpu_qs();
554 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
555}
556
557/*
558 * Check for a task exiting while in a preemptible-RCU read-side
559 * critical section, clean up if so. No need to issue warnings,
560 * as debug_check_no_locks_held() already does this if lockdep
561 * is enabled.
562 */
563void exit_rcu(void)
564{
565 struct task_struct *t = current;
566
567 if (t->rcu_read_lock_nesting == 0)
568 return;
569 t->rcu_read_lock_nesting = 1;
570 rcu_read_unlock();
571}
572
573#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
574
575/*
576 * Because preemptible RCU does not exist, it never has any callbacks
577 * to check.
578 */
579static void rcu_preempt_check_callbacks(void)
580{
581}
582
583/*
584 * Because preemptible RCU does not exist, it never has any callbacks
585 * to remove.
586 */
587static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
588{
589}
590
591/*
592 * Because preemptible RCU does not exist, it never has any callbacks
593 * to process.
594 */
595static void rcu_preempt_process_callbacks(void)
596{
597}
598
599#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
600
25#ifdef CONFIG_DEBUG_LOCK_ALLOC 601#ifdef CONFIG_DEBUG_LOCK_ALLOC
26 602
27#include <linux/kernel_stat.h> 603#include <linux/kernel_stat.h>
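
The ->donetail/->curtail/->nexttail members introduced above are tail pointers into a single NULL-terminated callback list, so advancing a whole segment at the end of a grace period is just a pointer assignment (see rcu_preempt_cpu_qs()). The following standalone userspace sketch, with invented names, shows the same layout in miniature; it illustrates the technique and is not kernel code:

	#include <stddef.h>
	#include <stdio.h>

	struct cb {
		struct cb *next;
		const char *name;
	};

	struct ctrl {
		struct cb *list;	/* head of the one and only callback list */
		struct cb **donetail;	/* ->next of last callback ready to invoke */
		struct cb **curtail;	/* ->next of last callback waiting for the current GP */
		struct cb **nexttail;	/* ->next of last callback waiting for the next GP */
	};

	static void demo_call(struct ctrl *c, struct cb *cb)	/* cf. call_rcu() above */
	{
		cb->next = NULL;
		*c->nexttail = cb;		/* append at the very end of the list */
		c->nexttail = &cb->next;
	}

	static void demo_gp_end(struct ctrl *c)			/* cf. rcu_preempt_cpu_qs() */
	{
		c->donetail = c->curtail;	/* current-GP callbacks are now done */
		c->curtail = c->nexttail;	/* next-GP callbacks wait for the new GP */
	}

	int main(void)
	{
		struct cb a = { .name = "a" }, b = { .name = "b" };
		struct ctrl c = { .list = NULL };
		struct cb *p;

		c.donetail = c.curtail = c.nexttail = &c.list;
		demo_call(&c, &a);
		demo_call(&c, &b);
		demo_gp_end(&c);	/* a and b advance to the current-GP segment */
		demo_gp_end(&c);	/* that GP ends: a and b are now in the done segment */
		for (p = c.list; p != *c.donetail; p = p->next)
			printf("invoke %s\n", p->name);
		return 0;
	}

Invoking the done segment then mirrors __rcu_process_callbacks() above: snip the list at *donetail and walk the detached portion with irqs enabled.
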
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 2e2726d790b9..9d8e8fb2515f 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -120,7 +120,7 @@ struct rcu_torture {
120}; 120};
121 121
122static LIST_HEAD(rcu_torture_freelist); 122static LIST_HEAD(rcu_torture_freelist);
123static struct rcu_torture *rcu_torture_current; 123static struct rcu_torture __rcu *rcu_torture_current;
124static long rcu_torture_current_version; 124static long rcu_torture_current_version;
125static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 125static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
126static DEFINE_SPINLOCK(rcu_torture_lock); 126static DEFINE_SPINLOCK(rcu_torture_lock);
@@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
153#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ 153#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
154#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ 154#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
155static int fullstop = FULLSTOP_RMMOD; 155static int fullstop = FULLSTOP_RMMOD;
156DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ 156/*
157 /* of kthreads. */ 157 * Protect fullstop transitions and spawning of kthreads.
158 */
159static DEFINE_MUTEX(fullstop_mutex);
158 160
159/* 161/*
160 * Detect and respond to a system shutdown. 162 * Detect and respond to a system shutdown.
@@ -303,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
303 mdelay(longdelay_ms); 305 mdelay(longdelay_ms);
304 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) 306 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
305 udelay(shortdelay_us); 307 udelay(shortdelay_us);
308#ifdef CONFIG_PREEMPT
309 if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
310 preempt_schedule(); /* No QS if preempt_disable() in effect */
311#endif
306} 312}
307 313
308static void rcu_torture_read_unlock(int idx) __releases(RCU) 314static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -536,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
536 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); 542 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
537 if (!delay) 543 if (!delay)
538 schedule_timeout_interruptible(longdelay); 544 schedule_timeout_interruptible(longdelay);
545 else
546 rcu_read_delay(rrsp);
539} 547}
540 548
541static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) 549static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
@@ -731,7 +739,8 @@ rcu_torture_writer(void *arg)
731 continue; 739 continue;
732 rp->rtort_pipe_count = 0; 740 rp->rtort_pipe_count = 0;
733 udelay(rcu_random(&rand) & 0x3ff); 741 udelay(rcu_random(&rand) & 0x3ff);
734 old_rp = rcu_torture_current; 742 old_rp = rcu_dereference_check(rcu_torture_current,
743 current == writer_task);
735 rp->rtort_mbtest = 1; 744 rp->rtort_mbtest = 1;
736 rcu_assign_pointer(rcu_torture_current, rp); 745 rcu_assign_pointer(rcu_torture_current, rp);
737 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ 746 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
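
The rcu_torture_current changes show the then-new sparse __rcu annotation at work: the writer may touch the pointer outside rcu_read_lock() only because it hands an explicit justification to rcu_dereference_check(). A hedged sketch of the idiom, with invented demo_* names:

	#include <linux/rcupdate.h>
	#include <linux/sched.h>

	struct demo_item {
		int x;
	};

	static struct demo_item __rcu *demo_current;
	static struct task_struct *demo_writer_task;

	static struct demo_item *demo_replace(struct demo_item *newp)
	{
		struct demo_item *oldp;

		/*
		 * Updater-side access without rcu_read_lock(): the second
		 * argument documents (and lets lockdep verify) why that is
		 * safe here.
		 */
		oldp = rcu_dereference_check(demo_current,
					     current == demo_writer_task);
		rcu_assign_pointer(demo_current, newp);
		return oldp;	/* caller must defer freeing past a grace period */
	}
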
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d5bc43976c5a..ccdc04c47981 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -143,6 +143,11 @@ module_param(blimit, int, 0);
143module_param(qhimark, int, 0); 143module_param(qhimark, int, 0);
144module_param(qlowmark, int, 0); 144module_param(qlowmark, int, 0);
145 145
146#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
147int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
148module_param(rcu_cpu_stall_suppress, int, 0644);
149#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
150
146static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 151static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
147static int rcu_pending(int cpu); 152static int rcu_pending(int cpu);
148 153
@@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
450 455
451#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 456#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
452 457
453int rcu_cpu_stall_panicking __read_mostly; 458int rcu_cpu_stall_suppress __read_mostly;
454 459
455static void record_gp_stall_check_time(struct rcu_state *rsp) 460static void record_gp_stall_check_time(struct rcu_state *rsp)
456{ 461{
@@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
482 rcu_print_task_stall(rnp); 487 rcu_print_task_stall(rnp);
483 raw_spin_unlock_irqrestore(&rnp->lock, flags); 488 raw_spin_unlock_irqrestore(&rnp->lock, flags);
484 489
485 /* OK, time to rat on our buddy... */ 490 /*
486 491 * OK, time to rat on our buddy...
492 * See Documentation/RCU/stallwarn.txt for info on how to debug
493 * RCU CPU stall warnings.
494 */
487 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", 495 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
488 rsp->name); 496 rsp->name);
489 rcu_for_each_leaf_node(rsp, rnp) { 497 rcu_for_each_leaf_node(rsp, rnp) {
@@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
512 unsigned long flags; 520 unsigned long flags;
513 struct rcu_node *rnp = rcu_get_root(rsp); 521 struct rcu_node *rnp = rcu_get_root(rsp);
514 522
523 /*
524 * OK, time to rat on ourselves...
525 * See Documentation/RCU/stallwarn.txt for info on how to debug
526 * RCU CPU stall warnings.
527 */
515 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 528 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
516 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 529 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
517 trigger_all_cpu_backtrace(); 530 trigger_all_cpu_backtrace();
@@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
530 long delta; 543 long delta;
531 struct rcu_node *rnp; 544 struct rcu_node *rnp;
532 545
533 if (rcu_cpu_stall_panicking) 546 if (rcu_cpu_stall_suppress)
534 return; 547 return;
535 delta = jiffies - rsp->jiffies_stall; 548 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
536 rnp = rdp->mynode; 549 rnp = rdp->mynode;
537 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { 550 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) {
538 551
539 /* We haven't checked in, so go dump stack. */ 552 /* We haven't checked in, so go dump stack. */
540 print_cpu_stall(rsp); 553 print_cpu_stall(rsp);
@@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
548 561
549static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) 562static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
550{ 563{
551 rcu_cpu_stall_panicking = 1; 564 rcu_cpu_stall_suppress = 1;
552 return NOTIFY_DONE; 565 return NOTIFY_DONE;
553} 566}
554 567
568/**
569 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
570 *
571 * Set the stall-warning timeout way off into the future, thus preventing
572 * any RCU CPU stall-warning messages from appearing in the current set of
573 * RCU grace periods.
574 *
575 * The caller must disable hard irqs.
576 */
577void rcu_cpu_stall_reset(void)
578{
579 rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
580 rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
581 rcu_preempt_stall_reset();
582}
583
555static struct notifier_block rcu_panic_block = { 584static struct notifier_block rcu_panic_block = {
556 .notifier_call = rcu_panic, 585 .notifier_call = rcu_panic,
557}; 586};
@@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
571{ 600{
572} 601}
573 602
603void rcu_cpu_stall_reset(void)
604{
605}
606
574static void __init check_cpu_stall_init(void) 607static void __init check_cpu_stall_init(void)
575{ 608{
576} 609}
@@ -712,7 +745,7 @@ static void
712rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 745rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
713 __releases(rcu_get_root(rsp)->lock) 746 __releases(rcu_get_root(rsp)->lock)
714{ 747{
715 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 748 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
716 struct rcu_node *rnp = rcu_get_root(rsp); 749 struct rcu_node *rnp = rcu_get_root(rsp);
717 750
718 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { 751 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
@@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
960static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 993static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
961{ 994{
962 int i; 995 int i;
963 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 996 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
964 997
965 if (rdp->nxtlist == NULL) 998 if (rdp->nxtlist == NULL)
966 return; /* irqs disabled, so comparison is stable. */ 999 return; /* irqs disabled, so comparison is stable. */
@@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
971 for (i = 0; i < RCU_NEXT_SIZE; i++) 1004 for (i = 0; i < RCU_NEXT_SIZE; i++)
972 rdp->nxttail[i] = &rdp->nxtlist; 1005 rdp->nxttail[i] = &rdp->nxtlist;
973 rsp->orphan_qlen += rdp->qlen; 1006 rsp->orphan_qlen += rdp->qlen;
1007 rdp->n_cbs_orphaned += rdp->qlen;
974 rdp->qlen = 0; 1008 rdp->qlen = 0;
975 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1009 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
976} 1010}
@@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
984 struct rcu_data *rdp; 1018 struct rcu_data *rdp;
985 1019
986 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1020 raw_spin_lock_irqsave(&rsp->onofflock, flags);
987 rdp = rsp->rda[smp_processor_id()]; 1021 rdp = this_cpu_ptr(rsp->rda);
988 if (rsp->orphan_cbs_list == NULL) { 1022 if (rsp->orphan_cbs_list == NULL) {
989 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1023 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
990 return; 1024 return;
@@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
992 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; 1026 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
993 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; 1027 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
994 rdp->qlen += rsp->orphan_qlen; 1028 rdp->qlen += rsp->orphan_qlen;
1029 rdp->n_cbs_adopted += rsp->orphan_qlen;
995 rsp->orphan_cbs_list = NULL; 1030 rsp->orphan_cbs_list = NULL;
996 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; 1031 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
997 rsp->orphan_qlen = 0; 1032 rsp->orphan_qlen = 0;
@@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1007 unsigned long flags; 1042 unsigned long flags;
1008 unsigned long mask; 1043 unsigned long mask;
1009 int need_report = 0; 1044 int need_report = 0;
1010 struct rcu_data *rdp = rsp->rda[cpu]; 1045 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1011 struct rcu_node *rnp; 1046 struct rcu_node *rnp;
1012 1047
1013 /* Exclude any attempts to start a new grace period. */ 1048 /* Exclude any attempts to start a new grace period. */
@@ -1123,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1123 1158
1124 /* Update count, and requeue any remaining callbacks. */ 1159 /* Update count, and requeue any remaining callbacks. */
1125 rdp->qlen -= count; 1160 rdp->qlen -= count;
1161 rdp->n_cbs_invoked += count;
1126 if (list != NULL) { 1162 if (list != NULL) {
1127 *tail = rdp->nxtlist; 1163 *tail = rdp->nxtlist;
1128 rdp->nxtlist = list; 1164 rdp->nxtlist = list;
@@ -1226,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1226 cpu = rnp->grplo; 1262 cpu = rnp->grplo;
1227 bit = 1; 1263 bit = 1;
1228 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 1264 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
1229 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1265 if ((rnp->qsmask & bit) != 0 &&
1266 f(per_cpu_ptr(rsp->rda, cpu)))
1230 mask |= bit; 1267 mask |= bit;
1231 } 1268 }
1232 if (mask != 0) { 1269 if (mask != 0) {
@@ -1402,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1402 * a quiescent state betweentimes. 1439 * a quiescent state betweentimes.
1403 */ 1440 */
1404 local_irq_save(flags); 1441 local_irq_save(flags);
1405 rdp = rsp->rda[smp_processor_id()]; 1442 rdp = this_cpu_ptr(rsp->rda);
1406 rcu_process_gp_end(rsp, rdp); 1443 rcu_process_gp_end(rsp, rdp);
1407 check_for_new_grace_period(rsp, rdp); 1444 check_for_new_grace_period(rsp, rdp);
1408 1445
@@ -1701,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1701{ 1738{
1702 unsigned long flags; 1739 unsigned long flags;
1703 int i; 1740 int i;
1704 struct rcu_data *rdp = rsp->rda[cpu]; 1741 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1705 struct rcu_node *rnp = rcu_get_root(rsp); 1742 struct rcu_node *rnp = rcu_get_root(rsp);
1706 1743
1707 /* Set up local state, ensuring consistent view of global state. */ 1744 /* Set up local state, ensuring consistent view of global state. */
@@ -1729,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1729{ 1766{
1730 unsigned long flags; 1767 unsigned long flags;
1731 unsigned long mask; 1768 unsigned long mask;
1732 struct rcu_data *rdp = rsp->rda[cpu]; 1769 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1733 struct rcu_node *rnp = rcu_get_root(rsp); 1770 struct rcu_node *rnp = rcu_get_root(rsp);
1734 1771
1735 /* Set up local state, ensuring consistent view of global state. */ 1772 /* Set up local state, ensuring consistent view of global state. */
@@ -1865,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1865/* 1902/*
1866 * Helper function for rcu_init() that initializes one rcu_state structure. 1903 * Helper function for rcu_init() that initializes one rcu_state structure.
1867 */ 1904 */
1868static void __init rcu_init_one(struct rcu_state *rsp) 1905static void __init rcu_init_one(struct rcu_state *rsp,
1906 struct rcu_data __percpu *rda)
1869{ 1907{
1870 static char *buf[] = { "rcu_node_level_0", 1908 static char *buf[] = { "rcu_node_level_0",
1871 "rcu_node_level_1", 1909 "rcu_node_level_1",
@@ -1918,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1918 } 1956 }
1919 } 1957 }
1920 1958
1959 rsp->rda = rda;
1921 rnp = rsp->level[NUM_RCU_LVLS - 1]; 1960 rnp = rsp->level[NUM_RCU_LVLS - 1];
1922 for_each_possible_cpu(i) { 1961 for_each_possible_cpu(i) {
1923 while (i > rnp->grphi) 1962 while (i > rnp->grphi)
1924 rnp++; 1963 rnp++;
1925 rsp->rda[i]->mynode = rnp; 1964 per_cpu_ptr(rsp->rda, i)->mynode = rnp;
1926 rcu_boot_init_percpu_data(i, rsp); 1965 rcu_boot_init_percpu_data(i, rsp);
1927 } 1966 }
1928} 1967}
1929 1968
1930/*
1931 * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
1932 * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
1933 * structure.
1934 */
1935#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1936do { \
1937 int i; \
1938 \
1939 for_each_possible_cpu(i) { \
1940 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1941 } \
1942 rcu_init_one(rsp); \
1943} while (0)
1944
1945void __init rcu_init(void) 1969void __init rcu_init(void)
1946{ 1970{
1947 int cpu; 1971 int cpu;
1948 1972
1949 rcu_bootup_announce(); 1973 rcu_bootup_announce();
1950 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1974 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
1951 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1975 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
1952 __rcu_init_preempt(); 1976 __rcu_init_preempt();
1953 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1977 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1954 1978
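
The recurring rsp->rda[cpu] to per_cpu_ptr(rsp->rda, cpu) edits above replace an NR_CPUS-sized pointer array with a genuine per-CPU object. A minimal sketch of the idiom, with invented demo_* names:

	#include <linux/percpu.h>

	struct demo_pcpu {
		unsigned long count;
	};

	static DEFINE_PER_CPU(struct demo_pcpu, demo_data);

	struct demo_state {
		struct demo_pcpu __percpu *pda;	/* was: struct demo_pcpu *pda[NR_CPUS] */
	};

	static struct demo_state demo_state;

	static void demo_init(void)
	{
		demo_state.pda = &demo_data;	/* mirrors rcu_init_one(rsp, &rcu_sched_data) */
	}

	static void demo_count_cpu(int cpu)
	{
		per_cpu_ptr(demo_state.pda, cpu)->count++;	/* a specific CPU's instance */
	}

	static void demo_count_here(void)
	{
		this_cpu_ptr(demo_state.pda)->count++;	/* caller has preemption disabled */
	}
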
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 14c040b18ed0..91d4170c5c13 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -202,6 +202,9 @@ struct rcu_data {
202 long qlen; /* # of queued callbacks */ 202 long qlen; /* # of queued callbacks */
203 long qlen_last_fqs_check; 203 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */ 204 /* qlen at last check for QS forcing */
205 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
206 unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */
207 unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */
205 unsigned long n_force_qs_snap; 208 unsigned long n_force_qs_snap;
206 /* did other CPU force QS recently? */ 209 /* did other CPU force QS recently? */
207 long blimit; /* Upper limit on a processed batch */ 210 long blimit; /* Upper limit on a processed batch */
@@ -254,19 +257,23 @@ struct rcu_data {
254#define RCU_STALL_DELAY_DELTA 0 257#define RCU_STALL_DELAY_DELTA 0
255#endif 258#endif
256 259
257#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) 260#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
261 RCU_STALL_DELAY_DELTA)
258 /* for rsp->jiffies_stall */ 262 /* for rsp->jiffies_stall */
259#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) 263#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
260 /* for rsp->jiffies_stall */ 264 /* for rsp->jiffies_stall */
261#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 265#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
262 /* to take at least one */ 266 /* to take at least one */
263 /* scheduling clock irq */ 267 /* scheduling clock irq */
264 /* before ratting on them. */ 268 /* before ratting on them. */
265 269
266#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 270#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
271#define RCU_CPU_STALL_SUPPRESS_INIT 0
272#else
273#define RCU_CPU_STALL_SUPPRESS_INIT 1
274#endif
267 275
268#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) 276#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
269#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
270 277
271/* 278/*
272 * RCU global state, including node hierarchy. This hierarchy is 279 * RCU global state, including node hierarchy. This hierarchy is
@@ -283,7 +290,7 @@ struct rcu_state {
283 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ 290 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
284 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 291 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
285 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ 292 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
286 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ 293 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */
287 294
288 /* The following fields are guarded by the root rcu_node's lock. */ 295 /* The following fields are guarded by the root rcu_node's lock. */
289 296
@@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
365#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 372#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
366static void rcu_print_detail_task_stall(struct rcu_state *rsp); 373static void rcu_print_detail_task_stall(struct rcu_state *rsp);
367static void rcu_print_task_stall(struct rcu_node *rnp); 374static void rcu_print_task_stall(struct rcu_node *rnp);
375static void rcu_preempt_stall_reset(void);
368#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 376#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
369static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 377static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
370#ifdef CONFIG_HOTPLUG_CPU 378#ifdef CONFIG_HOTPLUG_CPU
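
To make the new stall timings concrete, assuming the usual Kconfig default of CONFIG_RCU_CPU_STALL_TIMEOUT=60 and RCU_STALL_DELAY_DELTA=0 (CONFIG_PROVE_RCU off): RCU_SECONDS_TILL_STALL_CHECK works out to 60 * HZ jiffies, i.e. about 60 seconds instead of the old hard-coded 10 * HZ, and RCU_SECONDS_TILL_STALL_RECHECK becomes 3 * 60 * HZ + 30 jiffies, just over 180 seconds instead of the old 30 * HZ.
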
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 0e4f420245d9..71a4147473f9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void)
57 printk(KERN_INFO 57 printk(KERN_INFO
58 "\tRCU-based detection of stalled CPUs is disabled.\n"); 58 "\tRCU-based detection of stalled CPUs is disabled.\n");
59#endif 59#endif
60#ifndef CONFIG_RCU_CPU_STALL_VERBOSE 60#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
62#endif 62#endif
63#if NUM_RCU_LVL_4 != 0 63#if NUM_RCU_LVL_4 != 0
@@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu)
154 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 154 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
155 155
156 /* Possibly blocking in an RCU read-side critical section. */ 156 /* Possibly blocking in an RCU read-side critical section. */
157 rdp = rcu_preempt_state.rda[cpu]; 157 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
158 rnp = rdp->mynode; 158 rnp = rdp->mynode;
159 raw_spin_lock_irqsave(&rnp->lock, flags); 159 raw_spin_lock_irqsave(&rnp->lock, flags);
160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu)
201 */ 201 */
202void __rcu_read_lock(void) 202void __rcu_read_lock(void)
203{ 203{
204 ACCESS_ONCE(current->rcu_read_lock_nesting)++; 204 current->rcu_read_lock_nesting++;
205 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ 205 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
206} 206}
207EXPORT_SYMBOL_GPL(__rcu_read_lock); 207EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -344,7 +344,9 @@ void __rcu_read_unlock(void)
344 struct task_struct *t = current; 344 struct task_struct *t = current;
345 345
346 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ 346 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
347 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 347 --t->rcu_read_lock_nesting;
348 barrier(); /* decrement before load of ->rcu_read_unlock_special */
349 if (t->rcu_read_lock_nesting == 0 &&
348 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 350 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
349 rcu_read_unlock_special(t); 351 rcu_read_unlock_special(t);
350#ifdef CONFIG_PROVE_LOCKING 352#ifdef CONFIG_PROVE_LOCKING
@@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
417 } 419 }
418} 420}
419 421
422/*
423 * Suppress preemptible RCU's CPU stall warnings by pushing the
424 * time of the next stall-warning message comfortably far into the
425 * future.
426 */
427static void rcu_preempt_stall_reset(void)
428{
429 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
430}
431
420#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 432#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
421 433
422/* 434/*
@@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
546 * 558 *
547 * Control will return to the caller some time after a full grace 559 * Control will return to the caller some time after a full grace
548 * period has elapsed, in other words after all currently executing RCU 560 * period has elapsed, in other words after all currently executing RCU
549 * read-side critical sections have completed. RCU read-side critical 561 * read-side critical sections have completed. Note, however, that
550 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 562 * upon return from synchronize_rcu(), the caller might well be executing
551 * and may be nested. 563 * concurrently with new RCU read-side critical sections that began while
564 * synchronize_rcu() was waiting. RCU read-side critical sections are
565 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
552 */ 566 */
553void synchronize_rcu(void) 567void synchronize_rcu(void)
554{ 568{
@@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void)
771 */ 785 */
772static void __init __rcu_init_preempt(void) 786static void __init __rcu_init_preempt(void)
773{ 787{
774 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); 788 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
775} 789}
776 790
777/* 791/*
@@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
865{ 879{
866} 880}
867 881
882/*
883 * Because preemptible RCU does not exist, there is no need to suppress
884 * its CPU stall warnings.
885 */
886static void rcu_preempt_stall_reset(void)
887{
888}
889
868#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 890#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
869 891
870/* 892/*
@@ -919,15 +941,6 @@ static void rcu_preempt_process_callbacks(void)
919} 941}
920 942
921/* 943/*
922 * In classic RCU, call_rcu() is just call_rcu_sched().
923 */
924void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
925{
926 call_rcu_sched(head, func);
927}
928EXPORT_SYMBOL_GPL(call_rcu);
929
930/*
931 * Wait for an rcu-preempt grace period, but make it happen quickly. 944 * Wait for an rcu-preempt grace period, but make it happen quickly.
932 * But because preemptable RCU does not exist, map to rcu-sched. 945 * But because preemptable RCU does not exist, map to rcu-sched.
933 */ 946 */
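
The reworded synchronize_rcu() comment above describes the classic unpublish-wait-free updater pattern. A minimal illustrative sketch, with invented demo_* names and deliberately simplified locking:

	#include <linux/list.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	struct demo_entry {
		struct list_head node;
		int key;
	};

	static LIST_HEAD(demo_list);		/* readers: rcu_read_lock() + list_for_each_entry_rcu() */
	static DEFINE_SPINLOCK(demo_lock);

	static void demo_remove(struct demo_entry *e)
	{
		spin_lock(&demo_lock);
		list_del_rcu(&e->node);		/* unpublish the entry */
		spin_unlock(&demo_lock);

		synchronize_rcu();		/* waits only for readers that might still see e */
		kfree(e);			/* no reader can reach e any more */
	}

New readers that start after the list_del_rcu() cannot find e, so only the pre-existing critical sections matter, which is exactly the caveat the updated comment adds.
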
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 36c95b45738e..d15430b9d122 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
64 rdp->dynticks_fqs); 64 rdp->dynticks_fqs);
65#endif /* #ifdef CONFIG_NO_HZ */ 65#endif /* #ifdef CONFIG_NO_HZ */
66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
67 seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); 67 seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit);
68 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
69 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
68} 70}
69 71
70#define PRINT_RCU_DATA(name, func, m) \ 72#define PRINT_RCU_DATA(name, func, m) \
@@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
119 rdp->dynticks_fqs); 121 rdp->dynticks_fqs);
120#endif /* #ifdef CONFIG_NO_HZ */ 122#endif /* #ifdef CONFIG_NO_HZ */
121 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 123 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
122 seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); 124 seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit);
125 seq_printf(m, ",%lu,%lu,%lu\n",
126 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
123} 127}
124 128
125static int show_rcudata_csv(struct seq_file *m, void *unused) 129static int show_rcudata_csv(struct seq_file *m, void *unused)
@@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
128#ifdef CONFIG_NO_HZ 132#ifdef CONFIG_NO_HZ
129 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 133 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
130#endif /* #ifdef CONFIG_NO_HZ */ 134#endif /* #ifdef CONFIG_NO_HZ */
131 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); 135 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
132#ifdef CONFIG_TREE_PREEMPT_RCU 136#ifdef CONFIG_TREE_PREEMPT_RCU
133 seq_puts(m, "\"rcu_preempt:\"\n"); 137 seq_puts(m, "\"rcu_preempt:\"\n");
134 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); 138 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
@@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
262 struct rcu_data *rdp; 266 struct rcu_data *rdp;
263 267
264 for_each_possible_cpu(cpu) { 268 for_each_possible_cpu(cpu) {
265 rdp = rsp->rda[cpu]; 269 rdp = per_cpu_ptr(rsp->rda, cpu);
266 if (rdp->beenonline) 270 if (rdp->beenonline)
267 print_one_rcu_pending(m, rdp); 271 print_one_rcu_pending(m, rdp);
268 } 272 }
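
With the two hunks above, every per-CPU line in the rcudata debugfs output gains three counters, printed as ci=, co= and ca= (callbacks invoked, orphaned and adopted, respectively). Purely for illustration, and with made-up numbers, the tail of such a line would now read something like:

	... ql=0 b=10 ci=14294 co=0 ca=0

The CSV variant grows matching "ci","co","ca" columns in its header, as the show_rcudata_csv() hunk shows.
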
diff --git a/kernel/sched.c b/kernel/sched.c
index 09b574e7f4df..d42992bccdfa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -426,9 +426,7 @@ struct root_domain {
426 */ 426 */
427 cpumask_var_t rto_mask; 427 cpumask_var_t rto_mask;
428 atomic_t rto_count; 428 atomic_t rto_count;
429#ifdef CONFIG_SMP
430 struct cpupri cpupri; 429 struct cpupri cpupri;
431#endif
432}; 430};
433 431
434/* 432/*
@@ -437,7 +435,7 @@ struct root_domain {
437 */ 435 */
438static struct root_domain def_root_domain; 436static struct root_domain def_root_domain;
439 437
440#endif 438#endif /* CONFIG_SMP */
441 439
442/* 440/*
443 * This is the main, per-CPU runqueue data structure. 441 * This is the main, per-CPU runqueue data structure.
@@ -488,11 +486,12 @@ struct rq {
488 */ 486 */
489 unsigned long nr_uninterruptible; 487 unsigned long nr_uninterruptible;
490 488
491 struct task_struct *curr, *idle; 489 struct task_struct *curr, *idle, *stop;
492 unsigned long next_balance; 490 unsigned long next_balance;
493 struct mm_struct *prev_mm; 491 struct mm_struct *prev_mm;
494 492
495 u64 clock; 493 u64 clock;
494 u64 clock_task;
496 495
497 atomic_t nr_iowait; 496 atomic_t nr_iowait;
498 497
@@ -520,6 +519,10 @@ struct rq {
520 u64 avg_idle; 519 u64 avg_idle;
521#endif 520#endif
522 521
522#ifdef CONFIG_IRQ_TIME_ACCOUNTING
523 u64 prev_irq_time;
524#endif
525
523 /* calc_load related fields */ 526 /* calc_load related fields */
524 unsigned long calc_load_update; 527 unsigned long calc_load_update;
525 long calc_load_active; 528 long calc_load_active;
@@ -643,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
643 646
644#endif /* CONFIG_CGROUP_SCHED */ 647#endif /* CONFIG_CGROUP_SCHED */
645 648
649static u64 irq_time_cpu(int cpu);
650static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
651
646inline void update_rq_clock(struct rq *rq) 652inline void update_rq_clock(struct rq *rq)
647{ 653{
648 if (!rq->skip_clock_update) 654 if (!rq->skip_clock_update) {
649 rq->clock = sched_clock_cpu(cpu_of(rq)); 655 int cpu = cpu_of(rq);
656 u64 irq_time;
657
658 rq->clock = sched_clock_cpu(cpu);
659 irq_time = irq_time_cpu(cpu);
660 if (rq->clock - irq_time > rq->clock_task)
661 rq->clock_task = rq->clock - irq_time;
662
663 sched_irq_time_avg_update(rq, irq_time);
664 }
650} 665}
651 666
652/* 667/*
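
A concrete reading of the new update_rq_clock() logic: clock_task is pinned to rq->clock minus the CPU's cumulative hardirq/softirq time, and it only ever moves forward. If 3 ms of wall clock pass between two updates and 1 ms of that was spent in interrupt context, clock_task advances by 2 ms; if the entire window was interrupt time, clock_task stays where it was rather than going backwards. Code that times tasks off clock_task therefore does not charge interrupt time to whichever task happened to be running.
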
@@ -723,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
723 size_t cnt, loff_t *ppos) 738 size_t cnt, loff_t *ppos)
724{ 739{
725 char buf[64]; 740 char buf[64];
726 char *cmp = buf; 741 char *cmp;
727 int neg = 0; 742 int neg = 0;
728 int i; 743 int i;
729 744
@@ -734,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
734 return -EFAULT; 749 return -EFAULT;
735 750
736 buf[cnt] = 0; 751 buf[cnt] = 0;
752 cmp = strstrip(buf);
737 753
738 if (strncmp(buf, "NO_", 3) == 0) { 754 if (strncmp(buf, "NO_", 3) == 0) {
739 neg = 1; 755 neg = 1;
@@ -741,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
741 } 757 }
742 758
743 for (i = 0; sched_feat_names[i]; i++) { 759 for (i = 0; sched_feat_names[i]; i++) {
744 int len = strlen(sched_feat_names[i]); 760 if (strcmp(cmp, sched_feat_names[i]) == 0) {
745
746 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
747 if (neg) 761 if (neg)
748 sysctl_sched_features &= ~(1UL << i); 762 sysctl_sched_features &= ~(1UL << i);
749 else 763 else
@@ -1294,6 +1308,10 @@ static void resched_task(struct task_struct *p)
1294static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1308static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1295{ 1309{
1296} 1310}
1311
1312static void sched_avg_update(struct rq *rq)
1313{
1314}
1297#endif /* CONFIG_SMP */ 1315#endif /* CONFIG_SMP */
1298 1316
1299#if BITS_PER_LONG == 32 1317#if BITS_PER_LONG == 32
@@ -1836,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1836 1854
1837static const struct sched_class rt_sched_class; 1855static const struct sched_class rt_sched_class;
1838 1856
1839#define sched_class_highest (&rt_sched_class) 1857#define sched_class_highest (&stop_sched_class)
1840#define for_each_class(class) \ 1858#define for_each_class(class) \
1841 for (class = sched_class_highest; class; class = class->next) 1859 for (class = sched_class_highest; class; class = class->next)
1842 1860
@@ -1854,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
1854 1872
1855static void set_load_weight(struct task_struct *p) 1873static void set_load_weight(struct task_struct *p)
1856{ 1874{
1857 if (task_has_rt_policy(p)) {
1858 p->se.load.weight = 0;
1859 p->se.load.inv_weight = WMULT_CONST;
1860 return;
1861 }
1862
1863 /* 1875 /*
1864 * SCHED_IDLE tasks get minimal weight: 1876 * SCHED_IDLE tasks get minimal weight:
1865 */ 1877 */
@@ -1913,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1913 dec_nr_running(rq); 1925 dec_nr_running(rq);
1914} 1926}
1915 1927
1928#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1929
1930/*
1931 * There are no locks covering percpu hardirq/softirq time.
1932 * They are only modified in account_system_vtime, on corresponding CPU
1933 * with interrupts disabled. So, writes are safe.
1934 * They are read and saved off onto struct rq in update_rq_clock().
1935 * This may result in other CPU reading this CPU's irq time and can
1936 * race with irq/account_system_vtime on this CPU. We would either get old
1937 * or new value (or semi updated value on 32 bit) with a side effect of
1938 * accounting a slice of irq time to the wrong task when an irq is in progress
1939 * while we read rq->clock. That is a worthy compromise in place of having
1940 * locks on each irq in account_system_time.
1941 */
1942static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1943static DEFINE_PER_CPU(u64, cpu_softirq_time);
1944
1945static DEFINE_PER_CPU(u64, irq_start_time);
1946static int sched_clock_irqtime;
1947
1948void enable_sched_clock_irqtime(void)
1949{
1950 sched_clock_irqtime = 1;
1951}
1952
1953void disable_sched_clock_irqtime(void)
1954{
1955 sched_clock_irqtime = 0;
1956}
1957
1958static u64 irq_time_cpu(int cpu)
1959{
1960 if (!sched_clock_irqtime)
1961 return 0;
1962
1963 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1964}
1965
1966void account_system_vtime(struct task_struct *curr)
1967{
1968 unsigned long flags;
1969 int cpu;
1970 u64 now, delta;
1971
1972 if (!sched_clock_irqtime)
1973 return;
1974
1975 local_irq_save(flags);
1976
1977 cpu = smp_processor_id();
1978 now = sched_clock_cpu(cpu);
1979 delta = now - per_cpu(irq_start_time, cpu);
1980 per_cpu(irq_start_time, cpu) = now;
1981 /*
1982 * We do not account for softirq time from ksoftirqd here.
1983 * We want to continue accounting softirq time to the ksoftirqd thread
1984 * in that case, so as not to confuse the scheduler with a special task
1985 * that does not consume any time but still wants to run.
1986 */
1987 if (hardirq_count())
1988 per_cpu(cpu_hardirq_time, cpu) += delta;
1989 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1990 per_cpu(cpu_softirq_time, cpu) += delta;
1991
1992 local_irq_restore(flags);
1993}
1994EXPORT_SYMBOL_GPL(account_system_vtime);
1995
1996static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
1997{
1998 if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
1999 u64 delta_irq = curr_irq_time - rq->prev_irq_time;
2000 rq->prev_irq_time = curr_irq_time;
2001 sched_rt_avg_update(rq, delta_irq);
2002 }
2003}
2004
2005#else
2006
2007static u64 irq_time_cpu(int cpu)
2008{
2009 return 0;
2010}
2011
2012static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
2013
2014#endif
2015
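A standalone userspace model of the attribution rule account_system_vtime() implements above, collapsed onto a single CPU: the time since the previous sample is charged to hardirq time when inside a hardirq, to softirq time only while actually serving a softirq outside ksoftirqd, and otherwise stays with the running task. The boolean flags and timestamps are made-up inputs rather than kernel APIs.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t cpu_hardirq_time, cpu_softirq_time, irq_start_time;

static void account_delta(uint64_t now, bool in_hardirq,
                          bool serving_softirq, bool is_ksoftirqd)
{
        uint64_t delta = now - irq_start_time;

        irq_start_time = now;
        if (in_hardirq)
                cpu_hardirq_time += delta;
        else if (serving_softirq && !is_ksoftirqd)
                cpu_softirq_time += delta;
        /* otherwise the delta stays with the task, e.g. ksoftirqd itself */
}

int main(void)
{
        account_delta(100, true,  false, false); /* 100 ns of hardirq                */
        account_delta(250, false, true,  false); /* 150 ns of softirq                */
        account_delta(400, false, true,  true);  /* ksoftirqd: charged to the thread */
        printf("hardirq=%llu softirq=%llu\n",
               (unsigned long long)cpu_hardirq_time,
               (unsigned long long)cpu_softirq_time); /* prints 100 and 150 */
        return 0;
}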
1916#include "sched_idletask.c" 2016#include "sched_idletask.c"
1917#include "sched_fair.c" 2017#include "sched_fair.c"
1918#include "sched_rt.c" 2018#include "sched_rt.c"
2019#include "sched_stoptask.c"
1919#ifdef CONFIG_SCHED_DEBUG 2020#ifdef CONFIG_SCHED_DEBUG
1920# include "sched_debug.c" 2021# include "sched_debug.c"
1921#endif 2022#endif
1922 2023
2024void sched_set_stop_task(int cpu, struct task_struct *stop)
2025{
2026 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2027 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2028
2029 if (stop) {
2030 /*
2031 * Make it appear like a SCHED_FIFO task; it's something
2032 * userspace knows about and won't get confused about.
2033 *
2034 * Also, it will make PI more or less work without too
2035 * much confusion -- but then, stop work should not
2036 * rely on PI working anyway.
2037 */
2038 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2039
2040 stop->sched_class = &stop_sched_class;
2041 }
2042
2043 cpu_rq(cpu)->stop = stop;
2044
2045 if (old_stop) {
2046 /*
2047 * Reset it back to a normal scheduling class so that
2048 * it can die in pieces.
2049 */
2050 old_stop->sched_class = &rt_sched_class;
2051 }
2052}
2053
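For reference, the caller side of sched_set_stop_task() is the kernel/stop_machine.c hunk near the end of this diff; it is condensed here as a sketch only, with thread creation and error handling omitted, and is not meant to build on its own.

        /* p: a freshly created, not yet running per-cpu stopper kthread */
        get_task_struct(p);
        kthread_bind(p, cpu);           /* pin it to its CPU before it ever runs */
        sched_set_stop_task(cpu, p);    /* promote it into the stop class        */
        stopper->thread = p;
        /*
         * Passing NULL later hands the class back: the old_stop branch above
         * demotes the previous stopper to rt_sched_class so it can exit.
         */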
1923/* 2054/*
1924 * __normal_prio - return the priority that is based on the static prio 2055 * __normal_prio - return the priority that is based on the static prio
1925 */ 2056 */
@@ -1999,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1999 if (p->sched_class != &fair_sched_class) 2130 if (p->sched_class != &fair_sched_class)
2000 return 0; 2131 return 0;
2001 2132
2133 if (unlikely(p->policy == SCHED_IDLE))
2134 return 0;
2135
2002 /* 2136 /*
2003 * Buddy candidates are cache hot: 2137 * Buddy candidates are cache hot:
2004 */ 2138 */
@@ -2848,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2848 */ 2982 */
2849 arch_start_context_switch(prev); 2983 arch_start_context_switch(prev);
2850 2984
2851 if (likely(!mm)) { 2985 if (!mm) {
2852 next->active_mm = oldmm; 2986 next->active_mm = oldmm;
2853 atomic_inc(&oldmm->mm_count); 2987 atomic_inc(&oldmm->mm_count);
2854 enter_lazy_tlb(oldmm, next); 2988 enter_lazy_tlb(oldmm, next);
2855 } else 2989 } else
2856 switch_mm(oldmm, mm, next); 2990 switch_mm(oldmm, mm, next);
2857 2991
2858 if (likely(!prev->mm)) { 2992 if (!prev->mm) {
2859 prev->active_mm = NULL; 2993 prev->active_mm = NULL;
2860 rq->prev_mm = oldmm; 2994 rq->prev_mm = oldmm;
2861 } 2995 }
@@ -3182,6 +3316,8 @@ static void update_cpu_load(struct rq *this_rq)
3182 3316
3183 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; 3317 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3184 } 3318 }
3319
3320 sched_avg_update(this_rq);
3185} 3321}
3186 3322
3187static void update_cpu_load_active(struct rq *this_rq) 3323static void update_cpu_load_active(struct rq *this_rq)
@@ -3242,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3242 3378
3243 if (task_current(rq, p)) { 3379 if (task_current(rq, p)) {
3244 update_rq_clock(rq); 3380 update_rq_clock(rq);
3245 ns = rq->clock - p->se.exec_start; 3381 ns = rq->clock_task - p->se.exec_start;
3246 if ((s64)ns < 0) 3382 if ((s64)ns < 0)
3247 ns = 0; 3383 ns = 0;
3248 } 3384 }
@@ -3391,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3391 tmp = cputime_to_cputime64(cputime); 3527 tmp = cputime_to_cputime64(cputime);
3392 if (hardirq_count() - hardirq_offset) 3528 if (hardirq_count() - hardirq_offset)
3393 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3529 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3394 else if (softirq_count()) 3530 else if (in_serving_softirq())
3395 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3531 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3396 else 3532 else
3397 cpustat->system = cputime64_add(cpustat->system, tmp); 3533 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3507,9 +3643,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3507 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3643 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3508 3644
3509 if (total) { 3645 if (total) {
3510 u64 temp; 3646 u64 temp = rtime;
3511 3647
3512 temp = (u64)(rtime * utime); 3648 temp *= utime;
3513 do_div(temp, total); 3649 do_div(temp, total);
3514 utime = (cputime_t)temp; 3650 utime = (cputime_t)temp;
3515 } else 3651 } else
@@ -3540,9 +3676,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3540 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3676 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3541 3677
3542 if (total) { 3678 if (total) {
3543 u64 temp; 3679 u64 temp = rtime;
3544 3680
3545 temp = (u64)(rtime * cputime.utime); 3681 temp *= cputime.utime;
3546 do_div(temp, total); 3682 do_div(temp, total);
3547 utime = (cputime_t)temp; 3683 utime = (cputime_t)temp;
3548 } else 3684 } else
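The two hunks above change the scaling from "(u64)(rtime * utime)" to widening rtime before the multiply; a small userspace illustration of why that matters when cputime_t is a 32-bit type (the figures are invented).

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t rtime = 500000, utime = 400000, total = 900000;

        uint64_t bad  = (uint64_t)(rtime * utime); /* 32-bit product wraps, then is widened */
        uint64_t good = rtime;
        good *= utime;                             /* widened first, no truncation          */

        printf("truncated scaling: %" PRIu64 "\n", bad / total);
        printf("correct scaling:   %" PRIu64 "\n", good / total);
        return 0;
}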
@@ -3578,7 +3714,7 @@ void scheduler_tick(void)
3578 curr->sched_class->task_tick(rq, curr, 0); 3714 curr->sched_class->task_tick(rq, curr, 0);
3579 raw_spin_unlock(&rq->lock); 3715 raw_spin_unlock(&rq->lock);
3580 3716
3581 perf_event_task_tick(curr); 3717 perf_event_task_tick();
3582 3718
3583#ifdef CONFIG_SMP 3719#ifdef CONFIG_SMP
3584 rq->idle_at_tick = idle_cpu(cpu); 3720 rq->idle_at_tick = idle_cpu(cpu);
@@ -3717,17 +3853,13 @@ pick_next_task(struct rq *rq)
3717 return p; 3853 return p;
3718 } 3854 }
3719 3855
3720 class = sched_class_highest; 3856 for_each_class(class) {
3721 for ( ; ; ) {
3722 p = class->pick_next_task(rq); 3857 p = class->pick_next_task(rq);
3723 if (p) 3858 if (p)
3724 return p; 3859 return p;
3725 /*
3726 * Will never be NULL as the idle class always
3727 * returns a non-NULL p:
3728 */
3729 class = class->next;
3730 } 3860 }
3861
3862 BUG(); /* the idle class will always have a runnable task */
3731} 3863}
3732 3864
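A standalone userspace model of the rewritten loop: scheduling classes form a priority-ordered singly linked list (with this series: stop -> rt -> fair -> idle), each class is asked for a task in turn, and running off the end is impossible because the idle class always returns one, which is exactly what the new BUG() asserts. The struct and function names below are illustrative.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct task { const char *name; };

struct sched_class_model {
        const char *name;
        const struct sched_class_model *next;
        struct task *(*pick_next)(void);
};

static struct task idle_task = { "swapper" };
static struct task *pick_none(void) { return NULL; }
static struct task *pick_idle(void) { return &idle_task; }

static const struct sched_class_model idle_class = { "idle", NULL,        pick_idle };
static const struct sched_class_model fair_class = { "fair", &idle_class, pick_none };
static const struct sched_class_model rt_class   = { "rt",   &fair_class, pick_none };
static const struct sched_class_model stop_class = { "stop", &rt_class,   pick_none };

int main(void)
{
        const struct sched_class_model *class;
        struct task *p = NULL;

        for (class = &stop_class; class; class = class->next) {
                p = class->pick_next();
                if (p)
                        break;
        }
        assert(p); /* mirrors the BUG(): the idle class guarantees a pick */
        printf("picked %s from the %s class\n", p->name, class->name);
        return 0;
}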
3733/* 3865/*
@@ -4352,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4352 4484
4353 rq = task_rq_lock(p, &flags); 4485 rq = task_rq_lock(p, &flags);
4354 4486
4487 trace_sched_pi_setprio(p, prio);
4355 oldprio = p->prio; 4488 oldprio = p->prio;
4356 prev_class = p->sched_class; 4489 prev_class = p->sched_class;
4357 on_rq = p->se.on_rq; 4490 on_rq = p->se.on_rq;
@@ -4639,7 +4772,7 @@ recheck:
4639 } 4772 }
4640 4773
4641 if (user) { 4774 if (user) {
4642 retval = security_task_setscheduler(p, policy, param); 4775 retval = security_task_setscheduler(p);
4643 if (retval) 4776 if (retval)
4644 return retval; 4777 return retval;
4645 } 4778 }
@@ -4655,6 +4788,15 @@ recheck:
4655 */ 4788 */
4656 rq = __task_rq_lock(p); 4789 rq = __task_rq_lock(p);
4657 4790
4791 /*
4792 * Changing the policy of the stop threads is a very bad idea
4793 */
4794 if (p == rq->stop) {
4795 __task_rq_unlock(rq);
4796 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4797 return -EINVAL;
4798 }
4799
4658#ifdef CONFIG_RT_GROUP_SCHED 4800#ifdef CONFIG_RT_GROUP_SCHED
4659 if (user) { 4801 if (user) {
4660 /* 4802 /*
@@ -4881,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4881 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5023 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
4882 goto out_unlock; 5024 goto out_unlock;
4883 5025
4884 retval = security_task_setscheduler(p, 0, NULL); 5026 retval = security_task_setscheduler(p);
4885 if (retval) 5027 if (retval)
4886 goto out_unlock; 5028 goto out_unlock;
4887 5029
4888 cpuset_cpus_allowed(p, cpus_allowed); 5030 cpuset_cpus_allowed(p, cpus_allowed);
4889 cpumask_and(new_mask, in_mask, cpus_allowed); 5031 cpumask_and(new_mask, in_mask, cpus_allowed);
4890 again: 5032again:
4891 retval = set_cpus_allowed_ptr(p, new_mask); 5033 retval = set_cpus_allowed_ptr(p, new_mask);
4892 5034
4893 if (!retval) { 5035 if (!retval) {
@@ -5331,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5331 idle->se.exec_start = sched_clock(); 5473 idle->se.exec_start = sched_clock();
5332 5474
5333 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5475 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5476 /*
5477 * We're having a chicken and egg problem, even though we are
5478 * holding rq->lock, the cpu isn't yet set to this cpu so the
5479 * lockdep check in task_group() will fail.
5480 *
5481 * Similar case to sched_fork(). Alternatively, we could
5482 * use task_rq_lock() here and obtain the other rq->lock.
5483 *
5484 * Silence PROVE_RCU
5485 */
5486 rcu_read_lock();
5334 __set_task_cpu(idle, cpu); 5487 __set_task_cpu(idle, cpu);
5488 rcu_read_unlock();
5335 5489
5336 rq->curr = rq->idle = idle; 5490 rq->curr = rq->idle = idle;
5337#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5491#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -6508,6 +6662,7 @@ struct s_data {
6508 cpumask_var_t nodemask; 6662 cpumask_var_t nodemask;
6509 cpumask_var_t this_sibling_map; 6663 cpumask_var_t this_sibling_map;
6510 cpumask_var_t this_core_map; 6664 cpumask_var_t this_core_map;
6665 cpumask_var_t this_book_map;
6511 cpumask_var_t send_covered; 6666 cpumask_var_t send_covered;
6512 cpumask_var_t tmpmask; 6667 cpumask_var_t tmpmask;
6513 struct sched_group **sched_group_nodes; 6668 struct sched_group **sched_group_nodes;
@@ -6519,6 +6674,7 @@ enum s_alloc {
6519 sa_rootdomain, 6674 sa_rootdomain,
6520 sa_tmpmask, 6675 sa_tmpmask,
6521 sa_send_covered, 6676 sa_send_covered,
6677 sa_this_book_map,
6522 sa_this_core_map, 6678 sa_this_core_map,
6523 sa_this_sibling_map, 6679 sa_this_sibling_map,
6524 sa_nodemask, 6680 sa_nodemask,
@@ -6554,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
6554#ifdef CONFIG_SCHED_MC 6710#ifdef CONFIG_SCHED_MC
6555static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6711static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6556static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 6712static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6557#endif /* CONFIG_SCHED_MC */
6558 6713
6559#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6560static int 6714static int
6561cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6715cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6562 struct sched_group **sg, struct cpumask *mask) 6716 struct sched_group **sg, struct cpumask *mask)
6563{ 6717{
6564 int group; 6718 int group;
6565 6719#ifdef CONFIG_SCHED_SMT
6566 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6720 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6567 group = cpumask_first(mask); 6721 group = cpumask_first(mask);
6722#else
6723 group = cpu;
6724#endif
6568 if (sg) 6725 if (sg)
6569 *sg = &per_cpu(sched_group_core, group).sg; 6726 *sg = &per_cpu(sched_group_core, group).sg;
6570 return group; 6727 return group;
6571} 6728}
6572#elif defined(CONFIG_SCHED_MC) 6729#endif /* CONFIG_SCHED_MC */
6730
6731/*
6732 * book sched-domains:
6733 */
6734#ifdef CONFIG_SCHED_BOOK
6735static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6736static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6737
6573static int 6738static int
6574cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6739cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6575 struct sched_group **sg, struct cpumask *unused) 6740 struct sched_group **sg, struct cpumask *mask)
6576{ 6741{
6742 int group = cpu;
6743#ifdef CONFIG_SCHED_MC
6744 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6745 group = cpumask_first(mask);
6746#elif defined(CONFIG_SCHED_SMT)
6747 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6748 group = cpumask_first(mask);
6749#endif
6577 if (sg) 6750 if (sg)
6578 *sg = &per_cpu(sched_group_core, cpu).sg; 6751 *sg = &per_cpu(sched_group_book, group).sg;
6579 return cpu; 6752 return group;
6580} 6753}
6581#endif 6754#endif /* CONFIG_SCHED_BOOK */
6582 6755
6583static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6756static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
6584static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6757static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6588,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6588 struct sched_group **sg, struct cpumask *mask) 6761 struct sched_group **sg, struct cpumask *mask)
6589{ 6762{
6590 int group; 6763 int group;
6591#ifdef CONFIG_SCHED_MC 6764#ifdef CONFIG_SCHED_BOOK
6765 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6766 group = cpumask_first(mask);
6767#elif defined(CONFIG_SCHED_MC)
6592 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 6768 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6593 group = cpumask_first(mask); 6769 group = cpumask_first(mask);
6594#elif defined(CONFIG_SCHED_SMT) 6770#elif defined(CONFIG_SCHED_SMT)
@@ -6849,6 +7025,9 @@ SD_INIT_FUNC(CPU)
6849#ifdef CONFIG_SCHED_MC 7025#ifdef CONFIG_SCHED_MC
6850 SD_INIT_FUNC(MC) 7026 SD_INIT_FUNC(MC)
6851#endif 7027#endif
7028#ifdef CONFIG_SCHED_BOOK
7029 SD_INIT_FUNC(BOOK)
7030#endif
6852 7031
6853static int default_relax_domain_level = -1; 7032static int default_relax_domain_level = -1;
6854 7033
@@ -6898,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6898 free_cpumask_var(d->tmpmask); /* fall through */ 7077 free_cpumask_var(d->tmpmask); /* fall through */
6899 case sa_send_covered: 7078 case sa_send_covered:
6900 free_cpumask_var(d->send_covered); /* fall through */ 7079 free_cpumask_var(d->send_covered); /* fall through */
7080 case sa_this_book_map:
7081 free_cpumask_var(d->this_book_map); /* fall through */
6901 case sa_this_core_map: 7082 case sa_this_core_map:
6902 free_cpumask_var(d->this_core_map); /* fall through */ 7083 free_cpumask_var(d->this_core_map); /* fall through */
6903 case sa_this_sibling_map: 7084 case sa_this_sibling_map:
@@ -6944,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6944 return sa_nodemask; 7125 return sa_nodemask;
6945 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 7126 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
6946 return sa_this_sibling_map; 7127 return sa_this_sibling_map;
6947 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 7128 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
6948 return sa_this_core_map; 7129 return sa_this_core_map;
7130 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7131 return sa_this_book_map;
6949 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 7132 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
6950 return sa_send_covered; 7133 return sa_send_covered;
6951 d->rd = alloc_rootdomain(); 7134 d->rd = alloc_rootdomain();
@@ -7003,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
7003 return sd; 7186 return sd;
7004} 7187}
7005 7188
7189static struct sched_domain *__build_book_sched_domain(struct s_data *d,
7190 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7191 struct sched_domain *parent, int i)
7192{
7193 struct sched_domain *sd = parent;
7194#ifdef CONFIG_SCHED_BOOK
7195 sd = &per_cpu(book_domains, i).sd;
7196 SD_INIT(sd, BOOK);
7197 set_domain_attribute(sd, attr);
7198 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7199 sd->parent = parent;
7200 parent->child = sd;
7201 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7202#endif
7203 return sd;
7204}
7205
7006static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7206static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
7007 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7207 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7008 struct sched_domain *parent, int i) 7208 struct sched_domain *parent, int i)
@@ -7060,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
7060 d->send_covered, d->tmpmask); 7260 d->send_covered, d->tmpmask);
7061 break; 7261 break;
7062#endif 7262#endif
7263#ifdef CONFIG_SCHED_BOOK
7264 case SD_LV_BOOK: /* set up book groups */
7265 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7266 if (cpu == cpumask_first(d->this_book_map))
7267 init_sched_build_groups(d->this_book_map, cpu_map,
7268 &cpu_to_book_group,
7269 d->send_covered, d->tmpmask);
7270 break;
7271#endif
7063 case SD_LV_CPU: /* set up physical groups */ 7272 case SD_LV_CPU: /* set up physical groups */
7064 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7273 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7065 if (!cpumask_empty(d->nodemask)) 7274 if (!cpumask_empty(d->nodemask))
@@ -7107,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7107 7316
7108 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7317 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7109 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7318 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7319 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
7110 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 7320 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7111 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 7321 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7112 } 7322 }
7113 7323
7114 for_each_cpu(i, cpu_map) { 7324 for_each_cpu(i, cpu_map) {
7115 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7325 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7326 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7116 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7327 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7117 } 7328 }
7118 7329
@@ -7143,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7143 init_sched_groups_power(i, sd); 7354 init_sched_groups_power(i, sd);
7144 } 7355 }
7145#endif 7356#endif
7357#ifdef CONFIG_SCHED_BOOK
7358 for_each_cpu(i, cpu_map) {
7359 sd = &per_cpu(book_domains, i).sd;
7360 init_sched_groups_power(i, sd);
7361 }
7362#endif
7146 7363
7147 for_each_cpu(i, cpu_map) { 7364 for_each_cpu(i, cpu_map) {
7148 sd = &per_cpu(phys_domains, i).sd; 7365 sd = &per_cpu(phys_domains, i).sd;
@@ -7168,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7168 sd = &per_cpu(cpu_domains, i).sd; 7385 sd = &per_cpu(cpu_domains, i).sd;
7169#elif defined(CONFIG_SCHED_MC) 7386#elif defined(CONFIG_SCHED_MC)
7170 sd = &per_cpu(core_domains, i).sd; 7387 sd = &per_cpu(core_domains, i).sd;
7388#elif defined(CONFIG_SCHED_BOOK)
7389 sd = &per_cpu(book_domains, i).sd;
7171#else 7390#else
7172 sd = &per_cpu(phys_domains, i).sd; 7391 sd = &per_cpu(phys_domains, i).sd;
7173#endif 7392#endif
@@ -8072,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8072 8291
8073 return 1; 8292 return 1;
8074 8293
8075 err_free_rq: 8294err_free_rq:
8076 kfree(cfs_rq); 8295 kfree(cfs_rq);
8077 err: 8296err:
8078 return 0; 8297 return 0;
8079} 8298}
8080 8299
@@ -8162,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8162 8381
8163 return 1; 8382 return 1;
8164 8383
8165 err_free_rq: 8384err_free_rq:
8166 kfree(rt_rq); 8385 kfree(rt_rq);
8167 err: 8386err:
8168 return 0; 8387 return 0;
8169} 8388}
8170 8389
@@ -8522,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
8522 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8741 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8523 } 8742 }
8524 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8743 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8525 unlock: 8744unlock:
8526 read_unlock(&tasklist_lock); 8745 read_unlock(&tasklist_lock);
8527 mutex_unlock(&rt_constraints_mutex); 8746 mutex_unlock(&rt_constraints_mutex);
8528 8747
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ab661ebc4895..933f3d1b62ea 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,7 +25,7 @@
25 25
26/* 26/*
27 * Targeted preemption latency for CPU-bound tasks: 27 * Targeted preemption latency for CPU-bound tasks:
28 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) 28 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
29 * 29 *
30 * NOTE: this latency value is not the same as the concept of 30 * NOTE: this latency value is not the same as the concept of
31 * 'timeslice length' - timeslices in CFS are of variable length 31 * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 52
53/* 53/*
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 2000000ULL; 57unsigned int sysctl_sched_min_granularity = 750000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
59 59
60/* 60/*
61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
62 */ 62 */
63static unsigned int sched_nr_latency = 3; 63static unsigned int sched_nr_latency = 8;
64 64
65/* 65/*
66 * After fork, child runs first. If set to 0 (default) then 66 * After fork, child runs first. If set to 0 (default) then
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
519static void update_curr(struct cfs_rq *cfs_rq) 519static void update_curr(struct cfs_rq *cfs_rq)
520{ 520{
521 struct sched_entity *curr = cfs_rq->curr; 521 struct sched_entity *curr = cfs_rq->curr;
522 u64 now = rq_of(cfs_rq)->clock; 522 u64 now = rq_of(cfs_rq)->clock_task;
523 unsigned long delta_exec; 523 unsigned long delta_exec;
524 524
525 if (unlikely(!curr)) 525 if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
602 /* 602 /*
603 * We are starting a new run period: 603 * We are starting a new run period:
604 */ 604 */
605 se->exec_start = rq_of(cfs_rq)->clock; 605 se->exec_start = rq_of(cfs_rq)->clock_task;
606} 606}
607 607
608/************************************************** 608/**************************************************
@@ -1313,7 +1313,7 @@ static struct sched_group *
1313find_idlest_group(struct sched_domain *sd, struct task_struct *p, 1313find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1314 int this_cpu, int load_idx) 1314 int this_cpu, int load_idx)
1315{ 1315{
1316 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; 1316 struct sched_group *idlest = NULL, *group = sd->groups;
1317 unsigned long min_load = ULONG_MAX, this_load = 0; 1317 unsigned long min_load = ULONG_MAX, this_load = 0;
1318 int imbalance = 100 + (sd->imbalance_pct-100)/2; 1318 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1319 1319
@@ -1348,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1348 1348
1349 if (local_group) { 1349 if (local_group) {
1350 this_load = avg_load; 1350 this_load = avg_load;
1351 this = group;
1352 } else if (avg_load < min_load) { 1351 } else if (avg_load < min_load) {
1353 min_load = avg_load; 1352 min_load = avg_load;
1354 idlest = group; 1353 idlest = group;
@@ -1765,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
1765 set_task_cpu(p, this_cpu); 1764 set_task_cpu(p, this_cpu);
1766 activate_task(this_rq, p, 0); 1765 activate_task(this_rq, p, 0);
1767 check_preempt_curr(this_rq, p, 0); 1766 check_preempt_curr(this_rq, p, 0);
1767
1768 /* re-arm NEWIDLE balancing when moving tasks */
1769 src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
1770 this_rq->idle_stamp = 0;
1768} 1771}
1769 1772
1770/* 1773/*
@@ -1799,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1799 * 2) too many balance attempts have failed. 1802 * 2) too many balance attempts have failed.
1800 */ 1803 */
1801 1804
1802 tsk_cache_hot = task_hot(p, rq->clock, sd); 1805 tsk_cache_hot = task_hot(p, rq->clock_task, sd);
1803 if (!tsk_cache_hot || 1806 if (!tsk_cache_hot ||
1804 sd->nr_balance_failed > sd->cache_nice_tries) { 1807 sd->nr_balance_failed > sd->cache_nice_tries) {
1805#ifdef CONFIG_SCHEDSTATS 1808#ifdef CONFIG_SCHEDSTATS
@@ -2031,12 +2034,14 @@ struct sd_lb_stats {
2031 unsigned long this_load; 2034 unsigned long this_load;
2032 unsigned long this_load_per_task; 2035 unsigned long this_load_per_task;
2033 unsigned long this_nr_running; 2036 unsigned long this_nr_running;
2037 unsigned long this_has_capacity;
2034 2038
2035 /* Statistics of the busiest group */ 2039 /* Statistics of the busiest group */
2036 unsigned long max_load; 2040 unsigned long max_load;
2037 unsigned long busiest_load_per_task; 2041 unsigned long busiest_load_per_task;
2038 unsigned long busiest_nr_running; 2042 unsigned long busiest_nr_running;
2039 unsigned long busiest_group_capacity; 2043 unsigned long busiest_group_capacity;
2044 unsigned long busiest_has_capacity;
2040 2045
2041 int group_imb; /* Is there imbalance in this sd */ 2046 int group_imb; /* Is there imbalance in this sd */
2042#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2047#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2059,6 +2064,7 @@ struct sg_lb_stats {
2059 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2064 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2060 unsigned long group_capacity; 2065 unsigned long group_capacity;
2061 int group_imb; /* Is there an imbalance in the group ? */ 2066 int group_imb; /* Is there an imbalance in the group ? */
2067 int group_has_capacity; /* Is there extra capacity in the group? */
2062}; 2068};
2063 2069
2064/** 2070/**
@@ -2268,10 +2274,14 @@ unsigned long scale_rt_power(int cpu)
2268 struct rq *rq = cpu_rq(cpu); 2274 struct rq *rq = cpu_rq(cpu);
2269 u64 total, available; 2275 u64 total, available;
2270 2276
2271 sched_avg_update(rq);
2272
2273 total = sched_avg_period() + (rq->clock - rq->age_stamp); 2277 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2274 available = total - rq->rt_avg; 2278
2279 if (unlikely(total < rq->rt_avg)) {
2280 /* Ensures that power won't end up being negative */
2281 available = 0;
2282 } else {
2283 available = total - rq->rt_avg;
2284 }
2275 2285
2276 if (unlikely((s64)total < SCHED_LOAD_SCALE)) 2286 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2277 total = SCHED_LOAD_SCALE; 2287 total = SCHED_LOAD_SCALE;
@@ -2381,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2381 int local_group, const struct cpumask *cpus, 2391 int local_group, const struct cpumask *cpus,
2382 int *balance, struct sg_lb_stats *sgs) 2392 int *balance, struct sg_lb_stats *sgs)
2383{ 2393{
2384 unsigned long load, max_cpu_load, min_cpu_load; 2394 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
2385 int i; 2395 int i;
2386 unsigned int balance_cpu = -1, first_idle_cpu = 0; 2396 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2387 unsigned long avg_load_per_task = 0; 2397 unsigned long avg_load_per_task = 0;
@@ -2392,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2392 /* Tally up the load of all CPUs in the group */ 2402 /* Tally up the load of all CPUs in the group */
2393 max_cpu_load = 0; 2403 max_cpu_load = 0;
2394 min_cpu_load = ~0UL; 2404 min_cpu_load = ~0UL;
2405 max_nr_running = 0;
2395 2406
2396 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 2407 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2397 struct rq *rq = cpu_rq(i); 2408 struct rq *rq = cpu_rq(i);
@@ -2409,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2409 load = target_load(i, load_idx); 2420 load = target_load(i, load_idx);
2410 } else { 2421 } else {
2411 load = source_load(i, load_idx); 2422 load = source_load(i, load_idx);
2412 if (load > max_cpu_load) 2423 if (load > max_cpu_load) {
2413 max_cpu_load = load; 2424 max_cpu_load = load;
2425 max_nr_running = rq->nr_running;
2426 }
2414 if (min_cpu_load > load) 2427 if (min_cpu_load > load)
2415 min_cpu_load = load; 2428 min_cpu_load = load;
2416 } 2429 }
@@ -2450,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2450 if (sgs->sum_nr_running) 2463 if (sgs->sum_nr_running)
2451 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 2464 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2452 2465
2453 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 2466 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
2454 sgs->group_imb = 1; 2467 sgs->group_imb = 1;
2455 2468
2456 sgs->group_capacity = 2469 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2457 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2458 if (!sgs->group_capacity) 2470 if (!sgs->group_capacity)
2459 sgs->group_capacity = fix_small_capacity(sd, group); 2471 sgs->group_capacity = fix_small_capacity(sd, group);
2472
2473 if (sgs->group_capacity > sgs->sum_nr_running)
2474 sgs->group_has_capacity = 1;
2460} 2475}
2461 2476
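A small numeric illustration of the bookkeeping added above, using the conventional SCHED_LOAD_SCALE of 1024 purely for illustration: group_capacity is cpu_power rounded to whole nominal CPUs, and group_has_capacity is set when the group runs fewer tasks than that.

#include <stdio.h>

#define SCHED_LOAD_SCALE        1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + ((d) / 2)) / (d))

int main(void)
{
        unsigned long cpu_power = 2 * SCHED_LOAD_SCALE - 100; /* roughly two CPUs' worth */
        unsigned long sum_nr_running = 1;

        unsigned long group_capacity = DIV_ROUND_CLOSEST(cpu_power, SCHED_LOAD_SCALE);
        int group_has_capacity = group_capacity > sum_nr_running;

        /* prints capacity=2 has_capacity=1: one task leaves room in a ~2-CPU group */
        printf("capacity=%lu has_capacity=%d\n", group_capacity, group_has_capacity);
        return 0;
}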
2462/** 2477/**
@@ -2545,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2545 /* 2560 /*
2546 * In case the child domain prefers tasks go to siblings 2561 * In case the child domain prefers tasks go to siblings
2547 * first, lower the sg capacity to one so that we'll try 2562 * first, lower the sg capacity to one so that we'll try
2548 * and move all the excess tasks away. 2563 * and move all the excess tasks away. We lower the capacity
2564 * of a group only if the local group has the capacity to fit
2565 * these excess tasks, i.e. nr_running < group_capacity. The
2566 * extra check prevents the case where you always pull from the
2567 * heaviest group when it is already under-utilized (possible
2568 * when a large weight task outweighs the tasks on the system).
2549 */ 2569 */
2550 if (prefer_sibling) 2570 if (prefer_sibling && !local_group && sds->this_has_capacity)
2551 sgs.group_capacity = min(sgs.group_capacity, 1UL); 2571 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2552 2572
2553 if (local_group) { 2573 if (local_group) {
@@ -2555,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2555 sds->this = sg; 2575 sds->this = sg;
2556 sds->this_nr_running = sgs.sum_nr_running; 2576 sds->this_nr_running = sgs.sum_nr_running;
2557 sds->this_load_per_task = sgs.sum_weighted_load; 2577 sds->this_load_per_task = sgs.sum_weighted_load;
2578 sds->this_has_capacity = sgs.group_has_capacity;
2558 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 2579 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2559 sds->max_load = sgs.avg_load; 2580 sds->max_load = sgs.avg_load;
2560 sds->busiest = sg; 2581 sds->busiest = sg;
2561 sds->busiest_nr_running = sgs.sum_nr_running; 2582 sds->busiest_nr_running = sgs.sum_nr_running;
2562 sds->busiest_group_capacity = sgs.group_capacity; 2583 sds->busiest_group_capacity = sgs.group_capacity;
2563 sds->busiest_load_per_task = sgs.sum_weighted_load; 2584 sds->busiest_load_per_task = sgs.sum_weighted_load;
2585 sds->busiest_has_capacity = sgs.group_has_capacity;
2564 sds->group_imb = sgs.group_imb; 2586 sds->group_imb = sgs.group_imb;
2565 } 2587 }
2566 2588
@@ -2757,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2757 return fix_small_imbalance(sds, this_cpu, imbalance); 2779 return fix_small_imbalance(sds, this_cpu, imbalance);
2758 2780
2759} 2781}
2782
2760/******* find_busiest_group() helpers end here *********************/ 2783/******* find_busiest_group() helpers end here *********************/
2761 2784
2762/** 2785/**
@@ -2808,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2808 * 4) This group is more busy than the avg busyness at this 2831 * 4) This group is more busy than the avg busyness at this
2809 * sched_domain. 2832 * sched_domain.
2810 * 5) The imbalance is within the specified limit. 2833 * 5) The imbalance is within the specified limit.
2834 *
2835 * Note: when doing newidle balance, if the local group has excess
2836 * capacity (i.e. nr_running < group_capacity) and the busiest group
2837 * does not have any capacity, we force a load balance to pull tasks
2838 * to the local group. In this case, we skip past checks 3, 4 and 5.
2811 */ 2839 */
2812 if (!(*balance)) 2840 if (!(*balance))
2813 goto ret; 2841 goto ret;
@@ -2819,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2819 if (!sds.busiest || sds.busiest_nr_running == 0) 2847 if (!sds.busiest || sds.busiest_nr_running == 0)
2820 goto out_balanced; 2848 goto out_balanced;
2821 2849
2850 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
2851 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
2852 !sds.busiest_has_capacity)
2853 goto force_balance;
2854
2822 if (sds.this_load >= sds.max_load) 2855 if (sds.this_load >= sds.max_load)
2823 goto out_balanced; 2856 goto out_balanced;
2824 2857
@@ -2830,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2830 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 2863 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2831 goto out_balanced; 2864 goto out_balanced;
2832 2865
2866force_balance:
2833 /* Looks like there is an imbalance. Compute it */ 2867 /* Looks like there is an imbalance. Compute it */
2834 calculate_imbalance(&sds, this_cpu, imbalance); 2868 calculate_imbalance(&sds, this_cpu, imbalance);
2835 return sds.busiest; 2869 return sds.busiest;
@@ -3034,7 +3068,14 @@ redo:
3034 3068
3035 if (!ld_moved) { 3069 if (!ld_moved) {
3036 schedstat_inc(sd, lb_failed[idle]); 3070 schedstat_inc(sd, lb_failed[idle]);
3037 sd->nr_balance_failed++; 3071 /*
3072 * Increment the failure counter only on periodic balance.
3073 * We do not want newidle balance, which can be very
3074 * frequent, to pollute the failure counter, causing
3075 * excessive cache_hot migrations and active balances.
3076 */
3077 if (idle != CPU_NEWLY_IDLE)
3078 sd->nr_balance_failed++;
3038 3079
3039 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), 3080 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
3040 this_cpu)) { 3081 this_cpu)) {
@@ -3156,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3156 interval = msecs_to_jiffies(sd->balance_interval); 3197 interval = msecs_to_jiffies(sd->balance_interval);
3157 if (time_after(next_balance, sd->last_balance + interval)) 3198 if (time_after(next_balance, sd->last_balance + interval))
3158 next_balance = sd->last_balance + interval; 3199 next_balance = sd->last_balance + interval;
3159 if (pulled_task) { 3200 if (pulled_task)
3160 this_rq->idle_stamp = 0;
3161 break; 3201 break;
3162 }
3163 } 3202 }
3164 3203
3165 raw_spin_lock(&this_rq->lock); 3204 raw_spin_lock(&this_rq->lock);
@@ -3633,7 +3672,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
3633 if (time_before(now, nohz.next_balance)) 3672 if (time_before(now, nohz.next_balance))
3634 return 0; 3673 return 0;
3635 3674
3636 if (!rq->nr_running) 3675 if (rq->idle_at_tick)
3637 return 0; 3676 return 0;
3638 3677
3639 first_pick_cpu = atomic_read(&nohz.first_pick_cpu); 3678 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
@@ -3754,8 +3793,11 @@ static void task_fork_fair(struct task_struct *p)
3754 3793
3755 update_rq_clock(rq); 3794 update_rq_clock(rq);
3756 3795
3757 if (unlikely(task_cpu(p) != this_cpu)) 3796 if (unlikely(task_cpu(p) != this_cpu)) {
3797 rcu_read_lock();
3758 __set_task_cpu(p, this_cpu); 3798 __set_task_cpu(p, this_cpu);
3799 rcu_read_unlock();
3800 }
3759 3801
3760 update_curr(cfs_rq); 3802 update_curr(cfs_rq);
3761 3803
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 83c66e8ad3ee..185f920ec1a2 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
61 * release the lock. Decreases scheduling overhead. 61 * release the lock. Decreases scheduling overhead.
62 */ 62 */
63SCHED_FEAT(OWNER_SPIN, 1) 63SCHED_FEAT(OWNER_SPIN, 1)
64
65/*
66 * Decrement CPU power based on irq activity
67 */
68SCHED_FEAT(NONIRQ_POWER, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d10c80ebb67a..bea7d79f7e9c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq)
609 if (!task_has_rt_policy(curr)) 609 if (!task_has_rt_policy(curr))
610 return; 610 return;
611 611
612 delta_exec = rq->clock - curr->se.exec_start; 612 delta_exec = rq->clock_task - curr->se.exec_start;
613 if (unlikely((s64)delta_exec < 0)) 613 if (unlikely((s64)delta_exec < 0))
614 delta_exec = 0; 614 delta_exec = 0;
615 615
@@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq)
618 curr->se.sum_exec_runtime += delta_exec; 618 curr->se.sum_exec_runtime += delta_exec;
619 account_group_exec_runtime(curr, delta_exec); 619 account_group_exec_runtime(curr, delta_exec);
620 620
621 curr->se.exec_start = rq->clock; 621 curr->se.exec_start = rq->clock_task;
622 cpuacct_charge(curr, delta_exec); 622 cpuacct_charge(curr, delta_exec);
623 623
624 sched_rt_avg_update(rq, delta_exec); 624 sched_rt_avg_update(rq, delta_exec);
@@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
960 * runqueue. Otherwise simply start this RT task 960 * runqueue. Otherwise simply start this RT task
961 * on its current runqueue. 961 * on its current runqueue.
962 * 962 *
963 * We want to avoid overloading runqueues. Even if 963 * We want to avoid overloading runqueues. If the woken
964 * the RT task is of higher priority than the current RT task. 964 * task is a higher priority, then it will stay on this CPU
965 * RT tasks behave differently than other tasks. If 965 * and the lower prio task should be moved to another CPU.
966 * one gets preempted, we try to push it off to another queue. 966 * Even though this will probably make the lower prio task
967 * So trying to keep a preempting RT task on the same 967 * lose its cache, we do not want to bounce a higher task
968 * cache hot CPU will force the running RT task to 968 * around just because it gave up its CPU, perhaps for a
969 * a cold CPU. So we waste all the cache for the lower 969 * lock?
970 * RT task in hopes of saving some of a RT task 970 *
971 * that is just being woken and probably will have 971 * For equal prio tasks, we just let the scheduler sort it out.
972 * cold cache anyway.
973 */ 972 */
974 if (unlikely(rt_task(rq->curr)) && 973 if (unlikely(rt_task(rq->curr)) &&
974 (rq->curr->rt.nr_cpus_allowed < 2 ||
975 rq->curr->prio < p->prio) &&
975 (p->rt.nr_cpus_allowed > 1)) { 976 (p->rt.nr_cpus_allowed > 1)) {
976 int cpu = find_lowest_rq(p); 977 int cpu = find_lowest_rq(p);
977 978
@@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1074 } while (rt_rq); 1075 } while (rt_rq);
1075 1076
1076 p = rt_task_of(rt_se); 1077 p = rt_task_of(rt_se);
1077 p->se.exec_start = rq->clock; 1078 p->se.exec_start = rq->clock_task;
1078 1079
1079 return p; 1080 return p;
1080} 1081}
@@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1139 for_each_leaf_rt_rq(rt_rq, rq) { 1140 for_each_leaf_rt_rq(rt_rq, rq) {
1140 array = &rt_rq->active; 1141 array = &rt_rq->active;
1141 idx = sched_find_first_bit(array->bitmap); 1142 idx = sched_find_first_bit(array->bitmap);
1142 next_idx: 1143next_idx:
1143 if (idx >= MAX_RT_PRIO) 1144 if (idx >= MAX_RT_PRIO)
1144 continue; 1145 continue;
1145 if (next && next->prio < idx) 1146 if (next && next->prio < idx)
@@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq)
1315 if (!next_task) 1316 if (!next_task)
1316 return 0; 1317 return 0;
1317 1318
1318 retry: 1319retry:
1319 if (unlikely(next_task == rq->curr)) { 1320 if (unlikely(next_task == rq->curr)) {
1320 WARN_ON(1); 1321 WARN_ON(1);
1321 return 0; 1322 return 0;
@@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq)
1463 * but possible) 1464 * but possible)
1464 */ 1465 */
1465 } 1466 }
1466 skip: 1467skip:
1467 double_unlock_balance(this_rq, src_rq); 1468 double_unlock_balance(this_rq, src_rq);
1468 } 1469 }
1469 1470
@@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1491 if (!task_running(rq, p) && 1492 if (!task_running(rq, p) &&
1492 !test_tsk_need_resched(rq->curr) && 1493 !test_tsk_need_resched(rq->curr) &&
1493 has_pushable_tasks(rq) && 1494 has_pushable_tasks(rq) &&
1494 p->rt.nr_cpus_allowed > 1) 1495 p->rt.nr_cpus_allowed > 1 &&
1496 rt_task(rq->curr) &&
1497 (rq->curr->rt.nr_cpus_allowed < 2 ||
1498 rq->curr->prio < p->prio))
1495 push_rt_tasks(rq); 1499 push_rt_tasks(rq);
1496} 1500}
1497 1501
@@ -1709,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
1709{ 1713{
1710 struct task_struct *p = rq->curr; 1714 struct task_struct *p = rq->curr;
1711 1715
1712 p->se.exec_start = rq->clock; 1716 p->se.exec_start = rq->clock_task;
1713 1717
1714 /* The running task is never eligible for pushing */ 1718 /* The running task is never eligible for pushing */
1715 dequeue_pushable_task(rq, p); 1719 dequeue_pushable_task(rq, p);
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
new file mode 100644
index 000000000000..45bddc0c1048
--- /dev/null
+++ b/kernel/sched_stoptask.c
@@ -0,0 +1,108 @@
1/*
2 * stop-task scheduling class.
3 *
4 * The stop task is the highest priority task in the system, it preempts
5 * everything and will be preempted by nothing.
6 *
7 * See kernel/stop_machine.c
8 */
9
10#ifdef CONFIG_SMP
11static int
12select_task_rq_stop(struct rq *rq, struct task_struct *p,
13 int sd_flag, int flags)
14{
15 return task_cpu(p); /* stop tasks never migrate */
16}
17#endif /* CONFIG_SMP */
18
19static void
20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
21{
22 resched_task(rq->curr); /* we preempt everything */
23}
24
25static struct task_struct *pick_next_task_stop(struct rq *rq)
26{
27 struct task_struct *stop = rq->stop;
28
29 if (stop && stop->state == TASK_RUNNING)
30 return stop;
31
32 return NULL;
33}
34
35static void
36enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
37{
38}
39
40static void
41dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
42{
43}
44
45static void yield_task_stop(struct rq *rq)
46{
47 BUG(); /* the stop task should never yield, it's pointless. */
48}
49
50static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
51{
52}
53
54static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
55{
56}
57
58static void set_curr_task_stop(struct rq *rq)
59{
60}
61
62static void switched_to_stop(struct rq *rq, struct task_struct *p,
63 int running)
64{
65 BUG(); /* it's impossible to change to this class */
66}
67
68static void prio_changed_stop(struct rq *rq, struct task_struct *p,
69 int oldprio, int running)
70{
71 BUG(); /* how!?, what priority? */
72}
73
74static unsigned int
75get_rr_interval_stop(struct rq *rq, struct task_struct *task)
76{
77 return 0;
78}
79
80/*
81 * Simple, special scheduling class for the per-CPU stop tasks:
82 */
83static const struct sched_class stop_sched_class = {
84 .next = &rt_sched_class,
85
86 .enqueue_task = enqueue_task_stop,
87 .dequeue_task = dequeue_task_stop,
88 .yield_task = yield_task_stop,
89
90 .check_preempt_curr = check_preempt_curr_stop,
91
92 .pick_next_task = pick_next_task_stop,
93 .put_prev_task = put_prev_task_stop,
94
95#ifdef CONFIG_SMP
96 .select_task_rq = select_task_rq_stop,
97#endif
98
99 .set_curr_task = set_curr_task_stop,
100 .task_tick = task_tick_stop,
101
102 .get_rr_interval = get_rr_interval_stop,
103
104 .prio_changed = prio_changed_stop,
105 .switched_to = switched_to_stop,
106
107 /* no .task_new for stop tasks */
108};
diff --git a/kernel/signal.c b/kernel/signal.c
index bded65187780..919562c3d6b7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2215,6 +2215,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2215#ifdef __ARCH_SI_TRAPNO 2215#ifdef __ARCH_SI_TRAPNO
2216 err |= __put_user(from->si_trapno, &to->si_trapno); 2216 err |= __put_user(from->si_trapno, &to->si_trapno);
2217#endif 2217#endif
2218#ifdef BUS_MCEERR_AO
2219 /*
2220 * Other callers might not initialize the si_lsb field,
2221 * so check explicitly for the right codes here.
2222 */
2223 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
2224 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
2225#endif
2218 break; 2226 break;
2219 case __SI_CHLD: 2227 case __SI_CHLD:
2220 err |= __put_user(from->si_pid, &to->si_pid); 2228 err |= __put_user(from->si_pid, &to->si_pid);
diff --git a/kernel/smp.c b/kernel/smp.c
index 75c970c715d3..ed6aacfcb7ef 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -365,9 +365,10 @@ call:
365EXPORT_SYMBOL_GPL(smp_call_function_any); 365EXPORT_SYMBOL_GPL(smp_call_function_any);
366 366
367/** 367/**
368 * __smp_call_function_single(): Run a function on another CPU 368 * __smp_call_function_single(): Run a function on a specific CPU
369 * @cpu: The CPU to run on. 369 * @cpu: The CPU to run on.
370 * @data: Pre-allocated and setup data structure 370 * @data: Pre-allocated and setup data structure
371 * @wait: If true, wait until function has completed on specified CPU.
371 * 372 *
372 * Like smp_call_function_single(), but allow caller to pass in a 373 * Like smp_call_function_single(), but allow caller to pass in a
373 * pre-allocated data structure. Useful for embedding @data inside 374 * pre-allocated data structure. Useful for embedding @data inside
@@ -376,8 +377,10 @@ EXPORT_SYMBOL_GPL(smp_call_function_any);
376void __smp_call_function_single(int cpu, struct call_single_data *data, 377void __smp_call_function_single(int cpu, struct call_single_data *data,
377 int wait) 378 int wait)
378{ 379{
379 csd_lock(data); 380 unsigned int this_cpu;
381 unsigned long flags;
380 382
383 this_cpu = get_cpu();
381 /* 384 /*
382 * Can deadlock when called with interrupts disabled. 385 * Can deadlock when called with interrupts disabled.
383 * We allow cpu's that are not yet online though, as no one else can 386 * We allow cpu's that are not yet online though, as no one else can
@@ -387,7 +390,15 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
387 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() 390 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
388 && !oops_in_progress); 391 && !oops_in_progress);
389 392
390 generic_exec_single(cpu, data, wait); 393 if (cpu == this_cpu) {
394 local_irq_save(flags);
395 data->func(data->info);
396 local_irq_restore(flags);
397 } else {
398 csd_lock(data);
399 generic_exec_single(cpu, data, wait);
400 }
401 put_cpu();
391} 402}
392 403
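A minimal caller-side sketch of the interface whose kerneldoc is touched above, assuming a pre-allocated call_single_data with its .func and .info fields filled in (both fields appear in the hunk); the callback name and the kick_cpu() wrapper are invented for illustration. With this change, a call that targets the local CPU runs the callback inline with interrupts disabled instead of going through generic_exec_single().

        static void remote_tick(void *info)
        {
                /* runs with interrupts disabled: inline if cpu is the local CPU,
                 * otherwise from IPI context on the target CPU */
        }

        static struct call_single_data tick_csd = { .func = remote_tick, .info = NULL };

        static void kick_cpu(int cpu)
        {
                __smp_call_function_single(cpu, &tick_csd, 0); /* 0: do not wait */
        }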
393/** 404/**
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 07b4f1b1a73a..79ee8f1fc0e7 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -77,11 +77,21 @@ void wakeup_softirqd(void)
77} 77}
78 78
79/* 79/*
80 * preempt_count and SOFTIRQ_OFFSET usage:
81 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
82 * softirq processing.
83 * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
84 * on local_bh_disable or local_bh_enable.
85 * This lets us distinguish between whether we are currently processing
86 * softirq and whether we just have bh disabled.
87 */
88
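A standalone userspace model of the distinction the new comment draws; the numeric offsets are illustrative, not the kernel's real preempt_count layout. Disabling bottom halves moves the count in SOFTIRQ_DISABLE_OFFSET steps and softirq processing moves it in SOFTIRQ_OFFSET steps, so the low-order part tells "currently serving a softirq" apart from "merely bh-disabled".

#include <assert.h>
#include <stdio.h>

#define SOFTIRQ_OFFSET          1U
#define SOFTIRQ_DISABLE_OFFSET  (2U * SOFTIRQ_OFFSET)

static unsigned int softirq_count;

static int serving_softirq(void) { return softirq_count & SOFTIRQ_OFFSET; }
static int bh_disabled(void)     { return softirq_count != 0; }

int main(void)
{
        softirq_count += SOFTIRQ_DISABLE_OFFSET; /* local_bh_disable()   */
        assert(bh_disabled() && !serving_softirq());

        softirq_count += SOFTIRQ_OFFSET;         /* enter __do_softirq() */
        assert(bh_disabled() && serving_softirq());

        softirq_count -= SOFTIRQ_OFFSET;         /* leave __do_softirq() */
        softirq_count -= SOFTIRQ_DISABLE_OFFSET; /* local_bh_enable()    */
        assert(!bh_disabled());
        printf("ok\n");
        return 0;
}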
89/*
80 * This one is for softirq.c-internal use, 90 * This one is for softirq.c-internal use,
81 * where hardirqs are disabled legitimately: 91 * where hardirqs are disabled legitimately:
82 */ 92 */
83#ifdef CONFIG_TRACE_IRQFLAGS 93#ifdef CONFIG_TRACE_IRQFLAGS
84static void __local_bh_disable(unsigned long ip) 94static void __local_bh_disable(unsigned long ip, unsigned int cnt)
85{ 95{
86 unsigned long flags; 96 unsigned long flags;
87 97
@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip)
95 * We must manually increment preempt_count here and manually 105 * We must manually increment preempt_count here and manually
96 * call the trace_preempt_off later. 106 * call the trace_preempt_off later.
97 */ 107 */
98 preempt_count() += SOFTIRQ_OFFSET; 108 preempt_count() += cnt;
99 /* 109 /*
100 * Were softirqs turned off above: 110 * Were softirqs turned off above:
101 */ 111 */
102 if (softirq_count() == SOFTIRQ_OFFSET) 112 if (softirq_count() == cnt)
103 trace_softirqs_off(ip); 113 trace_softirqs_off(ip);
104 raw_local_irq_restore(flags); 114 raw_local_irq_restore(flags);
105 115
106 if (preempt_count() == SOFTIRQ_OFFSET) 116 if (preempt_count() == cnt)
107 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
108} 118}
109#else /* !CONFIG_TRACE_IRQFLAGS */ 119#else /* !CONFIG_TRACE_IRQFLAGS */
110static inline void __local_bh_disable(unsigned long ip) 120static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
111{ 121{
112 add_preempt_count(SOFTIRQ_OFFSET); 122 add_preempt_count(cnt);
113 barrier(); 123 barrier();
114} 124}
115#endif /* CONFIG_TRACE_IRQFLAGS */ 125#endif /* CONFIG_TRACE_IRQFLAGS */
116 126
117void local_bh_disable(void) 127void local_bh_disable(void)
118{ 128{
119 __local_bh_disable((unsigned long)__builtin_return_address(0)); 129 __local_bh_disable((unsigned long)__builtin_return_address(0),
130 SOFTIRQ_DISABLE_OFFSET);
120} 131}
121 132
122EXPORT_SYMBOL(local_bh_disable); 133EXPORT_SYMBOL(local_bh_disable);
123 134
135static void __local_bh_enable(unsigned int cnt)
136{
137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled());
139
140 if (softirq_count() == cnt)
141 trace_softirqs_on((unsigned long)__builtin_return_address(0));
142 sub_preempt_count(cnt);
143}
144
124/* 145/*
125 * Special-case - softirqs can safely be enabled in 146 * Special-case - softirqs can safely be enabled in
126 * cond_resched_softirq(), or by __do_softirq(), 147 * cond_resched_softirq(), or by __do_softirq(),
@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable);
128 */ 149 */
129void _local_bh_enable(void) 150void _local_bh_enable(void)
130{ 151{
131 WARN_ON_ONCE(in_irq()); 152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
132 WARN_ON_ONCE(!irqs_disabled());
133
134 if (softirq_count() == SOFTIRQ_OFFSET)
135 trace_softirqs_on((unsigned long)__builtin_return_address(0));
136 sub_preempt_count(SOFTIRQ_OFFSET);
137} 153}
138 154
139EXPORT_SYMBOL(_local_bh_enable); 155EXPORT_SYMBOL(_local_bh_enable);
@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
147 /* 163 /*
148 * Are softirqs going to be turned on now: 164 * Are softirqs going to be turned on now:
149 */ 165 */
150 if (softirq_count() == SOFTIRQ_OFFSET) 166 if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
151 trace_softirqs_on(ip); 167 trace_softirqs_on(ip);
152 /* 168 /*
153 * Keep preemption disabled until we are done with 169 * Keep preemption disabled until we are done with
154 * softirq processing: 170 * softirq processing:
155 */ 171 */
156 sub_preempt_count(SOFTIRQ_OFFSET - 1); 172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
157 173
158 if (unlikely(!in_interrupt() && local_softirq_pending())) 174 if (unlikely(!in_interrupt() && local_softirq_pending()))
159 do_softirq(); 175 do_softirq();
@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void)
198 pending = local_softirq_pending(); 214 pending = local_softirq_pending();
199 account_system_vtime(current); 215 account_system_vtime(current);
200 216
201 __local_bh_disable((unsigned long)__builtin_return_address(0)); 217 __local_bh_disable((unsigned long)__builtin_return_address(0),
218 SOFTIRQ_OFFSET);
202 lockdep_softirq_enter(); 219 lockdep_softirq_enter();
203 220
204 cpu = smp_processor_id(); 221 cpu = smp_processor_id();
@@ -245,7 +262,7 @@ restart:
245 lockdep_softirq_exit(); 262 lockdep_softirq_exit();
246 263
247 account_system_vtime(current); 264 account_system_vtime(current);
248 _local_bh_enable(); 265 __local_bh_enable(SOFTIRQ_OFFSET);
249} 266}
250 267
251#ifndef __ARCH_HAS_DO_SOFTIRQ 268#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -279,10 +296,16 @@ void irq_enter(void)
279 296
280 rcu_irq_enter(); 297 rcu_irq_enter();
281 if (idle_cpu(cpu) && !in_interrupt()) { 298 if (idle_cpu(cpu) && !in_interrupt()) {
282 __irq_enter(); 299 /*
300 * Prevent raise_softirq from needlessly waking up ksoftirqd
301 * here, as softirq will be serviced on return from interrupt.
302 */
303 local_bh_disable();
283 tick_check_idle(cpu); 304 tick_check_idle(cpu);
284 } else 305 _local_bh_enable();
285 __irq_enter(); 306 }
307
308 __irq_enter();
286} 309}
287 310
288#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 311#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -696,6 +719,7 @@ static int run_ksoftirqd(void * __bind_cpu)
696{ 719{
697 set_current_state(TASK_INTERRUPTIBLE); 720 set_current_state(TASK_INTERRUPTIBLE);
698 721
722 current->flags |= PF_KSOFTIRQD;
699 while (!kthread_should_stop()) { 723 while (!kthread_should_stop()) {
700 preempt_disable(); 724 preempt_disable();
701 if (!local_softirq_pending()) { 725 if (!local_softirq_pending()) {
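
The softirq.c hunks above split the preempt-count bookkeeping: local_bh_disable() now adds SOFTIRQ_DISABLE_OFFSET while __do_softirq() adds the plain SOFTIRQ_OFFSET, which appears intended to let softirq_count() distinguish "softirqs disabled" from "softirq actually running" (with PF_KSOFTIRQD marking the ksoftirqd thread). A minimal userspace sketch of that counter model; the offset values are assumptions chosen to mirror the kernel's convention (SOFTIRQ_DISABLE_OFFSET being twice SOFTIRQ_OFFSET), not copies of the real headers:

/* Sketch of softirq preempt-count accounting (userspace model, not kernel code). */
#include <stdio.h>

#define SOFTIRQ_BITS            8
#define SOFTIRQ_SHIFT           8
#define SOFTIRQ_OFFSET          (1UL << SOFTIRQ_SHIFT)
#define SOFTIRQ_DISABLE_OFFSET  (2 * SOFTIRQ_OFFSET)
#define SOFTIRQ_MASK            (((1UL << SOFTIRQ_BITS) - 1) << SOFTIRQ_SHIFT)

static unsigned long preempt_count;

static unsigned long softirq_count(void) { return preempt_count & SOFTIRQ_MASK; }

/* local_bh_disable()/_enable() move the count by the DISABLE offset... */
static void bh_disable(void) { preempt_count += SOFTIRQ_DISABLE_OFFSET; }
static void bh_enable(void)  { preempt_count -= SOFTIRQ_DISABLE_OFFSET; }

/* ...while softirq execution uses the plain OFFSET, so "serving a softirq"
 * can be told apart from "softirqs merely disabled". */
static int serving_softirq(void)
{
    return (softirq_count() & SOFTIRQ_OFFSET) != 0;
}

int main(void)
{
    bh_disable();
    printf("bh disabled: count=%#lx serving=%d\n", softirq_count(), serving_softirq());
    bh_enable();

    preempt_count += SOFTIRQ_OFFSET;   /* model entering softirq execution */
    printf("in softirq:  count=%#lx serving=%d\n", softirq_count(), serving_softirq());
    preempt_count -= SOFTIRQ_OFFSET;
    return 0;
}
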
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2980da3fd509..c71e07500536 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
46int __init_srcu_struct(struct srcu_struct *sp, const char *name, 46int __init_srcu_struct(struct srcu_struct *sp, const char *name,
47 struct lock_class_key *key) 47 struct lock_class_key *key)
48{ 48{
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50 /* Don't re-initialize a lock while it is held. */ 49 /* Don't re-initialize a lock while it is held. */
51 debug_check_no_locks_freed((void *)sp, sizeof(*sp)); 50 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
52 lockdep_init_map(&sp->dep_map, name, key, 0); 51 lockdep_init_map(&sp->dep_map, name, key, 0);
53#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
54 return init_srcu_struct_fields(sp); 52 return init_srcu_struct_fields(sp);
55} 53}
56EXPORT_SYMBOL_GPL(__init_srcu_struct); 54EXPORT_SYMBOL_GPL(__init_srcu_struct);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 4372ccb25127..090c28812ce1 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -287,11 +287,12 @@ repeat:
287 goto repeat; 287 goto repeat;
288} 288}
289 289
290extern void sched_set_stop_task(int cpu, struct task_struct *stop);
291
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ 292/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, 293static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu) 294 unsigned long action, void *hcpu)
293{ 295{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu; 296 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 297 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct task_struct *p; 298 struct task_struct *p;
@@ -304,13 +305,13 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
304 cpu); 305 cpu);
305 if (IS_ERR(p)) 306 if (IS_ERR(p))
306 return NOTIFY_BAD; 307 return NOTIFY_BAD;
307 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
308 get_task_struct(p); 308 get_task_struct(p);
309 kthread_bind(p, cpu);
310 sched_set_stop_task(cpu, p);
309 stopper->thread = p; 311 stopper->thread = p;
310 break; 312 break;
311 313
312 case CPU_ONLINE: 314 case CPU_ONLINE:
313 kthread_bind(stopper->thread, cpu);
314 /* strictly unnecessary, as first user will wake it */ 315 /* strictly unnecessary, as first user will wake it */
315 wake_up_process(stopper->thread); 316 wake_up_process(stopper->thread);
316 /* mark enabled */ 317 /* mark enabled */
@@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
325 { 326 {
326 struct cpu_stop_work *work; 327 struct cpu_stop_work *work;
327 328
329 sched_set_stop_task(cpu, NULL);
328 /* kill the stopper */ 330 /* kill the stopper */
329 kthread_stop(stopper->thread); 331 kthread_stop(stopper->thread);
330 /* drain remaining works */ 332 /* drain remaining works */
diff --git a/kernel/sys.c b/kernel/sys.c
index e9ad44489828..7f5a0cd296a9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -931,6 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
931 pgid = pid; 931 pgid = pid;
932 if (pgid < 0) 932 if (pgid < 0)
933 return -EINVAL; 933 return -EINVAL;
934 rcu_read_lock();
934 935
935 /* From this point forward we keep holding onto the tasklist lock 936 /* From this point forward we keep holding onto the tasklist lock
936 * so that our parent does not change from under us. -DaveM 937 * so that our parent does not change from under us. -DaveM
@@ -984,6 +985,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
984out: 985out:
985 /* All paths lead to here, thus we are safe. -DaveM */ 986 /* All paths lead to here, thus we are safe. -DaveM */
986 write_unlock_irq(&tasklist_lock); 987 write_unlock_irq(&tasklist_lock);
988 rcu_read_unlock();
987 return err; 989 return err;
988} 990}
989 991
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ca38e8e3e907..3a45c224770f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1713,10 +1713,7 @@ static __init int sysctl_init(void)
1713{ 1713{
1714 sysctl_set_parent(NULL, root_table); 1714 sysctl_set_parent(NULL, root_table);
1715#ifdef CONFIG_SYSCTL_SYSCALL_CHECK 1715#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1716 { 1716 sysctl_check_table(current->nsproxy, root_table);
1717 int err;
1718 err = sysctl_check_table(current->nsproxy, root_table);
1719 }
1720#endif 1717#endif
1721 return 0; 1718 return 0;
1722} 1719}
@@ -2488,7 +2485,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2488 kbuf[left] = 0; 2485 kbuf[left] = 0;
2489 } 2486 }
2490 2487
2491 for (; left && vleft--; i++, min++, max++, first=0) { 2488 for (; left && vleft--; i++, first = 0) {
2492 unsigned long val; 2489 unsigned long val;
2493 2490
2494 if (write) { 2491 if (write) {
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 04cdcf72c827..10b90d8a03c4 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -143,15 +143,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
143 if (!table->maxlen) 143 if (!table->maxlen)
144 set_fail(&fail, table, "No maxlen"); 144 set_fail(&fail, table, "No maxlen");
145 } 145 }
146 if ((table->proc_handler == proc_doulongvec_minmax) ||
147 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
148 if (table->maxlen > sizeof (unsigned long)) {
149 if (!table->extra1)
150 set_fail(&fail, table, "No min");
151 if (!table->extra2)
152 set_fail(&fail, table, "No max");
153 }
154 }
155#ifdef CONFIG_PROC_SYSCTL 146#ifdef CONFIG_PROC_SYSCTL
156 if (table->procname && !table->proc_handler) 147 if (table->procname && !table->proc_handler)
157 set_fail(&fail, table, "No proc_handler"); 148 set_fail(&fail, table, "No proc_handler");
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 4f104515a19b..f8b11a283171 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -115,7 +115,9 @@ static int test_kprobes(void)
115 int ret; 115 int ret;
116 struct kprobe *kps[2] = {&kp, &kp2}; 116 struct kprobe *kps[2] = {&kp, &kp2};
117 117
118 kp.addr = 0; /* addr should be cleard for reusing kprobe. */ 118 /* addr and flags should be cleared for reusing kprobe. */
119 kp.addr = NULL;
120 kp.flags = 0;
119 ret = register_kprobes(kps, 2); 121 ret = register_kprobes(kps, 2);
120 if (ret < 0) { 122 if (ret < 0) {
121 printk(KERN_ERR "Kprobe smoke test failed: " 123 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -210,7 +212,9 @@ static int test_jprobes(void)
210 int ret; 212 int ret;
211 struct jprobe *jps[2] = {&jp, &jp2}; 213 struct jprobe *jps[2] = {&jp, &jp2};
212 214
213 jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ 215 /* addr and flags should be cleared for reusing kprobe. */
216 jp.kp.addr = NULL;
217 jp.kp.flags = 0;
214 ret = register_jprobes(jps, 2); 218 ret = register_jprobes(jps, 2);
215 if (ret < 0) { 219 if (ret < 0) {
216 printk(KERN_ERR "Kprobe smoke test failed: " 220 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -323,7 +327,9 @@ static int test_kretprobes(void)
323 int ret; 327 int ret;
324 struct kretprobe *rps[2] = {&rp, &rp2}; 328 struct kretprobe *rps[2] = {&rp, &rp2};
325 329
326 rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ 330 /* addr and flags should be cleared for reusing kprobe. */
331 rp.kp.addr = NULL;
332 rp.kp.flags = 0;
327 ret = register_kretprobes(rps, 2); 333 ret = register_kretprobes(rps, 2);
328 if (ret < 0) { 334 if (ret < 0) {
329 printk(KERN_ERR "Kprobe smoke test failed: " 335 printk(KERN_ERR "Kprobe smoke test failed: "
diff --git a/kernel/timer.c b/kernel/timer.c
index 97bf05baade7..68a9ae7679b7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_event.h> 40#include <linux/irq_work.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43 43
@@ -1279,7 +1279,10 @@ void update_process_times(int user_tick)
1279 run_local_timers(); 1279 run_local_timers();
1280 rcu_check_callbacks(cpu, user_tick); 1280 rcu_check_callbacks(cpu, user_tick);
1281 printk_tick(); 1281 printk_tick();
1282 perf_event_do_pending(); 1282#ifdef CONFIG_IRQ_WORK
1283 if (in_irq())
1284 irq_work_run();
1285#endif
1283 scheduler_tick(); 1286 scheduler_tick();
1284 run_posix_cpu_timers(p); 1287 run_posix_cpu_timers(p);
1285} 1288}
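
update_process_times() now drains irq_work items from the timer tick (only when running in hard interrupt context) instead of calling perf_event_do_pending(). The snippet below is a rough single-threaded model of such a deferred-work queue; the real kernel/irq_work.c uses lock-free per-cpu lists and architecture hooks that are deliberately not reproduced here.

#include <stdio.h>
#include <stddef.h>

/* Toy model of an irq_work-style deferred callback queue (single threaded). */
struct irq_work {
    void (*func)(struct irq_work *);
    struct irq_work *next;
    int queued;
};

static struct irq_work *pending;

static void irq_work_queue(struct irq_work *w)
{
    if (w->queued)              /* already pending: nothing to do */
        return;
    w->queued = 1;
    w->next = pending;
    pending = w;
}

/* Called from the "tick"; the hunk above only does this when in_irq(). */
static void irq_work_run(void)
{
    struct irq_work *w = pending;

    pending = NULL;
    while (w) {
        struct irq_work *next = w->next;

        w->queued = 0;
        w->func(w);
        w = next;
    }
}

static void say_hello(struct irq_work *w)
{
    (void)w;
    printf("deferred work ran\n");
}

int main(void)
{
    struct irq_work w = { .func = say_hello };

    irq_work_queue(&w);
    irq_work_queue(&w);         /* second queue is a no-op */
    irq_work_run();             /* runs the callback exactly once */
    return 0;
}
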
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 538501c6ea50..e550d2eda1df 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS
49 help 49 help
50 See Documentation/trace/ftrace-design.txt 50 See Documentation/trace/ftrace-design.txt
51 51
52config HAVE_C_RECORDMCOUNT
53 bool
54 help
55 C version of recordmcount available?
56
52config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
53 bool 58 bool
54 59
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0d88ce9b9fb8..ebd80d50c474 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -381,12 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v)
381{ 381{
382 struct ftrace_profile *rec = v; 382 struct ftrace_profile *rec = v;
383 char str[KSYM_SYMBOL_LEN]; 383 char str[KSYM_SYMBOL_LEN];
384 int ret = 0;
384#ifdef CONFIG_FUNCTION_GRAPH_TRACER 385#ifdef CONFIG_FUNCTION_GRAPH_TRACER
385 static DEFINE_MUTEX(mutex);
386 static struct trace_seq s; 386 static struct trace_seq s;
387 unsigned long long avg; 387 unsigned long long avg;
388 unsigned long long stddev; 388 unsigned long long stddev;
389#endif 389#endif
390 mutex_lock(&ftrace_profile_lock);
391
392 /* we raced with function_profile_reset() */
393 if (unlikely(rec->counter == 0)) {
394 ret = -EBUSY;
395 goto out;
396 }
390 397
391 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 398 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
392 seq_printf(m, " %-30.30s %10lu", str, rec->counter); 399 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
@@ -408,7 +415,6 @@ static int function_stat_show(struct seq_file *m, void *v)
408 do_div(stddev, (rec->counter - 1) * 1000); 415 do_div(stddev, (rec->counter - 1) * 1000);
409 } 416 }
410 417
411 mutex_lock(&mutex);
412 trace_seq_init(&s); 418 trace_seq_init(&s);
413 trace_print_graph_duration(rec->time, &s); 419 trace_print_graph_duration(rec->time, &s);
414 trace_seq_puts(&s, " "); 420 trace_seq_puts(&s, " ");
@@ -416,11 +422,12 @@ static int function_stat_show(struct seq_file *m, void *v)
416 trace_seq_puts(&s, " "); 422 trace_seq_puts(&s, " ");
417 trace_print_graph_duration(stddev, &s); 423 trace_print_graph_duration(stddev, &s);
418 trace_print_seq(m, &s); 424 trace_print_seq(m, &s);
419 mutex_unlock(&mutex);
420#endif 425#endif
421 seq_putc(m, '\n'); 426 seq_putc(m, '\n');
427out:
428 mutex_unlock(&ftrace_profile_lock);
422 429
423 return 0; 430 return ret;
424} 431}
425 432
426static void ftrace_profile_reset(struct ftrace_profile_stat *stat) 433static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
@@ -877,10 +884,8 @@ enum {
877 FTRACE_ENABLE_CALLS = (1 << 0), 884 FTRACE_ENABLE_CALLS = (1 << 0),
878 FTRACE_DISABLE_CALLS = (1 << 1), 885 FTRACE_DISABLE_CALLS = (1 << 1),
879 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 886 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
880 FTRACE_ENABLE_MCOUNT = (1 << 3), 887 FTRACE_START_FUNC_RET = (1 << 3),
881 FTRACE_DISABLE_MCOUNT = (1 << 4), 888 FTRACE_STOP_FUNC_RET = (1 << 4),
882 FTRACE_START_FUNC_RET = (1 << 5),
883 FTRACE_STOP_FUNC_RET = (1 << 6),
884}; 889};
885 890
886static int ftrace_filtered; 891static int ftrace_filtered;
@@ -1219,8 +1224,6 @@ static void ftrace_shutdown(int command)
1219 1224
1220static void ftrace_startup_sysctl(void) 1225static void ftrace_startup_sysctl(void)
1221{ 1226{
1222 int command = FTRACE_ENABLE_MCOUNT;
1223
1224 if (unlikely(ftrace_disabled)) 1227 if (unlikely(ftrace_disabled))
1225 return; 1228 return;
1226 1229
@@ -1228,23 +1231,17 @@ static void ftrace_startup_sysctl(void)
1228 saved_ftrace_func = NULL; 1231 saved_ftrace_func = NULL;
1229 /* ftrace_start_up is true if we want ftrace running */ 1232 /* ftrace_start_up is true if we want ftrace running */
1230 if (ftrace_start_up) 1233 if (ftrace_start_up)
1231 command |= FTRACE_ENABLE_CALLS; 1234 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1232
1233 ftrace_run_update_code(command);
1234} 1235}
1235 1236
1236static void ftrace_shutdown_sysctl(void) 1237static void ftrace_shutdown_sysctl(void)
1237{ 1238{
1238 int command = FTRACE_DISABLE_MCOUNT;
1239
1240 if (unlikely(ftrace_disabled)) 1239 if (unlikely(ftrace_disabled))
1241 return; 1240 return;
1242 1241
1243 /* ftrace_start_up is true if ftrace is running */ 1242 /* ftrace_start_up is true if ftrace is running */
1244 if (ftrace_start_up) 1243 if (ftrace_start_up)
1245 command |= FTRACE_DISABLE_CALLS; 1244 ftrace_run_update_code(FTRACE_DISABLE_CALLS);
1246
1247 ftrace_run_update_code(command);
1248} 1245}
1249 1246
1250static cycle_t ftrace_update_time; 1247static cycle_t ftrace_update_time;
@@ -1361,24 +1358,29 @@ enum {
1361#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1358#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
1362 1359
1363struct ftrace_iterator { 1360struct ftrace_iterator {
1364 struct ftrace_page *pg; 1361 loff_t pos;
1365 int hidx; 1362 loff_t func_pos;
1366 int idx; 1363 struct ftrace_page *pg;
1367 unsigned flags; 1364 struct dyn_ftrace *func;
1368 struct trace_parser parser; 1365 struct ftrace_func_probe *probe;
1366 struct trace_parser parser;
1367 int hidx;
1368 int idx;
1369 unsigned flags;
1369}; 1370};
1370 1371
1371static void * 1372static void *
1372t_hash_next(struct seq_file *m, void *v, loff_t *pos) 1373t_hash_next(struct seq_file *m, loff_t *pos)
1373{ 1374{
1374 struct ftrace_iterator *iter = m->private; 1375 struct ftrace_iterator *iter = m->private;
1375 struct hlist_node *hnd = v; 1376 struct hlist_node *hnd = NULL;
1376 struct hlist_head *hhd; 1377 struct hlist_head *hhd;
1377 1378
1378 WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
1379
1380 (*pos)++; 1379 (*pos)++;
1380 iter->pos = *pos;
1381 1381
1382 if (iter->probe)
1383 hnd = &iter->probe->node;
1382 retry: 1384 retry:
1383 if (iter->hidx >= FTRACE_FUNC_HASHSIZE) 1385 if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
1384 return NULL; 1386 return NULL;
@@ -1401,7 +1403,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos)
1401 } 1403 }
1402 } 1404 }
1403 1405
1404 return hnd; 1406 if (WARN_ON_ONCE(!hnd))
1407 return NULL;
1408
1409 iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node);
1410
1411 return iter;
1405} 1412}
1406 1413
1407static void *t_hash_start(struct seq_file *m, loff_t *pos) 1414static void *t_hash_start(struct seq_file *m, loff_t *pos)
@@ -1410,26 +1417,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1410 void *p = NULL; 1417 void *p = NULL;
1411 loff_t l; 1418 loff_t l;
1412 1419
1413 if (!(iter->flags & FTRACE_ITER_HASH)) 1420 if (iter->func_pos > *pos)
1414 *pos = 0; 1421 return NULL;
1415
1416 iter->flags |= FTRACE_ITER_HASH;
1417 1422
1418 iter->hidx = 0; 1423 iter->hidx = 0;
1419 for (l = 0; l <= *pos; ) { 1424 for (l = 0; l <= (*pos - iter->func_pos); ) {
1420 p = t_hash_next(m, p, &l); 1425 p = t_hash_next(m, &l);
1421 if (!p) 1426 if (!p)
1422 break; 1427 break;
1423 } 1428 }
1424 return p; 1429 if (!p)
1430 return NULL;
1431
1432 /* Only set this if we have an item */
1433 iter->flags |= FTRACE_ITER_HASH;
1434
1435 return iter;
1425} 1436}
1426 1437
1427static int t_hash_show(struct seq_file *m, void *v) 1438static int
1439t_hash_show(struct seq_file *m, struct ftrace_iterator *iter)
1428{ 1440{
1429 struct ftrace_func_probe *rec; 1441 struct ftrace_func_probe *rec;
1430 struct hlist_node *hnd = v;
1431 1442
1432 rec = hlist_entry(hnd, struct ftrace_func_probe, node); 1443 rec = iter->probe;
1444 if (WARN_ON_ONCE(!rec))
1445 return -EIO;
1433 1446
1434 if (rec->ops->print) 1447 if (rec->ops->print)
1435 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1448 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
@@ -1450,12 +1463,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1450 struct dyn_ftrace *rec = NULL; 1463 struct dyn_ftrace *rec = NULL;
1451 1464
1452 if (iter->flags & FTRACE_ITER_HASH) 1465 if (iter->flags & FTRACE_ITER_HASH)
1453 return t_hash_next(m, v, pos); 1466 return t_hash_next(m, pos);
1454 1467
1455 (*pos)++; 1468 (*pos)++;
1469 iter->pos = *pos;
1456 1470
1457 if (iter->flags & FTRACE_ITER_PRINTALL) 1471 if (iter->flags & FTRACE_ITER_PRINTALL)
1458 return NULL; 1472 return t_hash_start(m, pos);
1459 1473
1460 retry: 1474 retry:
1461 if (iter->idx >= iter->pg->index) { 1475 if (iter->idx >= iter->pg->index) {
@@ -1484,7 +1498,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1484 } 1498 }
1485 } 1499 }
1486 1500
1487 return rec; 1501 if (!rec)
1502 return t_hash_start(m, pos);
1503
1504 iter->func_pos = *pos;
1505 iter->func = rec;
1506
1507 return iter;
1508}
1509
1510static void reset_iter_read(struct ftrace_iterator *iter)
1511{
1512 iter->pos = 0;
1513 iter->func_pos = 0;
1514 iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH);
1488} 1515}
1489 1516
1490static void *t_start(struct seq_file *m, loff_t *pos) 1517static void *t_start(struct seq_file *m, loff_t *pos)
@@ -1495,6 +1522,12 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1495 1522
1496 mutex_lock(&ftrace_lock); 1523 mutex_lock(&ftrace_lock);
1497 /* 1524 /*
1525 * If an lseek was done, then reset and start from beginning.
1526 */
1527 if (*pos < iter->pos)
1528 reset_iter_read(iter);
1529
1530 /*
1498 * For set_ftrace_filter reading, if we have the filter 1531 * For set_ftrace_filter reading, if we have the filter
1499 * off, we can short cut and just print out that all 1532 * off, we can short cut and just print out that all
1500 * functions are enabled. 1533 * functions are enabled.
@@ -1503,12 +1536,19 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1503 if (*pos > 0) 1536 if (*pos > 0)
1504 return t_hash_start(m, pos); 1537 return t_hash_start(m, pos);
1505 iter->flags |= FTRACE_ITER_PRINTALL; 1538 iter->flags |= FTRACE_ITER_PRINTALL;
1539 /* reset in case of seek/pread */
1540 iter->flags &= ~FTRACE_ITER_HASH;
1506 return iter; 1541 return iter;
1507 } 1542 }
1508 1543
1509 if (iter->flags & FTRACE_ITER_HASH) 1544 if (iter->flags & FTRACE_ITER_HASH)
1510 return t_hash_start(m, pos); 1545 return t_hash_start(m, pos);
1511 1546
1547 /*
1548 * Unfortunately, we need to restart at ftrace_pages_start
1549 * every time we let go of the ftrace_mutex. This is because
1550 * those pointers can change without the lock.
1551 */
1512 iter->pg = ftrace_pages_start; 1552 iter->pg = ftrace_pages_start;
1513 iter->idx = 0; 1553 iter->idx = 0;
1514 for (l = 0; l <= *pos; ) { 1554 for (l = 0; l <= *pos; ) {
@@ -1517,10 +1557,14 @@ static void *t_start(struct seq_file *m, loff_t *pos)
1517 break; 1557 break;
1518 } 1558 }
1519 1559
1520 if (!p && iter->flags & FTRACE_ITER_FILTER) 1560 if (!p) {
1521 return t_hash_start(m, pos); 1561 if (iter->flags & FTRACE_ITER_FILTER)
1562 return t_hash_start(m, pos);
1522 1563
1523 return p; 1564 return NULL;
1565 }
1566
1567 return iter;
1524} 1568}
1525 1569
1526static void t_stop(struct seq_file *m, void *p) 1570static void t_stop(struct seq_file *m, void *p)
@@ -1531,16 +1575,18 @@ static void t_stop(struct seq_file *m, void *p)
1531static int t_show(struct seq_file *m, void *v) 1575static int t_show(struct seq_file *m, void *v)
1532{ 1576{
1533 struct ftrace_iterator *iter = m->private; 1577 struct ftrace_iterator *iter = m->private;
1534 struct dyn_ftrace *rec = v; 1578 struct dyn_ftrace *rec;
1535 1579
1536 if (iter->flags & FTRACE_ITER_HASH) 1580 if (iter->flags & FTRACE_ITER_HASH)
1537 return t_hash_show(m, v); 1581 return t_hash_show(m, iter);
1538 1582
1539 if (iter->flags & FTRACE_ITER_PRINTALL) { 1583 if (iter->flags & FTRACE_ITER_PRINTALL) {
1540 seq_printf(m, "#### all functions enabled ####\n"); 1584 seq_printf(m, "#### all functions enabled ####\n");
1541 return 0; 1585 return 0;
1542 } 1586 }
1543 1587
1588 rec = iter->func;
1589
1544 if (!rec) 1590 if (!rec)
1545 return 0; 1591 return 0;
1546 1592
@@ -1592,8 +1638,8 @@ ftrace_failures_open(struct inode *inode, struct file *file)
1592 1638
1593 ret = ftrace_avail_open(inode, file); 1639 ret = ftrace_avail_open(inode, file);
1594 if (!ret) { 1640 if (!ret) {
1595 m = (struct seq_file *)file->private_data; 1641 m = file->private_data;
1596 iter = (struct ftrace_iterator *)m->private; 1642 iter = m->private;
1597 iter->flags = FTRACE_ITER_FAILURES; 1643 iter->flags = FTRACE_ITER_FAILURES;
1598 } 1644 }
1599 1645
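
The reworked ftrace_iterator keeps pos and func_pos so t_next() can hand off from the plain function entries to the hash (probe) entries, and t_start() can reset cleanly after a backwards seek or pread. A rough userspace sketch of that two-phase iterator follows; the small "funcs" and "probes" arrays are stand-ins for the real ftrace pages and probe hash, so the structure rather than the names is the point.

#include <stdio.h>

static const char *funcs[]  = { "do_fork", "schedule", "vfs_read" };
static const char *probes[] = { "do_fork:traceon", "vfs_read:dump" };

enum { N_FUNCS = 3, N_PROBES = 2 };

struct iter {
    long pos;        /* last absolute position handed out */
    long func_pos;   /* absolute position where the function entries ended */
    int  in_hash;    /* switched over to probe (hash) entries? */
};

/* Return the entry at an absolute position, crossing from funcs to probes. */
static const char *iter_get(struct iter *it, long pos)
{
    it->pos = pos;

    if (!it->in_hash) {
        if (pos < N_FUNCS)
            return funcs[pos];
        it->in_hash = 1;
        it->func_pos = N_FUNCS;     /* remember where the hand-off happened */
    }
    if (pos - it->func_pos < N_PROBES)
        return probes[pos - it->func_pos];
    return NULL;
}

/* seq_file-style start: drop cached state if the caller seeked backwards. */
static void iter_seek(struct iter *it, long pos)
{
    if (pos < it->pos) {
        it->pos = 0;
        it->func_pos = 0;
        it->in_hash = 0;
    }
}

int main(void)
{
    struct iter it = { 0 };
    const char *e;
    long pos;

    for (pos = 0; (e = iter_get(&it, pos)) != NULL; pos++)
        printf("%ld: %s\n", pos, e);

    iter_seek(&it, 0);                       /* simulate a pread() rewind */
    printf("again: %s\n", iter_get(&it, 0)); /* starts over at the first func */
    return 0;
}
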
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 19cccc3c3028..c5a632a669e1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -405,7 +405,7 @@ static inline int test_time_stamp(u64 delta)
405#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) 405#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
406 406
407/* Max number of timestamps that can fit on a page */ 407/* Max number of timestamps that can fit on a page */
408#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) 408#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
409 409
410int ring_buffer_print_page_header(struct trace_seq *s) 410int ring_buffer_print_page_header(struct trace_seq *s)
411{ 411{
@@ -2606,6 +2606,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2606} 2606}
2607EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); 2607EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2608 2608
2609/*
2610 * The total entries in the ring buffer is the running counter
2611 * of entries entered into the ring buffer, minus the sum of
2612 * the entries read from the ring buffer and the number of
2613 * entries that were overwritten.
2614 */
2615static inline unsigned long
2616rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2617{
2618 return local_read(&cpu_buffer->entries) -
2619 (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
2620}
2621
2609/** 2622/**
2610 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 2623 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2611 * @buffer: The ring buffer 2624 * @buffer: The ring buffer
@@ -2614,16 +2627,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2614unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) 2627unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
2615{ 2628{
2616 struct ring_buffer_per_cpu *cpu_buffer; 2629 struct ring_buffer_per_cpu *cpu_buffer;
2617 unsigned long ret;
2618 2630
2619 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2631 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2620 return 0; 2632 return 0;
2621 2633
2622 cpu_buffer = buffer->buffers[cpu]; 2634 cpu_buffer = buffer->buffers[cpu];
2623 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
2624 - cpu_buffer->read;
2625 2635
2626 return ret; 2636 return rb_num_of_entries(cpu_buffer);
2627} 2637}
2628EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 2638EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
2629 2639
@@ -2684,8 +2694,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2684 /* if you care about this being correct, lock the buffer */ 2694 /* if you care about this being correct, lock the buffer */
2685 for_each_buffer_cpu(buffer, cpu) { 2695 for_each_buffer_cpu(buffer, cpu) {
2686 cpu_buffer = buffer->buffers[cpu]; 2696 cpu_buffer = buffer->buffers[cpu];
2687 entries += (local_read(&cpu_buffer->entries) - 2697 entries += rb_num_of_entries(cpu_buffer);
2688 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2689 } 2698 }
2690 2699
2691 return entries; 2700 return entries;
@@ -2985,13 +2994,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
2985 2994
2986static void rb_advance_iter(struct ring_buffer_iter *iter) 2995static void rb_advance_iter(struct ring_buffer_iter *iter)
2987{ 2996{
2988 struct ring_buffer *buffer;
2989 struct ring_buffer_per_cpu *cpu_buffer; 2997 struct ring_buffer_per_cpu *cpu_buffer;
2990 struct ring_buffer_event *event; 2998 struct ring_buffer_event *event;
2991 unsigned length; 2999 unsigned length;
2992 3000
2993 cpu_buffer = iter->cpu_buffer; 3001 cpu_buffer = iter->cpu_buffer;
2994 buffer = cpu_buffer->buffer;
2995 3002
2996 /* 3003 /*
2997 * Check if we are at the end of the buffer. 3004 * Check if we are at the end of the buffer.
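
rb_num_of_entries() centralizes the arithmetic the two call sites used to open-code: live entries = entries written - (entries overwritten + entries read). A tiny standalone check of that formula with made-up counter values:

#include <stdio.h>

/* entries written, overwritten (overrun), and consumed (read); values invented */
struct cpu_buffer { unsigned long entries, overrun, read; };

static unsigned long rb_num_of_entries(const struct cpu_buffer *b)
{
    return b->entries - (b->overrun + b->read);
}

int main(void)
{
    struct cpu_buffer b = { .entries = 1000, .overrun = 120, .read = 300 };

    /* 1000 written - (120 lost + 300 consumed) = 580 still in the buffer */
    printf("entries left: %lu\n", rb_num_of_entries(&b));
    return 0;
}
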
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9ec59f541156..001bcd2ccf4a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2196,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2196 2196
2197static int tracing_release(struct inode *inode, struct file *file) 2197static int tracing_release(struct inode *inode, struct file *file)
2198{ 2198{
2199 struct seq_file *m = (struct seq_file *)file->private_data; 2199 struct seq_file *m = file->private_data;
2200 struct trace_iterator *iter; 2200 struct trace_iterator *iter;
2201 int cpu; 2201 int cpu;
2202 2202
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d39b3c5454a5..9021f8c0c0c3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr,
343 unsigned long ip, 343 unsigned long ip,
344 unsigned long parent_ip, 344 unsigned long parent_ip,
345 unsigned long flags, int pc); 345 unsigned long flags, int pc);
346void trace_graph_function(struct trace_array *tr,
347 unsigned long ip,
348 unsigned long parent_ip,
349 unsigned long flags, int pc);
346void trace_default_header(struct seq_file *m); 350void trace_default_header(struct seq_file *m);
347void print_trace_header(struct seq_file *m, struct trace_iterator *iter); 351void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
348int trace_empty(struct trace_iterator *iter); 352int trace_empty(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 000e6e85b445..39c059ca670e 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,7 +9,7 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12static char *perf_trace_buf[4]; 12static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
13 13
14/* 14/*
15 * Force it to be aligned to unsigned long to avoid misaligned accesses 15 * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -24,7 +24,7 @@ static int total_ref_count;
24static int perf_trace_event_init(struct ftrace_event_call *tp_event, 24static int perf_trace_event_init(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 25 struct perf_event *p_event)
26{ 26{
27 struct hlist_head *list; 27 struct hlist_head __percpu *list;
28 int ret = -ENOMEM; 28 int ret = -ENOMEM;
29 int cpu; 29 int cpu;
30 30
@@ -42,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
42 tp_event->perf_events = list; 42 tp_event->perf_events = list;
43 43
44 if (!total_ref_count) { 44 if (!total_ref_count) {
45 char *buf; 45 char __percpu *buf;
46 int i; 46 int i;
47 47
48 for (i = 0; i < 4; i++) { 48 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
49 buf = (char *)alloc_percpu(perf_trace_t); 49 buf = (char __percpu *)alloc_percpu(perf_trace_t);
50 if (!buf) 50 if (!buf)
51 goto fail; 51 goto fail;
52 52
@@ -65,7 +65,7 @@ fail:
65 if (!total_ref_count) { 65 if (!total_ref_count) {
66 int i; 66 int i;
67 67
68 for (i = 0; i < 4; i++) { 68 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
69 free_percpu(perf_trace_buf[i]); 69 free_percpu(perf_trace_buf[i]);
70 perf_trace_buf[i] = NULL; 70 perf_trace_buf[i] = NULL;
71 } 71 }
@@ -91,6 +91,8 @@ int perf_trace_init(struct perf_event *p_event)
91 tp_event->class && tp_event->class->reg && 91 tp_event->class && tp_event->class->reg &&
92 try_module_get(tp_event->mod)) { 92 try_module_get(tp_event->mod)) {
93 ret = perf_trace_event_init(tp_event, p_event); 93 ret = perf_trace_event_init(tp_event, p_event);
94 if (ret)
95 module_put(tp_event->mod);
94 break; 96 break;
95 } 97 }
96 } 98 }
@@ -99,22 +101,26 @@ int perf_trace_init(struct perf_event *p_event)
99 return ret; 101 return ret;
100} 102}
101 103
102int perf_trace_enable(struct perf_event *p_event) 104int perf_trace_add(struct perf_event *p_event, int flags)
103{ 105{
104 struct ftrace_event_call *tp_event = p_event->tp_event; 106 struct ftrace_event_call *tp_event = p_event->tp_event;
107 struct hlist_head __percpu *pcpu_list;
105 struct hlist_head *list; 108 struct hlist_head *list;
106 109
107 list = tp_event->perf_events; 110 pcpu_list = tp_event->perf_events;
108 if (WARN_ON_ONCE(!list)) 111 if (WARN_ON_ONCE(!pcpu_list))
109 return -EINVAL; 112 return -EINVAL;
110 113
111 list = this_cpu_ptr(list); 114 if (!(flags & PERF_EF_START))
115 p_event->hw.state = PERF_HES_STOPPED;
116
117 list = this_cpu_ptr(pcpu_list);
112 hlist_add_head_rcu(&p_event->hlist_entry, list); 118 hlist_add_head_rcu(&p_event->hlist_entry, list);
113 119
114 return 0; 120 return 0;
115} 121}
116 122
117void perf_trace_disable(struct perf_event *p_event) 123void perf_trace_del(struct perf_event *p_event, int flags)
118{ 124{
119 hlist_del_rcu(&p_event->hlist_entry); 125 hlist_del_rcu(&p_event->hlist_entry);
120} 126}
@@ -140,12 +146,13 @@ void perf_trace_destroy(struct perf_event *p_event)
140 tp_event->perf_events = NULL; 146 tp_event->perf_events = NULL;
141 147
142 if (!--total_ref_count) { 148 if (!--total_ref_count) {
143 for (i = 0; i < 4; i++) { 149 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
144 free_percpu(perf_trace_buf[i]); 150 free_percpu(perf_trace_buf[i]);
145 perf_trace_buf[i] = NULL; 151 perf_trace_buf[i] = NULL;
146 } 152 }
147 } 153 }
148out: 154out:
155 module_put(tp_event->mod);
149 mutex_unlock(&event_mutex); 156 mutex_unlock(&event_mutex);
150} 157}
151 158
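
perf_trace_buf grows from a hard-coded 4 slots to PERF_NR_CONTEXTS per-cpu buffers shared through total_ref_count, and the init error path now drops the module reference it took. The fragment below sketches only the allocate-all-or-unwind refcount pattern under simplified assumptions (plain malloc, a single global counter); it is not the kernel allocator.

#include <stdio.h>
#include <stdlib.h>

#define NR_CONTEXTS 4           /* one buffer per recursion context */

static char *trace_buf[NR_CONTEXTS];
static int total_ref_count;

/* Allocate the shared buffers on first use; unwind everything on failure. */
static int get_trace_buffers(size_t size)
{
    int i;

    if (total_ref_count++)
        return 0;               /* already allocated by an earlier user */

    for (i = 0; i < NR_CONTEXTS; i++) {
        trace_buf[i] = malloc(size);
        if (!trace_buf[i])
            goto fail;
    }
    return 0;

fail:
    while (--i >= 0) {
        free(trace_buf[i]);
        trace_buf[i] = NULL;
    }
    total_ref_count--;
    return -1;
}

static void put_trace_buffers(void)
{
    int i;

    if (--total_ref_count)
        return;                 /* other users still need the buffers */
    for (i = 0; i < NR_CONTEXTS; i++) {
        free(trace_buf[i]);
        trace_buf[i] = NULL;
    }
}

int main(void)
{
    if (get_trace_buffers(8192) == 0) {
        printf("buffers ready, refcount=%d\n", total_ref_count);
        put_trace_buffers();
    }
    return 0;
}
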
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4c758f146328..398c0e8b332c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -600,21 +600,29 @@ out:
600 600
601enum { 601enum {
602 FORMAT_HEADER = 1, 602 FORMAT_HEADER = 1,
603 FORMAT_PRINTFMT = 2, 603 FORMAT_FIELD_SEPERATOR = 2,
604 FORMAT_PRINTFMT = 3,
604}; 605};
605 606
606static void *f_next(struct seq_file *m, void *v, loff_t *pos) 607static void *f_next(struct seq_file *m, void *v, loff_t *pos)
607{ 608{
608 struct ftrace_event_call *call = m->private; 609 struct ftrace_event_call *call = m->private;
609 struct ftrace_event_field *field; 610 struct ftrace_event_field *field;
610 struct list_head *head; 611 struct list_head *common_head = &ftrace_common_fields;
612 struct list_head *head = trace_get_fields(call);
611 613
612 (*pos)++; 614 (*pos)++;
613 615
614 switch ((unsigned long)v) { 616 switch ((unsigned long)v) {
615 case FORMAT_HEADER: 617 case FORMAT_HEADER:
616 head = &ftrace_common_fields; 618 if (unlikely(list_empty(common_head)))
619 return NULL;
620
621 field = list_entry(common_head->prev,
622 struct ftrace_event_field, link);
623 return field;
617 624
625 case FORMAT_FIELD_SEPERATOR:
618 if (unlikely(list_empty(head))) 626 if (unlikely(list_empty(head)))
619 return NULL; 627 return NULL;
620 628
@@ -626,31 +634,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
626 return NULL; 634 return NULL;
627 } 635 }
628 636
629 head = trace_get_fields(call);
630
631 /*
632 * To separate common fields from event fields, the
633 * LSB is set on the first event field. Clear it in case.
634 */
635 v = (void *)((unsigned long)v & ~1L);
636
637 field = v; 637 field = v;
638 /* 638 if (field->link.prev == common_head)
639 * If this is a common field, and at the end of the list, then 639 return (void *)FORMAT_FIELD_SEPERATOR;
640 * continue with main list. 640 else if (field->link.prev == head)
641 */
642 if (field->link.prev == &ftrace_common_fields) {
643 if (unlikely(list_empty(head)))
644 return NULL;
645 field = list_entry(head->prev, struct ftrace_event_field, link);
646 /* Set the LSB to notify f_show to print an extra newline */
647 field = (struct ftrace_event_field *)
648 ((unsigned long)field | 1);
649 return field;
650 }
651
652 /* If we are done tell f_show to print the format */
653 if (field->link.prev == head)
654 return (void *)FORMAT_PRINTFMT; 641 return (void *)FORMAT_PRINTFMT;
655 642
656 field = list_entry(field->link.prev, struct ftrace_event_field, link); 643 field = list_entry(field->link.prev, struct ftrace_event_field, link);
@@ -688,22 +675,16 @@ static int f_show(struct seq_file *m, void *v)
688 seq_printf(m, "format:\n"); 675 seq_printf(m, "format:\n");
689 return 0; 676 return 0;
690 677
678 case FORMAT_FIELD_SEPERATOR:
679 seq_putc(m, '\n');
680 return 0;
681
691 case FORMAT_PRINTFMT: 682 case FORMAT_PRINTFMT:
692 seq_printf(m, "\nprint fmt: %s\n", 683 seq_printf(m, "\nprint fmt: %s\n",
693 call->print_fmt); 684 call->print_fmt);
694 return 0; 685 return 0;
695 } 686 }
696 687
697 /*
698 * To separate common fields from event fields, the
699 * LSB is set on the first event field. Clear it and
700 * print a newline if it is set.
701 */
702 if ((unsigned long)v & 1) {
703 seq_putc(m, '\n');
704 v = (void *)((unsigned long)v & ~1L);
705 }
706
707 field = v; 688 field = v;
708 689
709 /* 690 /*
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 6f233698518e..76b05980225c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,15 +15,19 @@
15#include "trace.h" 15#include "trace.h"
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18/* When set, irq functions will be ignored */
19static int ftrace_graph_skip_irqs;
20
18struct fgraph_cpu_data { 21struct fgraph_cpu_data {
19 pid_t last_pid; 22 pid_t last_pid;
20 int depth; 23 int depth;
24 int depth_irq;
21 int ignore; 25 int ignore;
22 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; 26 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
23}; 27};
24 28
25struct fgraph_data { 29struct fgraph_data {
26 struct fgraph_cpu_data *cpu_data; 30 struct fgraph_cpu_data __percpu *cpu_data;
27 31
28 /* Place to preserve last processed entry. */ 32 /* Place to preserve last processed entry. */
29 struct ftrace_graph_ent_entry ent; 33 struct ftrace_graph_ent_entry ent;
@@ -41,6 +45,7 @@ struct fgraph_data {
41#define TRACE_GRAPH_PRINT_PROC 0x8 45#define TRACE_GRAPH_PRINT_PROC 0x8
42#define TRACE_GRAPH_PRINT_DURATION 0x10 46#define TRACE_GRAPH_PRINT_DURATION 0x10
43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40
44 49
45static struct tracer_opt trace_opts[] = { 50static struct tracer_opt trace_opts[] = {
46 /* Display overruns? (for self-debug purpose) */ 51 /* Display overruns? (for self-debug purpose) */
@@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = {
55 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, 60 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
56 /* Display absolute time of an entry */ 61 /* Display absolute time of an entry */
57 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, 62 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
63 /* Display interrupts */
64 { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
58 { } /* Empty entry */ 65 { } /* Empty entry */
59}; 66};
60 67
61static struct tracer_flags tracer_flags = { 68static struct tracer_flags tracer_flags = {
62 /* Don't display overruns and proc by default */ 69 /* Don't display overruns and proc by default */
63 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | 70 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
64 TRACE_GRAPH_PRINT_DURATION, 71 TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
65 .opts = trace_opts 72 .opts = trace_opts
66}; 73};
67 74
@@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr,
204 return 1; 211 return 1;
205} 212}
206 213
214static inline int ftrace_graph_ignore_irqs(void)
215{
216 if (!ftrace_graph_skip_irqs)
217 return 0;
218
219 return in_irq();
220}
221
207int trace_graph_entry(struct ftrace_graph_ent *trace) 222int trace_graph_entry(struct ftrace_graph_ent *trace)
208{ 223{
209 struct trace_array *tr = graph_array; 224 struct trace_array *tr = graph_array;
@@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
218 return 0; 233 return 0;
219 234
220 /* trace it when it is-nested-in or is a function enabled. */ 235 /* trace it when it is-nested-in or is a function enabled. */
221 if (!(trace->depth || ftrace_graph_addr(trace->func))) 236 if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
237 ftrace_graph_ignore_irqs())
222 return 0; 238 return 0;
223 239
224 local_irq_save(flags); 240 local_irq_save(flags);
@@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
246 return trace_graph_entry(trace); 262 return trace_graph_entry(trace);
247} 263}
248 264
265static void
266__trace_graph_function(struct trace_array *tr,
267 unsigned long ip, unsigned long flags, int pc)
268{
269 u64 time = trace_clock_local();
270 struct ftrace_graph_ent ent = {
271 .func = ip,
272 .depth = 0,
273 };
274 struct ftrace_graph_ret ret = {
275 .func = ip,
276 .depth = 0,
277 .calltime = time,
278 .rettime = time,
279 };
280
281 __trace_graph_entry(tr, &ent, flags, pc);
282 __trace_graph_return(tr, &ret, flags, pc);
283}
284
285void
286trace_graph_function(struct trace_array *tr,
287 unsigned long ip, unsigned long parent_ip,
288 unsigned long flags, int pc)
289{
290 __trace_graph_function(tr, ip, flags, pc);
291}
292
249void __trace_graph_return(struct trace_array *tr, 293void __trace_graph_return(struct trace_array *tr,
250 struct ftrace_graph_ret *trace, 294 struct ftrace_graph_ret *trace,
251 unsigned long flags, 295 unsigned long flags,
@@ -649,8 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
649 693
650 /* Print nsecs (we don't want to exceed 7 numbers) */ 694 /* Print nsecs (we don't want to exceed 7 numbers) */
651 if (len < 7) { 695 if (len < 7) {
652 snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", 696 size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
653 nsecs_rem); 697
698 snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
654 ret = trace_seq_printf(s, ".%s", nsecs_str); 699 ret = trace_seq_printf(s, ".%s", nsecs_str);
655 if (!ret) 700 if (!ret)
656 return TRACE_TYPE_PARTIAL_LINE; 701 return TRACE_TYPE_PARTIAL_LINE;
@@ -855,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
855 return 0; 900 return 0;
856} 901}
857 902
903/*
904 * Entry check for irq code
905 *
906 * returns 1 if
907 * - we are inside irq code
 908 * - we just entered irq code
909 *
 910 * returns 0 if
911 * - funcgraph-interrupts option is set
912 * - we are not inside irq code
913 */
914static int
915check_irq_entry(struct trace_iterator *iter, u32 flags,
916 unsigned long addr, int depth)
917{
918 int cpu = iter->cpu;
919 int *depth_irq;
920 struct fgraph_data *data = iter->private;
921
922 /*
923 * If we are either displaying irqs, or we got called as
924 * a graph event and private data does not exist,
925 * then we bypass the irq check.
926 */
927 if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
928 (!data))
929 return 0;
930
931 depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
932
933 /*
934 * We are inside the irq code
935 */
936 if (*depth_irq >= 0)
937 return 1;
938
939 if ((addr < (unsigned long)__irqentry_text_start) ||
940 (addr >= (unsigned long)__irqentry_text_end))
941 return 0;
942
943 /*
944 * We are entering irq code.
945 */
946 *depth_irq = depth;
947 return 1;
948}
949
950/*
951 * Return check for irq code
952 *
953 * returns 1 if
954 * - we are inside irq code
955 * - we just left irq code
956 *
957 * returns 0 if
958 * - funcgraph-interrupts option is set
959 * - we are not inside irq code
960 */
961static int
962check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
963{
964 int cpu = iter->cpu;
965 int *depth_irq;
966 struct fgraph_data *data = iter->private;
967
968 /*
969 * If we are either displaying irqs, or we got called as
970 * a graph event and private data does not exist,
971 * then we bypass the irq check.
972 */
973 if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
974 (!data))
975 return 0;
976
977 depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
978
979 /*
980 * We are not inside the irq code.
981 */
982 if (*depth_irq == -1)
983 return 0;
984
985 /*
986 * We are inside the irq code, and this is returning entry.
987 * Let's not trace it and clear the entry depth, since
988 * we are out of irq code.
989 *
990 * This condition ensures that we 'leave the irq code' once
991 * we are out of the entry depth. Thus protecting us from
992 * the RETURN entry loss.
993 */
994 if (*depth_irq >= depth) {
995 *depth_irq = -1;
996 return 1;
997 }
998
999 /*
1000 * We are inside the irq code, and this is not the entry.
1001 */
1002 return 1;
1003}
1004
858static enum print_line_t 1005static enum print_line_t
859print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 1006print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
860 struct trace_iterator *iter, u32 flags) 1007 struct trace_iterator *iter, u32 flags)
@@ -865,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
865 static enum print_line_t ret; 1012 static enum print_line_t ret;
866 int cpu = iter->cpu; 1013 int cpu = iter->cpu;
867 1014
1015 if (check_irq_entry(iter, flags, call->func, call->depth))
1016 return TRACE_TYPE_HANDLED;
1017
868 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) 1018 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
869 return TRACE_TYPE_PARTIAL_LINE; 1019 return TRACE_TYPE_PARTIAL_LINE;
870 1020
@@ -902,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
902 int ret; 1052 int ret;
903 int i; 1053 int i;
904 1054
1055 if (check_irq_return(iter, flags, trace->depth))
1056 return TRACE_TYPE_HANDLED;
1057
905 if (data) { 1058 if (data) {
906 struct fgraph_cpu_data *cpu_data; 1059 struct fgraph_cpu_data *cpu_data;
907 int cpu = iter->cpu; 1060 int cpu = iter->cpu;
@@ -1054,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1054 1207
1055 1208
1056enum print_line_t 1209enum print_line_t
1057print_graph_function_flags(struct trace_iterator *iter, u32 flags) 1210__print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1058{ 1211{
1059 struct ftrace_graph_ent_entry *field; 1212 struct ftrace_graph_ent_entry *field;
1060 struct fgraph_data *data = iter->private; 1213 struct fgraph_data *data = iter->private;
@@ -1117,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1117static enum print_line_t 1270static enum print_line_t
1118print_graph_function(struct trace_iterator *iter) 1271print_graph_function(struct trace_iterator *iter)
1119{ 1272{
1120 return print_graph_function_flags(iter, tracer_flags.val); 1273 return __print_graph_function_flags(iter, tracer_flags.val);
1274}
1275
1276enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
1277 u32 flags)
1278{
1279 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1280 flags |= TRACE_GRAPH_PRINT_DURATION;
1281 else
1282 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1283
1284 return __print_graph_function_flags(iter, flags);
1121} 1285}
1122 1286
1123static enum print_line_t 1287static enum print_line_t
@@ -1149,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
1149 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1313 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1150} 1314}
1151 1315
1152void print_graph_headers_flags(struct seq_file *s, u32 flags) 1316static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1153{ 1317{
1154 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1318 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1155 1319
@@ -1190,6 +1354,23 @@ void print_graph_headers(struct seq_file *s)
1190 print_graph_headers_flags(s, tracer_flags.val); 1354 print_graph_headers_flags(s, tracer_flags.val);
1191} 1355}
1192 1356
1357void print_graph_headers_flags(struct seq_file *s, u32 flags)
1358{
1359 struct trace_iterator *iter = s->private;
1360
1361 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
1362 /* print nothing if the buffers are empty */
1363 if (trace_empty(iter))
1364 return;
1365
1366 print_trace_header(s, iter);
1367 flags |= TRACE_GRAPH_PRINT_DURATION;
1368 } else
1369 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1370
1371 __print_graph_headers_flags(s, flags);
1372}
1373
1193void graph_trace_open(struct trace_iterator *iter) 1374void graph_trace_open(struct trace_iterator *iter)
1194{ 1375{
1195 /* pid and depth on the last trace processed */ 1376 /* pid and depth on the last trace processed */
@@ -1210,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter)
1210 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); 1391 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
1211 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 1392 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
1212 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); 1393 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
1394 int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
1395
1213 *pid = -1; 1396 *pid = -1;
1214 *depth = 0; 1397 *depth = 0;
1215 *ignore = 0; 1398 *ignore = 0;
1399 *depth_irq = -1;
1216 } 1400 }
1217 1401
1218 iter->private = data; 1402 iter->private = data;
@@ -1235,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter)
1235 } 1419 }
1236} 1420}
1237 1421
1422static int func_graph_set_flag(u32 old_flags, u32 bit, int set)
1423{
1424 if (bit == TRACE_GRAPH_PRINT_IRQS)
1425 ftrace_graph_skip_irqs = !set;
1426
1427 return 0;
1428}
1429
1238static struct trace_event_functions graph_functions = { 1430static struct trace_event_functions graph_functions = {
1239 .trace = print_graph_function_event, 1431 .trace = print_graph_function_event,
1240}; 1432};
@@ -1261,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = {
1261 .print_line = print_graph_function, 1453 .print_line = print_graph_function,
1262 .print_header = print_graph_headers, 1454 .print_header = print_graph_headers,
1263 .flags = &tracer_flags, 1455 .flags = &tracer_flags,
1456 .set_flag = func_graph_set_flag,
1264#ifdef CONFIG_FTRACE_SELFTEST 1457#ifdef CONFIG_FTRACE_SELFTEST
1265 .selftest = trace_selftest_startup_function_graph, 1458 .selftest = trace_selftest_startup_function_graph,
1266#endif 1459#endif
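
check_irq_entry()/check_irq_return() implement the new funcgraph-irqs filtering: the first function entered from the irq entry text records its depth in depth_irq, everything while that state is set is swallowed, and the matching return at or above the recorded depth clears it. Below is a self-contained model of that depth bookkeeping over a fake call stream; the in_irq_text flag stands in for the __irqentry_text_start/__irqentry_text_end address check.

#include <stdio.h>

static int depth_irq = -1;      /* -1: not currently swallowing irq code */

/* Returns 1 if this function entry should be hidden. */
static int check_irq_entry(int in_irq_text, int depth)
{
    if (depth_irq >= 0)
        return 1;               /* already inside hidden irq code */
    if (!in_irq_text)
        return 0;
    depth_irq = depth;          /* remember where the irq section started */
    return 1;
}

/* Returns 1 if this function return should be hidden. */
static int check_irq_return(int depth)
{
    if (depth_irq == -1)
        return 0;
    if (depth_irq >= depth) {   /* returning out of the irq entry point */
        depth_irq = -1;
        return 1;
    }
    return 1;                   /* a nested return inside the irq section */
}

int main(void)
{
    /* main() -> do_IRQ() [irq text] -> handler() -> back out */
    printf("enter main:    hide=%d\n", check_irq_entry(0, 0));
    printf("enter do_IRQ:  hide=%d\n", check_irq_entry(1, 1));
    printf("enter handler: hide=%d\n", check_irq_entry(0, 2));
    printf("ret   handler: hide=%d\n", check_irq_return(2));
    printf("ret   do_IRQ:  hide=%d\n", check_irq_return(1));
    printf("ret   main:    hide=%d\n", check_irq_return(0));
    return 0;
}
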
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 73a6b0601f2e..5cf8c602b880 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence;
87 87
88#ifdef CONFIG_FUNCTION_TRACER 88#ifdef CONFIG_FUNCTION_TRACER
89/* 89/*
90 * irqsoff uses its own tracer function to keep the overhead down: 90 * Prologue for the preempt and irqs off function tracers.
91 *
92 * Returns 1 if it is OK to continue, and data->disabled is
93 * incremented.
94 * 0 if the trace is to be ignored, and data->disabled
95 * is kept the same.
96 *
97 * Note, this function is also used outside this ifdef but
98 * inside the #ifdef of the function graph tracer below.
99 * This is OK, since the function graph tracer is
100 * dependent on the function tracer.
91 */ 101 */
92static void 102static int func_prolog_dec(struct trace_array *tr,
93irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) 103 struct trace_array_cpu **data,
104 unsigned long *flags)
94{ 105{
95 struct trace_array *tr = irqsoff_trace;
96 struct trace_array_cpu *data;
97 unsigned long flags;
98 long disabled; 106 long disabled;
99 int cpu; 107 int cpu;
100 108
@@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
106 */ 114 */
107 cpu = raw_smp_processor_id(); 115 cpu = raw_smp_processor_id();
108 if (likely(!per_cpu(tracing_cpu, cpu))) 116 if (likely(!per_cpu(tracing_cpu, cpu)))
109 return; 117 return 0;
110 118
111 local_save_flags(flags); 119 local_save_flags(*flags);
112 /* slight chance to get a false positive on tracing_cpu */ 120 /* slight chance to get a false positive on tracing_cpu */
113 if (!irqs_disabled_flags(flags)) 121 if (!irqs_disabled_flags(*flags))
114 return; 122 return 0;
115 123
116 data = tr->data[cpu]; 124 *data = tr->data[cpu];
117 disabled = atomic_inc_return(&data->disabled); 125 disabled = atomic_inc_return(&(*data)->disabled);
118 126
119 if (likely(disabled == 1)) 127 if (likely(disabled == 1))
120 trace_function(tr, ip, parent_ip, flags, preempt_count()); 128 return 1;
129
130 atomic_dec(&(*data)->disabled);
131
132 return 0;
133}
134
135/*
136 * irqsoff uses its own tracer function to keep the overhead down:
137 */
138static void
139irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
140{
141 struct trace_array *tr = irqsoff_trace;
142 struct trace_array_cpu *data;
143 unsigned long flags;
144
145 if (!func_prolog_dec(tr, &data, &flags))
146 return;
147
148 trace_function(tr, ip, parent_ip, flags, preempt_count());
121 149
122 atomic_dec(&data->disabled); 150 atomic_dec(&data->disabled);
123} 151}
@@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
155 struct trace_array *tr = irqsoff_trace; 183 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data; 184 struct trace_array_cpu *data;
157 unsigned long flags; 185 unsigned long flags;
158 long disabled;
159 int ret; 186 int ret;
160 int cpu;
161 int pc; 187 int pc;
162 188
163 cpu = raw_smp_processor_id(); 189 if (!func_prolog_dec(tr, &data, &flags))
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0; 190 return 0;
166 191
167 local_save_flags(flags); 192 pc = preempt_count();
168 /* slight chance to get a false positive on tracing_cpu */ 193 ret = __trace_graph_entry(tr, trace, flags, pc);
169 if (!irqs_disabled_flags(flags))
170 return 0;
171
172 data = tr->data[cpu];
173 disabled = atomic_inc_return(&data->disabled);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled); 194 atomic_dec(&data->disabled);
195
182 return ret; 196 return ret;
183} 197}
184 198
@@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
187 struct trace_array *tr = irqsoff_trace; 201 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data; 202 struct trace_array_cpu *data;
189 unsigned long flags; 203 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc; 204 int pc;
193 205
194 cpu = raw_smp_processor_id(); 206 if (!func_prolog_dec(tr, &data, &flags))
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return; 207 return;
197 208
198 local_save_flags(flags); 209 pc = preempt_count();
199 /* slight chance to get a false positive on tracing_cpu */ 210 __trace_graph_return(tr, trace, flags, pc);
200 if (!irqs_disabled_flags(flags))
201 return;
202
203 data = tr->data[cpu];
204 disabled = atomic_inc_return(&data->disabled);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled); 211 atomic_dec(&data->disabled);
212} 212}
213 213
@@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
229 229
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) 230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{ 231{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /* 232 /*
240 * In graph mode call the graph tracer output function, 233 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler 234 * otherwise go with the TRACE_FN event handler
242 */ 235 */
243 if (is_graph()) 236 if (is_graph())
244 return print_graph_function_flags(iter, flags); 237 return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
245 238
246 return TRACE_TYPE_UNHANDLED; 239 return TRACE_TYPE_UNHANDLED;
247} 240}
248 241
249static void irqsoff_print_header(struct seq_file *s) 242static void irqsoff_print_header(struct seq_file *s)
250{ 243{
251 if (is_graph()) { 244 if (is_graph())
252 struct trace_iterator *iter = s->private; 245 print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
253 u32 flags = GRAPH_TRACER_FLAGS; 246 else
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s); 247 trace_default_header(s);
268} 248}
269 249
270static void 250static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr, 251__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip, 252 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc) 253 unsigned long flags, int pc)
294{ 254{
295 if (!is_graph()) 255 if (is_graph())
256 trace_graph_function(tr, ip, parent_ip, flags, pc);
257 else
296 trace_function(tr, ip, parent_ip, flags, pc); 258 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301} 259}
302 260
303#else 261#else
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8b27c9849b42..544301d29dee 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -514,8 +514,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
514static int kretprobe_dispatcher(struct kretprobe_instance *ri, 514static int kretprobe_dispatcher(struct kretprobe_instance *ri,
515 struct pt_regs *regs); 515 struct pt_regs *regs);
516 516
517/* Check the name is good for event/group */ 517/* Check the name is good for event/group/fields */
518static int check_event_name(const char *name) 518static int is_good_name(const char *name)
519{ 519{
520 if (!isalpha(*name) && *name != '_') 520 if (!isalpha(*name) && *name != '_')
521 return 0; 521 return 0;
@@ -557,7 +557,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
557 else 557 else
558 tp->rp.kp.pre_handler = kprobe_dispatcher; 558 tp->rp.kp.pre_handler = kprobe_dispatcher;
559 559
560 if (!event || !check_event_name(event)) { 560 if (!event || !is_good_name(event)) {
561 ret = -EINVAL; 561 ret = -EINVAL;
562 goto error; 562 goto error;
563 } 563 }
@@ -567,7 +567,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
567 if (!tp->call.name) 567 if (!tp->call.name)
568 goto error; 568 goto error;
569 569
570 if (!group || !check_event_name(group)) { 570 if (!group || !is_good_name(group)) {
571 ret = -EINVAL; 571 ret = -EINVAL;
572 goto error; 572 goto error;
573 } 573 }
@@ -883,7 +883,7 @@ static int create_trace_probe(int argc, char **argv)
883 int i, ret = 0; 883 int i, ret = 0;
884 int is_return = 0, is_delete = 0; 884 int is_return = 0, is_delete = 0;
885 char *symbol = NULL, *event = NULL, *group = NULL; 885 char *symbol = NULL, *event = NULL, *group = NULL;
886 char *arg, *tmp; 886 char *arg;
887 unsigned long offset = 0; 887 unsigned long offset = 0;
888 void *addr = NULL; 888 void *addr = NULL;
889 char buf[MAX_EVENT_NAME_LEN]; 889 char buf[MAX_EVENT_NAME_LEN];
@@ -992,26 +992,36 @@ static int create_trace_probe(int argc, char **argv)
992 /* parse arguments */ 992 /* parse arguments */
993 ret = 0; 993 ret = 0;
994 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { 994 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
995 /* Increment count for freeing args in error case */
996 tp->nr_args++;
997
995 /* Parse argument name */ 998 /* Parse argument name */
996 arg = strchr(argv[i], '='); 999 arg = strchr(argv[i], '=');
997 if (arg) 1000 if (arg) {
998 *arg++ = '\0'; 1001 *arg++ = '\0';
999 else 1002 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
1003 } else {
1000 arg = argv[i]; 1004 arg = argv[i];
1005 /* If argument name is omitted, set "argN" */
1006 snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
1007 tp->args[i].name = kstrdup(buf, GFP_KERNEL);
1008 }
1001 1009
1002 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
1003 if (!tp->args[i].name) { 1010 if (!tp->args[i].name) {
1004 pr_info("Failed to allocate argument%d name '%s'.\n", 1011 pr_info("Failed to allocate argument[%d] name.\n", i);
1005 i, argv[i]);
1006 ret = -ENOMEM; 1012 ret = -ENOMEM;
1007 goto error; 1013 goto error;
1008 } 1014 }
1009 tmp = strchr(tp->args[i].name, ':'); 1015
1010 if (tmp) 1016 if (!is_good_name(tp->args[i].name)) {
1011 *tmp = '_'; /* convert : to _ */ 1017 pr_info("Invalid argument[%d] name: %s\n",
1018 i, tp->args[i].name);
1019 ret = -EINVAL;
1020 goto error;
1021 }
1012 1022
1013 if (conflict_field_name(tp->args[i].name, tp->args, i)) { 1023 if (conflict_field_name(tp->args[i].name, tp->args, i)) {
1014 pr_info("Argument%d name '%s' conflicts with " 1024 pr_info("Argument[%d] name '%s' conflicts with "
1015 "another field.\n", i, argv[i]); 1025 "another field.\n", i, argv[i]);
1016 ret = -EINVAL; 1026 ret = -EINVAL;
1017 goto error; 1027 goto error;
@@ -1020,12 +1030,9 @@ static int create_trace_probe(int argc, char **argv)
1020 /* Parse fetch argument */ 1030 /* Parse fetch argument */
1021 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); 1031 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
1022 if (ret) { 1032 if (ret) {
1023 pr_info("Parse error at argument%d. (%d)\n", i, ret); 1033 pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
1024 kfree(tp->args[i].name);
1025 goto error; 1034 goto error;
1026 } 1035 }
1027
1028 tp->nr_args++;
1029 } 1036 }
1030 1037
1031 ret = register_trace_probe(tp); 1038 ret = register_trace_probe(tp);
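
The trace_kprobe.c changes rename check_event_name() to is_good_name(), apply it to argument names as well, and bump tp->nr_args before parsing so the error path can free partially constructed arguments. Below is a standalone sketch of the name check, consistent with the leading-character test visible above; the loop over the remaining characters and the main() harness are assumptions for illustration.

#include <ctype.h>
#include <stdio.h>

/* Accept names usable as event/group/field identifiers. */
static int is_good_name(const char *name)
{
        if (!isalpha((unsigned char)*name) && *name != '_')
                return 0;
        while (*++name != '\0') {
                if (!isalnum((unsigned char)*name) && *name != '_')
                        return 0;
        }
        return 1;
}

int main(void)
{
        printf("%d\n", is_good_name("arg1"));      /* 1 */
        printf("%d\n", is_good_name("_field"));    /* 1 */
        printf("%d\n", is_good_name("bad:name"));  /* 0 */
        return 0;
}
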
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 4086eae6e81b..7319559ed59f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -31,48 +31,98 @@ static int wakeup_rt;
31static arch_spinlock_t wakeup_lock = 31static arch_spinlock_t wakeup_lock =
32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
33 33
34static void wakeup_reset(struct trace_array *tr);
34static void __wakeup_reset(struct trace_array *tr); 35static void __wakeup_reset(struct trace_array *tr);
36static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
37static void wakeup_graph_return(struct ftrace_graph_ret *trace);
35 38
36static int save_lat_flag; 39static int save_lat_flag;
37 40
41#define TRACE_DISPLAY_GRAPH 1
42
43static struct tracer_opt trace_opts[] = {
44#ifdef CONFIG_FUNCTION_GRAPH_TRACER
45 /* display latency trace as call graph */
46 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
47#endif
48 { } /* Empty entry */
49};
50
51static struct tracer_flags tracer_flags = {
52 .val = 0,
53 .opts = trace_opts,
54};
55
56#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
57
38#ifdef CONFIG_FUNCTION_TRACER 58#ifdef CONFIG_FUNCTION_TRACER
59
39/* 60/*
40 * irqsoff uses its own tracer function to keep the overhead down: 61 * Prologue for the wakeup function tracers.
62 *
63 * Returns 1 if it is OK to continue, and preemption
64 * is disabled and data->disabled is incremented.
65 * 0 if the trace is to be ignored, and preemption
66 * is not disabled and data->disabled is
67 * kept the same.
68 *
69 * Note, this function is also used outside this ifdef but
70 * inside the #ifdef of the function graph tracer below.
71 * This is OK, since the function graph tracer is
72 * dependent on the function tracer.
41 */ 73 */
42static void 74static int
43wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) 75func_prolog_preempt_disable(struct trace_array *tr,
76 struct trace_array_cpu **data,
77 int *pc)
44{ 78{
45 struct trace_array *tr = wakeup_trace;
46 struct trace_array_cpu *data;
47 unsigned long flags;
48 long disabled; 79 long disabled;
49 int cpu; 80 int cpu;
50 int pc;
51 81
52 if (likely(!wakeup_task)) 82 if (likely(!wakeup_task))
53 return; 83 return 0;
54 84
55 pc = preempt_count(); 85 *pc = preempt_count();
56 preempt_disable_notrace(); 86 preempt_disable_notrace();
57 87
58 cpu = raw_smp_processor_id(); 88 cpu = raw_smp_processor_id();
59 if (cpu != wakeup_current_cpu) 89 if (cpu != wakeup_current_cpu)
60 goto out_enable; 90 goto out_enable;
61 91
62 data = tr->data[cpu]; 92 *data = tr->data[cpu];
63 disabled = atomic_inc_return(&data->disabled); 93 disabled = atomic_inc_return(&(*data)->disabled);
64 if (unlikely(disabled != 1)) 94 if (unlikely(disabled != 1))
65 goto out; 95 goto out;
66 96
67 local_irq_save(flags); 97 return 1;
68 98
69 trace_function(tr, ip, parent_ip, flags, pc); 99out:
100 atomic_dec(&(*data)->disabled);
101
102out_enable:
103 preempt_enable_notrace();
104 return 0;
105}
70 106
107/*
108 * wakeup uses its own tracer function to keep the overhead down:
109 */
110static void
111wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
112{
113 struct trace_array *tr = wakeup_trace;
114 struct trace_array_cpu *data;
115 unsigned long flags;
116 int pc;
117
118 if (!func_prolog_preempt_disable(tr, &data, &pc))
119 return;
120
121 local_irq_save(flags);
122 trace_function(tr, ip, parent_ip, flags, pc);
71 local_irq_restore(flags); 123 local_irq_restore(flags);
72 124
73 out:
74 atomic_dec(&data->disabled); 125 atomic_dec(&data->disabled);
75 out_enable:
76 preempt_enable_notrace(); 126 preempt_enable_notrace();
77} 127}
78 128
@@ -82,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly =
82}; 132};
83#endif /* CONFIG_FUNCTION_TRACER */ 133#endif /* CONFIG_FUNCTION_TRACER */
84 134
135static int start_func_tracer(int graph)
136{
137 int ret;
138
139 if (!graph)
140 ret = register_ftrace_function(&trace_ops);
141 else
142 ret = register_ftrace_graph(&wakeup_graph_return,
143 &wakeup_graph_entry);
144
145 if (!ret && tracing_is_enabled())
146 tracer_enabled = 1;
147 else
148 tracer_enabled = 0;
149
150 return ret;
151}
152
153static void stop_func_tracer(int graph)
154{
155 tracer_enabled = 0;
156
157 if (!graph)
158 unregister_ftrace_function(&trace_ops);
159 else
160 unregister_ftrace_graph();
161}
162
163#ifdef CONFIG_FUNCTION_GRAPH_TRACER
164static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
165{
166
167 if (!(bit & TRACE_DISPLAY_GRAPH))
168 return -EINVAL;
169
170 if (!(is_graph() ^ set))
171 return 0;
172
173 stop_func_tracer(!set);
174
175 wakeup_reset(wakeup_trace);
176 tracing_max_latency = 0;
177
178 return start_func_tracer(set);
179}
180
181static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
182{
183 struct trace_array *tr = wakeup_trace;
184 struct trace_array_cpu *data;
185 unsigned long flags;
186 int pc, ret = 0;
187
188 if (!func_prolog_preempt_disable(tr, &data, &pc))
189 return 0;
190
191 local_save_flags(flags);
192 ret = __trace_graph_entry(tr, trace, flags, pc);
193 atomic_dec(&data->disabled);
194 preempt_enable_notrace();
195
196 return ret;
197}
198
199static void wakeup_graph_return(struct ftrace_graph_ret *trace)
200{
201 struct trace_array *tr = wakeup_trace;
202 struct trace_array_cpu *data;
203 unsigned long flags;
204 int pc;
205
206 if (!func_prolog_preempt_disable(tr, &data, &pc))
207 return;
208
209 local_save_flags(flags);
210 __trace_graph_return(tr, trace, flags, pc);
211 atomic_dec(&data->disabled);
212
213 preempt_enable_notrace();
214 return;
215}
216
217static void wakeup_trace_open(struct trace_iterator *iter)
218{
219 if (is_graph())
220 graph_trace_open(iter);
221}
222
223static void wakeup_trace_close(struct trace_iterator *iter)
224{
225 if (iter->private)
226 graph_trace_close(iter);
227}
228
229#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC)
230
231static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
232{
233 /*
234 * In graph mode call the graph tracer output function,
235 * otherwise go with the TRACE_FN event handler
236 */
237 if (is_graph())
238 return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
239
240 return TRACE_TYPE_UNHANDLED;
241}
242
243static void wakeup_print_header(struct seq_file *s)
244{
245 if (is_graph())
246 print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
247 else
248 trace_default_header(s);
249}
250
251static void
252__trace_function(struct trace_array *tr,
253 unsigned long ip, unsigned long parent_ip,
254 unsigned long flags, int pc)
255{
256 if (is_graph())
257 trace_graph_function(tr, ip, parent_ip, flags, pc);
258 else
259 trace_function(tr, ip, parent_ip, flags, pc);
260}
261#else
262#define __trace_function trace_function
263
264static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
265{
266 return -EINVAL;
267}
268
269static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
270{
271 return -1;
272}
273
274static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
275{
276 return TRACE_TYPE_UNHANDLED;
277}
278
279static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
280static void wakeup_print_header(struct seq_file *s) { }
281static void wakeup_trace_open(struct trace_iterator *iter) { }
282static void wakeup_trace_close(struct trace_iterator *iter) { }
283#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
284
85/* 285/*
86 * Should this new latency be reported/recorded? 286 * Should this new latency be reported/recorded?
87 */ 287 */
@@ -152,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore,
152 /* The task we are waiting for is waking up */ 352 /* The task we are waiting for is waking up */
153 data = wakeup_trace->data[wakeup_cpu]; 353 data = wakeup_trace->data[wakeup_cpu];
154 354
155 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 355 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
156 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 356 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
157 357
158 T0 = data->preempt_timestamp; 358 T0 = data->preempt_timestamp;
@@ -252,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
252 * is not called by an assembly function (where as schedule is) 452 * is not called by an assembly function (where as schedule is)
253 * it should be safe to use it here. 453 * it should be safe to use it here.
254 */ 454 */
255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 455 __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
256 456
257out_locked: 457out_locked:
258 arch_spin_unlock(&wakeup_lock); 458 arch_spin_unlock(&wakeup_lock);
@@ -303,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
303 */ 503 */
304 smp_wmb(); 504 smp_wmb();
305 505
306 register_ftrace_function(&trace_ops); 506 if (start_func_tracer(is_graph()))
307 507 printk(KERN_ERR "failed to start wakeup tracer\n");
308 if (tracing_is_enabled())
309 tracer_enabled = 1;
310 else
311 tracer_enabled = 0;
312 508
313 return; 509 return;
314fail_deprobe_wake_new: 510fail_deprobe_wake_new:
@@ -320,7 +516,7 @@ fail_deprobe:
320static void stop_wakeup_tracer(struct trace_array *tr) 516static void stop_wakeup_tracer(struct trace_array *tr)
321{ 517{
322 tracer_enabled = 0; 518 tracer_enabled = 0;
323 unregister_ftrace_function(&trace_ops); 519 stop_func_tracer(is_graph());
324 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); 520 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
325 unregister_trace_sched_wakeup_new(probe_wakeup, NULL); 521 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
326 unregister_trace_sched_wakeup(probe_wakeup, NULL); 522 unregister_trace_sched_wakeup(probe_wakeup, NULL);
@@ -379,9 +575,15 @@ static struct tracer wakeup_tracer __read_mostly =
379 .start = wakeup_tracer_start, 575 .start = wakeup_tracer_start,
380 .stop = wakeup_tracer_stop, 576 .stop = wakeup_tracer_stop,
381 .print_max = 1, 577 .print_max = 1,
578 .print_header = wakeup_print_header,
579 .print_line = wakeup_print_line,
580 .flags = &tracer_flags,
581 .set_flag = wakeup_set_flag,
382#ifdef CONFIG_FTRACE_SELFTEST 582#ifdef CONFIG_FTRACE_SELFTEST
383 .selftest = trace_selftest_startup_wakeup, 583 .selftest = trace_selftest_startup_wakeup,
384#endif 584#endif
585 .open = wakeup_trace_open,
586 .close = wakeup_trace_close,
385 .use_max_tr = 1, 587 .use_max_tr = 1,
386}; 588};
387 589
@@ -394,9 +596,15 @@ static struct tracer wakeup_rt_tracer __read_mostly =
394 .stop = wakeup_tracer_stop, 596 .stop = wakeup_tracer_stop,
395 .wait_pipe = poll_wait_pipe, 597 .wait_pipe = poll_wait_pipe,
396 .print_max = 1, 598 .print_max = 1,
599 .print_header = wakeup_print_header,
600 .print_line = wakeup_print_line,
601 .flags = &tracer_flags,
602 .set_flag = wakeup_set_flag,
397#ifdef CONFIG_FTRACE_SELFTEST 603#ifdef CONFIG_FTRACE_SELFTEST
398 .selftest = trace_selftest_startup_wakeup, 604 .selftest = trace_selftest_startup_wakeup,
399#endif 605#endif
606 .open = wakeup_trace_open,
607 .close = wakeup_trace_close,
400 .use_max_tr = 1, 608 .use_max_tr = 1,
401}; 609};
402 610
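
The wakeup tracer now funnels its flat and graph callbacks through one prologue, func_prolog_preempt_disable(): bail out when no task is being traced or the callback runs on the wrong CPU, otherwise disable preemption and take the per-cpu "disabled" reference. A reduced single-CPU sketch of that shape, with plain counters standing in for preempt_disable_notrace() and the per-cpu data; none of the names below are the kernel API.

#include <stdio.h>

static int wakeup_task_set;     /* is there a task being traced? */
static int disabled;            /* recursion/nesting guard */
static int preempt_disabled;    /* stands in for preempt_disable_notrace() */

/*
 * Returns 1 if tracing may proceed: "preemption" is disabled and the
 * guard count is held.  Returns 0 otherwise, with both left untouched.
 */
static int func_prolog(void)
{
        if (!wakeup_task_set)
                return 0;

        preempt_disabled++;
        if (++disabled != 1) {          /* already inside the tracer */
                disabled--;
                preempt_disabled--;
                return 0;
        }
        return 1;
}

static void func_epilog(void)
{
        disabled--;
        preempt_disabled--;
}

static void tracer_call(unsigned long ip)
{
        if (!func_prolog())
                return;
        printf("trace %#lx\n", ip);
        func_epilog();
}

int main(void)
{
        tracer_call(0x1000UL);          /* ignored: no wakeup task yet */
        wakeup_task_set = 1;
        tracer_call(0x1000UL);          /* traced */
        return 0;
}
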
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index a7cc3793baf6..209b379a4721 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void)
263{ 263{
264 int ret, cpu; 264 int ret, cpu;
265 265
266 for_each_possible_cpu(cpu) {
267 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
268 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
269 }
270
266 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); 271 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
267 if (ret) 272 if (ret)
268 goto out; 273 goto out;
@@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void)
279 if (ret) 284 if (ret)
280 goto no_creation; 285 goto no_creation;
281 286
282 for_each_possible_cpu(cpu) {
283 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
284 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
285 }
286
287 return 0; 287 return 0;
288 288
289no_creation: 289no_creation:
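
The trace_workqueue.c hunk only reorders initialization: the per-cpu locks and stat lists are set up before the workqueue tracepoints are registered, so a probe firing during registration can never see an uninitialized lock. A small sketch of the init-then-register ordering; the structures and register_probe() helper are illustrative, not the tracepoint API.

#include <stdio.h>

#define NR_CPUS 4

struct cpu_stat {
        int initialized;        /* lock and list would live here */
};

static struct cpu_stat stats[NR_CPUS];

/* A probe that may fire as soon as it is registered. */
static void probe(int cpu)
{
        if (!stats[cpu].initialized)
                fprintf(stderr, "probe hit uninitialized state on cpu %d\n", cpu);
}

static int register_probe(void (*fn)(int))
{
        fn(0);                  /* may be invoked immediately */
        return 0;
}

int main(void)
{
        int cpu;

        /* Initialize per-cpu state first ... */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                stats[cpu].initialized = 1;

        /* ... only then expose the probe. */
        return register_probe(probe);
}
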
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index c77f3eceea25..e95ee7f31d43 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,6 +25,7 @@
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/jump_label.h>
28 29
29extern struct tracepoint __start___tracepoints[]; 30extern struct tracepoint __start___tracepoints[];
30extern struct tracepoint __stop___tracepoints[]; 31extern struct tracepoint __stop___tracepoints[];
@@ -263,7 +264,13 @@ static void set_tracepoint(struct tracepoint_entry **entry,
263 * is used. 264 * is used.
264 */ 265 */
265 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 266 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
266 elem->state = active; 267 if (!elem->state && active) {
268 jump_label_enable(&elem->state);
269 elem->state = active;
270 } else if (elem->state && !active) {
271 jump_label_disable(&elem->state);
272 elem->state = active;
273 }
267} 274}
268 275
269/* 276/*
@@ -277,7 +284,10 @@ static void disable_tracepoint(struct tracepoint *elem)
277 if (elem->unregfunc && elem->state) 284 if (elem->unregfunc && elem->state)
278 elem->unregfunc(); 285 elem->unregfunc();
279 286
280 elem->state = 0; 287 if (elem->state) {
288 jump_label_disable(&elem->state);
289 elem->state = 0;
290 }
281 rcu_assign_pointer(elem->funcs, NULL); 291 rcu_assign_pointer(elem->funcs, NULL);
282} 292}
283 293
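
tracepoint.c now toggles the jump label only on real state transitions: jump_label_enable() fires on the 0 -> active edge and jump_label_disable() on the active -> 0 edge, so re-running set_tracepoint() with an unchanged state never patches code twice. A sketch of that edge-triggered guard, with counters standing in for the jump-label calls (illustrative only).

#include <stdio.h>

static int state;               /* stands in for elem->state */
static int patch_enables;       /* would be jump_label_enable(&elem->state) */
static int patch_disables;      /* would be jump_label_disable(&elem->state) */

static void set_state(int active)
{
        if (!state && active) {
                patch_enables++;        /* only on the 0 -> 1 edge */
                state = active;
        } else if (state && !active) {
                patch_disables++;       /* only on the 1 -> 0 edge */
                state = active;
        }
        /* same-state calls change nothing */
}

int main(void)
{
        set_state(1);
        set_state(1);   /* no second enable */
        set_state(0);
        set_state(0);   /* no second disable */
        printf("enables=%d disables=%d\n", patch_enables, patch_disables); /* 1 1 */
        return 0;
}
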
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 0d53c8e853b1..bafba687a6d8 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -43,7 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 43static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
44#endif 44#endif
45 45
46static int __read_mostly did_panic;
47static int __initdata no_watchdog; 46static int __initdata no_watchdog;
48 47
49 48
@@ -122,7 +121,7 @@ static void __touch_watchdog(void)
122 121
123void touch_softlockup_watchdog(void) 122void touch_softlockup_watchdog(void)
124{ 123{
125 __get_cpu_var(watchdog_touch_ts) = 0; 124 __raw_get_cpu_var(watchdog_touch_ts) = 0;
126} 125}
127EXPORT_SYMBOL(touch_softlockup_watchdog); 126EXPORT_SYMBOL(touch_softlockup_watchdog);
128 127
@@ -142,7 +141,14 @@ void touch_all_softlockup_watchdogs(void)
142#ifdef CONFIG_HARDLOCKUP_DETECTOR 141#ifdef CONFIG_HARDLOCKUP_DETECTOR
143void touch_nmi_watchdog(void) 142void touch_nmi_watchdog(void)
144{ 143{
145 __get_cpu_var(watchdog_nmi_touch) = true; 144 if (watchdog_enabled) {
145 unsigned cpu;
146
147 for_each_present_cpu(cpu) {
148 if (per_cpu(watchdog_nmi_touch, cpu) != true)
149 per_cpu(watchdog_nmi_touch, cpu) = true;
150 }
151 }
146 touch_softlockup_watchdog(); 152 touch_softlockup_watchdog();
147} 153}
148EXPORT_SYMBOL(touch_nmi_watchdog); 154EXPORT_SYMBOL(touch_nmi_watchdog);
@@ -180,18 +186,6 @@ static int is_softlockup(unsigned long touch_ts)
180 return 0; 186 return 0;
181} 187}
182 188
183static int
184watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr)
185{
186 did_panic = 1;
187
188 return NOTIFY_DONE;
189}
190
191static struct notifier_block panic_block = {
192 .notifier_call = watchdog_panic,
193};
194
195#ifdef CONFIG_HARDLOCKUP_DETECTOR 189#ifdef CONFIG_HARDLOCKUP_DETECTOR
196static struct perf_event_attr wd_hw_attr = { 190static struct perf_event_attr wd_hw_attr = {
197 .type = PERF_TYPE_HARDWARE, 191 .type = PERF_TYPE_HARDWARE,
@@ -202,7 +196,7 @@ static struct perf_event_attr wd_hw_attr = {
202}; 196};
203 197
204/* Callback function for perf event subsystem */ 198/* Callback function for perf event subsystem */
205void watchdog_overflow_callback(struct perf_event *event, int nmi, 199static void watchdog_overflow_callback(struct perf_event *event, int nmi,
206 struct perf_sample_data *data, 200 struct perf_sample_data *data,
207 struct pt_regs *regs) 201 struct pt_regs *regs)
208{ 202{
@@ -364,14 +358,14 @@ static int watchdog_nmi_enable(int cpu)
364 /* Try to register using hardware perf events */ 358 /* Try to register using hardware perf events */
365 wd_attr = &wd_hw_attr; 359 wd_attr = &wd_hw_attr;
366 wd_attr->sample_period = hw_nmi_get_sample_period(); 360 wd_attr->sample_period = hw_nmi_get_sample_period();
367 event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); 361 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
368 if (!IS_ERR(event)) { 362 if (!IS_ERR(event)) {
369 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 363 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
370 goto out_save; 364 goto out_save;
371 } 365 }
372 366
373 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); 367 printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
374 return -1; 368 return PTR_ERR(event);
375 369
376 /* success path */ 370 /* success path */
377out_save: 371out_save:
@@ -415,17 +409,19 @@ static int watchdog_prepare_cpu(int cpu)
415static int watchdog_enable(int cpu) 409static int watchdog_enable(int cpu)
416{ 410{
417 struct task_struct *p = per_cpu(softlockup_watchdog, cpu); 411 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
412 int err;
418 413
419 /* enable the perf event */ 414 /* enable the perf event */
420 if (watchdog_nmi_enable(cpu) != 0) 415 err = watchdog_nmi_enable(cpu);
421 return -1; 416 if (err)
417 return err;
422 418
423 /* create the watchdog thread */ 419 /* create the watchdog thread */
424 if (!p) { 420 if (!p) {
425 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 421 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
426 if (IS_ERR(p)) { 422 if (IS_ERR(p)) {
427 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 423 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
428 return -1; 424 return PTR_ERR(p);
429 } 425 }
430 kthread_bind(p, cpu); 426 kthread_bind(p, cpu);
431 per_cpu(watchdog_touch_ts, cpu) = 0; 427 per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -433,6 +429,9 @@ static int watchdog_enable(int cpu)
433 wake_up_process(p); 429 wake_up_process(p);
434 } 430 }
435 431
432 /* if any cpu succeeds, watchdog is considered enabled for the system */
433 watchdog_enabled = 1;
434
436 return 0; 435 return 0;
437} 436}
438 437
@@ -455,9 +454,6 @@ static void watchdog_disable(int cpu)
455 per_cpu(softlockup_watchdog, cpu) = NULL; 454 per_cpu(softlockup_watchdog, cpu) = NULL;
456 kthread_stop(p); 455 kthread_stop(p);
457 } 456 }
458
459 /* if any cpu succeeds, watchdog is considered enabled for the system */
460 watchdog_enabled = 1;
461} 457}
462 458
463static void watchdog_enable_all_cpus(void) 459static void watchdog_enable_all_cpus(void)
@@ -477,6 +473,9 @@ static void watchdog_disable_all_cpus(void)
477{ 473{
478 int cpu; 474 int cpu;
479 475
476 if (no_watchdog)
477 return;
478
480 for_each_online_cpu(cpu) 479 for_each_online_cpu(cpu)
481 watchdog_disable(cpu); 480 watchdog_disable(cpu);
482 481
@@ -519,17 +518,16 @@ static int __cpuinit
519cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 518cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
520{ 519{
521 int hotcpu = (unsigned long)hcpu; 520 int hotcpu = (unsigned long)hcpu;
521 int err = 0;
522 522
523 switch (action) { 523 switch (action) {
524 case CPU_UP_PREPARE: 524 case CPU_UP_PREPARE:
525 case CPU_UP_PREPARE_FROZEN: 525 case CPU_UP_PREPARE_FROZEN:
526 if (watchdog_prepare_cpu(hotcpu)) 526 err = watchdog_prepare_cpu(hotcpu);
527 return NOTIFY_BAD;
528 break; 527 break;
529 case CPU_ONLINE: 528 case CPU_ONLINE:
530 case CPU_ONLINE_FROZEN: 529 case CPU_ONLINE_FROZEN:
531 if (watchdog_enable(hotcpu)) 530 err = watchdog_enable(hotcpu);
532 return NOTIFY_BAD;
533 break; 531 break;
534#ifdef CONFIG_HOTPLUG_CPU 532#ifdef CONFIG_HOTPLUG_CPU
535 case CPU_UP_CANCELED: 533 case CPU_UP_CANCELED:
@@ -542,7 +540,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
542 break; 540 break;
543#endif /* CONFIG_HOTPLUG_CPU */ 541#endif /* CONFIG_HOTPLUG_CPU */
544 } 542 }
545 return NOTIFY_OK; 543 return notifier_from_errno(err);
546} 544}
547 545
548static struct notifier_block __cpuinitdata cpu_nfb = { 546static struct notifier_block __cpuinitdata cpu_nfb = {
@@ -558,13 +556,11 @@ static int __init spawn_watchdog_task(void)
558 return 0; 556 return 0;
559 557
560 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 558 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
561 WARN_ON(err == NOTIFY_BAD); 559 WARN_ON(notifier_to_errno(err));
562 560
563 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 561 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
564 register_cpu_notifier(&cpu_nfb); 562 register_cpu_notifier(&cpu_nfb);
565 563
566 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
567
568 return 0; 564 return 0;
569} 565}
570early_initcall(spawn_watchdog_task); 566early_initcall(spawn_watchdog_task);
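
The watchdog.c hotplug path stops flattening failures into -1/NOTIFY_BAD and instead carries the real errno through the notifier, encoded with notifier_from_errno() and decoded with notifier_to_errno(). A self-contained sketch of that round trip; the constants and helpers are re-declared locally for illustration and are assumed to mirror include/linux/notifier.h rather than quoted from it.

#include <errno.h>
#include <stdio.h>

/* Local stand-ins, assumed to follow the kernel's notifier return scheme. */
#define NOTIFY_DONE      0x0000
#define NOTIFY_OK        0x0001
#define NOTIFY_STOP_MASK 0x8000

static int notifier_from_errno(int err)
{
        if (err)
                return NOTIFY_STOP_MASK | (NOTIFY_OK - err);
        return NOTIFY_OK;
}

static int notifier_to_errno(int ret)
{
        ret &= ~NOTIFY_STOP_MASK;
        return ret > NOTIFY_OK ? NOTIFY_OK - ret : 0;
}

/* A callback that fails the way watchdog_enable() now can. */
static int cpu_callback(int err_from_enable)
{
        return notifier_from_errno(err_from_enable);
}

int main(void)
{
        int ret = cpu_callback(-ENOMEM);
        printf("errno back out: %d\n", notifier_to_errno(ret));  /* -12 */
        ret = cpu_callback(0);
        printf("errno back out: %d\n", notifier_to_errno(ret));  /* 0 */
        return 0;
}
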
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 727f24e563ae..f77afd939229 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1,19 +1,26 @@
1/* 1/*
2 * linux/kernel/workqueue.c 2 * kernel/workqueue.c - generic async execution with shared worker pool
3 * 3 *
4 * Generic mechanism for defining kernel helper threads for running 4 * Copyright (C) 2002 Ingo Molnar
5 * arbitrary tasks in process context.
6 * 5 *
7 * Started by Ingo Molnar, Copyright (C) 2002 6 * Derived from the taskqueue/keventd code by:
7 * David Woodhouse <dwmw2@infradead.org>
8 * Andrew Morton
9 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
10 * Theodore Ts'o <tytso@mit.edu>
8 * 11 *
9 * Derived from the taskqueue/keventd code by: 12 * Made to use alloc_percpu by Christoph Lameter.
10 * 13 *
11 * David Woodhouse <dwmw2@infradead.org> 14 * Copyright (C) 2010 SUSE Linux Products GmbH
12 * Andrew Morton 15 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
13 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
14 * Theodore Ts'o <tytso@mit.edu>
15 * 16 *
 16 * Made to use alloc_percpu by Christoph Lameter. 17 * This is the generic async execution mechanism. Work items are
18 * executed in process context. The worker pool is shared and
19 * automatically managed. There is one worker pool for each CPU and
20 * one extra for works which are better served by workers which are
21 * not bound to any specific CPU.
22 *
23 * Please read Documentation/workqueue.txt for details.
17 */ 24 */
18 25
19#include <linux/module.h> 26#include <linux/module.h>