Diffstat (limited to 'kernel')
66 files changed, 4624 insertions, 1829 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 0b72d1a74be0..e2c9d52cfe9e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
| @@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ | |||
| 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
| 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
| 12 | notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ | 12 | notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ |
| 13 | async.o range.o | 13 | async.o range.o jump_label.o |
| 14 | obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o | 14 | obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o |
| 15 | obj-y += groups.o | 15 | obj-y += groups.o |
| 16 | 16 | ||
| @@ -23,6 +23,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg | |||
| 23 | CFLAGS_REMOVE_cgroup-debug.o = -pg | 23 | CFLAGS_REMOVE_cgroup-debug.o = -pg |
| 24 | CFLAGS_REMOVE_sched_clock.o = -pg | 24 | CFLAGS_REMOVE_sched_clock.o = -pg |
| 25 | CFLAGS_REMOVE_perf_event.o = -pg | 25 | CFLAGS_REMOVE_perf_event.o = -pg |
| 26 | CFLAGS_REMOVE_irq_work.o = -pg | ||
| 26 | endif | 27 | endif |
| 27 | 28 | ||
| 28 | obj-$(CONFIG_FREEZER) += freezer.o | 29 | obj-$(CONFIG_FREEZER) += freezer.o |
| @@ -86,6 +87,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o | |||
| 86 | obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o | 87 | obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o |
| 87 | obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o | 88 | obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o |
| 88 | obj-$(CONFIG_TINY_RCU) += rcutiny.o | 89 | obj-$(CONFIG_TINY_RCU) += rcutiny.o |
| 90 | obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o | ||
| 89 | obj-$(CONFIG_RELAY) += relay.o | 91 | obj-$(CONFIG_RELAY) += relay.o |
| 90 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o | 92 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o |
| 91 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | 93 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o |
| @@ -100,6 +102,7 @@ obj-$(CONFIG_TRACING) += trace/ | |||
| 100 | obj-$(CONFIG_X86_DS) += trace/ | 102 | obj-$(CONFIG_X86_DS) += trace/ |
| 101 | obj-$(CONFIG_RING_BUFFER) += trace/ | 103 | obj-$(CONFIG_RING_BUFFER) += trace/ |
| 102 | obj-$(CONFIG_SMP) += sched_cpupri.o | 104 | obj-$(CONFIG_SMP) += sched_cpupri.o |
| 105 | obj-$(CONFIG_IRQ_WORK) += irq_work.o | ||
| 103 | obj-$(CONFIG_PERF_EVENTS) += perf_event.o | 106 | obj-$(CONFIG_PERF_EVENTS) += perf_event.o |
| 104 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | 107 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o |
| 105 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o | 108 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 192f88c5b0f9..291ba3d04bea 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -138,7 +138,7 @@ struct css_id { | |||
| 138 | * is called after synchronize_rcu(). But for safe use, css_is_removed() | 138 | * is called after synchronize_rcu(). But for safe use, css_is_removed() |
| 139 | * css_tryget() should be used for avoiding race. | 139 | * css_tryget() should be used for avoiding race. |
| 140 | */ | 140 | */ |
| 141 | struct cgroup_subsys_state *css; | 141 | struct cgroup_subsys_state __rcu *css; |
| 142 | /* | 142 | /* |
| 143 | * ID of this css. | 143 | * ID of this css. |
| 144 | */ | 144 | */ |
| @@ -1791,19 +1791,20 @@ out: | |||
| 1791 | } | 1791 | } |
| 1792 | 1792 | ||
| 1793 | /** | 1793 | /** |
| 1794 | * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup | 1794 | * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' |
| 1795 | * @from: attach to all cgroups of a given task | ||
| 1795 | * @tsk: the task to be attached | 1796 | * @tsk: the task to be attached |
| 1796 | */ | 1797 | */ |
| 1797 | int cgroup_attach_task_current_cg(struct task_struct *tsk) | 1798 | int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) |
| 1798 | { | 1799 | { |
| 1799 | struct cgroupfs_root *root; | 1800 | struct cgroupfs_root *root; |
| 1800 | struct cgroup *cur_cg; | ||
| 1801 | int retval = 0; | 1801 | int retval = 0; |
| 1802 | 1802 | ||
| 1803 | cgroup_lock(); | 1803 | cgroup_lock(); |
| 1804 | for_each_active_root(root) { | 1804 | for_each_active_root(root) { |
| 1805 | cur_cg = task_cgroup_from_root(current, root); | 1805 | struct cgroup *from_cg = task_cgroup_from_root(from, root); |
| 1806 | retval = cgroup_attach_task(cur_cg, tsk); | 1806 | |
| 1807 | retval = cgroup_attach_task(from_cg, tsk); | ||
| 1807 | if (retval) | 1808 | if (retval) |
| 1808 | break; | 1809 | break; |
| 1809 | } | 1810 | } |
| @@ -1811,7 +1812,7 @@ int cgroup_attach_task_current_cg(struct task_struct *tsk) | |||
| 1811 | 1812 | ||
| 1812 | return retval; | 1813 | return retval; |
| 1813 | } | 1814 | } |
| 1814 | EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg); | 1815 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); |
| 1815 | 1816 | ||
| 1816 | /* | 1817 | /* |
| 1817 | * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex | 1818 | * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex |
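For orientation, a minimal caller-side sketch of the rename above: the old single-argument cgroup_attach_task_current_cg(tsk) is equivalent to the new cgroup_attach_task_all(current, tsk), which generalizes the source of the cgroup memberships to an arbitrary task. The wrapper name below is invented for illustration.

        /* Hypothetical caller: attach a worker thread to all of current's cgroups. */
        static int attach_worker_to_my_cgroups(struct task_struct *worker)
        {
                /* before this change: return cgroup_attach_task_current_cg(worker); */
                return cgroup_attach_task_all(current, worker);
        }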
diff --git a/kernel/compat.c b/kernel/compat.c index e167efce8423..c9e2ec0b34a8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
| @@ -1126,3 +1126,24 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) | |||
| 1126 | 1126 | ||
| 1127 | return 0; | 1127 | return 0; |
| 1128 | } | 1128 | } |
| 1129 | |||
| 1130 | /* | ||
| 1131 | * Allocate user-space memory for the duration of a single system call, | ||
| 1132 | * in order to marshall parameters inside a compat thunk. | ||
| 1133 | */ | ||
| 1134 | void __user *compat_alloc_user_space(unsigned long len) | ||
| 1135 | { | ||
| 1136 | void __user *ptr; | ||
| 1137 | |||
| 1138 | /* If len would occupy more than half of the entire compat space... */ | ||
| 1139 | if (unlikely(len > (((compat_uptr_t)~0) >> 1))) | ||
| 1140 | return NULL; | ||
| 1141 | |||
| 1142 | ptr = arch_compat_alloc_user_space(len); | ||
| 1143 | |||
| 1144 | if (unlikely(!access_ok(VERIFY_WRITE, ptr, len))) | ||
| 1145 | return NULL; | ||
| 1146 | |||
| 1147 | return ptr; | ||
| 1148 | } | ||
| 1149 | EXPORT_SYMBOL_GPL(compat_alloc_user_space); | ||
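The new wrapper puts a length sanity check and an access_ok() check in front of arch_compat_alloc_user_space(). A hedged sketch of the intended usage pattern follows; sys_foo(), struct foo and struct compat_foo are invented for illustration and are not part of this patch.

        /* 32-bit thunk: widen the argument on the user stack, then call the native path. */
        asmlinkage long compat_sys_foo(struct compat_foo __user *ufoo32)
        {
                struct compat_foo f32;
                struct foo __user *f64;

                if (copy_from_user(&f32, ufoo32, sizeof(f32)))
                        return -EFAULT;

                /* Checked allocation instead of a raw arch_compat_alloc_user_space() call. */
                f64 = compat_alloc_user_space(sizeof(*f64));
                if (f64 == NULL || put_user(f32.value, &f64->value))
                        return -EFAULT;

                return sys_foo(f64);
        }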
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b23c0979bbe7..51b143e2a07a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, | |||
| 1397 | if (tsk->flags & PF_THREAD_BOUND) | 1397 | if (tsk->flags & PF_THREAD_BOUND) |
| 1398 | return -EINVAL; | 1398 | return -EINVAL; |
| 1399 | 1399 | ||
| 1400 | ret = security_task_setscheduler(tsk, 0, NULL); | 1400 | ret = security_task_setscheduler(tsk); |
| 1401 | if (ret) | 1401 | if (ret) |
| 1402 | return ret; | 1402 | return ret; |
| 1403 | if (threadgroup) { | 1403 | if (threadgroup) { |
| @@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, | |||
| 1405 | 1405 | ||
| 1406 | rcu_read_lock(); | 1406 | rcu_read_lock(); |
| 1407 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | 1407 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { |
| 1408 | ret = security_task_setscheduler(c, 0, NULL); | 1408 | ret = security_task_setscheduler(c); |
| 1409 | if (ret) { | 1409 | if (ret) { |
| 1410 | rcu_read_unlock(); | 1410 | rcu_read_unlock(); |
| 1411 | return ret; | 1411 | return ret; |
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 75bd9b3ebbb7..20059ef4459a 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c | |||
| @@ -274,7 +274,6 @@ static int kdb_bp(int argc, const char **argv) | |||
| 274 | int i, bpno; | 274 | int i, bpno; |
| 275 | kdb_bp_t *bp, *bp_check; | 275 | kdb_bp_t *bp, *bp_check; |
| 276 | int diag; | 276 | int diag; |
| 277 | int free; | ||
| 278 | char *symname = NULL; | 277 | char *symname = NULL; |
| 279 | long offset = 0ul; | 278 | long offset = 0ul; |
| 280 | int nextarg; | 279 | int nextarg; |
| @@ -305,7 +304,6 @@ static int kdb_bp(int argc, const char **argv) | |||
| 305 | /* | 304 | /* |
| 306 | * Find an empty bp structure to allocate | 305 | * Find an empty bp structure to allocate |
| 307 | */ | 306 | */ |
| 308 | free = KDB_MAXBPT; | ||
| 309 | for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { | 307 | for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { |
| 310 | if (bp->bp_free) | 308 | if (bp->bp_free) |
| 311 | break; | 309 | break; |
diff --git a/kernel/exit.c b/kernel/exit.c index 03120229db28..e2bdf37f9fde 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -149,9 +149,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) | |||
| 149 | { | 149 | { |
| 150 | struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); | 150 | struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); |
| 151 | 151 | ||
| 152 | #ifdef CONFIG_PERF_EVENTS | 152 | perf_event_delayed_put(tsk); |
| 153 | WARN_ON_ONCE(tsk->perf_event_ctxp); | ||
| 154 | #endif | ||
| 155 | trace_sched_process_free(tsk); | 153 | trace_sched_process_free(tsk); |
| 156 | put_task_struct(tsk); | 154 | put_task_struct(tsk); |
| 157 | } | 155 | } |
diff --git a/kernel/fork.c b/kernel/fork.c index b7e9d60a675d..c445f8cc408d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -356,10 +356,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 356 | if (IS_ERR(pol)) | 356 | if (IS_ERR(pol)) |
| 357 | goto fail_nomem_policy; | 357 | goto fail_nomem_policy; |
| 358 | vma_set_policy(tmp, pol); | 358 | vma_set_policy(tmp, pol); |
| 359 | tmp->vm_mm = mm; | ||
| 359 | if (anon_vma_fork(tmp, mpnt)) | 360 | if (anon_vma_fork(tmp, mpnt)) |
| 360 | goto fail_nomem_anon_vma_fork; | 361 | goto fail_nomem_anon_vma_fork; |
| 361 | tmp->vm_flags &= ~VM_LOCKED; | 362 | tmp->vm_flags &= ~VM_LOCKED; |
| 362 | tmp->vm_mm = mm; | ||
| 363 | tmp->vm_next = tmp->vm_prev = NULL; | 363 | tmp->vm_next = tmp->vm_prev = NULL; |
| 364 | file = tmp->vm_file; | 364 | file = tmp->vm_file; |
| 365 | if (file) { | 365 | if (file) { |
diff --git a/kernel/futex.c b/kernel/futex.c index 6a3a5fa1526d..a118bf160e0b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -91,6 +91,7 @@ struct futex_pi_state { | |||
| 91 | 91 | ||
| 92 | /** | 92 | /** |
| 93 | * struct futex_q - The hashed futex queue entry, one per waiting task | 93 | * struct futex_q - The hashed futex queue entry, one per waiting task |
| 94 | * @list: priority-sorted list of tasks waiting on this futex | ||
| 94 | * @task: the task waiting on the futex | 95 | * @task: the task waiting on the futex |
| 95 | * @lock_ptr: the hash bucket lock | 96 | * @lock_ptr: the hash bucket lock |
| 96 | * @key: the key the futex is hashed on | 97 | * @key: the key the futex is hashed on |
| @@ -104,7 +105,7 @@ struct futex_pi_state { | |||
| 104 | * | 105 | * |
| 105 | * A futex_q has a woken state, just like tasks have TASK_RUNNING. | 106 | * A futex_q has a woken state, just like tasks have TASK_RUNNING. |
| 106 | * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. | 107 | * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. |
| 107 | * The order of wakup is always to make the first condition true, then | 108 | * The order of wakeup is always to make the first condition true, then |
| 108 | * the second. | 109 | * the second. |
| 109 | * | 110 | * |
| 110 | * PI futexes are typically woken before they are removed from the hash list via | 111 | * PI futexes are typically woken before they are removed from the hash list via |
| @@ -295,7 +296,7 @@ void put_futex_key(int fshared, union futex_key *key) | |||
| 295 | * Slow path to fixup the fault we just took in the atomic write | 296 | * Slow path to fixup the fault we just took in the atomic write |
| 296 | * access to @uaddr. | 297 | * access to @uaddr. |
| 297 | * | 298 | * |
| 298 | * We have no generic implementation of a non destructive write to the | 299 | * We have no generic implementation of a non-destructive write to the |
| 299 | * user address. We know that we faulted in the atomic pagefault | 300 | * user address. We know that we faulted in the atomic pagefault |
| 300 | * disabled section so we can as well avoid the #PF overhead by | 301 | * disabled section so we can as well avoid the #PF overhead by |
| 301 | * calling get_user_pages() right away. | 302 | * calling get_user_pages() right away. |
| @@ -515,7 +516,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
| 515 | */ | 516 | */ |
| 516 | pi_state = this->pi_state; | 517 | pi_state = this->pi_state; |
| 517 | /* | 518 | /* |
| 518 | * Userspace might have messed up non PI and PI futexes | 519 | * Userspace might have messed up non-PI and PI futexes |
| 519 | */ | 520 | */ |
| 520 | if (unlikely(!pi_state)) | 521 | if (unlikely(!pi_state)) |
| 521 | return -EINVAL; | 522 | return -EINVAL; |
| @@ -736,8 +737,8 @@ static void wake_futex(struct futex_q *q) | |||
| 736 | 737 | ||
| 737 | /* | 738 | /* |
| 738 | * We set q->lock_ptr = NULL _before_ we wake up the task. If | 739 | * We set q->lock_ptr = NULL _before_ we wake up the task. If |
| 739 | * a non futex wake up happens on another CPU then the task | 740 | * a non-futex wake up happens on another CPU then the task |
| 740 | * might exit and p would dereference a non existing task | 741 | * might exit and p would dereference a non-existing task |
| 741 | * struct. Prevent this by holding a reference on p across the | 742 | * struct. Prevent this by holding a reference on p across the |
| 742 | * wake up. | 743 | * wake up. |
| 743 | */ | 744 | */ |
| @@ -1131,11 +1132,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
| 1131 | 1132 | ||
| 1132 | /** | 1133 | /** |
| 1133 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 | 1134 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 |
| 1134 | * uaddr1: source futex user address | 1135 | * @uaddr1: source futex user address |
| 1135 | * uaddr2: target futex user address | 1136 | * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED |
| 1136 | * nr_wake: number of waiters to wake (must be 1 for requeue_pi) | 1137 | * @uaddr2: target futex user address |
| 1137 | * nr_requeue: number of waiters to requeue (0-INT_MAX) | 1138 | * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) |
| 1138 | * requeue_pi: if we are attempting to requeue from a non-pi futex to a | 1139 | * @nr_requeue: number of waiters to requeue (0-INT_MAX) |
| 1140 | * @cmpval: @uaddr1 expected value (or %NULL) | ||
| 1141 | * @requeue_pi: if we are attempting to requeue from a non-pi futex to a | ||
| 1139 | * pi futex (pi to pi requeue is not supported) | 1142 | * pi futex (pi to pi requeue is not supported) |
| 1140 | * | 1143 | * |
| 1141 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire | 1144 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire |
| @@ -1360,10 +1363,10 @@ out: | |||
| 1360 | 1363 | ||
| 1361 | /* The key must be already stored in q->key. */ | 1364 | /* The key must be already stored in q->key. */ |
| 1362 | static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) | 1365 | static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) |
| 1366 | __acquires(&hb->lock) | ||
| 1363 | { | 1367 | { |
| 1364 | struct futex_hash_bucket *hb; | 1368 | struct futex_hash_bucket *hb; |
| 1365 | 1369 | ||
| 1366 | get_futex_key_refs(&q->key); | ||
| 1367 | hb = hash_futex(&q->key); | 1370 | hb = hash_futex(&q->key); |
| 1368 | q->lock_ptr = &hb->lock; | 1371 | q->lock_ptr = &hb->lock; |
| 1369 | 1372 | ||
| @@ -1373,9 +1376,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) | |||
| 1373 | 1376 | ||
| 1374 | static inline void | 1377 | static inline void |
| 1375 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) | 1378 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) |
| 1379 | __releases(&hb->lock) | ||
| 1376 | { | 1380 | { |
| 1377 | spin_unlock(&hb->lock); | 1381 | spin_unlock(&hb->lock); |
| 1378 | drop_futex_key_refs(&q->key); | ||
| 1379 | } | 1382 | } |
| 1380 | 1383 | ||
| 1381 | /** | 1384 | /** |
| @@ -1391,6 +1394,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) | |||
| 1391 | * an example). | 1394 | * an example). |
| 1392 | */ | 1395 | */ |
| 1393 | static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) | 1396 | static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) |
| 1397 | __releases(&hb->lock) | ||
| 1394 | { | 1398 | { |
| 1395 | int prio; | 1399 | int prio; |
| 1396 | 1400 | ||
| @@ -1471,6 +1475,7 @@ retry: | |||
| 1471 | * and dropped here. | 1475 | * and dropped here. |
| 1472 | */ | 1476 | */ |
| 1473 | static void unqueue_me_pi(struct futex_q *q) | 1477 | static void unqueue_me_pi(struct futex_q *q) |
| 1478 | __releases(q->lock_ptr) | ||
| 1474 | { | 1479 | { |
| 1475 | WARN_ON(plist_node_empty(&q->list)); | 1480 | WARN_ON(plist_node_empty(&q->list)); |
| 1476 | plist_del(&q->list, &q->list.plist); | 1481 | plist_del(&q->list, &q->list.plist); |
| @@ -1480,8 +1485,6 @@ static void unqueue_me_pi(struct futex_q *q) | |||
| 1480 | q->pi_state = NULL; | 1485 | q->pi_state = NULL; |
| 1481 | 1486 | ||
| 1482 | spin_unlock(q->lock_ptr); | 1487 | spin_unlock(q->lock_ptr); |
| 1483 | |||
| 1484 | drop_futex_key_refs(&q->key); | ||
| 1485 | } | 1488 | } |
| 1486 | 1489 | ||
| 1487 | /* | 1490 | /* |
| @@ -1812,7 +1815,10 @@ static int futex_wait(u32 __user *uaddr, int fshared, | |||
| 1812 | } | 1815 | } |
| 1813 | 1816 | ||
| 1814 | retry: | 1817 | retry: |
| 1815 | /* Prepare to wait on uaddr. */ | 1818 | /* |
| 1819 | * Prepare to wait on uaddr. On success, holds hb lock and increments | ||
| 1820 | * q.key refs. | ||
| 1821 | */ | ||
| 1816 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | 1822 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); |
| 1817 | if (ret) | 1823 | if (ret) |
| 1818 | goto out; | 1824 | goto out; |
| @@ -1822,28 +1828,27 @@ retry: | |||
| 1822 | 1828 | ||
| 1823 | /* If we were woken (and unqueued), we succeeded, whatever. */ | 1829 | /* If we were woken (and unqueued), we succeeded, whatever. */ |
| 1824 | ret = 0; | 1830 | ret = 0; |
| 1831 | /* unqueue_me() drops q.key ref */ | ||
| 1825 | if (!unqueue_me(&q)) | 1832 | if (!unqueue_me(&q)) |
| 1826 | goto out_put_key; | 1833 | goto out; |
| 1827 | ret = -ETIMEDOUT; | 1834 | ret = -ETIMEDOUT; |
| 1828 | if (to && !to->task) | 1835 | if (to && !to->task) |
| 1829 | goto out_put_key; | 1836 | goto out; |
| 1830 | 1837 | ||
| 1831 | /* | 1838 | /* |
| 1832 | * We expect signal_pending(current), but we might be the | 1839 | * We expect signal_pending(current), but we might be the |
| 1833 | * victim of a spurious wakeup as well. | 1840 | * victim of a spurious wakeup as well. |
| 1834 | */ | 1841 | */ |
| 1835 | if (!signal_pending(current)) { | 1842 | if (!signal_pending(current)) |
| 1836 | put_futex_key(fshared, &q.key); | ||
| 1837 | goto retry; | 1843 | goto retry; |
| 1838 | } | ||
| 1839 | 1844 | ||
| 1840 | ret = -ERESTARTSYS; | 1845 | ret = -ERESTARTSYS; |
| 1841 | if (!abs_time) | 1846 | if (!abs_time) |
| 1842 | goto out_put_key; | 1847 | goto out; |
| 1843 | 1848 | ||
| 1844 | restart = ¤t_thread_info()->restart_block; | 1849 | restart = ¤t_thread_info()->restart_block; |
| 1845 | restart->fn = futex_wait_restart; | 1850 | restart->fn = futex_wait_restart; |
| 1846 | restart->futex.uaddr = (u32 *)uaddr; | 1851 | restart->futex.uaddr = uaddr; |
| 1847 | restart->futex.val = val; | 1852 | restart->futex.val = val; |
| 1848 | restart->futex.time = abs_time->tv64; | 1853 | restart->futex.time = abs_time->tv64; |
| 1849 | restart->futex.bitset = bitset; | 1854 | restart->futex.bitset = bitset; |
| @@ -1856,8 +1861,6 @@ retry: | |||
| 1856 | 1861 | ||
| 1857 | ret = -ERESTART_RESTARTBLOCK; | 1862 | ret = -ERESTART_RESTARTBLOCK; |
| 1858 | 1863 | ||
| 1859 | out_put_key: | ||
| 1860 | put_futex_key(fshared, &q.key); | ||
| 1861 | out: | 1864 | out: |
| 1862 | if (to) { | 1865 | if (to) { |
| 1863 | hrtimer_cancel(&to->timer); | 1866 | hrtimer_cancel(&to->timer); |
| @@ -1869,7 +1872,7 @@ out: | |||
| 1869 | 1872 | ||
| 1870 | static long futex_wait_restart(struct restart_block *restart) | 1873 | static long futex_wait_restart(struct restart_block *restart) |
| 1871 | { | 1874 | { |
| 1872 | u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; | 1875 | u32 __user *uaddr = restart->futex.uaddr; |
| 1873 | int fshared = 0; | 1876 | int fshared = 0; |
| 1874 | ktime_t t, *tp = NULL; | 1877 | ktime_t t, *tp = NULL; |
| 1875 | 1878 | ||
| @@ -2236,7 +2239,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
| 2236 | q.rt_waiter = &rt_waiter; | 2239 | q.rt_waiter = &rt_waiter; |
| 2237 | q.requeue_pi_key = &key2; | 2240 | q.requeue_pi_key = &key2; |
| 2238 | 2241 | ||
| 2239 | /* Prepare to wait on uaddr. */ | 2242 | /* |
| 2243 | * Prepare to wait on uaddr. On success, increments q.key (key1) ref | ||
| 2244 | * count. | ||
| 2245 | */ | ||
| 2240 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); | 2246 | ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); |
| 2241 | if (ret) | 2247 | if (ret) |
| 2242 | goto out_key2; | 2248 | goto out_key2; |
| @@ -2254,7 +2260,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, | |||
| 2254 | * In order for us to be here, we know our q.key == key2, and since | 2260 | * In order for us to be here, we know our q.key == key2, and since |
| 2255 | * we took the hb->lock above, we also know that futex_requeue() has | 2261 | * we took the hb->lock above, we also know that futex_requeue() has |
| 2256 | * completed and we no longer have to concern ourselves with a wakeup | 2262 | * completed and we no longer have to concern ourselves with a wakeup |
| 2257 | * race with the atomic proxy lock acquition by the requeue code. | 2263 | * race with the atomic proxy lock acquisition by the requeue code. The |
| 2264 | * futex_requeue dropped our key1 reference and incremented our key2 | ||
| 2265 | * reference count. | ||
| 2258 | */ | 2266 | */ |
| 2259 | 2267 | ||
| 2260 | /* Check if the requeue code acquired the second futex for us. */ | 2268 | /* Check if the requeue code acquired the second futex for us. */ |
| @@ -2458,7 +2466,7 @@ retry: | |||
| 2458 | */ | 2466 | */ |
| 2459 | static inline int fetch_robust_entry(struct robust_list __user **entry, | 2467 | static inline int fetch_robust_entry(struct robust_list __user **entry, |
| 2460 | struct robust_list __user * __user *head, | 2468 | struct robust_list __user * __user *head, |
| 2461 | int *pi) | 2469 | unsigned int *pi) |
| 2462 | { | 2470 | { |
| 2463 | unsigned long uentry; | 2471 | unsigned long uentry; |
| 2464 | 2472 | ||
| @@ -2647,7 +2655,7 @@ static int __init futex_init(void) | |||
| 2647 | * of the complex code paths. Also we want to prevent | 2655 | * of the complex code paths. Also we want to prevent |
| 2648 | * registration of robust lists in that case. NULL is | 2656 | * registration of robust lists in that case. NULL is |
| 2649 | * guaranteed to fault and we get -EFAULT on functional | 2657 | * guaranteed to fault and we get -EFAULT on functional |
| 2650 | * implementation, the non functional ones will return | 2658 | * implementation, the non-functional ones will return |
| 2651 | * -ENOSYS. | 2659 | * -ENOSYS. |
| 2652 | */ | 2660 | */ |
| 2653 | curval = cmpxchg_futex_value_locked(NULL, 0, 0); | 2661 | curval = cmpxchg_futex_value_locked(NULL, 0, 0); |
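The __acquires()/__releases() markers added to queue_lock(), queue_unlock(), queue_me() and unqueue_me_pi() are sparse context annotations: they tell a "make C=1" build that the lock imbalance across those helpers is intentional. A minimal, self-contained sketch of the pattern, with made-up names:

        static DEFINE_SPINLOCK(example_lock);

        static void example_get(void)
                __acquires(example_lock)
        {
                spin_lock(&example_lock);       /* returns with the lock held */
        }

        static void example_put(void)
                __releases(example_lock)
        {
                spin_unlock(&example_lock);     /* drops a lock taken elsewhere */
        }

Without the annotations, sparse reports a "context imbalance" warning for each such helper even though the pairing is correct at the call sites.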
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d49afb2395e5..06da4dfc339b 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
| @@ -19,7 +19,7 @@ | |||
| 19 | */ | 19 | */ |
| 20 | static inline int | 20 | static inline int |
| 21 | fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, | 21 | fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, |
| 22 | compat_uptr_t __user *head, int *pi) | 22 | compat_uptr_t __user *head, unsigned int *pi) |
| 23 | { | 23 | { |
| 24 | if (get_user(*uentry, head)) | 24 | if (get_user(*uentry, head)) |
| 25 | return -EFAULT; | 25 | return -EFAULT; |
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index ef3c3f88a7a3..f83972b16564 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c | |||
| @@ -33,10 +33,11 @@ | |||
| 33 | * @children: child nodes | 33 | * @children: child nodes |
| 34 | * @all: list head for list of all nodes | 34 | * @all: list head for list of all nodes |
| 35 | * @parent: parent node | 35 | * @parent: parent node |
| 36 | * @info: associated profiling data structure if not a directory | 36 | * @loaded_info: array of pointers to profiling data sets for loaded object |
| 37 | * @ghost: when an object file containing profiling data is unloaded we keep a | 37 | * files. |
| 38 | * copy of the profiling data here to allow collecting coverage data | 38 | * @num_loaded: number of profiling data sets for loaded object files. |
| 39 | * for cleanup code. Such a node is called a "ghost". | 39 | * @unloaded_info: accumulated copy of profiling data sets for unloaded |
| 40 | * object files. Used only when gcov_persist=1. | ||
| 40 | * @dentry: main debugfs entry, either a directory or data file | 41 | * @dentry: main debugfs entry, either a directory or data file |
| 41 | * @links: associated symbolic links | 42 | * @links: associated symbolic links |
| 42 | * @name: data file basename | 43 | * @name: data file basename |
| @@ -51,10 +52,11 @@ struct gcov_node { | |||
| 51 | struct list_head children; | 52 | struct list_head children; |
| 52 | struct list_head all; | 53 | struct list_head all; |
| 53 | struct gcov_node *parent; | 54 | struct gcov_node *parent; |
| 54 | struct gcov_info *info; | 55 | struct gcov_info **loaded_info; |
| 55 | struct gcov_info *ghost; | 56 | struct gcov_info *unloaded_info; |
| 56 | struct dentry *dentry; | 57 | struct dentry *dentry; |
| 57 | struct dentry **links; | 58 | struct dentry **links; |
| 59 | int num_loaded; | ||
| 58 | char name[0]; | 60 | char name[0]; |
| 59 | }; | 61 | }; |
| 60 | 62 | ||
| @@ -136,16 +138,37 @@ static const struct seq_operations gcov_seq_ops = { | |||
| 136 | }; | 138 | }; |
| 137 | 139 | ||
| 138 | /* | 140 | /* |
| 139 | * Return the profiling data set for a given node. This can either be the | 141 | * Return a profiling data set associated with the given node. This is |
| 140 | * original profiling data structure or a duplicate (also called "ghost") | 142 | * either a data set for a loaded object file or a data set copy in case |
| 141 | * in case the associated object file has been unloaded. | 143 | * all associated object files have been unloaded. |
| 142 | */ | 144 | */ |
| 143 | static struct gcov_info *get_node_info(struct gcov_node *node) | 145 | static struct gcov_info *get_node_info(struct gcov_node *node) |
| 144 | { | 146 | { |
| 145 | if (node->info) | 147 | if (node->num_loaded > 0) |
| 146 | return node->info; | 148 | return node->loaded_info[0]; |
| 147 | 149 | ||
| 148 | return node->ghost; | 150 | return node->unloaded_info; |
| 151 | } | ||
| 152 | |||
| 153 | /* | ||
| 154 | * Return a newly allocated profiling data set which contains the sum of | ||
| 155 | * all profiling data associated with the given node. | ||
| 156 | */ | ||
| 157 | static struct gcov_info *get_accumulated_info(struct gcov_node *node) | ||
| 158 | { | ||
| 159 | struct gcov_info *info; | ||
| 160 | int i = 0; | ||
| 161 | |||
| 162 | if (node->unloaded_info) | ||
| 163 | info = gcov_info_dup(node->unloaded_info); | ||
| 164 | else | ||
| 165 | info = gcov_info_dup(node->loaded_info[i++]); | ||
| 166 | if (!info) | ||
| 167 | return NULL; | ||
| 168 | for (; i < node->num_loaded; i++) | ||
| 169 | gcov_info_add(info, node->loaded_info[i]); | ||
| 170 | |||
| 171 | return info; | ||
| 149 | } | 172 | } |
| 150 | 173 | ||
| 151 | /* | 174 | /* |
| @@ -163,9 +186,10 @@ static int gcov_seq_open(struct inode *inode, struct file *file) | |||
| 163 | mutex_lock(&node_lock); | 186 | mutex_lock(&node_lock); |
| 164 | /* | 187 | /* |
| 165 | * Read from a profiling data copy to minimize reference tracking | 188 | * Read from a profiling data copy to minimize reference tracking |
| 166 | * complexity and concurrent access. | 189 | * complexity and concurrent access and to keep accumulating multiple |
| 190 | * profiling data sets associated with one node simple. | ||
| 167 | */ | 191 | */ |
| 168 | info = gcov_info_dup(get_node_info(node)); | 192 | info = get_accumulated_info(node); |
| 169 | if (!info) | 193 | if (!info) |
| 170 | goto out_unlock; | 194 | goto out_unlock; |
| 171 | iter = gcov_iter_new(info); | 195 | iter = gcov_iter_new(info); |
| @@ -225,12 +249,25 @@ static struct gcov_node *get_node_by_name(const char *name) | |||
| 225 | return NULL; | 249 | return NULL; |
| 226 | } | 250 | } |
| 227 | 251 | ||
| 252 | /* | ||
| 253 | * Reset all profiling data associated with the specified node. | ||
| 254 | */ | ||
| 255 | static void reset_node(struct gcov_node *node) | ||
| 256 | { | ||
| 257 | int i; | ||
| 258 | |||
| 259 | if (node->unloaded_info) | ||
| 260 | gcov_info_reset(node->unloaded_info); | ||
| 261 | for (i = 0; i < node->num_loaded; i++) | ||
| 262 | gcov_info_reset(node->loaded_info[i]); | ||
| 263 | } | ||
| 264 | |||
| 228 | static void remove_node(struct gcov_node *node); | 265 | static void remove_node(struct gcov_node *node); |
| 229 | 266 | ||
| 230 | /* | 267 | /* |
| 231 | * write() implementation for gcov data files. Reset profiling data for the | 268 | * write() implementation for gcov data files. Reset profiling data for the |
| 232 | * associated file. If the object file has been unloaded (i.e. this is | 269 | * corresponding file. If all associated object files have been unloaded, |
| 233 | * a "ghost" node), remove the debug fs node as well. | 270 | * remove the debug fs node as well. |
| 234 | */ | 271 | */ |
| 235 | static ssize_t gcov_seq_write(struct file *file, const char __user *addr, | 272 | static ssize_t gcov_seq_write(struct file *file, const char __user *addr, |
| 236 | size_t len, loff_t *pos) | 273 | size_t len, loff_t *pos) |
| @@ -245,10 +282,10 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr, | |||
| 245 | node = get_node_by_name(info->filename); | 282 | node = get_node_by_name(info->filename); |
| 246 | if (node) { | 283 | if (node) { |
| 247 | /* Reset counts or remove node for unloaded modules. */ | 284 | /* Reset counts or remove node for unloaded modules. */ |
| 248 | if (node->ghost) | 285 | if (node->num_loaded == 0) |
| 249 | remove_node(node); | 286 | remove_node(node); |
| 250 | else | 287 | else |
| 251 | gcov_info_reset(node->info); | 288 | reset_node(node); |
| 252 | } | 289 | } |
| 253 | /* Reset counts for open file. */ | 290 | /* Reset counts for open file. */ |
| 254 | gcov_info_reset(info); | 291 | gcov_info_reset(info); |
| @@ -378,7 +415,10 @@ static void init_node(struct gcov_node *node, struct gcov_info *info, | |||
| 378 | INIT_LIST_HEAD(&node->list); | 415 | INIT_LIST_HEAD(&node->list); |
| 379 | INIT_LIST_HEAD(&node->children); | 416 | INIT_LIST_HEAD(&node->children); |
| 380 | INIT_LIST_HEAD(&node->all); | 417 | INIT_LIST_HEAD(&node->all); |
| 381 | node->info = info; | 418 | if (node->loaded_info) { |
| 419 | node->loaded_info[0] = info; | ||
| 420 | node->num_loaded = 1; | ||
| 421 | } | ||
| 382 | node->parent = parent; | 422 | node->parent = parent; |
| 383 | if (name) | 423 | if (name) |
| 384 | strcpy(node->name, name); | 424 | strcpy(node->name, name); |
| @@ -394,9 +434,13 @@ static struct gcov_node *new_node(struct gcov_node *parent, | |||
| 394 | struct gcov_node *node; | 434 | struct gcov_node *node; |
| 395 | 435 | ||
| 396 | node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); | 436 | node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); |
| 397 | if (!node) { | 437 | if (!node) |
| 398 | pr_warning("out of memory\n"); | 438 | goto err_nomem; |
| 399 | return NULL; | 439 | if (info) { |
| 440 | node->loaded_info = kcalloc(1, sizeof(struct gcov_info *), | ||
| 441 | GFP_KERNEL); | ||
| 442 | if (!node->loaded_info) | ||
| 443 | goto err_nomem; | ||
| 400 | } | 444 | } |
| 401 | init_node(node, info, name, parent); | 445 | init_node(node, info, name, parent); |
| 402 | /* Differentiate between gcov data file nodes and directory nodes. */ | 446 | /* Differentiate between gcov data file nodes and directory nodes. */ |
| @@ -416,6 +460,11 @@ static struct gcov_node *new_node(struct gcov_node *parent, | |||
| 416 | list_add(&node->all, &all_head); | 460 | list_add(&node->all, &all_head); |
| 417 | 461 | ||
| 418 | return node; | 462 | return node; |
| 463 | |||
| 464 | err_nomem: | ||
| 465 | kfree(node); | ||
| 466 | pr_warning("out of memory\n"); | ||
| 467 | return NULL; | ||
| 419 | } | 468 | } |
| 420 | 469 | ||
| 421 | /* Remove symbolic links associated with node. */ | 470 | /* Remove symbolic links associated with node. */ |
| @@ -441,8 +490,9 @@ static void release_node(struct gcov_node *node) | |||
| 441 | list_del(&node->all); | 490 | list_del(&node->all); |
| 442 | debugfs_remove(node->dentry); | 491 | debugfs_remove(node->dentry); |
| 443 | remove_links(node); | 492 | remove_links(node); |
| 444 | if (node->ghost) | 493 | kfree(node->loaded_info); |
| 445 | gcov_info_free(node->ghost); | 494 | if (node->unloaded_info) |
| 495 | gcov_info_free(node->unloaded_info); | ||
| 446 | kfree(node); | 496 | kfree(node); |
| 447 | } | 497 | } |
| 448 | 498 | ||
| @@ -477,7 +527,7 @@ static struct gcov_node *get_child_by_name(struct gcov_node *parent, | |||
| 477 | 527 | ||
| 478 | /* | 528 | /* |
| 479 | * write() implementation for reset file. Reset all profiling data to zero | 529 | * write() implementation for reset file. Reset all profiling data to zero |
| 480 | * and remove ghost nodes. | 530 | * and remove nodes for which all associated object files are unloaded. |
| 481 | */ | 531 | */ |
| 482 | static ssize_t reset_write(struct file *file, const char __user *addr, | 532 | static ssize_t reset_write(struct file *file, const char __user *addr, |
| 483 | size_t len, loff_t *pos) | 533 | size_t len, loff_t *pos) |
| @@ -487,8 +537,8 @@ static ssize_t reset_write(struct file *file, const char __user *addr, | |||
| 487 | mutex_lock(&node_lock); | 537 | mutex_lock(&node_lock); |
| 488 | restart: | 538 | restart: |
| 489 | list_for_each_entry(node, &all_head, all) { | 539 | list_for_each_entry(node, &all_head, all) { |
| 490 | if (node->info) | 540 | if (node->num_loaded > 0) |
| 491 | gcov_info_reset(node->info); | 541 | reset_node(node); |
| 492 | else if (list_empty(&node->children)) { | 542 | else if (list_empty(&node->children)) { |
| 493 | remove_node(node); | 543 | remove_node(node); |
| 494 | /* Several nodes may have gone - restart loop. */ | 544 | /* Several nodes may have gone - restart loop. */ |
| @@ -564,37 +614,115 @@ err_remove: | |||
| 564 | } | 614 | } |
| 565 | 615 | ||
| 566 | /* | 616 | /* |
| 567 | * The profiling data set associated with this node is being unloaded. Store a | 617 | * Associate a profiling data set with an existing node. Needs to be called |
| 568 | * copy of the profiling data and turn this node into a "ghost". | 618 | * with node_lock held. |
| 569 | */ | 619 | */ |
| 570 | static int ghost_node(struct gcov_node *node) | 620 | static void add_info(struct gcov_node *node, struct gcov_info *info) |
| 571 | { | 621 | { |
| 572 | node->ghost = gcov_info_dup(node->info); | 622 | struct gcov_info **loaded_info; |
| 573 | if (!node->ghost) { | 623 | int num = node->num_loaded; |
| 574 | pr_warning("could not save data for '%s' (out of memory)\n", | 624 | |
| 575 | node->info->filename); | 625 | /* |
| 576 | return -ENOMEM; | 626 | * Prepare new array. This is done first to simplify cleanup in |
| 627 | * case the new data set is incompatible, the node only contains | ||
| 628 | * unloaded data sets and there's not enough memory for the array. | ||
| 629 | */ | ||
| 630 | loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL); | ||
| 631 | if (!loaded_info) { | ||
| 632 | pr_warning("could not add '%s' (out of memory)\n", | ||
| 633 | info->filename); | ||
| 634 | return; | ||
| 635 | } | ||
| 636 | memcpy(loaded_info, node->loaded_info, | ||
| 637 | num * sizeof(struct gcov_info *)); | ||
| 638 | loaded_info[num] = info; | ||
| 639 | /* Check if the new data set is compatible. */ | ||
| 640 | if (num == 0) { | ||
| 641 | /* | ||
| 642 | * A module was unloaded, modified and reloaded. The new | ||
| 643 | * data set replaces the copy of the last one. | ||
| 644 | */ | ||
| 645 | if (!gcov_info_is_compatible(node->unloaded_info, info)) { | ||
| 646 | pr_warning("discarding saved data for %s " | ||
| 647 | "(incompatible version)\n", info->filename); | ||
| 648 | gcov_info_free(node->unloaded_info); | ||
| 649 | node->unloaded_info = NULL; | ||
| 650 | } | ||
| 651 | } else { | ||
| 652 | /* | ||
| 653 | * Two different versions of the same object file are loaded. | ||
| 654 | * The initial one takes precedence. | ||
| 655 | */ | ||
| 656 | if (!gcov_info_is_compatible(node->loaded_info[0], info)) { | ||
| 657 | pr_warning("could not add '%s' (incompatible " | ||
| 658 | "version)\n", info->filename); | ||
| 659 | kfree(loaded_info); | ||
| 660 | return; | ||
| 661 | } | ||
| 577 | } | 662 | } |
| 578 | node->info = NULL; | 663 | /* Overwrite previous array. */ |
| 664 | kfree(node->loaded_info); | ||
| 665 | node->loaded_info = loaded_info; | ||
| 666 | node->num_loaded = num + 1; | ||
| 667 | } | ||
| 579 | 668 | ||
| 580 | return 0; | 669 | /* |
| 670 | * Return the index of a profiling data set associated with a node. | ||
| 671 | */ | ||
| 672 | static int get_info_index(struct gcov_node *node, struct gcov_info *info) | ||
| 673 | { | ||
| 674 | int i; | ||
| 675 | |||
| 676 | for (i = 0; i < node->num_loaded; i++) { | ||
| 677 | if (node->loaded_info[i] == info) | ||
| 678 | return i; | ||
| 679 | } | ||
| 680 | return -ENOENT; | ||
| 581 | } | 681 | } |
| 582 | 682 | ||
| 583 | /* | 683 | /* |
| 584 | * Profiling data for this node has been loaded again. Add profiling data | 684 | * Save the data of a profiling data set which is being unloaded. |
| 585 | * from previous instantiation and turn this node into a regular node. | ||
| 586 | */ | 685 | */ |
| 587 | static void revive_node(struct gcov_node *node, struct gcov_info *info) | 686 | static void save_info(struct gcov_node *node, struct gcov_info *info) |
| 588 | { | 687 | { |
| 589 | if (gcov_info_is_compatible(node->ghost, info)) | 688 | if (node->unloaded_info) |
| 590 | gcov_info_add(info, node->ghost); | 689 | gcov_info_add(node->unloaded_info, info); |
| 591 | else { | 690 | else { |
| 592 | pr_warning("discarding saved data for '%s' (version changed)\n", | 691 | node->unloaded_info = gcov_info_dup(info); |
| 692 | if (!node->unloaded_info) { | ||
| 693 | pr_warning("could not save data for '%s' " | ||
| 694 | "(out of memory)\n", info->filename); | ||
| 695 | } | ||
| 696 | } | ||
| 697 | } | ||
| 698 | |||
| 699 | /* | ||
| 700 | * Disassociate a profiling data set from a node. Needs to be called with | ||
| 701 | * node_lock held. | ||
| 702 | */ | ||
| 703 | static void remove_info(struct gcov_node *node, struct gcov_info *info) | ||
| 704 | { | ||
| 705 | int i; | ||
| 706 | |||
| 707 | i = get_info_index(node, info); | ||
| 708 | if (i < 0) { | ||
| 709 | pr_warning("could not remove '%s' (not found)\n", | ||
| 593 | info->filename); | 710 | info->filename); |
| 711 | return; | ||
| 594 | } | 712 | } |
| 595 | gcov_info_free(node->ghost); | 713 | if (gcov_persist) |
| 596 | node->ghost = NULL; | 714 | save_info(node, info); |
| 597 | node->info = info; | 715 | /* Shrink array. */ |
| 716 | node->loaded_info[i] = node->loaded_info[node->num_loaded - 1]; | ||
| 717 | node->num_loaded--; | ||
| 718 | if (node->num_loaded > 0) | ||
| 719 | return; | ||
| 720 | /* Last loaded data set was removed. */ | ||
| 721 | kfree(node->loaded_info); | ||
| 722 | node->loaded_info = NULL; | ||
| 723 | node->num_loaded = 0; | ||
| 724 | if (!node->unloaded_info) | ||
| 725 | remove_node(node); | ||
| 598 | } | 726 | } |
| 599 | 727 | ||
| 600 | /* | 728 | /* |
| @@ -609,30 +737,18 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) | |||
| 609 | node = get_node_by_name(info->filename); | 737 | node = get_node_by_name(info->filename); |
| 610 | switch (action) { | 738 | switch (action) { |
| 611 | case GCOV_ADD: | 739 | case GCOV_ADD: |
| 612 | /* Add new node or revive ghost. */ | 740 | if (node) |
| 613 | if (!node) { | 741 | add_info(node, info); |
| 742 | else | ||
| 614 | add_node(info); | 743 | add_node(info); |
| 615 | break; | ||
| 616 | } | ||
| 617 | if (gcov_persist) | ||
| 618 | revive_node(node, info); | ||
| 619 | else { | ||
| 620 | pr_warning("could not add '%s' (already exists)\n", | ||
| 621 | info->filename); | ||
| 622 | } | ||
| 623 | break; | 744 | break; |
| 624 | case GCOV_REMOVE: | 745 | case GCOV_REMOVE: |
| 625 | /* Remove node or turn into ghost. */ | 746 | if (node) |
| 626 | if (!node) { | 747 | remove_info(node, info); |
| 748 | else { | ||
| 627 | pr_warning("could not remove '%s' (not found)\n", | 749 | pr_warning("could not remove '%s' (not found)\n", |
| 628 | info->filename); | 750 | info->filename); |
| 629 | break; | ||
| 630 | } | 751 | } |
| 631 | if (gcov_persist) { | ||
| 632 | if (!ghost_node(node)) | ||
| 633 | break; | ||
| 634 | } | ||
| 635 | remove_node(node); | ||
| 636 | break; | 752 | break; |
| 637 | } | 753 | } |
| 638 | mutex_unlock(&node_lock); | 754 | mutex_unlock(&node_lock); |
diff --git a/kernel/groups.c b/kernel/groups.c index 53b1916c9492..253dc0f35cf4 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
| @@ -143,10 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp) | |||
| 143 | right = group_info->ngroups; | 143 | right = group_info->ngroups; |
| 144 | while (left < right) { | 144 | while (left < right) { |
| 145 | unsigned int mid = (left+right)/2; | 145 | unsigned int mid = (left+right)/2; |
| 146 | int cmp = grp - GROUP_AT(group_info, mid); | 146 | if (grp > GROUP_AT(group_info, mid)) |
| 147 | if (cmp > 0) | ||
| 148 | left = mid + 1; | 147 | left = mid + 1; |
| 149 | else if (cmp < 0) | 148 | else if (grp < GROUP_AT(group_info, mid)) |
| 150 | right = mid; | 149 | right = mid; |
| 151 | else | 150 | else |
| 152 | return 1; | 151 | return 1; |
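The replaced comparison computed the difference of two gid_t values, which are unsigned, and tested the sign of the truncated int; once the gap between the two gids exceeds INT_MAX the sign flips and the binary search walks the wrong half. A small user-space illustration of the failure mode, assuming the usual two's-complement conversion of out-of-range values:

        #include <stdio.h>

        int main(void)
        {
                volatile unsigned int grp = 0x80000001u;        /* large gid */
                volatile unsigned int mid = 1u;
                int cmp = grp - mid;            /* 0x80000000 becomes a negative int */

                printf("cmp = %d, yet grp > mid is %d\n", cmp, grp > mid);
                return 0;
        }

Comparing the values directly, as the new code does, sidesteps the overflow entirely.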
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ce669174f355..72206cf5c6cf 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
| @@ -931,6 +931,7 @@ static inline int | |||
| 931 | remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) | 931 | remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) |
| 932 | { | 932 | { |
| 933 | if (hrtimer_is_queued(timer)) { | 933 | if (hrtimer_is_queued(timer)) { |
| 934 | unsigned long state; | ||
| 934 | int reprogram; | 935 | int reprogram; |
| 935 | 936 | ||
| 936 | /* | 937 | /* |
| @@ -944,8 +945,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) | |||
| 944 | debug_deactivate(timer); | 945 | debug_deactivate(timer); |
| 945 | timer_stats_hrtimer_clear_start_info(timer); | 946 | timer_stats_hrtimer_clear_start_info(timer); |
| 946 | reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); | 947 | reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); |
| 947 | __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, | 948 | /* |
| 948 | reprogram); | 949 | * We must preserve the CALLBACK state flag here, |
| 950 | * otherwise we could move the timer base in | ||
| 951 | * switch_hrtimer_base. | ||
| 952 | */ | ||
| 953 | state = timer->state & HRTIMER_STATE_CALLBACK; | ||
| 954 | __remove_hrtimer(timer, base, state, reprogram); | ||
| 949 | return 1; | 955 | return 1; |
| 950 | } | 956 | } |
| 951 | return 0; | 957 | return 0; |
| @@ -1091,11 +1097,10 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); | |||
| 1091 | */ | 1097 | */ |
| 1092 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) | 1098 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) |
| 1093 | { | 1099 | { |
| 1094 | struct hrtimer_clock_base *base; | ||
| 1095 | unsigned long flags; | 1100 | unsigned long flags; |
| 1096 | ktime_t rem; | 1101 | ktime_t rem; |
| 1097 | 1102 | ||
| 1098 | base = lock_hrtimer_base(timer, &flags); | 1103 | lock_hrtimer_base(timer, &flags); |
| 1099 | rem = hrtimer_expires_remaining(timer); | 1104 | rem = hrtimer_expires_remaining(timer); |
| 1100 | unlock_hrtimer_base(timer, &flags); | 1105 | unlock_hrtimer_base(timer, &flags); |
| 1101 | 1106 | ||
| @@ -1232,6 +1237,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) | |||
| 1232 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); | 1237 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); |
| 1233 | enqueue_hrtimer(timer, base); | 1238 | enqueue_hrtimer(timer, base); |
| 1234 | } | 1239 | } |
| 1240 | |||
| 1241 | WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); | ||
| 1242 | |||
| 1235 | timer->state &= ~HRTIMER_STATE_CALLBACK; | 1243 | timer->state &= ~HRTIMER_STATE_CALLBACK; |
| 1236 | } | 1244 | } |
| 1237 | 1245 | ||
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0c642d51aac2..53ead174da2f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
| @@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
| 98 | printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | 98 | printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" |
| 99 | " disables this message.\n"); | 99 | " disables this message.\n"); |
| 100 | sched_show_task(t); | 100 | sched_show_task(t); |
| 101 | __debug_show_held_locks(t); | 101 | debug_show_held_locks(t); |
| 102 | 102 | ||
| 103 | touch_nmi_watchdog(); | 103 | touch_nmi_watchdog(); |
| 104 | 104 | ||
| @@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
| 111 | * periodically exit the critical section and enter a new one. | 111 | * periodically exit the critical section and enter a new one. |
| 112 | * | 112 | * |
| 113 | * For preemptible RCU it is sufficient to call rcu_read_unlock in order | 113 | * For preemptible RCU it is sufficient to call rcu_read_unlock in order |
| 114 | * exit the grace period. For classic RCU, a reschedule is required. | 114 | * to exit the grace period. For classic RCU, a reschedule is required. |
| 115 | */ | 115 | */ |
| 116 | static void rcu_lock_break(struct task_struct *g, struct task_struct *t) | 116 | static void rcu_lock_break(struct task_struct *g, struct task_struct *t) |
| 117 | { | 117 | { |
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index d71a987fd2bf..2c9120f0afca 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c | |||
| @@ -113,12 +113,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) | |||
| 113 | */ | 113 | */ |
| 114 | static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) | 114 | static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) |
| 115 | { | 115 | { |
| 116 | struct perf_event_context *ctx = bp->ctx; | 116 | struct task_struct *tsk = bp->hw.bp_target; |
| 117 | struct perf_event *iter; | 117 | struct perf_event *iter; |
| 118 | int count = 0; | 118 | int count = 0; |
| 119 | 119 | ||
| 120 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { | 120 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { |
| 121 | if (iter->ctx == ctx && find_slot_idx(iter) == type) | 121 | if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) |
| 122 | count += hw_breakpoint_weight(iter); | 122 | count += hw_breakpoint_weight(iter); |
| 123 | } | 123 | } |
| 124 | 124 | ||
| @@ -134,7 +134,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | |||
| 134 | enum bp_type_idx type) | 134 | enum bp_type_idx type) |
| 135 | { | 135 | { |
| 136 | int cpu = bp->cpu; | 136 | int cpu = bp->cpu; |
| 137 | struct task_struct *tsk = bp->ctx->task; | 137 | struct task_struct *tsk = bp->hw.bp_target; |
| 138 | 138 | ||
| 139 | if (cpu >= 0) { | 139 | if (cpu >= 0) { |
| 140 | slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); | 140 | slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); |
| @@ -213,7 +213,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, | |||
| 213 | int weight) | 213 | int weight) |
| 214 | { | 214 | { |
| 215 | int cpu = bp->cpu; | 215 | int cpu = bp->cpu; |
| 216 | struct task_struct *tsk = bp->ctx->task; | 216 | struct task_struct *tsk = bp->hw.bp_target; |
| 217 | 217 | ||
| 218 | /* Pinned counter cpu profiling */ | 218 | /* Pinned counter cpu profiling */ |
| 219 | if (!tsk) { | 219 | if (!tsk) { |
| @@ -433,7 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, | |||
| 433 | perf_overflow_handler_t triggered, | 433 | perf_overflow_handler_t triggered, |
| 434 | struct task_struct *tsk) | 434 | struct task_struct *tsk) |
| 435 | { | 435 | { |
| 436 | return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); | 436 | return perf_event_create_kernel_counter(attr, -1, tsk, triggered); |
| 437 | } | 437 | } |
| 438 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); | 438 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); |
| 439 | 439 | ||
| @@ -515,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, | |||
| 515 | get_online_cpus(); | 515 | get_online_cpus(); |
| 516 | for_each_online_cpu(cpu) { | 516 | for_each_online_cpu(cpu) { |
| 517 | pevent = per_cpu_ptr(cpu_events, cpu); | 517 | pevent = per_cpu_ptr(cpu_events, cpu); |
| 518 | bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); | 518 | bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); |
| 519 | 519 | ||
| 520 | *pevent = bp; | 520 | *pevent = bp; |
| 521 | 521 | ||
| @@ -565,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = { | |||
| 565 | .priority = 0x7fffffff | 565 | .priority = 0x7fffffff |
| 566 | }; | 566 | }; |
| 567 | 567 | ||
| 568 | static void bp_perf_event_destroy(struct perf_event *event) | ||
| 569 | { | ||
| 570 | release_bp_slot(event); | ||
| 571 | } | ||
| 572 | |||
| 573 | static int hw_breakpoint_event_init(struct perf_event *bp) | ||
| 574 | { | ||
| 575 | int err; | ||
| 576 | |||
| 577 | if (bp->attr.type != PERF_TYPE_BREAKPOINT) | ||
| 578 | return -ENOENT; | ||
| 579 | |||
| 580 | err = register_perf_hw_breakpoint(bp); | ||
| 581 | if (err) | ||
| 582 | return err; | ||
| 583 | |||
| 584 | bp->destroy = bp_perf_event_destroy; | ||
| 585 | |||
| 586 | return 0; | ||
| 587 | } | ||
| 588 | |||
| 589 | static int hw_breakpoint_add(struct perf_event *bp, int flags) | ||
| 590 | { | ||
| 591 | if (!(flags & PERF_EF_START)) | ||
| 592 | bp->hw.state = PERF_HES_STOPPED; | ||
| 593 | |||
| 594 | return arch_install_hw_breakpoint(bp); | ||
| 595 | } | ||
| 596 | |||
| 597 | static void hw_breakpoint_del(struct perf_event *bp, int flags) | ||
| 598 | { | ||
| 599 | arch_uninstall_hw_breakpoint(bp); | ||
| 600 | } | ||
| 601 | |||
| 602 | static void hw_breakpoint_start(struct perf_event *bp, int flags) | ||
| 603 | { | ||
| 604 | bp->hw.state = 0; | ||
| 605 | } | ||
| 606 | |||
| 607 | static void hw_breakpoint_stop(struct perf_event *bp, int flags) | ||
| 608 | { | ||
| 609 | bp->hw.state = PERF_HES_STOPPED; | ||
| 610 | } | ||
| 611 | |||
| 612 | static struct pmu perf_breakpoint = { | ||
| 613 | .task_ctx_nr = perf_sw_context, /* could eventually get its own */ | ||
| 614 | |||
| 615 | .event_init = hw_breakpoint_event_init, | ||
| 616 | .add = hw_breakpoint_add, | ||
| 617 | .del = hw_breakpoint_del, | ||
| 618 | .start = hw_breakpoint_start, | ||
| 619 | .stop = hw_breakpoint_stop, | ||
| 620 | .read = hw_breakpoint_pmu_read, | ||
| 621 | }; | ||
| 622 | |||
| 568 | static int __init init_hw_breakpoint(void) | 623 | static int __init init_hw_breakpoint(void) |
| 569 | { | 624 | { |
| 570 | unsigned int **task_bp_pinned; | 625 | unsigned int **task_bp_pinned; |
| @@ -586,6 +641,8 @@ static int __init init_hw_breakpoint(void) | |||
| 586 | 641 | ||
| 587 | constraints_initialized = 1; | 642 | constraints_initialized = 1; |
| 588 | 643 | ||
| 644 | perf_pmu_register(&perf_breakpoint); | ||
| 645 | |||
| 589 | return register_die_notifier(&hw_breakpoint_exceptions_nb); | 646 | return register_die_notifier(&hw_breakpoint_exceptions_nb); |
| 590 | 647 | ||
| 591 | err_alloc: | 648 | err_alloc: |
| @@ -601,8 +658,3 @@ static int __init init_hw_breakpoint(void) | |||
| 601 | core_initcall(init_hw_breakpoint); | 658 | core_initcall(init_hw_breakpoint); |
| 602 | 659 | ||
| 603 | 660 | ||
| 604 | struct pmu perf_ops_bp = { | ||
| 605 | .enable = arch_install_hw_breakpoint, | ||
| 606 | .disable = arch_uninstall_hw_breakpoint, | ||
| 607 | .read = hw_breakpoint_pmu_read, | ||
| 608 | }; | ||
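The call-site changes above show the new shape of perf_event_create_kernel_counter(): it takes a struct task_struct pointer (or NULL for a per-CPU counter) instead of a pid. A hedged caller-side sketch, with the wrapper name invented for illustration:

        static struct perf_event *
        attach_task_breakpoint(struct perf_event_attr *attr,
                               struct task_struct *tsk,
                               perf_overflow_handler_t handler)
        {
                /* before: perf_event_create_kernel_counter(attr, -1, tsk->pid, handler) */
                return perf_event_create_kernel_counter(attr, -1, tsk, handler);
        }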
diff --git a/kernel/irq_work.c b/kernel/irq_work.c new file mode 100644 index 000000000000..f16763ff8481 --- /dev/null +++ b/kernel/irq_work.c | |||
| @@ -0,0 +1,164 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
| 3 | * | ||
| 4 | * Provides a framework for enqueueing and running callbacks from hardirq | ||
| 5 | * context. The enqueueing is NMI-safe. | ||
| 6 | */ | ||
| 7 | |||
| 8 | #include <linux/kernel.h> | ||
| 9 | #include <linux/module.h> | ||
| 10 | #include <linux/irq_work.h> | ||
| 11 | #include <linux/hardirq.h> | ||
| 12 | |||
| 13 | /* | ||
| 14 | * An entry can be in one of four states: | ||
| 15 | * | ||
| 16 | * free NULL, 0 -> {claimed} : free to be used | ||
| 17 | * claimed NULL, 3 -> {pending} : claimed to be enqueued | ||
| 18 | * pending next, 3 -> {busy} : queued, pending callback | ||
| 19 | * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed | ||
| 20 | * | ||
| 21 | * We use the lower two bits of the next pointer to keep PENDING and BUSY | ||
| 22 | * flags. | ||
| 23 | */ | ||
| 24 | |||
| 25 | #define IRQ_WORK_PENDING 1UL | ||
| 26 | #define IRQ_WORK_BUSY 2UL | ||
| 27 | #define IRQ_WORK_FLAGS 3UL | ||
| 28 | |||
| 29 | static inline bool irq_work_is_set(struct irq_work *entry, int flags) | ||
| 30 | { | ||
| 31 | return (unsigned long)entry->next & flags; | ||
| 32 | } | ||
| 33 | |||
| 34 | static inline struct irq_work *irq_work_next(struct irq_work *entry) | ||
| 35 | { | ||
| 36 | unsigned long next = (unsigned long)entry->next; | ||
| 37 | next &= ~IRQ_WORK_FLAGS; | ||
| 38 | return (struct irq_work *)next; | ||
| 39 | } | ||
| 40 | |||
| 41 | static inline struct irq_work *next_flags(struct irq_work *entry, int flags) | ||
| 42 | { | ||
| 43 | unsigned long next = (unsigned long)entry; | ||
| 44 | next |= flags; | ||
| 45 | return (struct irq_work *)next; | ||
| 46 | } | ||
| 47 | |||
| 48 | static DEFINE_PER_CPU(struct irq_work *, irq_work_list); | ||
| 49 | |||
| 50 | /* | ||
| 51 | * Claim the entry so that no one else will poke at it. | ||
| 52 | */ | ||
| 53 | static bool irq_work_claim(struct irq_work *entry) | ||
| 54 | { | ||
| 55 | struct irq_work *next, *nflags; | ||
| 56 | |||
| 57 | do { | ||
| 58 | next = entry->next; | ||
| 59 | if ((unsigned long)next & IRQ_WORK_PENDING) | ||
| 60 | return false; | ||
| 61 | nflags = next_flags(next, IRQ_WORK_FLAGS); | ||
| 62 | } while (cmpxchg(&entry->next, next, nflags) != next); | ||
| 63 | |||
| 64 | return true; | ||
| 65 | } | ||
| 66 | |||
| 67 | |||
| 68 | void __weak arch_irq_work_raise(void) | ||
| 69 | { | ||
| 70 | /* | ||
| 71 | * Lame architectures will get the timer tick callback | ||
| 72 | */ | ||
| 73 | } | ||
| 74 | |||
| 75 | /* | ||
| 76 | * Queue the entry and raise the IPI if needed. | ||
| 77 | */ | ||
| 78 | static void __irq_work_queue(struct irq_work *entry) | ||
| 79 | { | ||
| 80 | struct irq_work **head, *next; | ||
| 81 | |||
| 82 | head = &get_cpu_var(irq_work_list); | ||
| 83 | |||
| 84 | do { | ||
| 85 | next = *head; | ||
| 86 | /* Can assign non-atomic because we keep the flags set. */ | ||
| 87 | entry->next = next_flags(next, IRQ_WORK_FLAGS); | ||
| 88 | } while (cmpxchg(head, next, entry) != next); | ||
| 89 | |||
| 90 | /* The list was empty, raise self-interrupt to start processing. */ | ||
| 91 | if (!irq_work_next(entry)) | ||
| 92 | arch_irq_work_raise(); | ||
| 93 | |||
| 94 | put_cpu_var(irq_work_list); | ||
| 95 | } | ||
| 96 | |||
| 97 | /* | ||
| 98 | * Enqueue the irq_work @entry, returns true on success, false when the | ||
| 99 | * @entry was already enqueued by someone else. | ||
| 100 | * | ||
| 101 | * Can be re-enqueued while the callback is still in progress. | ||
| 102 | */ | ||
| 103 | bool irq_work_queue(struct irq_work *entry) | ||
| 104 | { | ||
| 105 | if (!irq_work_claim(entry)) { | ||
| 106 | /* | ||
| 107 | * Already enqueued, can't do! | ||
| 108 | */ | ||
| 109 | return false; | ||
| 110 | } | ||
| 111 | |||
| 112 | __irq_work_queue(entry); | ||
| 113 | return true; | ||
| 114 | } | ||
| 115 | EXPORT_SYMBOL_GPL(irq_work_queue); | ||
| 116 | |||
| 117 | /* | ||
| 118 | * Run the irq_work entries on this cpu. Must be run from hardirq | ||
| 119 | * context with local IRQs disabled. | ||
| 120 | */ | ||
| 121 | void irq_work_run(void) | ||
| 122 | { | ||
| 123 | struct irq_work *list, **head; | ||
| 124 | |||
| 125 | head = &__get_cpu_var(irq_work_list); | ||
| 126 | if (*head == NULL) | ||
| 127 | return; | ||
| 128 | |||
| 129 | BUG_ON(!in_irq()); | ||
| 130 | BUG_ON(!irqs_disabled()); | ||
| 131 | |||
| 132 | list = xchg(head, NULL); | ||
| 133 | while (list != NULL) { | ||
| 134 | struct irq_work *entry = list; | ||
| 135 | |||
| 136 | list = irq_work_next(list); | ||
| 137 | |||
| 138 | /* | ||
| 139 | * Clear the PENDING bit, after this point the @entry | ||
| 140 | * can be re-used. | ||
| 141 | */ | ||
| 142 | entry->next = next_flags(NULL, IRQ_WORK_BUSY); | ||
| 143 | entry->func(entry); | ||
| 144 | /* | ||
| 145 | * Clear the BUSY bit and return to the free state if | ||
| 146 | * no-one else claimed it meanwhile. | ||
| 147 | */ | ||
| 148 | cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); | ||
| 149 | } | ||
| 150 | } | ||
| 151 | EXPORT_SYMBOL_GPL(irq_work_run); | ||
| 152 | |||
| 153 | /* | ||
| 154 | * Synchronize against the irq_work @entry, ensures the entry is not | ||
| 155 | * currently in use. | ||
| 156 | */ | ||
| 157 | void irq_work_sync(struct irq_work *entry) | ||
| 158 | { | ||
| 159 | WARN_ON_ONCE(irqs_disabled()); | ||
| 160 | |||
| 161 | while (irq_work_is_set(entry, IRQ_WORK_BUSY)) | ||
| 162 | cpu_relax(); | ||
| 163 | } | ||
| 164 | EXPORT_SYMBOL_GPL(irq_work_sync); | ||
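The state-machine comment at the top of irq_work.c packs the PENDING and BUSY flags into the two low bits of the ->next pointer, which works because irq_work nodes are at least word aligned. A minimal standalone C sketch of that claim/queue/run handshake is shown below; it is single-threaded userspace code with plain assignments standing in for the kernel's cmpxchg()/xchg() loops, and the demo_* names are illustrative rather than part of the kernel API.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define DEMO_PENDING 1UL
#define DEMO_BUSY    2UL
#define DEMO_FLAGS   3UL

struct demo_work {
	struct demo_work *next;            /* low two bits carry PENDING/BUSY */
	void (*func)(struct demo_work *);
};

static struct demo_work *demo_list;        /* stands in for the per-cpu irq_work_list */

static struct demo_work *demo_next(struct demo_work *work)
{
	return (struct demo_work *)((uintptr_t)work->next & ~DEMO_FLAGS);
}

static struct demo_work *with_flags(struct demo_work *work, unsigned long flags)
{
	return (struct demo_work *)((uintptr_t)work | flags);
}

static bool demo_claim(struct demo_work *work)
{
	if ((uintptr_t)work->next & DEMO_PENDING)
		return false;                              /* already queued by someone else */
	work->next = with_flags(work->next, DEMO_FLAGS);   /* kernel: cmpxchg() loop */
	return true;
}

static void demo_queue(struct demo_work *work)
{
	work->next = with_flags(demo_list, DEMO_FLAGS);    /* keep flags set while linking */
	demo_list = work;                                  /* kernel: cmpxchg() on the head */
}

static void demo_run(void)
{
	struct demo_work *list = demo_list;                /* kernel: xchg(head, NULL) */

	demo_list = NULL;
	while (list) {
		struct demo_work *work = list;

		list = demo_next(work);
		work->next = with_flags(NULL, DEMO_BUSY);      /* drop PENDING: re-usable */
		work->func(work);
		if (work->next == with_flags(NULL, DEMO_BUSY))
			work->next = NULL;                         /* kernel: cmpxchg() back to free */
	}
}

static void hello(struct demo_work *work)
{
	printf("callback ran for %p\n", (void *)work);
}

int main(void)
{
	struct demo_work a = { NULL, hello };

	if (demo_claim(&a))
		demo_queue(&a);
	printf("claim while pending: %d\n", demo_claim(&a));   /* 0: already queued */
	demo_run();
	return 0;
}

The property the sketch preserves is the one the state table describes: a node with PENDING set cannot be claimed again, while a node left in the BUSY state can be re-claimed and re-queued, even from its own callback.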
diff --git a/kernel/jump_label.c b/kernel/jump_label.c new file mode 100644 index 000000000000..7be868bf25c6 --- /dev/null +++ b/kernel/jump_label.c | |||
| @@ -0,0 +1,429 @@ | |||
| 1 | /* | ||
| 2 | * jump label support | ||
| 3 | * | ||
| 4 | * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> | ||
| 5 | * | ||
| 6 | */ | ||
| 7 | #include <linux/jump_label.h> | ||
| 8 | #include <linux/memory.h> | ||
| 9 | #include <linux/uaccess.h> | ||
| 10 | #include <linux/module.h> | ||
| 11 | #include <linux/list.h> | ||
| 12 | #include <linux/jhash.h> | ||
| 13 | #include <linux/slab.h> | ||
| 14 | #include <linux/sort.h> | ||
| 15 | #include <linux/err.h> | ||
| 16 | |||
| 17 | #ifdef HAVE_JUMP_LABEL | ||
| 18 | |||
| 19 | #define JUMP_LABEL_HASH_BITS 6 | ||
| 20 | #define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS) | ||
| 21 | static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE]; | ||
| 22 | |||
| 23 | /* mutex to protect coming/going of the jump_label table */ | ||
| 24 | static DEFINE_MUTEX(jump_label_mutex); | ||
| 25 | |||
| 26 | struct jump_label_entry { | ||
| 27 | struct hlist_node hlist; | ||
| 28 | struct jump_entry *table; | ||
| 29 | int nr_entries; | ||
| 30 | /* hang modules off here */ | ||
| 31 | struct hlist_head modules; | ||
| 32 | unsigned long key; | ||
| 33 | }; | ||
| 34 | |||
| 35 | struct jump_label_module_entry { | ||
| 36 | struct hlist_node hlist; | ||
| 37 | struct jump_entry *table; | ||
| 38 | int nr_entries; | ||
| 39 | struct module *mod; | ||
| 40 | }; | ||
| 41 | |||
| 42 | static int jump_label_cmp(const void *a, const void *b) | ||
| 43 | { | ||
| 44 | const struct jump_entry *jea = a; | ||
| 45 | const struct jump_entry *jeb = b; | ||
| 46 | |||
| 47 | if (jea->key < jeb->key) | ||
| 48 | return -1; | ||
| 49 | |||
| 50 | if (jea->key > jeb->key) | ||
| 51 | return 1; | ||
| 52 | |||
| 53 | return 0; | ||
| 54 | } | ||
| 55 | |||
| 56 | static void | ||
| 57 | sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) | ||
| 58 | { | ||
| 59 | unsigned long size; | ||
| 60 | |||
| 61 | size = (((unsigned long)stop - (unsigned long)start) | ||
| 62 | / sizeof(struct jump_entry)); | ||
| 63 | sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); | ||
| 64 | } | ||
| 65 | |||
| 66 | static struct jump_label_entry *get_jump_label_entry(jump_label_t key) | ||
| 67 | { | ||
| 68 | struct hlist_head *head; | ||
| 69 | struct hlist_node *node; | ||
| 70 | struct jump_label_entry *e; | ||
| 71 | u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0); | ||
| 72 | |||
| 73 | head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; | ||
| 74 | hlist_for_each_entry(e, node, head, hlist) { | ||
| 75 | if (key == e->key) | ||
| 76 | return e; | ||
| 77 | } | ||
| 78 | return NULL; | ||
| 79 | } | ||
| 80 | |||
| 81 | static struct jump_label_entry * | ||
| 82 | add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table) | ||
| 83 | { | ||
| 84 | struct hlist_head *head; | ||
| 85 | struct jump_label_entry *e; | ||
| 86 | u32 hash; | ||
| 87 | |||
| 88 | e = get_jump_label_entry(key); | ||
| 89 | if (e) | ||
| 90 | return ERR_PTR(-EEXIST); | ||
| 91 | |||
| 92 | e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL); | ||
| 93 | if (!e) | ||
| 94 | return ERR_PTR(-ENOMEM); | ||
| 95 | |||
| 96 | hash = jhash((void *)&key, sizeof(jump_label_t), 0); | ||
| 97 | head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; | ||
| 98 | e->key = key; | ||
| 99 | e->table = table; | ||
| 100 | e->nr_entries = nr_entries; | ||
| 101 | INIT_HLIST_HEAD(&(e->modules)); | ||
| 102 | hlist_add_head(&e->hlist, head); | ||
| 103 | return e; | ||
| 104 | } | ||
| 105 | |||
| 106 | static int | ||
| 107 | build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) | ||
| 108 | { | ||
| 109 | struct jump_entry *iter, *iter_begin; | ||
| 110 | struct jump_label_entry *entry; | ||
| 111 | int count; | ||
| 112 | |||
| 113 | sort_jump_label_entries(start, stop); | ||
| 114 | iter = start; | ||
| 115 | while (iter < stop) { | ||
| 116 | entry = get_jump_label_entry(iter->key); | ||
| 117 | if (!entry) { | ||
| 118 | iter_begin = iter; | ||
| 119 | count = 0; | ||
| 120 | while ((iter < stop) && | ||
| 121 | (iter->key == iter_begin->key)) { | ||
| 122 | iter++; | ||
| 123 | count++; | ||
| 124 | } | ||
| 125 | entry = add_jump_label_entry(iter_begin->key, | ||
| 126 | count, iter_begin); | ||
| 127 | if (IS_ERR(entry)) | ||
| 128 | return PTR_ERR(entry); | ||
| 129 | } else { | ||
| 130 | WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n"); | ||
| 131 | return -1; | ||
| 132 | } | ||
| 133 | } | ||
| 134 | return 0; | ||
| 135 | } | ||
| 136 | |||
| 137 | /*** | ||
| 138 | * jump_label_update - update jump label text | ||
| 139 | * @key - key value associated with a jump label | ||
| 140 | * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE | ||
| 141 | * | ||
| 142 | * Will enable/disable the jump for jump label @key, depending on the | ||
| 143 | * value of @type. | ||
| 144 | * | ||
| 145 | */ | ||
| 146 | |||
| 147 | void jump_label_update(unsigned long key, enum jump_label_type type) | ||
| 148 | { | ||
| 149 | struct jump_entry *iter; | ||
| 150 | struct jump_label_entry *entry; | ||
| 151 | struct hlist_node *module_node; | ||
| 152 | struct jump_label_module_entry *e_module; | ||
| 153 | int count; | ||
| 154 | |||
| 155 | mutex_lock(&jump_label_mutex); | ||
| 156 | entry = get_jump_label_entry((jump_label_t)key); | ||
| 157 | if (entry) { | ||
| 158 | count = entry->nr_entries; | ||
| 159 | iter = entry->table; | ||
| 160 | while (count--) { | ||
| 161 | if (kernel_text_address(iter->code)) | ||
| 162 | arch_jump_label_transform(iter, type); | ||
| 163 | iter++; | ||
| 164 | } | ||
| 165 | /* enable/disable jump labels in modules */ | ||
| 166 | hlist_for_each_entry(e_module, module_node, &(entry->modules), | ||
| 167 | hlist) { | ||
| 168 | count = e_module->nr_entries; | ||
| 169 | iter = e_module->table; | ||
| 170 | while (count--) { | ||
| 171 | if (kernel_text_address(iter->code)) | ||
| 172 | arch_jump_label_transform(iter, type); | ||
| 173 | iter++; | ||
| 174 | } | ||
| 175 | } | ||
| 176 | } | ||
| 177 | mutex_unlock(&jump_label_mutex); | ||
| 178 | } | ||
| 179 | |||
| 180 | static int addr_conflict(struct jump_entry *entry, void *start, void *end) | ||
| 181 | { | ||
| 182 | if (entry->code <= (unsigned long)end && | ||
| 183 | entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start) | ||
| 184 | return 1; | ||
| 185 | |||
| 186 | return 0; | ||
| 187 | } | ||
| 188 | |||
| 189 | #ifdef CONFIG_MODULES | ||
| 190 | |||
| 191 | static int module_conflict(void *start, void *end) | ||
| 192 | { | ||
| 193 | struct hlist_head *head; | ||
| 194 | struct hlist_node *node, *node_next, *module_node, *module_node_next; | ||
| 195 | struct jump_label_entry *e; | ||
| 196 | struct jump_label_module_entry *e_module; | ||
| 197 | struct jump_entry *iter; | ||
| 198 | int i, count; | ||
| 199 | int conflict = 0; | ||
| 200 | |||
| 201 | for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { | ||
| 202 | head = &jump_label_table[i]; | ||
| 203 | hlist_for_each_entry_safe(e, node, node_next, head, hlist) { | ||
| 204 | hlist_for_each_entry_safe(e_module, module_node, | ||
| 205 | module_node_next, | ||
| 206 | &(e->modules), hlist) { | ||
| 207 | count = e_module->nr_entries; | ||
| 208 | iter = e_module->table; | ||
| 209 | while (count--) { | ||
| 210 | if (addr_conflict(iter, start, end)) { | ||
| 211 | conflict = 1; | ||
| 212 | goto out; | ||
| 213 | } | ||
| 214 | iter++; | ||
| 215 | } | ||
| 216 | } | ||
| 217 | } | ||
| 218 | } | ||
| 219 | out: | ||
| 220 | return conflict; | ||
| 221 | } | ||
| 222 | |||
| 223 | #endif | ||
| 224 | |||
| 225 | /*** | ||
| 226 | * jump_label_text_reserved - check if addr range is reserved | ||
| 227 | * @start: start text addr | ||
| 228 | * @end: end text addr | ||
| 229 | * | ||
| 230 | * checks if the text addr located between @start and @end | ||
| 231 | * overlaps with any of the jump label patch addresses. Code | ||
| 232 | * that wants to modify kernel text should first verify that | ||
| 233 | * it does not overlap with any of the jump label addresses. | ||
| 234 | * | ||
| 235 | * returns 1 if there is an overlap, 0 otherwise | ||
| 236 | */ | ||
| 237 | int jump_label_text_reserved(void *start, void *end) | ||
| 238 | { | ||
| 239 | struct jump_entry *iter; | ||
| 240 | struct jump_entry *iter_start = __start___jump_table; | ||
| 241 | struct jump_entry *iter_stop = __stop___jump_table; | ||
| 242 | int conflict = 0; | ||
| 243 | |||
| 244 | mutex_lock(&jump_label_mutex); | ||
| 245 | iter = iter_start; | ||
| 246 | while (iter < iter_stop) { | ||
| 247 | if (addr_conflict(iter, start, end)) { | ||
| 248 | conflict = 1; | ||
| 249 | goto out; | ||
| 250 | } | ||
| 251 | iter++; | ||
| 252 | } | ||
| 253 | |||
| 254 | /* now check modules */ | ||
| 255 | #ifdef CONFIG_MODULES | ||
| 256 | conflict = module_conflict(start, end); | ||
| 257 | #endif | ||
| 258 | out: | ||
| 259 | mutex_unlock(&jump_label_mutex); | ||
| 260 | return conflict; | ||
| 261 | } | ||
| 262 | |||
| 263 | static __init int init_jump_label(void) | ||
| 264 | { | ||
| 265 | int ret; | ||
| 266 | struct jump_entry *iter_start = __start___jump_table; | ||
| 267 | struct jump_entry *iter_stop = __stop___jump_table; | ||
| 268 | struct jump_entry *iter; | ||
| 269 | |||
| 270 | mutex_lock(&jump_label_mutex); | ||
| 271 | ret = build_jump_label_hashtable(__start___jump_table, | ||
| 272 | __stop___jump_table); | ||
| 273 | iter = iter_start; | ||
| 274 | while (iter < iter_stop) { | ||
| 275 | arch_jump_label_text_poke_early(iter->code); | ||
| 276 | iter++; | ||
| 277 | } | ||
| 278 | mutex_unlock(&jump_label_mutex); | ||
| 279 | return ret; | ||
| 280 | } | ||
| 281 | early_initcall(init_jump_label); | ||
| 282 | |||
| 283 | #ifdef CONFIG_MODULES | ||
| 284 | |||
| 285 | static struct jump_label_module_entry * | ||
| 286 | add_jump_label_module_entry(struct jump_label_entry *entry, | ||
| 287 | struct jump_entry *iter_begin, | ||
| 288 | int count, struct module *mod) | ||
| 289 | { | ||
| 290 | struct jump_label_module_entry *e; | ||
| 291 | |||
| 292 | e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); | ||
| 293 | if (!e) | ||
| 294 | return ERR_PTR(-ENOMEM); | ||
| 295 | e->mod = mod; | ||
| 296 | e->nr_entries = count; | ||
| 297 | e->table = iter_begin; | ||
| 298 | hlist_add_head(&e->hlist, &entry->modules); | ||
| 299 | return e; | ||
| 300 | } | ||
| 301 | |||
| 302 | static int add_jump_label_module(struct module *mod) | ||
| 303 | { | ||
| 304 | struct jump_entry *iter, *iter_begin; | ||
| 305 | struct jump_label_entry *entry; | ||
| 306 | struct jump_label_module_entry *module_entry; | ||
| 307 | int count; | ||
| 308 | |||
| 309 | /* if the module doesn't have jump label entries, just return */ | ||
| 310 | if (!mod->num_jump_entries) | ||
| 311 | return 0; | ||
| 312 | |||
| 313 | sort_jump_label_entries(mod->jump_entries, | ||
| 314 | mod->jump_entries + mod->num_jump_entries); | ||
| 315 | iter = mod->jump_entries; | ||
| 316 | while (iter < mod->jump_entries + mod->num_jump_entries) { | ||
| 317 | entry = get_jump_label_entry(iter->key); | ||
| 318 | iter_begin = iter; | ||
| 319 | count = 0; | ||
| 320 | while ((iter < mod->jump_entries + mod->num_jump_entries) && | ||
| 321 | (iter->key == iter_begin->key)) { | ||
| 322 | iter++; | ||
| 323 | count++; | ||
| 324 | } | ||
| 325 | if (!entry) { | ||
| 326 | entry = add_jump_label_entry(iter_begin->key, 0, NULL); | ||
| 327 | if (IS_ERR(entry)) | ||
| 328 | return PTR_ERR(entry); | ||
| 329 | } | ||
| 330 | module_entry = add_jump_label_module_entry(entry, iter_begin, | ||
| 331 | count, mod); | ||
| 332 | if (IS_ERR(module_entry)) | ||
| 333 | return PTR_ERR(module_entry); | ||
| 334 | } | ||
| 335 | return 0; | ||
| 336 | } | ||
| 337 | |||
| 338 | static void remove_jump_label_module(struct module *mod) | ||
| 339 | { | ||
| 340 | struct hlist_head *head; | ||
| 341 | struct hlist_node *node, *node_next, *module_node, *module_node_next; | ||
| 342 | struct jump_label_entry *e; | ||
| 343 | struct jump_label_module_entry *e_module; | ||
| 344 | int i; | ||
| 345 | |||
| 346 | /* if the module doesn't have jump label entries, just return */ | ||
| 347 | if (!mod->num_jump_entries) | ||
| 348 | return; | ||
| 349 | |||
| 350 | for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { | ||
| 351 | head = &jump_label_table[i]; | ||
| 352 | hlist_for_each_entry_safe(e, node, node_next, head, hlist) { | ||
| 353 | hlist_for_each_entry_safe(e_module, module_node, | ||
| 354 | module_node_next, | ||
| 355 | &(e->modules), hlist) { | ||
| 356 | if (e_module->mod == mod) { | ||
| 357 | hlist_del(&e_module->hlist); | ||
| 358 | kfree(e_module); | ||
| 359 | } | ||
| 360 | } | ||
| 361 | if (hlist_empty(&e->modules) && (e->nr_entries == 0)) { | ||
| 362 | hlist_del(&e->hlist); | ||
| 363 | kfree(e); | ||
| 364 | } | ||
| 365 | } | ||
| 366 | } | ||
| 367 | } | ||
| 368 | |||
| 369 | static int | ||
| 370 | jump_label_module_notify(struct notifier_block *self, unsigned long val, | ||
| 371 | void *data) | ||
| 372 | { | ||
| 373 | struct module *mod = data; | ||
| 374 | int ret = 0; | ||
| 375 | |||
| 376 | switch (val) { | ||
| 377 | case MODULE_STATE_COMING: | ||
| 378 | mutex_lock(&jump_label_mutex); | ||
| 379 | ret = add_jump_label_module(mod); | ||
| 380 | if (ret) | ||
| 381 | remove_jump_label_module(mod); | ||
| 382 | mutex_unlock(&jump_label_mutex); | ||
| 383 | break; | ||
| 384 | case MODULE_STATE_GOING: | ||
| 385 | mutex_lock(&jump_label_mutex); | ||
| 386 | remove_jump_label_module(mod); | ||
| 387 | mutex_unlock(&jump_label_mutex); | ||
| 388 | break; | ||
| 389 | } | ||
| 390 | return ret; | ||
| 391 | } | ||
| 392 | |||
| 393 | /*** | ||
| 394 | * jump_label_apply_nops - patch module jump labels with arch_get_jump_label_nop() | ||
| 395 | * @mod: module to patch | ||
| 396 | * | ||
| 397 | * Allow for run-time selection of the optimal nops. Before the module | ||
| 398 | * loads, patch these with arch_get_jump_label_nop(), which is specified by | ||
| 399 | * the arch specific jump label code. | ||
| 400 | */ | ||
| 401 | void jump_label_apply_nops(struct module *mod) | ||
| 402 | { | ||
| 403 | struct jump_entry *iter; | ||
| 404 | |||
| 405 | /* if the module doesn't have jump label entries, just return */ | ||
| 406 | if (!mod->num_jump_entries) | ||
| 407 | return; | ||
| 408 | |||
| 409 | iter = mod->jump_entries; | ||
| 410 | while (iter < mod->jump_entries + mod->num_jump_entries) { | ||
| 411 | arch_jump_label_text_poke_early(iter->code); | ||
| 412 | iter++; | ||
| 413 | } | ||
| 414 | } | ||
| 415 | |||
| 416 | struct notifier_block jump_label_module_nb = { | ||
| 417 | .notifier_call = jump_label_module_notify, | ||
| 418 | .priority = 0, | ||
| 419 | }; | ||
| 420 | |||
| 421 | static __init int init_jump_label_module(void) | ||
| 422 | { | ||
| 423 | return register_module_notifier(&jump_label_module_nb); | ||
| 424 | } | ||
| 425 | early_initcall(init_jump_label_module); | ||
| 426 | |||
| 427 | #endif /* CONFIG_MODULES */ | ||
| 428 | |||
| 429 | #endif | ||
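build_jump_label_hashtable() and add_jump_label_module() both depend on the jump table being sorted by key, so that every patch site sharing a key forms one contiguous run that can be recorded, and later walked by jump_label_update(), as a single entry. A small standalone C sketch of that sort-and-group pass, with a plain array and printf() standing in for the hash table and illustrative values rather than real kernel addresses:

#include <stdio.h>
#include <stdlib.h>

struct demo_entry {                  /* stands in for struct jump_entry */
	unsigned long code;              /* address of the patch site */
	unsigned long key;               /* identifies the controlling jump label */
};

static int demo_cmp(const void *a, const void *b)
{
	const struct demo_entry *ea = a, *eb = b;

	if (ea->key < eb->key)
		return -1;
	if (ea->key > eb->key)
		return 1;
	return 0;
}

int main(void)
{
	struct demo_entry table[] = {
		{ 0x1000, 0xb0 }, { 0x2000, 0xa0 },
		{ 0x3000, 0xb0 }, { 0x4000, 0xa0 },
	};
	size_t n = sizeof(table) / sizeof(table[0]);
	size_t i = 0;

	/* sort_jump_label_entries(): order the table by key */
	qsort(table, n, sizeof(table[0]), demo_cmp);

	/* build_jump_label_hashtable(): one entry per run of equal keys */
	while (i < n) {
		size_t begin = i, count = 0;

		while (i < n && table[i].key == table[begin].key) {
			i++;
			count++;
		}
		printf("key %#lx: %zu patch site(s), table starts at %#lx\n",
		       table[begin].key, count, table[begin].code);
	}
	return 0;
}

Grouping by key is what lets jump_label_update() flip every site behind one key with a single hash lookup instead of rescanning the whole table on each enable/disable.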
diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 6b5580c57644..01a0700e873f 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c | |||
| @@ -365,8 +365,6 @@ static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, | |||
| 365 | n = setup_sgl_buf(sgl, fifo->data + off, nents, l); | 365 | n = setup_sgl_buf(sgl, fifo->data + off, nents, l); |
| 366 | n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); | 366 | n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); |
| 367 | 367 | ||
| 368 | if (n) | ||
| 369 | sg_mark_end(sgl + n - 1); | ||
| 370 | return n; | 368 | return n; |
| 371 | } | 369 | } |
| 372 | 370 | ||
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 282035f3ae96..ec4210c6501e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -47,6 +47,7 @@ | |||
| 47 | #include <linux/memory.h> | 47 | #include <linux/memory.h> |
| 48 | #include <linux/ftrace.h> | 48 | #include <linux/ftrace.h> |
| 49 | #include <linux/cpu.h> | 49 | #include <linux/cpu.h> |
| 50 | #include <linux/jump_label.h> | ||
| 50 | 51 | ||
| 51 | #include <asm-generic/sections.h> | 52 | #include <asm-generic/sections.h> |
| 52 | #include <asm/cacheflush.h> | 53 | #include <asm/cacheflush.h> |
| @@ -399,7 +400,7 @@ static inline int kprobe_optready(struct kprobe *p) | |||
| 399 | * Return an optimized kprobe whose optimizing code replaces | 400 | * Return an optimized kprobe whose optimizing code replaces |
| 400 | * instructions including addr (exclude breakpoint). | 401 | * instructions including addr (exclude breakpoint). |
| 401 | */ | 402 | */ |
| 402 | struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) | 403 | static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) |
| 403 | { | 404 | { |
| 404 | int i; | 405 | int i; |
| 405 | struct kprobe *p = NULL; | 406 | struct kprobe *p = NULL; |
| @@ -831,6 +832,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, | |||
| 831 | 832 | ||
| 832 | void __kprobes kretprobe_hash_lock(struct task_struct *tsk, | 833 | void __kprobes kretprobe_hash_lock(struct task_struct *tsk, |
| 833 | struct hlist_head **head, unsigned long *flags) | 834 | struct hlist_head **head, unsigned long *flags) |
| 835 | __acquires(hlist_lock) | ||
| 834 | { | 836 | { |
| 835 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); | 837 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); |
| 836 | spinlock_t *hlist_lock; | 838 | spinlock_t *hlist_lock; |
| @@ -842,6 +844,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk, | |||
| 842 | 844 | ||
| 843 | static void __kprobes kretprobe_table_lock(unsigned long hash, | 845 | static void __kprobes kretprobe_table_lock(unsigned long hash, |
| 844 | unsigned long *flags) | 846 | unsigned long *flags) |
| 847 | __acquires(hlist_lock) | ||
| 845 | { | 848 | { |
| 846 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); | 849 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); |
| 847 | spin_lock_irqsave(hlist_lock, *flags); | 850 | spin_lock_irqsave(hlist_lock, *flags); |
| @@ -849,6 +852,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash, | |||
| 849 | 852 | ||
| 850 | void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, | 853 | void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, |
| 851 | unsigned long *flags) | 854 | unsigned long *flags) |
| 855 | __releases(hlist_lock) | ||
| 852 | { | 856 | { |
| 853 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); | 857 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); |
| 854 | spinlock_t *hlist_lock; | 858 | spinlock_t *hlist_lock; |
| @@ -857,7 +861,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, | |||
| 857 | spin_unlock_irqrestore(hlist_lock, *flags); | 861 | spin_unlock_irqrestore(hlist_lock, *flags); |
| 858 | } | 862 | } |
| 859 | 863 | ||
| 860 | void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) | 864 | static void __kprobes kretprobe_table_unlock(unsigned long hash, |
| 865 | unsigned long *flags) | ||
| 866 | __releases(hlist_lock) | ||
| 861 | { | 867 | { |
| 862 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); | 868 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); |
| 863 | spin_unlock_irqrestore(hlist_lock, *flags); | 869 | spin_unlock_irqrestore(hlist_lock, *flags); |
| @@ -1141,7 +1147,8 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
| 1141 | preempt_disable(); | 1147 | preempt_disable(); |
| 1142 | if (!kernel_text_address((unsigned long) p->addr) || | 1148 | if (!kernel_text_address((unsigned long) p->addr) || |
| 1143 | in_kprobes_functions((unsigned long) p->addr) || | 1149 | in_kprobes_functions((unsigned long) p->addr) || |
| 1144 | ftrace_text_reserved(p->addr, p->addr)) { | 1150 | ftrace_text_reserved(p->addr, p->addr) || |
| 1151 | jump_label_text_reserved(p->addr, p->addr)) { | ||
| 1145 | preempt_enable(); | 1152 | preempt_enable(); |
| 1146 | return -EINVAL; | 1153 | return -EINVAL; |
| 1147 | } | 1154 | } |
| @@ -1339,18 +1346,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num) | |||
| 1339 | if (num <= 0) | 1346 | if (num <= 0) |
| 1340 | return -EINVAL; | 1347 | return -EINVAL; |
| 1341 | for (i = 0; i < num; i++) { | 1348 | for (i = 0; i < num; i++) { |
| 1342 | unsigned long addr; | 1349 | unsigned long addr, offset; |
| 1343 | jp = jps[i]; | 1350 | jp = jps[i]; |
| 1344 | addr = arch_deref_entry_point(jp->entry); | 1351 | addr = arch_deref_entry_point(jp->entry); |
| 1345 | 1352 | ||
| 1346 | if (!kernel_text_address(addr)) | 1353 | /* Verify probepoint is a function entry point */ |
| 1347 | ret = -EINVAL; | 1354 | if (kallsyms_lookup_size_offset(addr, NULL, &offset) && |
| 1348 | else { | 1355 | offset == 0) { |
| 1349 | /* Todo: Verify probepoint is a function entry point */ | ||
| 1350 | jp->kp.pre_handler = setjmp_pre_handler; | 1356 | jp->kp.pre_handler = setjmp_pre_handler; |
| 1351 | jp->kp.break_handler = longjmp_break_handler; | 1357 | jp->kp.break_handler = longjmp_break_handler; |
| 1352 | ret = register_kprobe(&jp->kp); | 1358 | ret = register_kprobe(&jp->kp); |
| 1353 | } | 1359 | } else |
| 1360 | ret = -EINVAL; | ||
| 1361 | |||
| 1354 | if (ret < 0) { | 1362 | if (ret < 0) { |
| 1355 | if (i > 0) | 1363 | if (i > 0) |
| 1356 | unregister_jprobes(jps, i); | 1364 | unregister_jprobes(jps, i); |
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f2852a510232..42ba65dff7d9 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
| @@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
| 639 | } | 639 | } |
| 640 | #endif | 640 | #endif |
| 641 | 641 | ||
| 642 | if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { | ||
| 643 | debug_locks_off(); | ||
| 644 | printk(KERN_ERR | ||
| 645 | "BUG: looking up invalid subclass: %u\n", subclass); | ||
| 646 | printk(KERN_ERR | ||
| 647 | "turning off the locking correctness validator.\n"); | ||
| 648 | dump_stack(); | ||
| 649 | return NULL; | ||
| 650 | } | ||
| 651 | |||
| 642 | /* | 652 | /* |
| 643 | * Static locks do not have their class-keys yet - for them the key | 653 | * Static locks do not have their class-keys yet - for them the key |
| 644 | * is the lock object itself: | 654 | * is the lock object itself: |
| @@ -774,7 +784,9 @@ out_unlock_set: | |||
| 774 | raw_local_irq_restore(flags); | 784 | raw_local_irq_restore(flags); |
| 775 | 785 | ||
| 776 | if (!subclass || force) | 786 | if (!subclass || force) |
| 777 | lock->class_cache = class; | 787 | lock->class_cache[0] = class; |
| 788 | else if (subclass < NR_LOCKDEP_CACHING_CLASSES) | ||
| 789 | lock->class_cache[subclass] = class; | ||
| 778 | 790 | ||
| 779 | if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) | 791 | if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) |
| 780 | return NULL; | 792 | return NULL; |
| @@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
| 2679 | void lockdep_init_map(struct lockdep_map *lock, const char *name, | 2691 | void lockdep_init_map(struct lockdep_map *lock, const char *name, |
| 2680 | struct lock_class_key *key, int subclass) | 2692 | struct lock_class_key *key, int subclass) |
| 2681 | { | 2693 | { |
| 2682 | lock->class_cache = NULL; | 2694 | int i; |
| 2695 | |||
| 2696 | for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) | ||
| 2697 | lock->class_cache[i] = NULL; | ||
| 2698 | |||
| 2683 | #ifdef CONFIG_LOCK_STAT | 2699 | #ifdef CONFIG_LOCK_STAT |
| 2684 | lock->cpu = raw_smp_processor_id(); | 2700 | lock->cpu = raw_smp_processor_id(); |
| 2685 | #endif | 2701 | #endif |
| @@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
| 2739 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2755 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
| 2740 | return 0; | 2756 | return 0; |
| 2741 | 2757 | ||
| 2742 | if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { | ||
| 2743 | debug_locks_off(); | ||
| 2744 | printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); | ||
| 2745 | printk("turning off the locking correctness validator.\n"); | ||
| 2746 | dump_stack(); | ||
| 2747 | return 0; | ||
| 2748 | } | ||
| 2749 | |||
| 2750 | if (lock->key == &__lockdep_no_validate__) | 2758 | if (lock->key == &__lockdep_no_validate__) |
| 2751 | check = 1; | 2759 | check = 1; |
| 2752 | 2760 | ||
| 2753 | if (!subclass) | 2761 | if (subclass < NR_LOCKDEP_CACHING_CLASSES) |
| 2754 | class = lock->class_cache; | 2762 | class = lock->class_cache[subclass]; |
| 2755 | /* | 2763 | /* |
| 2756 | * Not cached yet or subclass? | 2764 | * Not cached? |
| 2757 | */ | 2765 | */ |
| 2758 | if (unlikely(!class)) { | 2766 | if (unlikely(!class)) { |
| 2759 | class = register_lock_class(lock, subclass, 0); | 2767 | class = register_lock_class(lock, subclass, 0); |
| @@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) | |||
| 2918 | return 1; | 2926 | return 1; |
| 2919 | 2927 | ||
| 2920 | if (hlock->references) { | 2928 | if (hlock->references) { |
| 2921 | struct lock_class *class = lock->class_cache; | 2929 | struct lock_class *class = lock->class_cache[0]; |
| 2922 | 2930 | ||
| 2923 | if (!class) | 2931 | if (!class) |
| 2924 | class = look_up_lock_class(lock, 0); | 2932 | class = look_up_lock_class(lock, 0); |
| @@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
| 3559 | if (list_empty(head)) | 3567 | if (list_empty(head)) |
| 3560 | continue; | 3568 | continue; |
| 3561 | list_for_each_entry_safe(class, next, head, hash_entry) { | 3569 | list_for_each_entry_safe(class, next, head, hash_entry) { |
| 3562 | if (unlikely(class == lock->class_cache)) { | 3570 | int match = 0; |
| 3571 | |||
| 3572 | for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) | ||
| 3573 | match |= class == lock->class_cache[j]; | ||
| 3574 | |||
| 3575 | if (unlikely(match)) { | ||
| 3563 | if (debug_locks_off_graph_unlock()) | 3576 | if (debug_locks_off_graph_unlock()) |
| 3564 | WARN_ON(1); | 3577 | WARN_ON(1); |
| 3565 | goto out_restore; | 3578 | goto out_restore; |
| @@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks); | |||
| 3775 | * Careful: only use this function if you are sure that | 3788 | * Careful: only use this function if you are sure that |
| 3776 | * the task cannot run in parallel! | 3789 | * the task cannot run in parallel! |
| 3777 | */ | 3790 | */ |
| 3778 | void __debug_show_held_locks(struct task_struct *task) | 3791 | void debug_show_held_locks(struct task_struct *task) |
| 3779 | { | 3792 | { |
| 3780 | if (unlikely(!debug_locks)) { | 3793 | if (unlikely(!debug_locks)) { |
| 3781 | printk("INFO: lockdep is turned off.\n"); | 3794 | printk("INFO: lockdep is turned off.\n"); |
| @@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task) | |||
| 3783 | } | 3796 | } |
| 3784 | lockdep_print_held_locks(task); | 3797 | lockdep_print_held_locks(task); |
| 3785 | } | 3798 | } |
| 3786 | EXPORT_SYMBOL_GPL(__debug_show_held_locks); | ||
| 3787 | |||
| 3788 | void debug_show_held_locks(struct task_struct *task) | ||
| 3789 | { | ||
| 3790 | __debug_show_held_locks(task); | ||
| 3791 | } | ||
| 3792 | EXPORT_SYMBOL_GPL(debug_show_held_locks); | 3799 | EXPORT_SYMBOL_GPL(debug_show_held_locks); |
| 3793 | 3800 | ||
| 3794 | void lockdep_sys_exit(void) | 3801 | void lockdep_sys_exit(void) |
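The class_cache changes above turn the single cached pointer into a small per-subclass array: __lock_acquire() now tries class_cache[subclass] first and only falls back to register_lock_class() on a miss, and lockdep_reset_lock() must check every slot. A standalone sketch of that lookup-then-register pattern follows; the demo_* names, the string standing in for struct lock_class, and the slow-path counter are all illustrative:

#include <stdio.h>

#define DEMO_CACHING_CLASSES 8       /* models NR_LOCKDEP_CACHING_CLASSES */

struct demo_lock {
	const char *name;
	const char *class_cache[DEMO_CACHING_CLASSES];  /* one slot per subclass */
};

static int slow_path_calls;

/* Stands in for register_lock_class(): the expensive, lock-taking path. */
static const char *demo_register_class(struct demo_lock *lock, unsigned int subclass)
{
	slow_path_calls++;
	printf("slow path #%d: registering %s, subclass %u\n",
	       slow_path_calls, lock->name, subclass);
	return lock->name;               /* real code returns a struct lock_class * */
}

static const char *demo_acquire_class(struct demo_lock *lock, unsigned int subclass)
{
	const char *class = NULL;

	if (subclass < DEMO_CACHING_CLASSES)
		class = lock->class_cache[subclass];        /* fast path: cache hit */

	if (!class) {
		class = demo_register_class(lock, subclass);
		if (subclass < DEMO_CACHING_CLASSES)
			lock->class_cache[subclass] = class;    /* remember for next time */
	}
	return class;
}

int main(void)
{
	struct demo_lock lock = { .name = "demo_lock" };

	demo_acquire_class(&lock, 0);    /* slow path, then cached in slot 0 */
	demo_acquire_class(&lock, 0);    /* served from class_cache[0] */
	demo_acquire_class(&lock, 1);    /* its own slot: slow path once */
	printf("slow path taken %d time(s) for 3 acquisitions\n", slow_path_calls);
	return 0;
}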
diff --git a/kernel/module.c b/kernel/module.c index d0b5f8db11b4..2df46301a7a4 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -55,6 +55,7 @@ | |||
| 55 | #include <linux/async.h> | 55 | #include <linux/async.h> |
| 56 | #include <linux/percpu.h> | 56 | #include <linux/percpu.h> |
| 57 | #include <linux/kmemleak.h> | 57 | #include <linux/kmemleak.h> |
| 58 | #include <linux/jump_label.h> | ||
| 58 | 59 | ||
| 59 | #define CREATE_TRACE_POINTS | 60 | #define CREATE_TRACE_POINTS |
| 60 | #include <trace/events/module.h> | 61 | #include <trace/events/module.h> |
| @@ -1537,6 +1538,7 @@ static int __unlink_module(void *_mod) | |||
| 1537 | { | 1538 | { |
| 1538 | struct module *mod = _mod; | 1539 | struct module *mod = _mod; |
| 1539 | list_del(&mod->list); | 1540 | list_del(&mod->list); |
| 1541 | module_bug_cleanup(mod); | ||
| 1540 | return 0; | 1542 | return 0; |
| 1541 | } | 1543 | } |
| 1542 | 1544 | ||
| @@ -2308,6 +2310,11 @@ static void find_module_sections(struct module *mod, struct load_info *info) | |||
| 2308 | sizeof(*mod->tracepoints), | 2310 | sizeof(*mod->tracepoints), |
| 2309 | &mod->num_tracepoints); | 2311 | &mod->num_tracepoints); |
| 2310 | #endif | 2312 | #endif |
| 2313 | #ifdef HAVE_JUMP_LABEL | ||
| 2314 | mod->jump_entries = section_objs(info, "__jump_table", | ||
| 2315 | sizeof(*mod->jump_entries), | ||
| 2316 | &mod->num_jump_entries); | ||
| 2317 | #endif | ||
| 2311 | #ifdef CONFIG_EVENT_TRACING | 2318 | #ifdef CONFIG_EVENT_TRACING |
| 2312 | mod->trace_events = section_objs(info, "_ftrace_events", | 2319 | mod->trace_events = section_objs(info, "_ftrace_events", |
| 2313 | sizeof(*mod->trace_events), | 2320 | sizeof(*mod->trace_events), |
| @@ -2625,6 +2632,7 @@ static struct module *load_module(void __user *umod, | |||
| 2625 | if (err < 0) | 2632 | if (err < 0) |
| 2626 | goto ddebug; | 2633 | goto ddebug; |
| 2627 | 2634 | ||
| 2635 | module_bug_finalize(info.hdr, info.sechdrs, mod); | ||
| 2628 | list_add_rcu(&mod->list, &modules); | 2636 | list_add_rcu(&mod->list, &modules); |
| 2629 | mutex_unlock(&module_mutex); | 2637 | mutex_unlock(&module_mutex); |
| 2630 | 2638 | ||
| @@ -2650,6 +2658,8 @@ static struct module *load_module(void __user *umod, | |||
| 2650 | mutex_lock(&module_mutex); | 2658 | mutex_lock(&module_mutex); |
| 2651 | /* Unlink carefully: kallsyms could be walking list. */ | 2659 | /* Unlink carefully: kallsyms could be walking list. */ |
| 2652 | list_del_rcu(&mod->list); | 2660 | list_del_rcu(&mod->list); |
| 2661 | module_bug_cleanup(mod); | ||
| 2662 | |||
| 2653 | ddebug: | 2663 | ddebug: |
| 2654 | if (!mod->taints) | 2664 | if (!mod->taints) |
| 2655 | dynamic_debug_remove(info.debug); | 2665 | dynamic_debug_remove(info.debug); |
diff --git a/kernel/mutex.c b/kernel/mutex.c index 4c0b7b3e6d2e..200407c1502f 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
| @@ -36,15 +36,6 @@ | |||
| 36 | # include <asm/mutex.h> | 36 | # include <asm/mutex.h> |
| 37 | #endif | 37 | #endif |
| 38 | 38 | ||
| 39 | /*** | ||
| 40 | * mutex_init - initialize the mutex | ||
| 41 | * @lock: the mutex to be initialized | ||
| 42 | * @key: the lock_class_key for the class; used by mutex lock debugging | ||
| 43 | * | ||
| 44 | * Initialize the mutex to unlocked state. | ||
| 45 | * | ||
| 46 | * It is not allowed to initialize an already locked mutex. | ||
| 47 | */ | ||
| 48 | void | 39 | void |
| 49 | __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) | 40 | __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) |
| 50 | { | 41 | { |
| @@ -68,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init); | |||
| 68 | static __used noinline void __sched | 59 | static __used noinline void __sched |
| 69 | __mutex_lock_slowpath(atomic_t *lock_count); | 60 | __mutex_lock_slowpath(atomic_t *lock_count); |
| 70 | 61 | ||
| 71 | /*** | 62 | /** |
| 72 | * mutex_lock - acquire the mutex | 63 | * mutex_lock - acquire the mutex |
| 73 | * @lock: the mutex to be acquired | 64 | * @lock: the mutex to be acquired |
| 74 | * | 65 | * |
| @@ -105,7 +96,7 @@ EXPORT_SYMBOL(mutex_lock); | |||
| 105 | 96 | ||
| 106 | static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); | 97 | static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); |
| 107 | 98 | ||
| 108 | /*** | 99 | /** |
| 109 | * mutex_unlock - release the mutex | 100 | * mutex_unlock - release the mutex |
| 110 | * @lock: the mutex to be released | 101 | * @lock: the mutex to be released |
| 111 | * | 102 | * |
| @@ -364,8 +355,8 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count); | |||
| 364 | static noinline int __sched | 355 | static noinline int __sched |
| 365 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count); | 356 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count); |
| 366 | 357 | ||
| 367 | /*** | 358 | /** |
| 368 | * mutex_lock_interruptible - acquire the mutex, interruptable | 359 | * mutex_lock_interruptible - acquire the mutex, interruptible |
| 369 | * @lock: the mutex to be acquired | 360 | * @lock: the mutex to be acquired |
| 370 | * | 361 | * |
| 371 | * Lock the mutex like mutex_lock(), and return 0 if the mutex has | 362 | * Lock the mutex like mutex_lock(), and return 0 if the mutex has |
| @@ -456,15 +447,15 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) | |||
| 456 | return prev == 1; | 447 | return prev == 1; |
| 457 | } | 448 | } |
| 458 | 449 | ||
| 459 | /*** | 450 | /** |
| 460 | * mutex_trylock - try acquire the mutex, without waiting | 451 | * mutex_trylock - try to acquire the mutex, without waiting |
| 461 | * @lock: the mutex to be acquired | 452 | * @lock: the mutex to be acquired |
| 462 | * | 453 | * |
| 463 | * Try to acquire the mutex atomically. Returns 1 if the mutex | 454 | * Try to acquire the mutex atomically. Returns 1 if the mutex |
| 464 | * has been acquired successfully, and 0 on contention. | 455 | * has been acquired successfully, and 0 on contention. |
| 465 | * | 456 | * |
| 466 | * NOTE: this function follows the spin_trylock() convention, so | 457 | * NOTE: this function follows the spin_trylock() convention, so |
| 467 | * it is negated to the down_trylock() return values! Be careful | 458 | * it is negated from the down_trylock() return values! Be careful |
| 468 | * about this when converting semaphore users to mutexes. | 459 | * about this when converting semaphore users to mutexes. |
| 469 | * | 460 | * |
| 470 | * This function must not be used in interrupt context. The | 461 | * This function must not be used in interrupt context. The |
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 403d1804b198..f309e8014c78 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c | |||
| @@ -31,24 +31,18 @@ | |||
| 31 | #include <linux/kernel_stat.h> | 31 | #include <linux/kernel_stat.h> |
| 32 | #include <linux/perf_event.h> | 32 | #include <linux/perf_event.h> |
| 33 | #include <linux/ftrace_event.h> | 33 | #include <linux/ftrace_event.h> |
| 34 | #include <linux/hw_breakpoint.h> | ||
| 35 | 34 | ||
| 36 | #include <asm/irq_regs.h> | 35 | #include <asm/irq_regs.h> |
| 37 | 36 | ||
| 38 | /* | 37 | atomic_t perf_task_events __read_mostly; |
| 39 | * Each CPU has a list of per CPU events: | ||
| 40 | */ | ||
| 41 | static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); | ||
| 42 | |||
| 43 | int perf_max_events __read_mostly = 1; | ||
| 44 | static int perf_reserved_percpu __read_mostly; | ||
| 45 | static int perf_overcommit __read_mostly = 1; | ||
| 46 | |||
| 47 | static atomic_t nr_events __read_mostly; | ||
| 48 | static atomic_t nr_mmap_events __read_mostly; | 38 | static atomic_t nr_mmap_events __read_mostly; |
| 49 | static atomic_t nr_comm_events __read_mostly; | 39 | static atomic_t nr_comm_events __read_mostly; |
| 50 | static atomic_t nr_task_events __read_mostly; | 40 | static atomic_t nr_task_events __read_mostly; |
| 51 | 41 | ||
| 42 | static LIST_HEAD(pmus); | ||
| 43 | static DEFINE_MUTEX(pmus_lock); | ||
| 44 | static struct srcu_struct pmus_srcu; | ||
| 45 | |||
| 52 | /* | 46 | /* |
| 53 | * perf event paranoia level: | 47 | * perf event paranoia level: |
| 54 | * -1 - not paranoid at all | 48 | * -1 - not paranoid at all |
| @@ -67,36 +61,43 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000; | |||
| 67 | 61 | ||
| 68 | static atomic64_t perf_event_id; | 62 | static atomic64_t perf_event_id; |
| 69 | 63 | ||
| 70 | /* | 64 | void __weak perf_event_print_debug(void) { } |
| 71 | * Lock for (sysadmin-configurable) event reservations: | ||
| 72 | */ | ||
| 73 | static DEFINE_SPINLOCK(perf_resource_lock); | ||
| 74 | 65 | ||
| 75 | /* | 66 | extern __weak const char *perf_pmu_name(void) |
| 76 | * Architecture provided APIs - weak aliases: | ||
| 77 | */ | ||
| 78 | extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) | ||
| 79 | { | 67 | { |
| 80 | return NULL; | 68 | return "pmu"; |
| 81 | } | 69 | } |
| 82 | 70 | ||
| 83 | void __weak hw_perf_disable(void) { barrier(); } | 71 | void perf_pmu_disable(struct pmu *pmu) |
| 84 | void __weak hw_perf_enable(void) { barrier(); } | 72 | { |
| 85 | 73 | int *count = this_cpu_ptr(pmu->pmu_disable_count); | |
| 86 | void __weak perf_event_print_debug(void) { } | 74 | if (!(*count)++) |
| 87 | 75 | pmu->pmu_disable(pmu); | |
| 88 | static DEFINE_PER_CPU(int, perf_disable_count); | 76 | } |
| 89 | 77 | ||
| 90 | void perf_disable(void) | 78 | void perf_pmu_enable(struct pmu *pmu) |
| 91 | { | 79 | { |
| 92 | if (!__get_cpu_var(perf_disable_count)++) | 80 | int *count = this_cpu_ptr(pmu->pmu_disable_count); |
| 93 | hw_perf_disable(); | 81 | if (!--(*count)) |
| 82 | pmu->pmu_enable(pmu); | ||
| 94 | } | 83 | } |
| 95 | 84 | ||
| 96 | void perf_enable(void) | 85 | static DEFINE_PER_CPU(struct list_head, rotation_list); |
| 86 | |||
| 87 | /* | ||
| 88 | * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized | ||
| 89 | * because they're strictly cpu affine and rotate_start is called with IRQs | ||
| 90 | * disabled, while rotate_context is called from IRQ context. | ||
| 91 | */ | ||
| 92 | static void perf_pmu_rotate_start(struct pmu *pmu) | ||
| 97 | { | 93 | { |
| 98 | if (!--__get_cpu_var(perf_disable_count)) | 94 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
| 99 | hw_perf_enable(); | 95 | struct list_head *head = &__get_cpu_var(rotation_list); |
| 96 | |||
| 97 | WARN_ON(!irqs_disabled()); | ||
| 98 | |||
| 99 | if (list_empty(&cpuctx->rotation_list)) | ||
| 100 | list_add(&cpuctx->rotation_list, head); | ||
| 100 | } | 101 | } |
| 101 | 102 | ||
| 102 | static void get_ctx(struct perf_event_context *ctx) | 103 | static void get_ctx(struct perf_event_context *ctx) |
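perf_pmu_disable() and perf_pmu_enable() above replace the global hw_perf_disable()/hw_perf_enable() pair with a per-PMU nesting count, so only the outermost disable and the matching final enable actually touch the hardware. A standalone sketch of that nesting pattern, with an ordinary int in place of the per-cpu pmu_disable_count and demo_* names that are purely illustrative:

#include <stdio.h>

struct demo_pmu {
	int disable_count;               /* models this_cpu_ptr(pmu->pmu_disable_count) */
};

static void hw_disable(void)
{
	printf("hardware disabled\n");   /* reached only on the outermost call */
}

static void hw_enable(void)
{
	printf("hardware enabled\n");    /* reached only when the count drops to 0 */
}

static void demo_pmu_disable(struct demo_pmu *pmu)
{
	if (!pmu->disable_count++)
		hw_disable();
}

static void demo_pmu_enable(struct demo_pmu *pmu)
{
	if (!--pmu->disable_count)
		hw_enable();
}

int main(void)
{
	struct demo_pmu pmu = { 0 };

	demo_pmu_disable(&pmu);          /* outermost: touches hardware */
	demo_pmu_disable(&pmu);          /* nested: count only */
	demo_pmu_enable(&pmu);           /* still nested: count only */
	demo_pmu_enable(&pmu);           /* outermost exit: touches hardware */
	return 0;
}

Because the count lives with each PMU (and per CPU in the real code), two different PMUs can be disabled independently, which the old global perf_disable() could not express.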
| @@ -151,13 +152,13 @@ static u64 primary_event_id(struct perf_event *event) | |||
| 151 | * the context could get moved to another task. | 152 | * the context could get moved to another task. |
| 152 | */ | 153 | */ |
| 153 | static struct perf_event_context * | 154 | static struct perf_event_context * |
| 154 | perf_lock_task_context(struct task_struct *task, unsigned long *flags) | 155 | perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) |
| 155 | { | 156 | { |
| 156 | struct perf_event_context *ctx; | 157 | struct perf_event_context *ctx; |
| 157 | 158 | ||
| 158 | rcu_read_lock(); | 159 | rcu_read_lock(); |
| 159 | retry: | 160 | retry: |
| 160 | ctx = rcu_dereference(task->perf_event_ctxp); | 161 | ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); |
| 161 | if (ctx) { | 162 | if (ctx) { |
| 162 | /* | 163 | /* |
| 163 | * If this context is a clone of another, it might | 164 | * If this context is a clone of another, it might |
| @@ -170,7 +171,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) | |||
| 170 | * can't get swapped on us any more. | 171 | * can't get swapped on us any more. |
| 171 | */ | 172 | */ |
| 172 | raw_spin_lock_irqsave(&ctx->lock, *flags); | 173 | raw_spin_lock_irqsave(&ctx->lock, *flags); |
| 173 | if (ctx != rcu_dereference(task->perf_event_ctxp)) { | 174 | if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { |
| 174 | raw_spin_unlock_irqrestore(&ctx->lock, *flags); | 175 | raw_spin_unlock_irqrestore(&ctx->lock, *flags); |
| 175 | goto retry; | 176 | goto retry; |
| 176 | } | 177 | } |
| @@ -189,12 +190,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) | |||
| 189 | * can't get swapped to another task. This also increments its | 190 | * can't get swapped to another task. This also increments its |
| 190 | * reference count so that the context can't get freed. | 191 | * reference count so that the context can't get freed. |
| 191 | */ | 192 | */ |
| 192 | static struct perf_event_context *perf_pin_task_context(struct task_struct *task) | 193 | static struct perf_event_context * |
| 194 | perf_pin_task_context(struct task_struct *task, int ctxn) | ||
| 193 | { | 195 | { |
| 194 | struct perf_event_context *ctx; | 196 | struct perf_event_context *ctx; |
| 195 | unsigned long flags; | 197 | unsigned long flags; |
| 196 | 198 | ||
| 197 | ctx = perf_lock_task_context(task, &flags); | 199 | ctx = perf_lock_task_context(task, ctxn, &flags); |
| 198 | if (ctx) { | 200 | if (ctx) { |
| 199 | ++ctx->pin_count; | 201 | ++ctx->pin_count; |
| 200 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 202 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
| @@ -302,6 +304,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 302 | } | 304 | } |
| 303 | 305 | ||
| 304 | list_add_rcu(&event->event_entry, &ctx->event_list); | 306 | list_add_rcu(&event->event_entry, &ctx->event_list); |
| 307 | if (!ctx->nr_events) | ||
| 308 | perf_pmu_rotate_start(ctx->pmu); | ||
| 305 | ctx->nr_events++; | 309 | ctx->nr_events++; |
| 306 | if (event->attr.inherit_stat) | 310 | if (event->attr.inherit_stat) |
| 307 | ctx->nr_stat++; | 311 | ctx->nr_stat++; |
| @@ -311,7 +315,12 @@ static void perf_group_attach(struct perf_event *event) | |||
| 311 | { | 315 | { |
| 312 | struct perf_event *group_leader = event->group_leader; | 316 | struct perf_event *group_leader = event->group_leader; |
| 313 | 317 | ||
| 314 | WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); | 318 | /* |
| 319 | * We can have double attach due to group movement in perf_event_open. | ||
| 320 | */ | ||
| 321 | if (event->attach_state & PERF_ATTACH_GROUP) | ||
| 322 | return; | ||
| 323 | |||
| 315 | event->attach_state |= PERF_ATTACH_GROUP; | 324 | event->attach_state |= PERF_ATTACH_GROUP; |
| 316 | 325 | ||
| 317 | if (group_leader == event) | 326 | if (group_leader == event) |
| @@ -402,21 +411,40 @@ static void perf_group_detach(struct perf_event *event) | |||
| 402 | } | 411 | } |
| 403 | } | 412 | } |
| 404 | 413 | ||
| 405 | static void | 414 | static inline int |
| 406 | event_sched_out(struct perf_event *event, | 415 | event_filter_match(struct perf_event *event) |
| 416 | { | ||
| 417 | return event->cpu == -1 || event->cpu == smp_processor_id(); | ||
| 418 | } | ||
| 419 | |||
| 420 | static int | ||
| 421 | __event_sched_out(struct perf_event *event, | ||
| 407 | struct perf_cpu_context *cpuctx, | 422 | struct perf_cpu_context *cpuctx, |
| 408 | struct perf_event_context *ctx) | 423 | struct perf_event_context *ctx) |
| 409 | { | 424 | { |
| 425 | u64 delta; | ||
| 426 | /* | ||
| 427 | * An event which could not be activated because of | ||
| 428 | * filter mismatch still needs to have its timings | ||
| 429 | * maintained, otherwise bogus information is returned | ||
| 430 | * via read() for time_enabled, time_running: | ||
| 431 | */ | ||
| 432 | if (event->state == PERF_EVENT_STATE_INACTIVE | ||
| 433 | && !event_filter_match(event)) { | ||
| 434 | delta = ctx->time - event->tstamp_stopped; | ||
| 435 | event->tstamp_running += delta; | ||
| 436 | event->tstamp_stopped = ctx->time; | ||
| 437 | } | ||
| 438 | |||
| 410 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 439 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
| 411 | return; | 440 | return 0; |
| 412 | 441 | ||
| 413 | event->state = PERF_EVENT_STATE_INACTIVE; | 442 | event->state = PERF_EVENT_STATE_INACTIVE; |
| 414 | if (event->pending_disable) { | 443 | if (event->pending_disable) { |
| 415 | event->pending_disable = 0; | 444 | event->pending_disable = 0; |
| 416 | event->state = PERF_EVENT_STATE_OFF; | 445 | event->state = PERF_EVENT_STATE_OFF; |
| 417 | } | 446 | } |
| 418 | event->tstamp_stopped = ctx->time; | 447 | event->pmu->del(event, 0); |
| 419 | event->pmu->disable(event); | ||
| 420 | event->oncpu = -1; | 448 | event->oncpu = -1; |
| 421 | 449 | ||
| 422 | if (!is_software_event(event)) | 450 | if (!is_software_event(event)) |
| @@ -424,6 +452,19 @@ event_sched_out(struct perf_event *event, | |||
| 424 | ctx->nr_active--; | 452 | ctx->nr_active--; |
| 425 | if (event->attr.exclusive || !cpuctx->active_oncpu) | 453 | if (event->attr.exclusive || !cpuctx->active_oncpu) |
| 426 | cpuctx->exclusive = 0; | 454 | cpuctx->exclusive = 0; |
| 455 | return 1; | ||
| 456 | } | ||
| 457 | |||
| 458 | static void | ||
| 459 | event_sched_out(struct perf_event *event, | ||
| 460 | struct perf_cpu_context *cpuctx, | ||
| 461 | struct perf_event_context *ctx) | ||
| 462 | { | ||
| 463 | int ret; | ||
| 464 | |||
| 465 | ret = __event_sched_out(event, cpuctx, ctx); | ||
| 466 | if (ret) | ||
| 467 | event->tstamp_stopped = ctx->time; | ||
| 427 | } | 468 | } |
| 428 | 469 | ||
| 429 | static void | 470 | static void |
| @@ -432,9 +473,7 @@ group_sched_out(struct perf_event *group_event, | |||
| 432 | struct perf_event_context *ctx) | 473 | struct perf_event_context *ctx) |
| 433 | { | 474 | { |
| 434 | struct perf_event *event; | 475 | struct perf_event *event; |
| 435 | 476 | int state = group_event->state; | |
| 436 | if (group_event->state != PERF_EVENT_STATE_ACTIVE) | ||
| 437 | return; | ||
| 438 | 477 | ||
| 439 | event_sched_out(group_event, cpuctx, ctx); | 478 | event_sched_out(group_event, cpuctx, ctx); |
| 440 | 479 | ||
| @@ -444,10 +483,16 @@ group_sched_out(struct perf_event *group_event, | |||
| 444 | list_for_each_entry(event, &group_event->sibling_list, group_entry) | 483 | list_for_each_entry(event, &group_event->sibling_list, group_entry) |
| 445 | event_sched_out(event, cpuctx, ctx); | 484 | event_sched_out(event, cpuctx, ctx); |
| 446 | 485 | ||
| 447 | if (group_event->attr.exclusive) | 486 | if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) |
| 448 | cpuctx->exclusive = 0; | 487 | cpuctx->exclusive = 0; |
| 449 | } | 488 | } |
| 450 | 489 | ||
| 490 | static inline struct perf_cpu_context * | ||
| 491 | __get_cpu_context(struct perf_event_context *ctx) | ||
| 492 | { | ||
| 493 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | ||
| 494 | } | ||
| 495 | |||
| 451 | /* | 496 | /* |
| 452 | * Cross CPU call to remove a performance event | 497 | * Cross CPU call to remove a performance event |
| 453 | * | 498 | * |
| @@ -456,9 +501,9 @@ group_sched_out(struct perf_event *group_event, | |||
| 456 | */ | 501 | */ |
| 457 | static void __perf_event_remove_from_context(void *info) | 502 | static void __perf_event_remove_from_context(void *info) |
| 458 | { | 503 | { |
| 459 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
| 460 | struct perf_event *event = info; | 504 | struct perf_event *event = info; |
| 461 | struct perf_event_context *ctx = event->ctx; | 505 | struct perf_event_context *ctx = event->ctx; |
| 506 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
| 462 | 507 | ||
| 463 | /* | 508 | /* |
| 464 | * If this is a task context, we need to check whether it is | 509 | * If this is a task context, we need to check whether it is |
| @@ -469,27 +514,11 @@ static void __perf_event_remove_from_context(void *info) | |||
| 469 | return; | 514 | return; |
| 470 | 515 | ||
| 471 | raw_spin_lock(&ctx->lock); | 516 | raw_spin_lock(&ctx->lock); |
| 472 | /* | ||
| 473 | * Protect the list operation against NMI by disabling the | ||
| 474 | * events on a global level. | ||
| 475 | */ | ||
| 476 | perf_disable(); | ||
| 477 | 517 | ||
| 478 | event_sched_out(event, cpuctx, ctx); | 518 | event_sched_out(event, cpuctx, ctx); |
| 479 | 519 | ||
| 480 | list_del_event(event, ctx); | 520 | list_del_event(event, ctx); |
| 481 | 521 | ||
| 482 | if (!ctx->task) { | ||
| 483 | /* | ||
| 484 | * Allow more per task events with respect to the | ||
| 485 | * reservation: | ||
| 486 | */ | ||
| 487 | cpuctx->max_pertask = | ||
| 488 | min(perf_max_events - ctx->nr_events, | ||
| 489 | perf_max_events - perf_reserved_percpu); | ||
| 490 | } | ||
| 491 | |||
| 492 | perf_enable(); | ||
| 493 | raw_spin_unlock(&ctx->lock); | 522 | raw_spin_unlock(&ctx->lock); |
| 494 | } | 523 | } |
| 495 | 524 | ||
| @@ -554,8 +583,8 @@ retry: | |||
| 554 | static void __perf_event_disable(void *info) | 583 | static void __perf_event_disable(void *info) |
| 555 | { | 584 | { |
| 556 | struct perf_event *event = info; | 585 | struct perf_event *event = info; |
| 557 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
| 558 | struct perf_event_context *ctx = event->ctx; | 586 | struct perf_event_context *ctx = event->ctx; |
| 587 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
| 559 | 588 | ||
| 560 | /* | 589 | /* |
| 561 | * If this is a per-task event, need to check whether this | 590 | * If this is a per-task event, need to check whether this |
| @@ -610,7 +639,7 @@ void perf_event_disable(struct perf_event *event) | |||
| 610 | return; | 639 | return; |
| 611 | } | 640 | } |
| 612 | 641 | ||
| 613 | retry: | 642 | retry: |
| 614 | task_oncpu_function_call(task, __perf_event_disable, event); | 643 | task_oncpu_function_call(task, __perf_event_disable, event); |
| 615 | 644 | ||
| 616 | raw_spin_lock_irq(&ctx->lock); | 645 | raw_spin_lock_irq(&ctx->lock); |
| @@ -635,7 +664,7 @@ void perf_event_disable(struct perf_event *event) | |||
| 635 | } | 664 | } |
| 636 | 665 | ||
| 637 | static int | 666 | static int |
| 638 | event_sched_in(struct perf_event *event, | 667 | __event_sched_in(struct perf_event *event, |
| 639 | struct perf_cpu_context *cpuctx, | 668 | struct perf_cpu_context *cpuctx, |
| 640 | struct perf_event_context *ctx) | 669 | struct perf_event_context *ctx) |
| 641 | { | 670 | { |
| @@ -649,14 +678,12 @@ event_sched_in(struct perf_event *event, | |||
| 649 | */ | 678 | */ |
| 650 | smp_wmb(); | 679 | smp_wmb(); |
| 651 | 680 | ||
| 652 | if (event->pmu->enable(event)) { | 681 | if (event->pmu->add(event, PERF_EF_START)) { |
| 653 | event->state = PERF_EVENT_STATE_INACTIVE; | 682 | event->state = PERF_EVENT_STATE_INACTIVE; |
| 654 | event->oncpu = -1; | 683 | event->oncpu = -1; |
| 655 | return -EAGAIN; | 684 | return -EAGAIN; |
| 656 | } | 685 | } |
| 657 | 686 | ||
| 658 | event->tstamp_running += ctx->time - event->tstamp_stopped; | ||
| 659 | |||
| 660 | if (!is_software_event(event)) | 687 | if (!is_software_event(event)) |
| 661 | cpuctx->active_oncpu++; | 688 | cpuctx->active_oncpu++; |
| 662 | ctx->nr_active++; | 689 | ctx->nr_active++; |
| @@ -667,28 +694,56 @@ event_sched_in(struct perf_event *event, | |||
| 667 | return 0; | 694 | return 0; |
| 668 | } | 695 | } |
| 669 | 696 | ||
| 697 | static inline int | ||
| 698 | event_sched_in(struct perf_event *event, | ||
| 699 | struct perf_cpu_context *cpuctx, | ||
| 700 | struct perf_event_context *ctx) | ||
| 701 | { | ||
| 702 | int ret = __event_sched_in(event, cpuctx, ctx); | ||
| 703 | if (ret) | ||
| 704 | return ret; | ||
| 705 | event->tstamp_running += ctx->time - event->tstamp_stopped; | ||
| 706 | return 0; | ||
| 707 | } | ||
| 708 | |||
| 709 | static void | ||
| 710 | group_commit_event_sched_in(struct perf_event *group_event, | ||
| 711 | struct perf_cpu_context *cpuctx, | ||
| 712 | struct perf_event_context *ctx) | ||
| 713 | { | ||
| 714 | struct perf_event *event; | ||
| 715 | u64 now = ctx->time; | ||
| 716 | |||
| 717 | group_event->tstamp_running += now - group_event->tstamp_stopped; | ||
| 718 | /* | ||
| 719 | * Schedule in siblings as one group (if any): | ||
| 720 | */ | ||
| 721 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { | ||
| 722 | event->tstamp_running += now - event->tstamp_stopped; | ||
| 723 | } | ||
| 724 | } | ||
| 725 | |||
| 670 | static int | 726 | static int |
| 671 | group_sched_in(struct perf_event *group_event, | 727 | group_sched_in(struct perf_event *group_event, |
| 672 | struct perf_cpu_context *cpuctx, | 728 | struct perf_cpu_context *cpuctx, |
| 673 | struct perf_event_context *ctx) | 729 | struct perf_event_context *ctx) |
| 674 | { | 730 | { |
| 675 | struct perf_event *event, *partial_group = NULL; | 731 | struct perf_event *event, *partial_group = NULL; |
| 676 | const struct pmu *pmu = group_event->pmu; | 732 | struct pmu *pmu = group_event->pmu; |
| 677 | bool txn = false; | ||
| 678 | 733 | ||
| 679 | if (group_event->state == PERF_EVENT_STATE_OFF) | 734 | if (group_event->state == PERF_EVENT_STATE_OFF) |
| 680 | return 0; | 735 | return 0; |
| 681 | 736 | ||
| 682 | /* Check if group transaction availabe */ | 737 | pmu->start_txn(pmu); |
| 683 | if (pmu->start_txn) | ||
| 684 | txn = true; | ||
| 685 | |||
| 686 | if (txn) | ||
| 687 | pmu->start_txn(pmu); | ||
| 688 | 738 | ||
| 689 | if (event_sched_in(group_event, cpuctx, ctx)) { | 739 | /* |
| 690 | if (txn) | 740 | * use __event_sched_in() to delay updating tstamp_running |
| 691 | pmu->cancel_txn(pmu); | 741 | * until the transaction is committed. In case of failure |
| 742 | * we will keep an unmodified tstamp_running which is a | ||
| 743 | * requirement to get correct timing information | ||
| 744 | */ | ||
| 745 | if (__event_sched_in(group_event, cpuctx, ctx)) { | ||
| 746 | pmu->cancel_txn(pmu); | ||
| 692 | return -EAGAIN; | 747 | return -EAGAIN; |
| 693 | } | 748 | } |
| 694 | 749 | ||
| @@ -696,29 +751,33 @@ group_sched_in(struct perf_event *group_event, | |||
| 696 | * Schedule in siblings as one group (if any): | 751 | * Schedule in siblings as one group (if any): |
| 697 | */ | 752 | */ |
| 698 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { | 753 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { |
| 699 | if (event_sched_in(event, cpuctx, ctx)) { | 754 | if (__event_sched_in(event, cpuctx, ctx)) { |
| 700 | partial_group = event; | 755 | partial_group = event; |
| 701 | goto group_error; | 756 | goto group_error; |
| 702 | } | 757 | } |
| 703 | } | 758 | } |
| 704 | 759 | ||
| 705 | if (!txn || !pmu->commit_txn(pmu)) | 760 | if (!pmu->commit_txn(pmu)) { |
| 761 | /* commit tstamp_running */ | ||
| 762 | group_commit_event_sched_in(group_event, cpuctx, ctx); | ||
| 706 | return 0; | 763 | return 0; |
| 707 | 764 | } | |
| 708 | group_error: | 765 | group_error: |
| 709 | /* | 766 | /* |
| 710 | * Groups can be scheduled in as one unit only, so undo any | 767 | * Groups can be scheduled in as one unit only, so undo any |
| 711 | * partial group before returning: | 768 | * partial group before returning: |
| 769 | * | ||
| 770 | * use __event_sched_out() to avoid updating tstamp_stopped | ||
| 771 | * because the event never actually ran | ||
| 712 | */ | 772 | */ |
| 713 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { | 773 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { |
| 714 | if (event == partial_group) | 774 | if (event == partial_group) |
| 715 | break; | 775 | break; |
| 716 | event_sched_out(event, cpuctx, ctx); | 776 | __event_sched_out(event, cpuctx, ctx); |
| 717 | } | 777 | } |
| 718 | event_sched_out(group_event, cpuctx, ctx); | 778 | __event_sched_out(group_event, cpuctx, ctx); |
| 719 | 779 | ||
| 720 | if (txn) | 780 | pmu->cancel_txn(pmu); |
| 721 | pmu->cancel_txn(pmu); | ||
| 722 | 781 | ||
| 723 | return -EAGAIN; | 782 | return -EAGAIN; |
| 724 | } | 783 | } |
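The rewritten group_sched_in() above leans entirely on the pmu transaction hooks: start_txn() opens the transaction, the leader and each sibling are added with __event_sched_in(), and only commit_txn() makes the group (and its tstamp_running updates) visible; any failure unwinds the partial group with __event_sched_out() and cancel_txn(). A standalone C sketch of that commit-or-rollback shape follows. It is an illustrative model only: fake_pmu, the counter budget and the helper names are invented for the example and are not kernel API.

#include <stdio.h>

/* Illustrative model only: a "pmu" with a fixed counter budget. */
struct fake_pmu {
        int free_counters;
        int reserved;          /* counters taken since start_txn() */
};

static void start_txn(struct fake_pmu *p)  { p->reserved = 0; }
static void cancel_txn(struct fake_pmu *p) { p->free_counters += p->reserved; p->reserved = 0; }
static int  commit_txn(struct fake_pmu *p) { p->reserved = 0; return 0; /* 0 = success */ }

/* Model of __event_sched_in(): claim one counter, fail if none are left. */
static int event_sched_in(struct fake_pmu *p)
{
        if (!p->free_counters)
                return -1;
        p->free_counters--;
        p->reserved++;
        return 0;
}

/* Model of group_sched_in(): all-or-nothing scheduling of leader + siblings. */
static int group_sched_in(struct fake_pmu *p, int group_size)
{
        int i;

        start_txn(p);
        for (i = 0; i < group_size; i++) {
                if (event_sched_in(p)) {
                        cancel_txn(p);  /* the kernel also unwinds siblings via __event_sched_out() */
                        return -1;      /* -EAGAIN in the kernel */
                }
        }
        return commit_txn(p);           /* group_commit_event_sched_in() runs only on success */
}

int main(void)
{
        struct fake_pmu pmu = { .free_counters = 4 };

        printf("group of 3: %d\n", group_sched_in(&pmu, 3)); /* fits: 0 */
        printf("group of 2: %d\n", group_sched_in(&pmu, 2)); /* only 1 counter left: -1 */
        printf("free counters: %d\n", pmu.free_counters);    /* rollback kept it at 1 */
        return 0;
}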
| @@ -771,10 +830,10 @@ static void add_event_to_ctx(struct perf_event *event, | |||
| 771 | */ | 830 | */ |
| 772 | static void __perf_install_in_context(void *info) | 831 | static void __perf_install_in_context(void *info) |
| 773 | { | 832 | { |
| 774 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
| 775 | struct perf_event *event = info; | 833 | struct perf_event *event = info; |
| 776 | struct perf_event_context *ctx = event->ctx; | 834 | struct perf_event_context *ctx = event->ctx; |
| 777 | struct perf_event *leader = event->group_leader; | 835 | struct perf_event *leader = event->group_leader; |
| 836 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
| 778 | int err; | 837 | int err; |
| 779 | 838 | ||
| 780 | /* | 839 | /* |
| @@ -794,12 +853,6 @@ static void __perf_install_in_context(void *info) | |||
| 794 | ctx->is_active = 1; | 853 | ctx->is_active = 1; |
| 795 | update_context_time(ctx); | 854 | update_context_time(ctx); |
| 796 | 855 | ||
| 797 | /* | ||
| 798 | * Protect the list operation against NMI by disabling the | ||
| 799 | * events on a global level. NOP for non NMI based events. | ||
| 800 | */ | ||
| 801 | perf_disable(); | ||
| 802 | |||
| 803 | add_event_to_ctx(event, ctx); | 856 | add_event_to_ctx(event, ctx); |
| 804 | 857 | ||
| 805 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 858 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
| @@ -837,12 +890,7 @@ static void __perf_install_in_context(void *info) | |||
| 837 | } | 890 | } |
| 838 | } | 891 | } |
| 839 | 892 | ||
| 840 | if (!err && !ctx->task && cpuctx->max_pertask) | 893 | unlock: |
| 841 | cpuctx->max_pertask--; | ||
| 842 | |||
| 843 | unlock: | ||
| 844 | perf_enable(); | ||
| 845 | |||
| 846 | raw_spin_unlock(&ctx->lock); | 894 | raw_spin_unlock(&ctx->lock); |
| 847 | } | 895 | } |
| 848 | 896 | ||
| @@ -865,6 +913,8 @@ perf_install_in_context(struct perf_event_context *ctx, | |||
| 865 | { | 913 | { |
| 866 | struct task_struct *task = ctx->task; | 914 | struct task_struct *task = ctx->task; |
| 867 | 915 | ||
| 916 | event->ctx = ctx; | ||
| 917 | |||
| 868 | if (!task) { | 918 | if (!task) { |
| 869 | /* | 919 | /* |
| 870 | * Per cpu events are installed via an smp call and | 920 | * Per cpu events are installed via an smp call and |
| @@ -913,10 +963,12 @@ static void __perf_event_mark_enabled(struct perf_event *event, | |||
| 913 | 963 | ||
| 914 | event->state = PERF_EVENT_STATE_INACTIVE; | 964 | event->state = PERF_EVENT_STATE_INACTIVE; |
| 915 | event->tstamp_enabled = ctx->time - event->total_time_enabled; | 965 | event->tstamp_enabled = ctx->time - event->total_time_enabled; |
| 916 | list_for_each_entry(sub, &event->sibling_list, group_entry) | 966 | list_for_each_entry(sub, &event->sibling_list, group_entry) { |
| 917 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) | 967 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) { |
| 918 | sub->tstamp_enabled = | 968 | sub->tstamp_enabled = |
| 919 | ctx->time - sub->total_time_enabled; | 969 | ctx->time - sub->total_time_enabled; |
| 970 | } | ||
| 971 | } | ||
| 920 | } | 972 | } |
| 921 | 973 | ||
| 922 | /* | 974 | /* |
| @@ -925,9 +977,9 @@ static void __perf_event_mark_enabled(struct perf_event *event, | |||
| 925 | static void __perf_event_enable(void *info) | 977 | static void __perf_event_enable(void *info) |
| 926 | { | 978 | { |
| 927 | struct perf_event *event = info; | 979 | struct perf_event *event = info; |
| 928 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
| 929 | struct perf_event_context *ctx = event->ctx; | 980 | struct perf_event_context *ctx = event->ctx; |
| 930 | struct perf_event *leader = event->group_leader; | 981 | struct perf_event *leader = event->group_leader; |
| 982 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
| 931 | int err; | 983 | int err; |
| 932 | 984 | ||
| 933 | /* | 985 | /* |
| @@ -961,12 +1013,10 @@ static void __perf_event_enable(void *info) | |||
| 961 | if (!group_can_go_on(event, cpuctx, 1)) { | 1013 | if (!group_can_go_on(event, cpuctx, 1)) { |
| 962 | err = -EEXIST; | 1014 | err = -EEXIST; |
| 963 | } else { | 1015 | } else { |
| 964 | perf_disable(); | ||
| 965 | if (event == leader) | 1016 | if (event == leader) |
| 966 | err = group_sched_in(event, cpuctx, ctx); | 1017 | err = group_sched_in(event, cpuctx, ctx); |
| 967 | else | 1018 | else |
| 968 | err = event_sched_in(event, cpuctx, ctx); | 1019 | err = event_sched_in(event, cpuctx, ctx); |
| 969 | perf_enable(); | ||
| 970 | } | 1020 | } |
| 971 | 1021 | ||
| 972 | if (err) { | 1022 | if (err) { |
| @@ -982,7 +1032,7 @@ static void __perf_event_enable(void *info) | |||
| 982 | } | 1032 | } |
| 983 | } | 1033 | } |
| 984 | 1034 | ||
| 985 | unlock: | 1035 | unlock: |
| 986 | raw_spin_unlock(&ctx->lock); | 1036 | raw_spin_unlock(&ctx->lock); |
| 987 | } | 1037 | } |
| 988 | 1038 | ||
| @@ -1023,7 +1073,7 @@ void perf_event_enable(struct perf_event *event) | |||
| 1023 | if (event->state == PERF_EVENT_STATE_ERROR) | 1073 | if (event->state == PERF_EVENT_STATE_ERROR) |
| 1024 | event->state = PERF_EVENT_STATE_OFF; | 1074 | event->state = PERF_EVENT_STATE_OFF; |
| 1025 | 1075 | ||
| 1026 | retry: | 1076 | retry: |
| 1027 | raw_spin_unlock_irq(&ctx->lock); | 1077 | raw_spin_unlock_irq(&ctx->lock); |
| 1028 | task_oncpu_function_call(task, __perf_event_enable, event); | 1078 | task_oncpu_function_call(task, __perf_event_enable, event); |
| 1029 | 1079 | ||
| @@ -1043,7 +1093,7 @@ void perf_event_enable(struct perf_event *event) | |||
| 1043 | if (event->state == PERF_EVENT_STATE_OFF) | 1093 | if (event->state == PERF_EVENT_STATE_OFF) |
| 1044 | __perf_event_mark_enabled(event, ctx); | 1094 | __perf_event_mark_enabled(event, ctx); |
| 1045 | 1095 | ||
| 1046 | out: | 1096 | out: |
| 1047 | raw_spin_unlock_irq(&ctx->lock); | 1097 | raw_spin_unlock_irq(&ctx->lock); |
| 1048 | } | 1098 | } |
| 1049 | 1099 | ||
| @@ -1074,26 +1124,26 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
| 1074 | struct perf_event *event; | 1124 | struct perf_event *event; |
| 1075 | 1125 | ||
| 1076 | raw_spin_lock(&ctx->lock); | 1126 | raw_spin_lock(&ctx->lock); |
| 1127 | perf_pmu_disable(ctx->pmu); | ||
| 1077 | ctx->is_active = 0; | 1128 | ctx->is_active = 0; |
| 1078 | if (likely(!ctx->nr_events)) | 1129 | if (likely(!ctx->nr_events)) |
| 1079 | goto out; | 1130 | goto out; |
| 1080 | update_context_time(ctx); | 1131 | update_context_time(ctx); |
| 1081 | 1132 | ||
| 1082 | perf_disable(); | ||
| 1083 | if (!ctx->nr_active) | 1133 | if (!ctx->nr_active) |
| 1084 | goto out_enable; | 1134 | goto out; |
| 1085 | 1135 | ||
| 1086 | if (event_type & EVENT_PINNED) | 1136 | if (event_type & EVENT_PINNED) { |
| 1087 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) | 1137 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) |
| 1088 | group_sched_out(event, cpuctx, ctx); | 1138 | group_sched_out(event, cpuctx, ctx); |
| 1139 | } | ||
| 1089 | 1140 | ||
| 1090 | if (event_type & EVENT_FLEXIBLE) | 1141 | if (event_type & EVENT_FLEXIBLE) { |
| 1091 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) | 1142 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) |
| 1092 | group_sched_out(event, cpuctx, ctx); | 1143 | group_sched_out(event, cpuctx, ctx); |
| 1093 | 1144 | } | |
| 1094 | out_enable: | 1145 | out: |
| 1095 | perf_enable(); | 1146 | perf_pmu_enable(ctx->pmu); |
| 1096 | out: | ||
| 1097 | raw_spin_unlock(&ctx->lock); | 1147 | raw_spin_unlock(&ctx->lock); |
| 1098 | } | 1148 | } |
| 1099 | 1149 | ||
| @@ -1191,34 +1241,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, | |||
| 1191 | } | 1241 | } |
| 1192 | } | 1242 | } |
| 1193 | 1243 | ||
| 1194 | /* | 1244 | void perf_event_context_sched_out(struct task_struct *task, int ctxn, |
| 1195 | * Called from scheduler to remove the events of the current task, | 1245 | struct task_struct *next) |
| 1196 | * with interrupts disabled. | ||
| 1197 | * | ||
| 1198 | * We stop each event and update the event value in event->count. | ||
| 1199 | * | ||
| 1200 | * This does not protect us against NMI, but disable() | ||
| 1201 | * sets the disabled bit in the control field of event _before_ | ||
| 1202 | * accessing the event control register. If a NMI hits, then it will | ||
| 1203 | * not restart the event. | ||
| 1204 | */ | ||
| 1205 | void perf_event_task_sched_out(struct task_struct *task, | ||
| 1206 | struct task_struct *next) | ||
| 1207 | { | 1246 | { |
| 1208 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1247 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; |
| 1209 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
| 1210 | struct perf_event_context *next_ctx; | 1248 | struct perf_event_context *next_ctx; |
| 1211 | struct perf_event_context *parent; | 1249 | struct perf_event_context *parent; |
| 1250 | struct perf_cpu_context *cpuctx; | ||
| 1212 | int do_switch = 1; | 1251 | int do_switch = 1; |
| 1213 | 1252 | ||
| 1214 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); | 1253 | if (likely(!ctx)) |
| 1254 | return; | ||
| 1215 | 1255 | ||
| 1216 | if (likely(!ctx || !cpuctx->task_ctx)) | 1256 | cpuctx = __get_cpu_context(ctx); |
| 1257 | if (!cpuctx->task_ctx) | ||
| 1217 | return; | 1258 | return; |
| 1218 | 1259 | ||
| 1219 | rcu_read_lock(); | 1260 | rcu_read_lock(); |
| 1220 | parent = rcu_dereference(ctx->parent_ctx); | 1261 | parent = rcu_dereference(ctx->parent_ctx); |
| 1221 | next_ctx = next->perf_event_ctxp; | 1262 | next_ctx = next->perf_event_ctxp[ctxn]; |
| 1222 | if (parent && next_ctx && | 1263 | if (parent && next_ctx && |
| 1223 | rcu_dereference(next_ctx->parent_ctx) == parent) { | 1264 | rcu_dereference(next_ctx->parent_ctx) == parent) { |
| 1224 | /* | 1265 | /* |
| @@ -1237,8 +1278,8 @@ void perf_event_task_sched_out(struct task_struct *task, | |||
| 1237 | * XXX do we need a memory barrier of sorts | 1278 | * XXX do we need a memory barrier of sorts |
| 1238 | * wrt to rcu_dereference() of perf_event_ctxp | 1279 | * wrt to rcu_dereference() of perf_event_ctxp |
| 1239 | */ | 1280 | */ |
| 1240 | task->perf_event_ctxp = next_ctx; | 1281 | task->perf_event_ctxp[ctxn] = next_ctx; |
| 1241 | next->perf_event_ctxp = ctx; | 1282 | next->perf_event_ctxp[ctxn] = ctx; |
| 1242 | ctx->task = next; | 1283 | ctx->task = next; |
| 1243 | next_ctx->task = task; | 1284 | next_ctx->task = task; |
| 1244 | do_switch = 0; | 1285 | do_switch = 0; |
| @@ -1256,10 +1297,35 @@ void perf_event_task_sched_out(struct task_struct *task, | |||
| 1256 | } | 1297 | } |
| 1257 | } | 1298 | } |
| 1258 | 1299 | ||
| 1300 | #define for_each_task_context_nr(ctxn) \ | ||
| 1301 | for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) | ||
| 1302 | |||
| 1303 | /* | ||
| 1304 | * Called from scheduler to remove the events of the current task, | ||
| 1305 | * with interrupts disabled. | ||
| 1306 | * | ||
| 1307 | * We stop each event and update the event value in event->count. | ||
| 1308 | * | ||
| 1309 | * This does not protect us against NMI, but disable() | ||
| 1310 | * sets the disabled bit in the control field of event _before_ | ||
| 1311 | * accessing the event control register. If a NMI hits, then it will | ||
| 1312 | * not restart the event. | ||
| 1313 | */ | ||
| 1314 | void __perf_event_task_sched_out(struct task_struct *task, | ||
| 1315 | struct task_struct *next) | ||
| 1316 | { | ||
| 1317 | int ctxn; | ||
| 1318 | |||
| 1319 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); | ||
| 1320 | |||
| 1321 | for_each_task_context_nr(ctxn) | ||
| 1322 | perf_event_context_sched_out(task, ctxn, next); | ||
| 1323 | } | ||
| 1324 | |||
| 1259 | static void task_ctx_sched_out(struct perf_event_context *ctx, | 1325 | static void task_ctx_sched_out(struct perf_event_context *ctx, |
| 1260 | enum event_type_t event_type) | 1326 | enum event_type_t event_type) |
| 1261 | { | 1327 | { |
| 1262 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1328 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
| 1263 | 1329 | ||
| 1264 | if (!cpuctx->task_ctx) | 1330 | if (!cpuctx->task_ctx) |
| 1265 | return; | 1331 | return; |
| @@ -1274,14 +1340,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, | |||
| 1274 | /* | 1340 | /* |
| 1275 | * Called with IRQs disabled | 1341 | * Called with IRQs disabled |
| 1276 | */ | 1342 | */ |
| 1277 | static void __perf_event_task_sched_out(struct perf_event_context *ctx) | ||
| 1278 | { | ||
| 1279 | task_ctx_sched_out(ctx, EVENT_ALL); | ||
| 1280 | } | ||
| 1281 | |||
| 1282 | /* | ||
| 1283 | * Called with IRQs disabled | ||
| 1284 | */ | ||
| 1285 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | 1343 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, |
| 1286 | enum event_type_t event_type) | 1344 | enum event_type_t event_type) |
| 1287 | { | 1345 | { |
| @@ -1332,9 +1390,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
| 1332 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 1390 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
| 1333 | continue; | 1391 | continue; |
| 1334 | 1392 | ||
| 1335 | if (group_can_go_on(event, cpuctx, can_add_hw)) | 1393 | if (group_can_go_on(event, cpuctx, can_add_hw)) { |
| 1336 | if (group_sched_in(event, cpuctx, ctx)) | 1394 | if (group_sched_in(event, cpuctx, ctx)) |
| 1337 | can_add_hw = 0; | 1395 | can_add_hw = 0; |
| 1396 | } | ||
| 1338 | } | 1397 | } |
| 1339 | } | 1398 | } |
| 1340 | 1399 | ||
| @@ -1350,8 +1409,6 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
| 1350 | 1409 | ||
| 1351 | ctx->timestamp = perf_clock(); | 1410 | ctx->timestamp = perf_clock(); |
| 1352 | 1411 | ||
| 1353 | perf_disable(); | ||
| 1354 | |||
| 1355 | /* | 1412 | /* |
| 1356 | * First go through the list and put on any pinned groups | 1413 | * First go through the list and put on any pinned groups |
| 1357 | * in order to give them the best chance of going on. | 1414 | * in order to give them the best chance of going on. |
| @@ -1363,8 +1420,7 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
| 1363 | if (event_type & EVENT_FLEXIBLE) | 1420 | if (event_type & EVENT_FLEXIBLE) |
| 1364 | ctx_flexible_sched_in(ctx, cpuctx); | 1421 | ctx_flexible_sched_in(ctx, cpuctx); |
| 1365 | 1422 | ||
| 1366 | perf_enable(); | 1423 | out: |
| 1367 | out: | ||
| 1368 | raw_spin_unlock(&ctx->lock); | 1424 | raw_spin_unlock(&ctx->lock); |
| 1369 | } | 1425 | } |
| 1370 | 1426 | ||
| @@ -1376,43 +1432,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | |||
| 1376 | ctx_sched_in(ctx, cpuctx, event_type); | 1432 | ctx_sched_in(ctx, cpuctx, event_type); |
| 1377 | } | 1433 | } |
| 1378 | 1434 | ||
| 1379 | static void task_ctx_sched_in(struct task_struct *task, | 1435 | static void task_ctx_sched_in(struct perf_event_context *ctx, |
| 1380 | enum event_type_t event_type) | 1436 | enum event_type_t event_type) |
| 1381 | { | 1437 | { |
| 1382 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 1438 | struct perf_cpu_context *cpuctx; |
| 1383 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
| 1384 | 1439 | ||
| 1385 | if (likely(!ctx)) | 1440 | cpuctx = __get_cpu_context(ctx); |
| 1386 | return; | ||
| 1387 | if (cpuctx->task_ctx == ctx) | 1441 | if (cpuctx->task_ctx == ctx) |
| 1388 | return; | 1442 | return; |
| 1443 | |||
| 1389 | ctx_sched_in(ctx, cpuctx, event_type); | 1444 | ctx_sched_in(ctx, cpuctx, event_type); |
| 1390 | cpuctx->task_ctx = ctx; | 1445 | cpuctx->task_ctx = ctx; |
| 1391 | } | 1446 | } |
| 1392 | /* | ||
| 1393 | * Called from scheduler to add the events of the current task | ||
| 1394 | * with interrupts disabled. | ||
| 1395 | * | ||
| 1396 | * We restore the event value and then enable it. | ||
| 1397 | * | ||
| 1398 | * This does not protect us against NMI, but enable() | ||
| 1399 | * sets the enabled bit in the control field of event _before_ | ||
| 1400 | * accessing the event control register. If a NMI hits, then it will | ||
| 1401 | * keep the event running. | ||
| 1402 | */ | ||
| 1403 | void perf_event_task_sched_in(struct task_struct *task) | ||
| 1404 | { | ||
| 1405 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
| 1406 | struct perf_event_context *ctx = task->perf_event_ctxp; | ||
| 1407 | 1447 | ||
| 1408 | if (likely(!ctx)) | 1448 | void perf_event_context_sched_in(struct perf_event_context *ctx) |
| 1409 | return; | 1449 | { |
| 1450 | struct perf_cpu_context *cpuctx; | ||
| 1410 | 1451 | ||
| 1452 | cpuctx = __get_cpu_context(ctx); | ||
| 1411 | if (cpuctx->task_ctx == ctx) | 1453 | if (cpuctx->task_ctx == ctx) |
| 1412 | return; | 1454 | return; |
| 1413 | 1455 | ||
| 1414 | perf_disable(); | 1456 | perf_pmu_disable(ctx->pmu); |
| 1415 | |||
| 1416 | /* | 1457 | /* |
| 1417 | * We want to keep the following priority order: | 1458 | * We want to keep the following priority order: |
| 1418 | * cpu pinned (that don't need to move), task pinned, | 1459 | * cpu pinned (that don't need to move), task pinned, |
| @@ -1426,7 +1467,37 @@ void perf_event_task_sched_in(struct task_struct *task) | |||
| 1426 | 1467 | ||
| 1427 | cpuctx->task_ctx = ctx; | 1468 | cpuctx->task_ctx = ctx; |
| 1428 | 1469 | ||
| 1429 | perf_enable(); | 1470 | /* |
| 1471 | * Since these rotations are per-cpu, we need to ensure the | ||
| 1472 | * cpu-context we got scheduled on is actually rotating. | ||
| 1473 | */ | ||
| 1474 | perf_pmu_rotate_start(ctx->pmu); | ||
| 1475 | perf_pmu_enable(ctx->pmu); | ||
| 1476 | } | ||
| 1477 | |||
| 1478 | /* | ||
| 1479 | * Called from scheduler to add the events of the current task | ||
| 1480 | * with interrupts disabled. | ||
| 1481 | * | ||
| 1482 | * We restore the event value and then enable it. | ||
| 1483 | * | ||
| 1484 | * This does not protect us against NMI, but enable() | ||
| 1485 | * sets the enabled bit in the control field of event _before_ | ||
| 1486 | * accessing the event control register. If a NMI hits, then it will | ||
| 1487 | * keep the event running. | ||
| 1488 | */ | ||
| 1489 | void __perf_event_task_sched_in(struct task_struct *task) | ||
| 1490 | { | ||
| 1491 | struct perf_event_context *ctx; | ||
| 1492 | int ctxn; | ||
| 1493 | |||
| 1494 | for_each_task_context_nr(ctxn) { | ||
| 1495 | ctx = task->perf_event_ctxp[ctxn]; | ||
| 1496 | if (likely(!ctx)) | ||
| 1497 | continue; | ||
| 1498 | |||
| 1499 | perf_event_context_sched_in(ctx); | ||
| 1500 | } | ||
| 1430 | } | 1501 | } |
| 1431 | 1502 | ||
| 1432 | #define MAX_INTERRUPTS (~0ULL) | 1503 | #define MAX_INTERRUPTS (~0ULL) |
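With perf_event_ctxp turned into an array indexed by ctxn, the scheduler hooks above simply walk every per-task context slot via for_each_task_context_nr() and skip the empty ones. The small model below shows that iteration pattern; the two slot names are invented for the example and do not correspond to the kernel's actual task_ctx_nr values.

#include <stdio.h>
#include <stddef.h>

/* Illustrative model: one context pointer per context class per task. */
enum { CTX_HW, CTX_SW, NR_TASK_CONTEXTS };

struct model_ctx  { const char *name; };
struct model_task { struct model_ctx *ctxp[NR_TASK_CONTEXTS]; };

static void context_sched_in(struct model_ctx *ctx)
{
        printf("scheduling in %s events\n", ctx->name);
}

static void task_sched_in(struct model_task *task)
{
        int ctxn;

        for (ctxn = 0; ctxn < NR_TASK_CONTEXTS; ctxn++) {
                if (!task->ctxp[ctxn])          /* likely(!ctx) in the kernel */
                        continue;
                context_sched_in(task->ctxp[ctxn]);
        }
}

int main(void)
{
        struct model_ctx hw = { "hardware" };
        struct model_task task = { .ctxp = { [CTX_HW] = &hw, [CTX_SW] = NULL } };

        task_sched_in(&task);                   /* only the populated slot is visited */
        return 0;
}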
| @@ -1506,22 +1577,6 @@ do { \ | |||
| 1506 | return div64_u64(dividend, divisor); | 1577 | return div64_u64(dividend, divisor); |
| 1507 | } | 1578 | } |
| 1508 | 1579 | ||
| 1509 | static void perf_event_stop(struct perf_event *event) | ||
| 1510 | { | ||
| 1511 | if (!event->pmu->stop) | ||
| 1512 | return event->pmu->disable(event); | ||
| 1513 | |||
| 1514 | return event->pmu->stop(event); | ||
| 1515 | } | ||
| 1516 | |||
| 1517 | static int perf_event_start(struct perf_event *event) | ||
| 1518 | { | ||
| 1519 | if (!event->pmu->start) | ||
| 1520 | return event->pmu->enable(event); | ||
| 1521 | |||
| 1522 | return event->pmu->start(event); | ||
| 1523 | } | ||
| 1524 | |||
| 1525 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | 1580 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) |
| 1526 | { | 1581 | { |
| 1527 | struct hw_perf_event *hwc = &event->hw; | 1582 | struct hw_perf_event *hwc = &event->hw; |
| @@ -1541,15 +1596,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | |||
| 1541 | hwc->sample_period = sample_period; | 1596 | hwc->sample_period = sample_period; |
| 1542 | 1597 | ||
| 1543 | if (local64_read(&hwc->period_left) > 8*sample_period) { | 1598 | if (local64_read(&hwc->period_left) > 8*sample_period) { |
| 1544 | perf_disable(); | 1599 | event->pmu->stop(event, PERF_EF_UPDATE); |
| 1545 | perf_event_stop(event); | ||
| 1546 | local64_set(&hwc->period_left, 0); | 1600 | local64_set(&hwc->period_left, 0); |
| 1547 | perf_event_start(event); | 1601 | event->pmu->start(event, PERF_EF_RELOAD); |
| 1548 | perf_enable(); | ||
| 1549 | } | 1602 | } |
| 1550 | } | 1603 | } |
| 1551 | 1604 | ||
| 1552 | static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | 1605 | static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) |
| 1553 | { | 1606 | { |
| 1554 | struct perf_event *event; | 1607 | struct perf_event *event; |
| 1555 | struct hw_perf_event *hwc; | 1608 | struct hw_perf_event *hwc; |
| @@ -1574,23 +1627,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | |||
| 1574 | */ | 1627 | */ |
| 1575 | if (interrupts == MAX_INTERRUPTS) { | 1628 | if (interrupts == MAX_INTERRUPTS) { |
| 1576 | perf_log_throttle(event, 1); | 1629 | perf_log_throttle(event, 1); |
| 1577 | perf_disable(); | 1630 | event->pmu->start(event, 0); |
| 1578 | event->pmu->unthrottle(event); | ||
| 1579 | perf_enable(); | ||
| 1580 | } | 1631 | } |
| 1581 | 1632 | ||
| 1582 | if (!event->attr.freq || !event->attr.sample_freq) | 1633 | if (!event->attr.freq || !event->attr.sample_freq) |
| 1583 | continue; | 1634 | continue; |
| 1584 | 1635 | ||
| 1585 | perf_disable(); | ||
| 1586 | event->pmu->read(event); | 1636 | event->pmu->read(event); |
| 1587 | now = local64_read(&event->count); | 1637 | now = local64_read(&event->count); |
| 1588 | delta = now - hwc->freq_count_stamp; | 1638 | delta = now - hwc->freq_count_stamp; |
| 1589 | hwc->freq_count_stamp = now; | 1639 | hwc->freq_count_stamp = now; |
| 1590 | 1640 | ||
| 1591 | if (delta > 0) | 1641 | if (delta > 0) |
| 1592 | perf_adjust_period(event, TICK_NSEC, delta); | 1642 | perf_adjust_period(event, period, delta); |
| 1593 | perf_enable(); | ||
| 1594 | } | 1643 | } |
| 1595 | raw_spin_unlock(&ctx->lock); | 1644 | raw_spin_unlock(&ctx->lock); |
| 1596 | } | 1645 | } |
| @@ -1608,32 +1657,38 @@ static void rotate_ctx(struct perf_event_context *ctx) | |||
| 1608 | raw_spin_unlock(&ctx->lock); | 1657 | raw_spin_unlock(&ctx->lock); |
| 1609 | } | 1658 | } |
| 1610 | 1659 | ||
| 1611 | void perf_event_task_tick(struct task_struct *curr) | 1660 | /* |
| 1661 | * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized | ||
| 1662 | * because they're strictly cpu affine and rotate_start is called with IRQs | ||
| 1663 | * disabled, while rotate_context is called from IRQ context. | ||
| 1664 | */ | ||
| 1665 | static void perf_rotate_context(struct perf_cpu_context *cpuctx) | ||
| 1612 | { | 1666 | { |
| 1613 | struct perf_cpu_context *cpuctx; | 1667 | u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; |
| 1614 | struct perf_event_context *ctx; | 1668 | struct perf_event_context *ctx = NULL; |
| 1615 | int rotate = 0; | 1669 | int rotate = 0, remove = 1; |
| 1616 | |||
| 1617 | if (!atomic_read(&nr_events)) | ||
| 1618 | return; | ||
| 1619 | 1670 | ||
| 1620 | cpuctx = &__get_cpu_var(perf_cpu_context); | 1671 | if (cpuctx->ctx.nr_events) { |
| 1621 | if (cpuctx->ctx.nr_events && | 1672 | remove = 0; |
| 1622 | cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) | 1673 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) |
| 1623 | rotate = 1; | 1674 | rotate = 1; |
| 1675 | } | ||
| 1624 | 1676 | ||
| 1625 | ctx = curr->perf_event_ctxp; | 1677 | ctx = cpuctx->task_ctx; |
| 1626 | if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) | 1678 | if (ctx && ctx->nr_events) { |
| 1627 | rotate = 1; | 1679 | remove = 0; |
| 1680 | if (ctx->nr_events != ctx->nr_active) | ||
| 1681 | rotate = 1; | ||
| 1682 | } | ||
| 1628 | 1683 | ||
| 1629 | perf_ctx_adjust_freq(&cpuctx->ctx); | 1684 | perf_pmu_disable(cpuctx->ctx.pmu); |
| 1685 | perf_ctx_adjust_freq(&cpuctx->ctx, interval); | ||
| 1630 | if (ctx) | 1686 | if (ctx) |
| 1631 | perf_ctx_adjust_freq(ctx); | 1687 | perf_ctx_adjust_freq(ctx, interval); |
| 1632 | 1688 | ||
| 1633 | if (!rotate) | 1689 | if (!rotate) |
| 1634 | return; | 1690 | goto done; |
| 1635 | 1691 | ||
| 1636 | perf_disable(); | ||
| 1637 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 1692 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
| 1638 | if (ctx) | 1693 | if (ctx) |
| 1639 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); | 1694 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); |
| @@ -1644,8 +1699,27 @@ void perf_event_task_tick(struct task_struct *curr) | |||
| 1644 | 1699 | ||
| 1645 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 1700 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); |
| 1646 | if (ctx) | 1701 | if (ctx) |
| 1647 | task_ctx_sched_in(curr, EVENT_FLEXIBLE); | 1702 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); |
| 1648 | perf_enable(); | 1703 | |
| 1704 | done: | ||
| 1705 | if (remove) | ||
| 1706 | list_del_init(&cpuctx->rotation_list); | ||
| 1707 | |||
| 1708 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
| 1709 | } | ||
| 1710 | |||
| 1711 | void perf_event_task_tick(void) | ||
| 1712 | { | ||
| 1713 | struct list_head *head = &__get_cpu_var(rotation_list); | ||
| 1714 | struct perf_cpu_context *cpuctx, *tmp; | ||
| 1715 | |||
| 1716 | WARN_ON(!irqs_disabled()); | ||
| 1717 | |||
| 1718 | list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { | ||
| 1719 | if (cpuctx->jiffies_interval == 1 || | ||
| 1720 | !(jiffies % cpuctx->jiffies_interval)) | ||
| 1721 | perf_rotate_context(cpuctx); | ||
| 1722 | } | ||
| 1649 | } | 1723 | } |
| 1650 | 1724 | ||
| 1651 | static int event_enable_on_exec(struct perf_event *event, | 1725 | static int event_enable_on_exec(struct perf_event *event, |
| @@ -1667,20 +1741,18 @@ static int event_enable_on_exec(struct perf_event *event, | |||
| 1667 | * Enable all of a task's events that have been marked enable-on-exec. | 1741 | * Enable all of a task's events that have been marked enable-on-exec. |
| 1668 | * This expects task == current. | 1742 | * This expects task == current. |
| 1669 | */ | 1743 | */ |
| 1670 | static void perf_event_enable_on_exec(struct task_struct *task) | 1744 | static void perf_event_enable_on_exec(struct perf_event_context *ctx) |
| 1671 | { | 1745 | { |
| 1672 | struct perf_event_context *ctx; | ||
| 1673 | struct perf_event *event; | 1746 | struct perf_event *event; |
| 1674 | unsigned long flags; | 1747 | unsigned long flags; |
| 1675 | int enabled = 0; | 1748 | int enabled = 0; |
| 1676 | int ret; | 1749 | int ret; |
| 1677 | 1750 | ||
| 1678 | local_irq_save(flags); | 1751 | local_irq_save(flags); |
| 1679 | ctx = task->perf_event_ctxp; | ||
| 1680 | if (!ctx || !ctx->nr_events) | 1752 | if (!ctx || !ctx->nr_events) |
| 1681 | goto out; | 1753 | goto out; |
| 1682 | 1754 | ||
| 1683 | __perf_event_task_sched_out(ctx); | 1755 | task_ctx_sched_out(ctx, EVENT_ALL); |
| 1684 | 1756 | ||
| 1685 | raw_spin_lock(&ctx->lock); | 1757 | raw_spin_lock(&ctx->lock); |
| 1686 | 1758 | ||
| @@ -1704,8 +1776,8 @@ static void perf_event_enable_on_exec(struct task_struct *task) | |||
| 1704 | 1776 | ||
| 1705 | raw_spin_unlock(&ctx->lock); | 1777 | raw_spin_unlock(&ctx->lock); |
| 1706 | 1778 | ||
| 1707 | perf_event_task_sched_in(task); | 1779 | perf_event_context_sched_in(ctx); |
| 1708 | out: | 1780 | out: |
| 1709 | local_irq_restore(flags); | 1781 | local_irq_restore(flags); |
| 1710 | } | 1782 | } |
| 1711 | 1783 | ||
| @@ -1714,9 +1786,9 @@ static void perf_event_enable_on_exec(struct task_struct *task) | |||
| 1714 | */ | 1786 | */ |
| 1715 | static void __perf_event_read(void *info) | 1787 | static void __perf_event_read(void *info) |
| 1716 | { | 1788 | { |
| 1717 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
| 1718 | struct perf_event *event = info; | 1789 | struct perf_event *event = info; |
| 1719 | struct perf_event_context *ctx = event->ctx; | 1790 | struct perf_event_context *ctx = event->ctx; |
| 1791 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
| 1720 | 1792 | ||
| 1721 | /* | 1793 | /* |
| 1722 | * If this is a task context, we need to check whether it is | 1794 | * If this is a task context, we need to check whether it is |
| @@ -1755,7 +1827,13 @@ static u64 perf_event_read(struct perf_event *event) | |||
| 1755 | unsigned long flags; | 1827 | unsigned long flags; |
| 1756 | 1828 | ||
| 1757 | raw_spin_lock_irqsave(&ctx->lock, flags); | 1829 | raw_spin_lock_irqsave(&ctx->lock, flags); |
| 1758 | update_context_time(ctx); | 1830 | /* |
| 1831 | * may read while context is not active | ||
| 1832 | * (e.g., thread is blocked), in that case | ||
| 1833 | * we cannot update context time | ||
| 1834 | */ | ||
| 1835 | if (ctx->is_active) | ||
| 1836 | update_context_time(ctx); | ||
| 1759 | update_event_times(event); | 1837 | update_event_times(event); |
| 1760 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 1838 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
| 1761 | } | 1839 | } |
| @@ -1764,11 +1842,219 @@ static u64 perf_event_read(struct perf_event *event) | |||
| 1764 | } | 1842 | } |
| 1765 | 1843 | ||
| 1766 | /* | 1844 | /* |
| 1767 | * Initialize the perf_event context in a task_struct: | 1845 | * Callchain support |
| 1768 | */ | 1846 | */ |
| 1847 | |||
| 1848 | struct callchain_cpus_entries { | ||
| 1849 | struct rcu_head rcu_head; | ||
| 1850 | struct perf_callchain_entry *cpu_entries[0]; | ||
| 1851 | }; | ||
| 1852 | |||
| 1853 | static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); | ||
| 1854 | static atomic_t nr_callchain_events; | ||
| 1855 | static DEFINE_MUTEX(callchain_mutex); | ||
| 1856 | struct callchain_cpus_entries *callchain_cpus_entries; | ||
| 1857 | |||
| 1858 | |||
| 1859 | __weak void perf_callchain_kernel(struct perf_callchain_entry *entry, | ||
| 1860 | struct pt_regs *regs) | ||
| 1861 | { | ||
| 1862 | } | ||
| 1863 | |||
| 1864 | __weak void perf_callchain_user(struct perf_callchain_entry *entry, | ||
| 1865 | struct pt_regs *regs) | ||
| 1866 | { | ||
| 1867 | } | ||
| 1868 | |||
| 1869 | static void release_callchain_buffers_rcu(struct rcu_head *head) | ||
| 1870 | { | ||
| 1871 | struct callchain_cpus_entries *entries; | ||
| 1872 | int cpu; | ||
| 1873 | |||
| 1874 | entries = container_of(head, struct callchain_cpus_entries, rcu_head); | ||
| 1875 | |||
| 1876 | for_each_possible_cpu(cpu) | ||
| 1877 | kfree(entries->cpu_entries[cpu]); | ||
| 1878 | |||
| 1879 | kfree(entries); | ||
| 1880 | } | ||
| 1881 | |||
| 1882 | static void release_callchain_buffers(void) | ||
| 1883 | { | ||
| 1884 | struct callchain_cpus_entries *entries; | ||
| 1885 | |||
| 1886 | entries = callchain_cpus_entries; | ||
| 1887 | rcu_assign_pointer(callchain_cpus_entries, NULL); | ||
| 1888 | call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); | ||
| 1889 | } | ||
| 1890 | |||
| 1891 | static int alloc_callchain_buffers(void) | ||
| 1892 | { | ||
| 1893 | int cpu; | ||
| 1894 | int size; | ||
| 1895 | struct callchain_cpus_entries *entries; | ||
| 1896 | |||
| 1897 | /* | ||
| 1898 | * We can't use the percpu allocation API for data that can be | ||
| 1899 | * accessed from NMI. Use a temporary manual per cpu allocation | ||
| 1900 | * until that gets sorted out. | ||
| 1901 | */ | ||
| 1902 | size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * | ||
| 1903 | num_possible_cpus(); | ||
| 1904 | |||
| 1905 | entries = kzalloc(size, GFP_KERNEL); | ||
| 1906 | if (!entries) | ||
| 1907 | return -ENOMEM; | ||
| 1908 | |||
| 1909 | size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; | ||
| 1910 | |||
| 1911 | for_each_possible_cpu(cpu) { | ||
| 1912 | entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, | ||
| 1913 | cpu_to_node(cpu)); | ||
| 1914 | if (!entries->cpu_entries[cpu]) | ||
| 1915 | goto fail; | ||
| 1916 | } | ||
| 1917 | |||
| 1918 | rcu_assign_pointer(callchain_cpus_entries, entries); | ||
| 1919 | |||
| 1920 | return 0; | ||
| 1921 | |||
| 1922 | fail: | ||
| 1923 | for_each_possible_cpu(cpu) | ||
| 1924 | kfree(entries->cpu_entries[cpu]); | ||
| 1925 | kfree(entries); | ||
| 1926 | |||
| 1927 | return -ENOMEM; | ||
| 1928 | } | ||
| 1929 | |||
| 1930 | static int get_callchain_buffers(void) | ||
| 1931 | { | ||
| 1932 | int err = 0; | ||
| 1933 | int count; | ||
| 1934 | |||
| 1935 | mutex_lock(&callchain_mutex); | ||
| 1936 | |||
| 1937 | count = atomic_inc_return(&nr_callchain_events); | ||
| 1938 | if (WARN_ON_ONCE(count < 1)) { | ||
| 1939 | err = -EINVAL; | ||
| 1940 | goto exit; | ||
| 1941 | } | ||
| 1942 | |||
| 1943 | if (count > 1) { | ||
| 1944 | /* If the allocation failed, give up */ | ||
| 1945 | if (!callchain_cpus_entries) | ||
| 1946 | err = -ENOMEM; | ||
| 1947 | goto exit; | ||
| 1948 | } | ||
| 1949 | |||
| 1950 | err = alloc_callchain_buffers(); | ||
| 1951 | if (err) | ||
| 1952 | release_callchain_buffers(); | ||
| 1953 | exit: | ||
| 1954 | mutex_unlock(&callchain_mutex); | ||
| 1955 | |||
| 1956 | return err; | ||
| 1957 | } | ||
| 1958 | |||
| 1959 | static void put_callchain_buffers(void) | ||
| 1960 | { | ||
| 1961 | if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { | ||
| 1962 | release_callchain_buffers(); | ||
| 1963 | mutex_unlock(&callchain_mutex); | ||
| 1964 | } | ||
| 1965 | } | ||
| 1966 | |||
| 1967 | static int get_recursion_context(int *recursion) | ||
| 1968 | { | ||
| 1969 | int rctx; | ||
| 1970 | |||
| 1971 | if (in_nmi()) | ||
| 1972 | rctx = 3; | ||
| 1973 | else if (in_irq()) | ||
| 1974 | rctx = 2; | ||
| 1975 | else if (in_softirq()) | ||
| 1976 | rctx = 1; | ||
| 1977 | else | ||
| 1978 | rctx = 0; | ||
| 1979 | |||
| 1980 | if (recursion[rctx]) | ||
| 1981 | return -1; | ||
| 1982 | |||
| 1983 | recursion[rctx]++; | ||
| 1984 | barrier(); | ||
| 1985 | |||
| 1986 | return rctx; | ||
| 1987 | } | ||
| 1988 | |||
| 1989 | static inline void put_recursion_context(int *recursion, int rctx) | ||
| 1990 | { | ||
| 1991 | barrier(); | ||
| 1992 | recursion[rctx]--; | ||
| 1993 | } | ||
| 1994 | |||
| 1995 | static struct perf_callchain_entry *get_callchain_entry(int *rctx) | ||
| 1996 | { | ||
| 1997 | int cpu; | ||
| 1998 | struct callchain_cpus_entries *entries; | ||
| 1999 | |||
| 2000 | *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); | ||
| 2001 | if (*rctx == -1) | ||
| 2002 | return NULL; | ||
| 2003 | |||
| 2004 | entries = rcu_dereference(callchain_cpus_entries); | ||
| 2005 | if (!entries) | ||
| 2006 | return NULL; | ||
| 2007 | |||
| 2008 | cpu = smp_processor_id(); | ||
| 2009 | |||
| 2010 | return &entries->cpu_entries[cpu][*rctx]; | ||
| 2011 | } | ||
| 2012 | |||
| 1769 | static void | 2013 | static void |
| 1770 | __perf_event_init_context(struct perf_event_context *ctx, | 2014 | put_callchain_entry(int rctx) |
| 1771 | struct task_struct *task) | 2015 | { |
| 2016 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); | ||
| 2017 | } | ||
| 2018 | |||
| 2019 | static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
| 2020 | { | ||
| 2021 | int rctx; | ||
| 2022 | struct perf_callchain_entry *entry; | ||
| 2023 | |||
| 2024 | |||
| 2025 | entry = get_callchain_entry(&rctx); | ||
| 2026 | if (rctx == -1) | ||
| 2027 | return NULL; | ||
| 2028 | |||
| 2029 | if (!entry) | ||
| 2030 | goto exit_put; | ||
| 2031 | |||
| 2032 | entry->nr = 0; | ||
| 2033 | |||
| 2034 | if (!user_mode(regs)) { | ||
| 2035 | perf_callchain_store(entry, PERF_CONTEXT_KERNEL); | ||
| 2036 | perf_callchain_kernel(entry, regs); | ||
| 2037 | if (current->mm) | ||
| 2038 | regs = task_pt_regs(current); | ||
| 2039 | else | ||
| 2040 | regs = NULL; | ||
| 2041 | } | ||
| 2042 | |||
| 2043 | if (regs) { | ||
| 2044 | perf_callchain_store(entry, PERF_CONTEXT_USER); | ||
| 2045 | perf_callchain_user(entry, regs); | ||
| 2046 | } | ||
| 2047 | |||
| 2048 | exit_put: | ||
| 2049 | put_callchain_entry(rctx); | ||
| 2050 | |||
| 2051 | return entry; | ||
| 2052 | } | ||
| 2053 | |||
| 2054 | /* | ||
| 2055 | * Initialize the perf_event context in a task_struct: | ||
| 2056 | */ | ||
| 2057 | static void __perf_event_init_context(struct perf_event_context *ctx) | ||
| 1772 | { | 2058 | { |
| 1773 | raw_spin_lock_init(&ctx->lock); | 2059 | raw_spin_lock_init(&ctx->lock); |
| 1774 | mutex_init(&ctx->mutex); | 2060 | mutex_init(&ctx->mutex); |
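The callchain code above guards its per-cpu sample buffers with a small recursion counter per execution context; get_recursion_context() picks the slot from in_nmi()/in_irq()/in_softirq() and refuses re-entry in the same context, while get_callchain_buffers()/put_callchain_buffers() lazily allocate and free the buffers under a refcount. The sketch below models just the recursion guard, with the context index passed in directly instead of being derived from the preemption state, and with a single array rather than one per cpu.

#include <stdio.h>

/* Illustrative model of the callchain recursion guard: one flag per
 * execution context (task, softirq, hardirq, NMI). A second attempt to
 * take a slot already held in the same context is refused, which is what
 * keeps a recursive capture from clobbering the buffer. */
#define NR_CONTEXTS 4

static int recursion[NR_CONTEXTS];

static int get_recursion_context(int rctx)
{
        if (recursion[rctx])
                return -1;              /* already capturing in this context */
        recursion[rctx]++;
        return rctx;
}

static void put_recursion_context(int rctx)
{
        recursion[rctx]--;
}

int main(void)
{
        int task_ctx = 0, irq_ctx = 2;

        printf("task:       %d\n", get_recursion_context(task_ctx)); /* 0  */
        printf("task again: %d\n", get_recursion_context(task_ctx)); /* -1 */
        printf("from irq:   %d\n", get_recursion_context(irq_ctx));  /* 2, a different context is fine */
        put_recursion_context(irq_ctx);
        put_recursion_context(task_ctx);
        return 0;
}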
| @@ -1776,45 +2062,38 @@ __perf_event_init_context(struct perf_event_context *ctx, | |||
| 1776 | INIT_LIST_HEAD(&ctx->flexible_groups); | 2062 | INIT_LIST_HEAD(&ctx->flexible_groups); |
| 1777 | INIT_LIST_HEAD(&ctx->event_list); | 2063 | INIT_LIST_HEAD(&ctx->event_list); |
| 1778 | atomic_set(&ctx->refcount, 1); | 2064 | atomic_set(&ctx->refcount, 1); |
| 1779 | ctx->task = task; | ||
| 1780 | } | 2065 | } |
| 1781 | 2066 | ||
| 1782 | static struct perf_event_context *find_get_context(pid_t pid, int cpu) | 2067 | static struct perf_event_context * |
| 2068 | alloc_perf_context(struct pmu *pmu, struct task_struct *task) | ||
| 1783 | { | 2069 | { |
| 1784 | struct perf_event_context *ctx; | 2070 | struct perf_event_context *ctx; |
| 1785 | struct perf_cpu_context *cpuctx; | ||
| 1786 | struct task_struct *task; | ||
| 1787 | unsigned long flags; | ||
| 1788 | int err; | ||
| 1789 | 2071 | ||
| 1790 | if (pid == -1 && cpu != -1) { | 2072 | ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); |
| 1791 | /* Must be root to operate on a CPU event: */ | 2073 | if (!ctx) |
| 1792 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | 2074 | return NULL; |
| 1793 | return ERR_PTR(-EACCES); | ||
| 1794 | |||
| 1795 | if (cpu < 0 || cpu >= nr_cpumask_bits) | ||
| 1796 | return ERR_PTR(-EINVAL); | ||
| 1797 | 2075 | ||
| 1798 | /* | 2076 | __perf_event_init_context(ctx); |
| 1799 | * We could be clever and allow to attach a event to an | 2077 | if (task) { |
| 1800 | * offline CPU and activate it when the CPU comes up, but | 2078 | ctx->task = task; |
| 1801 | * that's for later. | 2079 | get_task_struct(task); |
| 1802 | */ | 2080 | } |
| 1803 | if (!cpu_online(cpu)) | 2081 | ctx->pmu = pmu; |
| 1804 | return ERR_PTR(-ENODEV); | ||
| 1805 | 2082 | ||
| 1806 | cpuctx = &per_cpu(perf_cpu_context, cpu); | 2083 | return ctx; |
| 1807 | ctx = &cpuctx->ctx; | 2084 | } |
| 1808 | get_ctx(ctx); | ||
| 1809 | 2085 | ||
| 1810 | return ctx; | 2086 | static struct task_struct * |
| 1811 | } | 2087 | find_lively_task_by_vpid(pid_t vpid) |
| 2088 | { | ||
| 2089 | struct task_struct *task; | ||
| 2090 | int err; | ||
| 1812 | 2091 | ||
| 1813 | rcu_read_lock(); | 2092 | rcu_read_lock(); |
| 1814 | if (!pid) | 2093 | if (!vpid) |
| 1815 | task = current; | 2094 | task = current; |
| 1816 | else | 2095 | else |
| 1817 | task = find_task_by_vpid(pid); | 2096 | task = find_task_by_vpid(vpid); |
| 1818 | if (task) | 2097 | if (task) |
| 1819 | get_task_struct(task); | 2098 | get_task_struct(task); |
| 1820 | rcu_read_unlock(); | 2099 | rcu_read_unlock(); |
| @@ -1834,36 +2113,78 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) | |||
| 1834 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) | 2113 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) |
| 1835 | goto errout; | 2114 | goto errout; |
| 1836 | 2115 | ||
| 1837 | retry: | 2116 | return task; |
| 1838 | ctx = perf_lock_task_context(task, &flags); | 2117 | errout: |
| 2118 | put_task_struct(task); | ||
| 2119 | return ERR_PTR(err); | ||
| 2120 | |||
| 2121 | } | ||
| 2122 | |||
| 2123 | static struct perf_event_context * | ||
| 2124 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | ||
| 2125 | { | ||
| 2126 | struct perf_event_context *ctx; | ||
| 2127 | struct perf_cpu_context *cpuctx; | ||
| 2128 | unsigned long flags; | ||
| 2129 | int ctxn, err; | ||
| 2130 | |||
| 2131 | if (!task && cpu != -1) { | ||
| 2132 | /* Must be root to operate on a CPU event: */ | ||
| 2133 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | ||
| 2134 | return ERR_PTR(-EACCES); | ||
| 2135 | |||
| 2136 | if (cpu < 0 || cpu >= nr_cpumask_bits) | ||
| 2137 | return ERR_PTR(-EINVAL); | ||
| 2138 | |||
| 2139 | /* | ||
| 2141 | * We could be clever and allow attaching an event to an | ||
| 2141 | * offline CPU and activate it when the CPU comes up, but | ||
| 2142 | * that's for later. | ||
| 2143 | */ | ||
| 2144 | if (!cpu_online(cpu)) | ||
| 2145 | return ERR_PTR(-ENODEV); | ||
| 2146 | |||
| 2147 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
| 2148 | ctx = &cpuctx->ctx; | ||
| 2149 | get_ctx(ctx); | ||
| 2150 | |||
| 2151 | return ctx; | ||
| 2152 | } | ||
| 2153 | |||
| 2154 | err = -EINVAL; | ||
| 2155 | ctxn = pmu->task_ctx_nr; | ||
| 2156 | if (ctxn < 0) | ||
| 2157 | goto errout; | ||
| 2158 | |||
| 2159 | retry: | ||
| 2160 | ctx = perf_lock_task_context(task, ctxn, &flags); | ||
| 1839 | if (ctx) { | 2161 | if (ctx) { |
| 1840 | unclone_ctx(ctx); | 2162 | unclone_ctx(ctx); |
| 1841 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2163 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
| 1842 | } | 2164 | } |
| 1843 | 2165 | ||
| 1844 | if (!ctx) { | 2166 | if (!ctx) { |
| 1845 | ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); | 2167 | ctx = alloc_perf_context(pmu, task); |
| 1846 | err = -ENOMEM; | 2168 | err = -ENOMEM; |
| 1847 | if (!ctx) | 2169 | if (!ctx) |
| 1848 | goto errout; | 2170 | goto errout; |
| 1849 | __perf_event_init_context(ctx, task); | 2171 | |
| 1850 | get_ctx(ctx); | 2172 | get_ctx(ctx); |
| 1851 | if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { | 2173 | |
| 2174 | if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { | ||
| 1852 | /* | 2175 | /* |
| 1853 | * We raced with some other task; use | 2176 | * We raced with some other task; use |
| 1854 | * the context they set. | 2177 | * the context they set. |
| 1855 | */ | 2178 | */ |
| 2179 | put_task_struct(task); | ||
| 1856 | kfree(ctx); | 2180 | kfree(ctx); |
| 1857 | goto retry; | 2181 | goto retry; |
| 1858 | } | 2182 | } |
| 1859 | get_task_struct(task); | ||
| 1860 | } | 2183 | } |
| 1861 | 2184 | ||
| 1862 | put_task_struct(task); | ||
| 1863 | return ctx; | 2185 | return ctx; |
| 1864 | 2186 | ||
| 1865 | errout: | 2187 | errout: |
| 1866 | put_task_struct(task); | ||
| 1867 | return ERR_PTR(err); | 2188 | return ERR_PTR(err); |
| 1868 | } | 2189 | } |
| 1869 | 2190 | ||
| @@ -1880,21 +2201,23 @@ static void free_event_rcu(struct rcu_head *head) | |||
| 1880 | kfree(event); | 2201 | kfree(event); |
| 1881 | } | 2202 | } |
| 1882 | 2203 | ||
| 1883 | static void perf_pending_sync(struct perf_event *event); | ||
| 1884 | static void perf_buffer_put(struct perf_buffer *buffer); | 2204 | static void perf_buffer_put(struct perf_buffer *buffer); |
| 1885 | 2205 | ||
| 1886 | static void free_event(struct perf_event *event) | 2206 | static void free_event(struct perf_event *event) |
| 1887 | { | 2207 | { |
| 1888 | perf_pending_sync(event); | 2208 | irq_work_sync(&event->pending); |
| 1889 | 2209 | ||
| 1890 | if (!event->parent) { | 2210 | if (!event->parent) { |
| 1891 | atomic_dec(&nr_events); | 2211 | if (event->attach_state & PERF_ATTACH_TASK) |
| 2212 | jump_label_dec(&perf_task_events); | ||
| 1892 | if (event->attr.mmap || event->attr.mmap_data) | 2213 | if (event->attr.mmap || event->attr.mmap_data) |
| 1893 | atomic_dec(&nr_mmap_events); | 2214 | atomic_dec(&nr_mmap_events); |
| 1894 | if (event->attr.comm) | 2215 | if (event->attr.comm) |
| 1895 | atomic_dec(&nr_comm_events); | 2216 | atomic_dec(&nr_comm_events); |
| 1896 | if (event->attr.task) | 2217 | if (event->attr.task) |
| 1897 | atomic_dec(&nr_task_events); | 2218 | atomic_dec(&nr_task_events); |
| 2219 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) | ||
| 2220 | put_callchain_buffers(); | ||
| 1898 | } | 2221 | } |
| 1899 | 2222 | ||
| 1900 | if (event->buffer) { | 2223 | if (event->buffer) { |
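free_event() now drops a reference on perf_task_events via jump_label_dec() when the freed event was task-attached (the matching jump_label_inc() happens at event creation, outside this hunk). The plain-counter model below only illustrates the counting discipline that keeps the scheduler hooks cheap while no task events exist; real jump labels go further and patch the branch out of the instruction stream.

#include <stdio.h>

/* Plain-counter model of the perf_task_events accounting. */
static int task_events;

static void event_created(int task_attached)
{
        if (task_attached)
                task_events++;          /* jump_label_inc() analogue */
}

static void event_freed(int task_attached)
{
        if (task_attached)
                task_events--;          /* jump_label_dec() in free_event() */
}

static void sched_hook(void)
{
        if (!task_events)               /* fast path: nothing to do */
                return;
        printf("walking per-task contexts\n");
}

int main(void)
{
        sched_hook();                   /* prints nothing */
        event_created(1);
        sched_hook();                   /* does the work */
        event_freed(1);
        sched_hook();                   /* back to the fast path */
        return 0;
}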
| @@ -1905,7 +2228,9 @@ static void free_event(struct perf_event *event) | |||
| 1905 | if (event->destroy) | 2228 | if (event->destroy) |
| 1906 | event->destroy(event); | 2229 | event->destroy(event); |
| 1907 | 2230 | ||
| 1908 | put_ctx(event->ctx); | 2231 | if (event->ctx) |
| 2232 | put_ctx(event->ctx); | ||
| 2233 | |||
| 1909 | call_rcu(&event->rcu_head, free_event_rcu); | 2234 | call_rcu(&event->rcu_head, free_event_rcu); |
| 1910 | } | 2235 | } |
| 1911 | 2236 | ||
| @@ -2184,15 +2509,13 @@ static void perf_event_for_each(struct perf_event *event, | |||
| 2184 | static int perf_event_period(struct perf_event *event, u64 __user *arg) | 2509 | static int perf_event_period(struct perf_event *event, u64 __user *arg) |
| 2185 | { | 2510 | { |
| 2186 | struct perf_event_context *ctx = event->ctx; | 2511 | struct perf_event_context *ctx = event->ctx; |
| 2187 | unsigned long size; | ||
| 2188 | int ret = 0; | 2512 | int ret = 0; |
| 2189 | u64 value; | 2513 | u64 value; |
| 2190 | 2514 | ||
| 2191 | if (!event->attr.sample_period) | 2515 | if (!event->attr.sample_period) |
| 2192 | return -EINVAL; | 2516 | return -EINVAL; |
| 2193 | 2517 | ||
| 2194 | size = copy_from_user(&value, arg, sizeof(value)); | 2518 | if (copy_from_user(&value, arg, sizeof(value))) |
| 2195 | if (size != sizeof(value)) | ||
| 2196 | return -EFAULT; | 2519 | return -EFAULT; |
| 2197 | 2520 | ||
| 2198 | if (!value) | 2521 | if (!value) |
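The perf_event_period() change above fixes an inverted error check: copy_from_user() returns the number of bytes it could not copy, so success is a return value of 0, while the old size != sizeof(value) test rejected a fully successful copy and would have accepted a completely failed one. A kernel-style sketch of the corrected idiom (a fragment for illustration, not a standalone program):

u64 value;

/* copy_from_user() returns the number of bytes left uncopied; 0 means
 * the whole value was read from user space. */
if (copy_from_user(&value, arg, sizeof(value)))
        return -EFAULT;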
| @@ -2326,6 +2649,9 @@ int perf_event_task_disable(void) | |||
| 2326 | 2649 | ||
| 2327 | static int perf_event_index(struct perf_event *event) | 2650 | static int perf_event_index(struct perf_event *event) |
| 2328 | { | 2651 | { |
| 2652 | if (event->hw.state & PERF_HES_STOPPED) | ||
| 2653 | return 0; | ||
| 2654 | |||
| 2329 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 2655 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
| 2330 | return 0; | 2656 | return 0; |
| 2331 | 2657 | ||
| @@ -2829,16 +3155,7 @@ void perf_event_wakeup(struct perf_event *event) | |||
| 2829 | } | 3155 | } |
| 2830 | } | 3156 | } |
| 2831 | 3157 | ||
| 2832 | /* | 3158 | static void perf_pending_event(struct irq_work *entry) |
| 2833 | * Pending wakeups | ||
| 2834 | * | ||
| 2835 | * Handle the case where we need to wakeup up from NMI (or rq->lock) context. | ||
| 2836 | * | ||
| 2837 | * The NMI bit means we cannot possibly take locks. Therefore, maintain a | ||
| 2838 | * single linked list and use cmpxchg() to add entries lockless. | ||
| 2839 | */ | ||
| 2840 | |||
| 2841 | static void perf_pending_event(struct perf_pending_entry *entry) | ||
| 2842 | { | 3159 | { |
| 2843 | struct perf_event *event = container_of(entry, | 3160 | struct perf_event *event = container_of(entry, |
| 2844 | struct perf_event, pending); | 3161 | struct perf_event, pending); |
| @@ -2854,99 +3171,6 @@ static void perf_pending_event(struct perf_pending_entry *entry) | |||
| 2854 | } | 3171 | } |
| 2855 | } | 3172 | } |
| 2856 | 3173 | ||
| 2857 | #define PENDING_TAIL ((struct perf_pending_entry *)-1UL) | ||
| 2858 | |||
| 2859 | static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { | ||
| 2860 | PENDING_TAIL, | ||
| 2861 | }; | ||
| 2862 | |||
| 2863 | static void perf_pending_queue(struct perf_pending_entry *entry, | ||
| 2864 | void (*func)(struct perf_pending_entry *)) | ||
| 2865 | { | ||
| 2866 | struct perf_pending_entry **head; | ||
| 2867 | |||
| 2868 | if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) | ||
| 2869 | return; | ||
| 2870 | |||
| 2871 | entry->func = func; | ||
| 2872 | |||
| 2873 | head = &get_cpu_var(perf_pending_head); | ||
| 2874 | |||
| 2875 | do { | ||
| 2876 | entry->next = *head; | ||
| 2877 | } while (cmpxchg(head, entry->next, entry) != entry->next); | ||
| 2878 | |||
| 2879 | set_perf_event_pending(); | ||
| 2880 | |||
| 2881 | put_cpu_var(perf_pending_head); | ||
| 2882 | } | ||
| 2883 | |||
| 2884 | static int __perf_pending_run(void) | ||
| 2885 | { | ||
| 2886 | struct perf_pending_entry *list; | ||
| 2887 | int nr = 0; | ||
| 2888 | |||
| 2889 | list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); | ||
| 2890 | while (list != PENDING_TAIL) { | ||
| 2891 | void (*func)(struct perf_pending_entry *); | ||
| 2892 | struct perf_pending_entry *entry = list; | ||
| 2893 | |||
| 2894 | list = list->next; | ||
| 2895 | |||
| 2896 | func = entry->func; | ||
| 2897 | entry->next = NULL; | ||
| 2898 | /* | ||
| 2899 | * Ensure we observe the unqueue before we issue the wakeup, | ||
| 2900 | * so that we won't be waiting forever. | ||
| 2901 | * -- see perf_not_pending(). | ||
| 2902 | */ | ||
| 2903 | smp_wmb(); | ||
| 2904 | |||
| 2905 | func(entry); | ||
| 2906 | nr++; | ||
| 2907 | } | ||
| 2908 | |||
| 2909 | return nr; | ||
| 2910 | } | ||
| 2911 | |||
| 2912 | static inline int perf_not_pending(struct perf_event *event) | ||
| 2913 | { | ||
| 2914 | /* | ||
| 2915 | * If we flush on whatever cpu we run, there is a chance we don't | ||
| 2916 | * need to wait. | ||
| 2917 | */ | ||
| 2918 | get_cpu(); | ||
| 2919 | __perf_pending_run(); | ||
| 2920 | put_cpu(); | ||
| 2921 | |||
| 2922 | /* | ||
| 2923 | * Ensure we see the proper queue state before going to sleep | ||
| 2924 | * so that we do not miss the wakeup. -- see perf_pending_handle() | ||
| 2925 | */ | ||
| 2926 | smp_rmb(); | ||
| 2927 | return event->pending.next == NULL; | ||
| 2928 | } | ||
| 2929 | |||
| 2930 | static void perf_pending_sync(struct perf_event *event) | ||
| 2931 | { | ||
| 2932 | wait_event(event->waitq, perf_not_pending(event)); | ||
| 2933 | } | ||
| 2934 | |||
| 2935 | void perf_event_do_pending(void) | ||
| 2936 | { | ||
| 2937 | __perf_pending_run(); | ||
| 2938 | } | ||
| 2939 | |||
| 2940 | /* | ||
| 2941 | * Callchain support -- arch specific | ||
| 2942 | */ | ||
| 2943 | |||
| 2944 | __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
| 2945 | { | ||
| 2946 | return NULL; | ||
| 2947 | } | ||
| 2948 | |||
| 2949 | |||
| 2950 | /* | 3174 | /* |
| 2951 | * We assume there is only KVM supporting the callbacks. | 3175 | * We assume there is only KVM supporting the callbacks. |
| 2952 | * Later on, we might change it to a list if there is | 3176 | * Later on, we might change it to a list if there is |
| @@ -2996,8 +3220,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) | |||
| 2996 | 3220 | ||
| 2997 | if (handle->nmi) { | 3221 | if (handle->nmi) { |
| 2998 | handle->event->pending_wakeup = 1; | 3222 | handle->event->pending_wakeup = 1; |
| 2999 | perf_pending_queue(&handle->event->pending, | 3223 | irq_work_queue(&handle->event->pending); |
| 3000 | perf_pending_event); | ||
| 3001 | } else | 3224 | } else |
| 3002 | perf_event_wakeup(handle->event); | 3225 | perf_event_wakeup(handle->event); |
| 3003 | } | 3226 | } |
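The hand-rolled perf_pending_queue()/__perf_pending_run() machinery removed above is replaced by the generic irq_work layer added in this series, and perf_output_wakeup() now just calls irq_work_queue() on the event's pending work. The sketch below shows the usual shape of an irq_work user, assuming the <linux/irq_work.h> interface introduced here (init_irq_work(), irq_work_queue(), irq_work_sync()); treat the exact signatures as an assumption rather than a reference, and the my_* names as invented for the example.

#include <linux/kernel.h>
#include <linux/irq_work.h>

struct my_event {
        struct irq_work pending;
        /* ... */
};

/* Runs later in hard-irq context, so it is safe to queue from NMI where
 * locks cannot be taken. */
static void my_pending_cb(struct irq_work *work)
{
        struct my_event *e = container_of(work, struct my_event, pending);

        /* do the wakeup/disable work that was unsafe in the NMI path */
        (void)e;
}

static void my_event_init(struct my_event *e)
{
        init_irq_work(&e->pending, my_pending_cb);
}

static void my_event_nmi_path(struct my_event *e)
{
        irq_work_queue(&e->pending);    /* defer the work out of NMI context */
}

static void my_event_free(struct my_event *e)
{
        irq_work_sync(&e->pending);     /* wait for a queued callback to finish */
}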
| @@ -3053,7 +3276,7 @@ again: | |||
| 3053 | if (handle->wakeup != local_read(&buffer->wakeup)) | 3276 | if (handle->wakeup != local_read(&buffer->wakeup)) |
| 3054 | perf_output_wakeup(handle); | 3277 | perf_output_wakeup(handle); |
| 3055 | 3278 | ||
| 3056 | out: | 3279 | out: |
| 3057 | preempt_enable(); | 3280 | preempt_enable(); |
| 3058 | } | 3281 | } |
| 3059 | 3282 | ||
| @@ -3441,14 +3664,20 @@ static void perf_event_output(struct perf_event *event, int nmi, | |||
| 3441 | struct perf_output_handle handle; | 3664 | struct perf_output_handle handle; |
| 3442 | struct perf_event_header header; | 3665 | struct perf_event_header header; |
| 3443 | 3666 | ||
| 3667 | /* protect the callchain buffers */ | ||
| 3668 | rcu_read_lock(); | ||
| 3669 | |||
| 3444 | perf_prepare_sample(&header, data, event, regs); | 3670 | perf_prepare_sample(&header, data, event, regs); |
| 3445 | 3671 | ||
| 3446 | if (perf_output_begin(&handle, event, header.size, nmi, 1)) | 3672 | if (perf_output_begin(&handle, event, header.size, nmi, 1)) |
| 3447 | return; | 3673 | goto exit; |
| 3448 | 3674 | ||
| 3449 | perf_output_sample(&handle, &header, data, event); | 3675 | perf_output_sample(&handle, &header, data, event); |
| 3450 | 3676 | ||
| 3451 | perf_output_end(&handle); | 3677 | perf_output_end(&handle); |
| 3678 | |||
| 3679 | exit: | ||
| 3680 | rcu_read_unlock(); | ||
| 3452 | } | 3681 | } |
| 3453 | 3682 | ||
| 3454 | /* | 3683 | /* |
| @@ -3562,16 +3791,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, | |||
| 3562 | static void perf_event_task_event(struct perf_task_event *task_event) | 3791 | static void perf_event_task_event(struct perf_task_event *task_event) |
| 3563 | { | 3792 | { |
| 3564 | struct perf_cpu_context *cpuctx; | 3793 | struct perf_cpu_context *cpuctx; |
| 3565 | struct perf_event_context *ctx = task_event->task_ctx; | 3794 | struct perf_event_context *ctx; |
| 3795 | struct pmu *pmu; | ||
| 3796 | int ctxn; | ||
| 3566 | 3797 | ||
| 3567 | rcu_read_lock(); | 3798 | rcu_read_lock(); |
| 3568 | cpuctx = &get_cpu_var(perf_cpu_context); | 3799 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
| 3569 | perf_event_task_ctx(&cpuctx->ctx, task_event); | 3800 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
| 3570 | if (!ctx) | 3801 | perf_event_task_ctx(&cpuctx->ctx, task_event); |
| 3571 | ctx = rcu_dereference(current->perf_event_ctxp); | 3802 | |
| 3572 | if (ctx) | 3803 | ctx = task_event->task_ctx; |
| 3573 | perf_event_task_ctx(ctx, task_event); | 3804 | if (!ctx) { |
| 3574 | put_cpu_var(perf_cpu_context); | 3805 | ctxn = pmu->task_ctx_nr; |
| 3806 | if (ctxn < 0) | ||
| 3807 | goto next; | ||
| 3808 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
| 3809 | } | ||
| 3810 | if (ctx) | ||
| 3811 | perf_event_task_ctx(ctx, task_event); | ||
| 3812 | next: | ||
| 3813 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
| 3814 | } | ||
| 3575 | rcu_read_unlock(); | 3815 | rcu_read_unlock(); |
| 3576 | } | 3816 | } |
| 3577 | 3817 | ||
| @@ -3676,8 +3916,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
| 3676 | { | 3916 | { |
| 3677 | struct perf_cpu_context *cpuctx; | 3917 | struct perf_cpu_context *cpuctx; |
| 3678 | struct perf_event_context *ctx; | 3918 | struct perf_event_context *ctx; |
| 3679 | unsigned int size; | ||
| 3680 | char comm[TASK_COMM_LEN]; | 3919 | char comm[TASK_COMM_LEN]; |
| 3920 | unsigned int size; | ||
| 3921 | struct pmu *pmu; | ||
| 3922 | int ctxn; | ||
| 3681 | 3923 | ||
| 3682 | memset(comm, 0, sizeof(comm)); | 3924 | memset(comm, 0, sizeof(comm)); |
| 3683 | strlcpy(comm, comm_event->task->comm, sizeof(comm)); | 3925 | strlcpy(comm, comm_event->task->comm, sizeof(comm)); |
| @@ -3689,21 +3931,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
| 3689 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; | 3931 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
| 3690 | 3932 | ||
| 3691 | rcu_read_lock(); | 3933 | rcu_read_lock(); |
| 3692 | cpuctx = &get_cpu_var(perf_cpu_context); | 3934 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
| 3693 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); | 3935 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
| 3694 | ctx = rcu_dereference(current->perf_event_ctxp); | 3936 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); |
| 3695 | if (ctx) | 3937 | |
| 3696 | perf_event_comm_ctx(ctx, comm_event); | 3938 | ctxn = pmu->task_ctx_nr; |
| 3697 | put_cpu_var(perf_cpu_context); | 3939 | if (ctxn < 0) |
| 3940 | goto next; | ||
| 3941 | |||
| 3942 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
| 3943 | if (ctx) | ||
| 3944 | perf_event_comm_ctx(ctx, comm_event); | ||
| 3945 | next: | ||
| 3946 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
| 3947 | } | ||
| 3698 | rcu_read_unlock(); | 3948 | rcu_read_unlock(); |
| 3699 | } | 3949 | } |
| 3700 | 3950 | ||
| 3701 | void perf_event_comm(struct task_struct *task) | 3951 | void perf_event_comm(struct task_struct *task) |
| 3702 | { | 3952 | { |
| 3703 | struct perf_comm_event comm_event; | 3953 | struct perf_comm_event comm_event; |
| 3954 | struct perf_event_context *ctx; | ||
| 3955 | int ctxn; | ||
| 3704 | 3956 | ||
| 3705 | if (task->perf_event_ctxp) | 3957 | for_each_task_context_nr(ctxn) { |
| 3706 | perf_event_enable_on_exec(task); | 3958 | ctx = task->perf_event_ctxp[ctxn]; |
| 3959 | if (!ctx) | ||
| 3960 | continue; | ||
| 3961 | |||
| 3962 | perf_event_enable_on_exec(ctx); | ||
| 3963 | } | ||
| 3707 | 3964 | ||
| 3708 | if (!atomic_read(&nr_comm_events)) | 3965 | if (!atomic_read(&nr_comm_events)) |
| 3709 | return; | 3966 | return; |
| @@ -3805,6 +4062,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
| 3805 | char tmp[16]; | 4062 | char tmp[16]; |
| 3806 | char *buf = NULL; | 4063 | char *buf = NULL; |
| 3807 | const char *name; | 4064 | const char *name; |
| 4065 | struct pmu *pmu; | ||
| 4066 | int ctxn; | ||
| 3808 | 4067 | ||
| 3809 | memset(tmp, 0, sizeof(tmp)); | 4068 | memset(tmp, 0, sizeof(tmp)); |
| 3810 | 4069 | ||
| @@ -3857,12 +4116,23 @@ got_name: | |||
| 3857 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; | 4116 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; |
| 3858 | 4117 | ||
| 3859 | rcu_read_lock(); | 4118 | rcu_read_lock(); |
| 3860 | cpuctx = &get_cpu_var(perf_cpu_context); | 4119 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
| 3861 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); | 4120 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
| 3862 | ctx = rcu_dereference(current->perf_event_ctxp); | 4121 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, |
| 3863 | if (ctx) | 4122 | vma->vm_flags & VM_EXEC); |
| 3864 | perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); | 4123 | |
| 3865 | put_cpu_var(perf_cpu_context); | 4124 | ctxn = pmu->task_ctx_nr; |
| 4125 | if (ctxn < 0) | ||
| 4126 | goto next; | ||
| 4127 | |||
| 4128 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | ||
| 4129 | if (ctx) { | ||
| 4130 | perf_event_mmap_ctx(ctx, mmap_event, | ||
| 4131 | vma->vm_flags & VM_EXEC); | ||
| 4132 | } | ||
| 4133 | next: | ||
| 4134 | put_cpu_ptr(pmu->pmu_cpu_context); | ||
| 4135 | } | ||
| 3866 | rcu_read_unlock(); | 4136 | rcu_read_unlock(); |
| 3867 | 4137 | ||
| 3868 | kfree(buf); | 4138 | kfree(buf); |
| @@ -3944,8 +4214,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
| 3944 | struct hw_perf_event *hwc = &event->hw; | 4214 | struct hw_perf_event *hwc = &event->hw; |
| 3945 | int ret = 0; | 4215 | int ret = 0; |
| 3946 | 4216 | ||
| 3947 | throttle = (throttle && event->pmu->unthrottle != NULL); | ||
| 3948 | |||
| 3949 | if (!throttle) { | 4217 | if (!throttle) { |
| 3950 | hwc->interrupts++; | 4218 | hwc->interrupts++; |
| 3951 | } else { | 4219 | } else { |
| @@ -3988,8 +4256,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
| 3988 | event->pending_kill = POLL_HUP; | 4256 | event->pending_kill = POLL_HUP; |
| 3989 | if (nmi) { | 4257 | if (nmi) { |
| 3990 | event->pending_disable = 1; | 4258 | event->pending_disable = 1; |
| 3991 | perf_pending_queue(&event->pending, | 4259 | irq_work_queue(&event->pending); |
| 3992 | perf_pending_event); | ||
| 3993 | } else | 4260 | } else |
| 3994 | perf_event_disable(event); | 4261 | perf_event_disable(event); |
| 3995 | } | 4262 | } |
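Editor's note: above, the NMI-safe deferral of the wakeup/disable work moves from the perf-private perf_pending_queue() to the generic irq_work facility this series introduces (see the irq_work.o additions in the Makefile hunk). A hedged sketch of the basic API, assuming the signatures visible in this series; the "my_" names are hypothetical:

        #include <linux/irq_work.h>

        static void my_deferred(struct irq_work *entry)
        {
                /*
                 * Runs shortly after being queued, outside NMI context, so it
                 * may take locks, wake tasks up, etc.
                 */
        }

        static struct irq_work my_work;

        static void my_setup(void)
        {
                init_irq_work(&my_work, my_deferred);
        }

        static void my_nmi_path(void)
        {
                /* Safe from NMI/IRQ context; an already-pending item is not queued twice. */
                irq_work_queue(&my_work);
        }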
| @@ -4013,6 +4280,17 @@ int perf_event_overflow(struct perf_event *event, int nmi, | |||
| 4013 | * Generic software event infrastructure | 4280 | * Generic software event infrastructure |
| 4014 | */ | 4281 | */ |
| 4015 | 4282 | ||
| 4283 | struct swevent_htable { | ||
| 4284 | struct swevent_hlist *swevent_hlist; | ||
| 4285 | struct mutex hlist_mutex; | ||
| 4286 | int hlist_refcount; | ||
| 4287 | |||
| 4289 | /* Recursion avoidance in each context */ | ||

| 4289 | int recursion[PERF_NR_CONTEXTS]; | ||
| 4290 | }; | ||
| 4291 | |||
| 4292 | static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); | ||
| 4293 | |||
| 4016 | /* | 4294 | /* |
| 4017 | * We directly increment event->count and keep a second value in | 4295 | * We directly increment event->count and keep a second value in |
| 4018 | * event->hw.period_left to count intervals. This period event | 4296 | * event->hw.period_left to count intervals. This period event |
| @@ -4070,7 +4348,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, | |||
| 4070 | } | 4348 | } |
| 4071 | } | 4349 | } |
| 4072 | 4350 | ||
| 4073 | static void perf_swevent_add(struct perf_event *event, u64 nr, | 4351 | static void perf_swevent_event(struct perf_event *event, u64 nr, |
| 4074 | int nmi, struct perf_sample_data *data, | 4352 | int nmi, struct perf_sample_data *data, |
| 4075 | struct pt_regs *regs) | 4353 | struct pt_regs *regs) |
| 4076 | { | 4354 | { |
| @@ -4096,6 +4374,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, | |||
| 4096 | static int perf_exclude_event(struct perf_event *event, | 4374 | static int perf_exclude_event(struct perf_event *event, |
| 4097 | struct pt_regs *regs) | 4375 | struct pt_regs *regs) |
| 4098 | { | 4376 | { |
| 4377 | if (event->hw.state & PERF_HES_STOPPED) | ||
| 4378 | return 0; | ||
| 4379 | |||
| 4099 | if (regs) { | 4380 | if (regs) { |
| 4100 | if (event->attr.exclude_user && user_mode(regs)) | 4381 | if (event->attr.exclude_user && user_mode(regs)) |
| 4101 | return 1; | 4382 | return 1; |
| @@ -4142,11 +4423,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) | |||
| 4142 | 4423 | ||
| 4143 | /* For the read side: events when they trigger */ | 4424 | /* For the read side: events when they trigger */ |
| 4144 | static inline struct hlist_head * | 4425 | static inline struct hlist_head * |
| 4145 | find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) | 4426 | find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) |
| 4146 | { | 4427 | { |
| 4147 | struct swevent_hlist *hlist; | 4428 | struct swevent_hlist *hlist; |
| 4148 | 4429 | ||
| 4149 | hlist = rcu_dereference(ctx->swevent_hlist); | 4430 | hlist = rcu_dereference(swhash->swevent_hlist); |
| 4150 | if (!hlist) | 4431 | if (!hlist) |
| 4151 | return NULL; | 4432 | return NULL; |
| 4152 | 4433 | ||
| @@ -4155,7 +4436,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) | |||
| 4155 | 4436 | ||
| 4156 | /* For the event head insertion and removal in the hlist */ | 4437 | /* For the event head insertion and removal in the hlist */ |
| 4157 | static inline struct hlist_head * | 4438 | static inline struct hlist_head * |
| 4158 | find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) | 4439 | find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) |
| 4159 | { | 4440 | { |
| 4160 | struct swevent_hlist *hlist; | 4441 | struct swevent_hlist *hlist; |
| 4161 | u32 event_id = event->attr.config; | 4442 | u32 event_id = event->attr.config; |
| @@ -4166,7 +4447,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) | |||
| 4166 | * and release. Which makes the protected version suitable here. | 4447 | * and release. Which makes the protected version suitable here. |
| 4167 | * The context lock guarantees that. | 4448 | * The context lock guarantees that. |
| 4168 | */ | 4449 | */ |
| 4169 | hlist = rcu_dereference_protected(ctx->swevent_hlist, | 4450 | hlist = rcu_dereference_protected(swhash->swevent_hlist, |
| 4170 | lockdep_is_held(&event->ctx->lock)); | 4451 | lockdep_is_held(&event->ctx->lock)); |
| 4171 | if (!hlist) | 4452 | if (!hlist) |
| 4172 | return NULL; | 4453 | return NULL; |
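Editor's note: the two lookup helpers above illustrate the reader/updater split: readers use rcu_dereference() under rcu_read_lock(), while update-side code that already holds the serializing lock uses rcu_dereference_protected() with a lockdep expression documenting that lock. A small hedged sketch of the same pattern on a hypothetical pointer:

        #include <linux/lockdep.h>
        #include <linux/rcupdate.h>
        #include <linux/spinlock.h>

        struct my_data {
                int val;
        };

        static DEFINE_SPINLOCK(my_lock);                /* serializes updates to my_ptr */
        static struct my_data __rcu *my_ptr;

        /* Reader side: must run inside rcu_read_lock()/rcu_read_unlock(). */
        static int my_read(void)
        {
                struct my_data *d = rcu_dereference(my_ptr);

                return d ? d->val : 0;
        }

        /*
         * Update side: the caller holds my_lock, so no RCU read section is
         * needed; lockdep checks the claim when CONFIG_PROVE_RCU is set.
         */
        static int my_peek_locked(void)
        {
                struct my_data *d;

                d = rcu_dereference_protected(my_ptr, lockdep_is_held(&my_lock));
                return d ? d->val : 0;
        }

        static void my_publish(struct my_data *new)
        {
                spin_lock(&my_lock);
                rcu_assign_pointer(my_ptr, new);
                spin_unlock(&my_lock);
        }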
| @@ -4179,23 +4460,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | |||
| 4179 | struct perf_sample_data *data, | 4460 | struct perf_sample_data *data, |
| 4180 | struct pt_regs *regs) | 4461 | struct pt_regs *regs) |
| 4181 | { | 4462 | { |
| 4182 | struct perf_cpu_context *cpuctx; | 4463 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
| 4183 | struct perf_event *event; | 4464 | struct perf_event *event; |
| 4184 | struct hlist_node *node; | 4465 | struct hlist_node *node; |
| 4185 | struct hlist_head *head; | 4466 | struct hlist_head *head; |
| 4186 | 4467 | ||
| 4187 | cpuctx = &__get_cpu_var(perf_cpu_context); | ||
| 4188 | |||
| 4189 | rcu_read_lock(); | 4468 | rcu_read_lock(); |
| 4190 | 4469 | head = find_swevent_head_rcu(swhash, type, event_id); | |
| 4191 | head = find_swevent_head_rcu(cpuctx, type, event_id); | ||
| 4192 | |||
| 4193 | if (!head) | 4470 | if (!head) |
| 4194 | goto end; | 4471 | goto end; |
| 4195 | 4472 | ||
| 4196 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 4473 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
| 4197 | if (perf_swevent_match(event, type, event_id, data, regs)) | 4474 | if (perf_swevent_match(event, type, event_id, data, regs)) |
| 4198 | perf_swevent_add(event, nr, nmi, data, regs); | 4475 | perf_swevent_event(event, nr, nmi, data, regs); |
| 4199 | } | 4476 | } |
| 4200 | end: | 4477 | end: |
| 4201 | rcu_read_unlock(); | 4478 | rcu_read_unlock(); |
| @@ -4203,33 +4480,17 @@ end: | |||
| 4203 | 4480 | ||
| 4204 | int perf_swevent_get_recursion_context(void) | 4481 | int perf_swevent_get_recursion_context(void) |
| 4205 | { | 4482 | { |
| 4206 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 4483 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
| 4207 | int rctx; | ||
| 4208 | |||
| 4209 | if (in_nmi()) | ||
| 4210 | rctx = 3; | ||
| 4211 | else if (in_irq()) | ||
| 4212 | rctx = 2; | ||
| 4213 | else if (in_softirq()) | ||
| 4214 | rctx = 1; | ||
| 4215 | else | ||
| 4216 | rctx = 0; | ||
| 4217 | 4484 | ||
| 4218 | if (cpuctx->recursion[rctx]) | 4485 | return get_recursion_context(swhash->recursion); |
| 4219 | return -1; | ||
| 4220 | |||
| 4221 | cpuctx->recursion[rctx]++; | ||
| 4222 | barrier(); | ||
| 4223 | |||
| 4224 | return rctx; | ||
| 4225 | } | 4486 | } |
| 4226 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); | 4487 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); |
| 4227 | 4488 | ||
| 4228 | void inline perf_swevent_put_recursion_context(int rctx) | 4489 | void inline perf_swevent_put_recursion_context(int rctx) |
| 4229 | { | 4490 | { |
| 4230 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 4491 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); |
| 4231 | barrier(); | 4492 | |
| 4232 | cpuctx->recursion[rctx]--; | 4493 | put_recursion_context(swhash->recursion, rctx); |
| 4233 | } | 4494 | } |
| 4234 | 4495 | ||
| 4235 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, | 4496 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, |
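Editor's note: the recursion bookkeeping above moves out of perf_cpu_context (which is now per-PMU) into the new per-CPU swevent_htable, and the open-coded in_nmi()/in_irq()/in_softirq() classification is factored into get_recursion_context()/put_recursion_context(). External callers keep using the exported wrappers; a hedged sketch of the expected calling pattern, where my_emit() stands in for whatever actually injects the event:

        #include <linux/perf_event.h>
        #include <linux/ptrace.h>

        static void my_emit(u64 nr, struct pt_regs *regs);     /* hypothetical */

        static void my_inject(u64 nr, struct pt_regs *regs)
        {
                int rctx;

                rctx = perf_swevent_get_recursion_context();
                if (rctx < 0)
                        return;         /* already inside a swevent at this context level */

                my_emit(nr, regs);

                perf_swevent_put_recursion_context(rctx);
        }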
| @@ -4255,20 +4516,20 @@ static void perf_swevent_read(struct perf_event *event) | |||
| 4255 | { | 4516 | { |
| 4256 | } | 4517 | } |
| 4257 | 4518 | ||
| 4258 | static int perf_swevent_enable(struct perf_event *event) | 4519 | static int perf_swevent_add(struct perf_event *event, int flags) |
| 4259 | { | 4520 | { |
| 4521 | struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); | ||
| 4260 | struct hw_perf_event *hwc = &event->hw; | 4522 | struct hw_perf_event *hwc = &event->hw; |
| 4261 | struct perf_cpu_context *cpuctx; | ||
| 4262 | struct hlist_head *head; | 4523 | struct hlist_head *head; |
| 4263 | 4524 | ||
| 4264 | cpuctx = &__get_cpu_var(perf_cpu_context); | ||
| 4265 | |||
| 4266 | if (hwc->sample_period) { | 4525 | if (hwc->sample_period) { |
| 4267 | hwc->last_period = hwc->sample_period; | 4526 | hwc->last_period = hwc->sample_period; |
| 4268 | perf_swevent_set_period(event); | 4527 | perf_swevent_set_period(event); |
| 4269 | } | 4528 | } |
| 4270 | 4529 | ||
| 4271 | head = find_swevent_head(cpuctx, event); | 4530 | hwc->state = !(flags & PERF_EF_START); |
| 4531 | |||
| 4532 | head = find_swevent_head(swhash, event); | ||
| 4272 | if (WARN_ON_ONCE(!head)) | 4533 | if (WARN_ON_ONCE(!head)) |
| 4273 | return -EINVAL; | 4534 | return -EINVAL; |
| 4274 | 4535 | ||
| @@ -4277,202 +4538,27 @@ static int perf_swevent_enable(struct perf_event *event) | |||
| 4277 | return 0; | 4538 | return 0; |
| 4278 | } | 4539 | } |
| 4279 | 4540 | ||
| 4280 | static void perf_swevent_disable(struct perf_event *event) | 4541 | static void perf_swevent_del(struct perf_event *event, int flags) |
| 4281 | { | 4542 | { |
| 4282 | hlist_del_rcu(&event->hlist_entry); | 4543 | hlist_del_rcu(&event->hlist_entry); |
| 4283 | } | 4544 | } |
| 4284 | 4545 | ||
| 4285 | static void perf_swevent_void(struct perf_event *event) | 4546 | static void perf_swevent_start(struct perf_event *event, int flags) |
| 4286 | { | ||
| 4287 | } | ||
| 4288 | |||
| 4289 | static int perf_swevent_int(struct perf_event *event) | ||
| 4290 | { | ||
| 4291 | return 0; | ||
| 4292 | } | ||
| 4293 | |||
| 4294 | static const struct pmu perf_ops_generic = { | ||
| 4295 | .enable = perf_swevent_enable, | ||
| 4296 | .disable = perf_swevent_disable, | ||
| 4297 | .start = perf_swevent_int, | ||
| 4298 | .stop = perf_swevent_void, | ||
| 4299 | .read = perf_swevent_read, | ||
| 4300 | .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ | ||
| 4301 | }; | ||
| 4302 | |||
| 4303 | /* | ||
| 4304 | * hrtimer based swevent callback | ||
| 4305 | */ | ||
| 4306 | |||
| 4307 | static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | ||
| 4308 | { | ||
| 4309 | enum hrtimer_restart ret = HRTIMER_RESTART; | ||
| 4310 | struct perf_sample_data data; | ||
| 4311 | struct pt_regs *regs; | ||
| 4312 | struct perf_event *event; | ||
| 4313 | u64 period; | ||
| 4314 | |||
| 4315 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); | ||
| 4316 | event->pmu->read(event); | ||
| 4317 | |||
| 4318 | perf_sample_data_init(&data, 0); | ||
| 4319 | data.period = event->hw.last_period; | ||
| 4320 | regs = get_irq_regs(); | ||
| 4321 | |||
| 4322 | if (regs && !perf_exclude_event(event, regs)) { | ||
| 4323 | if (!(event->attr.exclude_idle && current->pid == 0)) | ||
| 4324 | if (perf_event_overflow(event, 0, &data, regs)) | ||
| 4325 | ret = HRTIMER_NORESTART; | ||
| 4326 | } | ||
| 4327 | |||
| 4328 | period = max_t(u64, 10000, event->hw.sample_period); | ||
| 4329 | hrtimer_forward_now(hrtimer, ns_to_ktime(period)); | ||
| 4330 | |||
| 4331 | return ret; | ||
| 4332 | } | ||
| 4333 | |||
| 4334 | static void perf_swevent_start_hrtimer(struct perf_event *event) | ||
| 4335 | { | ||
| 4336 | struct hw_perf_event *hwc = &event->hw; | ||
| 4337 | |||
| 4338 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 4339 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
| 4340 | if (hwc->sample_period) { | ||
| 4341 | u64 period; | ||
| 4342 | |||
| 4343 | if (hwc->remaining) { | ||
| 4344 | if (hwc->remaining < 0) | ||
| 4345 | period = 10000; | ||
| 4346 | else | ||
| 4347 | period = hwc->remaining; | ||
| 4348 | hwc->remaining = 0; | ||
| 4349 | } else { | ||
| 4350 | period = max_t(u64, 10000, hwc->sample_period); | ||
| 4351 | } | ||
| 4352 | __hrtimer_start_range_ns(&hwc->hrtimer, | ||
| 4353 | ns_to_ktime(period), 0, | ||
| 4354 | HRTIMER_MODE_REL, 0); | ||
| 4355 | } | ||
| 4356 | } | ||
| 4357 | |||
| 4358 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) | ||
| 4359 | { | ||
| 4360 | struct hw_perf_event *hwc = &event->hw; | ||
| 4361 | |||
| 4362 | if (hwc->sample_period) { | ||
| 4363 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); | ||
| 4364 | hwc->remaining = ktime_to_ns(remaining); | ||
| 4365 | |||
| 4366 | hrtimer_cancel(&hwc->hrtimer); | ||
| 4367 | } | ||
| 4368 | } | ||
| 4369 | |||
| 4370 | /* | ||
| 4371 | * Software event: cpu wall time clock | ||
| 4372 | */ | ||
| 4373 | |||
| 4374 | static void cpu_clock_perf_event_update(struct perf_event *event) | ||
| 4375 | { | ||
| 4376 | int cpu = raw_smp_processor_id(); | ||
| 4377 | s64 prev; | ||
| 4378 | u64 now; | ||
| 4379 | |||
| 4380 | now = cpu_clock(cpu); | ||
| 4381 | prev = local64_xchg(&event->hw.prev_count, now); | ||
| 4382 | local64_add(now - prev, &event->count); | ||
| 4383 | } | ||
| 4384 | |||
| 4385 | static int cpu_clock_perf_event_enable(struct perf_event *event) | ||
| 4386 | { | ||
| 4387 | struct hw_perf_event *hwc = &event->hw; | ||
| 4388 | int cpu = raw_smp_processor_id(); | ||
| 4389 | |||
| 4390 | local64_set(&hwc->prev_count, cpu_clock(cpu)); | ||
| 4391 | perf_swevent_start_hrtimer(event); | ||
| 4392 | |||
| 4393 | return 0; | ||
| 4394 | } | ||
| 4395 | |||
| 4396 | static void cpu_clock_perf_event_disable(struct perf_event *event) | ||
| 4397 | { | 4547 | { |
| 4398 | perf_swevent_cancel_hrtimer(event); | 4548 | event->hw.state = 0; |
| 4399 | cpu_clock_perf_event_update(event); | ||
| 4400 | } | ||
| 4401 | |||
| 4402 | static void cpu_clock_perf_event_read(struct perf_event *event) | ||
| 4403 | { | ||
| 4404 | cpu_clock_perf_event_update(event); | ||
| 4405 | } | ||
| 4406 | |||
| 4407 | static const struct pmu perf_ops_cpu_clock = { | ||
| 4408 | .enable = cpu_clock_perf_event_enable, | ||
| 4409 | .disable = cpu_clock_perf_event_disable, | ||
| 4410 | .read = cpu_clock_perf_event_read, | ||
| 4411 | }; | ||
| 4412 | |||
| 4413 | /* | ||
| 4414 | * Software event: task time clock | ||
| 4415 | */ | ||
| 4416 | |||
| 4417 | static void task_clock_perf_event_update(struct perf_event *event, u64 now) | ||
| 4418 | { | ||
| 4419 | u64 prev; | ||
| 4420 | s64 delta; | ||
| 4421 | |||
| 4422 | prev = local64_xchg(&event->hw.prev_count, now); | ||
| 4423 | delta = now - prev; | ||
| 4424 | local64_add(delta, &event->count); | ||
| 4425 | } | ||
| 4426 | |||
| 4427 | static int task_clock_perf_event_enable(struct perf_event *event) | ||
| 4428 | { | ||
| 4429 | struct hw_perf_event *hwc = &event->hw; | ||
| 4430 | u64 now; | ||
| 4431 | |||
| 4432 | now = event->ctx->time; | ||
| 4433 | |||
| 4434 | local64_set(&hwc->prev_count, now); | ||
| 4435 | |||
| 4436 | perf_swevent_start_hrtimer(event); | ||
| 4437 | |||
| 4438 | return 0; | ||
| 4439 | } | 4549 | } |
| 4440 | 4550 | ||
| 4441 | static void task_clock_perf_event_disable(struct perf_event *event) | 4551 | static void perf_swevent_stop(struct perf_event *event, int flags) |
| 4442 | { | 4552 | { |
| 4443 | perf_swevent_cancel_hrtimer(event); | 4553 | event->hw.state = PERF_HES_STOPPED; |
| 4444 | task_clock_perf_event_update(event, event->ctx->time); | ||
| 4445 | |||
| 4446 | } | 4554 | } |
| 4447 | 4555 | ||
| 4448 | static void task_clock_perf_event_read(struct perf_event *event) | ||
| 4449 | { | ||
| 4450 | u64 time; | ||
| 4451 | |||
| 4452 | if (!in_nmi()) { | ||
| 4453 | update_context_time(event->ctx); | ||
| 4454 | time = event->ctx->time; | ||
| 4455 | } else { | ||
| 4456 | u64 now = perf_clock(); | ||
| 4457 | u64 delta = now - event->ctx->timestamp; | ||
| 4458 | time = event->ctx->time + delta; | ||
| 4459 | } | ||
| 4460 | |||
| 4461 | task_clock_perf_event_update(event, time); | ||
| 4462 | } | ||
| 4463 | |||
| 4464 | static const struct pmu perf_ops_task_clock = { | ||
| 4465 | .enable = task_clock_perf_event_enable, | ||
| 4466 | .disable = task_clock_perf_event_disable, | ||
| 4467 | .read = task_clock_perf_event_read, | ||
| 4468 | }; | ||
| 4469 | |||
| 4470 | /* Deref the hlist from the update side */ | 4556 | /* Deref the hlist from the update side */ |
| 4471 | static inline struct swevent_hlist * | 4557 | static inline struct swevent_hlist * |
| 4472 | swevent_hlist_deref(struct perf_cpu_context *cpuctx) | 4558 | swevent_hlist_deref(struct swevent_htable *swhash) |
| 4473 | { | 4559 | { |
| 4474 | return rcu_dereference_protected(cpuctx->swevent_hlist, | 4560 | return rcu_dereference_protected(swhash->swevent_hlist, |
| 4475 | lockdep_is_held(&cpuctx->hlist_mutex)); | 4561 | lockdep_is_held(&swhash->hlist_mutex)); |
| 4476 | } | 4562 | } |
| 4477 | 4563 | ||
| 4478 | static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) | 4564 | static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) |
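Editor's note: the hunk above carries the core of the PMU-method rework — the old enable/disable/unthrottle callbacks become add/del (attach the event to, or detach it from, the PMU) plus start/stop (actually start or pause counting), driven by the PERF_EF_START/PERF_EF_UPDATE flags and the PERF_HES_STOPPED bit in hw.state; the hrtimer, cpu-clock and task-clock code removed here reappears, converted, later in this diff. A hedged sketch of the minimal state machine, modelled on perf_swevent_add()/_del()/_start()/_stop() above ("my_" names are hypothetical):

        #include <linux/perf_event.h>

        static int my_event_add(struct perf_event *event, int flags)
        {
                /* ->add() may install the event stopped; only PERF_EF_START means "count now". */
                event->hw.state = (flags & PERF_EF_START) ? 0 : PERF_HES_STOPPED;
                /* hook the event into whatever per-CPU structure dispatches to it */
                return 0;
        }

        static void my_event_del(struct perf_event *event, int flags)
        {
                /* unhook it; PERF_EF_UPDATE in flags asks for a final count update */
        }

        static void my_event_start(struct perf_event *event, int flags)
        {
                event->hw.state = 0;
        }

        static void my_event_stop(struct perf_event *event, int flags)
        {
                event->hw.state = PERF_HES_STOPPED;
        }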
| @@ -4483,27 +4569,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) | |||
| 4483 | kfree(hlist); | 4569 | kfree(hlist); |
| 4484 | } | 4570 | } |
| 4485 | 4571 | ||
| 4486 | static void swevent_hlist_release(struct perf_cpu_context *cpuctx) | 4572 | static void swevent_hlist_release(struct swevent_htable *swhash) |
| 4487 | { | 4573 | { |
| 4488 | struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); | 4574 | struct swevent_hlist *hlist = swevent_hlist_deref(swhash); |
| 4489 | 4575 | ||
| 4490 | if (!hlist) | 4576 | if (!hlist) |
| 4491 | return; | 4577 | return; |
| 4492 | 4578 | ||
| 4493 | rcu_assign_pointer(cpuctx->swevent_hlist, NULL); | 4579 | rcu_assign_pointer(swhash->swevent_hlist, NULL); |
| 4494 | call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); | 4580 | call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); |
| 4495 | } | 4581 | } |
| 4496 | 4582 | ||
| 4497 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) | 4583 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) |
| 4498 | { | 4584 | { |
| 4499 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 4585 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
| 4500 | 4586 | ||
| 4501 | mutex_lock(&cpuctx->hlist_mutex); | 4587 | mutex_lock(&swhash->hlist_mutex); |
| 4502 | 4588 | ||
| 4503 | if (!--cpuctx->hlist_refcount) | 4589 | if (!--swhash->hlist_refcount) |
| 4504 | swevent_hlist_release(cpuctx); | 4590 | swevent_hlist_release(swhash); |
| 4505 | 4591 | ||
| 4506 | mutex_unlock(&cpuctx->hlist_mutex); | 4592 | mutex_unlock(&swhash->hlist_mutex); |
| 4507 | } | 4593 | } |
| 4508 | 4594 | ||
| 4509 | static void swevent_hlist_put(struct perf_event *event) | 4595 | static void swevent_hlist_put(struct perf_event *event) |
| @@ -4521,12 +4607,12 @@ static void swevent_hlist_put(struct perf_event *event) | |||
| 4521 | 4607 | ||
| 4522 | static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) | 4608 | static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) |
| 4523 | { | 4609 | { |
| 4524 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 4610 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
| 4525 | int err = 0; | 4611 | int err = 0; |
| 4526 | 4612 | ||
| 4527 | mutex_lock(&cpuctx->hlist_mutex); | 4613 | mutex_lock(&swhash->hlist_mutex); |
| 4528 | 4614 | ||
| 4529 | if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { | 4615 | if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { |
| 4530 | struct swevent_hlist *hlist; | 4616 | struct swevent_hlist *hlist; |
| 4531 | 4617 | ||
| 4532 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); | 4618 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); |
| @@ -4534,11 +4620,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) | |||
| 4534 | err = -ENOMEM; | 4620 | err = -ENOMEM; |
| 4535 | goto exit; | 4621 | goto exit; |
| 4536 | } | 4622 | } |
| 4537 | rcu_assign_pointer(cpuctx->swevent_hlist, hlist); | 4623 | rcu_assign_pointer(swhash->swevent_hlist, hlist); |
| 4538 | } | 4624 | } |
| 4539 | cpuctx->hlist_refcount++; | 4625 | swhash->hlist_refcount++; |
| 4540 | exit: | 4626 | exit: |
| 4541 | mutex_unlock(&cpuctx->hlist_mutex); | 4627 | mutex_unlock(&swhash->hlist_mutex); |
| 4542 | 4628 | ||
| 4543 | return err; | 4629 | return err; |
| 4544 | } | 4630 | } |
| @@ -4562,7 +4648,7 @@ static int swevent_hlist_get(struct perf_event *event) | |||
| 4562 | put_online_cpus(); | 4648 | put_online_cpus(); |
| 4563 | 4649 | ||
| 4564 | return 0; | 4650 | return 0; |
| 4565 | fail: | 4651 | fail: |
| 4566 | for_each_possible_cpu(cpu) { | 4652 | for_each_possible_cpu(cpu) { |
| 4567 | if (cpu == failed_cpu) | 4653 | if (cpu == failed_cpu) |
| 4568 | break; | 4654 | break; |
| @@ -4573,17 +4659,64 @@ static int swevent_hlist_get(struct perf_event *event) | |||
| 4573 | return err; | 4659 | return err; |
| 4574 | } | 4660 | } |
| 4575 | 4661 | ||
| 4576 | #ifdef CONFIG_EVENT_TRACING | 4662 | atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; |
| 4663 | |||
| 4664 | static void sw_perf_event_destroy(struct perf_event *event) | ||
| 4665 | { | ||
| 4666 | u64 event_id = event->attr.config; | ||
| 4667 | |||
| 4668 | WARN_ON(event->parent); | ||
| 4577 | 4669 | ||
| 4578 | static const struct pmu perf_ops_tracepoint = { | 4670 | jump_label_dec(&perf_swevent_enabled[event_id]); |
| 4579 | .enable = perf_trace_enable, | 4671 | swevent_hlist_put(event); |
| 4580 | .disable = perf_trace_disable, | 4672 | } |
| 4581 | .start = perf_swevent_int, | 4673 | |
| 4582 | .stop = perf_swevent_void, | 4674 | static int perf_swevent_init(struct perf_event *event) |
| 4675 | { | ||
| 4676 | int event_id = event->attr.config; | ||
| 4677 | |||
| 4678 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
| 4679 | return -ENOENT; | ||
| 4680 | |||
| 4681 | switch (event_id) { | ||
| 4682 | case PERF_COUNT_SW_CPU_CLOCK: | ||
| 4683 | case PERF_COUNT_SW_TASK_CLOCK: | ||
| 4684 | return -ENOENT; | ||
| 4685 | |||
| 4686 | default: | ||
| 4687 | break; | ||
| 4688 | } | ||
| 4689 | |||
| 4690 | if (event_id > PERF_COUNT_SW_MAX) | ||
| 4691 | return -ENOENT; | ||
| 4692 | |||
| 4693 | if (!event->parent) { | ||
| 4694 | int err; | ||
| 4695 | |||
| 4696 | err = swevent_hlist_get(event); | ||
| 4697 | if (err) | ||
| 4698 | return err; | ||
| 4699 | |||
| 4700 | jump_label_inc(&perf_swevent_enabled[event_id]); | ||
| 4701 | event->destroy = sw_perf_event_destroy; | ||
| 4702 | } | ||
| 4703 | |||
| 4704 | return 0; | ||
| 4705 | } | ||
| 4706 | |||
| 4707 | static struct pmu perf_swevent = { | ||
| 4708 | .task_ctx_nr = perf_sw_context, | ||
| 4709 | |||
| 4710 | .event_init = perf_swevent_init, | ||
| 4711 | .add = perf_swevent_add, | ||
| 4712 | .del = perf_swevent_del, | ||
| 4713 | .start = perf_swevent_start, | ||
| 4714 | .stop = perf_swevent_stop, | ||
| 4583 | .read = perf_swevent_read, | 4715 | .read = perf_swevent_read, |
| 4584 | .unthrottle = perf_swevent_void, | ||
| 4585 | }; | 4716 | }; |
| 4586 | 4717 | ||
| 4718 | #ifdef CONFIG_EVENT_TRACING | ||
| 4719 | |||
| 4587 | static int perf_tp_filter_match(struct perf_event *event, | 4720 | static int perf_tp_filter_match(struct perf_event *event, |
| 4588 | struct perf_sample_data *data) | 4721 | struct perf_sample_data *data) |
| 4589 | { | 4722 | { |
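Editor's note: with the per-type constructors gone, event classification is inverted — instead of a switch on attr.type picking a const struct pmu, every registered PMU inspects the event in its event_init() method and declines it with -ENOENT (any other error aborts creation outright). A hedged sketch of a minimal PMU written to these conventions; MY_PMU_TYPE and the my_event_* callbacks (prototyped from the add/del/start/stop sketch earlier) are assumptions:

        #include <linux/errno.h>
        #include <linux/perf_event.h>

        #define MY_PMU_TYPE     42      /* hypothetical attr.type value */

        /* Callbacks as in the add/del/start/stop sketch above. */
        static int  my_event_add(struct perf_event *event, int flags);
        static void my_event_del(struct perf_event *event, int flags);
        static void my_event_start(struct perf_event *event, int flags);
        static void my_event_stop(struct perf_event *event, int flags);

        static void my_event_read(struct perf_event *event)
        {
                /* nothing to fold in for a purely software count */
        }

        static int my_event_init(struct perf_event *event)
        {
                if (event->attr.type != MY_PMU_TYPE)
                        return -ENOENT;         /* not ours; let the next PMU try */

                /* validate attr.config, set event->destroy, grab resources, ... */
                return 0;
        }

        static struct pmu my_pmu = {
                .task_ctx_nr    = perf_sw_context,

                .event_init     = my_event_init,
                .add            = my_event_add,
                .del            = my_event_del,
                .start          = my_event_start,
                .stop           = my_event_stop,
                .read           = my_event_read,
        };

Registration is then a single perf_pmu_register(&my_pmu) call, which is exactly what perf_tp_register() above does for the tracepoint PMU.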
| @@ -4627,7 +4760,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
| 4627 | 4760 | ||
| 4628 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 4761 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
| 4629 | if (perf_tp_event_match(event, &data, regs)) | 4762 | if (perf_tp_event_match(event, &data, regs)) |
| 4630 | perf_swevent_add(event, count, 1, &data, regs); | 4763 | perf_swevent_event(event, count, 1, &data, regs); |
| 4631 | } | 4764 | } |
| 4632 | 4765 | ||
| 4633 | perf_swevent_put_recursion_context(rctx); | 4766 | perf_swevent_put_recursion_context(rctx); |
| @@ -4639,10 +4772,13 @@ static void tp_perf_event_destroy(struct perf_event *event) | |||
| 4639 | perf_trace_destroy(event); | 4772 | perf_trace_destroy(event); |
| 4640 | } | 4773 | } |
| 4641 | 4774 | ||
| 4642 | static const struct pmu *tp_perf_event_init(struct perf_event *event) | 4775 | static int perf_tp_event_init(struct perf_event *event) |
| 4643 | { | 4776 | { |
| 4644 | int err; | 4777 | int err; |
| 4645 | 4778 | ||
| 4779 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | ||
| 4780 | return -ENOENT; | ||
| 4781 | |||
| 4646 | /* | 4782 | /* |
| 4647 | * Raw tracepoint data is a severe data leak, only allow root to | 4783 | * Raw tracepoint data is a severe data leak, only allow root to |
| 4648 | * have these. | 4784 | * have these. |
| @@ -4650,15 +4786,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) | |||
| 4650 | if ((event->attr.sample_type & PERF_SAMPLE_RAW) && | 4786 | if ((event->attr.sample_type & PERF_SAMPLE_RAW) && |
| 4651 | perf_paranoid_tracepoint_raw() && | 4787 | perf_paranoid_tracepoint_raw() && |
| 4652 | !capable(CAP_SYS_ADMIN)) | 4788 | !capable(CAP_SYS_ADMIN)) |
| 4653 | return ERR_PTR(-EPERM); | 4789 | return -EPERM; |
| 4654 | 4790 | ||
| 4655 | err = perf_trace_init(event); | 4791 | err = perf_trace_init(event); |
| 4656 | if (err) | 4792 | if (err) |
| 4657 | return NULL; | 4793 | return err; |
| 4658 | 4794 | ||
| 4659 | event->destroy = tp_perf_event_destroy; | 4795 | event->destroy = tp_perf_event_destroy; |
| 4660 | 4796 | ||
| 4661 | return &perf_ops_tracepoint; | 4797 | return 0; |
| 4798 | } | ||
| 4799 | |||
| 4800 | static struct pmu perf_tracepoint = { | ||
| 4801 | .task_ctx_nr = perf_sw_context, | ||
| 4802 | |||
| 4803 | .event_init = perf_tp_event_init, | ||
| 4804 | .add = perf_trace_add, | ||
| 4805 | .del = perf_trace_del, | ||
| 4806 | .start = perf_swevent_start, | ||
| 4807 | .stop = perf_swevent_stop, | ||
| 4808 | .read = perf_swevent_read, | ||
| 4809 | }; | ||
| 4810 | |||
| 4811 | static inline void perf_tp_register(void) | ||
| 4812 | { | ||
| 4813 | perf_pmu_register(&perf_tracepoint); | ||
| 4662 | } | 4814 | } |
| 4663 | 4815 | ||
| 4664 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 4816 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
| @@ -4686,9 +4838,8 @@ static void perf_event_free_filter(struct perf_event *event) | |||
| 4686 | 4838 | ||
| 4687 | #else | 4839 | #else |
| 4688 | 4840 | ||
| 4689 | static const struct pmu *tp_perf_event_init(struct perf_event *event) | 4841 | static inline void perf_tp_register(void) |
| 4690 | { | 4842 | { |
| 4691 | return NULL; | ||
| 4692 | } | 4843 | } |
| 4693 | 4844 | ||
| 4694 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 4845 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
| @@ -4703,105 +4854,389 @@ static void perf_event_free_filter(struct perf_event *event) | |||
| 4703 | #endif /* CONFIG_EVENT_TRACING */ | 4854 | #endif /* CONFIG_EVENT_TRACING */ |
| 4704 | 4855 | ||
| 4705 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | 4856 | #ifdef CONFIG_HAVE_HW_BREAKPOINT |
| 4706 | static void bp_perf_event_destroy(struct perf_event *event) | 4857 | void perf_bp_event(struct perf_event *bp, void *data) |
| 4707 | { | 4858 | { |
| 4708 | release_bp_slot(event); | 4859 | struct perf_sample_data sample; |
| 4860 | struct pt_regs *regs = data; | ||
| 4861 | |||
| 4862 | perf_sample_data_init(&sample, bp->attr.bp_addr); | ||
| 4863 | |||
| 4864 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) | ||
| 4865 | perf_swevent_event(bp, 1, 1, &sample, regs); | ||
| 4709 | } | 4866 | } |
| 4867 | #endif | ||
| 4868 | |||
| 4869 | /* | ||
| 4870 | * hrtimer based swevent callback | ||
| 4871 | */ | ||
| 4710 | 4872 | ||
| 4711 | static const struct pmu *bp_perf_event_init(struct perf_event *bp) | 4873 | static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) |
| 4712 | { | 4874 | { |
| 4713 | int err; | 4875 | enum hrtimer_restart ret = HRTIMER_RESTART; |
| 4876 | struct perf_sample_data data; | ||
| 4877 | struct pt_regs *regs; | ||
| 4878 | struct perf_event *event; | ||
| 4879 | u64 period; | ||
| 4714 | 4880 | ||
| 4715 | err = register_perf_hw_breakpoint(bp); | 4881 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); |
| 4716 | if (err) | 4882 | event->pmu->read(event); |
| 4717 | return ERR_PTR(err); | ||
| 4718 | 4883 | ||
| 4719 | bp->destroy = bp_perf_event_destroy; | 4884 | perf_sample_data_init(&data, 0); |
| 4885 | data.period = event->hw.last_period; | ||
| 4886 | regs = get_irq_regs(); | ||
| 4720 | 4887 | ||
| 4721 | return &perf_ops_bp; | 4888 | if (regs && !perf_exclude_event(event, regs)) { |
| 4889 | if (!(event->attr.exclude_idle && current->pid == 0)) | ||
| 4890 | if (perf_event_overflow(event, 0, &data, regs)) | ||
| 4891 | ret = HRTIMER_NORESTART; | ||
| 4892 | } | ||
| 4893 | |||
| 4894 | period = max_t(u64, 10000, event->hw.sample_period); | ||
| 4895 | hrtimer_forward_now(hrtimer, ns_to_ktime(period)); | ||
| 4896 | |||
| 4897 | return ret; | ||
| 4722 | } | 4898 | } |
| 4723 | 4899 | ||
| 4724 | void perf_bp_event(struct perf_event *bp, void *data) | 4900 | static void perf_swevent_start_hrtimer(struct perf_event *event) |
| 4725 | { | 4901 | { |
| 4726 | struct perf_sample_data sample; | 4902 | struct hw_perf_event *hwc = &event->hw; |
| 4727 | struct pt_regs *regs = data; | ||
| 4728 | 4903 | ||
| 4729 | perf_sample_data_init(&sample, bp->attr.bp_addr); | 4904 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
| 4905 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
| 4906 | if (hwc->sample_period) { | ||
| 4907 | s64 period = local64_read(&hwc->period_left); | ||
| 4908 | |||
| 4909 | if (period) { | ||
| 4910 | if (period < 0) | ||
| 4911 | period = 10000; | ||
| 4912 | |||
| 4913 | local64_set(&hwc->period_left, 0); | ||
| 4914 | } else { | ||
| 4915 | period = max_t(u64, 10000, hwc->sample_period); | ||
| 4916 | } | ||
| 4917 | __hrtimer_start_range_ns(&hwc->hrtimer, | ||
| 4918 | ns_to_ktime(period), 0, | ||
| 4919 | HRTIMER_MODE_REL_PINNED, 0); | ||
| 4920 | } | ||
| 4921 | } | ||
| 4730 | 4922 | ||
| 4731 | if (!perf_exclude_event(bp, regs)) | 4923 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) |
| 4732 | perf_swevent_add(bp, 1, 1, &sample, regs); | 4924 | { |
| 4925 | struct hw_perf_event *hwc = &event->hw; | ||
| 4926 | |||
| 4927 | if (hwc->sample_period) { | ||
| 4928 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); | ||
| 4929 | local64_set(&hwc->period_left, ktime_to_ns(remaining)); | ||
| 4930 | |||
| 4931 | hrtimer_cancel(&hwc->hrtimer); | ||
| 4932 | } | ||
| 4733 | } | 4933 | } |
| 4734 | #else | 4934 | |
| 4735 | static const struct pmu *bp_perf_event_init(struct perf_event *bp) | 4935 | /* |
| 4936 | * Software event: cpu wall time clock | ||
| 4937 | */ | ||
| 4938 | |||
| 4939 | static void cpu_clock_event_update(struct perf_event *event) | ||
| 4736 | { | 4940 | { |
| 4737 | return NULL; | 4941 | s64 prev; |
| 4942 | u64 now; | ||
| 4943 | |||
| 4944 | now = local_clock(); | ||
| 4945 | prev = local64_xchg(&event->hw.prev_count, now); | ||
| 4946 | local64_add(now - prev, &event->count); | ||
| 4738 | } | 4947 | } |
| 4739 | 4948 | ||
| 4740 | void perf_bp_event(struct perf_event *bp, void *regs) | 4949 | static void cpu_clock_event_start(struct perf_event *event, int flags) |
| 4741 | { | 4950 | { |
| 4951 | local64_set(&event->hw.prev_count, local_clock()); | ||
| 4952 | perf_swevent_start_hrtimer(event); | ||
| 4742 | } | 4953 | } |
| 4743 | #endif | ||
| 4744 | 4954 | ||
| 4745 | atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; | 4955 | static void cpu_clock_event_stop(struct perf_event *event, int flags) |
| 4956 | { | ||
| 4957 | perf_swevent_cancel_hrtimer(event); | ||
| 4958 | cpu_clock_event_update(event); | ||
| 4959 | } | ||
| 4746 | 4960 | ||
| 4747 | static void sw_perf_event_destroy(struct perf_event *event) | 4961 | static int cpu_clock_event_add(struct perf_event *event, int flags) |
| 4748 | { | 4962 | { |
| 4749 | u64 event_id = event->attr.config; | 4963 | if (flags & PERF_EF_START) |
| 4964 | cpu_clock_event_start(event, flags); | ||
| 4750 | 4965 | ||
| 4751 | WARN_ON(event->parent); | 4966 | return 0; |
| 4967 | } | ||
| 4752 | 4968 | ||
| 4753 | atomic_dec(&perf_swevent_enabled[event_id]); | 4969 | static void cpu_clock_event_del(struct perf_event *event, int flags) |
| 4754 | swevent_hlist_put(event); | 4970 | { |
| 4971 | cpu_clock_event_stop(event, flags); | ||
| 4755 | } | 4972 | } |
| 4756 | 4973 | ||
| 4757 | static const struct pmu *sw_perf_event_init(struct perf_event *event) | 4974 | static void cpu_clock_event_read(struct perf_event *event) |
| 4758 | { | 4975 | { |
| 4759 | const struct pmu *pmu = NULL; | 4976 | cpu_clock_event_update(event); |
| 4760 | u64 event_id = event->attr.config; | 4977 | } |
| 4978 | |||
| 4979 | static int cpu_clock_event_init(struct perf_event *event) | ||
| 4980 | { | ||
| 4981 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
| 4982 | return -ENOENT; | ||
| 4983 | |||
| 4984 | if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) | ||
| 4985 | return -ENOENT; | ||
| 4761 | 4986 | ||
| 4987 | return 0; | ||
| 4988 | } | ||
| 4989 | |||
| 4990 | static struct pmu perf_cpu_clock = { | ||
| 4991 | .task_ctx_nr = perf_sw_context, | ||
| 4992 | |||
| 4993 | .event_init = cpu_clock_event_init, | ||
| 4994 | .add = cpu_clock_event_add, | ||
| 4995 | .del = cpu_clock_event_del, | ||
| 4996 | .start = cpu_clock_event_start, | ||
| 4997 | .stop = cpu_clock_event_stop, | ||
| 4998 | .read = cpu_clock_event_read, | ||
| 4999 | }; | ||
| 5000 | |||
| 5001 | /* | ||
| 5002 | * Software event: task time clock | ||
| 5003 | */ | ||
| 5004 | |||
| 5005 | static void task_clock_event_update(struct perf_event *event, u64 now) | ||
| 5006 | { | ||
| 5007 | u64 prev; | ||
| 5008 | s64 delta; | ||
| 5009 | |||
| 5010 | prev = local64_xchg(&event->hw.prev_count, now); | ||
| 5011 | delta = now - prev; | ||
| 5012 | local64_add(delta, &event->count); | ||
| 5013 | } | ||
| 5014 | |||
| 5015 | static void task_clock_event_start(struct perf_event *event, int flags) | ||
| 5016 | { | ||
| 5017 | local64_set(&event->hw.prev_count, event->ctx->time); | ||
| 5018 | perf_swevent_start_hrtimer(event); | ||
| 5019 | } | ||
| 5020 | |||
| 5021 | static void task_clock_event_stop(struct perf_event *event, int flags) | ||
| 5022 | { | ||
| 5023 | perf_swevent_cancel_hrtimer(event); | ||
| 5024 | task_clock_event_update(event, event->ctx->time); | ||
| 5025 | } | ||
| 5026 | |||
| 5027 | static int task_clock_event_add(struct perf_event *event, int flags) | ||
| 5028 | { | ||
| 5029 | if (flags & PERF_EF_START) | ||
| 5030 | task_clock_event_start(event, flags); | ||
| 5031 | |||
| 5032 | return 0; | ||
| 5033 | } | ||
| 5034 | |||
| 5035 | static void task_clock_event_del(struct perf_event *event, int flags) | ||
| 5036 | { | ||
| 5037 | task_clock_event_stop(event, PERF_EF_UPDATE); | ||
| 5038 | } | ||
| 5039 | |||
| 5040 | static void task_clock_event_read(struct perf_event *event) | ||
| 5041 | { | ||
| 5042 | u64 time; | ||
| 5043 | |||
| 5044 | if (!in_nmi()) { | ||
| 5045 | update_context_time(event->ctx); | ||
| 5046 | time = event->ctx->time; | ||
| 5047 | } else { | ||
| 5048 | u64 now = perf_clock(); | ||
| 5049 | u64 delta = now - event->ctx->timestamp; | ||
| 5050 | time = event->ctx->time + delta; | ||
| 5051 | } | ||
| 5052 | |||
| 5053 | task_clock_event_update(event, time); | ||
| 5054 | } | ||
| 5055 | |||
| 5056 | static int task_clock_event_init(struct perf_event *event) | ||
| 5057 | { | ||
| 5058 | if (event->attr.type != PERF_TYPE_SOFTWARE) | ||
| 5059 | return -ENOENT; | ||
| 5060 | |||
| 5061 | if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) | ||
| 5062 | return -ENOENT; | ||
| 5063 | |||
| 5064 | return 0; | ||
| 5065 | } | ||
| 5066 | |||
| 5067 | static struct pmu perf_task_clock = { | ||
| 5068 | .task_ctx_nr = perf_sw_context, | ||
| 5069 | |||
| 5070 | .event_init = task_clock_event_init, | ||
| 5071 | .add = task_clock_event_add, | ||
| 5072 | .del = task_clock_event_del, | ||
| 5073 | .start = task_clock_event_start, | ||
| 5074 | .stop = task_clock_event_stop, | ||
| 5075 | .read = task_clock_event_read, | ||
| 5076 | }; | ||
| 5077 | |||
| 5078 | static void perf_pmu_nop_void(struct pmu *pmu) | ||
| 5079 | { | ||
| 5080 | } | ||
| 5081 | |||
| 5082 | static int perf_pmu_nop_int(struct pmu *pmu) | ||
| 5083 | { | ||
| 5084 | return 0; | ||
| 5085 | } | ||
| 5086 | |||
| 5087 | static void perf_pmu_start_txn(struct pmu *pmu) | ||
| 5088 | { | ||
| 5089 | perf_pmu_disable(pmu); | ||
| 5090 | } | ||
| 5091 | |||
| 5092 | static int perf_pmu_commit_txn(struct pmu *pmu) | ||
| 5093 | { | ||
| 5094 | perf_pmu_enable(pmu); | ||
| 5095 | return 0; | ||
| 5096 | } | ||
| 5097 | |||
| 5098 | static void perf_pmu_cancel_txn(struct pmu *pmu) | ||
| 5099 | { | ||
| 5100 | perf_pmu_enable(pmu); | ||
| 5101 | } | ||
| 5102 | |||
| 5103 | /* | ||
| 5104 | * Ensures all contexts with the same task_ctx_nr have the same | ||
| 5105 | * pmu_cpu_context too. | ||
| 5106 | */ | ||
| 5107 | static void *find_pmu_context(int ctxn) | ||
| 5108 | { | ||
| 5109 | struct pmu *pmu; | ||
| 5110 | |||
| 5111 | if (ctxn < 0) | ||
| 5112 | return NULL; | ||
| 5113 | |||
| 5114 | list_for_each_entry(pmu, &pmus, entry) { | ||
| 5115 | if (pmu->task_ctx_nr == ctxn) | ||
| 5116 | return pmu->pmu_cpu_context; | ||
| 5117 | } | ||
| 5118 | |||
| 5119 | return NULL; | ||
| 5120 | } | ||
| 5121 | |||
| 5122 | static void free_pmu_context(void * __percpu cpu_context) | ||
| 5123 | { | ||
| 5124 | struct pmu *pmu; | ||
| 5125 | |||
| 5126 | mutex_lock(&pmus_lock); | ||
| 4762 | /* | 5127 | /* |
| 4763 | * Software events (currently) can't in general distinguish | 5128 | * Like a real lame refcount. |
| 4764 | * between user, kernel and hypervisor events. | ||
| 4765 | * However, context switches and cpu migrations are considered | ||
| 4766 | * to be kernel events, and page faults are never hypervisor | ||
| 4767 | * events. | ||
| 4768 | */ | 5129 | */ |
| 4769 | switch (event_id) { | 5130 | list_for_each_entry(pmu, &pmus, entry) { |
| 4770 | case PERF_COUNT_SW_CPU_CLOCK: | 5131 | if (pmu->pmu_cpu_context == cpu_context) |
| 4771 | pmu = &perf_ops_cpu_clock; | 5132 | goto out; |
| 5133 | } | ||
| 4772 | 5134 | ||
| 4773 | break; | 5135 | free_percpu(cpu_context); |
| 4774 | case PERF_COUNT_SW_TASK_CLOCK: | 5136 | out: |
| 4775 | /* | 5137 | mutex_unlock(&pmus_lock); |
| 4776 | * If the user instantiates this as a per-cpu event, | 5138 | } |
| 4777 | * use the cpu_clock event instead. | ||
| 4778 | */ | ||
| 4779 | if (event->ctx->task) | ||
| 4780 | pmu = &perf_ops_task_clock; | ||
| 4781 | else | ||
| 4782 | pmu = &perf_ops_cpu_clock; | ||
| 4783 | 5139 | ||
| 4784 | break; | 5140 | int perf_pmu_register(struct pmu *pmu) |
| 4785 | case PERF_COUNT_SW_PAGE_FAULTS: | 5141 | { |
| 4786 | case PERF_COUNT_SW_PAGE_FAULTS_MIN: | 5142 | int cpu, ret; |
| 4787 | case PERF_COUNT_SW_PAGE_FAULTS_MAJ: | 5143 | |
| 4788 | case PERF_COUNT_SW_CONTEXT_SWITCHES: | 5144 | mutex_lock(&pmus_lock); |
| 4789 | case PERF_COUNT_SW_CPU_MIGRATIONS: | 5145 | ret = -ENOMEM; |
| 4790 | case PERF_COUNT_SW_ALIGNMENT_FAULTS: | 5146 | pmu->pmu_disable_count = alloc_percpu(int); |
| 4791 | case PERF_COUNT_SW_EMULATION_FAULTS: | 5147 | if (!pmu->pmu_disable_count) |
| 4792 | if (!event->parent) { | 5148 | goto unlock; |
| 4793 | int err; | 5149 | |
| 4794 | 5150 | pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); | |
| 4795 | err = swevent_hlist_get(event); | 5151 | if (pmu->pmu_cpu_context) |
| 4796 | if (err) | 5152 | goto got_cpu_context; |
| 4797 | return ERR_PTR(err); | 5153 | |
| 5154 | pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); | ||
| 5155 | if (!pmu->pmu_cpu_context) | ||
| 5156 | goto free_pdc; | ||
| 4798 | 5157 | ||
| 4799 | atomic_inc(&perf_swevent_enabled[event_id]); | 5158 | for_each_possible_cpu(cpu) { |
| 4800 | event->destroy = sw_perf_event_destroy; | 5159 | struct perf_cpu_context *cpuctx; |
| 5160 | |||
| 5161 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
| 5162 | __perf_event_init_context(&cpuctx->ctx); | ||
| 5163 | cpuctx->ctx.type = cpu_context; | ||
| 5164 | cpuctx->ctx.pmu = pmu; | ||
| 5165 | cpuctx->jiffies_interval = 1; | ||
| 5166 | INIT_LIST_HEAD(&cpuctx->rotation_list); | ||
| 5167 | } | ||
| 5168 | |||
| 5169 | got_cpu_context: | ||
| 5170 | if (!pmu->start_txn) { | ||
| 5171 | if (pmu->pmu_enable) { | ||
| 5172 | /* | ||
| 5173 | * If we have pmu_enable/pmu_disable calls, install | ||
| 5174 | * transaction stubs that use that to try and batch | ||
| 5175 | * hardware accesses. | ||
| 5176 | */ | ||
| 5177 | pmu->start_txn = perf_pmu_start_txn; | ||
| 5178 | pmu->commit_txn = perf_pmu_commit_txn; | ||
| 5179 | pmu->cancel_txn = perf_pmu_cancel_txn; | ||
| 5180 | } else { | ||
| 5181 | pmu->start_txn = perf_pmu_nop_void; | ||
| 5182 | pmu->commit_txn = perf_pmu_nop_int; | ||
| 5183 | pmu->cancel_txn = perf_pmu_nop_void; | ||
| 5184 | } | ||
| 5185 | } | ||
| 5186 | |||
| 5187 | if (!pmu->pmu_enable) { | ||
| 5188 | pmu->pmu_enable = perf_pmu_nop_void; | ||
| 5189 | pmu->pmu_disable = perf_pmu_nop_void; | ||
| 5190 | } | ||
| 5191 | |||
| 5192 | list_add_rcu(&pmu->entry, &pmus); | ||
| 5193 | ret = 0; | ||
| 5194 | unlock: | ||
| 5195 | mutex_unlock(&pmus_lock); | ||
| 5196 | |||
| 5197 | return ret; | ||
| 5198 | |||
| 5199 | free_pdc: | ||
| 5200 | free_percpu(pmu->pmu_disable_count); | ||
| 5201 | goto unlock; | ||
| 5202 | } | ||
| 5203 | |||
| 5204 | void perf_pmu_unregister(struct pmu *pmu) | ||
| 5205 | { | ||
| 5206 | mutex_lock(&pmus_lock); | ||
| 5207 | list_del_rcu(&pmu->entry); | ||
| 5208 | mutex_unlock(&pmus_lock); | ||
| 5209 | |||
| 5210 | /* | ||
| 5211 | * We dereference the pmu list under both SRCU and regular RCU, so | ||
| 5212 | * synchronize against both of those. | ||
| 5213 | */ | ||
| 5214 | synchronize_srcu(&pmus_srcu); | ||
| 5215 | synchronize_rcu(); | ||
| 5216 | |||
| 5217 | free_percpu(pmu->pmu_disable_count); | ||
| 5218 | free_pmu_context(pmu->pmu_cpu_context); | ||
| 5219 | } | ||
| 5220 | |||
| 5221 | struct pmu *perf_init_event(struct perf_event *event) | ||
| 5222 | { | ||
| 5223 | struct pmu *pmu = NULL; | ||
| 5224 | int idx; | ||
| 5225 | |||
| 5226 | idx = srcu_read_lock(&pmus_srcu); | ||
| 5227 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
| 5228 | int ret = pmu->event_init(event); | ||
| 5229 | if (!ret) | ||
| 5230 | goto unlock; | ||
| 5231 | |||
| 5232 | if (ret != -ENOENT) { | ||
| 5233 | pmu = ERR_PTR(ret); | ||
| 5234 | goto unlock; | ||
| 4801 | } | 5235 | } |
| 4802 | pmu = &perf_ops_generic; | ||
| 4803 | break; | ||
| 4804 | } | 5236 | } |
| 5237 | pmu = ERR_PTR(-ENOENT); | ||
| 5238 | unlock: | ||
| 5239 | srcu_read_unlock(&pmus_srcu, idx); | ||
| 4805 | 5240 | ||
| 4806 | return pmu; | 5241 | return pmu; |
| 4807 | } | 5242 | } |
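Editor's note: perf_pmu_register() above allocates the PMU's per-CPU contexts (or shares them with another PMU using the same task_ctx_nr), installs no-op or pmu_disable/pmu_enable-based transaction stubs when the driver supplies none, and only then links the PMU into the list; perf_pmu_unregister() must wait out both the SRCU readers in perf_init_event() and the plain RCU walkers in the dispatch paths before anything is freed. A hedged lifecycle sketch from a hypothetical built-in driver's point of view, using the single-argument perf_pmu_register() as defined here (whether these symbols are exported to modules is not shown in this hunk):

        #include <linux/init.h>
        #include <linux/perf_event.h>

        extern struct pmu my_pmu;       /* the hypothetical PMU sketched earlier */

        static int __init my_pmu_driver_init(void)
        {
                /* Once this returns 0, perf_init_event() can hand matching events to my_pmu. */
                return perf_pmu_register(&my_pmu);
        }
        early_initcall(my_pmu_driver_init);

        /* Teardown, for a PMU that can go away again: */
        static void __maybe_unused my_pmu_driver_exit(void)
        {
                /*
                 * Unlinks the PMU and waits for both SRCU and RCU grace
                 * periods, so concurrent perf_init_event() calls and RCU
                 * list walkers cannot see a freed per-CPU context.
                 */
                perf_pmu_unregister(&my_pmu);
        }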
| @@ -4810,20 +5245,18 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) | |||
| 4810 | * Allocate and initialize a event structure | 5245 | * Allocate and initialize a event structure |
| 4811 | */ | 5246 | */ |
| 4812 | static struct perf_event * | 5247 | static struct perf_event * |
| 4813 | perf_event_alloc(struct perf_event_attr *attr, | 5248 | perf_event_alloc(struct perf_event_attr *attr, int cpu, |
| 4814 | int cpu, | 5249 | struct task_struct *task, |
| 4815 | struct perf_event_context *ctx, | 5250 | struct perf_event *group_leader, |
| 4816 | struct perf_event *group_leader, | 5251 | struct perf_event *parent_event, |
| 4817 | struct perf_event *parent_event, | 5252 | perf_overflow_handler_t overflow_handler) |
| 4818 | perf_overflow_handler_t overflow_handler, | 5253 | { |
| 4819 | gfp_t gfpflags) | 5254 | struct pmu *pmu; |
| 4820 | { | ||
| 4821 | const struct pmu *pmu; | ||
| 4822 | struct perf_event *event; | 5255 | struct perf_event *event; |
| 4823 | struct hw_perf_event *hwc; | 5256 | struct hw_perf_event *hwc; |
| 4824 | long err; | 5257 | long err; |
| 4825 | 5258 | ||
| 4826 | event = kzalloc(sizeof(*event), gfpflags); | 5259 | event = kzalloc(sizeof(*event), GFP_KERNEL); |
| 4827 | if (!event) | 5260 | if (!event) |
| 4828 | return ERR_PTR(-ENOMEM); | 5261 | return ERR_PTR(-ENOMEM); |
| 4829 | 5262 | ||
| @@ -4841,6 +5274,7 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
| 4841 | INIT_LIST_HEAD(&event->event_entry); | 5274 | INIT_LIST_HEAD(&event->event_entry); |
| 4842 | INIT_LIST_HEAD(&event->sibling_list); | 5275 | INIT_LIST_HEAD(&event->sibling_list); |
| 4843 | init_waitqueue_head(&event->waitq); | 5276 | init_waitqueue_head(&event->waitq); |
| 5277 | init_irq_work(&event->pending, perf_pending_event); | ||
| 4844 | 5278 | ||
| 4845 | mutex_init(&event->mmap_mutex); | 5279 | mutex_init(&event->mmap_mutex); |
| 4846 | 5280 | ||
| @@ -4848,7 +5282,6 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
| 4848 | event->attr = *attr; | 5282 | event->attr = *attr; |
| 4849 | event->group_leader = group_leader; | 5283 | event->group_leader = group_leader; |
| 4850 | event->pmu = NULL; | 5284 | event->pmu = NULL; |
| 4851 | event->ctx = ctx; | ||
| 4852 | event->oncpu = -1; | 5285 | event->oncpu = -1; |
| 4853 | 5286 | ||
| 4854 | event->parent = parent_event; | 5287 | event->parent = parent_event; |
| @@ -4858,6 +5291,17 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
| 4858 | 5291 | ||
| 4859 | event->state = PERF_EVENT_STATE_INACTIVE; | 5292 | event->state = PERF_EVENT_STATE_INACTIVE; |
| 4860 | 5293 | ||
| 5294 | if (task) { | ||
| 5295 | event->attach_state = PERF_ATTACH_TASK; | ||
| 5296 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | ||
| 5297 | /* | ||
| 5298 | * hw_breakpoint is a bit difficult here.. | ||
| 5299 | */ | ||
| 5300 | if (attr->type == PERF_TYPE_BREAKPOINT) | ||
| 5301 | event->hw.bp_target = task; | ||
| 5302 | #endif | ||
| 5303 | } | ||
| 5304 | |||
| 4861 | if (!overflow_handler && parent_event) | 5305 | if (!overflow_handler && parent_event) |
| 4862 | overflow_handler = parent_event->overflow_handler; | 5306 | overflow_handler = parent_event->overflow_handler; |
| 4863 | 5307 | ||
| @@ -4882,29 +5326,8 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
| 4882 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) | 5326 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) |
| 4883 | goto done; | 5327 | goto done; |
| 4884 | 5328 | ||
| 4885 | switch (attr->type) { | 5329 | pmu = perf_init_event(event); |
| 4886 | case PERF_TYPE_RAW: | ||
| 4887 | case PERF_TYPE_HARDWARE: | ||
| 4888 | case PERF_TYPE_HW_CACHE: | ||
| 4889 | pmu = hw_perf_event_init(event); | ||
| 4890 | break; | ||
| 4891 | |||
| 4892 | case PERF_TYPE_SOFTWARE: | ||
| 4893 | pmu = sw_perf_event_init(event); | ||
| 4894 | break; | ||
| 4895 | |||
| 4896 | case PERF_TYPE_TRACEPOINT: | ||
| 4897 | pmu = tp_perf_event_init(event); | ||
| 4898 | break; | ||
| 4899 | |||
| 4900 | case PERF_TYPE_BREAKPOINT: | ||
| 4901 | pmu = bp_perf_event_init(event); | ||
| 4902 | break; | ||
| 4903 | |||
| 4904 | 5330 | ||
| 4905 | default: | ||
| 4906 | break; | ||
| 4907 | } | ||
| 4908 | done: | 5331 | done: |
| 4909 | err = 0; | 5332 | err = 0; |
| 4910 | if (!pmu) | 5333 | if (!pmu) |
| @@ -4922,13 +5345,21 @@ done: | |||
| 4922 | event->pmu = pmu; | 5345 | event->pmu = pmu; |
| 4923 | 5346 | ||
| 4924 | if (!event->parent) { | 5347 | if (!event->parent) { |
| 4925 | atomic_inc(&nr_events); | 5348 | if (event->attach_state & PERF_ATTACH_TASK) |
| 5349 | jump_label_inc(&perf_task_events); | ||
| 4926 | if (event->attr.mmap || event->attr.mmap_data) | 5350 | if (event->attr.mmap || event->attr.mmap_data) |
| 4927 | atomic_inc(&nr_mmap_events); | 5351 | atomic_inc(&nr_mmap_events); |
| 4928 | if (event->attr.comm) | 5352 | if (event->attr.comm) |
| 4929 | atomic_inc(&nr_comm_events); | 5353 | atomic_inc(&nr_comm_events); |
| 4930 | if (event->attr.task) | 5354 | if (event->attr.task) |
| 4931 | atomic_inc(&nr_task_events); | 5355 | atomic_inc(&nr_task_events); |
| 5356 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { | ||
| 5357 | err = get_callchain_buffers(); | ||
| 5358 | if (err) { | ||
| 5359 | free_event(event); | ||
| 5360 | return ERR_PTR(err); | ||
| 5361 | } | ||
| 5362 | } | ||
| 4932 | } | 5363 | } |
| 4933 | 5364 | ||
| 4934 | return event; | 5365 | return event; |
| @@ -5076,12 +5507,16 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 5076 | struct perf_event_attr __user *, attr_uptr, | 5507 | struct perf_event_attr __user *, attr_uptr, |
| 5077 | pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) | 5508 | pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) |
| 5078 | { | 5509 | { |
| 5079 | struct perf_event *event, *group_leader = NULL, *output_event = NULL; | 5510 | struct perf_event *group_leader = NULL, *output_event = NULL; |
| 5511 | struct perf_event *event, *sibling; | ||
| 5080 | struct perf_event_attr attr; | 5512 | struct perf_event_attr attr; |
| 5081 | struct perf_event_context *ctx; | 5513 | struct perf_event_context *ctx; |
| 5082 | struct file *event_file = NULL; | 5514 | struct file *event_file = NULL; |
| 5083 | struct file *group_file = NULL; | 5515 | struct file *group_file = NULL; |
| 5516 | struct task_struct *task = NULL; | ||
| 5517 | struct pmu *pmu; | ||
| 5084 | int event_fd; | 5518 | int event_fd; |
| 5519 | int move_group = 0; | ||
| 5085 | int fput_needed = 0; | 5520 | int fput_needed = 0; |
| 5086 | int err; | 5521 | int err; |
| 5087 | 5522 | ||
| @@ -5107,20 +5542,11 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 5107 | if (event_fd < 0) | 5542 | if (event_fd < 0) |
| 5108 | return event_fd; | 5543 | return event_fd; |
| 5109 | 5544 | ||
| 5110 | /* | ||
| 5111 | * Get the target context (task or percpu): | ||
| 5112 | */ | ||
| 5113 | ctx = find_get_context(pid, cpu); | ||
| 5114 | if (IS_ERR(ctx)) { | ||
| 5115 | err = PTR_ERR(ctx); | ||
| 5116 | goto err_fd; | ||
| 5117 | } | ||
| 5118 | |||
| 5119 | if (group_fd != -1) { | 5545 | if (group_fd != -1) { |
| 5120 | group_leader = perf_fget_light(group_fd, &fput_needed); | 5546 | group_leader = perf_fget_light(group_fd, &fput_needed); |
| 5121 | if (IS_ERR(group_leader)) { | 5547 | if (IS_ERR(group_leader)) { |
| 5122 | err = PTR_ERR(group_leader); | 5548 | err = PTR_ERR(group_leader); |
| 5123 | goto err_put_context; | 5549 | goto err_fd; |
| 5124 | } | 5550 | } |
| 5125 | group_file = group_leader->filp; | 5551 | group_file = group_leader->filp; |
| 5126 | if (flags & PERF_FLAG_FD_OUTPUT) | 5552 | if (flags & PERF_FLAG_FD_OUTPUT) |
| @@ -5129,6 +5555,58 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 5129 | group_leader = NULL; | 5555 | group_leader = NULL; |
| 5130 | } | 5556 | } |
| 5131 | 5557 | ||
| 5558 | if (pid != -1) { | ||
| 5559 | task = find_lively_task_by_vpid(pid); | ||
| 5560 | if (IS_ERR(task)) { | ||
| 5561 | err = PTR_ERR(task); | ||
| 5562 | goto err_group_fd; | ||
| 5563 | } | ||
| 5564 | } | ||
| 5565 | |||
| 5566 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); | ||
| 5567 | if (IS_ERR(event)) { | ||
| 5568 | err = PTR_ERR(event); | ||
| 5569 | goto err_task; | ||
| 5570 | } | ||
| 5571 | |||
| 5572 | /* | ||
| 5573 | * Special case software events and allow them to be part of | ||
| 5574 | * any hardware group. | ||
| 5575 | */ | ||
| 5576 | pmu = event->pmu; | ||
| 5577 | |||
| 5578 | if (group_leader && | ||
| 5579 | (is_software_event(event) != is_software_event(group_leader))) { | ||
| 5580 | if (is_software_event(event)) { | ||
| 5581 | /* | ||
| 5582 | * If event and group_leader are not both a software | ||
| 5583 | * event, and event is, then group leader is not. | ||
| 5584 | * | ||
| 5585 | * Allow the addition of software events to !software | ||
| 5586 | * groups, this is safe because software events never | ||
| 5587 | * fail to schedule. | ||
| 5588 | */ | ||
| 5589 | pmu = group_leader->pmu; | ||
| 5590 | } else if (is_software_event(group_leader) && | ||
| 5591 | (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { | ||
| 5592 | /* | ||
| 5593 | * In case the group is a pure software group, and we | ||
| 5594 | * try to add a hardware event, move the whole group to | ||
| 5595 | * the hardware context. | ||
| 5596 | */ | ||
| 5597 | move_group = 1; | ||
| 5598 | } | ||
| 5599 | } | ||
| 5600 | |||
| 5601 | /* | ||
| 5602 | * Get the target context (task or percpu): | ||
| 5603 | */ | ||
| 5604 | ctx = find_get_context(pmu, task, cpu); | ||
| 5605 | if (IS_ERR(ctx)) { | ||
| 5606 | err = PTR_ERR(ctx); | ||
| 5607 | goto err_alloc; | ||
| 5608 | } | ||
| 5609 | |||
| 5132 | /* | 5610 | /* |
| 5133 | * Look up the group leader (we will attach this event to it): | 5611 | * Look up the group leader (we will attach this event to it): |
| 5134 | */ | 5612 | */ |
| @@ -5140,42 +5618,66 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 5140 | * becoming part of another group-sibling): | 5618 | * becoming part of another group-sibling): |
| 5141 | */ | 5619 | */ |
| 5142 | if (group_leader->group_leader != group_leader) | 5620 | if (group_leader->group_leader != group_leader) |
| 5143 | goto err_put_context; | 5621 | goto err_context; |
| 5144 | /* | 5622 | /* |
| 5145 | * Do not allow to attach to a group in a different | 5623 | * Do not allow to attach to a group in a different |
| 5146 | * task or CPU context: | 5624 | * task or CPU context: |
| 5147 | */ | 5625 | */ |
| 5148 | if (group_leader->ctx != ctx) | 5626 | if (move_group) { |
| 5149 | goto err_put_context; | 5627 | if (group_leader->ctx->type != ctx->type) |
| 5628 | goto err_context; | ||
| 5629 | } else { | ||
| 5630 | if (group_leader->ctx != ctx) | ||
| 5631 | goto err_context; | ||
| 5632 | } | ||
| 5633 | |||
| 5150 | /* | 5634 | /* |
| 5151 | * Only a group leader can be exclusive or pinned | 5635 | * Only a group leader can be exclusive or pinned |
| 5152 | */ | 5636 | */ |
| 5153 | if (attr.exclusive || attr.pinned) | 5637 | if (attr.exclusive || attr.pinned) |
| 5154 | goto err_put_context; | 5638 | goto err_context; |
| 5155 | } | ||
| 5156 | |||
| 5157 | event = perf_event_alloc(&attr, cpu, ctx, group_leader, | ||
| 5158 | NULL, NULL, GFP_KERNEL); | ||
| 5159 | if (IS_ERR(event)) { | ||
| 5160 | err = PTR_ERR(event); | ||
| 5161 | goto err_put_context; | ||
| 5162 | } | 5639 | } |
| 5163 | 5640 | ||
| 5164 | if (output_event) { | 5641 | if (output_event) { |
| 5165 | err = perf_event_set_output(event, output_event); | 5642 | err = perf_event_set_output(event, output_event); |
| 5166 | if (err) | 5643 | if (err) |
| 5167 | goto err_free_put_context; | 5644 | goto err_context; |
| 5168 | } | 5645 | } |
| 5169 | 5646 | ||
| 5170 | event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); | 5647 | event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); |
| 5171 | if (IS_ERR(event_file)) { | 5648 | if (IS_ERR(event_file)) { |
| 5172 | err = PTR_ERR(event_file); | 5649 | err = PTR_ERR(event_file); |
| 5173 | goto err_free_put_context; | 5650 | goto err_context; |
| 5651 | } | ||
| 5652 | |||
| 5653 | if (move_group) { | ||
| 5654 | struct perf_event_context *gctx = group_leader->ctx; | ||
| 5655 | |||
| 5656 | mutex_lock(&gctx->mutex); | ||
| 5657 | perf_event_remove_from_context(group_leader); | ||
| 5658 | list_for_each_entry(sibling, &group_leader->sibling_list, | ||
| 5659 | group_entry) { | ||
| 5660 | perf_event_remove_from_context(sibling); | ||
| 5661 | put_ctx(gctx); | ||
| 5662 | } | ||
| 5663 | mutex_unlock(&gctx->mutex); | ||
| 5664 | put_ctx(gctx); | ||
| 5174 | } | 5665 | } |
| 5175 | 5666 | ||
| 5176 | event->filp = event_file; | 5667 | event->filp = event_file; |
| 5177 | WARN_ON_ONCE(ctx->parent_ctx); | 5668 | WARN_ON_ONCE(ctx->parent_ctx); |
| 5178 | mutex_lock(&ctx->mutex); | 5669 | mutex_lock(&ctx->mutex); |
| 5670 | |||
| 5671 | if (move_group) { | ||
| 5672 | perf_install_in_context(ctx, group_leader, cpu); | ||
| 5673 | get_ctx(ctx); | ||
| 5674 | list_for_each_entry(sibling, &group_leader->sibling_list, | ||
| 5675 | group_entry) { | ||
| 5676 | perf_install_in_context(ctx, sibling, cpu); | ||
| 5677 | get_ctx(ctx); | ||
| 5678 | } | ||
| 5679 | } | ||
| 5680 | |||
| 5179 | perf_install_in_context(ctx, event, cpu); | 5681 | perf_install_in_context(ctx, event, cpu); |
| 5180 | ++ctx->generation; | 5682 | ++ctx->generation; |
| 5181 | mutex_unlock(&ctx->mutex); | 5683 | mutex_unlock(&ctx->mutex); |
| @@ -5196,11 +5698,15 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 5196 | fd_install(event_fd, event_file); | 5698 | fd_install(event_fd, event_file); |
| 5197 | return event_fd; | 5699 | return event_fd; |
| 5198 | 5700 | ||
| 5199 | err_free_put_context: | 5701 | err_context: |
| 5702 | put_ctx(ctx); | ||
| 5703 | err_alloc: | ||
| 5200 | free_event(event); | 5704 | free_event(event); |
| 5201 | err_put_context: | 5705 | err_task: |
| 5706 | if (task) | ||
| 5707 | put_task_struct(task); | ||
| 5708 | err_group_fd: | ||
| 5202 | fput_light(group_file, fput_needed); | 5709 | fput_light(group_file, fput_needed); |
| 5203 | put_ctx(ctx); | ||
| 5204 | err_fd: | 5710 | err_fd: |
| 5205 | put_unused_fd(event_fd); | 5711 | put_unused_fd(event_fd); |
| 5206 | return err; | 5712 | return err; |
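For orientation, the syscall being restructured here is driven from user space as sketched below. This is an illustrative fragment, not part of the patch: glibc provides no wrapper, so the helper name is invented, and the attribute values are merely one plausible configuration. With group_fd == -1 the new event leads its own group, matching the group_leader handling above.

    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Invented wrapper: glibc does not expose perf_event_open() directly. */
    static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                    int cpu, int group_fd, unsigned long flags)
    {
            return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    int open_cycle_counter(pid_t pid)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_CPU_CYCLES;

            /* pid >= 0, cpu == -1: count this task on any CPU;
             * group_fd == -1: the event becomes its own group leader. */
            return sys_perf_event_open(&attr, pid, -1, -1, 0);
    }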
| @@ -5211,32 +5717,31 @@ err_fd: | |||
| 5211 | * | 5717 | * |
| 5212 | * @attr: attributes of the counter to create | 5718 | * @attr: attributes of the counter to create |
| 5213 | * @cpu: cpu in which the counter is bound | 5719 | * @cpu: cpu in which the counter is bound |
| 5214 | * @pid: task to profile | 5720 | * @task: task to profile (NULL for percpu) |
| 5215 | */ | 5721 | */ |
| 5216 | struct perf_event * | 5722 | struct perf_event * |
| 5217 | perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | 5723 | perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, |
| 5218 | pid_t pid, | 5724 | struct task_struct *task, |
| 5219 | perf_overflow_handler_t overflow_handler) | 5725 | perf_overflow_handler_t overflow_handler) |
| 5220 | { | 5726 | { |
| 5221 | struct perf_event *event; | ||
| 5222 | struct perf_event_context *ctx; | 5727 | struct perf_event_context *ctx; |
| 5728 | struct perf_event *event; | ||
| 5223 | int err; | 5729 | int err; |
| 5224 | 5730 | ||
| 5225 | /* | 5731 | /* |
| 5226 | * Get the target context (task or percpu): | 5732 | * Get the target context (task or percpu): |
| 5227 | */ | 5733 | */ |
| 5228 | 5734 | ||
| 5229 | ctx = find_get_context(pid, cpu); | 5735 | event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); |
| 5230 | if (IS_ERR(ctx)) { | ||
| 5231 | err = PTR_ERR(ctx); | ||
| 5232 | goto err_exit; | ||
| 5233 | } | ||
| 5234 | |||
| 5235 | event = perf_event_alloc(attr, cpu, ctx, NULL, | ||
| 5236 | NULL, overflow_handler, GFP_KERNEL); | ||
| 5237 | if (IS_ERR(event)) { | 5736 | if (IS_ERR(event)) { |
| 5238 | err = PTR_ERR(event); | 5737 | err = PTR_ERR(event); |
| 5239 | goto err_put_context; | 5738 | goto err; |
| 5739 | } | ||
| 5740 | |||
| 5741 | ctx = find_get_context(event->pmu, task, cpu); | ||
| 5742 | if (IS_ERR(ctx)) { | ||
| 5743 | err = PTR_ERR(ctx); | ||
| 5744 | goto err_free; | ||
| 5240 | } | 5745 | } |
| 5241 | 5746 | ||
| 5242 | event->filp = NULL; | 5747 | event->filp = NULL; |
| @@ -5254,112 +5759,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 5254 | 5759 | ||
| 5255 | return event; | 5760 | return event; |
| 5256 | 5761 | ||
| 5257 | err_put_context: | 5762 | err_free: |
| 5258 | put_ctx(ctx); | 5763 | free_event(event); |
| 5259 | err_exit: | 5764 | err: |
| 5260 | return ERR_PTR(err); | 5765 | return ERR_PTR(err); |
| 5261 | } | 5766 | } |
| 5262 | EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); | 5767 | EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); |
| 5263 | 5768 | ||
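The in-kernel API now takes the target task directly (NULL for a per-CPU counter) instead of a pid, mirroring the find_get_context() change. A minimal caller might look like the sketch below; the helper name is invented, and passing a NULL overflow handler is assumed to fall back to the core's default output path.

    /* Hypothetical in-kernel user of the new signature. */
    static struct perf_event *start_cpu_cycle_counter(int cpu)
    {
            struct perf_event_attr attr = {
                    .type   = PERF_TYPE_HARDWARE,
                    .config = PERF_COUNT_HW_CPU_CYCLES,
                    .size   = sizeof(attr),
            };

            /* task == NULL requests a per-CPU counter, as documented above;
             * a perf_overflow_handler_t could be passed instead of NULL. */
            return perf_event_create_kernel_counter(&attr, cpu, NULL, NULL);
    }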
| 5264 | /* | ||
| 5265 | * inherit a event from parent task to child task: | ||
| 5266 | */ | ||
| 5267 | static struct perf_event * | ||
| 5268 | inherit_event(struct perf_event *parent_event, | ||
| 5269 | struct task_struct *parent, | ||
| 5270 | struct perf_event_context *parent_ctx, | ||
| 5271 | struct task_struct *child, | ||
| 5272 | struct perf_event *group_leader, | ||
| 5273 | struct perf_event_context *child_ctx) | ||
| 5274 | { | ||
| 5275 | struct perf_event *child_event; | ||
| 5276 | |||
| 5277 | /* | ||
| 5278 | * Instead of creating recursive hierarchies of events, | ||
| 5279 | * we link inherited events back to the original parent, | ||
| 5280 | * which has a filp for sure, which we use as the reference | ||
| 5281 | * count: | ||
| 5282 | */ | ||
| 5283 | if (parent_event->parent) | ||
| 5284 | parent_event = parent_event->parent; | ||
| 5285 | |||
| 5286 | child_event = perf_event_alloc(&parent_event->attr, | ||
| 5287 | parent_event->cpu, child_ctx, | ||
| 5288 | group_leader, parent_event, | ||
| 5289 | NULL, GFP_KERNEL); | ||
| 5290 | if (IS_ERR(child_event)) | ||
| 5291 | return child_event; | ||
| 5292 | get_ctx(child_ctx); | ||
| 5293 | |||
| 5294 | /* | ||
| 5295 | * Make the child state follow the state of the parent event, | ||
| 5296 | * not its attr.disabled bit. We hold the parent's mutex, | ||
| 5297 | * so we won't race with perf_event_{en, dis}able_family. | ||
| 5298 | */ | ||
| 5299 | if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) | ||
| 5300 | child_event->state = PERF_EVENT_STATE_INACTIVE; | ||
| 5301 | else | ||
| 5302 | child_event->state = PERF_EVENT_STATE_OFF; | ||
| 5303 | |||
| 5304 | if (parent_event->attr.freq) { | ||
| 5305 | u64 sample_period = parent_event->hw.sample_period; | ||
| 5306 | struct hw_perf_event *hwc = &child_event->hw; | ||
| 5307 | |||
| 5308 | hwc->sample_period = sample_period; | ||
| 5309 | hwc->last_period = sample_period; | ||
| 5310 | |||
| 5311 | local64_set(&hwc->period_left, sample_period); | ||
| 5312 | } | ||
| 5313 | |||
| 5314 | child_event->overflow_handler = parent_event->overflow_handler; | ||
| 5315 | |||
| 5316 | /* | ||
| 5317 | * Link it up in the child's context: | ||
| 5318 | */ | ||
| 5319 | add_event_to_ctx(child_event, child_ctx); | ||
| 5320 | |||
| 5321 | /* | ||
| 5322 | * Get a reference to the parent filp - we will fput it | ||
| 5323 | * when the child event exits. This is safe to do because | ||
| 5324 | * we are in the parent and we know that the filp still | ||
| 5325 | * exists and has a nonzero count: | ||
| 5326 | */ | ||
| 5327 | atomic_long_inc(&parent_event->filp->f_count); | ||
| 5328 | |||
| 5329 | /* | ||
| 5330 | * Link this into the parent event's child list | ||
| 5331 | */ | ||
| 5332 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); | ||
| 5333 | mutex_lock(&parent_event->child_mutex); | ||
| 5334 | list_add_tail(&child_event->child_list, &parent_event->child_list); | ||
| 5335 | mutex_unlock(&parent_event->child_mutex); | ||
| 5336 | |||
| 5337 | return child_event; | ||
| 5338 | } | ||
| 5339 | |||
| 5340 | static int inherit_group(struct perf_event *parent_event, | ||
| 5341 | struct task_struct *parent, | ||
| 5342 | struct perf_event_context *parent_ctx, | ||
| 5343 | struct task_struct *child, | ||
| 5344 | struct perf_event_context *child_ctx) | ||
| 5345 | { | ||
| 5346 | struct perf_event *leader; | ||
| 5347 | struct perf_event *sub; | ||
| 5348 | struct perf_event *child_ctr; | ||
| 5349 | |||
| 5350 | leader = inherit_event(parent_event, parent, parent_ctx, | ||
| 5351 | child, NULL, child_ctx); | ||
| 5352 | if (IS_ERR(leader)) | ||
| 5353 | return PTR_ERR(leader); | ||
| 5354 | list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { | ||
| 5355 | child_ctr = inherit_event(sub, parent, parent_ctx, | ||
| 5356 | child, leader, child_ctx); | ||
| 5357 | if (IS_ERR(child_ctr)) | ||
| 5358 | return PTR_ERR(child_ctr); | ||
| 5359 | } | ||
| 5360 | return 0; | ||
| 5361 | } | ||
| 5362 | |||
| 5363 | static void sync_child_event(struct perf_event *child_event, | 5769 | static void sync_child_event(struct perf_event *child_event, |
| 5364 | struct task_struct *child) | 5770 | struct task_struct *child) |
| 5365 | { | 5771 | { |
| @@ -5416,16 +5822,13 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
| 5416 | } | 5822 | } |
| 5417 | } | 5823 | } |
| 5418 | 5824 | ||
| 5419 | /* | 5825 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) |
| 5420 | * When a child task exits, feed back event values to parent events. | ||
| 5421 | */ | ||
| 5422 | void perf_event_exit_task(struct task_struct *child) | ||
| 5423 | { | 5826 | { |
| 5424 | struct perf_event *child_event, *tmp; | 5827 | struct perf_event *child_event, *tmp; |
| 5425 | struct perf_event_context *child_ctx; | 5828 | struct perf_event_context *child_ctx; |
| 5426 | unsigned long flags; | 5829 | unsigned long flags; |
| 5427 | 5830 | ||
| 5428 | if (likely(!child->perf_event_ctxp)) { | 5831 | if (likely(!child->perf_event_ctxp[ctxn])) { |
| 5429 | perf_event_task(child, NULL, 0); | 5832 | perf_event_task(child, NULL, 0); |
| 5430 | return; | 5833 | return; |
| 5431 | } | 5834 | } |
| @@ -5437,8 +5840,8 @@ void perf_event_exit_task(struct task_struct *child) | |||
| 5437 | * scheduled, so we are now safe from rescheduling changing | 5840 | * scheduled, so we are now safe from rescheduling changing |
| 5438 | * our context. | 5841 | * our context. |
| 5439 | */ | 5842 | */ |
| 5440 | child_ctx = child->perf_event_ctxp; | 5843 | child_ctx = child->perf_event_ctxp[ctxn]; |
| 5441 | __perf_event_task_sched_out(child_ctx); | 5844 | task_ctx_sched_out(child_ctx, EVENT_ALL); |
| 5442 | 5845 | ||
| 5443 | /* | 5846 | /* |
| 5444 | * Take the context lock here so that if find_get_context is | 5847 | * Take the context lock here so that if find_get_context is |
| @@ -5446,7 +5849,7 @@ void perf_event_exit_task(struct task_struct *child) | |||
| 5446 | * incremented the context's refcount before we do put_ctx below. | 5849 | * incremented the context's refcount before we do put_ctx below. |
| 5447 | */ | 5850 | */ |
| 5448 | raw_spin_lock(&child_ctx->lock); | 5851 | raw_spin_lock(&child_ctx->lock); |
| 5449 | child->perf_event_ctxp = NULL; | 5852 | child->perf_event_ctxp[ctxn] = NULL; |
| 5450 | /* | 5853 | /* |
| 5451 | * If this context is a clone; unclone it so it can't get | 5854 | * If this context is a clone; unclone it so it can't get |
| 5452 | * swapped to another process while we're removing all | 5855 | * swapped to another process while we're removing all |
| @@ -5499,6 +5902,17 @@ again: | |||
| 5499 | put_ctx(child_ctx); | 5902 | put_ctx(child_ctx); |
| 5500 | } | 5903 | } |
| 5501 | 5904 | ||
| 5905 | /* | ||
| 5906 | * When a child task exits, feed back event values to parent events. | ||
| 5907 | */ | ||
| 5908 | void perf_event_exit_task(struct task_struct *child) | ||
| 5909 | { | ||
| 5910 | int ctxn; | ||
| 5911 | |||
| 5912 | for_each_task_context_nr(ctxn) | ||
| 5913 | perf_event_exit_task_context(child, ctxn); | ||
| 5914 | } | ||
| 5915 | |||
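The exit, free and init paths now loop over a small per-task array of context pointers instead of the single perf_event_ctxp field. The iterator macro is defined elsewhere in the patch; conceptually it is nothing more than a bounded loop, roughly:

    /* Sketch of the assumed definition; the real one lives in this patch
     * outside the hunks shown here. */
    #define for_each_task_context_nr(ctxn)                          \
            for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)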
| 5502 | static void perf_free_event(struct perf_event *event, | 5916 | static void perf_free_event(struct perf_event *event, |
| 5503 | struct perf_event_context *ctx) | 5917 | struct perf_event_context *ctx) |
| 5504 | { | 5918 | { |
| @@ -5520,48 +5934,166 @@ static void perf_free_event(struct perf_event *event, | |||
| 5520 | 5934 | ||
| 5521 | /* | 5935 | /* |
| 5522 | * free an unexposed, unused context as created by inheritance by | 5936 | * free an unexposed, unused context as created by inheritance by |
| 5523 | * init_task below, used by fork() in case of fail. | 5937 | * perf_event_init_task below, used by fork() in case of fail. |
| 5524 | */ | 5938 | */ |
| 5525 | void perf_event_free_task(struct task_struct *task) | 5939 | void perf_event_free_task(struct task_struct *task) |
| 5526 | { | 5940 | { |
| 5527 | struct perf_event_context *ctx = task->perf_event_ctxp; | 5941 | struct perf_event_context *ctx; |
| 5528 | struct perf_event *event, *tmp; | 5942 | struct perf_event *event, *tmp; |
| 5943 | int ctxn; | ||
| 5529 | 5944 | ||
| 5530 | if (!ctx) | 5945 | for_each_task_context_nr(ctxn) { |
| 5531 | return; | 5946 | ctx = task->perf_event_ctxp[ctxn]; |
| 5947 | if (!ctx) | ||
| 5948 | continue; | ||
| 5532 | 5949 | ||
| 5533 | mutex_lock(&ctx->mutex); | 5950 | mutex_lock(&ctx->mutex); |
| 5534 | again: | 5951 | again: |
| 5535 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) | 5952 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, |
| 5536 | perf_free_event(event, ctx); | 5953 | group_entry) |
| 5954 | perf_free_event(event, ctx); | ||
| 5537 | 5955 | ||
| 5538 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, | 5956 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, |
| 5539 | group_entry) | 5957 | group_entry) |
| 5540 | perf_free_event(event, ctx); | 5958 | perf_free_event(event, ctx); |
| 5541 | 5959 | ||
| 5542 | if (!list_empty(&ctx->pinned_groups) || | 5960 | if (!list_empty(&ctx->pinned_groups) || |
| 5543 | !list_empty(&ctx->flexible_groups)) | 5961 | !list_empty(&ctx->flexible_groups)) |
| 5544 | goto again; | 5962 | goto again; |
| 5545 | 5963 | ||
| 5546 | mutex_unlock(&ctx->mutex); | 5964 | mutex_unlock(&ctx->mutex); |
| 5547 | 5965 | ||
| 5548 | put_ctx(ctx); | 5966 | put_ctx(ctx); |
| 5967 | } | ||
| 5968 | } | ||
| 5969 | |||
| 5970 | void perf_event_delayed_put(struct task_struct *task) | ||
| 5971 | { | ||
| 5972 | int ctxn; | ||
| 5973 | |||
| 5974 | for_each_task_context_nr(ctxn) | ||
| 5975 | WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); | ||
| 5976 | } | ||
| 5977 | |||
| 5978 | /* | ||
| 5979 | * inherit an event from parent task to child task: | ||

| 5980 | */ | ||
| 5981 | static struct perf_event * | ||
| 5982 | inherit_event(struct perf_event *parent_event, | ||
| 5983 | struct task_struct *parent, | ||
| 5984 | struct perf_event_context *parent_ctx, | ||
| 5985 | struct task_struct *child, | ||
| 5986 | struct perf_event *group_leader, | ||
| 5987 | struct perf_event_context *child_ctx) | ||
| 5988 | { | ||
| 5989 | struct perf_event *child_event; | ||
| 5990 | unsigned long flags; | ||
| 5991 | |||
| 5992 | /* | ||
| 5993 | * Instead of creating recursive hierarchies of events, | ||
| 5994 | * we link inherited events back to the original parent, | ||
| 5995 | * which has a filp for sure, which we use as the reference | ||
| 5996 | * count: | ||
| 5997 | */ | ||
| 5998 | if (parent_event->parent) | ||
| 5999 | parent_event = parent_event->parent; | ||
| 6000 | |||
| 6001 | child_event = perf_event_alloc(&parent_event->attr, | ||
| 6002 | parent_event->cpu, | ||
| 6003 | child, | ||
| 6004 | group_leader, parent_event, | ||
| 6005 | NULL); | ||
| 6006 | if (IS_ERR(child_event)) | ||
| 6007 | return child_event; | ||
| 6008 | get_ctx(child_ctx); | ||
| 6009 | |||
| 6010 | /* | ||
| 6011 | * Make the child state follow the state of the parent event, | ||
| 6012 | * not its attr.disabled bit. We hold the parent's mutex, | ||
| 6013 | * so we won't race with perf_event_{en, dis}able_family. | ||
| 6014 | */ | ||
| 6015 | if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) | ||
| 6016 | child_event->state = PERF_EVENT_STATE_INACTIVE; | ||
| 6017 | else | ||
| 6018 | child_event->state = PERF_EVENT_STATE_OFF; | ||
| 6019 | |||
| 6020 | if (parent_event->attr.freq) { | ||
| 6021 | u64 sample_period = parent_event->hw.sample_period; | ||
| 6022 | struct hw_perf_event *hwc = &child_event->hw; | ||
| 6023 | |||
| 6024 | hwc->sample_period = sample_period; | ||
| 6025 | hwc->last_period = sample_period; | ||
| 6026 | |||
| 6027 | local64_set(&hwc->period_left, sample_period); | ||
| 6028 | } | ||
| 6029 | |||
| 6030 | child_event->ctx = child_ctx; | ||
| 6031 | child_event->overflow_handler = parent_event->overflow_handler; | ||
| 6032 | |||
| 6033 | /* | ||
| 6034 | * Link it up in the child's context: | ||
| 6035 | */ | ||
| 6036 | raw_spin_lock_irqsave(&child_ctx->lock, flags); | ||
| 6037 | add_event_to_ctx(child_event, child_ctx); | ||
| 6038 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); | ||
| 6039 | |||
| 6040 | /* | ||
| 6041 | * Get a reference to the parent filp - we will fput it | ||
| 6042 | * when the child event exits. This is safe to do because | ||
| 6043 | * we are in the parent and we know that the filp still | ||
| 6044 | * exists and has a nonzero count: | ||
| 6045 | */ | ||
| 6046 | atomic_long_inc(&parent_event->filp->f_count); | ||
| 6047 | |||
| 6048 | /* | ||
| 6049 | * Link this into the parent event's child list | ||
| 6050 | */ | ||
| 6051 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); | ||
| 6052 | mutex_lock(&parent_event->child_mutex); | ||
| 6053 | list_add_tail(&child_event->child_list, &parent_event->child_list); | ||
| 6054 | mutex_unlock(&parent_event->child_mutex); | ||
| 6055 | |||
| 6056 | return child_event; | ||
| 6057 | } | ||
| 6058 | |||
| 6059 | static int inherit_group(struct perf_event *parent_event, | ||
| 6060 | struct task_struct *parent, | ||
| 6061 | struct perf_event_context *parent_ctx, | ||
| 6062 | struct task_struct *child, | ||
| 6063 | struct perf_event_context *child_ctx) | ||
| 6064 | { | ||
| 6065 | struct perf_event *leader; | ||
| 6066 | struct perf_event *sub; | ||
| 6067 | struct perf_event *child_ctr; | ||
| 6068 | |||
| 6069 | leader = inherit_event(parent_event, parent, parent_ctx, | ||
| 6070 | child, NULL, child_ctx); | ||
| 6071 | if (IS_ERR(leader)) | ||
| 6072 | return PTR_ERR(leader); | ||
| 6073 | list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { | ||
| 6074 | child_ctr = inherit_event(sub, parent, parent_ctx, | ||
| 6075 | child, leader, child_ctx); | ||
| 6076 | if (IS_ERR(child_ctr)) | ||
| 6077 | return PTR_ERR(child_ctr); | ||
| 6078 | } | ||
| 6079 | return 0; | ||
| 5549 | } | 6080 | } |
| 5550 | 6081 | ||
| 5551 | static int | 6082 | static int |
| 5552 | inherit_task_group(struct perf_event *event, struct task_struct *parent, | 6083 | inherit_task_group(struct perf_event *event, struct task_struct *parent, |
| 5553 | struct perf_event_context *parent_ctx, | 6084 | struct perf_event_context *parent_ctx, |
| 5554 | struct task_struct *child, | 6085 | struct task_struct *child, int ctxn, |
| 5555 | int *inherited_all) | 6086 | int *inherited_all) |
| 5556 | { | 6087 | { |
| 5557 | int ret; | 6088 | int ret; |
| 5558 | struct perf_event_context *child_ctx = child->perf_event_ctxp; | 6089 | struct perf_event_context *child_ctx; |
| 5559 | 6090 | ||
| 5560 | if (!event->attr.inherit) { | 6091 | if (!event->attr.inherit) { |
| 5561 | *inherited_all = 0; | 6092 | *inherited_all = 0; |
| 5562 | return 0; | 6093 | return 0; |
| 5563 | } | 6094 | } |
| 5564 | 6095 | ||
| 6096 | child_ctx = child->perf_event_ctxp[ctxn]; | ||
| 5565 | if (!child_ctx) { | 6097 | if (!child_ctx) { |
| 5566 | /* | 6098 | /* |
| 5567 | * This is executed from the parent task context, so | 6099 | * This is executed from the parent task context, so |
| @@ -5570,14 +6102,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, | |||
| 5570 | * child. | 6102 | * child. |
| 5571 | */ | 6103 | */ |
| 5572 | 6104 | ||
| 5573 | child_ctx = kzalloc(sizeof(struct perf_event_context), | 6105 | child_ctx = alloc_perf_context(event->pmu, child); |
| 5574 | GFP_KERNEL); | ||
| 5575 | if (!child_ctx) | 6106 | if (!child_ctx) |
| 5576 | return -ENOMEM; | 6107 | return -ENOMEM; |
| 5577 | 6108 | ||
| 5578 | __perf_event_init_context(child_ctx, child); | 6109 | child->perf_event_ctxp[ctxn] = child_ctx; |
| 5579 | child->perf_event_ctxp = child_ctx; | ||
| 5580 | get_task_struct(child); | ||
| 5581 | } | 6110 | } |
| 5582 | 6111 | ||
| 5583 | ret = inherit_group(event, parent, parent_ctx, | 6112 | ret = inherit_group(event, parent, parent_ctx, |
| @@ -5589,11 +6118,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, | |||
| 5589 | return ret; | 6118 | return ret; |
| 5590 | } | 6119 | } |
| 5591 | 6120 | ||
| 5592 | |||
| 5593 | /* | 6121 | /* |
| 5594 | * Initialize the perf_event context in task_struct | 6122 | * Initialize the perf_event context in task_struct |
| 5595 | */ | 6123 | */ |
| 5596 | int perf_event_init_task(struct task_struct *child) | 6124 | int perf_event_init_context(struct task_struct *child, int ctxn) |
| 5597 | { | 6125 | { |
| 5598 | struct perf_event_context *child_ctx, *parent_ctx; | 6126 | struct perf_event_context *child_ctx, *parent_ctx; |
| 5599 | struct perf_event_context *cloned_ctx; | 6127 | struct perf_event_context *cloned_ctx; |
| @@ -5602,19 +6130,19 @@ int perf_event_init_task(struct task_struct *child) | |||
| 5602 | int inherited_all = 1; | 6130 | int inherited_all = 1; |
| 5603 | int ret = 0; | 6131 | int ret = 0; |
| 5604 | 6132 | ||
| 5605 | child->perf_event_ctxp = NULL; | 6133 | child->perf_event_ctxp[ctxn] = NULL; |
| 5606 | 6134 | ||
| 5607 | mutex_init(&child->perf_event_mutex); | 6135 | mutex_init(&child->perf_event_mutex); |
| 5608 | INIT_LIST_HEAD(&child->perf_event_list); | 6136 | INIT_LIST_HEAD(&child->perf_event_list); |
| 5609 | 6137 | ||
| 5610 | if (likely(!parent->perf_event_ctxp)) | 6138 | if (likely(!parent->perf_event_ctxp[ctxn])) |
| 5611 | return 0; | 6139 | return 0; |
| 5612 | 6140 | ||
| 5613 | /* | 6141 | /* |
| 5614 | * If the parent's context is a clone, pin it so it won't get | 6142 | * If the parent's context is a clone, pin it so it won't get |
| 5615 | * swapped under us. | 6143 | * swapped under us. |
| 5616 | */ | 6144 | */ |
| 5617 | parent_ctx = perf_pin_task_context(parent); | 6145 | parent_ctx = perf_pin_task_context(parent, ctxn); |
| 5618 | 6146 | ||
| 5619 | /* | 6147 | /* |
| 5620 | * No need to check if parent_ctx != NULL here; since we saw | 6148 | * No need to check if parent_ctx != NULL here; since we saw |
| @@ -5634,20 +6162,20 @@ int perf_event_init_task(struct task_struct *child) | |||
| 5634 | * the list, not manipulating it: | 6162 | * the list, not manipulating it: |
| 5635 | */ | 6163 | */ |
| 5636 | list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { | 6164 | list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { |
| 5637 | ret = inherit_task_group(event, parent, parent_ctx, child, | 6165 | ret = inherit_task_group(event, parent, parent_ctx, |
| 5638 | &inherited_all); | 6166 | child, ctxn, &inherited_all); |
| 5639 | if (ret) | 6167 | if (ret) |
| 5640 | break; | 6168 | break; |
| 5641 | } | 6169 | } |
| 5642 | 6170 | ||
| 5643 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { | 6171 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { |
| 5644 | ret = inherit_task_group(event, parent, parent_ctx, child, | 6172 | ret = inherit_task_group(event, parent, parent_ctx, |
| 5645 | &inherited_all); | 6173 | child, ctxn, &inherited_all); |
| 5646 | if (ret) | 6174 | if (ret) |
| 5647 | break; | 6175 | break; |
| 5648 | } | 6176 | } |
| 5649 | 6177 | ||
| 5650 | child_ctx = child->perf_event_ctxp; | 6178 | child_ctx = child->perf_event_ctxp[ctxn]; |
| 5651 | 6179 | ||
| 5652 | if (child_ctx && inherited_all) { | 6180 | if (child_ctx && inherited_all) { |
| 5653 | /* | 6181 | /* |
| @@ -5676,63 +6204,98 @@ int perf_event_init_task(struct task_struct *child) | |||
| 5676 | return ret; | 6204 | return ret; |
| 5677 | } | 6205 | } |
| 5678 | 6206 | ||
| 6207 | /* | ||
| 6208 | * Initialize the perf_event context in task_struct | ||
| 6209 | */ | ||
| 6210 | int perf_event_init_task(struct task_struct *child) | ||
| 6211 | { | ||
| 6212 | int ctxn, ret; | ||
| 6213 | |||
| 6214 | for_each_task_context_nr(ctxn) { | ||
| 6215 | ret = perf_event_init_context(child, ctxn); | ||
| 6216 | if (ret) | ||
| 6217 | return ret; | ||
| 6218 | } | ||
| 6219 | |||
| 6220 | return 0; | ||
| 6221 | } | ||
| 6222 | |||
| 5679 | static void __init perf_event_init_all_cpus(void) | 6223 | static void __init perf_event_init_all_cpus(void) |
| 5680 | { | 6224 | { |
| 6225 | struct swevent_htable *swhash; | ||
| 5681 | int cpu; | 6226 | int cpu; |
| 5682 | struct perf_cpu_context *cpuctx; | ||
| 5683 | 6227 | ||
| 5684 | for_each_possible_cpu(cpu) { | 6228 | for_each_possible_cpu(cpu) { |
| 5685 | cpuctx = &per_cpu(perf_cpu_context, cpu); | 6229 | swhash = &per_cpu(swevent_htable, cpu); |
| 5686 | mutex_init(&cpuctx->hlist_mutex); | 6230 | mutex_init(&swhash->hlist_mutex); |
| 5687 | __perf_event_init_context(&cpuctx->ctx, NULL); | 6231 | INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); |
| 5688 | } | 6232 | } |
| 5689 | } | 6233 | } |
| 5690 | 6234 | ||
| 5691 | static void __cpuinit perf_event_init_cpu(int cpu) | 6235 | static void __cpuinit perf_event_init_cpu(int cpu) |
| 5692 | { | 6236 | { |
| 5693 | struct perf_cpu_context *cpuctx; | 6237 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
| 5694 | 6238 | ||
| 5695 | cpuctx = &per_cpu(perf_cpu_context, cpu); | 6239 | mutex_lock(&swhash->hlist_mutex); |
| 5696 | 6240 | if (swhash->hlist_refcount > 0) { | |
| 5697 | spin_lock(&perf_resource_lock); | ||
| 5698 | cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; | ||
| 5699 | spin_unlock(&perf_resource_lock); | ||
| 5700 | |||
| 5701 | mutex_lock(&cpuctx->hlist_mutex); | ||
| 5702 | if (cpuctx->hlist_refcount > 0) { | ||
| 5703 | struct swevent_hlist *hlist; | 6241 | struct swevent_hlist *hlist; |
| 5704 | 6242 | ||
| 5705 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); | 6243 | hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); |
| 5706 | WARN_ON_ONCE(!hlist); | 6244 | WARN_ON(!hlist); |
| 5707 | rcu_assign_pointer(cpuctx->swevent_hlist, hlist); | 6245 | rcu_assign_pointer(swhash->swevent_hlist, hlist); |
| 5708 | } | 6246 | } |
| 5709 | mutex_unlock(&cpuctx->hlist_mutex); | 6247 | mutex_unlock(&swhash->hlist_mutex); |
| 5710 | } | 6248 | } |
| 5711 | 6249 | ||
| 5712 | #ifdef CONFIG_HOTPLUG_CPU | 6250 | #ifdef CONFIG_HOTPLUG_CPU |
| 5713 | static void __perf_event_exit_cpu(void *info) | 6251 | static void perf_pmu_rotate_stop(struct pmu *pmu) |
| 5714 | { | 6252 | { |
| 5715 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 6253 | struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
| 5716 | struct perf_event_context *ctx = &cpuctx->ctx; | 6254 | |
| 6255 | WARN_ON(!irqs_disabled()); | ||
| 6256 | |||
| 6257 | list_del_init(&cpuctx->rotation_list); | ||
| 6258 | } | ||
| 6259 | |||
| 6260 | static void __perf_event_exit_context(void *__info) | ||
| 6261 | { | ||
| 6262 | struct perf_event_context *ctx = __info; | ||
| 5717 | struct perf_event *event, *tmp; | 6263 | struct perf_event *event, *tmp; |
| 5718 | 6264 | ||
| 6265 | perf_pmu_rotate_stop(ctx->pmu); | ||
| 6266 | |||
| 5719 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) | 6267 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) |
| 5720 | __perf_event_remove_from_context(event); | 6268 | __perf_event_remove_from_context(event); |
| 5721 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) | 6269 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) |
| 5722 | __perf_event_remove_from_context(event); | 6270 | __perf_event_remove_from_context(event); |
| 5723 | } | 6271 | } |
| 6272 | |||
| 6273 | static void perf_event_exit_cpu_context(int cpu) | ||
| 6274 | { | ||
| 6275 | struct perf_event_context *ctx; | ||
| 6276 | struct pmu *pmu; | ||
| 6277 | int idx; | ||
| 6278 | |||
| 6279 | idx = srcu_read_lock(&pmus_srcu); | ||
| 6280 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
| 6281 | ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; | ||
| 6282 | |||
| 6283 | mutex_lock(&ctx->mutex); | ||
| 6284 | smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); | ||
| 6285 | mutex_unlock(&ctx->mutex); | ||
| 6286 | } | ||
| 6287 | srcu_read_unlock(&pmus_srcu, idx); | ||
| 6288 | } | ||
| 6289 | |||
| 5724 | static void perf_event_exit_cpu(int cpu) | 6290 | static void perf_event_exit_cpu(int cpu) |
| 5725 | { | 6291 | { |
| 5726 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | 6292 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
| 5727 | struct perf_event_context *ctx = &cpuctx->ctx; | ||
| 5728 | 6293 | ||
| 5729 | mutex_lock(&cpuctx->hlist_mutex); | 6294 | mutex_lock(&swhash->hlist_mutex); |
| 5730 | swevent_hlist_release(cpuctx); | 6295 | swevent_hlist_release(swhash); |
| 5731 | mutex_unlock(&cpuctx->hlist_mutex); | 6296 | mutex_unlock(&swhash->hlist_mutex); |
| 5732 | 6297 | ||
| 5733 | mutex_lock(&ctx->mutex); | 6298 | perf_event_exit_cpu_context(cpu); |
| 5734 | smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); | ||
| 5735 | mutex_unlock(&ctx->mutex); | ||
| 5736 | } | 6299 | } |
| 5737 | #else | 6300 | #else |
| 5738 | static inline void perf_event_exit_cpu(int cpu) { } | 6301 | static inline void perf_event_exit_cpu(int cpu) { } |
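perf_event_exit_cpu_context() walks the registered-PMU list under SRCU rather than a plain lock, which lets it sleep in mutex_lock() and smp_call_function_single() while the list stays stable. The general SRCU read-side shape, shown with invented names, is:

    /* Illustrative only: my_srcu, my_list and struct item are invented. */
    struct item {
            struct list_head node;          /* inserted with list_add_rcu() */
            int value;
    };

    static struct srcu_struct my_srcu;      /* init_srcu_struct(&my_srcu) at boot */
    static LIST_HEAD(my_list);

    static int sum_items(void)
    {
            struct item *it;
            int idx, sum = 0;

            idx = srcu_read_lock(&my_srcu); /* readers may sleep, unlike rcu_read_lock() */
            list_for_each_entry_rcu(it, &my_list, node)
                    sum += it->value;
            srcu_read_unlock(&my_srcu, idx);
            return sum;
    }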
| @@ -5743,15 +6306,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
| 5743 | { | 6306 | { |
| 5744 | unsigned int cpu = (long)hcpu; | 6307 | unsigned int cpu = (long)hcpu; |
| 5745 | 6308 | ||
| 5746 | switch (action) { | 6309 | switch (action & ~CPU_TASKS_FROZEN) { |
| 5747 | 6310 | ||
| 5748 | case CPU_UP_PREPARE: | 6311 | case CPU_UP_PREPARE: |
| 5749 | case CPU_UP_PREPARE_FROZEN: | 6312 | case CPU_DOWN_FAILED: |
| 5750 | perf_event_init_cpu(cpu); | 6313 | perf_event_init_cpu(cpu); |
| 5751 | break; | 6314 | break; |
| 5752 | 6315 | ||
| 6316 | case CPU_UP_CANCELED: | ||
| 5753 | case CPU_DOWN_PREPARE: | 6317 | case CPU_DOWN_PREPARE: |
| 5754 | case CPU_DOWN_PREPARE_FROZEN: | ||
| 5755 | perf_event_exit_cpu(cpu); | 6318 | perf_event_exit_cpu(cpu); |
| 5756 | break; | 6319 | break; |
| 5757 | 6320 | ||
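Masking the action with ~CPU_TASKS_FROZEN folds each *_FROZEN suspend/resume variant into its ordinary case (the frozen constants are just the base value with the CPU_TASKS_FROZEN bit set), which is why the explicit *_FROZEN labels could be dropped above. A hypothetical notifier in the same style:

    static int my_cpu_notify(struct notifier_block *self,
                             unsigned long action, void *hcpu)
    {
            unsigned int cpu = (long)hcpu;

            switch (action & ~CPU_TASKS_FROZEN) {
            case CPU_UP_PREPARE:            /* also matches CPU_UP_PREPARE_FROZEN */
                    setup_my_percpu_state(cpu);     /* invented helper */
                    break;
            case CPU_DOWN_PREPARE:          /* also matches CPU_DOWN_PREPARE_FROZEN */
                    teardown_my_percpu_state(cpu);  /* invented helper */
                    break;
            }
            return NOTIFY_OK;
    }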
| @@ -5762,118 +6325,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
| 5762 | return NOTIFY_OK; | 6325 | return NOTIFY_OK; |
| 5763 | } | 6326 | } |
| 5764 | 6327 | ||
| 5765 | /* | ||
| 5766 | * This has to have a higher priority than migration_notifier in sched.c. | ||
| 5767 | */ | ||
| 5768 | static struct notifier_block __cpuinitdata perf_cpu_nb = { | ||
| 5769 | .notifier_call = perf_cpu_notify, | ||
| 5770 | .priority = 20, | ||
| 5771 | }; | ||
| 5772 | |||
| 5773 | void __init perf_event_init(void) | 6328 | void __init perf_event_init(void) |
| 5774 | { | 6329 | { |
| 5775 | perf_event_init_all_cpus(); | 6330 | perf_event_init_all_cpus(); |
| 5776 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, | 6331 | init_srcu_struct(&pmus_srcu); |
| 5777 | (void *)(long)smp_processor_id()); | 6332 | perf_pmu_register(&perf_swevent); |
| 5778 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, | 6333 | perf_pmu_register(&perf_cpu_clock); |
| 5779 | (void *)(long)smp_processor_id()); | 6334 | perf_pmu_register(&perf_task_clock); |
| 5780 | register_cpu_notifier(&perf_cpu_nb); | 6335 | perf_tp_register(); |
| 5781 | } | 6336 | perf_cpu_notifier(perf_cpu_notify); |
| 5782 | |||
| 5783 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, | ||
| 5784 | struct sysdev_class_attribute *attr, | ||
| 5785 | char *buf) | ||
| 5786 | { | ||
| 5787 | return sprintf(buf, "%d\n", perf_reserved_percpu); | ||
| 5788 | } | ||
| 5789 | |||
| 5790 | static ssize_t | ||
| 5791 | perf_set_reserve_percpu(struct sysdev_class *class, | ||
| 5792 | struct sysdev_class_attribute *attr, | ||
| 5793 | const char *buf, | ||
| 5794 | size_t count) | ||
| 5795 | { | ||
| 5796 | struct perf_cpu_context *cpuctx; | ||
| 5797 | unsigned long val; | ||
| 5798 | int err, cpu, mpt; | ||
| 5799 | |||
| 5800 | err = strict_strtoul(buf, 10, &val); | ||
| 5801 | if (err) | ||
| 5802 | return err; | ||
| 5803 | if (val > perf_max_events) | ||
| 5804 | return -EINVAL; | ||
| 5805 | |||
| 5806 | spin_lock(&perf_resource_lock); | ||
| 5807 | perf_reserved_percpu = val; | ||
| 5808 | for_each_online_cpu(cpu) { | ||
| 5809 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
| 5810 | raw_spin_lock_irq(&cpuctx->ctx.lock); | ||
| 5811 | mpt = min(perf_max_events - cpuctx->ctx.nr_events, | ||
| 5812 | perf_max_events - perf_reserved_percpu); | ||
| 5813 | cpuctx->max_pertask = mpt; | ||
| 5814 | raw_spin_unlock_irq(&cpuctx->ctx.lock); | ||
| 5815 | } | ||
| 5816 | spin_unlock(&perf_resource_lock); | ||
| 5817 | |||
| 5818 | return count; | ||
| 5819 | } | ||
| 5820 | |||
| 5821 | static ssize_t perf_show_overcommit(struct sysdev_class *class, | ||
| 5822 | struct sysdev_class_attribute *attr, | ||
| 5823 | char *buf) | ||
| 5824 | { | ||
| 5825 | return sprintf(buf, "%d\n", perf_overcommit); | ||
| 5826 | } | ||
| 5827 | |||
| 5828 | static ssize_t | ||
| 5829 | perf_set_overcommit(struct sysdev_class *class, | ||
| 5830 | struct sysdev_class_attribute *attr, | ||
| 5831 | const char *buf, size_t count) | ||
| 5832 | { | ||
| 5833 | unsigned long val; | ||
| 5834 | int err; | ||
| 5835 | |||
| 5836 | err = strict_strtoul(buf, 10, &val); | ||
| 5837 | if (err) | ||
| 5838 | return err; | ||
| 5839 | if (val > 1) | ||
| 5840 | return -EINVAL; | ||
| 5841 | |||
| 5842 | spin_lock(&perf_resource_lock); | ||
| 5843 | perf_overcommit = val; | ||
| 5844 | spin_unlock(&perf_resource_lock); | ||
| 5845 | |||
| 5846 | return count; | ||
| 5847 | } | ||
| 5848 | |||
| 5849 | static SYSDEV_CLASS_ATTR( | ||
| 5850 | reserve_percpu, | ||
| 5851 | 0644, | ||
| 5852 | perf_show_reserve_percpu, | ||
| 5853 | perf_set_reserve_percpu | ||
| 5854 | ); | ||
| 5855 | |||
| 5856 | static SYSDEV_CLASS_ATTR( | ||
| 5857 | overcommit, | ||
| 5858 | 0644, | ||
| 5859 | perf_show_overcommit, | ||
| 5860 | perf_set_overcommit | ||
| 5861 | ); | ||
| 5862 | |||
| 5863 | static struct attribute *perfclass_attrs[] = { | ||
| 5864 | &attr_reserve_percpu.attr, | ||
| 5865 | &attr_overcommit.attr, | ||
| 5866 | NULL | ||
| 5867 | }; | ||
| 5868 | |||
| 5869 | static struct attribute_group perfclass_attr_group = { | ||
| 5870 | .attrs = perfclass_attrs, | ||
| 5871 | .name = "perf_events", | ||
| 5872 | }; | ||
| 5873 | |||
| 5874 | static int __init perf_event_sysfs_init(void) | ||
| 5875 | { | ||
| 5876 | return sysfs_create_group(&cpu_sysdev_class.kset.kobj, | ||
| 5877 | &perfclass_attr_group); | ||
| 5878 | } | 6337 | } |
| 5879 | device_initcall(perf_event_sysfs_init); | ||
diff --git a/kernel/pid.c b/kernel/pid.c index d55c6fb8d087..39b65b69584f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) | |||
| 401 | struct task_struct *result = NULL; | 401 | struct task_struct *result = NULL; |
| 402 | if (pid) { | 402 | if (pid) { |
| 403 | struct hlist_node *first; | 403 | struct hlist_node *first; |
| 404 | first = rcu_dereference_check(pid->tasks[type].first, | 404 | first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), |
| 405 | rcu_read_lock_held() || | 405 | rcu_read_lock_held() || |
| 406 | lockdep_tasklist_lock_is_held()); | 406 | lockdep_tasklist_lock_is_held()); |
| 407 | if (first) | 407 | if (first) |
| @@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task); | |||
| 416 | */ | 416 | */ |
| 417 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) | 417 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) |
| 418 | { | 418 | { |
| 419 | rcu_lockdep_assert(rcu_read_lock_held()); | ||
| 419 | return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); | 420 | return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); |
| 420 | } | 421 | } |
| 421 | 422 | ||
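The new rcu_lockdep_assert() documents, and with CONFIG_PROVE_RCU enforces, the calling convention that was always implied: the returned task_struct is only protected while the RCU read-side critical section lasts. A typical caller therefore follows this pattern (illustrative; find_task_by_vpid() is the current-namespace wrapper around find_task_by_pid_ns()):

    struct task_struct *task;

    rcu_read_lock();
    task = find_task_by_vpid(pid);
    if (task)
            get_task_struct(task);          /* pin it beyond the RCU section */
    rcu_read_unlock();

    if (task) {
            /* ... operate on the task ... */
            put_task_struct(task);
    }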
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index b7e4c362361b..645e541a45f6 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c | |||
| @@ -389,10 +389,12 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | |||
| 389 | } else if (count == 11) { /* len('0x12345678/0') */ | 389 | } else if (count == 11) { /* len('0x12345678/0') */ |
| 390 | if (copy_from_user(ascii_value, buf, 11)) | 390 | if (copy_from_user(ascii_value, buf, 11)) |
| 391 | return -EFAULT; | 391 | return -EFAULT; |
| 392 | if (strlen(ascii_value) != 10) | ||
| 393 | return -EINVAL; | ||
| 392 | x = sscanf(ascii_value, "%x", &value); | 394 | x = sscanf(ascii_value, "%x", &value); |
| 393 | if (x != 1) | 395 | if (x != 1) |
| 394 | return -EINVAL; | 396 | return -EINVAL; |
| 395 | pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value); | 397 | pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); |
| 396 | } else | 398 | } else |
| 397 | return -EINVAL; | 399 | return -EINVAL; |
| 398 | 400 | ||
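With the added length check, the eleven-byte path accepts exactly "0x" plus eight hex digits followed by a terminating NUL (strlen == 10). From user space that corresponds to something like the sketch below; the /dev/cpu_dma_latency node and the keep-the-fd-open convention are the usual pm_qos interface, assumed here rather than introduced by this hunk.

    #include <fcntl.h>
    #include <unistd.h>

    int request_cpu_dma_latency(void)
    {
            int fd = open("/dev/cpu_dma_latency", O_WRONLY);

            if (fd < 0)
                    return -1;
            /* 10 visible characters plus '\0': the count == 11 case above. */
            if (write(fd, "0x000001f4", 11) != 11) {
                    close(fd);
                    return -1;
            }
            return fd;      /* the request stays active while fd remains open */
    }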
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c77963938bca..8dc31e02ae12 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -338,7 +338,6 @@ int hibernation_snapshot(int platform_mode) | |||
| 338 | goto Close; | 338 | goto Close; |
| 339 | 339 | ||
| 340 | suspend_console(); | 340 | suspend_console(); |
| 341 | hibernation_freeze_swap(); | ||
| 342 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | 341 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); |
| 343 | error = dpm_suspend_start(PMSG_FREEZE); | 342 | error = dpm_suspend_start(PMSG_FREEZE); |
| 344 | if (error) | 343 | if (error) |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5e7edfb05e66..d3f795f01bbc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -1086,7 +1086,6 @@ void swsusp_free(void) | |||
| 1086 | buffer = NULL; | 1086 | buffer = NULL; |
| 1087 | alloc_normal = 0; | 1087 | alloc_normal = 0; |
| 1088 | alloc_highmem = 0; | 1088 | alloc_highmem = 0; |
| 1089 | hibernation_thaw_swap(); | ||
| 1090 | } | 1089 | } |
| 1091 | 1090 | ||
| 1092 | /* Helper functions used for the shrinking of memory. */ | 1091 | /* Helper functions used for the shrinking of memory. */ |
| @@ -1122,9 +1121,19 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) | |||
| 1122 | return nr_alloc; | 1121 | return nr_alloc; |
| 1123 | } | 1122 | } |
| 1124 | 1123 | ||
| 1125 | static unsigned long preallocate_image_memory(unsigned long nr_pages) | 1124 | static unsigned long preallocate_image_memory(unsigned long nr_pages, |
| 1125 | unsigned long avail_normal) | ||
| 1126 | { | 1126 | { |
| 1127 | return preallocate_image_pages(nr_pages, GFP_IMAGE); | 1127 | unsigned long alloc; |
| 1128 | |||
| 1129 | if (avail_normal <= alloc_normal) | ||
| 1130 | return 0; | ||
| 1131 | |||
| 1132 | alloc = avail_normal - alloc_normal; | ||
| 1133 | if (nr_pages < alloc) | ||
| 1134 | alloc = nr_pages; | ||
| 1135 | |||
| 1136 | return preallocate_image_pages(alloc, GFP_IMAGE); | ||
| 1128 | } | 1137 | } |
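To make the new clamp concrete: if the normal zone offered avail_normal = 100,000 pages when preallocation started, alloc_normal already stands at 90,000, and the caller asks for another 25,000 pages, the function now hands back at most min(25,000, 100,000 - 90,000) = 10,000 pages. The caller in hibernate_preallocate_memory() then notices the shortfall (pages < alloc) and falls back to highmem instead of draining the normal zone further.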
| 1129 | 1138 | ||
| 1130 | #ifdef CONFIG_HIGHMEM | 1139 | #ifdef CONFIG_HIGHMEM |
| @@ -1170,15 +1179,22 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, | |||
| 1170 | */ | 1179 | */ |
| 1171 | static void free_unnecessary_pages(void) | 1180 | static void free_unnecessary_pages(void) |
| 1172 | { | 1181 | { |
| 1173 | unsigned long save_highmem, to_free_normal, to_free_highmem; | 1182 | unsigned long save, to_free_normal, to_free_highmem; |
| 1174 | 1183 | ||
| 1175 | to_free_normal = alloc_normal - count_data_pages(); | 1184 | save = count_data_pages(); |
| 1176 | save_highmem = count_highmem_pages(); | 1185 | if (alloc_normal >= save) { |
| 1177 | if (alloc_highmem > save_highmem) { | 1186 | to_free_normal = alloc_normal - save; |
| 1178 | to_free_highmem = alloc_highmem - save_highmem; | 1187 | save = 0; |
| 1188 | } else { | ||
| 1189 | to_free_normal = 0; | ||
| 1190 | save -= alloc_normal; | ||
| 1191 | } | ||
| 1192 | save += count_highmem_pages(); | ||
| 1193 | if (alloc_highmem >= save) { | ||
| 1194 | to_free_highmem = alloc_highmem - save; | ||
| 1179 | } else { | 1195 | } else { |
| 1180 | to_free_highmem = 0; | 1196 | to_free_highmem = 0; |
| 1181 | to_free_normal -= save_highmem - alloc_highmem; | 1197 | to_free_normal -= save - alloc_highmem; |
| 1182 | } | 1198 | } |
| 1183 | 1199 | ||
| 1184 | memory_bm_position_reset(©_bm); | 1200 | memory_bm_position_reset(©_bm); |
| @@ -1259,7 +1275,7 @@ int hibernate_preallocate_memory(void) | |||
| 1259 | { | 1275 | { |
| 1260 | struct zone *zone; | 1276 | struct zone *zone; |
| 1261 | unsigned long saveable, size, max_size, count, highmem, pages = 0; | 1277 | unsigned long saveable, size, max_size, count, highmem, pages = 0; |
| 1262 | unsigned long alloc, save_highmem, pages_highmem; | 1278 | unsigned long alloc, save_highmem, pages_highmem, avail_normal; |
| 1263 | struct timeval start, stop; | 1279 | struct timeval start, stop; |
| 1264 | int error; | 1280 | int error; |
| 1265 | 1281 | ||
| @@ -1296,6 +1312,7 @@ int hibernate_preallocate_memory(void) | |||
| 1296 | else | 1312 | else |
| 1297 | count += zone_page_state(zone, NR_FREE_PAGES); | 1313 | count += zone_page_state(zone, NR_FREE_PAGES); |
| 1298 | } | 1314 | } |
| 1315 | avail_normal = count; | ||
| 1299 | count += highmem; | 1316 | count += highmem; |
| 1300 | count -= totalreserve_pages; | 1317 | count -= totalreserve_pages; |
| 1301 | 1318 | ||
| @@ -1310,12 +1327,21 @@ int hibernate_preallocate_memory(void) | |||
| 1310 | */ | 1327 | */ |
| 1311 | if (size >= saveable) { | 1328 | if (size >= saveable) { |
| 1312 | pages = preallocate_image_highmem(save_highmem); | 1329 | pages = preallocate_image_highmem(save_highmem); |
| 1313 | pages += preallocate_image_memory(saveable - pages); | 1330 | pages += preallocate_image_memory(saveable - pages, avail_normal); |
| 1314 | goto out; | 1331 | goto out; |
| 1315 | } | 1332 | } |
| 1316 | 1333 | ||
| 1317 | /* Estimate the minimum size of the image. */ | 1334 | /* Estimate the minimum size of the image. */ |
| 1318 | pages = minimum_image_size(saveable); | 1335 | pages = minimum_image_size(saveable); |
| 1336 | /* | ||
| 1337 | * To avoid excessive pressure on the normal zone, leave room in it to | ||
| 1338 | * accommodate an image of the minimum size (unless it's already too | ||
| 1339 | * small, in which case don't preallocate pages from it at all). | ||
| 1340 | */ | ||
| 1341 | if (avail_normal > pages) | ||
| 1342 | avail_normal -= pages; | ||
| 1343 | else | ||
| 1344 | avail_normal = 0; | ||
| 1319 | if (size < pages) | 1345 | if (size < pages) |
| 1320 | size = min_t(unsigned long, pages, max_size); | 1346 | size = min_t(unsigned long, pages, max_size); |
| 1321 | 1347 | ||
| @@ -1336,16 +1362,34 @@ int hibernate_preallocate_memory(void) | |||
| 1336 | */ | 1362 | */ |
| 1337 | pages_highmem = preallocate_image_highmem(highmem / 2); | 1363 | pages_highmem = preallocate_image_highmem(highmem / 2); |
| 1338 | alloc = (count - max_size) - pages_highmem; | 1364 | alloc = (count - max_size) - pages_highmem; |
| 1339 | pages = preallocate_image_memory(alloc); | 1365 | pages = preallocate_image_memory(alloc, avail_normal); |
| 1340 | if (pages < alloc) | 1366 | if (pages < alloc) { |
| 1341 | goto err_out; | 1367 | /* We have exhausted non-highmem pages, try highmem. */ |
| 1342 | size = max_size - size; | 1368 | alloc -= pages; |
| 1343 | alloc = size; | 1369 | pages += pages_highmem; |
| 1344 | size = preallocate_highmem_fraction(size, highmem, count); | 1370 | pages_highmem = preallocate_image_highmem(alloc); |
| 1345 | pages_highmem += size; | 1371 | if (pages_highmem < alloc) |
| 1346 | alloc -= size; | 1372 | goto err_out; |
| 1347 | pages += preallocate_image_memory(alloc); | 1373 | pages += pages_highmem; |
| 1348 | pages += pages_highmem; | 1374 | /* |
| 1375 | * size is the desired number of saveable pages to leave in | ||
| 1376 | * memory, so try to preallocate (all memory - size) pages. | ||
| 1377 | */ | ||
| 1378 | alloc = (count - pages) - size; | ||
| 1379 | pages += preallocate_image_highmem(alloc); | ||
| 1380 | } else { | ||
| 1381 | /* | ||
| 1382 | * There are approximately max_size saveable pages at this point | ||
| 1383 | * and we want to reduce this number down to size. | ||
| 1384 | */ | ||
| 1385 | alloc = max_size - size; | ||
| 1386 | size = preallocate_highmem_fraction(alloc, highmem, count); | ||
| 1387 | pages_highmem += size; | ||
| 1388 | alloc -= size; | ||
| 1389 | size = preallocate_image_memory(alloc, avail_normal); | ||
| 1390 | pages_highmem += preallocate_image_highmem(alloc - size); | ||
| 1391 | pages += pages_highmem + size; | ||
| 1392 | } | ||
| 1349 | 1393 | ||
| 1350 | /* | 1394 | /* |
| 1351 | * We only need as many page frames for the image as there are saveable | 1395 | * We only need as many page frames for the image as there are saveable |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 5d0059eed3e4..e6a5bdf61a37 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
| @@ -136,10 +136,10 @@ sector_t alloc_swapdev_block(int swap) | |||
| 136 | { | 136 | { |
| 137 | unsigned long offset; | 137 | unsigned long offset; |
| 138 | 138 | ||
| 139 | offset = swp_offset(get_swap_for_hibernation(swap)); | 139 | offset = swp_offset(get_swap_page_of_type(swap)); |
| 140 | if (offset) { | 140 | if (offset) { |
| 141 | if (swsusp_extents_insert(offset)) | 141 | if (swsusp_extents_insert(offset)) |
| 142 | swap_free_for_hibernation(swp_entry(swap, offset)); | 142 | swap_free(swp_entry(swap, offset)); |
| 143 | else | 143 | else |
| 144 | return swapdev_block(swap, offset); | 144 | return swapdev_block(swap, offset); |
| 145 | } | 145 | } |
| @@ -163,7 +163,7 @@ void free_all_swap_pages(int swap) | |||
| 163 | ext = container_of(node, struct swsusp_extent, node); | 163 | ext = container_of(node, struct swsusp_extent, node); |
| 164 | rb_erase(node, &swsusp_extents); | 164 | rb_erase(node, &swsusp_extents); |
| 165 | for (offset = ext->start; offset <= ext->end; offset++) | 165 | for (offset = ext->start; offset <= ext->end; offset++) |
| 166 | swap_free_for_hibernation(swp_entry(swap, offset)); | 166 | swap_free(swp_entry(swap, offset)); |
| 167 | 167 | ||
| 168 | kfree(ext); | 168 | kfree(ext); |
| 169 | } | 169 | } |
diff --git a/kernel/printk.c b/kernel/printk.c index 8fe465ac008a..2531017795f6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
| @@ -85,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress); | |||
| 85 | * provides serialisation for access to the entire console | 85 | * provides serialisation for access to the entire console |
| 86 | * driver system. | 86 | * driver system. |
| 87 | */ | 87 | */ |
| 88 | static DECLARE_MUTEX(console_sem); | 88 | static DEFINE_SEMAPHORE(console_sem); |
| 89 | struct console *console_drivers; | 89 | struct console *console_drivers; |
| 90 | EXPORT_SYMBOL_GPL(console_drivers); | 90 | EXPORT_SYMBOL_GPL(console_drivers); |
| 91 | 91 | ||
| @@ -556,7 +556,7 @@ static void zap_locks(void) | |||
| 556 | /* If a crash is occurring, make sure we can't deadlock */ | 556 | /* If a crash is occurring, make sure we can't deadlock */ |
| 557 | spin_lock_init(&logbuf_lock); | 557 | spin_lock_init(&logbuf_lock); |
| 558 | /* And make sure that we print immediately */ | 558 | /* And make sure that we print immediately */ |
| 559 | init_MUTEX(&console_sem); | 559 | sema_init(&console_sem, 1); |
| 560 | } | 560 | } |
| 561 | 561 | ||
| 562 | #if defined(CONFIG_PRINTK_TIME) | 562 | #if defined(CONFIG_PRINTK_TIME) |
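DECLARE_MUTEX()/init_MUTEX() were the old names for a count-one semaphore and are being retired; DEFINE_SEMAPHORE(name) produces the same thing under an honest name, and sema_init(&sem, 1) is the runtime equivalent used by zap_locks(). A generic sketch of the replacement idiom:

    #include <linux/semaphore.h>

    static DEFINE_SEMAPHORE(my_sem);        /* binary semaphore, count starts at 1 */

    static void touch_shared_state(void)
    {
            down(&my_sem);                  /* may sleep */
            /* ... critical section ... */
            up(&my_sem);
    }

    static void emergency_reinit(void)
    {
            sema_init(&my_sem, 1);          /* runtime re-initialisation */
    }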
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 4d169835fb36..a23a57a976d1 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
| @@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void) | |||
| 73 | EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); | 73 | EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); |
| 74 | 74 | ||
| 75 | /** | 75 | /** |
| 76 | * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? | 76 | * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? |
| 77 | * | 77 | * |
| 78 | * Check for bottom half being disabled, which covers both the | 78 | * Check for bottom half being disabled, which covers both the |
| 79 | * CONFIG_PROVE_RCU and not cases. Note that if someone uses | 79 | * CONFIG_PROVE_RCU and not cases. Note that if someone uses |
| 80 | * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) | 80 | * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) |
| 81 | * will show the situation. | 81 | * will show the situation. This is useful for debug checks in functions |
| 82 | * that require that they be called within an RCU read-side critical | ||
| 83 | * section. | ||
| 82 | * | 84 | * |
| 83 | * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. | 85 | * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. |
| 84 | */ | 86 | */ |
| @@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void) | |||
| 86 | { | 88 | { |
| 87 | if (!debug_lockdep_rcu_enabled()) | 89 | if (!debug_lockdep_rcu_enabled()) |
| 88 | return 1; | 90 | return 1; |
| 89 | return in_softirq(); | 91 | return in_softirq() || irqs_disabled(); |
| 90 | } | 92 | } |
| 91 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); | 93 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); |
| 92 | 94 | ||
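The stronger return value matters because this helper is mostly consumed as the lockdep condition of rcu_dereference_check()-style accessors: disabling interrupts also blocks softirq processing, so it implies RCU-bh read-side protection. A hedged usage sketch, with gp standing for some RCU-bh-protected pointer and use_foo() an invented consumer:

    struct foo *p;

    /* Under CONFIG_PROVE_RCU this splats unless we are in an RCU-bh
     * read-side section or, after this change, have interrupts disabled. */
    p = rcu_dereference_check(gp, rcu_read_lock_bh_held());
    if (p)
            use_foo(p);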
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 196ec02f8be0..d806735342ac 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
| @@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly; | |||
| 59 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | 59 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); |
| 60 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 60 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
| 61 | 61 | ||
| 62 | /* Forward declarations for rcutiny_plugin.h. */ | ||
| 63 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); | ||
| 64 | static void __call_rcu(struct rcu_head *head, | ||
| 65 | void (*func)(struct rcu_head *rcu), | ||
| 66 | struct rcu_ctrlblk *rcp); | ||
| 67 | |||
| 68 | #include "rcutiny_plugin.h" | ||
| 69 | |||
| 62 | #ifdef CONFIG_NO_HZ | 70 | #ifdef CONFIG_NO_HZ |
| 63 | 71 | ||
| 64 | static long rcu_dynticks_nesting = 1; | 72 | static long rcu_dynticks_nesting = 1; |
| @@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user) | |||
| 140 | rcu_sched_qs(cpu); | 148 | rcu_sched_qs(cpu); |
| 141 | else if (!in_softirq()) | 149 | else if (!in_softirq()) |
| 142 | rcu_bh_qs(cpu); | 150 | rcu_bh_qs(cpu); |
| 151 | rcu_preempt_check_callbacks(); | ||
| 143 | } | 152 | } |
| 144 | 153 | ||
| 145 | /* | 154 | /* |
| @@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
| 162 | *rcp->donetail = NULL; | 171 | *rcp->donetail = NULL; |
| 163 | if (rcp->curtail == rcp->donetail) | 172 | if (rcp->curtail == rcp->donetail) |
| 164 | rcp->curtail = &rcp->rcucblist; | 173 | rcp->curtail = &rcp->rcucblist; |
| 174 | rcu_preempt_remove_callbacks(rcp); | ||
| 165 | rcp->donetail = &rcp->rcucblist; | 175 | rcp->donetail = &rcp->rcucblist; |
| 166 | local_irq_restore(flags); | 176 | local_irq_restore(flags); |
| 167 | 177 | ||
| @@ -182,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
| 182 | { | 192 | { |
| 183 | __rcu_process_callbacks(&rcu_sched_ctrlblk); | 193 | __rcu_process_callbacks(&rcu_sched_ctrlblk); |
| 184 | __rcu_process_callbacks(&rcu_bh_ctrlblk); | 194 | __rcu_process_callbacks(&rcu_bh_ctrlblk); |
| 195 | rcu_preempt_process_callbacks(); | ||
| 185 | } | 196 | } |
| 186 | 197 | ||
| 187 | /* | 198 | /* |
| @@ -223,15 +234,15 @@ static void __call_rcu(struct rcu_head *head, | |||
| 223 | } | 234 | } |
| 224 | 235 | ||
| 225 | /* | 236 | /* |
| 226 | * Post an RCU callback to be invoked after the end of an RCU grace | 237 | * Post an RCU callback to be invoked after the end of an RCU-sched grace |
| 227 | * period. But since we have but one CPU, that would be after any | 238 | * period. But since we have but one CPU, that would be after any |
| 228 | * quiescent state. | 239 | * quiescent state. |
| 229 | */ | 240 | */ |
| 230 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 241 | void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
| 231 | { | 242 | { |
| 232 | __call_rcu(head, func, &rcu_sched_ctrlblk); | 243 | __call_rcu(head, func, &rcu_sched_ctrlblk); |
| 233 | } | 244 | } |
| 234 | EXPORT_SYMBOL_GPL(call_rcu); | 245 | EXPORT_SYMBOL_GPL(call_rcu_sched); |
| 235 | 246 | ||
| 236 | /* | 247 | /* |
| 237 | * Post an RCU bottom-half callback to be invoked after any subsequent | 248 | * Post an RCU bottom-half callback to be invoked after any subsequent |
| @@ -243,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
| 243 | } | 254 | } |
| 244 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 255 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
| 245 | 256 | ||
| 246 | void rcu_barrier(void) | ||
| 247 | { | ||
| 248 | struct rcu_synchronize rcu; | ||
| 249 | |||
| 250 | init_rcu_head_on_stack(&rcu.head); | ||
| 251 | init_completion(&rcu.completion); | ||
| 252 | /* Will wake me after RCU finished. */ | ||
| 253 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
| 254 | /* Wait for it. */ | ||
| 255 | wait_for_completion(&rcu.completion); | ||
| 256 | destroy_rcu_head_on_stack(&rcu.head); | ||
| 257 | } | ||
| 258 | EXPORT_SYMBOL_GPL(rcu_barrier); | ||
| 259 | |||
| 260 | void rcu_barrier_bh(void) | 257 | void rcu_barrier_bh(void) |
| 261 | { | 258 | { |
| 262 | struct rcu_synchronize rcu; | 259 | struct rcu_synchronize rcu; |
| @@ -289,5 +286,3 @@ void __init rcu_init(void) | |||
| 289 | { | 286 | { |
| 290 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 287 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
| 291 | } | 288 | } |
| 292 | |||
| 293 | #include "rcutiny_plugin.h" | ||
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index d223a92bc742..6ceca4f745ff 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Read-Copy Update mechanism for mutual exclusion (tree-based version) | 2 | * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition |
| 3 | * Internal non-public definitions that provide either classic | 3 | * Internal non-public definitions that provide either classic |
| 4 | * or preemptable semantics. | 4 | * or preemptible semantics. |
| 5 | * | 5 | * |
| 6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
| 7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
| @@ -17,11 +17,587 @@ | |||
| 17 | * along with this program; if not, write to the Free Software | 17 | * along with this program; if not, write to the Free Software |
| 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
| 19 | * | 19 | * |
| 20 | * Copyright IBM Corporation, 2009 | 20 | * Copyright (c) 2010 Linaro |
| 21 | * | 21 | * |
| 22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> |
| 23 | */ | 23 | */ |
| 24 | 24 | ||
| 25 | #ifdef CONFIG_TINY_PREEMPT_RCU | ||
| 26 | |||
| 27 | #include <linux/delay.h> | ||
| 28 | |||
| 29 | /* Global control variables for preemptible RCU. */ | ||
| 30 | struct rcu_preempt_ctrlblk { | ||
| 31 | struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */ | ||
| 32 | struct rcu_head **nexttail; | ||
| 33 | /* Tasks blocked in a preemptible RCU */ | ||
| 34 | /* read-side critical section while a */ | ||
| 35 | /* preemptible-RCU grace period is in */ | ||
| 36 | /* progress must wait for a later grace */ | ||
| 37 | /* period. This pointer points to the */ | ||
| 38 | /* ->next pointer of the last task that */ | ||
| 39 | /* must wait for a later grace period, or */ | ||
| 40 | /* to &->rcb.rcucblist if there is no */ | ||
| 41 | /* such task. */ | ||
| 42 | struct list_head blkd_tasks; | ||
| 43 | /* Tasks blocked in RCU read-side critical */ | ||
| 44 | /* section. Tasks are placed at the head */ | ||
| 45 | /* of this list and age towards the tail. */ | ||
| 46 | struct list_head *gp_tasks; | ||
| 47 | /* Pointer to the first task blocking the */ | ||
| 48 | /* current grace period, or NULL if there */ | ||
| 49 | /* is no such task. */ | ||
| 50 | struct list_head *exp_tasks; | ||
| 51 | /* Pointer to first task blocking the */ | ||
| 52 | /* current expedited grace period, or NULL */ | ||
| 53 | /* if there is no such task. If there */ | ||
| 54 | /* is no current expedited grace period, */ | ||
| 55 | /* then there cannot be any such task. */ | ||
| 56 | u8 gpnum; /* Current grace period. */ | ||
| 57 | u8 gpcpu; /* Last grace period blocked by the CPU. */ | ||
| 58 | u8 completed; /* Last grace period completed. */ | ||
| 59 | /* If all three are equal, RCU is idle. */ | ||
| 60 | }; | ||
| 61 | |||
| 62 | static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { | ||
| 63 | .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist, | ||
| 64 | .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, | ||
| 65 | .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, | ||
| 66 | .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), | ||
| 67 | }; | ||
| 68 | |||
| 69 | static int rcu_preempted_readers_exp(void); | ||
| 70 | static void rcu_report_exp_done(void); | ||
| 71 | |||
| 72 | /* | ||
| 73 | * Return true if the CPU has not yet responded to the current grace period. | ||
| 74 | */ | ||
| 75 | static int rcu_cpu_blocking_cur_gp(void) | ||
| 76 | { | ||
| 77 | return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; | ||
| 78 | } | ||
| 79 | |||
| 80 | /* | ||
| 81 | * Check for a running RCU reader. Because there is only one CPU, | ||
| 82 | * there can be but one running RCU reader at a time. ;-) | ||
| 83 | */ | ||
| 84 | static int rcu_preempt_running_reader(void) | ||
| 85 | { | ||
| 86 | return current->rcu_read_lock_nesting; | ||
| 87 | } | ||
| 88 | |||
| 89 | /* | ||
| 90 | * Check for preempted RCU readers blocking any grace period. | ||
| 91 | * If the caller needs a reliable answer, it must disable hard irqs. | ||
| 92 | */ | ||
| 93 | static int rcu_preempt_blocked_readers_any(void) | ||
| 94 | { | ||
| 95 | return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks); | ||
| 96 | } | ||
| 97 | |||
| 98 | /* | ||
| 99 | * Check for preempted RCU readers blocking the current grace period. | ||
| 100 | * If the caller needs a reliable answer, it must disable hard irqs. | ||
| 101 | */ | ||
| 102 | static int rcu_preempt_blocked_readers_cgp(void) | ||
| 103 | { | ||
| 104 | return rcu_preempt_ctrlblk.gp_tasks != NULL; | ||
| 105 | } | ||
| 106 | |||
| 107 | /* | ||
| 108 | * Return true if another preemptible-RCU grace period is needed. | ||
| 109 | */ | ||
| 110 | static int rcu_preempt_needs_another_gp(void) | ||
| 111 | { | ||
| 112 | return *rcu_preempt_ctrlblk.rcb.curtail != NULL; | ||
| 113 | } | ||
| 114 | |||
| 115 | /* | ||
| 116 | * Return true if a preemptible-RCU grace period is in progress. | ||
| 117 | * The caller must disable hardirqs. | ||
| 118 | */ | ||
| 119 | static int rcu_preempt_gp_in_progress(void) | ||
| 120 | { | ||
| 121 | return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum; | ||
| 122 | } | ||
| 123 | |||
| 124 | /* | ||
| 125 | * Record a preemptible-RCU quiescent state for the specified CPU. Note | ||
| 126 | * that this just means that the task currently running on the CPU is | ||
| 127 | * in a quiescent state. There might be any number of tasks blocked | ||
| 128 | * while in an RCU read-side critical section. | ||
| 129 | * | ||
| 130 | * Unlike the other rcu_*_qs() functions, callers to this function | ||
| 131 | * must disable irqs in order to protect the assignment to | ||
| 132 | * ->rcu_read_unlock_special. | ||
| 133 | * | ||
| 134 | * Because this is a single-CPU implementation, the only way a grace | ||
| 135 | * period can end is if the CPU is in a quiescent state. The reason is | ||
| 136 | * that a blocked preemptible-RCU reader can exit its critical section | ||
| 137 | * only if the CPU is running it at the time. Therefore, when the | ||
| 138 | * last task blocking the current grace period exits its RCU read-side | ||
| 139 | * critical section, neither the CPU nor blocked tasks will be stopping | ||
| 140 | * the current grace period. (In contrast, SMP implementations | ||
| 141 | * might have CPUs running in RCU read-side critical sections that | ||
| 142 | * block later grace periods -- but this is not possible given only | ||
| 143 | * one CPU.) | ||
| 144 | */ | ||
| 145 | static void rcu_preempt_cpu_qs(void) | ||
| 146 | { | ||
| 147 | /* Record both CPU and task as having responded to current GP. */ | ||
| 148 | rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; | ||
| 149 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | ||
| 150 | |||
| 151 | /* | ||
| 152 | * If there is no GP, or if blocked readers are still blocking GP, | ||
| 153 | * then there is nothing more to do. | ||
| 154 | */ | ||
| 155 | if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) | ||
| 156 | return; | ||
| 157 | |||
| 158 | /* Advance callbacks. */ | ||
| 159 | rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; | ||
| 160 | rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail; | ||
| 161 | rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail; | ||
| 162 | |||
| 163 | /* If there are no blocked readers, next GP is done instantly. */ | ||
| 164 | if (!rcu_preempt_blocked_readers_any()) | ||
| 165 | rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; | ||
| 166 | |||
| 167 | /* If there are done callbacks, make RCU_SOFTIRQ process them. */ | ||
| 168 | if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) | ||
| 169 | raise_softirq(RCU_SOFTIRQ); | ||
| 170 | } | ||
| 171 | |||
| 172 | /* | ||
| 173 | * Start a new RCU grace period if warranted. Hard irqs must be disabled. | ||
| 174 | */ | ||
| 175 | static void rcu_preempt_start_gp(void) | ||
| 176 | { | ||
| 177 | if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) { | ||
| 178 | |||
| 179 | /* Official start of GP. */ | ||
| 180 | rcu_preempt_ctrlblk.gpnum++; | ||
| 181 | |||
| 182 | /* Any blocked RCU readers block new GP. */ | ||
| 183 | if (rcu_preempt_blocked_readers_any()) | ||
| 184 | rcu_preempt_ctrlblk.gp_tasks = | ||
| 185 | rcu_preempt_ctrlblk.blkd_tasks.next; | ||
| 186 | |||
| 187 | /* If there is no running reader, CPU is done with GP. */ | ||
| 188 | if (!rcu_preempt_running_reader()) | ||
| 189 | rcu_preempt_cpu_qs(); | ||
| 190 | } | ||
| 191 | } | ||
| 192 | |||
| 193 | /* | ||
| 194 | * We have entered the scheduler, and the current task might soon be | ||
| 195 | * context-switched away from. If this task is in an RCU read-side | ||
| 196 | * critical section, we will no longer be able to rely on the CPU to | ||
| 197 | * record that fact, so we enqueue the task on the blkd_tasks list. | ||
| 198 | * If the task started after the current grace period began, as recorded | ||
| 199 | * by ->gpcpu, we enqueue at the beginning of the list. Otherwise | ||
| 200 | * before the element referenced by ->gp_tasks (or at the tail if | ||
| 201 | * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element. | ||
| 202 | * The task will dequeue itself when it exits the outermost enclosing | ||
| 203 | * RCU read-side critical section. Therefore, the current grace period | ||
| 204 | * cannot be permitted to complete until the ->gp_tasks pointer becomes | ||
| 205 | * NULL. | ||
| 206 | * | ||
| 207 | * Caller must disable preemption. | ||
| 208 | */ | ||
| 209 | void rcu_preempt_note_context_switch(void) | ||
| 210 | { | ||
| 211 | struct task_struct *t = current; | ||
| 212 | unsigned long flags; | ||
| 213 | |||
| 214 | local_irq_save(flags); /* must exclude scheduler_tick(). */ | ||
| 215 | if (rcu_preempt_running_reader() && | ||
| 216 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | ||
| 217 | |||
| 218 | /* Possibly blocking in an RCU read-side critical section. */ | ||
| 219 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | ||
| 220 | |||
| 221 | /* | ||
| 222 | * If this CPU has already checked in, then this task | ||
| 223 | * will hold up the next grace period rather than the | ||
| 224 | * current grace period. Queue the task accordingly. | ||
| 225 | * If the task is queued for the current grace period | ||
| 226 | * (i.e., this CPU has not yet passed through a quiescent | ||
| 227 | * state for the current grace period), then as long | ||
| 228 | * as that task remains queued, the current grace period | ||
| 229 | * cannot end. | ||
| 230 | */ | ||
| 231 | list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); | ||
| 232 | if (rcu_cpu_blocking_cur_gp()) | ||
| 233 | rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; | ||
| 234 | } | ||
| 235 | |||
| 236 | /* | ||
| 237 | * Either we were not in an RCU read-side critical section to | ||
| 238 | * begin with, or we have now recorded that critical section | ||
| 239 | * globally. Either way, we can now note a quiescent state | ||
| 240 | * for this CPU. Again, if we were in an RCU read-side critical | ||
| 241 | * section, and if that critical section was blocking the current | ||
| 242 | * grace period, then the fact that the task has been enqueued | ||
| 243 | * means that current grace period continues to be blocked. | ||
| 244 | */ | ||
| 245 | rcu_preempt_cpu_qs(); | ||
| 246 | local_irq_restore(flags); | ||
| 247 | } | ||
| 248 | |||
| 249 | /* | ||
| 250 | * Tiny-preemptible RCU implementation for rcu_read_lock(). | ||
| 251 | * Just increment ->rcu_read_lock_nesting, shared state will be updated | ||
| 252 | * if we block. | ||
| 253 | */ | ||
| 254 | void __rcu_read_lock(void) | ||
| 255 | { | ||
| 256 | current->rcu_read_lock_nesting++; | ||
| 257 | barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ | ||
| 258 | } | ||
| 259 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | ||
| 260 | |||
| 261 | /* | ||
| 262 | * Handle special cases during rcu_read_unlock(), such as needing to | ||
| 263 | * notify RCU core processing or task having blocked during the RCU | ||
| 264 | * read-side critical section. | ||
| 265 | */ | ||
| 266 | static void rcu_read_unlock_special(struct task_struct *t) | ||
| 267 | { | ||
| 268 | int empty; | ||
| 269 | int empty_exp; | ||
| 270 | unsigned long flags; | ||
| 271 | struct list_head *np; | ||
| 272 | int special; | ||
| 273 | |||
| 274 | /* | ||
| 275 | * NMI handlers cannot block and cannot safely manipulate state. | ||
| 276 | * They therefore cannot possibly be special, so just leave. | ||
| 277 | */ | ||
| 278 | if (in_nmi()) | ||
| 279 | return; | ||
| 280 | |||
| 281 | local_irq_save(flags); | ||
| 282 | |||
| 283 | /* | ||
| 284 | * If RCU core is waiting for this CPU to exit critical section, | ||
| 285 | * let it know that we have done so. | ||
| 286 | */ | ||
| 287 | special = t->rcu_read_unlock_special; | ||
| 288 | if (special & RCU_READ_UNLOCK_NEED_QS) | ||
| 289 | rcu_preempt_cpu_qs(); | ||
| 290 | |||
| 291 | /* Hardware IRQ handlers cannot block. */ | ||
| 292 | if (in_irq()) { | ||
| 293 | local_irq_restore(flags); | ||
| 294 | return; | ||
| 295 | } | ||
| 296 | |||
| 297 | /* Clean up if blocked during RCU read-side critical section. */ | ||
| 298 | if (special & RCU_READ_UNLOCK_BLOCKED) { | ||
| 299 | t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; | ||
| 300 | |||
| 301 | /* | ||
| 302 | * Remove this task from the ->blkd_tasks list and adjust | ||
| 303 | * any pointers that might have been referencing it. | ||
| 304 | */ | ||
| 305 | empty = !rcu_preempt_blocked_readers_cgp(); | ||
| 306 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; | ||
| 307 | np = t->rcu_node_entry.next; | ||
| 308 | if (np == &rcu_preempt_ctrlblk.blkd_tasks) | ||
| 309 | np = NULL; | ||
| 310 | list_del(&t->rcu_node_entry); | ||
| 311 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) | ||
| 312 | rcu_preempt_ctrlblk.gp_tasks = np; | ||
| 313 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) | ||
| 314 | rcu_preempt_ctrlblk.exp_tasks = np; | ||
| 315 | INIT_LIST_HEAD(&t->rcu_node_entry); | ||
| 316 | |||
| 317 | /* | ||
| 318 | * If this was the last task on the current list, and if | ||
| 319 | * we aren't waiting on the CPU, report the quiescent state | ||
| 320 | * and start a new grace period if needed. | ||
| 321 | */ | ||
| 322 | if (!empty && !rcu_preempt_blocked_readers_cgp()) { | ||
| 323 | rcu_preempt_cpu_qs(); | ||
| 324 | rcu_preempt_start_gp(); | ||
| 325 | } | ||
| 326 | |||
| 327 | /* | ||
| 328 | * If this was the last task on the expedited lists, | ||
| 329 | * then we need to wake up the waiting task. | ||
| 330 | */ | ||
| 331 | if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) | ||
| 332 | rcu_report_exp_done(); | ||
| 333 | } | ||
| 334 | local_irq_restore(flags); | ||
| 335 | } | ||
| 336 | |||
| 337 | /* | ||
| 338 | * Tiny-preemptible RCU implementation for rcu_read_unlock(). | ||
| 339 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost | ||
| 340 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then | ||
| 341 | * invoke rcu_read_unlock_special() to clean up after a context switch | ||
| 342 | * in an RCU read-side critical section and other special cases. | ||
| 343 | */ | ||
| 344 | void __rcu_read_unlock(void) | ||
| 345 | { | ||
| 346 | struct task_struct *t = current; | ||
| 347 | |||
| 348 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ | ||
| 349 | --t->rcu_read_lock_nesting; | ||
| 350 | barrier(); /* decrement before load of ->rcu_read_unlock_special */ | ||
| 351 | if (t->rcu_read_lock_nesting == 0 && | ||
| 352 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | ||
| 353 | rcu_read_unlock_special(t); | ||
| 354 | #ifdef CONFIG_PROVE_LOCKING | ||
| 355 | WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); | ||
| 356 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | ||
| 357 | } | ||
| 358 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | ||
| 359 | |||
| 360 | /* | ||
| 361 | * Check for a quiescent state from the current CPU. When a task blocks, | ||
| 362 | * the task is recorded in the rcu_preempt_ctrlblk structure, which is | ||
| 363 | * checked elsewhere. This is called from the scheduling-clock interrupt. | ||
| 364 | * | ||
| 365 | * Caller must disable hard irqs. | ||
| 366 | */ | ||
| 367 | static void rcu_preempt_check_callbacks(void) | ||
| 368 | { | ||
| 369 | struct task_struct *t = current; | ||
| 370 | |||
| 371 | if (rcu_preempt_gp_in_progress() && | ||
| 372 | (!rcu_preempt_running_reader() || | ||
| 373 | !rcu_cpu_blocking_cur_gp())) | ||
| 374 | rcu_preempt_cpu_qs(); | ||
| 375 | if (&rcu_preempt_ctrlblk.rcb.rcucblist != | ||
| 376 | rcu_preempt_ctrlblk.rcb.donetail) | ||
| 377 | raise_softirq(RCU_SOFTIRQ); | ||
| 378 | if (rcu_preempt_gp_in_progress() && | ||
| 379 | rcu_cpu_blocking_cur_gp() && | ||
| 380 | rcu_preempt_running_reader()) | ||
| 381 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; | ||
| 382 | } | ||
| 383 | |||
| 384 | /* | ||
| 385 | * TINY_PREEMPT_RCU has an extra callback-list tail pointer to | ||
| 386 | * update, so this is invoked from __rcu_process_callbacks() to | ||
| 387 | * handle that case. Of course, it is invoked for all flavors of | ||
| 388 | * RCU, but RCU callbacks can appear only on one of the lists, and | ||
| 389 | * neither ->nexttail nor ->donetail can possibly be NULL, so there | ||
| 390 | * is no need for an explicit check. | ||
| 391 | */ | ||
| 392 | static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) | ||
| 393 | { | ||
| 394 | if (rcu_preempt_ctrlblk.nexttail == rcp->donetail) | ||
| 395 | rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist; | ||
| 396 | } | ||
| 397 | |||
| 398 | /* | ||
| 399 | * Process callbacks for preemptible RCU. | ||
| 400 | */ | ||
| 401 | static void rcu_preempt_process_callbacks(void) | ||
| 402 | { | ||
| 403 | __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); | ||
| 404 | } | ||
| 405 | |||
| 406 | /* | ||
| 407 | * Queue a preemptible-RCU callback for invocation after a grace period. | ||
| 408 | */ | ||
| 409 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
| 410 | { | ||
| 411 | unsigned long flags; | ||
| 412 | |||
| 413 | debug_rcu_head_queue(head); | ||
| 414 | head->func = func; | ||
| 415 | head->next = NULL; | ||
| 416 | |||
| 417 | local_irq_save(flags); | ||
| 418 | *rcu_preempt_ctrlblk.nexttail = head; | ||
| 419 | rcu_preempt_ctrlblk.nexttail = &head->next; | ||
| 420 | rcu_preempt_start_gp(); /* checks to see if GP needed. */ | ||
| 421 | local_irq_restore(flags); | ||
| 422 | } | ||
| 423 | EXPORT_SYMBOL_GPL(call_rcu); | ||
| 424 | |||
| 425 | void rcu_barrier(void) | ||
| 426 | { | ||
| 427 | struct rcu_synchronize rcu; | ||
| 428 | |||
| 429 | init_rcu_head_on_stack(&rcu.head); | ||
| 430 | init_completion(&rcu.completion); | ||
| 431 | /* Will wake me after RCU finished. */ | ||
| 432 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
| 433 | /* Wait for it. */ | ||
| 434 | wait_for_completion(&rcu.completion); | ||
| 435 | destroy_rcu_head_on_stack(&rcu.head); | ||
| 436 | } | ||
| 437 | EXPORT_SYMBOL_GPL(rcu_barrier); | ||
| 438 | |||
| 439 | /* | ||
| 440 | * synchronize_rcu - wait until a grace period has elapsed. | ||
| 441 | * | ||
| 442 | * Control will return to the caller some time after a full grace | ||
| 443 | * period has elapsed, in other words after all currently executing RCU | ||
| 444 | * read-side critical sections have completed. RCU read-side critical | ||
| 445 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
| 446 | * and may be nested. | ||
| 447 | */ | ||
| 448 | void synchronize_rcu(void) | ||
| 449 | { | ||
| 450 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 451 | if (!rcu_scheduler_active) | ||
| 452 | return; | ||
| 453 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
| 454 | |||
| 455 | WARN_ON_ONCE(rcu_preempt_running_reader()); | ||
| 456 | if (!rcu_preempt_blocked_readers_any()) | ||
| 457 | return; | ||
| 458 | |||
| 459 | /* Once we get past the fastpath checks, same code as rcu_barrier(). */ | ||
| 460 | rcu_barrier(); | ||
| 461 | } | ||
| 462 | EXPORT_SYMBOL_GPL(synchronize_rcu); | ||
| 463 | |||
| 464 | static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); | ||
| 465 | static unsigned long sync_rcu_preempt_exp_count; | ||
| 466 | static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); | ||
| 467 | |||
| 468 | /* | ||
| 469 | * Return non-zero if there are any tasks in RCU read-side critical | ||
| 470 | * sections blocking the current preemptible-RCU expedited grace period. | ||
| 471 | * If there is no preemptible-RCU expedited grace period currently in | ||
| 472 | * progress, returns zero unconditionally. | ||
| 473 | */ | ||
| 474 | static int rcu_preempted_readers_exp(void) | ||
| 475 | { | ||
| 476 | return rcu_preempt_ctrlblk.exp_tasks != NULL; | ||
| 477 | } | ||
| 478 | |||
| 479 | /* | ||
| 480 | * Report the exit from RCU read-side critical section for the last task | ||
| 481 | * that queued itself during or before the current expedited preemptible-RCU | ||
| 482 | * grace period. | ||
| 483 | */ | ||
| 484 | static void rcu_report_exp_done(void) | ||
| 485 | { | ||
| 486 | wake_up(&sync_rcu_preempt_exp_wq); | ||
| 487 | } | ||
| 488 | |||
| 489 | /* | ||
| 490 | * Wait for an rcu-preempt grace period, but expedite it. The basic idea | ||
| 491 | * is to rely on the fact that there is but one CPU, and that it is | ||
| 492 | * illegal for a task to invoke synchronize_rcu_expedited() while in a | ||
| 493 | * preemptible-RCU read-side critical section. Therefore, any such | ||
| 494 | * critical sections must correspond to blocked tasks, which must therefore | ||
| 495 | * be on the ->blkd_tasks list. So just record the current head of the | ||
| 496 | * list in the ->exp_tasks pointer, and wait for all tasks including and | ||
| 497 | * after the task pointed to by ->exp_tasks to drain. | ||
| 498 | */ | ||
| 499 | void synchronize_rcu_expedited(void) | ||
| 500 | { | ||
| 501 | unsigned long flags; | ||
| 502 | struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk; | ||
| 503 | unsigned long snap; | ||
| 504 | |||
| 505 | barrier(); /* ensure prior action seen before grace period. */ | ||
| 506 | |||
| 507 | WARN_ON_ONCE(rcu_preempt_running_reader()); | ||
| 508 | |||
| 509 | /* | ||
| 510 | * Acquire lock so that there is only one preemptible RCU grace | ||
| 511 | * period in flight. Of course, if someone does the expedited | ||
| 512 | * grace period for us while we are acquiring the lock, just leave. | ||
| 513 | */ | ||
| 514 | snap = sync_rcu_preempt_exp_count + 1; | ||
| 515 | mutex_lock(&sync_rcu_preempt_exp_mutex); | ||
| 516 | if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count)) | ||
| 517 | goto unlock_mb_ret; /* Others did our work for us. */ | ||
| 518 | |||
| 519 | local_irq_save(flags); | ||
| 520 | |||
| 521 | /* | ||
| 522 | * All RCU readers have to already be on blkd_tasks because | ||
| 523 | * we cannot legally be executing in an RCU read-side critical | ||
| 524 | * section. | ||
| 525 | */ | ||
| 526 | |||
| 527 | /* Snapshot current head of ->blkd_tasks list. */ | ||
| 528 | rpcp->exp_tasks = rpcp->blkd_tasks.next; | ||
| 529 | if (rpcp->exp_tasks == &rpcp->blkd_tasks) | ||
| 530 | rpcp->exp_tasks = NULL; | ||
| 531 | local_irq_restore(flags); | ||
| 532 | |||
| 533 | /* Wait for tail of ->blkd_tasks list to drain. */ | ||
| 534 | if (rcu_preempted_readers_exp()) | ||
| 535 | wait_event(sync_rcu_preempt_exp_wq, | ||
| 536 | !rcu_preempted_readers_exp()); | ||
| 537 | |||
| 538 | /* Clean up and exit. */ | ||
| 539 | barrier(); /* ensure expedited GP seen before counter increment. */ | ||
| 540 | sync_rcu_preempt_exp_count++; | ||
| 541 | unlock_mb_ret: | ||
| 542 | mutex_unlock(&sync_rcu_preempt_exp_mutex); | ||
| 543 | barrier(); /* ensure subsequent action seen after grace period. */ | ||
| 544 | } | ||
| 545 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | ||
| 546 | |||
| 547 | /* | ||
| 548 | * Does preemptible RCU need the CPU to stay out of dynticks mode? | ||
| 549 | */ | ||
| 550 | int rcu_preempt_needs_cpu(void) | ||
| 551 | { | ||
| 552 | if (!rcu_preempt_running_reader()) | ||
| 553 | rcu_preempt_cpu_qs(); | ||
| 554 | return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; | ||
| 555 | } | ||
| 556 | |||
| 557 | /* | ||
| 558 | * Check for a task exiting while in a preemptible-RCU read-side | ||
| 559 | * critical section, clean up if so. No need to issue warnings, | ||
| 560 | * as debug_check_no_locks_held() already does this if lockdep | ||
| 561 | * is enabled. | ||
| 562 | */ | ||
| 563 | void exit_rcu(void) | ||
| 564 | { | ||
| 565 | struct task_struct *t = current; | ||
| 566 | |||
| 567 | if (t->rcu_read_lock_nesting == 0) | ||
| 568 | return; | ||
| 569 | t->rcu_read_lock_nesting = 1; | ||
| 570 | rcu_read_unlock(); | ||
| 571 | } | ||
| 572 | |||
| 573 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ | ||
| 574 | |||
| 575 | /* | ||
| 576 | * Because preemptible RCU does not exist, it never has any callbacks | ||
| 577 | * to check. | ||
| 578 | */ | ||
| 579 | static void rcu_preempt_check_callbacks(void) | ||
| 580 | { | ||
| 581 | } | ||
| 582 | |||
| 583 | /* | ||
| 584 | * Because preemptible RCU does not exist, it never has any callbacks | ||
| 585 | * to remove. | ||
| 586 | */ | ||
| 587 | static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) | ||
| 588 | { | ||
| 589 | } | ||
| 590 | |||
| 591 | /* | ||
| 592 | * Because preemptible RCU does not exist, it never has any callbacks | ||
| 593 | * to process. | ||
| 594 | */ | ||
| 595 | static void rcu_preempt_process_callbacks(void) | ||
| 596 | { | ||
| 597 | } | ||
| 598 | |||
| 599 | #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ | ||
| 600 | |||
| 25 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 601 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
| 26 | 602 | ||
| 27 | #include <linux/kernel_stat.h> | 603 | #include <linux/kernel_stat.h> |
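In synchronize_rcu_expedited() above, the caller snapshots sync_rcu_preempt_exp_count + 1 before taking the mutex and then uses ULONG_CMP_LT() to decide whether a full expedited grace period already elapsed while it waited, in which case it returns without doing any work. The comparison is wraparound-safe; a worked illustration with made-up values (the macro is the same one whose local copy the rcutree.h hunk below removes):

#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))	/* wrap-safe "a is before b" */

/*
 * Suppose snap == ULONG_MAX - 1 (sampled counter plus one, just below the
 * wrap point) and, by the time the mutex is held, the counter has wrapped
 * around to sync_rcu_preempt_exp_count == 1.  Then snap - count is
 * ULONG_MAX - 2, which exceeds ULONG_MAX / 2, so ULONG_CMP_LT(snap, count)
 * is true and the caller skips the redundant grace period.  A plain
 * "snap < count" test would be false here and force needless work.
 */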
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 2e2726d790b9..9d8e8fb2515f 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
| @@ -120,7 +120,7 @@ struct rcu_torture { | |||
| 120 | }; | 120 | }; |
| 121 | 121 | ||
| 122 | static LIST_HEAD(rcu_torture_freelist); | 122 | static LIST_HEAD(rcu_torture_freelist); |
| 123 | static struct rcu_torture *rcu_torture_current; | 123 | static struct rcu_torture __rcu *rcu_torture_current; |
| 124 | static long rcu_torture_current_version; | 124 | static long rcu_torture_current_version; |
| 125 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; | 125 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; |
| 126 | static DEFINE_SPINLOCK(rcu_torture_lock); | 126 | static DEFINE_SPINLOCK(rcu_torture_lock); |
| @@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | |||
| 153 | #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ | 153 | #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ |
| 154 | #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ | 154 | #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ |
| 155 | static int fullstop = FULLSTOP_RMMOD; | 155 | static int fullstop = FULLSTOP_RMMOD; |
| 156 | DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ | 156 | /* |
| 157 | /* of kthreads. */ | 157 | * Protect fullstop transitions and spawning of kthreads. |
| 158 | */ | ||
| 159 | static DEFINE_MUTEX(fullstop_mutex); | ||
| 158 | 160 | ||
| 159 | /* | 161 | /* |
| 160 | * Detect and respond to a system shutdown. | 162 | * Detect and respond to a system shutdown. |
| @@ -303,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp) | |||
| 303 | mdelay(longdelay_ms); | 305 | mdelay(longdelay_ms); |
| 304 | if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) | 306 | if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) |
| 305 | udelay(shortdelay_us); | 307 | udelay(shortdelay_us); |
| 308 | #ifdef CONFIG_PREEMPT | ||
| 309 | if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) | ||
| 310 | preempt_schedule(); /* No QS if preempt_disable() in effect */ | ||
| 311 | #endif | ||
| 306 | } | 312 | } |
| 307 | 313 | ||
| 308 | static void rcu_torture_read_unlock(int idx) __releases(RCU) | 314 | static void rcu_torture_read_unlock(int idx) __releases(RCU) |
| @@ -536,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp) | |||
| 536 | delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); | 542 | delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); |
| 537 | if (!delay) | 543 | if (!delay) |
| 538 | schedule_timeout_interruptible(longdelay); | 544 | schedule_timeout_interruptible(longdelay); |
| 545 | else | ||
| 546 | rcu_read_delay(rrsp); | ||
| 539 | } | 547 | } |
| 540 | 548 | ||
| 541 | static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) | 549 | static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) |
| @@ -731,7 +739,8 @@ rcu_torture_writer(void *arg) | |||
| 731 | continue; | 739 | continue; |
| 732 | rp->rtort_pipe_count = 0; | 740 | rp->rtort_pipe_count = 0; |
| 733 | udelay(rcu_random(&rand) & 0x3ff); | 741 | udelay(rcu_random(&rand) & 0x3ff); |
| 734 | old_rp = rcu_torture_current; | 742 | old_rp = rcu_dereference_check(rcu_torture_current, |
| 743 | current == writer_task); | ||
| 735 | rp->rtort_mbtest = 1; | 744 | rp->rtort_mbtest = 1; |
| 736 | rcu_assign_pointer(rcu_torture_current, rp); | 745 | rcu_assign_pointer(rcu_torture_current, rp); |
| 737 | smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ | 746 | smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ |
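The rcutorture.c change marks rcu_torture_current as __rcu and reads it through rcu_dereference_check() with an explicit justification (current == writer_task), so sparse and lockdep can validate each access. A generic sketch of that annotation pattern, using hypothetical names that are not part of this patch:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct cfg {				/* hypothetical RCU-protected object */
	int val;
};

static struct cfg __rcu *cur_cfg;	/* sparse flags unannotated accesses */
static DEFINE_SPINLOCK(cfg_lock);	/* serializes updaters */

static int cfg_read(void)
{
	struct cfg *c;
	int val = -1;

	rcu_read_lock();
	c = rcu_dereference(cur_cfg);	/* reader: legal only under rcu_read_lock() */
	if (c)
		val = c->val;
	rcu_read_unlock();
	return val;
}

static void cfg_replace(struct cfg *newc)
{
	struct cfg *oldc;

	spin_lock(&cfg_lock);
	/* Updater: the condition records why this access is safe without rcu_read_lock(). */
	oldc = rcu_dereference_check(cur_cfg, lockdep_is_held(&cfg_lock));
	rcu_assign_pointer(cur_cfg, newc);
	spin_unlock(&cfg_lock);

	if (oldc) {
		synchronize_rcu();	/* wait for pre-existing readers to finish */
		kfree(oldc);
	}
}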
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d5bc43976c5a..ccdc04c47981 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
| @@ -143,6 +143,11 @@ module_param(blimit, int, 0); | |||
| 143 | module_param(qhimark, int, 0); | 143 | module_param(qhimark, int, 0); |
| 144 | module_param(qlowmark, int, 0); | 144 | module_param(qlowmark, int, 0); |
| 145 | 145 | ||
| 146 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
| 147 | int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT; | ||
| 148 | module_param(rcu_cpu_stall_suppress, int, 0644); | ||
| 149 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
| 150 | |||
| 146 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); | 151 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); |
| 147 | static int rcu_pending(int cpu); | 152 | static int rcu_pending(int cpu); |
| 148 | 153 | ||
| @@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
| 450 | 455 | ||
| 451 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 456 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
| 452 | 457 | ||
| 453 | int rcu_cpu_stall_panicking __read_mostly; | 458 | int rcu_cpu_stall_suppress __read_mostly; |
| 454 | 459 | ||
| 455 | static void record_gp_stall_check_time(struct rcu_state *rsp) | 460 | static void record_gp_stall_check_time(struct rcu_state *rsp) |
| 456 | { | 461 | { |
| @@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
| 482 | rcu_print_task_stall(rnp); | 487 | rcu_print_task_stall(rnp); |
| 483 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 488 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 484 | 489 | ||
| 485 | /* OK, time to rat on our buddy... */ | 490 | /* |
| 486 | 491 | * OK, time to rat on our buddy... | |
| 492 | * See Documentation/RCU/stallwarn.txt for info on how to debug | ||
| 493 | * RCU CPU stall warnings. | ||
| 494 | */ | ||
| 487 | printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", | 495 | printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", |
| 488 | rsp->name); | 496 | rsp->name); |
| 489 | rcu_for_each_leaf_node(rsp, rnp) { | 497 | rcu_for_each_leaf_node(rsp, rnp) { |
| @@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
| 512 | unsigned long flags; | 520 | unsigned long flags; |
| 513 | struct rcu_node *rnp = rcu_get_root(rsp); | 521 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 514 | 522 | ||
| 523 | /* | ||
| 524 | * OK, time to rat on ourselves... | ||
| 525 | * See Documentation/RCU/stallwarn.txt for info on how to debug | ||
| 526 | * RCU CPU stall warnings. | ||
| 527 | */ | ||
| 515 | printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", | 528 | printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", |
| 516 | rsp->name, smp_processor_id(), jiffies - rsp->gp_start); | 529 | rsp->name, smp_processor_id(), jiffies - rsp->gp_start); |
| 517 | trigger_all_cpu_backtrace(); | 530 | trigger_all_cpu_backtrace(); |
| @@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 530 | long delta; | 543 | long delta; |
| 531 | struct rcu_node *rnp; | 544 | struct rcu_node *rnp; |
| 532 | 545 | ||
| 533 | if (rcu_cpu_stall_panicking) | 546 | if (rcu_cpu_stall_suppress) |
| 534 | return; | 547 | return; |
| 535 | delta = jiffies - rsp->jiffies_stall; | 548 | delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); |
| 536 | rnp = rdp->mynode; | 549 | rnp = rdp->mynode; |
| 537 | if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { | 550 | if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { |
| 538 | 551 | ||
| 539 | /* We haven't checked in, so go dump stack. */ | 552 | /* We haven't checked in, so go dump stack. */ |
| 540 | print_cpu_stall(rsp); | 553 | print_cpu_stall(rsp); |
| @@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 548 | 561 | ||
| 549 | static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) | 562 | static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) |
| 550 | { | 563 | { |
| 551 | rcu_cpu_stall_panicking = 1; | 564 | rcu_cpu_stall_suppress = 1; |
| 552 | return NOTIFY_DONE; | 565 | return NOTIFY_DONE; |
| 553 | } | 566 | } |
| 554 | 567 | ||
| 568 | /** | ||
| 569 | * rcu_cpu_stall_reset - prevent further stall warnings in current grace period | ||
| 570 | * | ||
| 571 | * Set the stall-warning timeout way off into the future, thus preventing | ||
| 572 | * any RCU CPU stall-warning messages from appearing in the current set of | ||
| 573 | * RCU grace periods. | ||
| 574 | * | ||
| 575 | * The caller must disable hard irqs. | ||
| 576 | */ | ||
| 577 | void rcu_cpu_stall_reset(void) | ||
| 578 | { | ||
| 579 | rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; | ||
| 580 | rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; | ||
| 581 | rcu_preempt_stall_reset(); | ||
| 582 | } | ||
| 583 | |||
| 555 | static struct notifier_block rcu_panic_block = { | 584 | static struct notifier_block rcu_panic_block = { |
| 556 | .notifier_call = rcu_panic, | 585 | .notifier_call = rcu_panic, |
| 557 | }; | 586 | }; |
| @@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 571 | { | 600 | { |
| 572 | } | 601 | } |
| 573 | 602 | ||
| 603 | void rcu_cpu_stall_reset(void) | ||
| 604 | { | ||
| 605 | } | ||
| 606 | |||
| 574 | static void __init check_cpu_stall_init(void) | 607 | static void __init check_cpu_stall_init(void) |
| 575 | { | 608 | { |
| 576 | } | 609 | } |
| @@ -712,7 +745,7 @@ static void | |||
| 712 | rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | 745 | rcu_start_gp(struct rcu_state *rsp, unsigned long flags) |
| 713 | __releases(rcu_get_root(rsp)->lock) | 746 | __releases(rcu_get_root(rsp)->lock) |
| 714 | { | 747 | { |
| 715 | struct rcu_data *rdp = rsp->rda[smp_processor_id()]; | 748 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
| 716 | struct rcu_node *rnp = rcu_get_root(rsp); | 749 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 717 | 750 | ||
| 718 | if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { | 751 | if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { |
| @@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 960 | static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | 993 | static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) |
| 961 | { | 994 | { |
| 962 | int i; | 995 | int i; |
| 963 | struct rcu_data *rdp = rsp->rda[smp_processor_id()]; | 996 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
| 964 | 997 | ||
| 965 | if (rdp->nxtlist == NULL) | 998 | if (rdp->nxtlist == NULL) |
| 966 | return; /* irqs disabled, so comparison is stable. */ | 999 | return; /* irqs disabled, so comparison is stable. */ |
| @@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | |||
| 971 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1004 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
| 972 | rdp->nxttail[i] = &rdp->nxtlist; | 1005 | rdp->nxttail[i] = &rdp->nxtlist; |
| 973 | rsp->orphan_qlen += rdp->qlen; | 1006 | rsp->orphan_qlen += rdp->qlen; |
| 1007 | rdp->n_cbs_orphaned += rdp->qlen; | ||
| 974 | rdp->qlen = 0; | 1008 | rdp->qlen = 0; |
| 975 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | 1009 | raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ |
| 976 | } | 1010 | } |
| @@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |||
| 984 | struct rcu_data *rdp; | 1018 | struct rcu_data *rdp; |
| 985 | 1019 | ||
| 986 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1020 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
| 987 | rdp = rsp->rda[smp_processor_id()]; | 1021 | rdp = this_cpu_ptr(rsp->rda); |
| 988 | if (rsp->orphan_cbs_list == NULL) { | 1022 | if (rsp->orphan_cbs_list == NULL) { |
| 989 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 1023 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
| 990 | return; | 1024 | return; |
| @@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |||
| 992 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; | 1026 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; |
| 993 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; | 1027 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; |
| 994 | rdp->qlen += rsp->orphan_qlen; | 1028 | rdp->qlen += rsp->orphan_qlen; |
| 1029 | rdp->n_cbs_adopted += rsp->orphan_qlen; | ||
| 995 | rsp->orphan_cbs_list = NULL; | 1030 | rsp->orphan_cbs_list = NULL; |
| 996 | rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; | 1031 | rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; |
| 997 | rsp->orphan_qlen = 0; | 1032 | rsp->orphan_qlen = 0; |
| @@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
| 1007 | unsigned long flags; | 1042 | unsigned long flags; |
| 1008 | unsigned long mask; | 1043 | unsigned long mask; |
| 1009 | int need_report = 0; | 1044 | int need_report = 0; |
| 1010 | struct rcu_data *rdp = rsp->rda[cpu]; | 1045 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
| 1011 | struct rcu_node *rnp; | 1046 | struct rcu_node *rnp; |
| 1012 | 1047 | ||
| 1013 | /* Exclude any attempts to start a new grace period. */ | 1048 | /* Exclude any attempts to start a new grace period. */ |
| @@ -1123,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1123 | 1158 | ||
| 1124 | /* Update count, and requeue any remaining callbacks. */ | 1159 | /* Update count, and requeue any remaining callbacks. */ |
| 1125 | rdp->qlen -= count; | 1160 | rdp->qlen -= count; |
| 1161 | rdp->n_cbs_invoked += count; | ||
| 1126 | if (list != NULL) { | 1162 | if (list != NULL) { |
| 1127 | *tail = rdp->nxtlist; | 1163 | *tail = rdp->nxtlist; |
| 1128 | rdp->nxtlist = list; | 1164 | rdp->nxtlist = list; |
| @@ -1226,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
| 1226 | cpu = rnp->grplo; | 1262 | cpu = rnp->grplo; |
| 1227 | bit = 1; | 1263 | bit = 1; |
| 1228 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | 1264 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { |
| 1229 | if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) | 1265 | if ((rnp->qsmask & bit) != 0 && |
| 1266 | f(per_cpu_ptr(rsp->rda, cpu))) | ||
| 1230 | mask |= bit; | 1267 | mask |= bit; |
| 1231 | } | 1268 | } |
| 1232 | if (mask != 0) { | 1269 | if (mask != 0) { |
| @@ -1402,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
| 1402 | * a quiescent state betweentimes. | 1439 | * a quiescent state betweentimes. |
| 1403 | */ | 1440 | */ |
| 1404 | local_irq_save(flags); | 1441 | local_irq_save(flags); |
| 1405 | rdp = rsp->rda[smp_processor_id()]; | 1442 | rdp = this_cpu_ptr(rsp->rda); |
| 1406 | rcu_process_gp_end(rsp, rdp); | 1443 | rcu_process_gp_end(rsp, rdp); |
| 1407 | check_for_new_grace_period(rsp, rdp); | 1444 | check_for_new_grace_period(rsp, rdp); |
| 1408 | 1445 | ||
| @@ -1701,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 1701 | { | 1738 | { |
| 1702 | unsigned long flags; | 1739 | unsigned long flags; |
| 1703 | int i; | 1740 | int i; |
| 1704 | struct rcu_data *rdp = rsp->rda[cpu]; | 1741 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
| 1705 | struct rcu_node *rnp = rcu_get_root(rsp); | 1742 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1706 | 1743 | ||
| 1707 | /* Set up local state, ensuring consistent view of global state. */ | 1744 | /* Set up local state, ensuring consistent view of global state. */ |
| @@ -1729,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
| 1729 | { | 1766 | { |
| 1730 | unsigned long flags; | 1767 | unsigned long flags; |
| 1731 | unsigned long mask; | 1768 | unsigned long mask; |
| 1732 | struct rcu_data *rdp = rsp->rda[cpu]; | 1769 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
| 1733 | struct rcu_node *rnp = rcu_get_root(rsp); | 1770 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1734 | 1771 | ||
| 1735 | /* Set up local state, ensuring consistent view of global state. */ | 1772 | /* Set up local state, ensuring consistent view of global state. */ |
| @@ -1865,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
| 1865 | /* | 1902 | /* |
| 1866 | * Helper function for rcu_init() that initializes one rcu_state structure. | 1903 | * Helper function for rcu_init() that initializes one rcu_state structure. |
| 1867 | */ | 1904 | */ |
| 1868 | static void __init rcu_init_one(struct rcu_state *rsp) | 1905 | static void __init rcu_init_one(struct rcu_state *rsp, |
| 1906 | struct rcu_data __percpu *rda) | ||
| 1869 | { | 1907 | { |
| 1870 | static char *buf[] = { "rcu_node_level_0", | 1908 | static char *buf[] = { "rcu_node_level_0", |
| 1871 | "rcu_node_level_1", | 1909 | "rcu_node_level_1", |
| @@ -1918,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp) | |||
| 1918 | } | 1956 | } |
| 1919 | } | 1957 | } |
| 1920 | 1958 | ||
| 1959 | rsp->rda = rda; | ||
| 1921 | rnp = rsp->level[NUM_RCU_LVLS - 1]; | 1960 | rnp = rsp->level[NUM_RCU_LVLS - 1]; |
| 1922 | for_each_possible_cpu(i) { | 1961 | for_each_possible_cpu(i) { |
| 1923 | while (i > rnp->grphi) | 1962 | while (i > rnp->grphi) |
| 1924 | rnp++; | 1963 | rnp++; |
| 1925 | rsp->rda[i]->mynode = rnp; | 1964 | per_cpu_ptr(rsp->rda, i)->mynode = rnp; |
| 1926 | rcu_boot_init_percpu_data(i, rsp); | 1965 | rcu_boot_init_percpu_data(i, rsp); |
| 1927 | } | 1966 | } |
| 1928 | } | 1967 | } |
| 1929 | 1968 | ||
| 1930 | /* | ||
| 1931 | * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used | ||
| 1932 | * nowhere else! Assigns leaf node pointers into each CPU's rcu_data | ||
| 1933 | * structure. | ||
| 1934 | */ | ||
| 1935 | #define RCU_INIT_FLAVOR(rsp, rcu_data) \ | ||
| 1936 | do { \ | ||
| 1937 | int i; \ | ||
| 1938 | \ | ||
| 1939 | for_each_possible_cpu(i) { \ | ||
| 1940 | (rsp)->rda[i] = &per_cpu(rcu_data, i); \ | ||
| 1941 | } \ | ||
| 1942 | rcu_init_one(rsp); \ | ||
| 1943 | } while (0) | ||
| 1944 | |||
| 1945 | void __init rcu_init(void) | 1969 | void __init rcu_init(void) |
| 1946 | { | 1970 | { |
| 1947 | int cpu; | 1971 | int cpu; |
| 1948 | 1972 | ||
| 1949 | rcu_bootup_announce(); | 1973 | rcu_bootup_announce(); |
| 1950 | RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); | 1974 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); |
| 1951 | RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); | 1975 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
| 1952 | __rcu_init_preempt(); | 1976 | __rcu_init_preempt(); |
| 1953 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 1977 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
| 1954 | 1978 | ||
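Most of the rcutree.c churn above replaces the fixed rsp->rda[NR_CPUS] array of pointers with a single __percpu pointer, reached through this_cpu_ptr() for the local CPU and per_cpu_ptr() for a named CPU. A stand-alone sketch of that access pattern, with hypothetical names:

#include <linux/percpu.h>

struct stats {				/* hypothetical per-CPU payload */
	unsigned long handled;
};

static DEFINE_PER_CPU(struct stats, my_stats);

static void note_handled(void)
{
	/*
	 * Local CPU's instance; the caller should not be preemptible,
	 * mirroring the this_cpu_ptr(rsp->rda) uses above, which run
	 * with interrupts disabled.
	 */
	this_cpu_ptr(&my_stats)->handled++;
}

static unsigned long read_handled(int cpu)
{
	/* A particular CPU's instance, like per_cpu_ptr(rsp->rda, cpu). */
	return per_cpu_ptr(&my_stats, cpu)->handled;
}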
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 14c040b18ed0..91d4170c5c13 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
| @@ -202,6 +202,9 @@ struct rcu_data { | |||
| 202 | long qlen; /* # of queued callbacks */ | 202 | long qlen; /* # of queued callbacks */ |
| 203 | long qlen_last_fqs_check; | 203 | long qlen_last_fqs_check; |
| 204 | /* qlen at last check for QS forcing */ | 204 | /* qlen at last check for QS forcing */ |
| 205 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ | ||
| 206 | unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ | ||
| 207 | unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ | ||
| 205 | unsigned long n_force_qs_snap; | 208 | unsigned long n_force_qs_snap; |
| 206 | /* did other CPU force QS recently? */ | 209 | /* did other CPU force QS recently? */ |
| 207 | long blimit; /* Upper limit on a processed batch */ | 210 | long blimit; /* Upper limit on a processed batch */ |
| @@ -254,19 +257,23 @@ struct rcu_data { | |||
| 254 | #define RCU_STALL_DELAY_DELTA 0 | 257 | #define RCU_STALL_DELAY_DELTA 0 |
| 255 | #endif | 258 | #endif |
| 256 | 259 | ||
| 257 | #define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) | 260 | #define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \ |
| 261 | RCU_STALL_DELAY_DELTA) | ||
| 258 | /* for rsp->jiffies_stall */ | 262 | /* for rsp->jiffies_stall */ |
| 259 | #define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) | 263 | #define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30) |
| 260 | /* for rsp->jiffies_stall */ | 264 | /* for rsp->jiffies_stall */ |
| 261 | #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ | 265 | #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ |
| 262 | /* to take at least one */ | 266 | /* to take at least one */ |
| 263 | /* scheduling clock irq */ | 267 | /* scheduling clock irq */ |
| 264 | /* before ratting on them. */ | 268 | /* before ratting on them. */ |
| 265 | 269 | ||
| 266 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 270 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE |
| 271 | #define RCU_CPU_STALL_SUPPRESS_INIT 0 | ||
| 272 | #else | ||
| 273 | #define RCU_CPU_STALL_SUPPRESS_INIT 1 | ||
| 274 | #endif | ||
| 267 | 275 | ||
| 268 | #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) | 276 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
| 269 | #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) | ||
| 270 | 277 | ||
| 271 | /* | 278 | /* |
| 272 | * RCU global state, including node hierarchy. This hierarchy is | 279 | * RCU global state, including node hierarchy. This hierarchy is |
| @@ -283,7 +290,7 @@ struct rcu_state { | |||
| 283 | struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ | 290 | struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ |
| 284 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ | 291 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ |
| 285 | u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ | 292 | u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ |
| 286 | struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ | 293 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ |
| 287 | 294 | ||
| 288 | /* The following fields are guarded by the root rcu_node's lock. */ | 295 | /* The following fields are guarded by the root rcu_node's lock. */ |
| 289 | 296 | ||
| @@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | |||
| 365 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 372 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR |
| 366 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | 373 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); |
| 367 | static void rcu_print_task_stall(struct rcu_node *rnp); | 374 | static void rcu_print_task_stall(struct rcu_node *rnp); |
| 375 | static void rcu_preempt_stall_reset(void); | ||
| 368 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 376 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
| 369 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 377 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
| 370 | #ifdef CONFIG_HOTPLUG_CPU | 378 | #ifdef CONFIG_HOTPLUG_CPU |
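The rcutree.h hunk above derives the stall-warning interval from the new CONFIG_RCU_CPU_STALL_TIMEOUT option rather than a hard-coded 10 seconds, with rechecks spaced three timeouts apart. A quick worked example, assuming purely for illustration a 60-second timeout, HZ=1000, and RCU_STALL_DELAY_DELTA of zero:

/*
 * RCU_SECONDS_TILL_STALL_CHECK   = 60 * 1000 + 0  =  60000 jiffies (60 s)
 * RCU_SECONDS_TILL_STALL_RECHECK = 3 * 60000 + 30 = 180030 jiffies (~180 s)
 *
 * So the first warning for a given grace period fires after the configured
 * timeout, and further warnings repeat roughly every three timeouts until
 * the stall clears or the grace period ends.
 */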
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0e4f420245d9..71a4147473f9 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
| @@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void) | |||
| 57 | printk(KERN_INFO | 57 | printk(KERN_INFO |
| 58 | "\tRCU-based detection of stalled CPUs is disabled.\n"); | 58 | "\tRCU-based detection of stalled CPUs is disabled.\n"); |
| 59 | #endif | 59 | #endif |
| 60 | #ifndef CONFIG_RCU_CPU_STALL_VERBOSE | 60 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) |
| 61 | printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); | 61 | printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); |
| 62 | #endif | 62 | #endif |
| 63 | #if NUM_RCU_LVL_4 != 0 | 63 | #if NUM_RCU_LVL_4 != 0 |
| @@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
| 154 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | 154 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { |
| 155 | 155 | ||
| 156 | /* Possibly blocking in an RCU read-side critical section. */ | 156 | /* Possibly blocking in an RCU read-side critical section. */ |
| 157 | rdp = rcu_preempt_state.rda[cpu]; | 157 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); |
| 158 | rnp = rdp->mynode; | 158 | rnp = rdp->mynode; |
| 159 | raw_spin_lock_irqsave(&rnp->lock, flags); | 159 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 160 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | 160 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; |
| @@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
| 201 | */ | 201 | */ |
| 202 | void __rcu_read_lock(void) | 202 | void __rcu_read_lock(void) |
| 203 | { | 203 | { |
| 204 | ACCESS_ONCE(current->rcu_read_lock_nesting)++; | 204 | current->rcu_read_lock_nesting++; |
| 205 | barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ | 205 | barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ |
| 206 | } | 206 | } |
| 207 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | 207 | EXPORT_SYMBOL_GPL(__rcu_read_lock); |
| @@ -344,7 +344,9 @@ void __rcu_read_unlock(void) | |||
| 344 | struct task_struct *t = current; | 344 | struct task_struct *t = current; |
| 345 | 345 | ||
| 346 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ | 346 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ |
| 347 | if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && | 347 | --t->rcu_read_lock_nesting; |
| 348 | barrier(); /* decrement before load of ->rcu_read_unlock_special */ | ||
| 349 | if (t->rcu_read_lock_nesting == 0 && | ||
| 348 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | 350 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) |
| 349 | rcu_read_unlock_special(t); | 351 | rcu_read_unlock_special(t); |
| 350 | #ifdef CONFIG_PROVE_LOCKING | 352 | #ifdef CONFIG_PROVE_LOCKING |
| @@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp) | |||
| 417 | } | 419 | } |
| 418 | } | 420 | } |
| 419 | 421 | ||
| 422 | /* | ||
| 423 | * Suppress preemptible RCU's CPU stall warnings by pushing the | ||
| 424 | * time of the next stall-warning message comfortably far into the | ||
| 425 | * future. | ||
| 426 | */ | ||
| 427 | static void rcu_preempt_stall_reset(void) | ||
| 428 | { | ||
| 429 | rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; | ||
| 430 | } | ||
| 431 | |||
| 420 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 432 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
| 421 | 433 | ||
| 422 | /* | 434 | /* |
| @@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu); | |||
| 546 | * | 558 | * |
| 547 | * Control will return to the caller some time after a full grace | 559 | * Control will return to the caller some time after a full grace |
| 548 | * period has elapsed, in other words after all currently executing RCU | 560 | * period has elapsed, in other words after all currently executing RCU |
| 549 | * read-side critical sections have completed. RCU read-side critical | 561 | * read-side critical sections have completed. Note, however, that |
| 550 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | 562 | * upon return from synchronize_rcu(), the caller might well be executing |
| 551 | * and may be nested. | 563 | * concurrently with new RCU read-side critical sections that began while |
| 564 | * synchronize_rcu() was waiting. RCU read-side critical sections are | ||
| 565 | * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. | ||
| 552 | */ | 566 | */ |
| 553 | void synchronize_rcu(void) | 567 | void synchronize_rcu(void) |
| 554 | { | 568 | { |
| @@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void) | |||
| 771 | */ | 785 | */ |
| 772 | static void __init __rcu_init_preempt(void) | 786 | static void __init __rcu_init_preempt(void) |
| 773 | { | 787 | { |
| 774 | RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); | 788 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); |
| 775 | } | 789 | } |
| 776 | 790 | ||
| 777 | /* | 791 | /* |
| @@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp) | |||
| 865 | { | 879 | { |
| 866 | } | 880 | } |
| 867 | 881 | ||
| 882 | /* | ||
| 883 | * Because preemptible RCU does not exist, there is no need to suppress | ||
| 884 | * its CPU stall warnings. | ||
| 885 | */ | ||
| 886 | static void rcu_preempt_stall_reset(void) | ||
| 887 | { | ||
| 888 | } | ||
| 889 | |||
| 868 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 890 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ |
| 869 | 891 | ||
| 870 | /* | 892 | /* |
| @@ -919,15 +941,6 @@ static void rcu_preempt_process_callbacks(void) | |||
| 919 | } | 941 | } |
| 920 | 942 | ||
| 921 | /* | 943 | /* |
| 922 | * In classic RCU, call_rcu() is just call_rcu_sched(). | ||
| 923 | */ | ||
| 924 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
| 925 | { | ||
| 926 | call_rcu_sched(head, func); | ||
| 927 | } | ||
| 928 | EXPORT_SYMBOL_GPL(call_rcu); | ||
| 929 | |||
| 930 | /* | ||
| 931 | * Wait for an rcu-preempt grace period, but make it happen quickly. | 944 | * Wait for an rcu-preempt grace period, but make it happen quickly. |
| 932 | * But because preemptable RCU does not exist, map to rcu-sched. | 945 | * But because preemptable RCU does not exist, map to rcu-sched. |
| 933 | */ | 946 | */ |
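Together with rcu_cpu_stall_reset() added in the rcutree.c hunk, the rcu_preempt_stall_reset() above lets code that deliberately freezes CPUs for a long stretch (a kernel debugger, for instance) mute stall warnings for the grace periods in flight. A hypothetical caller, for illustration only:

static void debugger_enter(void)
{
	unsigned long flags;

	local_irq_save(flags);		/* rcu_cpu_stall_reset() requires hard irqs off */
	rcu_cpu_stall_reset();		/* push each flavor's jiffies_stall far into the future */
	local_irq_restore(flags);

	/* ... halt the other CPUs and single-step for as long as needed ... */
}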
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 36c95b45738e..d15430b9d122 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
| @@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
| 64 | rdp->dynticks_fqs); | 64 | rdp->dynticks_fqs); |
| 65 | #endif /* #ifdef CONFIG_NO_HZ */ | 65 | #endif /* #ifdef CONFIG_NO_HZ */ |
| 66 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); | 66 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); |
| 67 | seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); | 67 | seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); |
| 68 | seq_printf(m, " ci=%lu co=%lu ca=%lu\n", | ||
| 69 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | ||
| 68 | } | 70 | } |
| 69 | 71 | ||
| 70 | #define PRINT_RCU_DATA(name, func, m) \ | 72 | #define PRINT_RCU_DATA(name, func, m) \ |
| @@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
| 119 | rdp->dynticks_fqs); | 121 | rdp->dynticks_fqs); |
| 120 | #endif /* #ifdef CONFIG_NO_HZ */ | 122 | #endif /* #ifdef CONFIG_NO_HZ */ |
| 121 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); | 123 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); |
| 122 | seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); | 124 | seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); |
| 125 | seq_printf(m, ",%lu,%lu,%lu\n", | ||
| 126 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | ||
| 123 | } | 127 | } |
| 124 | 128 | ||
| 125 | static int show_rcudata_csv(struct seq_file *m, void *unused) | 129 | static int show_rcudata_csv(struct seq_file *m, void *unused) |
| @@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) | |||
| 128 | #ifdef CONFIG_NO_HZ | 132 | #ifdef CONFIG_NO_HZ |
| 129 | seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); | 133 | seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); |
| 130 | #endif /* #ifdef CONFIG_NO_HZ */ | 134 | #endif /* #ifdef CONFIG_NO_HZ */ |
| 131 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); | 135 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); |
| 132 | #ifdef CONFIG_TREE_PREEMPT_RCU | 136 | #ifdef CONFIG_TREE_PREEMPT_RCU |
| 133 | seq_puts(m, "\"rcu_preempt:\"\n"); | 137 | seq_puts(m, "\"rcu_preempt:\"\n"); |
| 134 | PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); | 138 | PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); |
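The three extra columns appear in both the plain and the CSV rcudata output: ci, co and ca print rdp->n_cbs_invoked, rdp->n_cbs_orphaned and rdp->n_cbs_adopted, which by their names count callbacks this CPU has run, handed off to the orphanage on offline, and adopted from it. Purely to illustrate the widened row (values invented), the tail of a CSV line now follows the order of,ri,ql,b,ci,co,ca, e.g. ...,0,12,45,10,15342,0,0.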
| @@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) | |||
| 262 | struct rcu_data *rdp; | 266 | struct rcu_data *rdp; |
| 263 | 267 | ||
| 264 | for_each_possible_cpu(cpu) { | 268 | for_each_possible_cpu(cpu) { |
| 265 | rdp = rsp->rda[cpu]; | 269 | rdp = per_cpu_ptr(rsp->rda, cpu); |
| 266 | if (rdp->beenonline) | 270 | if (rdp->beenonline) |
| 267 | print_one_rcu_pending(m, rdp); | 271 | print_one_rcu_pending(m, rdp); |
| 268 | } | 272 | } |
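The per_cpu_ptr() change in the last hunk implies that rsp->rda is now a real percpu pointer rather than an array of per-CPU pointers indexed by CPU number. A minimal, self-contained sketch of that access idiom, with invented names and unrelated to the RCU data structures themselves:

        #include <linux/percpu.h>
        #include <linux/cpumask.h>

        /* one instance of the counter per possible CPU */
        static DEFINE_PER_CPU(unsigned long, demo_count);

        static unsigned long demo_total(void)
        {
                unsigned long sum = 0;
                int cpu;

                /* per_cpu_ptr() yields the address of a given CPU's instance */
                for_each_possible_cpu(cpu)
                        sum += *per_cpu_ptr(&demo_count, cpu);
                return sum;
        }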
diff --git a/kernel/sched.c b/kernel/sched.c index 09b574e7f4df..d42992bccdfa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -426,9 +426,7 @@ struct root_domain { | |||
| 426 | */ | 426 | */ |
| 427 | cpumask_var_t rto_mask; | 427 | cpumask_var_t rto_mask; |
| 428 | atomic_t rto_count; | 428 | atomic_t rto_count; |
| 429 | #ifdef CONFIG_SMP | ||
| 430 | struct cpupri cpupri; | 429 | struct cpupri cpupri; |
| 431 | #endif | ||
| 432 | }; | 430 | }; |
| 433 | 431 | ||
| 434 | /* | 432 | /* |
| @@ -437,7 +435,7 @@ struct root_domain { | |||
| 437 | */ | 435 | */ |
| 438 | static struct root_domain def_root_domain; | 436 | static struct root_domain def_root_domain; |
| 439 | 437 | ||
| 440 | #endif | 438 | #endif /* CONFIG_SMP */ |
| 441 | 439 | ||
| 442 | /* | 440 | /* |
| 443 | * This is the main, per-CPU runqueue data structure. | 441 | * This is the main, per-CPU runqueue data structure. |
| @@ -488,11 +486,12 @@ struct rq { | |||
| 488 | */ | 486 | */ |
| 489 | unsigned long nr_uninterruptible; | 487 | unsigned long nr_uninterruptible; |
| 490 | 488 | ||
| 491 | struct task_struct *curr, *idle; | 489 | struct task_struct *curr, *idle, *stop; |
| 492 | unsigned long next_balance; | 490 | unsigned long next_balance; |
| 493 | struct mm_struct *prev_mm; | 491 | struct mm_struct *prev_mm; |
| 494 | 492 | ||
| 495 | u64 clock; | 493 | u64 clock; |
| 494 | u64 clock_task; | ||
| 496 | 495 | ||
| 497 | atomic_t nr_iowait; | 496 | atomic_t nr_iowait; |
| 498 | 497 | ||
| @@ -520,6 +519,10 @@ struct rq { | |||
| 520 | u64 avg_idle; | 519 | u64 avg_idle; |
| 521 | #endif | 520 | #endif |
| 522 | 521 | ||
| 522 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 523 | u64 prev_irq_time; | ||
| 524 | #endif | ||
| 525 | |||
| 523 | /* calc_load related fields */ | 526 | /* calc_load related fields */ |
| 524 | unsigned long calc_load_update; | 527 | unsigned long calc_load_update; |
| 525 | long calc_load_active; | 528 | long calc_load_active; |
| @@ -643,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
| 643 | 646 | ||
| 644 | #endif /* CONFIG_CGROUP_SCHED */ | 647 | #endif /* CONFIG_CGROUP_SCHED */ |
| 645 | 648 | ||
| 649 | static u64 irq_time_cpu(int cpu); | ||
| 650 | static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); | ||
| 651 | |||
| 646 | inline void update_rq_clock(struct rq *rq) | 652 | inline void update_rq_clock(struct rq *rq) |
| 647 | { | 653 | { |
| 648 | if (!rq->skip_clock_update) | 654 | if (!rq->skip_clock_update) { |
| 649 | rq->clock = sched_clock_cpu(cpu_of(rq)); | 655 | int cpu = cpu_of(rq); |
| 656 | u64 irq_time; | ||
| 657 | |||
| 658 | rq->clock = sched_clock_cpu(cpu); | ||
| 659 | irq_time = irq_time_cpu(cpu); | ||
| 660 | if (rq->clock - irq_time > rq->clock_task) | ||
| 661 | rq->clock_task = rq->clock - irq_time; | ||
| 662 | |||
| 663 | sched_irq_time_avg_update(rq, irq_time); | ||
| 664 | } | ||
| 650 | } | 665 | } |
| 651 | 666 | ||
| 652 | /* | 667 | /* |
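Net effect of this hunk: rq->clock keeps tracking raw sched_clock time, while the new rq->clock_task advances only by the portion of that time not spent in interrupts (as reported by irq_time_cpu()) and never moves backwards. For example, if rq->clock advanced 10 ms since the previous update and 3 ms of that was interrupt time, rq->clock_task advances by roughly 7 ms. A stripped-down model of the guard, for illustration only:

        #include <linux/types.h>

        /* mirrors the update above: subtract accumulated irq time, never go backwards */
        static void demo_update_clock_task(u64 *clock_task, u64 clock, u64 irq_time)
        {
                if (clock - irq_time > *clock_task)
                        *clock_task = clock - irq_time;
        }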
| @@ -723,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
| 723 | size_t cnt, loff_t *ppos) | 738 | size_t cnt, loff_t *ppos) |
| 724 | { | 739 | { |
| 725 | char buf[64]; | 740 | char buf[64]; |
| 726 | char *cmp = buf; | 741 | char *cmp; |
| 727 | int neg = 0; | 742 | int neg = 0; |
| 728 | int i; | 743 | int i; |
| 729 | 744 | ||
| @@ -734,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
| 734 | return -EFAULT; | 749 | return -EFAULT; |
| 735 | 750 | ||
| 736 | buf[cnt] = 0; | 751 | buf[cnt] = 0; |
| 752 | cmp = strstrip(buf); | ||
| 737 | 753 | ||
| 738 | if (strncmp(buf, "NO_", 3) == 0) { | 754 | if (strncmp(buf, "NO_", 3) == 0) { |
| 739 | neg = 1; | 755 | neg = 1; |
| @@ -741,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
| 741 | } | 757 | } |
| 742 | 758 | ||
| 743 | for (i = 0; sched_feat_names[i]; i++) { | 759 | for (i = 0; sched_feat_names[i]; i++) { |
| 744 | int len = strlen(sched_feat_names[i]); | 760 | if (strcmp(cmp, sched_feat_names[i]) == 0) { |
| 745 | |||
| 746 | if (strncmp(cmp, sched_feat_names[i], len) == 0) { | ||
| 747 | if (neg) | 761 | if (neg) |
| 748 | sysctl_sched_features &= ~(1UL << i); | 762 | sysctl_sched_features &= ~(1UL << i); |
| 749 | else | 763 | else |
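The parsing change is deliberate: input arrives via write(), and a plain echo of a feature name appends a newline, so the buffer is now run through strstrip() and compared with strcmp() for an exact whole-name match. The old strncmp() over the feature name's length would also accept any input that merely begins with a known name, so a string sharing only a prefix with a real feature could toggle the wrong bit; with the new code, a name followed by stray whitespace still matches while a prefix-only match no longer does.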
| @@ -1294,6 +1308,10 @@ static void resched_task(struct task_struct *p) | |||
| 1294 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 1308 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) |
| 1295 | { | 1309 | { |
| 1296 | } | 1310 | } |
| 1311 | |||
| 1312 | static void sched_avg_update(struct rq *rq) | ||
| 1313 | { | ||
| 1314 | } | ||
| 1297 | #endif /* CONFIG_SMP */ | 1315 | #endif /* CONFIG_SMP */ |
| 1298 | 1316 | ||
| 1299 | #if BITS_PER_LONG == 32 | 1317 | #if BITS_PER_LONG == 32 |
| @@ -1836,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
| 1836 | 1854 | ||
| 1837 | static const struct sched_class rt_sched_class; | 1855 | static const struct sched_class rt_sched_class; |
| 1838 | 1856 | ||
| 1839 | #define sched_class_highest (&rt_sched_class) | 1857 | #define sched_class_highest (&stop_sched_class) |
| 1840 | #define for_each_class(class) \ | 1858 | #define for_each_class(class) \ |
| 1841 | for (class = sched_class_highest; class; class = class->next) | 1859 | for (class = sched_class_highest; class; class = class->next) |
| 1842 | 1860 | ||
| @@ -1854,12 +1872,6 @@ static void dec_nr_running(struct rq *rq) | |||
| 1854 | 1872 | ||
| 1855 | static void set_load_weight(struct task_struct *p) | 1873 | static void set_load_weight(struct task_struct *p) |
| 1856 | { | 1874 | { |
| 1857 | if (task_has_rt_policy(p)) { | ||
| 1858 | p->se.load.weight = 0; | ||
| 1859 | p->se.load.inv_weight = WMULT_CONST; | ||
| 1860 | return; | ||
| 1861 | } | ||
| 1862 | |||
| 1863 | /* | 1875 | /* |
| 1864 | * SCHED_IDLE tasks get minimal weight: | 1876 | * SCHED_IDLE tasks get minimal weight: |
| 1865 | */ | 1877 | */ |
| @@ -1913,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
| 1913 | dec_nr_running(rq); | 1925 | dec_nr_running(rq); |
| 1914 | } | 1926 | } |
| 1915 | 1927 | ||
| 1928 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 1929 | |||
| 1930 | /* | ||
| 1931 | * There are no locks covering percpu hardirq/softirq time. | ||
| 1932 | * They are only modified in account_system_vtime, on the corresponding CPU | ||
| 1933 | * with interrupts disabled. So, writes are safe. | ||
| 1934 | * They are read and saved off onto struct rq in update_rq_clock(). | ||
| 1935 | * This may result in another CPU reading this CPU's irq time, which can | ||
| 1936 | * race with irq/account_system_vtime on this CPU. We would either get the old | ||
| 1937 | * or the new value (or a semi-updated value on 32 bit) with a side effect of | ||
| 1938 | * accounting a slice of irq time to the wrong task when an irq is in progress | ||
| 1939 | * while we read rq->clock. That is a worthy compromise in place of having | ||
| 1940 | * locks on each irq in account_system_time. | ||
| 1941 | */ | ||
| 1942 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | ||
| 1943 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
| 1944 | |||
| 1945 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
| 1946 | static int sched_clock_irqtime; | ||
| 1947 | |||
| 1948 | void enable_sched_clock_irqtime(void) | ||
| 1949 | { | ||
| 1950 | sched_clock_irqtime = 1; | ||
| 1951 | } | ||
| 1952 | |||
| 1953 | void disable_sched_clock_irqtime(void) | ||
| 1954 | { | ||
| 1955 | sched_clock_irqtime = 0; | ||
| 1956 | } | ||
| 1957 | |||
| 1958 | static u64 irq_time_cpu(int cpu) | ||
| 1959 | { | ||
| 1960 | if (!sched_clock_irqtime) | ||
| 1961 | return 0; | ||
| 1962 | |||
| 1963 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
| 1964 | } | ||
| 1965 | |||
| 1966 | void account_system_vtime(struct task_struct *curr) | ||
| 1967 | { | ||
| 1968 | unsigned long flags; | ||
| 1969 | int cpu; | ||
| 1970 | u64 now, delta; | ||
| 1971 | |||
| 1972 | if (!sched_clock_irqtime) | ||
| 1973 | return; | ||
| 1974 | |||
| 1975 | local_irq_save(flags); | ||
| 1976 | |||
| 1977 | cpu = smp_processor_id(); | ||
| 1978 | now = sched_clock_cpu(cpu); | ||
| 1979 | delta = now - per_cpu(irq_start_time, cpu); | ||
| 1980 | per_cpu(irq_start_time, cpu) = now; | ||
| 1981 | /* | ||
| 1982 | * We do not account for softirq time from ksoftirqd here. | ||
| 1983 | * We want to continue accounting softirq time to the ksoftirqd thread | ||
| 1984 | * in that case, so as not to confuse the scheduler with a special task | ||
| 1985 | * that does not consume any time, but still wants to run. | ||
| 1986 | */ | ||
| 1987 | if (hardirq_count()) | ||
| 1988 | per_cpu(cpu_hardirq_time, cpu) += delta; | ||
| 1989 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) | ||
| 1990 | per_cpu(cpu_softirq_time, cpu) += delta; | ||
| 1991 | |||
| 1992 | local_irq_restore(flags); | ||
| 1993 | } | ||
| 1994 | EXPORT_SYMBOL_GPL(account_system_vtime); | ||
| 1995 | |||
| 1996 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) | ||
| 1997 | { | ||
| 1998 | if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { | ||
| 1999 | u64 delta_irq = curr_irq_time - rq->prev_irq_time; | ||
| 2000 | rq->prev_irq_time = curr_irq_time; | ||
| 2001 | sched_rt_avg_update(rq, delta_irq); | ||
| 2002 | } | ||
| 2003 | } | ||
| 2004 | |||
| 2005 | #else | ||
| 2006 | |||
| 2007 | static u64 irq_time_cpu(int cpu) | ||
| 2008 | { | ||
| 2009 | return 0; | ||
| 2010 | } | ||
| 2011 | |||
| 2012 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } | ||
| 2013 | |||
| 2014 | #endif | ||
| 2015 | |||
| 1916 | #include "sched_idletask.c" | 2016 | #include "sched_idletask.c" |
| 1917 | #include "sched_fair.c" | 2017 | #include "sched_fair.c" |
| 1918 | #include "sched_rt.c" | 2018 | #include "sched_rt.c" |
| 2019 | #include "sched_stoptask.c" | ||
| 1919 | #ifdef CONFIG_SCHED_DEBUG | 2020 | #ifdef CONFIG_SCHED_DEBUG |
| 1920 | # include "sched_debug.c" | 2021 | # include "sched_debug.c" |
| 1921 | #endif | 2022 | #endif |
| 1922 | 2023 | ||
| 2024 | void sched_set_stop_task(int cpu, struct task_struct *stop) | ||
| 2025 | { | ||
| 2026 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | ||
| 2027 | struct task_struct *old_stop = cpu_rq(cpu)->stop; | ||
| 2028 | |||
| 2029 | if (stop) { | ||
| 2030 | /* | ||
| 2031 | * Make it appear like a SCHED_FIFO task, it's something | ||
| 2032 | * userspace knows about and won't get confused about. | ||
| 2033 | * | ||
| 2034 | * Also, it will make PI more or less work without too | ||
| 2035 | * much confusion -- but then, stop work should not | ||
| 2036 | * rely on PI working anyway. | ||
| 2037 | */ | ||
| 2038 | sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); | ||
| 2039 | |||
| 2040 | stop->sched_class = &stop_sched_class; | ||
| 2041 | } | ||
| 2042 | |||
| 2043 | cpu_rq(cpu)->stop = stop; | ||
| 2044 | |||
| 2045 | if (old_stop) { | ||
| 2046 | /* | ||
| 2047 | * Reset it back to a normal scheduling class so that | ||
| 2048 | * it can die in pieces. | ||
| 2049 | */ | ||
| 2050 | old_stop->sched_class = &rt_sched_class; | ||
| 2051 | } | ||
| 2052 | } | ||
| 2053 | |||
| 1923 | /* | 2054 | /* |
| 1924 | * __normal_prio - return the priority that is based on the static prio | 2055 | * __normal_prio - return the priority that is based on the static prio |
| 1925 | */ | 2056 | */ |
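Putting this block together with the update_rq_clock() hunk earlier in the file: account_system_vtime() charges the time since the last snapshot (irq_start_time) to either cpu_hardirq_time or cpu_softirq_time on the local CPU, irq_time_cpu() reports their sum, and update_rq_clock() subtracts that sum from rq->clock to form rq->clock_task, which the sched_fair.c and sched_rt.c hunks later in this diff use for their exec-time bookkeeping. As a hypothetical trace: a hard interrupt that runs for 200 microseconds adds 200 microseconds to cpu_hardirq_time, so the next delta_exec charged to the task that was running on that CPU is about 200 microseconds smaller than the raw clock delta; the interrupt time is no longer billed to the task.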
| @@ -1999,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
| 1999 | if (p->sched_class != &fair_sched_class) | 2130 | if (p->sched_class != &fair_sched_class) |
| 2000 | return 0; | 2131 | return 0; |
| 2001 | 2132 | ||
| 2133 | if (unlikely(p->policy == SCHED_IDLE)) | ||
| 2134 | return 0; | ||
| 2135 | |||
| 2002 | /* | 2136 | /* |
| 2003 | * Buddy candidates are cache hot: | 2137 | * Buddy candidates are cache hot: |
| 2004 | */ | 2138 | */ |
| @@ -2848,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 2848 | */ | 2982 | */ |
| 2849 | arch_start_context_switch(prev); | 2983 | arch_start_context_switch(prev); |
| 2850 | 2984 | ||
| 2851 | if (likely(!mm)) { | 2985 | if (!mm) { |
| 2852 | next->active_mm = oldmm; | 2986 | next->active_mm = oldmm; |
| 2853 | atomic_inc(&oldmm->mm_count); | 2987 | atomic_inc(&oldmm->mm_count); |
| 2854 | enter_lazy_tlb(oldmm, next); | 2988 | enter_lazy_tlb(oldmm, next); |
| 2855 | } else | 2989 | } else |
| 2856 | switch_mm(oldmm, mm, next); | 2990 | switch_mm(oldmm, mm, next); |
| 2857 | 2991 | ||
| 2858 | if (likely(!prev->mm)) { | 2992 | if (!prev->mm) { |
| 2859 | prev->active_mm = NULL; | 2993 | prev->active_mm = NULL; |
| 2860 | rq->prev_mm = oldmm; | 2994 | rq->prev_mm = oldmm; |
| 2861 | } | 2995 | } |
| @@ -3182,6 +3316,8 @@ static void update_cpu_load(struct rq *this_rq) | |||
| 3182 | 3316 | ||
| 3183 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | 3317 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; |
| 3184 | } | 3318 | } |
| 3319 | |||
| 3320 | sched_avg_update(this_rq); | ||
| 3185 | } | 3321 | } |
| 3186 | 3322 | ||
| 3187 | static void update_cpu_load_active(struct rq *this_rq) | 3323 | static void update_cpu_load_active(struct rq *this_rq) |
| @@ -3242,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | |||
| 3242 | 3378 | ||
| 3243 | if (task_current(rq, p)) { | 3379 | if (task_current(rq, p)) { |
| 3244 | update_rq_clock(rq); | 3380 | update_rq_clock(rq); |
| 3245 | ns = rq->clock - p->se.exec_start; | 3381 | ns = rq->clock_task - p->se.exec_start; |
| 3246 | if ((s64)ns < 0) | 3382 | if ((s64)ns < 0) |
| 3247 | ns = 0; | 3383 | ns = 0; |
| 3248 | } | 3384 | } |
| @@ -3391,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
| 3391 | tmp = cputime_to_cputime64(cputime); | 3527 | tmp = cputime_to_cputime64(cputime); |
| 3392 | if (hardirq_count() - hardirq_offset) | 3528 | if (hardirq_count() - hardirq_offset) |
| 3393 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3529 | cpustat->irq = cputime64_add(cpustat->irq, tmp); |
| 3394 | else if (softirq_count()) | 3530 | else if (in_serving_softirq()) |
| 3395 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 3531 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); |
| 3396 | else | 3532 | else |
| 3397 | cpustat->system = cputime64_add(cpustat->system, tmp); | 3533 | cpustat->system = cputime64_add(cpustat->system, tmp); |
| @@ -3507,9 +3643,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
| 3507 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | 3643 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); |
| 3508 | 3644 | ||
| 3509 | if (total) { | 3645 | if (total) { |
| 3510 | u64 temp; | 3646 | u64 temp = rtime; |
| 3511 | 3647 | ||
| 3512 | temp = (u64)(rtime * utime); | 3648 | temp *= utime; |
| 3513 | do_div(temp, total); | 3649 | do_div(temp, total); |
| 3514 | utime = (cputime_t)temp; | 3650 | utime = (cputime_t)temp; |
| 3515 | } else | 3651 | } else |
| @@ -3540,9 +3676,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
| 3540 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | 3676 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); |
| 3541 | 3677 | ||
| 3542 | if (total) { | 3678 | if (total) { |
| 3543 | u64 temp; | 3679 | u64 temp = rtime; |
| 3544 | 3680 | ||
| 3545 | temp = (u64)(rtime * cputime.utime); | 3681 | temp *= cputime.utime; |
| 3546 | do_div(temp, total); | 3682 | do_div(temp, total); |
| 3547 | utime = (cputime_t)temp; | 3683 | utime = (cputime_t)temp; |
| 3548 | } else | 3684 | } else |
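Both this hunk and the previous one fix the same 32-bit overflow. cputime_t is typically only an unsigned long on 32-bit configurations, so rtime * utime was evaluated as a 32x32-bit multiply and truncated before the cast to u64; initializing temp as a u64 first forces a 64-bit multiplication. A small user-space illustration of the difference, with made-up tick counts standing in for cputime values:

        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                uint32_t rtime = 500000, utime = 400000;  /* made-up counts */
                uint64_t wrong, right;

                wrong = (uint64_t)(rtime * utime);  /* multiplied in 32 bits, then widened: truncated */
                right = rtime;
                right *= utime;                     /* widened first, then multiplied in 64 bits */

                printf("%llu vs %llu\n",
                       (unsigned long long)wrong, (unsigned long long)right);
                return 0;
        }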
| @@ -3578,7 +3714,7 @@ void scheduler_tick(void) | |||
| 3578 | curr->sched_class->task_tick(rq, curr, 0); | 3714 | curr->sched_class->task_tick(rq, curr, 0); |
| 3579 | raw_spin_unlock(&rq->lock); | 3715 | raw_spin_unlock(&rq->lock); |
| 3580 | 3716 | ||
| 3581 | perf_event_task_tick(curr); | 3717 | perf_event_task_tick(); |
| 3582 | 3718 | ||
| 3583 | #ifdef CONFIG_SMP | 3719 | #ifdef CONFIG_SMP |
| 3584 | rq->idle_at_tick = idle_cpu(cpu); | 3720 | rq->idle_at_tick = idle_cpu(cpu); |
| @@ -3717,17 +3853,13 @@ pick_next_task(struct rq *rq) | |||
| 3717 | return p; | 3853 | return p; |
| 3718 | } | 3854 | } |
| 3719 | 3855 | ||
| 3720 | class = sched_class_highest; | 3856 | for_each_class(class) { |
| 3721 | for ( ; ; ) { | ||
| 3722 | p = class->pick_next_task(rq); | 3857 | p = class->pick_next_task(rq); |
| 3723 | if (p) | 3858 | if (p) |
| 3724 | return p; | 3859 | return p; |
| 3725 | /* | ||
| 3726 | * Will never be NULL as the idle class always | ||
| 3727 | * returns a non-NULL p: | ||
| 3728 | */ | ||
| 3729 | class = class->next; | ||
| 3730 | } | 3860 | } |
| 3861 | |||
| 3862 | BUG(); /* the idle class will always have a runnable task */ | ||
| 3731 | } | 3863 | } |
| 3732 | 3864 | ||
| 3733 | /* | 3865 | /* |
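With stop_sched_class now at the head of the list, the open-coded walk can become a plain for_each_class() loop. The idle class sits at the tail and, as both the removed comment and the new BUG() note, always has a runnable task, so falling out of the loop would mean the class list itself is broken. In this kernel the walk should therefore visit stop_sched_class, then rt_sched_class, fair_sched_class and finally idle_sched_class; the ->next linkage lives in the individual sched_*.c files, which this diff does not show.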
| @@ -4352,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 4352 | 4484 | ||
| 4353 | rq = task_rq_lock(p, &flags); | 4485 | rq = task_rq_lock(p, &flags); |
| 4354 | 4486 | ||
| 4487 | trace_sched_pi_setprio(p, prio); | ||
| 4355 | oldprio = p->prio; | 4488 | oldprio = p->prio; |
| 4356 | prev_class = p->sched_class; | 4489 | prev_class = p->sched_class; |
| 4357 | on_rq = p->se.on_rq; | 4490 | on_rq = p->se.on_rq; |
| @@ -4639,7 +4772,7 @@ recheck: | |||
| 4639 | } | 4772 | } |
| 4640 | 4773 | ||
| 4641 | if (user) { | 4774 | if (user) { |
| 4642 | retval = security_task_setscheduler(p, policy, param); | 4775 | retval = security_task_setscheduler(p); |
| 4643 | if (retval) | 4776 | if (retval) |
| 4644 | return retval; | 4777 | return retval; |
| 4645 | } | 4778 | } |
| @@ -4655,6 +4788,15 @@ recheck: | |||
| 4655 | */ | 4788 | */ |
| 4656 | rq = __task_rq_lock(p); | 4789 | rq = __task_rq_lock(p); |
| 4657 | 4790 | ||
| 4791 | /* | ||
| 4792 | * Changing the policy of the stop threads is a very bad idea | ||
| 4793 | */ | ||
| 4794 | if (p == rq->stop) { | ||
| 4795 | __task_rq_unlock(rq); | ||
| 4796 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
| 4797 | return -EINVAL; | ||
| 4798 | } | ||
| 4799 | |||
| 4658 | #ifdef CONFIG_RT_GROUP_SCHED | 4800 | #ifdef CONFIG_RT_GROUP_SCHED |
| 4659 | if (user) { | 4801 | if (user) { |
| 4660 | /* | 4802 | /* |
| @@ -4881,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
| 4881 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) | 5023 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) |
| 4882 | goto out_unlock; | 5024 | goto out_unlock; |
| 4883 | 5025 | ||
| 4884 | retval = security_task_setscheduler(p, 0, NULL); | 5026 | retval = security_task_setscheduler(p); |
| 4885 | if (retval) | 5027 | if (retval) |
| 4886 | goto out_unlock; | 5028 | goto out_unlock; |
| 4887 | 5029 | ||
| 4888 | cpuset_cpus_allowed(p, cpus_allowed); | 5030 | cpuset_cpus_allowed(p, cpus_allowed); |
| 4889 | cpumask_and(new_mask, in_mask, cpus_allowed); | 5031 | cpumask_and(new_mask, in_mask, cpus_allowed); |
| 4890 | again: | 5032 | again: |
| 4891 | retval = set_cpus_allowed_ptr(p, new_mask); | 5033 | retval = set_cpus_allowed_ptr(p, new_mask); |
| 4892 | 5034 | ||
| 4893 | if (!retval) { | 5035 | if (!retval) { |
| @@ -5331,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
| 5331 | idle->se.exec_start = sched_clock(); | 5473 | idle->se.exec_start = sched_clock(); |
| 5332 | 5474 | ||
| 5333 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); | 5475 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); |
| 5476 | /* | ||
| 5477 | * We're having a chicken-and-egg problem: even though we are | ||
| 5478 | * holding rq->lock, the cpu isn't yet set to this cpu so the | ||
| 5479 | * lockdep check in task_group() will fail. | ||
| 5480 | * | ||
| 5481 | * Similar case to sched_fork(). / Alternatively we could | ||
| 5482 | * use task_rq_lock() here and obtain the other rq->lock. | ||
| 5483 | * | ||
| 5484 | * Silence PROVE_RCU | ||
| 5485 | */ | ||
| 5486 | rcu_read_lock(); | ||
| 5334 | __set_task_cpu(idle, cpu); | 5487 | __set_task_cpu(idle, cpu); |
| 5488 | rcu_read_unlock(); | ||
| 5335 | 5489 | ||
| 5336 | rq->curr = rq->idle = idle; | 5490 | rq->curr = rq->idle = idle; |
| 5337 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 5491 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
| @@ -6508,6 +6662,7 @@ struct s_data { | |||
| 6508 | cpumask_var_t nodemask; | 6662 | cpumask_var_t nodemask; |
| 6509 | cpumask_var_t this_sibling_map; | 6663 | cpumask_var_t this_sibling_map; |
| 6510 | cpumask_var_t this_core_map; | 6664 | cpumask_var_t this_core_map; |
| 6665 | cpumask_var_t this_book_map; | ||
| 6511 | cpumask_var_t send_covered; | 6666 | cpumask_var_t send_covered; |
| 6512 | cpumask_var_t tmpmask; | 6667 | cpumask_var_t tmpmask; |
| 6513 | struct sched_group **sched_group_nodes; | 6668 | struct sched_group **sched_group_nodes; |
| @@ -6519,6 +6674,7 @@ enum s_alloc { | |||
| 6519 | sa_rootdomain, | 6674 | sa_rootdomain, |
| 6520 | sa_tmpmask, | 6675 | sa_tmpmask, |
| 6521 | sa_send_covered, | 6676 | sa_send_covered, |
| 6677 | sa_this_book_map, | ||
| 6522 | sa_this_core_map, | 6678 | sa_this_core_map, |
| 6523 | sa_this_sibling_map, | 6679 | sa_this_sibling_map, |
| 6524 | sa_nodemask, | 6680 | sa_nodemask, |
| @@ -6554,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, | |||
| 6554 | #ifdef CONFIG_SCHED_MC | 6710 | #ifdef CONFIG_SCHED_MC |
| 6555 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); | 6711 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); |
| 6556 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); | 6712 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); |
| 6557 | #endif /* CONFIG_SCHED_MC */ | ||
| 6558 | 6713 | ||
| 6559 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | ||
| 6560 | static int | 6714 | static int |
| 6561 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 6715 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, |
| 6562 | struct sched_group **sg, struct cpumask *mask) | 6716 | struct sched_group **sg, struct cpumask *mask) |
| 6563 | { | 6717 | { |
| 6564 | int group; | 6718 | int group; |
| 6565 | 6719 | #ifdef CONFIG_SCHED_SMT | |
| 6566 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | 6720 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); |
| 6567 | group = cpumask_first(mask); | 6721 | group = cpumask_first(mask); |
| 6722 | #else | ||
| 6723 | group = cpu; | ||
| 6724 | #endif | ||
| 6568 | if (sg) | 6725 | if (sg) |
| 6569 | *sg = &per_cpu(sched_group_core, group).sg; | 6726 | *sg = &per_cpu(sched_group_core, group).sg; |
| 6570 | return group; | 6727 | return group; |
| 6571 | } | 6728 | } |
| 6572 | #elif defined(CONFIG_SCHED_MC) | 6729 | #endif /* CONFIG_SCHED_MC */ |
| 6730 | |||
| 6731 | /* | ||
| 6732 | * book sched-domains: | ||
| 6733 | */ | ||
| 6734 | #ifdef CONFIG_SCHED_BOOK | ||
| 6735 | static DEFINE_PER_CPU(struct static_sched_domain, book_domains); | ||
| 6736 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); | ||
| 6737 | |||
| 6573 | static int | 6738 | static int |
| 6574 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 6739 | cpu_to_book_group(int cpu, const struct cpumask *cpu_map, |
| 6575 | struct sched_group **sg, struct cpumask *unused) | 6740 | struct sched_group **sg, struct cpumask *mask) |
| 6576 | { | 6741 | { |
| 6742 | int group = cpu; | ||
| 6743 | #ifdef CONFIG_SCHED_MC | ||
| 6744 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
| 6745 | group = cpumask_first(mask); | ||
| 6746 | #elif defined(CONFIG_SCHED_SMT) | ||
| 6747 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
| 6748 | group = cpumask_first(mask); | ||
| 6749 | #endif | ||
| 6577 | if (sg) | 6750 | if (sg) |
| 6578 | *sg = &per_cpu(sched_group_core, cpu).sg; | 6751 | *sg = &per_cpu(sched_group_book, group).sg; |
| 6579 | return cpu; | 6752 | return group; |
| 6580 | } | 6753 | } |
| 6581 | #endif | 6754 | #endif /* CONFIG_SCHED_BOOK */ |
| 6582 | 6755 | ||
| 6583 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); | 6756 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); |
| 6584 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); | 6757 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); |
| @@ -6588,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, | |||
| 6588 | struct sched_group **sg, struct cpumask *mask) | 6761 | struct sched_group **sg, struct cpumask *mask) |
| 6589 | { | 6762 | { |
| 6590 | int group; | 6763 | int group; |
| 6591 | #ifdef CONFIG_SCHED_MC | 6764 | #ifdef CONFIG_SCHED_BOOK |
| 6765 | cpumask_and(mask, cpu_book_mask(cpu), cpu_map); | ||
| 6766 | group = cpumask_first(mask); | ||
| 6767 | #elif defined(CONFIG_SCHED_MC) | ||
| 6592 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | 6768 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); |
| 6593 | group = cpumask_first(mask); | 6769 | group = cpumask_first(mask); |
| 6594 | #elif defined(CONFIG_SCHED_SMT) | 6770 | #elif defined(CONFIG_SCHED_SMT) |
| @@ -6849,6 +7025,9 @@ SD_INIT_FUNC(CPU) | |||
| 6849 | #ifdef CONFIG_SCHED_MC | 7025 | #ifdef CONFIG_SCHED_MC |
| 6850 | SD_INIT_FUNC(MC) | 7026 | SD_INIT_FUNC(MC) |
| 6851 | #endif | 7027 | #endif |
| 7028 | #ifdef CONFIG_SCHED_BOOK | ||
| 7029 | SD_INIT_FUNC(BOOK) | ||
| 7030 | #endif | ||
| 6852 | 7031 | ||
| 6853 | static int default_relax_domain_level = -1; | 7032 | static int default_relax_domain_level = -1; |
| 6854 | 7033 | ||
| @@ -6898,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | |||
| 6898 | free_cpumask_var(d->tmpmask); /* fall through */ | 7077 | free_cpumask_var(d->tmpmask); /* fall through */ |
| 6899 | case sa_send_covered: | 7078 | case sa_send_covered: |
| 6900 | free_cpumask_var(d->send_covered); /* fall through */ | 7079 | free_cpumask_var(d->send_covered); /* fall through */ |
| 7080 | case sa_this_book_map: | ||
| 7081 | free_cpumask_var(d->this_book_map); /* fall through */ | ||
| 6901 | case sa_this_core_map: | 7082 | case sa_this_core_map: |
| 6902 | free_cpumask_var(d->this_core_map); /* fall through */ | 7083 | free_cpumask_var(d->this_core_map); /* fall through */ |
| 6903 | case sa_this_sibling_map: | 7084 | case sa_this_sibling_map: |
| @@ -6944,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | |||
| 6944 | return sa_nodemask; | 7125 | return sa_nodemask; |
| 6945 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | 7126 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) |
| 6946 | return sa_this_sibling_map; | 7127 | return sa_this_sibling_map; |
| 6947 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | 7128 | if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) |
| 6948 | return sa_this_core_map; | 7129 | return sa_this_core_map; |
| 7130 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
| 7131 | return sa_this_book_map; | ||
| 6949 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | 7132 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) |
| 6950 | return sa_send_covered; | 7133 | return sa_send_covered; |
| 6951 | d->rd = alloc_rootdomain(); | 7134 | d->rd = alloc_rootdomain(); |
| @@ -7003,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, | |||
| 7003 | return sd; | 7186 | return sd; |
| 7004 | } | 7187 | } |
| 7005 | 7188 | ||
| 7189 | static struct sched_domain *__build_book_sched_domain(struct s_data *d, | ||
| 7190 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
| 7191 | struct sched_domain *parent, int i) | ||
| 7192 | { | ||
| 7193 | struct sched_domain *sd = parent; | ||
| 7194 | #ifdef CONFIG_SCHED_BOOK | ||
| 7195 | sd = &per_cpu(book_domains, i).sd; | ||
| 7196 | SD_INIT(sd, BOOK); | ||
| 7197 | set_domain_attribute(sd, attr); | ||
| 7198 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); | ||
| 7199 | sd->parent = parent; | ||
| 7200 | parent->child = sd; | ||
| 7201 | cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
| 7202 | #endif | ||
| 7203 | return sd; | ||
| 7204 | } | ||
| 7205 | |||
| 7006 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | 7206 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, |
| 7007 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7207 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
| 7008 | struct sched_domain *parent, int i) | 7208 | struct sched_domain *parent, int i) |
| @@ -7060,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | |||
| 7060 | d->send_covered, d->tmpmask); | 7260 | d->send_covered, d->tmpmask); |
| 7061 | break; | 7261 | break; |
| 7062 | #endif | 7262 | #endif |
| 7263 | #ifdef CONFIG_SCHED_BOOK | ||
| 7264 | case SD_LV_BOOK: /* set up book groups */ | ||
| 7265 | cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); | ||
| 7266 | if (cpu == cpumask_first(d->this_book_map)) | ||
| 7267 | init_sched_build_groups(d->this_book_map, cpu_map, | ||
| 7268 | &cpu_to_book_group, | ||
| 7269 | d->send_covered, d->tmpmask); | ||
| 7270 | break; | ||
| 7271 | #endif | ||
| 7063 | case SD_LV_CPU: /* set up physical groups */ | 7272 | case SD_LV_CPU: /* set up physical groups */ |
| 7064 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); | 7273 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); |
| 7065 | if (!cpumask_empty(d->nodemask)) | 7274 | if (!cpumask_empty(d->nodemask)) |
| @@ -7107,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
| 7107 | 7316 | ||
| 7108 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); | 7317 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); |
| 7109 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); | 7318 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); |
| 7319 | sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); | ||
| 7110 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); | 7320 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); |
| 7111 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); | 7321 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); |
| 7112 | } | 7322 | } |
| 7113 | 7323 | ||
| 7114 | for_each_cpu(i, cpu_map) { | 7324 | for_each_cpu(i, cpu_map) { |
| 7115 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); | 7325 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); |
| 7326 | build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); | ||
| 7116 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | 7327 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); |
| 7117 | } | 7328 | } |
| 7118 | 7329 | ||
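The insertion order here also documents the resulting hierarchy: each __build_*_sched_domain() call makes the previously built level its parent, so from the smallest span upwards a CPU now sits in SIBLING (SMT), MC, BOOK, CPU and, where configured, NUMA domains, with the new BOOK level slotted between the multi-core and the physical level. On a hypothetical machine with 2 books, each holding 2 chips of 4 cores with 2 hardware threads, the spans would be 2 CPUs at the SMT level, 8 at MC, 16 at BOOK and 32 at CPU and above. The level is aimed at machines, notably s390, whose topology groups several multi-core chips into a book.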
| @@ -7143,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
| 7143 | init_sched_groups_power(i, sd); | 7354 | init_sched_groups_power(i, sd); |
| 7144 | } | 7355 | } |
| 7145 | #endif | 7356 | #endif |
| 7357 | #ifdef CONFIG_SCHED_BOOK | ||
| 7358 | for_each_cpu(i, cpu_map) { | ||
| 7359 | sd = &per_cpu(book_domains, i).sd; | ||
| 7360 | init_sched_groups_power(i, sd); | ||
| 7361 | } | ||
| 7362 | #endif | ||
| 7146 | 7363 | ||
| 7147 | for_each_cpu(i, cpu_map) { | 7364 | for_each_cpu(i, cpu_map) { |
| 7148 | sd = &per_cpu(phys_domains, i).sd; | 7365 | sd = &per_cpu(phys_domains, i).sd; |
| @@ -7168,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
| 7168 | sd = &per_cpu(cpu_domains, i).sd; | 7385 | sd = &per_cpu(cpu_domains, i).sd; |
| 7169 | #elif defined(CONFIG_SCHED_MC) | 7386 | #elif defined(CONFIG_SCHED_MC) |
| 7170 | sd = &per_cpu(core_domains, i).sd; | 7387 | sd = &per_cpu(core_domains, i).sd; |
| 7388 | #elif defined(CONFIG_SCHED_BOOK) | ||
| 7389 | sd = &per_cpu(book_domains, i).sd; | ||
| 7171 | #else | 7390 | #else |
| 7172 | sd = &per_cpu(phys_domains, i).sd; | 7391 | sd = &per_cpu(phys_domains, i).sd; |
| 7173 | #endif | 7392 | #endif |
| @@ -8072,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 8072 | 8291 | ||
| 8073 | return 1; | 8292 | return 1; |
| 8074 | 8293 | ||
| 8075 | err_free_rq: | 8294 | err_free_rq: |
| 8076 | kfree(cfs_rq); | 8295 | kfree(cfs_rq); |
| 8077 | err: | 8296 | err: |
| 8078 | return 0; | 8297 | return 0; |
| 8079 | } | 8298 | } |
| 8080 | 8299 | ||
| @@ -8162,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 8162 | 8381 | ||
| 8163 | return 1; | 8382 | return 1; |
| 8164 | 8383 | ||
| 8165 | err_free_rq: | 8384 | err_free_rq: |
| 8166 | kfree(rt_rq); | 8385 | kfree(rt_rq); |
| 8167 | err: | 8386 | err: |
| 8168 | return 0; | 8387 | return 0; |
| 8169 | } | 8388 | } |
| 8170 | 8389 | ||
| @@ -8522,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg, | |||
| 8522 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 8741 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
| 8523 | } | 8742 | } |
| 8524 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); | 8743 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
| 8525 | unlock: | 8744 | unlock: |
| 8526 | read_unlock(&tasklist_lock); | 8745 | read_unlock(&tasklist_lock); |
| 8527 | mutex_unlock(&rt_constraints_mutex); | 8746 | mutex_unlock(&rt_constraints_mutex); |
| 8528 | 8747 | ||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ab661ebc4895..933f3d1b62ea 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -25,7 +25,7 @@ | |||
| 25 | 25 | ||
| 26 | /* | 26 | /* |
| 27 | * Targeted preemption latency for CPU-bound tasks: | 27 | * Targeted preemption latency for CPU-bound tasks: |
| 28 | * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) | 28 | * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) |
| 29 | * | 29 | * |
| 30 | * NOTE: this latency value is not the same as the concept of | 30 | * NOTE: this latency value is not the same as the concept of |
| 31 | * 'timeslice length' - timeslices in CFS are of variable length | 31 | * 'timeslice length' - timeslices in CFS are of variable length |
| @@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling | |||
| 52 | 52 | ||
| 53 | /* | 53 | /* |
| 54 | * Minimal preemption granularity for CPU-bound tasks: | 54 | * Minimal preemption granularity for CPU-bound tasks: |
| 55 | * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) | 55 | * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) |
| 56 | */ | 56 | */ |
| 57 | unsigned int sysctl_sched_min_granularity = 2000000ULL; | 57 | unsigned int sysctl_sched_min_granularity = 750000ULL; |
| 58 | unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; | 58 | unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; |
| 59 | 59 | ||
| 60 | /* | 60 | /* |
| 61 | * is kept at sysctl_sched_latency / sysctl_sched_min_granularity | 61 | * is kept at sysctl_sched_latency / sysctl_sched_min_granularity |
| 62 | */ | 62 | */ |
| 63 | static unsigned int sched_nr_latency = 3; | 63 | static unsigned int sched_nr_latency = 8; |
| 64 | 64 | ||
| 65 | /* | 65 | /* |
| 66 | * After fork, child runs first. If set to 0 (default) then | 66 | * After fork, child runs first. If set to 0 (default) then |
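The retuned defaults stay mutually consistent: the comment above defines sched_nr_latency as sysctl_sched_latency / sysctl_sched_min_granularity, and with the 6 ms target latency from the earlier hunk and the 0.75 ms minimum granularity set here,

        6,000,000 ns / 750,000 ns = 8,

which is exactly the new value of sched_nr_latency.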
| @@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, | |||
| 519 | static void update_curr(struct cfs_rq *cfs_rq) | 519 | static void update_curr(struct cfs_rq *cfs_rq) |
| 520 | { | 520 | { |
| 521 | struct sched_entity *curr = cfs_rq->curr; | 521 | struct sched_entity *curr = cfs_rq->curr; |
| 522 | u64 now = rq_of(cfs_rq)->clock; | 522 | u64 now = rq_of(cfs_rq)->clock_task; |
| 523 | unsigned long delta_exec; | 523 | unsigned long delta_exec; |
| 524 | 524 | ||
| 525 | if (unlikely(!curr)) | 525 | if (unlikely(!curr)) |
| @@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 602 | /* | 602 | /* |
| 603 | * We are starting a new run period: | 603 | * We are starting a new run period: |
| 604 | */ | 604 | */ |
| 605 | se->exec_start = rq_of(cfs_rq)->clock; | 605 | se->exec_start = rq_of(cfs_rq)->clock_task; |
| 606 | } | 606 | } |
| 607 | 607 | ||
| 608 | /************************************************** | 608 | /************************************************** |
| @@ -1313,7 +1313,7 @@ static struct sched_group * | |||
| 1313 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, | 1313 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, |
| 1314 | int this_cpu, int load_idx) | 1314 | int this_cpu, int load_idx) |
| 1315 | { | 1315 | { |
| 1316 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; | 1316 | struct sched_group *idlest = NULL, *group = sd->groups; |
| 1317 | unsigned long min_load = ULONG_MAX, this_load = 0; | 1317 | unsigned long min_load = ULONG_MAX, this_load = 0; |
| 1318 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | 1318 | int imbalance = 100 + (sd->imbalance_pct-100)/2; |
| 1319 | 1319 | ||
| @@ -1348,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
| 1348 | 1348 | ||
| 1349 | if (local_group) { | 1349 | if (local_group) { |
| 1350 | this_load = avg_load; | 1350 | this_load = avg_load; |
| 1351 | this = group; | ||
| 1352 | } else if (avg_load < min_load) { | 1351 | } else if (avg_load < min_load) { |
| 1353 | min_load = avg_load; | 1352 | min_load = avg_load; |
| 1354 | idlest = group; | 1353 | idlest = group; |
| @@ -1765,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, | |||
| 1765 | set_task_cpu(p, this_cpu); | 1764 | set_task_cpu(p, this_cpu); |
| 1766 | activate_task(this_rq, p, 0); | 1765 | activate_task(this_rq, p, 0); |
| 1767 | check_preempt_curr(this_rq, p, 0); | 1766 | check_preempt_curr(this_rq, p, 0); |
| 1767 | |||
| 1768 | /* re-arm NEWIDLE balancing when moving tasks */ | ||
| 1769 | src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost; | ||
| 1770 | this_rq->idle_stamp = 0; | ||
| 1768 | } | 1771 | } |
| 1769 | 1772 | ||
| 1770 | /* | 1773 | /* |
| @@ -1799,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
| 1799 | * 2) too many balance attempts have failed. | 1802 | * 2) too many balance attempts have failed. |
| 1800 | */ | 1803 | */ |
| 1801 | 1804 | ||
| 1802 | tsk_cache_hot = task_hot(p, rq->clock, sd); | 1805 | tsk_cache_hot = task_hot(p, rq->clock_task, sd); |
| 1803 | if (!tsk_cache_hot || | 1806 | if (!tsk_cache_hot || |
| 1804 | sd->nr_balance_failed > sd->cache_nice_tries) { | 1807 | sd->nr_balance_failed > sd->cache_nice_tries) { |
| 1805 | #ifdef CONFIG_SCHEDSTATS | 1808 | #ifdef CONFIG_SCHEDSTATS |
| @@ -2031,12 +2034,14 @@ struct sd_lb_stats { | |||
| 2031 | unsigned long this_load; | 2034 | unsigned long this_load; |
| 2032 | unsigned long this_load_per_task; | 2035 | unsigned long this_load_per_task; |
| 2033 | unsigned long this_nr_running; | 2036 | unsigned long this_nr_running; |
| 2037 | unsigned long this_has_capacity; | ||
| 2034 | 2038 | ||
| 2035 | /* Statistics of the busiest group */ | 2039 | /* Statistics of the busiest group */ |
| 2036 | unsigned long max_load; | 2040 | unsigned long max_load; |
| 2037 | unsigned long busiest_load_per_task; | 2041 | unsigned long busiest_load_per_task; |
| 2038 | unsigned long busiest_nr_running; | 2042 | unsigned long busiest_nr_running; |
| 2039 | unsigned long busiest_group_capacity; | 2043 | unsigned long busiest_group_capacity; |
| 2044 | unsigned long busiest_has_capacity; | ||
| 2040 | 2045 | ||
| 2041 | int group_imb; /* Is there imbalance in this sd */ | 2046 | int group_imb; /* Is there imbalance in this sd */ |
| 2042 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 2047 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
| @@ -2059,6 +2064,7 @@ struct sg_lb_stats { | |||
| 2059 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 2064 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
| 2060 | unsigned long group_capacity; | 2065 | unsigned long group_capacity; |
| 2061 | int group_imb; /* Is there an imbalance in the group ? */ | 2066 | int group_imb; /* Is there an imbalance in the group ? */ |
| 2067 | int group_has_capacity; /* Is there extra capacity in the group? */ | ||
| 2062 | }; | 2068 | }; |
| 2063 | 2069 | ||
| 2064 | /** | 2070 | /** |
| @@ -2268,10 +2274,14 @@ unsigned long scale_rt_power(int cpu) | |||
| 2268 | struct rq *rq = cpu_rq(cpu); | 2274 | struct rq *rq = cpu_rq(cpu); |
| 2269 | u64 total, available; | 2275 | u64 total, available; |
| 2270 | 2276 | ||
| 2271 | sched_avg_update(rq); | ||
| 2272 | |||
| 2273 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | 2277 | total = sched_avg_period() + (rq->clock - rq->age_stamp); |
| 2274 | available = total - rq->rt_avg; | 2278 | |
| 2279 | if (unlikely(total < rq->rt_avg)) { | ||
| 2280 | /* Ensures that power won't end up being negative */ | ||
| 2281 | available = 0; | ||
| 2282 | } else { | ||
| 2283 | available = total - rq->rt_avg; | ||
| 2284 | } | ||
| 2275 | 2285 | ||
| 2276 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | 2286 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) |
| 2277 | total = SCHED_LOAD_SCALE; | 2287 | total = SCHED_LOAD_SCALE; |
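The clamp guards against unsigned wrap-around rather than a genuinely negative number: total and rq->rt_avg are u64, so if rt_avg ever exceeded total (say total = 1,000,000 and rt_avg = 1,200,000) the old subtraction would wrap to 2^64 - 200,000, roughly 1.8e19, and the computed cpu_power would blow up instead of shrinking. With the check, available simply bottoms out at 0.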
| @@ -2381,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
| 2381 | int local_group, const struct cpumask *cpus, | 2391 | int local_group, const struct cpumask *cpus, |
| 2382 | int *balance, struct sg_lb_stats *sgs) | 2392 | int *balance, struct sg_lb_stats *sgs) |
| 2383 | { | 2393 | { |
| 2384 | unsigned long load, max_cpu_load, min_cpu_load; | 2394 | unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; |
| 2385 | int i; | 2395 | int i; |
| 2386 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 2396 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
| 2387 | unsigned long avg_load_per_task = 0; | 2397 | unsigned long avg_load_per_task = 0; |
| @@ -2392,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
| 2392 | /* Tally up the load of all CPUs in the group */ | 2402 | /* Tally up the load of all CPUs in the group */ |
| 2393 | max_cpu_load = 0; | 2403 | max_cpu_load = 0; |
| 2394 | min_cpu_load = ~0UL; | 2404 | min_cpu_load = ~0UL; |
| 2405 | max_nr_running = 0; | ||
| 2395 | 2406 | ||
| 2396 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | 2407 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { |
| 2397 | struct rq *rq = cpu_rq(i); | 2408 | struct rq *rq = cpu_rq(i); |
| @@ -2409,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
| 2409 | load = target_load(i, load_idx); | 2420 | load = target_load(i, load_idx); |
| 2410 | } else { | 2421 | } else { |
| 2411 | load = source_load(i, load_idx); | 2422 | load = source_load(i, load_idx); |
| 2412 | if (load > max_cpu_load) | 2423 | if (load > max_cpu_load) { |
| 2413 | max_cpu_load = load; | 2424 | max_cpu_load = load; |
| 2425 | max_nr_running = rq->nr_running; | ||
| 2426 | } | ||
| 2414 | if (min_cpu_load > load) | 2427 | if (min_cpu_load > load) |
| 2415 | min_cpu_load = load; | 2428 | min_cpu_load = load; |
| 2416 | } | 2429 | } |
| @@ -2450,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
| 2450 | if (sgs->sum_nr_running) | 2463 | if (sgs->sum_nr_running) |
| 2451 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 2464 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
| 2452 | 2465 | ||
| 2453 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | 2466 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) |
| 2454 | sgs->group_imb = 1; | 2467 | sgs->group_imb = 1; |
| 2455 | 2468 | ||
| 2456 | sgs->group_capacity = | 2469 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); |
| 2457 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | ||
| 2458 | if (!sgs->group_capacity) | 2470 | if (!sgs->group_capacity) |
| 2459 | sgs->group_capacity = fix_small_capacity(sd, group); | 2471 | sgs->group_capacity = fix_small_capacity(sd, group); |
| 2472 | |||
| 2473 | if (sgs->group_capacity > sgs->sum_nr_running) | ||
| 2474 | sgs->group_has_capacity = 1; | ||
| 2460 | } | 2475 | } |
| 2461 | 2476 | ||
| 2462 | /** | 2477 | /** |
| @@ -2545,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 2545 | /* | 2560 | /* |
| 2546 | * In case the child domain prefers tasks go to siblings | 2561 | * In case the child domain prefers tasks go to siblings |
| 2547 | * first, lower the sg capacity to one so that we'll try | 2562 | * first, lower the sg capacity to one so that we'll try |
| 2548 | * and move all the excess tasks away. | 2563 | * and move all the excess tasks away. We lower the capacity |
| 2564 | * of a group only if the local group has the capacity to fit | ||
| 2565 | * these excess tasks, i.e. nr_running < group_capacity. The | ||
| 2566 | * extra check prevents the case where you always pull from the | ||
| 2567 | * heaviest group when it is already under-utilized (possible | ||
| 2568 | * when a large-weight task outweighs the tasks on the system). | ||
| 2549 | */ | 2569 | */ |
| 2550 | if (prefer_sibling) | 2570 | if (prefer_sibling && !local_group && sds->this_has_capacity) |
| 2551 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | 2571 | sgs.group_capacity = min(sgs.group_capacity, 1UL); |
| 2552 | 2572 | ||
| 2553 | if (local_group) { | 2573 | if (local_group) { |
| @@ -2555,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 2555 | sds->this = sg; | 2575 | sds->this = sg; |
| 2556 | sds->this_nr_running = sgs.sum_nr_running; | 2576 | sds->this_nr_running = sgs.sum_nr_running; |
| 2557 | sds->this_load_per_task = sgs.sum_weighted_load; | 2577 | sds->this_load_per_task = sgs.sum_weighted_load; |
| 2578 | sds->this_has_capacity = sgs.group_has_capacity; | ||
| 2558 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { | 2579 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { |
| 2559 | sds->max_load = sgs.avg_load; | 2580 | sds->max_load = sgs.avg_load; |
| 2560 | sds->busiest = sg; | 2581 | sds->busiest = sg; |
| 2561 | sds->busiest_nr_running = sgs.sum_nr_running; | 2582 | sds->busiest_nr_running = sgs.sum_nr_running; |
| 2562 | sds->busiest_group_capacity = sgs.group_capacity; | 2583 | sds->busiest_group_capacity = sgs.group_capacity; |
| 2563 | sds->busiest_load_per_task = sgs.sum_weighted_load; | 2584 | sds->busiest_load_per_task = sgs.sum_weighted_load; |
| 2585 | sds->busiest_has_capacity = sgs.group_has_capacity; | ||
| 2564 | sds->group_imb = sgs.group_imb; | 2586 | sds->group_imb = sgs.group_imb; |
| 2565 | } | 2587 | } |
| 2566 | 2588 | ||
| @@ -2757,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
| 2757 | return fix_small_imbalance(sds, this_cpu, imbalance); | 2779 | return fix_small_imbalance(sds, this_cpu, imbalance); |
| 2758 | 2780 | ||
| 2759 | } | 2781 | } |
| 2782 | |||
| 2760 | /******* find_busiest_group() helpers end here *********************/ | 2783 | /******* find_busiest_group() helpers end here *********************/ |
| 2761 | 2784 | ||
| 2762 | /** | 2785 | /** |
| @@ -2808,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2808 | * 4) This group is more busy than the avg busyness at this | 2831 | * 4) This group is more busy than the avg busyness at this |
| 2809 | * sched_domain. | 2832 | * sched_domain. |
| 2810 | * 5) The imbalance is within the specified limit. | 2833 | * 5) The imbalance is within the specified limit. |
| 2834 | * | ||
| 2835 | * Note: when doing newidle balance, if the local group has excess | ||
| 2836 | * capacity (i.e. nr_running < group_capacity) and the busiest group | ||
| 2837 | * does not have any capacity, we force a load balance to pull tasks | ||
| 2838 | * to the local group. In this case, we skip past checks 3, 4 and 5. | ||
| 2811 | */ | 2839 | */ |
| 2812 | if (!(*balance)) | 2840 | if (!(*balance)) |
| 2813 | goto ret; | 2841 | goto ret; |
| @@ -2819,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2819 | if (!sds.busiest || sds.busiest_nr_running == 0) | 2847 | if (!sds.busiest || sds.busiest_nr_running == 0) |
| 2820 | goto out_balanced; | 2848 | goto out_balanced; |
| 2821 | 2849 | ||
| 2850 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | ||
| 2851 | if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | ||
| 2852 | !sds.busiest_has_capacity) | ||
| 2853 | goto force_balance; | ||
| 2854 | |||
| 2822 | if (sds.this_load >= sds.max_load) | 2855 | if (sds.this_load >= sds.max_load) |
| 2823 | goto out_balanced; | 2856 | goto out_balanced; |
| 2824 | 2857 | ||
| @@ -2830,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2830 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | 2863 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) |
| 2831 | goto out_balanced; | 2864 | goto out_balanced; |
| 2832 | 2865 | ||
| 2866 | force_balance: | ||
| 2833 | /* Looks like there is an imbalance. Compute it */ | 2867 | /* Looks like there is an imbalance. Compute it */ |
| 2834 | calculate_imbalance(&sds, this_cpu, imbalance); | 2868 | calculate_imbalance(&sds, this_cpu, imbalance); |
| 2835 | return sds.busiest; | 2869 | return sds.busiest; |
| @@ -3034,7 +3068,14 @@ redo: | |||
| 3034 | 3068 | ||
| 3035 | if (!ld_moved) { | 3069 | if (!ld_moved) { |
| 3036 | schedstat_inc(sd, lb_failed[idle]); | 3070 | schedstat_inc(sd, lb_failed[idle]); |
| 3037 | sd->nr_balance_failed++; | 3071 | /* |
| 3072 | * Increment the failure counter only on periodic balance. | ||
| 3073 | * We do not want newidle balance, which can be very | ||
| 3074 | * frequent, to pollute the failure counter, causing | ||
| 3075 | * excessive cache_hot migrations and active balances. | ||
| 3076 | */ | ||
| 3077 | if (idle != CPU_NEWLY_IDLE) | ||
| 3078 | sd->nr_balance_failed++; | ||
| 3038 | 3079 | ||
| 3039 | if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), | 3080 | if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), |
| 3040 | this_cpu)) { | 3081 | this_cpu)) { |
| @@ -3156,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 3156 | interval = msecs_to_jiffies(sd->balance_interval); | 3197 | interval = msecs_to_jiffies(sd->balance_interval); |
| 3157 | if (time_after(next_balance, sd->last_balance + interval)) | 3198 | if (time_after(next_balance, sd->last_balance + interval)) |
| 3158 | next_balance = sd->last_balance + interval; | 3199 | next_balance = sd->last_balance + interval; |
| 3159 | if (pulled_task) { | 3200 | if (pulled_task) |
| 3160 | this_rq->idle_stamp = 0; | ||
| 3161 | break; | 3201 | break; |
| 3162 | } | ||
| 3163 | } | 3202 | } |
| 3164 | 3203 | ||
| 3165 | raw_spin_lock(&this_rq->lock); | 3204 | raw_spin_lock(&this_rq->lock); |
| @@ -3633,7 +3672,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) | |||
| 3633 | if (time_before(now, nohz.next_balance)) | 3672 | if (time_before(now, nohz.next_balance)) |
| 3634 | return 0; | 3673 | return 0; |
| 3635 | 3674 | ||
| 3636 | if (!rq->nr_running) | 3675 | if (rq->idle_at_tick) |
| 3637 | return 0; | 3676 | return 0; |
| 3638 | 3677 | ||
| 3639 | first_pick_cpu = atomic_read(&nohz.first_pick_cpu); | 3678 | first_pick_cpu = atomic_read(&nohz.first_pick_cpu); |
| @@ -3754,8 +3793,11 @@ static void task_fork_fair(struct task_struct *p) | |||
| 3754 | 3793 | ||
| 3755 | update_rq_clock(rq); | 3794 | update_rq_clock(rq); |
| 3756 | 3795 | ||
| 3757 | if (unlikely(task_cpu(p) != this_cpu)) | 3796 | if (unlikely(task_cpu(p) != this_cpu)) { |
| 3797 | rcu_read_lock(); | ||
| 3758 | __set_task_cpu(p, this_cpu); | 3798 | __set_task_cpu(p, this_cpu); |
| 3799 | rcu_read_unlock(); | ||
| 3800 | } | ||
| 3759 | 3801 | ||
| 3760 | update_curr(cfs_rq); | 3802 | update_curr(cfs_rq); |
| 3761 | 3803 | ||
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 83c66e8ad3ee..185f920ec1a2 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
| @@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1) | |||
| 61 | * release the lock. Decreases scheduling overhead. | 61 | * release the lock. Decreases scheduling overhead. |
| 62 | */ | 62 | */ |
| 63 | SCHED_FEAT(OWNER_SPIN, 1) | 63 | SCHED_FEAT(OWNER_SPIN, 1) |
| 64 | |||
| 65 | /* | ||
| 66 | * Decrement CPU power based on irq activity | ||
| 67 | */ | ||
| 68 | SCHED_FEAT(NONIRQ_POWER, 1) | ||
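This flag is the runtime gate tested by sched_irq_time_avg_update() in the sched.c hunk above: with CONFIG_IRQ_TIME_ACCOUNTING built in and NONIRQ_POWER enabled, per-CPU irq time is folded into the rt average that scale_rt_power() consumes, so a CPU that spends much of its time in interrupts advertises correspondingly less capacity to the load balancer. Like the other SCHED_FEAT entries it defaults to the value given here (1, enabled) and, assuming debugfs is mounted in the usual place, can be flipped at runtime by writing NONIRQ_POWER or NO_NONIRQ_POWER to the sched_features debugfs file, using the NO_ prefix handling shown in the sched_feat_write() hunk.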
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d10c80ebb67a..bea7d79f7e9c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
| @@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq) | |||
| 609 | if (!task_has_rt_policy(curr)) | 609 | if (!task_has_rt_policy(curr)) |
| 610 | return; | 610 | return; |
| 611 | 611 | ||
| 612 | delta_exec = rq->clock - curr->se.exec_start; | 612 | delta_exec = rq->clock_task - curr->se.exec_start; |
| 613 | if (unlikely((s64)delta_exec < 0)) | 613 | if (unlikely((s64)delta_exec < 0)) |
| 614 | delta_exec = 0; | 614 | delta_exec = 0; |
| 615 | 615 | ||
| @@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq) | |||
| 618 | curr->se.sum_exec_runtime += delta_exec; | 618 | curr->se.sum_exec_runtime += delta_exec; |
| 619 | account_group_exec_runtime(curr, delta_exec); | 619 | account_group_exec_runtime(curr, delta_exec); |
| 620 | 620 | ||
| 621 | curr->se.exec_start = rq->clock; | 621 | curr->se.exec_start = rq->clock_task; |
| 622 | cpuacct_charge(curr, delta_exec); | 622 | cpuacct_charge(curr, delta_exec); |
| 623 | 623 | ||
| 624 | sched_rt_avg_update(rq, delta_exec); | 624 | sched_rt_avg_update(rq, delta_exec); |
| @@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) | |||
| 960 | * runqueue. Otherwise simply start this RT task | 960 | * runqueue. Otherwise simply start this RT task |
| 961 | * on its current runqueue. | 961 | * on its current runqueue. |
| 962 | * | 962 | * |
| 963 | * We want to avoid overloading runqueues. Even if | 963 | * We want to avoid overloading runqueues. If the woken |
| 964 | * the RT task is of higher priority than the current RT task. | 964 | * task is of higher priority, then it will stay on this CPU |
| 965 | * RT tasks behave differently than other tasks. If | 965 | * and the lower prio task should be moved to another CPU. |
| 966 | * one gets preempted, we try to push it off to another queue. | 966 | * Even though this will probably make the lower prio task |
| 967 | * So trying to keep a preempting RT task on the same | 967 | * lose its cache, we do not want to bounce a higher task |
| 968 | * cache hot CPU will force the running RT task to | 968 | * around just because it gave up its CPU, perhaps for a |
| 969 | * a cold CPU. So we waste all the cache for the lower | 969 | * lock? |
| 970 | * RT task in hopes of saving some of a RT task | 970 | * |
| 971 | * that is just being woken and probably will have | 971 | * For equal prio tasks, we just let the scheduler sort it out. |
| 972 | * cold cache anyway. | ||
| 973 | */ | 972 | */ |
| 974 | if (unlikely(rt_task(rq->curr)) && | 973 | if (unlikely(rt_task(rq->curr)) && |
| 974 | (rq->curr->rt.nr_cpus_allowed < 2 || | ||
| 975 | rq->curr->prio < p->prio) && | ||
| 975 | (p->rt.nr_cpus_allowed > 1)) { | 976 | (p->rt.nr_cpus_allowed > 1)) { |
| 976 | int cpu = find_lowest_rq(p); | 977 | int cpu = find_lowest_rq(p); |
| 977 | 978 | ||
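The rewritten condition only redirects a freshly woken RT task when the task already running here is itself RT and is either pinned (nr_cpus_allowed < 2) or of higher priority; remember that a numerically smaller prio means a higher priority, so rq->curr->prio < p->prio reads "curr outranks p". A small stand-alone user-space model of that decision (the struct is invented, the field names mirror the kernel ones):

    #include <stdbool.h>
    #include <stdio.h>

    struct rt_task {
            int prio;               /* lower value = higher priority */
            int nr_cpus_allowed;
            bool is_rt;
    };

    /* Simplified model of the check added to select_task_rq_rt() above. */
    static bool should_find_lowest_rq(const struct rt_task *curr,
                                      const struct rt_task *woken)
    {
            return curr->is_rt &&
                   (curr->nr_cpus_allowed < 2 || curr->prio < woken->prio) &&
                   woken->nr_cpus_allowed > 1;
    }

    int main(void)
    {
            struct rt_task curr  = { .prio = 10, .nr_cpus_allowed = 4, .is_rt = true };
            struct rt_task woken = { .prio = 20, .nr_cpus_allowed = 4, .is_rt = true };

            /* curr (prio 10) outranks woken (prio 20): push the woken task away. */
            printf("redirect woken task: %d\n", should_find_lowest_rq(&curr, &woken));
            return 0;
    }

Run as-is it prints 1: the higher-priority current task stays put and the woken, lower-priority task is routed through find_lowest_rq().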
| @@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) | |||
| 1074 | } while (rt_rq); | 1075 | } while (rt_rq); |
| 1075 | 1076 | ||
| 1076 | p = rt_task_of(rt_se); | 1077 | p = rt_task_of(rt_se); |
| 1077 | p->se.exec_start = rq->clock; | 1078 | p->se.exec_start = rq->clock_task; |
| 1078 | 1079 | ||
| 1079 | return p; | 1080 | return p; |
| 1080 | } | 1081 | } |
| @@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) | |||
| 1139 | for_each_leaf_rt_rq(rt_rq, rq) { | 1140 | for_each_leaf_rt_rq(rt_rq, rq) { |
| 1140 | array = &rt_rq->active; | 1141 | array = &rt_rq->active; |
| 1141 | idx = sched_find_first_bit(array->bitmap); | 1142 | idx = sched_find_first_bit(array->bitmap); |
| 1142 | next_idx: | 1143 | next_idx: |
| 1143 | if (idx >= MAX_RT_PRIO) | 1144 | if (idx >= MAX_RT_PRIO) |
| 1144 | continue; | 1145 | continue; |
| 1145 | if (next && next->prio < idx) | 1146 | if (next && next->prio < idx) |
| @@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq) | |||
| 1315 | if (!next_task) | 1316 | if (!next_task) |
| 1316 | return 0; | 1317 | return 0; |
| 1317 | 1318 | ||
| 1318 | retry: | 1319 | retry: |
| 1319 | if (unlikely(next_task == rq->curr)) { | 1320 | if (unlikely(next_task == rq->curr)) { |
| 1320 | WARN_ON(1); | 1321 | WARN_ON(1); |
| 1321 | return 0; | 1322 | return 0; |
| @@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq) | |||
| 1463 | * but possible) | 1464 | * but possible) |
| 1464 | */ | 1465 | */ |
| 1465 | } | 1466 | } |
| 1466 | skip: | 1467 | skip: |
| 1467 | double_unlock_balance(this_rq, src_rq); | 1468 | double_unlock_balance(this_rq, src_rq); |
| 1468 | } | 1469 | } |
| 1469 | 1470 | ||
| @@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
| 1491 | if (!task_running(rq, p) && | 1492 | if (!task_running(rq, p) && |
| 1492 | !test_tsk_need_resched(rq->curr) && | 1493 | !test_tsk_need_resched(rq->curr) && |
| 1493 | has_pushable_tasks(rq) && | 1494 | has_pushable_tasks(rq) && |
| 1494 | p->rt.nr_cpus_allowed > 1) | 1495 | p->rt.nr_cpus_allowed > 1 && |
| 1496 | rt_task(rq->curr) && | ||
| 1497 | (rq->curr->rt.nr_cpus_allowed < 2 || | ||
| 1498 | rq->curr->prio < p->prio)) | ||
| 1495 | push_rt_tasks(rq); | 1499 | push_rt_tasks(rq); |
| 1496 | } | 1500 | } |
| 1497 | 1501 | ||
| @@ -1709,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq) | |||
| 1709 | { | 1713 | { |
| 1710 | struct task_struct *p = rq->curr; | 1714 | struct task_struct *p = rq->curr; |
| 1711 | 1715 | ||
| 1712 | p->se.exec_start = rq->clock; | 1716 | p->se.exec_start = rq->clock_task; |
| 1713 | 1717 | ||
| 1714 | /* The running task is never eligible for pushing */ | 1718 | /* The running task is never eligible for pushing */ |
| 1715 | dequeue_pushable_task(rq, p); | 1719 | dequeue_pushable_task(rq, p); |
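The clock switch in update_curr_rt(), _pick_next_task_rt() and set_curr_task_rt() matters because rq->clock_task advances only by the time not spent in hard/soft IRQ context (when IRQ time accounting is available), so interrupt time is no longer charged to whichever RT task happened to be running. A tiny stand-alone illustration with invented numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long clock_prev = 1000000, clock_now = 1600000; /* ns */
            unsigned long long irq_delta  = 200000;  /* ns spent in IRQs meanwhile */

            /* Accounting against rq->clock charges IRQ time to the task ... */
            unsigned long long delta_clock      = clock_now - clock_prev;
            /* ... while rq->clock_task advances only by non-IRQ time. */
            unsigned long long delta_clock_task = delta_clock - irq_delta;

            printf("charged with rq->clock:      %llu ns\n", delta_clock);
            printf("charged with rq->clock_task: %llu ns\n", delta_clock_task);
            return 0;
    }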
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c new file mode 100644 index 000000000000..45bddc0c1048 --- /dev/null +++ b/kernel/sched_stoptask.c | |||
| @@ -0,0 +1,108 @@ | |||
| 1 | /* | ||
| 2 | * stop-task scheduling class. | ||
| 3 | * | ||
| 4 | * The stop task is the highest priority task in the system; it preempts | ||
| 5 | * everything and will be preempted by nothing. | ||
| 6 | * | ||
| 7 | * See kernel/stop_machine.c | ||
| 8 | */ | ||
| 9 | |||
| 10 | #ifdef CONFIG_SMP | ||
| 11 | static int | ||
| 12 | select_task_rq_stop(struct rq *rq, struct task_struct *p, | ||
| 13 | int sd_flag, int flags) | ||
| 14 | { | ||
| 15 | return task_cpu(p); /* stop tasks never migrate */ | ||
| 16 | } | ||
| 17 | #endif /* CONFIG_SMP */ | ||
| 18 | |||
| 19 | static void | ||
| 20 | check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) | ||
| 21 | { | ||
| 22 | resched_task(rq->curr); /* we preempt everything */ | ||
| 23 | } | ||
| 24 | |||
| 25 | static struct task_struct *pick_next_task_stop(struct rq *rq) | ||
| 26 | { | ||
| 27 | struct task_struct *stop = rq->stop; | ||
| 28 | |||
| 29 | if (stop && stop->state == TASK_RUNNING) | ||
| 30 | return stop; | ||
| 31 | |||
| 32 | return NULL; | ||
| 33 | } | ||
| 34 | |||
| 35 | static void | ||
| 36 | enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) | ||
| 37 | { | ||
| 38 | } | ||
| 39 | |||
| 40 | static void | ||
| 41 | dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) | ||
| 42 | { | ||
| 43 | } | ||
| 44 | |||
| 45 | static void yield_task_stop(struct rq *rq) | ||
| 46 | { | ||
| 47 | BUG(); /* the stop task should never yield, it's pointless. */ | ||
| 48 | } | ||
| 49 | |||
| 50 | static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) | ||
| 51 | { | ||
| 52 | } | ||
| 53 | |||
| 54 | static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) | ||
| 55 | { | ||
| 56 | } | ||
| 57 | |||
| 58 | static void set_curr_task_stop(struct rq *rq) | ||
| 59 | { | ||
| 60 | } | ||
| 61 | |||
| 62 | static void switched_to_stop(struct rq *rq, struct task_struct *p, | ||
| 63 | int running) | ||
| 64 | { | ||
| 65 | BUG(); /* it's impossible to change to this class */ | ||
| 66 | } | ||
| 67 | |||
| 68 | static void prio_changed_stop(struct rq *rq, struct task_struct *p, | ||
| 69 | int oldprio, int running) | ||
| 70 | { | ||
| 71 | BUG(); /* how!?, what priority? */ | ||
| 72 | } | ||
| 73 | |||
| 74 | static unsigned int | ||
| 75 | get_rr_interval_stop(struct rq *rq, struct task_struct *task) | ||
| 76 | { | ||
| 77 | return 0; | ||
| 78 | } | ||
| 79 | |||
| 80 | /* | ||
| 81 | * Simple, special scheduling class for the per-CPU stop tasks: | ||
| 82 | */ | ||
| 83 | static const struct sched_class stop_sched_class = { | ||
| 84 | .next = &rt_sched_class, | ||
| 85 | |||
| 86 | .enqueue_task = enqueue_task_stop, | ||
| 87 | .dequeue_task = dequeue_task_stop, | ||
| 88 | .yield_task = yield_task_stop, | ||
| 89 | |||
| 90 | .check_preempt_curr = check_preempt_curr_stop, | ||
| 91 | |||
| 92 | .pick_next_task = pick_next_task_stop, | ||
| 93 | .put_prev_task = put_prev_task_stop, | ||
| 94 | |||
| 95 | #ifdef CONFIG_SMP | ||
| 96 | .select_task_rq = select_task_rq_stop, | ||
| 97 | #endif | ||
| 98 | |||
| 99 | .set_curr_task = set_curr_task_stop, | ||
| 100 | .task_tick = task_tick_stop, | ||
| 101 | |||
| 102 | .get_rr_interval = get_rr_interval_stop, | ||
| 103 | |||
| 104 | .prio_changed = prio_changed_stop, | ||
| 105 | .switched_to = switched_to_stop, | ||
| 106 | |||
| 107 | /* no .task_new for stop tasks */ | ||
| 108 | }; | ||
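Because stop_sched_class chains to rt_sched_class through .next, the stop task wins purely by class order: the core pick loop asks the stop class first, and only when rq->stop is not runnable does it fall through to RT, fair and idle. A hedged sketch of that walk (the real loop lives in sched.c and uses its for_each_class() helper; this is an outline, not the actual code):

    /*
     * Hedged sketch of the scheduler's class walk: ask each class in
     * .next order for a task.  The stop class sits in front of RT, so a
     * runnable rq->stop is chosen before any RT or fair task.
     */
    static struct task_struct *pick_next_task_sketch(struct rq *rq)
    {
            const struct sched_class *class;
            struct task_struct *p;

            for (class = &stop_sched_class; class; class = class->next) {
                    p = class->pick_next_task(rq);
                    if (p)
                            return p;
            }

            BUG();  /* the idle class always has something to run */
    }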
diff --git a/kernel/signal.c b/kernel/signal.c index bded65187780..919562c3d6b7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -2215,6 +2215,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) | |||
| 2215 | #ifdef __ARCH_SI_TRAPNO | 2215 | #ifdef __ARCH_SI_TRAPNO |
| 2216 | err |= __put_user(from->si_trapno, &to->si_trapno); | 2216 | err |= __put_user(from->si_trapno, &to->si_trapno); |
| 2217 | #endif | 2217 | #endif |
| 2218 | #ifdef BUS_MCEERR_AO | ||
| 2219 | /* | ||
| 2220 | * Other callers might not initialize the si_lsb field, | ||
| 2221 | * so check explicitly for the right codes here. | ||
| 2222 | */ | ||
| 2223 | if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) | ||
| 2224 | err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); | ||
| 2225 | #endif | ||
| 2218 | break; | 2226 | break; |
| 2219 | case __SI_CHLD: | 2227 | case __SI_CHLD: |
| 2220 | err |= __put_user(from->si_pid, &to->si_pid); | 2228 | err |= __put_user(from->si_pid, &to->si_pid); |
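For BUS_MCEERR_AR/BUS_MCEERR_AO signals, si_addr_lsb tells userspace how precise si_addr is: the address is only meaningful down to bit si_addr_lsb, typically PAGE_SHIFT for a poisoned page. A hedged user-space sketch of consuming it (assumes a libc that already exposes the si_addr_lsb field; printf in a signal handler is for illustration only):

    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* SIGBUS handler sketch: derive the poisoned range from si_addr_lsb. */
    static void mce_handler(int sig, siginfo_t *info, void *ctx)
    {
            unsigned long len  = 1UL << info->si_addr_lsb;  /* e.g. one page */
            unsigned long base = (unsigned long)info->si_addr & ~(len - 1);

            printf("memory error: %#lx - %#lx\n", base, base + len - 1);
    }

    int main(void)
    {
            struct sigaction sa;

            memset(&sa, 0, sizeof(sa));
            sa.sa_sigaction = mce_handler;
            sa.sa_flags = SA_SIGINFO;
            sigaction(SIGBUS, &sa, NULL);

            pause();        /* wait for a (hopefully never delivered) SIGBUS */
            return 0;
    }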
diff --git a/kernel/smp.c b/kernel/smp.c index 75c970c715d3..ed6aacfcb7ef 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -365,9 +365,10 @@ call: | |||
| 365 | EXPORT_SYMBOL_GPL(smp_call_function_any); | 365 | EXPORT_SYMBOL_GPL(smp_call_function_any); |
| 366 | 366 | ||
| 367 | /** | 367 | /** |
| 368 | * __smp_call_function_single(): Run a function on another CPU | 368 | * __smp_call_function_single(): Run a function on a specific CPU |
| 369 | * @cpu: The CPU to run on. | 369 | * @cpu: The CPU to run on. |
| 370 | * @data: Pre-allocated and setup data structure | 370 | * @data: Pre-allocated and setup data structure |
| 371 | * @wait: If true, wait until function has completed on specified CPU. | ||
| 371 | * | 372 | * |
| 372 | * Like smp_call_function_single(), but allow caller to pass in a | 373 | * Like smp_call_function_single(), but allow caller to pass in a |
| 373 | * pre-allocated data structure. Useful for embedding @data inside | 374 | * pre-allocated data structure. Useful for embedding @data inside |
| @@ -376,8 +377,10 @@ EXPORT_SYMBOL_GPL(smp_call_function_any); | |||
| 376 | void __smp_call_function_single(int cpu, struct call_single_data *data, | 377 | void __smp_call_function_single(int cpu, struct call_single_data *data, |
| 377 | int wait) | 378 | int wait) |
| 378 | { | 379 | { |
| 379 | csd_lock(data); | 380 | unsigned int this_cpu; |
| 381 | unsigned long flags; | ||
| 380 | 382 | ||
| 383 | this_cpu = get_cpu(); | ||
| 381 | /* | 384 | /* |
| 382 | * Can deadlock when called with interrupts disabled. | 385 | * Can deadlock when called with interrupts disabled. |
| 383 | * We allow cpu's that are not yet online though, as no one else can | 386 | * We allow cpu's that are not yet online though, as no one else can |
| @@ -387,7 +390,15 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, | |||
| 387 | WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() | 390 | WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() |
| 388 | && !oops_in_progress); | 391 | && !oops_in_progress); |
| 389 | 392 | ||
| 390 | generic_exec_single(cpu, data, wait); | 393 | if (cpu == this_cpu) { |
| 394 | local_irq_save(flags); | ||
| 395 | data->func(data->info); | ||
| 396 | local_irq_restore(flags); | ||
| 397 | } else { | ||
| 398 | csd_lock(data); | ||
| 399 | generic_exec_single(cpu, data, wait); | ||
| 400 | } | ||
| 401 | put_cpu(); | ||
| 391 | } | 402 | } |
| 392 | 403 | ||
| 393 | /** | 404 | /** |
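With the new same-CPU fast path a caller of __smp_call_function_single() no longer has to special-case "the target happens to be the local CPU": the function simply runs inline with interrupts disabled, and the csd lock is only taken on the genuine cross-CPU path. A hedged kernel-style sketch of the calling convention (the wrapper struct and names are invented):

    /* Hedged sketch: run my_func(ctx) on 'cpu' using an embedded csd. */
    struct my_ctx {
            struct call_single_data csd;
            int value;
    };

    static void my_func(void *info)
    {
            struct my_ctx *ctx = info;      /* runs in IRQ context on 'cpu' */

            pr_info("value=%d on cpu %d\n", ctx->value, smp_processor_id());
    }

    static void kick_cpu(struct my_ctx *ctx, int cpu)
    {
            ctx->csd.func = my_func;
            ctx->csd.info = ctx;
            /* wait=0: fire and forget; same-CPU calls now run immediately. */
            __smp_call_function_single(cpu, &ctx->csd, 0);
    }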
diff --git a/kernel/softirq.c b/kernel/softirq.c index 07b4f1b1a73a..79ee8f1fc0e7 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -77,11 +77,21 @@ void wakeup_softirqd(void) | |||
| 77 | } | 77 | } |
| 78 | 78 | ||
| 79 | /* | 79 | /* |
| 80 | * preempt_count and SOFTIRQ_OFFSET usage: | ||
| 81 | * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving | ||
| 82 | * softirq processing. | ||
| 83 | * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) | ||
| 84 | * on local_bh_disable or local_bh_enable. | ||
| 85 | * This lets us distinguish between whether we are currently processing | ||
| 86 | * softirq and whether we just have bh disabled. | ||
| 87 | */ | ||
| 88 | |||
| 89 | /* | ||
| 80 | * This one is for softirq.c-internal use, | 90 | * This one is for softirq.c-internal use, |
| 81 | * where hardirqs are disabled legitimately: | 91 | * where hardirqs are disabled legitimately: |
| 82 | */ | 92 | */ |
| 83 | #ifdef CONFIG_TRACE_IRQFLAGS | 93 | #ifdef CONFIG_TRACE_IRQFLAGS |
| 84 | static void __local_bh_disable(unsigned long ip) | 94 | static void __local_bh_disable(unsigned long ip, unsigned int cnt) |
| 85 | { | 95 | { |
| 86 | unsigned long flags; | 96 | unsigned long flags; |
| 87 | 97 | ||
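With SOFTIRQ_DISABLE_OFFSET defined as 2 * SOFTIRQ_OFFSET, the softirq byte of preempt_count effectively carries two pieces of information: the low bit of that byte says "a softirq is being served right now", while the higher bits count local_bh_disable() nesting. A stand-alone model of the bookkeeping (the 0x100 offset assumes the usual SOFTIRQ_SHIFT of 8; treat the constants as assumptions):

    #include <stdio.h>

    #define SOFTIRQ_OFFSET          0x100UL         /* assumes SOFTIRQ_SHIFT == 8 */
    #define SOFTIRQ_DISABLE_OFFSET  (2 * SOFTIRQ_OFFSET)
    #define SOFTIRQ_MASK            0xff00UL

    static unsigned long preempt_count;

    static unsigned long softirq_count(void)
    {
            return preempt_count & SOFTIRQ_MASK;
    }

    int main(void)
    {
            preempt_count += SOFTIRQ_DISABLE_OFFSET;          /* local_bh_disable() */
            printf("bh disabled, serving softirq? %d\n",
                   !!(softirq_count() & SOFTIRQ_OFFSET));     /* 0: only disabled  */

            preempt_count += SOFTIRQ_OFFSET;                  /* __do_softirq()    */
            printf("in __do_softirq, serving softirq? %d\n",
                   !!(softirq_count() & SOFTIRQ_OFFSET));     /* 1: really serving */

            preempt_count -= SOFTIRQ_OFFSET;                  /* softirq done      */
            preempt_count -= SOFTIRQ_DISABLE_OFFSET;          /* local_bh_enable() */
            printf("final softirq_count: %#lx\n", softirq_count());
            return 0;
    }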
| @@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip) | |||
| 95 | * We must manually increment preempt_count here and manually | 105 | * We must manually increment preempt_count here and manually |
| 96 | * call the trace_preempt_off later. | 106 | * call the trace_preempt_off later. |
| 97 | */ | 107 | */ |
| 98 | preempt_count() += SOFTIRQ_OFFSET; | 108 | preempt_count() += cnt; |
| 99 | /* | 109 | /* |
| 100 | * Were softirqs turned off above: | 110 | * Were softirqs turned off above: |
| 101 | */ | 111 | */ |
| 102 | if (softirq_count() == SOFTIRQ_OFFSET) | 112 | if (softirq_count() == cnt) |
| 103 | trace_softirqs_off(ip); | 113 | trace_softirqs_off(ip); |
| 104 | raw_local_irq_restore(flags); | 114 | raw_local_irq_restore(flags); |
| 105 | 115 | ||
| 106 | if (preempt_count() == SOFTIRQ_OFFSET) | 116 | if (preempt_count() == cnt) |
| 107 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 117 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
| 108 | } | 118 | } |
| 109 | #else /* !CONFIG_TRACE_IRQFLAGS */ | 119 | #else /* !CONFIG_TRACE_IRQFLAGS */ |
| 110 | static inline void __local_bh_disable(unsigned long ip) | 120 | static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) |
| 111 | { | 121 | { |
| 112 | add_preempt_count(SOFTIRQ_OFFSET); | 122 | add_preempt_count(cnt); |
| 113 | barrier(); | 123 | barrier(); |
| 114 | } | 124 | } |
| 115 | #endif /* CONFIG_TRACE_IRQFLAGS */ | 125 | #endif /* CONFIG_TRACE_IRQFLAGS */ |
| 116 | 126 | ||
| 117 | void local_bh_disable(void) | 127 | void local_bh_disable(void) |
| 118 | { | 128 | { |
| 119 | __local_bh_disable((unsigned long)__builtin_return_address(0)); | 129 | __local_bh_disable((unsigned long)__builtin_return_address(0), |
| 130 | SOFTIRQ_DISABLE_OFFSET); | ||
| 120 | } | 131 | } |
| 121 | 132 | ||
| 122 | EXPORT_SYMBOL(local_bh_disable); | 133 | EXPORT_SYMBOL(local_bh_disable); |
| 123 | 134 | ||
| 135 | static void __local_bh_enable(unsigned int cnt) | ||
| 136 | { | ||
| 137 | WARN_ON_ONCE(in_irq()); | ||
| 138 | WARN_ON_ONCE(!irqs_disabled()); | ||
| 139 | |||
| 140 | if (softirq_count() == cnt) | ||
| 141 | trace_softirqs_on((unsigned long)__builtin_return_address(0)); | ||
| 142 | sub_preempt_count(cnt); | ||
| 143 | } | ||
| 144 | |||
| 124 | /* | 145 | /* |
| 125 | * Special-case - softirqs can safely be enabled in | 146 | * Special-case - softirqs can safely be enabled in |
| 126 | * cond_resched_softirq(), or by __do_softirq(), | 147 | * cond_resched_softirq(), or by __do_softirq(), |
| @@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable); | |||
| 128 | */ | 149 | */ |
| 129 | void _local_bh_enable(void) | 150 | void _local_bh_enable(void) |
| 130 | { | 151 | { |
| 131 | WARN_ON_ONCE(in_irq()); | 152 | __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); |
| 132 | WARN_ON_ONCE(!irqs_disabled()); | ||
| 133 | |||
| 134 | if (softirq_count() == SOFTIRQ_OFFSET) | ||
| 135 | trace_softirqs_on((unsigned long)__builtin_return_address(0)); | ||
| 136 | sub_preempt_count(SOFTIRQ_OFFSET); | ||
| 137 | } | 153 | } |
| 138 | 154 | ||
| 139 | EXPORT_SYMBOL(_local_bh_enable); | 155 | EXPORT_SYMBOL(_local_bh_enable); |
| @@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip) | |||
| 147 | /* | 163 | /* |
| 148 | * Are softirqs going to be turned on now: | 164 | * Are softirqs going to be turned on now: |
| 149 | */ | 165 | */ |
| 150 | if (softirq_count() == SOFTIRQ_OFFSET) | 166 | if (softirq_count() == SOFTIRQ_DISABLE_OFFSET) |
| 151 | trace_softirqs_on(ip); | 167 | trace_softirqs_on(ip); |
| 152 | /* | 168 | /* |
| 153 | * Keep preemption disabled until we are done with | 169 | * Keep preemption disabled until we are done with |
| 154 | * softirq processing: | 170 | * softirq processing: |
| 155 | */ | 171 | */ |
| 156 | sub_preempt_count(SOFTIRQ_OFFSET - 1); | 172 | sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); |
| 157 | 173 | ||
| 158 | if (unlikely(!in_interrupt() && local_softirq_pending())) | 174 | if (unlikely(!in_interrupt() && local_softirq_pending())) |
| 159 | do_softirq(); | 175 | do_softirq(); |
| @@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void) | |||
| 198 | pending = local_softirq_pending(); | 214 | pending = local_softirq_pending(); |
| 199 | account_system_vtime(current); | 215 | account_system_vtime(current); |
| 200 | 216 | ||
| 201 | __local_bh_disable((unsigned long)__builtin_return_address(0)); | 217 | __local_bh_disable((unsigned long)__builtin_return_address(0), |
| 218 | SOFTIRQ_OFFSET); | ||
| 202 | lockdep_softirq_enter(); | 219 | lockdep_softirq_enter(); |
| 203 | 220 | ||
| 204 | cpu = smp_processor_id(); | 221 | cpu = smp_processor_id(); |
| @@ -245,7 +262,7 @@ restart: | |||
| 245 | lockdep_softirq_exit(); | 262 | lockdep_softirq_exit(); |
| 246 | 263 | ||
| 247 | account_system_vtime(current); | 264 | account_system_vtime(current); |
| 248 | _local_bh_enable(); | 265 | __local_bh_enable(SOFTIRQ_OFFSET); |
| 249 | } | 266 | } |
| 250 | 267 | ||
| 251 | #ifndef __ARCH_HAS_DO_SOFTIRQ | 268 | #ifndef __ARCH_HAS_DO_SOFTIRQ |
| @@ -279,10 +296,16 @@ void irq_enter(void) | |||
| 279 | 296 | ||
| 280 | rcu_irq_enter(); | 297 | rcu_irq_enter(); |
| 281 | if (idle_cpu(cpu) && !in_interrupt()) { | 298 | if (idle_cpu(cpu) && !in_interrupt()) { |
| 282 | __irq_enter(); | 299 | /* |
| 300 | * Prevent raise_softirq from needlessly waking up ksoftirqd | ||
| 301 | * here, as softirq will be serviced on return from interrupt. | ||
| 302 | */ | ||
| 303 | local_bh_disable(); | ||
| 283 | tick_check_idle(cpu); | 304 | tick_check_idle(cpu); |
| 284 | } else | 305 | _local_bh_enable(); |
| 285 | __irq_enter(); | 306 | } |
| 307 | |||
| 308 | __irq_enter(); | ||
| 286 | } | 309 | } |
| 287 | 310 | ||
| 288 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED | 311 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED |
| @@ -696,6 +719,7 @@ static int run_ksoftirqd(void * __bind_cpu) | |||
| 696 | { | 719 | { |
| 697 | set_current_state(TASK_INTERRUPTIBLE); | 720 | set_current_state(TASK_INTERRUPTIBLE); |
| 698 | 721 | ||
| 722 | current->flags |= PF_KSOFTIRQD; | ||
| 699 | while (!kthread_should_stop()) { | 723 | while (!kthread_should_stop()) { |
| 700 | preempt_disable(); | 724 | preempt_disable(); |
| 701 | if (!local_softirq_pending()) { | 725 | if (!local_softirq_pending()) { |
diff --git a/kernel/srcu.c b/kernel/srcu.c index 2980da3fd509..c71e07500536 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
| @@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) | |||
| 46 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, | 46 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, |
| 47 | struct lock_class_key *key) | 47 | struct lock_class_key *key) |
| 48 | { | 48 | { |
| 49 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 50 | /* Don't re-initialize a lock while it is held. */ | 49 | /* Don't re-initialize a lock while it is held. */ |
| 51 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | 50 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); |
| 52 | lockdep_init_map(&sp->dep_map, name, key, 0); | 51 | lockdep_init_map(&sp->dep_map, name, key, 0); |
| 53 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
| 54 | return init_srcu_struct_fields(sp); | 52 | return init_srcu_struct_fields(sp); |
| 55 | } | 53 | } |
| 56 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | 54 | EXPORT_SYMBOL_GPL(__init_srcu_struct); |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 4372ccb25127..090c28812ce1 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -287,11 +287,12 @@ repeat: | |||
| 287 | goto repeat; | 287 | goto repeat; |
| 288 | } | 288 | } |
| 289 | 289 | ||
| 290 | extern void sched_set_stop_task(int cpu, struct task_struct *stop); | ||
| 291 | |||
| 290 | /* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ | 292 | /* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ |
| 291 | static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, | 293 | static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, |
| 292 | unsigned long action, void *hcpu) | 294 | unsigned long action, void *hcpu) |
| 293 | { | 295 | { |
| 294 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | ||
| 295 | unsigned int cpu = (unsigned long)hcpu; | 296 | unsigned int cpu = (unsigned long)hcpu; |
| 296 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | 297 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); |
| 297 | struct task_struct *p; | 298 | struct task_struct *p; |
| @@ -304,13 +305,13 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, | |||
| 304 | cpu); | 305 | cpu); |
| 305 | if (IS_ERR(p)) | 306 | if (IS_ERR(p)) |
| 306 | return NOTIFY_BAD; | 307 | return NOTIFY_BAD; |
| 307 | sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); | ||
| 308 | get_task_struct(p); | 308 | get_task_struct(p); |
| 309 | kthread_bind(p, cpu); | ||
| 310 | sched_set_stop_task(cpu, p); | ||
| 309 | stopper->thread = p; | 311 | stopper->thread = p; |
| 310 | break; | 312 | break; |
| 311 | 313 | ||
| 312 | case CPU_ONLINE: | 314 | case CPU_ONLINE: |
| 313 | kthread_bind(stopper->thread, cpu); | ||
| 314 | /* strictly unnecessary, as first user will wake it */ | 315 | /* strictly unnecessary, as first user will wake it */ |
| 315 | wake_up_process(stopper->thread); | 316 | wake_up_process(stopper->thread); |
| 316 | /* mark enabled */ | 317 | /* mark enabled */ |
| @@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, | |||
| 325 | { | 326 | { |
| 326 | struct cpu_stop_work *work; | 327 | struct cpu_stop_work *work; |
| 327 | 328 | ||
| 329 | sched_set_stop_task(cpu, NULL); | ||
| 328 | /* kill the stopper */ | 330 | /* kill the stopper */ |
| 329 | kthread_stop(stopper->thread); | 331 | kthread_stop(stopper->thread); |
| 330 | /* drain remaining works */ | 332 | /* drain remaining works */ |
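The priority games move out of stop_machine.c: the stopper kthread is created, bound to its CPU and then handed to the scheduler, which is expected to park it in the new stop class and remember it as rq->stop (passing NULL on teardown detaches it again). A hedged outline of what such a sched_set_stop_task() plausibly does on the scheduler side; this is a sketch, not the exact sched.c implementation:

    /*
     * Hedged sketch of sched_set_stop_task(): record the per-cpu stopper
     * thread in rq->stop and move it into the stop scheduling class;
     * a NULL argument detaches the previous stopper.
     */
    void sched_set_stop_task(int cpu, struct task_struct *stop)
    {
            struct rq *rq = cpu_rq(cpu);
            struct task_struct *old = rq->stop;

            if (stop)
                    stop->sched_class = &stop_sched_class;

            rq->stop = stop;

            if (old)                /* let the old stopper exit normally */
                    old->sched_class = &rt_sched_class;
    }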
diff --git a/kernel/sys.c b/kernel/sys.c index e9ad44489828..7f5a0cd296a9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -931,6 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) | |||
| 931 | pgid = pid; | 931 | pgid = pid; |
| 932 | if (pgid < 0) | 932 | if (pgid < 0) |
| 933 | return -EINVAL; | 933 | return -EINVAL; |
| 934 | rcu_read_lock(); | ||
| 934 | 935 | ||
| 935 | /* From this point forward we keep holding onto the tasklist lock | 936 | /* From this point forward we keep holding onto the tasklist lock |
| 936 | * so that our parent does not change from under us. -DaveM | 937 | * so that our parent does not change from under us. -DaveM |
| @@ -984,6 +985,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) | |||
| 984 | out: | 985 | out: |
| 985 | /* All paths lead to here, thus we are safe. -DaveM */ | 986 | /* All paths lead to here, thus we are safe. -DaveM */ |
| 986 | write_unlock_irq(&tasklist_lock); | 987 | write_unlock_irq(&tasklist_lock); |
| 988 | rcu_read_unlock(); | ||
| 987 | return err; | 989 | return err; |
| 988 | } | 990 | } |
| 989 | 991 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ca38e8e3e907..3a45c224770f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -1713,10 +1713,7 @@ static __init int sysctl_init(void) | |||
| 1713 | { | 1713 | { |
| 1714 | sysctl_set_parent(NULL, root_table); | 1714 | sysctl_set_parent(NULL, root_table); |
| 1715 | #ifdef CONFIG_SYSCTL_SYSCALL_CHECK | 1715 | #ifdef CONFIG_SYSCTL_SYSCALL_CHECK |
| 1716 | { | 1716 | sysctl_check_table(current->nsproxy, root_table); |
| 1717 | int err; | ||
| 1718 | err = sysctl_check_table(current->nsproxy, root_table); | ||
| 1719 | } | ||
| 1720 | #endif | 1717 | #endif |
| 1721 | return 0; | 1718 | return 0; |
| 1722 | } | 1719 | } |
| @@ -2488,7 +2485,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int | |||
| 2488 | kbuf[left] = 0; | 2485 | kbuf[left] = 0; |
| 2489 | } | 2486 | } |
| 2490 | 2487 | ||
| 2491 | for (; left && vleft--; i++, min++, max++, first=0) { | 2488 | for (; left && vleft--; i++, first = 0) { |
| 2492 | unsigned long val; | 2489 | unsigned long val; |
| 2493 | 2490 | ||
| 2494 | if (write) { | 2491 | if (write) { |
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 04cdcf72c827..10b90d8a03c4 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c | |||
| @@ -143,15 +143,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) | |||
| 143 | if (!table->maxlen) | 143 | if (!table->maxlen) |
| 144 | set_fail(&fail, table, "No maxlen"); | 144 | set_fail(&fail, table, "No maxlen"); |
| 145 | } | 145 | } |
| 146 | if ((table->proc_handler == proc_doulongvec_minmax) || | ||
| 147 | (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { | ||
| 148 | if (table->maxlen > sizeof (unsigned long)) { | ||
| 149 | if (!table->extra1) | ||
| 150 | set_fail(&fail, table, "No min"); | ||
| 151 | if (!table->extra2) | ||
| 152 | set_fail(&fail, table, "No max"); | ||
| 153 | } | ||
| 154 | } | ||
| 155 | #ifdef CONFIG_PROC_SYSCTL | 146 | #ifdef CONFIG_PROC_SYSCTL |
| 156 | if (table->procname && !table->proc_handler) | 147 | if (table->procname && !table->proc_handler) |
| 157 | set_fail(&fail, table, "No proc_handler"); | 148 | set_fail(&fail, table, "No proc_handler"); |
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 4f104515a19b..f8b11a283171 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c | |||
| @@ -115,7 +115,9 @@ static int test_kprobes(void) | |||
| 115 | int ret; | 115 | int ret; |
| 116 | struct kprobe *kps[2] = {&kp, &kp2}; | 116 | struct kprobe *kps[2] = {&kp, &kp2}; |
| 117 | 117 | ||
| 118 | kp.addr = 0; /* addr should be cleared for reusing kprobe. */ | 118 | /* addr and flags should be cleared for reusing kprobe. */
| 119 | kp.addr = NULL; | ||
| 120 | kp.flags = 0; | ||
| 119 | ret = register_kprobes(kps, 2); | 121 | ret = register_kprobes(kps, 2); |
| 120 | if (ret < 0) { | 122 | if (ret < 0) { |
| 121 | printk(KERN_ERR "Kprobe smoke test failed: " | 123 | printk(KERN_ERR "Kprobe smoke test failed: " |
| @@ -210,7 +212,9 @@ static int test_jprobes(void) | |||
| 210 | int ret; | 212 | int ret; |
| 211 | struct jprobe *jps[2] = {&jp, &jp2}; | 213 | struct jprobe *jps[2] = {&jp, &jp2}; |
| 212 | 214 | ||
| 213 | jp.kp.addr = 0; /* addr should be cleared for reusing kprobe. */ | 215 | /* addr and flags should be cleared for reusing kprobe. */
| 216 | jp.kp.addr = NULL; | ||
| 217 | jp.kp.flags = 0; | ||
| 214 | ret = register_jprobes(jps, 2); | 218 | ret = register_jprobes(jps, 2); |
| 215 | if (ret < 0) { | 219 | if (ret < 0) { |
| 216 | printk(KERN_ERR "Kprobe smoke test failed: " | 220 | printk(KERN_ERR "Kprobe smoke test failed: " |
| @@ -323,7 +327,9 @@ static int test_kretprobes(void) | |||
| 323 | int ret; | 327 | int ret; |
| 324 | struct kretprobe *rps[2] = {&rp, &rp2}; | 328 | struct kretprobe *rps[2] = {&rp, &rp2}; |
| 325 | 329 | ||
| 326 | rp.kp.addr = 0; /* addr should be cleared for reusing kprobe. */ | 330 | /* addr and flags should be cleared for reusing kprobe. */
| 331 | rp.kp.addr = NULL; | ||
| 332 | rp.kp.flags = 0; | ||
| 327 | ret = register_kretprobes(rps, 2); | 333 | ret = register_kretprobes(rps, 2); |
| 328 | if (ret < 0) { | 334 | if (ret < 0) { |
| 329 | printk(KERN_ERR "Kprobe smoke test failed: " | 335 | printk(KERN_ERR "Kprobe smoke test failed: " |
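Clearing kp.flags alongside kp.addr matters because register_kprobe() stores state (for example the disabled/gone bits) in the flags word, so a stale value left over from the previous registration can make re-registration misbehave. A hedged sketch of the reuse pattern the tests follow (the probe target and pre-handler names are placeholders):

    /* Hedged sketch: re-register the same kprobe object for a second run. */
    static struct kprobe kp = {
            .symbol_name = "kprobe_target",   /* resolved to an address on register */
            .pre_handler = kp_pre_handler,
    };

    static int reuse_probe(void)
    {
            int ret;

            unregister_kprobe(&kp);

            /* addr and flags were filled in by the previous registration; */
            /* wipe them so register_kprobe() starts from a clean slate.   */
            kp.addr = NULL;
            kp.flags = 0;

            ret = register_kprobe(&kp);
            if (ret < 0)
                    pr_err("re-registering kprobe failed: %d\n", ret);
            return ret;
    }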
diff --git a/kernel/timer.c b/kernel/timer.c index 97bf05baade7..68a9ae7679b7 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -37,7 +37,7 @@ | |||
| 37 | #include <linux/delay.h> | 37 | #include <linux/delay.h> |
| 38 | #include <linux/tick.h> | 38 | #include <linux/tick.h> |
| 39 | #include <linux/kallsyms.h> | 39 | #include <linux/kallsyms.h> |
| 40 | #include <linux/perf_event.h> | 40 | #include <linux/irq_work.h> |
| 41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
| 42 | #include <linux/slab.h> | 42 | #include <linux/slab.h> |
| 43 | 43 | ||
| @@ -1279,7 +1279,10 @@ void update_process_times(int user_tick) | |||
| 1279 | run_local_timers(); | 1279 | run_local_timers(); |
| 1280 | rcu_check_callbacks(cpu, user_tick); | 1280 | rcu_check_callbacks(cpu, user_tick); |
| 1281 | printk_tick(); | 1281 | printk_tick(); |
| 1282 | perf_event_do_pending(); | 1282 | #ifdef CONFIG_IRQ_WORK |
| 1283 | if (in_irq()) | ||
| 1284 | irq_work_run(); | ||
| 1285 | #endif | ||
| 1283 | scheduler_tick(); | 1286 | scheduler_tick(); |
| 1284 | run_posix_cpu_timers(p); | 1287 | run_posix_cpu_timers(p); |
| 1285 | } | 1288 | } |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 538501c6ea50..e550d2eda1df 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS | |||
| 49 | help | 49 | help |
| 50 | See Documentation/trace/ftrace-design.txt | 50 | See Documentation/trace/ftrace-design.txt |
| 51 | 51 | ||
| 52 | config HAVE_C_RECORDMCOUNT | ||
| 53 | bool | ||
| 54 | help | ||
| 55 | C version of recordmcount available? | ||
| 56 | |||
| 52 | config TRACER_MAX_TRACE | 57 | config TRACER_MAX_TRACE |
| 53 | bool | 58 | bool |
| 54 | 59 | ||
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0d88ce9b9fb8..ebd80d50c474 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -381,12 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v) | |||
| 381 | { | 381 | { |
| 382 | struct ftrace_profile *rec = v; | 382 | struct ftrace_profile *rec = v; |
| 383 | char str[KSYM_SYMBOL_LEN]; | 383 | char str[KSYM_SYMBOL_LEN]; |
| 384 | int ret = 0; | ||
| 384 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 385 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| 385 | static DEFINE_MUTEX(mutex); | ||
| 386 | static struct trace_seq s; | 386 | static struct trace_seq s; |
| 387 | unsigned long long avg; | 387 | unsigned long long avg; |
| 388 | unsigned long long stddev; | 388 | unsigned long long stddev; |
| 389 | #endif | 389 | #endif |
| 390 | mutex_lock(&ftrace_profile_lock); | ||
| 391 | |||
| 392 | /* we raced with function_profile_reset() */ | ||
| 393 | if (unlikely(rec->counter == 0)) { | ||
| 394 | ret = -EBUSY; | ||
| 395 | goto out; | ||
| 396 | } | ||
| 390 | 397 | ||
| 391 | kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); | 398 | kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); |
| 392 | seq_printf(m, " %-30.30s %10lu", str, rec->counter); | 399 | seq_printf(m, " %-30.30s %10lu", str, rec->counter); |
| @@ -408,7 +415,6 @@ static int function_stat_show(struct seq_file *m, void *v) | |||
| 408 | do_div(stddev, (rec->counter - 1) * 1000); | 415 | do_div(stddev, (rec->counter - 1) * 1000); |
| 409 | } | 416 | } |
| 410 | 417 | ||
| 411 | mutex_lock(&mutex); | ||
| 412 | trace_seq_init(&s); | 418 | trace_seq_init(&s); |
| 413 | trace_print_graph_duration(rec->time, &s); | 419 | trace_print_graph_duration(rec->time, &s); |
| 414 | trace_seq_puts(&s, " "); | 420 | trace_seq_puts(&s, " "); |
| @@ -416,11 +422,12 @@ static int function_stat_show(struct seq_file *m, void *v) | |||
| 416 | trace_seq_puts(&s, " "); | 422 | trace_seq_puts(&s, " "); |
| 417 | trace_print_graph_duration(stddev, &s); | 423 | trace_print_graph_duration(stddev, &s); |
| 418 | trace_print_seq(m, &s); | 424 | trace_print_seq(m, &s); |
| 419 | mutex_unlock(&mutex); | ||
| 420 | #endif | 425 | #endif |
| 421 | seq_putc(m, '\n'); | 426 | seq_putc(m, '\n'); |
| 427 | out: | ||
| 428 | mutex_unlock(&ftrace_profile_lock); | ||
| 422 | 429 | ||
| 423 | return 0; | 430 | return ret; |
| 424 | } | 431 | } |
| 425 | 432 | ||
| 426 | static void ftrace_profile_reset(struct ftrace_profile_stat *stat) | 433 | static void ftrace_profile_reset(struct ftrace_profile_stat *stat) |
| @@ -877,10 +884,8 @@ enum { | |||
| 877 | FTRACE_ENABLE_CALLS = (1 << 0), | 884 | FTRACE_ENABLE_CALLS = (1 << 0), |
| 878 | FTRACE_DISABLE_CALLS = (1 << 1), | 885 | FTRACE_DISABLE_CALLS = (1 << 1), |
| 879 | FTRACE_UPDATE_TRACE_FUNC = (1 << 2), | 886 | FTRACE_UPDATE_TRACE_FUNC = (1 << 2), |
| 880 | FTRACE_ENABLE_MCOUNT = (1 << 3), | 887 | FTRACE_START_FUNC_RET = (1 << 3), |
| 881 | FTRACE_DISABLE_MCOUNT = (1 << 4), | 888 | FTRACE_STOP_FUNC_RET = (1 << 4), |
| 882 | FTRACE_START_FUNC_RET = (1 << 5), | ||
| 883 | FTRACE_STOP_FUNC_RET = (1 << 6), | ||
| 884 | }; | 889 | }; |
| 885 | 890 | ||
| 886 | static int ftrace_filtered; | 891 | static int ftrace_filtered; |
| @@ -1219,8 +1224,6 @@ static void ftrace_shutdown(int command) | |||
| 1219 | 1224 | ||
| 1220 | static void ftrace_startup_sysctl(void) | 1225 | static void ftrace_startup_sysctl(void) |
| 1221 | { | 1226 | { |
| 1222 | int command = FTRACE_ENABLE_MCOUNT; | ||
| 1223 | |||
| 1224 | if (unlikely(ftrace_disabled)) | 1227 | if (unlikely(ftrace_disabled)) |
| 1225 | return; | 1228 | return; |
| 1226 | 1229 | ||
| @@ -1228,23 +1231,17 @@ static void ftrace_startup_sysctl(void) | |||
| 1228 | saved_ftrace_func = NULL; | 1231 | saved_ftrace_func = NULL; |
| 1229 | /* ftrace_start_up is true if we want ftrace running */ | 1232 | /* ftrace_start_up is true if we want ftrace running */ |
| 1230 | if (ftrace_start_up) | 1233 | if (ftrace_start_up) |
| 1231 | command |= FTRACE_ENABLE_CALLS; | 1234 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); |
| 1232 | |||
| 1233 | ftrace_run_update_code(command); | ||
| 1234 | } | 1235 | } |
| 1235 | 1236 | ||
| 1236 | static void ftrace_shutdown_sysctl(void) | 1237 | static void ftrace_shutdown_sysctl(void) |
| 1237 | { | 1238 | { |
| 1238 | int command = FTRACE_DISABLE_MCOUNT; | ||
| 1239 | |||
| 1240 | if (unlikely(ftrace_disabled)) | 1239 | if (unlikely(ftrace_disabled)) |
| 1241 | return; | 1240 | return; |
| 1242 | 1241 | ||
| 1243 | /* ftrace_start_up is true if ftrace is running */ | 1242 | /* ftrace_start_up is true if ftrace is running */ |
| 1244 | if (ftrace_start_up) | 1243 | if (ftrace_start_up) |
| 1245 | command |= FTRACE_DISABLE_CALLS; | 1244 | ftrace_run_update_code(FTRACE_DISABLE_CALLS); |
| 1246 | |||
| 1247 | ftrace_run_update_code(command); | ||
| 1248 | } | 1245 | } |
| 1249 | 1246 | ||
| 1250 | static cycle_t ftrace_update_time; | 1247 | static cycle_t ftrace_update_time; |
| @@ -1361,24 +1358,29 @@ enum { | |||
| 1361 | #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ | 1358 | #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ |
| 1362 | 1359 | ||
| 1363 | struct ftrace_iterator { | 1360 | struct ftrace_iterator { |
| 1364 | struct ftrace_page *pg; | 1361 | loff_t pos; |
| 1365 | int hidx; | 1362 | loff_t func_pos; |
| 1366 | int idx; | 1363 | struct ftrace_page *pg; |
| 1367 | unsigned flags; | 1364 | struct dyn_ftrace *func; |
| 1368 | struct trace_parser parser; | 1365 | struct ftrace_func_probe *probe; |
| 1366 | struct trace_parser parser; | ||
| 1367 | int hidx; | ||
| 1368 | int idx; | ||
| 1369 | unsigned flags; | ||
| 1369 | }; | 1370 | }; |
| 1370 | 1371 | ||
| 1371 | static void * | 1372 | static void * |
| 1372 | t_hash_next(struct seq_file *m, void *v, loff_t *pos) | 1373 | t_hash_next(struct seq_file *m, loff_t *pos) |
| 1373 | { | 1374 | { |
| 1374 | struct ftrace_iterator *iter = m->private; | 1375 | struct ftrace_iterator *iter = m->private; |
| 1375 | struct hlist_node *hnd = v; | 1376 | struct hlist_node *hnd = NULL; |
| 1376 | struct hlist_head *hhd; | 1377 | struct hlist_head *hhd; |
| 1377 | 1378 | ||
| 1378 | WARN_ON(!(iter->flags & FTRACE_ITER_HASH)); | ||
| 1379 | |||
| 1380 | (*pos)++; | 1379 | (*pos)++; |
| 1380 | iter->pos = *pos; | ||
| 1381 | 1381 | ||
| 1382 | if (iter->probe) | ||
| 1383 | hnd = &iter->probe->node; | ||
| 1382 | retry: | 1384 | retry: |
| 1383 | if (iter->hidx >= FTRACE_FUNC_HASHSIZE) | 1385 | if (iter->hidx >= FTRACE_FUNC_HASHSIZE) |
| 1384 | return NULL; | 1386 | return NULL; |
| @@ -1401,7 +1403,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos) | |||
| 1401 | } | 1403 | } |
| 1402 | } | 1404 | } |
| 1403 | 1405 | ||
| 1404 | return hnd; | 1406 | if (WARN_ON_ONCE(!hnd)) |
| 1407 | return NULL; | ||
| 1408 | |||
| 1409 | iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); | ||
| 1410 | |||
| 1411 | return iter; | ||
| 1405 | } | 1412 | } |
| 1406 | 1413 | ||
| 1407 | static void *t_hash_start(struct seq_file *m, loff_t *pos) | 1414 | static void *t_hash_start(struct seq_file *m, loff_t *pos) |
| @@ -1410,26 +1417,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) | |||
| 1410 | void *p = NULL; | 1417 | void *p = NULL; |
| 1411 | loff_t l; | 1418 | loff_t l; |
| 1412 | 1419 | ||
| 1413 | if (!(iter->flags & FTRACE_ITER_HASH)) | 1420 | if (iter->func_pos > *pos) |
| 1414 | *pos = 0; | 1421 | return NULL; |
| 1415 | |||
| 1416 | iter->flags |= FTRACE_ITER_HASH; | ||
| 1417 | 1422 | ||
| 1418 | iter->hidx = 0; | 1423 | iter->hidx = 0; |
| 1419 | for (l = 0; l <= *pos; ) { | 1424 | for (l = 0; l <= (*pos - iter->func_pos); ) { |
| 1420 | p = t_hash_next(m, p, &l); | 1425 | p = t_hash_next(m, &l); |
| 1421 | if (!p) | 1426 | if (!p) |
| 1422 | break; | 1427 | break; |
| 1423 | } | 1428 | } |
| 1424 | return p; | 1429 | if (!p) |
| 1430 | return NULL; | ||
| 1431 | |||
| 1432 | /* Only set this if we have an item */ | ||
| 1433 | iter->flags |= FTRACE_ITER_HASH; | ||
| 1434 | |||
| 1435 | return iter; | ||
| 1425 | } | 1436 | } |
| 1426 | 1437 | ||
| 1427 | static int t_hash_show(struct seq_file *m, void *v) | 1438 | static int |
| 1439 | t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) | ||
| 1428 | { | 1440 | { |
| 1429 | struct ftrace_func_probe *rec; | 1441 | struct ftrace_func_probe *rec; |
| 1430 | struct hlist_node *hnd = v; | ||
| 1431 | 1442 | ||
| 1432 | rec = hlist_entry(hnd, struct ftrace_func_probe, node); | 1443 | rec = iter->probe; |
| 1444 | if (WARN_ON_ONCE(!rec)) | ||
| 1445 | return -EIO; | ||
| 1433 | 1446 | ||
| 1434 | if (rec->ops->print) | 1447 | if (rec->ops->print) |
| 1435 | return rec->ops->print(m, rec->ip, rec->ops, rec->data); | 1448 | return rec->ops->print(m, rec->ip, rec->ops, rec->data); |
| @@ -1450,12 +1463,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
| 1450 | struct dyn_ftrace *rec = NULL; | 1463 | struct dyn_ftrace *rec = NULL; |
| 1451 | 1464 | ||
| 1452 | if (iter->flags & FTRACE_ITER_HASH) | 1465 | if (iter->flags & FTRACE_ITER_HASH) |
| 1453 | return t_hash_next(m, v, pos); | 1466 | return t_hash_next(m, pos); |
| 1454 | 1467 | ||
| 1455 | (*pos)++; | 1468 | (*pos)++; |
| 1469 | iter->pos = *pos; | ||
| 1456 | 1470 | ||
| 1457 | if (iter->flags & FTRACE_ITER_PRINTALL) | 1471 | if (iter->flags & FTRACE_ITER_PRINTALL) |
| 1458 | return NULL; | 1472 | return t_hash_start(m, pos); |
| 1459 | 1473 | ||
| 1460 | retry: | 1474 | retry: |
| 1461 | if (iter->idx >= iter->pg->index) { | 1475 | if (iter->idx >= iter->pg->index) { |
| @@ -1484,7 +1498,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
| 1484 | } | 1498 | } |
| 1485 | } | 1499 | } |
| 1486 | 1500 | ||
| 1487 | return rec; | 1501 | if (!rec) |
| 1502 | return t_hash_start(m, pos); | ||
| 1503 | |||
| 1504 | iter->func_pos = *pos; | ||
| 1505 | iter->func = rec; | ||
| 1506 | |||
| 1507 | return iter; | ||
| 1508 | } | ||
| 1509 | |||
| 1510 | static void reset_iter_read(struct ftrace_iterator *iter) | ||
| 1511 | { | ||
| 1512 | iter->pos = 0; | ||
| 1513 | iter->func_pos = 0; | ||
| 1514 | iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); | ||
| 1488 | } | 1515 | } |
| 1489 | 1516 | ||
| 1490 | static void *t_start(struct seq_file *m, loff_t *pos) | 1517 | static void *t_start(struct seq_file *m, loff_t *pos) |
| @@ -1495,6 +1522,12 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
| 1495 | 1522 | ||
| 1496 | mutex_lock(&ftrace_lock); | 1523 | mutex_lock(&ftrace_lock); |
| 1497 | /* | 1524 | /* |
| 1525 | * If an lseek was done, then reset and start from beginning. | ||
| 1526 | */ | ||
| 1527 | if (*pos < iter->pos) | ||
| 1528 | reset_iter_read(iter); | ||
| 1529 | |||
| 1530 | /* | ||
| 1498 | * For set_ftrace_filter reading, if we have the filter | 1531 | * For set_ftrace_filter reading, if we have the filter |
| 1499 | * off, we can short cut and just print out that all | 1532 | * off, we can short cut and just print out that all |
| 1500 | * functions are enabled. | 1533 | * functions are enabled. |
| @@ -1503,12 +1536,19 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
| 1503 | if (*pos > 0) | 1536 | if (*pos > 0) |
| 1504 | return t_hash_start(m, pos); | 1537 | return t_hash_start(m, pos); |
| 1505 | iter->flags |= FTRACE_ITER_PRINTALL; | 1538 | iter->flags |= FTRACE_ITER_PRINTALL; |
| 1539 | /* reset in case of seek/pread */ | ||
| 1540 | iter->flags &= ~FTRACE_ITER_HASH; | ||
| 1506 | return iter; | 1541 | return iter; |
| 1507 | } | 1542 | } |
| 1508 | 1543 | ||
| 1509 | if (iter->flags & FTRACE_ITER_HASH) | 1544 | if (iter->flags & FTRACE_ITER_HASH) |
| 1510 | return t_hash_start(m, pos); | 1545 | return t_hash_start(m, pos); |
| 1511 | 1546 | ||
| 1547 | /* | ||
| 1548 | * Unfortunately, we need to restart at ftrace_pages_start | ||
| 1549 | * every time we let go of the ftrace_mutex. This is because | ||
| 1550 | * those pointers can change without the lock. | ||
| 1551 | */ | ||
| 1512 | iter->pg = ftrace_pages_start; | 1552 | iter->pg = ftrace_pages_start; |
| 1513 | iter->idx = 0; | 1553 | iter->idx = 0; |
| 1514 | for (l = 0; l <= *pos; ) { | 1554 | for (l = 0; l <= *pos; ) { |
| @@ -1517,10 +1557,14 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
| 1517 | break; | 1557 | break; |
| 1518 | } | 1558 | } |
| 1519 | 1559 | ||
| 1520 | if (!p && iter->flags & FTRACE_ITER_FILTER) | 1560 | if (!p) { |
| 1521 | return t_hash_start(m, pos); | 1561 | if (iter->flags & FTRACE_ITER_FILTER) |
| 1562 | return t_hash_start(m, pos); | ||
| 1522 | 1563 | ||
| 1523 | return p; | 1564 | return NULL; |
| 1565 | } | ||
| 1566 | |||
| 1567 | return iter; | ||
| 1524 | } | 1568 | } |
| 1525 | 1569 | ||
| 1526 | static void t_stop(struct seq_file *m, void *p) | 1570 | static void t_stop(struct seq_file *m, void *p) |
| @@ -1531,16 +1575,18 @@ static void t_stop(struct seq_file *m, void *p) | |||
| 1531 | static int t_show(struct seq_file *m, void *v) | 1575 | static int t_show(struct seq_file *m, void *v) |
| 1532 | { | 1576 | { |
| 1533 | struct ftrace_iterator *iter = m->private; | 1577 | struct ftrace_iterator *iter = m->private; |
| 1534 | struct dyn_ftrace *rec = v; | 1578 | struct dyn_ftrace *rec; |
| 1535 | 1579 | ||
| 1536 | if (iter->flags & FTRACE_ITER_HASH) | 1580 | if (iter->flags & FTRACE_ITER_HASH) |
| 1537 | return t_hash_show(m, v); | 1581 | return t_hash_show(m, iter); |
| 1538 | 1582 | ||
| 1539 | if (iter->flags & FTRACE_ITER_PRINTALL) { | 1583 | if (iter->flags & FTRACE_ITER_PRINTALL) { |
| 1540 | seq_printf(m, "#### all functions enabled ####\n"); | 1584 | seq_printf(m, "#### all functions enabled ####\n"); |
| 1541 | return 0; | 1585 | return 0; |
| 1542 | } | 1586 | } |
| 1543 | 1587 | ||
| 1588 | rec = iter->func; | ||
| 1589 | |||
| 1544 | if (!rec) | 1590 | if (!rec) |
| 1545 | return 0; | 1591 | return 0; |
| 1546 | 1592 | ||
| @@ -1592,8 +1638,8 @@ ftrace_failures_open(struct inode *inode, struct file *file) | |||
| 1592 | 1638 | ||
| 1593 | ret = ftrace_avail_open(inode, file); | 1639 | ret = ftrace_avail_open(inode, file); |
| 1594 | if (!ret) { | 1640 | if (!ret) { |
| 1595 | m = (struct seq_file *)file->private_data; | 1641 | m = file->private_data; |
| 1596 | iter = (struct ftrace_iterator *)m->private; | 1642 | iter = m->private; |
| 1597 | iter->flags = FTRACE_ITER_FAILURES; | 1643 | iter->flags = FTRACE_ITER_FAILURES; |
| 1598 | } | 1644 | } |
| 1599 | 1645 | ||
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 19cccc3c3028..c5a632a669e1 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -405,7 +405,7 @@ static inline int test_time_stamp(u64 delta) | |||
| 405 | #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) | 405 | #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) |
| 406 | 406 | ||
| 407 | /* Max number of timestamps that can fit on a page */ | 407 | /* Max number of timestamps that can fit on a page */ |
| 408 | #define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) | 408 | #define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND) |
| 409 | 409 | ||
| 410 | int ring_buffer_print_page_header(struct trace_seq *s) | 410 | int ring_buffer_print_page_header(struct trace_seq *s) |
| 411 | { | 411 | { |
| @@ -2606,6 +2606,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) | |||
| 2606 | } | 2606 | } |
| 2607 | EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); | 2607 | EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); |
| 2608 | 2608 | ||
| 2609 | /* | ||
| 2610 | * The total entries in the ring buffer is the running counter | ||
| 2611 | * of entries entered into the ring buffer, minus the sum of | ||
| 2612 | * the entries read from the ring buffer and the number of | ||
| 2613 | * entries that were overwritten. | ||
| 2614 | */ | ||
| 2615 | static inline unsigned long | ||
| 2616 | rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) | ||
| 2617 | { | ||
| 2618 | return local_read(&cpu_buffer->entries) - | ||
| 2619 | (local_read(&cpu_buffer->overrun) + cpu_buffer->read); | ||
| 2620 | } | ||
| 2621 | |||
| 2609 | /** | 2622 | /** |
| 2610 | * ring_buffer_entries_cpu - get the number of entries in a cpu buffer | 2623 | * ring_buffer_entries_cpu - get the number of entries in a cpu buffer |
| 2611 | * @buffer: The ring buffer | 2624 | * @buffer: The ring buffer |
| @@ -2614,16 +2627,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); | |||
| 2614 | unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) | 2627 | unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) |
| 2615 | { | 2628 | { |
| 2616 | struct ring_buffer_per_cpu *cpu_buffer; | 2629 | struct ring_buffer_per_cpu *cpu_buffer; |
| 2617 | unsigned long ret; | ||
| 2618 | 2630 | ||
| 2619 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | 2631 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) |
| 2620 | return 0; | 2632 | return 0; |
| 2621 | 2633 | ||
| 2622 | cpu_buffer = buffer->buffers[cpu]; | 2634 | cpu_buffer = buffer->buffers[cpu]; |
| 2623 | ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun)) | ||
| 2624 | - cpu_buffer->read; | ||
| 2625 | 2635 | ||
| 2626 | return ret; | 2636 | return rb_num_of_entries(cpu_buffer); |
| 2627 | } | 2637 | } |
| 2628 | EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); | 2638 | EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); |
| 2629 | 2639 | ||
| @@ -2684,8 +2694,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) | |||
| 2684 | /* if you care about this being correct, lock the buffer */ | 2694 | /* if you care about this being correct, lock the buffer */ |
| 2685 | for_each_buffer_cpu(buffer, cpu) { | 2695 | for_each_buffer_cpu(buffer, cpu) { |
| 2686 | cpu_buffer = buffer->buffers[cpu]; | 2696 | cpu_buffer = buffer->buffers[cpu]; |
| 2687 | entries += (local_read(&cpu_buffer->entries) - | 2697 | entries += rb_num_of_entries(cpu_buffer); |
| 2688 | local_read(&cpu_buffer->overrun)) - cpu_buffer->read; | ||
| 2689 | } | 2698 | } |
| 2690 | 2699 | ||
| 2691 | return entries; | 2700 | return entries; |
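rb_num_of_entries() folds the bookkeeping that was previously open-coded in two places: entries still sitting in the buffer = entries ever committed, minus both the entries already read and the entries lost to overwrite. A quick stand-alone example with invented counter values:

    #include <stdio.h>

    int main(void)
    {
            unsigned long written = 1500;   /* events ever committed       */
            unsigned long overrun = 300;    /* events lost to buffer wrap  */
            unsigned long read    = 1000;   /* events already consumed     */

            /* mirrors rb_num_of_entries(): what is still waiting to be read */
            unsigned long remaining = written - (overrun + read);

            printf("entries still in the ring buffer: %lu\n", remaining); /* 200 */
            return 0;
    }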
| @@ -2985,13 +2994,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) | |||
| 2985 | 2994 | ||
| 2986 | static void rb_advance_iter(struct ring_buffer_iter *iter) | 2995 | static void rb_advance_iter(struct ring_buffer_iter *iter) |
| 2987 | { | 2996 | { |
| 2988 | struct ring_buffer *buffer; | ||
| 2989 | struct ring_buffer_per_cpu *cpu_buffer; | 2997 | struct ring_buffer_per_cpu *cpu_buffer; |
| 2990 | struct ring_buffer_event *event; | 2998 | struct ring_buffer_event *event; |
| 2991 | unsigned length; | 2999 | unsigned length; |
| 2992 | 3000 | ||
| 2993 | cpu_buffer = iter->cpu_buffer; | 3001 | cpu_buffer = iter->cpu_buffer; |
| 2994 | buffer = cpu_buffer->buffer; | ||
| 2995 | 3002 | ||
| 2996 | /* | 3003 | /* |
| 2997 | * Check if we are at the end of the buffer. | 3004 | * Check if we are at the end of the buffer. |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9ec59f541156..001bcd2ccf4a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -2196,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp) | |||
| 2196 | 2196 | ||
| 2197 | static int tracing_release(struct inode *inode, struct file *file) | 2197 | static int tracing_release(struct inode *inode, struct file *file) |
| 2198 | { | 2198 | { |
| 2199 | struct seq_file *m = (struct seq_file *)file->private_data; | 2199 | struct seq_file *m = file->private_data; |
| 2200 | struct trace_iterator *iter; | 2200 | struct trace_iterator *iter; |
| 2201 | int cpu; | 2201 | int cpu; |
| 2202 | 2202 | ||
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d39b3c5454a5..9021f8c0c0c3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr, | |||
| 343 | unsigned long ip, | 343 | unsigned long ip, |
| 344 | unsigned long parent_ip, | 344 | unsigned long parent_ip, |
| 345 | unsigned long flags, int pc); | 345 | unsigned long flags, int pc); |
| 346 | void trace_graph_function(struct trace_array *tr, | ||
| 347 | unsigned long ip, | ||
| 348 | unsigned long parent_ip, | ||
| 349 | unsigned long flags, int pc); | ||
| 346 | void trace_default_header(struct seq_file *m); | 350 | void trace_default_header(struct seq_file *m); |
| 347 | void print_trace_header(struct seq_file *m, struct trace_iterator *iter); | 351 | void print_trace_header(struct seq_file *m, struct trace_iterator *iter); |
| 348 | int trace_empty(struct trace_iterator *iter); | 352 | int trace_empty(struct trace_iterator *iter); |
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 000e6e85b445..39c059ca670e 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
| @@ -9,7 +9,7 @@ | |||
| 9 | #include <linux/kprobes.h> | 9 | #include <linux/kprobes.h> |
| 10 | #include "trace.h" | 10 | #include "trace.h" |
| 11 | 11 | ||
| 12 | static char *perf_trace_buf[4]; | 12 | static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; |
| 13 | 13 | ||
| 14 | /* | 14 | /* |
| 15 | * Force it to be aligned to unsigned long to avoid misaligned accesses | 15 | * Force it to be aligned to unsigned long to avoid misaligned accesses |
| @@ -24,7 +24,7 @@ static int total_ref_count; | |||
| 24 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, | 24 | static int perf_trace_event_init(struct ftrace_event_call *tp_event, |
| 25 | struct perf_event *p_event) | 25 | struct perf_event *p_event) |
| 26 | { | 26 | { |
| 27 | struct hlist_head *list; | 27 | struct hlist_head __percpu *list; |
| 28 | int ret = -ENOMEM; | 28 | int ret = -ENOMEM; |
| 29 | int cpu; | 29 | int cpu; |
| 30 | 30 | ||
| @@ -42,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, | |||
| 42 | tp_event->perf_events = list; | 42 | tp_event->perf_events = list; |
| 43 | 43 | ||
| 44 | if (!total_ref_count) { | 44 | if (!total_ref_count) { |
| 45 | char *buf; | 45 | char __percpu *buf; |
| 46 | int i; | 46 | int i; |
| 47 | 47 | ||
| 48 | for (i = 0; i < 4; i++) { | 48 | for (i = 0; i < PERF_NR_CONTEXTS; i++) { |
| 49 | buf = (char *)alloc_percpu(perf_trace_t); | 49 | buf = (char __percpu *)alloc_percpu(perf_trace_t); |
| 50 | if (!buf) | 50 | if (!buf) |
| 51 | goto fail; | 51 | goto fail; |
| 52 | 52 | ||
| @@ -65,7 +65,7 @@ fail: | |||
| 65 | if (!total_ref_count) { | 65 | if (!total_ref_count) { |
| 66 | int i; | 66 | int i; |
| 67 | 67 | ||
| 68 | for (i = 0; i < 4; i++) { | 68 | for (i = 0; i < PERF_NR_CONTEXTS; i++) { |
| 69 | free_percpu(perf_trace_buf[i]); | 69 | free_percpu(perf_trace_buf[i]); |
| 70 | perf_trace_buf[i] = NULL; | 70 | perf_trace_buf[i] = NULL; |
| 71 | } | 71 | } |
| @@ -91,6 +91,8 @@ int perf_trace_init(struct perf_event *p_event) | |||
| 91 | tp_event->class && tp_event->class->reg && | 91 | tp_event->class && tp_event->class->reg && |
| 92 | try_module_get(tp_event->mod)) { | 92 | try_module_get(tp_event->mod)) { |
| 93 | ret = perf_trace_event_init(tp_event, p_event); | 93 | ret = perf_trace_event_init(tp_event, p_event); |
| 94 | if (ret) | ||
| 95 | module_put(tp_event->mod); | ||
| 94 | break; | 96 | break; |
| 95 | } | 97 | } |
| 96 | } | 98 | } |
| @@ -99,22 +101,26 @@ int perf_trace_init(struct perf_event *p_event) | |||
| 99 | return ret; | 101 | return ret; |
| 100 | } | 102 | } |
| 101 | 103 | ||
| 102 | int perf_trace_enable(struct perf_event *p_event) | 104 | int perf_trace_add(struct perf_event *p_event, int flags) |
| 103 | { | 105 | { |
| 104 | struct ftrace_event_call *tp_event = p_event->tp_event; | 106 | struct ftrace_event_call *tp_event = p_event->tp_event; |
| 107 | struct hlist_head __percpu *pcpu_list; | ||
| 105 | struct hlist_head *list; | 108 | struct hlist_head *list; |
| 106 | 109 | ||
| 107 | list = tp_event->perf_events; | 110 | pcpu_list = tp_event->perf_events; |
| 108 | if (WARN_ON_ONCE(!list)) | 111 | if (WARN_ON_ONCE(!pcpu_list)) |
| 109 | return -EINVAL; | 112 | return -EINVAL; |
| 110 | 113 | ||
| 111 | list = this_cpu_ptr(list); | 114 | if (!(flags & PERF_EF_START)) |
| 115 | p_event->hw.state = PERF_HES_STOPPED; | ||
| 116 | |||
| 117 | list = this_cpu_ptr(pcpu_list); | ||
| 112 | hlist_add_head_rcu(&p_event->hlist_entry, list); | 118 | hlist_add_head_rcu(&p_event->hlist_entry, list); |
| 113 | 119 | ||
| 114 | return 0; | 120 | return 0; |
| 115 | } | 121 | } |
| 116 | 122 | ||
| 117 | void perf_trace_disable(struct perf_event *p_event) | 123 | void perf_trace_del(struct perf_event *p_event, int flags) |
| 118 | { | 124 | { |
| 119 | hlist_del_rcu(&p_event->hlist_entry); | 125 | hlist_del_rcu(&p_event->hlist_entry); |
| 120 | } | 126 | } |
| @@ -140,12 +146,13 @@ void perf_trace_destroy(struct perf_event *p_event) | |||
| 140 | tp_event->perf_events = NULL; | 146 | tp_event->perf_events = NULL; |
| 141 | 147 | ||
| 142 | if (!--total_ref_count) { | 148 | if (!--total_ref_count) { |
| 143 | for (i = 0; i < 4; i++) { | 149 | for (i = 0; i < PERF_NR_CONTEXTS; i++) { |
| 144 | free_percpu(perf_trace_buf[i]); | 150 | free_percpu(perf_trace_buf[i]); |
| 145 | perf_trace_buf[i] = NULL; | 151 | perf_trace_buf[i] = NULL; |
| 146 | } | 152 | } |
| 147 | } | 153 | } |
| 148 | out: | 154 | out: |
| 155 | module_put(tp_event->mod); | ||
| 149 | mutex_unlock(&event_mutex); | 156 | mutex_unlock(&event_mutex); |
| 150 | } | 157 | } |
| 151 | 158 | ||
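PERF_NR_CONTEXTS names what the bare 4 used to hide: one per-cpu scratch buffer per recursion context (task, softirq, hardirq, NMI), so a tracepoint hit from an interrupt cannot scribble over a buffer that a lower context is still filling. A hedged sketch of how one buffer would be picked per event; the recursion helpers are existing perf APIs, the wrapper itself is illustrative:

    /* Hedged sketch: pick the per-context scratch buffer for this event. */
    static void *get_trace_buf(int *rctx)
    {
            char __percpu *percpu_buf;

            *rctx = perf_swevent_get_recursion_context();
            if (*rctx < 0)
                    return NULL;            /* already active in this context */

            percpu_buf = perf_trace_buf[*rctx];     /* one buffer per context */
            return this_cpu_ptr(percpu_buf);
    }

    /* ...fill the buffer and submit the event, then release the context:  */
    /*        perf_swevent_put_recursion_context(rctx);                    */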
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4c758f146328..398c0e8b332c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -600,21 +600,29 @@ out: | |||
| 600 | 600 | ||
| 601 | enum { | 601 | enum { |
| 602 | FORMAT_HEADER = 1, | 602 | FORMAT_HEADER = 1, |
| 603 | FORMAT_PRINTFMT = 2, | 603 | FORMAT_FIELD_SEPERATOR = 2, |
| 604 | FORMAT_PRINTFMT = 3, | ||
| 604 | }; | 605 | }; |
| 605 | 606 | ||
| 606 | static void *f_next(struct seq_file *m, void *v, loff_t *pos) | 607 | static void *f_next(struct seq_file *m, void *v, loff_t *pos) |
| 607 | { | 608 | { |
| 608 | struct ftrace_event_call *call = m->private; | 609 | struct ftrace_event_call *call = m->private; |
| 609 | struct ftrace_event_field *field; | 610 | struct ftrace_event_field *field; |
| 610 | struct list_head *head; | 611 | struct list_head *common_head = &ftrace_common_fields; |
| 612 | struct list_head *head = trace_get_fields(call); | ||
| 611 | 613 | ||
| 612 | (*pos)++; | 614 | (*pos)++; |
| 613 | 615 | ||
| 614 | switch ((unsigned long)v) { | 616 | switch ((unsigned long)v) { |
| 615 | case FORMAT_HEADER: | 617 | case FORMAT_HEADER: |
| 616 | head = &ftrace_common_fields; | 618 | if (unlikely(list_empty(common_head))) |
| 619 | return NULL; | ||
| 620 | |||
| 621 | field = list_entry(common_head->prev, | ||
| 622 | struct ftrace_event_field, link); | ||
| 623 | return field; | ||
| 617 | 624 | ||
| 625 | case FORMAT_FIELD_SEPERATOR: | ||
| 618 | if (unlikely(list_empty(head))) | 626 | if (unlikely(list_empty(head))) |
| 619 | return NULL; | 627 | return NULL; |
| 620 | 628 | ||
| @@ -626,31 +634,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos) | |||
| 626 | return NULL; | 634 | return NULL; |
| 627 | } | 635 | } |
| 628 | 636 | ||
| 629 | head = trace_get_fields(call); | ||
| 630 | |||
| 631 | /* | ||
| 632 | * To separate common fields from event fields, the | ||
| 633 | * LSB is set on the first event field. Clear it in case. | ||
| 634 | */ | ||
| 635 | v = (void *)((unsigned long)v & ~1L); | ||
| 636 | |||
| 637 | field = v; | 637 | field = v; |
| 638 | /* | 638 | if (field->link.prev == common_head) |
| 639 | * If this is a common field, and at the end of the list, then | 639 | return (void *)FORMAT_FIELD_SEPERATOR; |
| 640 | * continue with main list. | 640 | else if (field->link.prev == head) |
| 641 | */ | ||
| 642 | if (field->link.prev == &ftrace_common_fields) { | ||
| 643 | if (unlikely(list_empty(head))) | ||
| 644 | return NULL; | ||
| 645 | field = list_entry(head->prev, struct ftrace_event_field, link); | ||
| 646 | /* Set the LSB to notify f_show to print an extra newline */ | ||
| 647 | field = (struct ftrace_event_field *) | ||
| 648 | ((unsigned long)field | 1); | ||
| 649 | return field; | ||
| 650 | } | ||
| 651 | |||
| 652 | /* If we are done tell f_show to print the format */ | ||
| 653 | if (field->link.prev == head) | ||
| 654 | return (void *)FORMAT_PRINTFMT; | 641 | return (void *)FORMAT_PRINTFMT; |
| 655 | 642 | ||
| 656 | field = list_entry(field->link.prev, struct ftrace_event_field, link); | 643 | field = list_entry(field->link.prev, struct ftrace_event_field, link); |
| @@ -688,22 +675,16 @@ static int f_show(struct seq_file *m, void *v) | |||
| 688 | seq_printf(m, "format:\n"); | 675 | seq_printf(m, "format:\n"); |
| 689 | return 0; | 676 | return 0; |
| 690 | 677 | ||
| 678 | case FORMAT_FIELD_SEPERATOR: | ||
| 679 | seq_putc(m, '\n'); | ||
| 680 | return 0; | ||
| 681 | |||
| 691 | case FORMAT_PRINTFMT: | 682 | case FORMAT_PRINTFMT: |
| 692 | seq_printf(m, "\nprint fmt: %s\n", | 683 | seq_printf(m, "\nprint fmt: %s\n", |
| 693 | call->print_fmt); | 684 | call->print_fmt); |
| 694 | return 0; | 685 | return 0; |
| 695 | } | 686 | } |
| 696 | 687 | ||
| 697 | /* | ||
| 698 | * To separate common fields from event fields, the | ||
| 699 | * LSB is set on the first event field. Clear it and | ||
| 700 | * print a newline if it is set. | ||
| 701 | */ | ||
| 702 | if ((unsigned long)v & 1) { | ||
| 703 | seq_putc(m, '\n'); | ||
| 704 | v = (void *)((unsigned long)v & ~1L); | ||
| 705 | } | ||
| 706 | |||
| 707 | field = v; | 688 | field = v; |
| 708 | 689 | ||
| 709 | /* | 690 | /* |
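The f_next()/f_show() rewrite above drops the pointer-LSB trick in favour of small integer sentinels (FORMAT_HEADER, FORMAT_FIELD_SEPERATOR, FORMAT_PRINTFMT) that ->next() hands back as fake pointers and ->show() decodes before treating the value as a real ftrace_event_field. A generic sketch of that seq_file idiom with hypothetical section names (small constants are safe sentinels because they can never collide with valid kernel pointers):

  #include <linux/seq_file.h>

  enum { SECTION_HEADER = 1, SECTION_SEPARATOR = 2, SECTION_FOOTER = 3 };

  /* ->show() dispatches on the sentinel "pointers" produced by ->next(). */
  static int example_show(struct seq_file *m, void *v)
  {
          switch ((unsigned long)v) {
          case SECTION_HEADER:
                  seq_puts(m, "header:\n");
                  return 0;
          case SECTION_SEPARATOR:
                  seq_putc(m, '\n');
                  return 0;
          case SECTION_FOOTER:
                  seq_puts(m, "footer\n");
                  return 0;
          }

          /* Anything else is a pointer to a real item; format it here. */
          return 0;
  }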
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 6f233698518e..76b05980225c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -15,15 +15,19 @@ | |||
| 15 | #include "trace.h" | 15 | #include "trace.h" |
| 16 | #include "trace_output.h" | 16 | #include "trace_output.h" |
| 17 | 17 | ||
| 18 | /* When set, irq functions will be ignored */ | ||
| 19 | static int ftrace_graph_skip_irqs; | ||
| 20 | |||
| 18 | struct fgraph_cpu_data { | 21 | struct fgraph_cpu_data { |
| 19 | pid_t last_pid; | 22 | pid_t last_pid; |
| 20 | int depth; | 23 | int depth; |
| 24 | int depth_irq; | ||
| 21 | int ignore; | 25 | int ignore; |
| 22 | unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; | 26 | unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; |
| 23 | }; | 27 | }; |
| 24 | 28 | ||
| 25 | struct fgraph_data { | 29 | struct fgraph_data { |
| 26 | struct fgraph_cpu_data *cpu_data; | 30 | struct fgraph_cpu_data __percpu *cpu_data; |
| 27 | 31 | ||
| 28 | /* Place to preserve last processed entry. */ | 32 | /* Place to preserve last processed entry. */ |
| 29 | struct ftrace_graph_ent_entry ent; | 33 | struct ftrace_graph_ent_entry ent; |
| @@ -41,6 +45,7 @@ struct fgraph_data { | |||
| 41 | #define TRACE_GRAPH_PRINT_PROC 0x8 | 45 | #define TRACE_GRAPH_PRINT_PROC 0x8 |
| 42 | #define TRACE_GRAPH_PRINT_DURATION 0x10 | 46 | #define TRACE_GRAPH_PRINT_DURATION 0x10 |
| 43 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 | 47 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 |
| 48 | #define TRACE_GRAPH_PRINT_IRQS 0x40 | ||
| 44 | 49 | ||
| 45 | static struct tracer_opt trace_opts[] = { | 50 | static struct tracer_opt trace_opts[] = { |
| 46 | /* Display overruns? (for self-debug purpose) */ | 51 | /* Display overruns? (for self-debug purpose) */ |
| @@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = { | |||
| 55 | { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, | 60 | { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, |
| 56 | /* Display absolute time of an entry */ | 61 | /* Display absolute time of an entry */ |
| 57 | { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, | 62 | { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, |
| 63 | /* Display interrupts */ | ||
| 64 | { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, | ||
| 58 | { } /* Empty entry */ | 65 | { } /* Empty entry */ |
| 59 | }; | 66 | }; |
| 60 | 67 | ||
| 61 | static struct tracer_flags tracer_flags = { | 68 | static struct tracer_flags tracer_flags = { |
| 62 | /* Don't display overruns and proc by default */ | 69 | /* Don't display overruns and proc by default */ |
| 63 | .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | | 70 | .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | |
| 64 | TRACE_GRAPH_PRINT_DURATION, | 71 | TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, |
| 65 | .opts = trace_opts | 72 | .opts = trace_opts |
| 66 | }; | 73 | }; |
| 67 | 74 | ||
| @@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr, | |||
| 204 | return 1; | 211 | return 1; |
| 205 | } | 212 | } |
| 206 | 213 | ||
| 214 | static inline int ftrace_graph_ignore_irqs(void) | ||
| 215 | { | ||
| 216 | if (!ftrace_graph_skip_irqs) | ||
| 217 | return 0; | ||
| 218 | |||
| 219 | return in_irq(); | ||
| 220 | } | ||
| 221 | |||
| 207 | int trace_graph_entry(struct ftrace_graph_ent *trace) | 222 | int trace_graph_entry(struct ftrace_graph_ent *trace) |
| 208 | { | 223 | { |
| 209 | struct trace_array *tr = graph_array; | 224 | struct trace_array *tr = graph_array; |
| @@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) | |||
| 218 | return 0; | 233 | return 0; |
| 219 | 234 | ||
| 220 | /* trace it when it is-nested-in or is a function enabled. */ | 235 | /* trace it when it is-nested-in or is a function enabled. */ |
| 221 | if (!(trace->depth || ftrace_graph_addr(trace->func))) | 236 | if (!(trace->depth || ftrace_graph_addr(trace->func)) || |
| 237 | ftrace_graph_ignore_irqs()) | ||
| 222 | return 0; | 238 | return 0; |
| 223 | 239 | ||
| 224 | local_irq_save(flags); | 240 | local_irq_save(flags); |
| @@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) | |||
| 246 | return trace_graph_entry(trace); | 262 | return trace_graph_entry(trace); |
| 247 | } | 263 | } |
| 248 | 264 | ||
| 265 | static void | ||
| 266 | __trace_graph_function(struct trace_array *tr, | ||
| 267 | unsigned long ip, unsigned long flags, int pc) | ||
| 268 | { | ||
| 269 | u64 time = trace_clock_local(); | ||
| 270 | struct ftrace_graph_ent ent = { | ||
| 271 | .func = ip, | ||
| 272 | .depth = 0, | ||
| 273 | }; | ||
| 274 | struct ftrace_graph_ret ret = { | ||
| 275 | .func = ip, | ||
| 276 | .depth = 0, | ||
| 277 | .calltime = time, | ||
| 278 | .rettime = time, | ||
| 279 | }; | ||
| 280 | |||
| 281 | __trace_graph_entry(tr, &ent, flags, pc); | ||
| 282 | __trace_graph_return(tr, &ret, flags, pc); | ||
| 283 | } | ||
| 284 | |||
| 285 | void | ||
| 286 | trace_graph_function(struct trace_array *tr, | ||
| 287 | unsigned long ip, unsigned long parent_ip, | ||
| 288 | unsigned long flags, int pc) | ||
| 289 | { | ||
| 290 | __trace_graph_function(tr, ip, flags, pc); | ||
| 291 | } | ||
| 292 | |||
| 249 | void __trace_graph_return(struct trace_array *tr, | 293 | void __trace_graph_return(struct trace_array *tr, |
| 250 | struct ftrace_graph_ret *trace, | 294 | struct ftrace_graph_ret *trace, |
| 251 | unsigned long flags, | 295 | unsigned long flags, |
| @@ -649,8 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) | |||
| 649 | 693 | ||
| 650 | /* Print nsecs (we don't want to exceed 7 numbers) */ | 694 | /* Print nsecs (we don't want to exceed 7 numbers) */ |
| 651 | if (len < 7) { | 695 | if (len < 7) { |
| 652 | snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", | 696 | size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); |
| 653 | nsecs_rem); | 697 | |
| 698 | snprintf(nsecs_str, slen, "%03lu", nsecs_rem); | ||
| 654 | ret = trace_seq_printf(s, ".%s", nsecs_str); | 699 | ret = trace_seq_printf(s, ".%s", nsecs_str); |
| 655 | if (!ret) | 700 | if (!ret) |
| 656 | return TRACE_TYPE_PARTIAL_LINE; | 701 | return TRACE_TYPE_PARTIAL_LINE; |
| @@ -855,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, | |||
| 855 | return 0; | 900 | return 0; |
| 856 | } | 901 | } |
| 857 | 902 | ||
| 903 | /* | ||
| 904 | * Entry check for irq code | ||
| 905 | * | ||
| 906 | * returns 1 if | ||
| 907 | * - we are inside irq code | ||
| 908 | * - we just entered irq code | ||
| 909 | * | ||
| 910 | * returns 0 if | ||
| 911 | * - funcgraph-interrupts option is set | ||
| 912 | * - we are not inside irq code | ||
| 913 | */ | ||
| 914 | static int | ||
| 915 | check_irq_entry(struct trace_iterator *iter, u32 flags, | ||
| 916 | unsigned long addr, int depth) | ||
| 917 | { | ||
| 918 | int cpu = iter->cpu; | ||
| 919 | int *depth_irq; | ||
| 920 | struct fgraph_data *data = iter->private; | ||
| 921 | |||
| 922 | /* | ||
| 923 | * If we are either displaying irqs, or we got called as | ||
| 924 | * a graph event and private data does not exist, | ||
| 925 | * then we bypass the irq check. | ||
| 926 | */ | ||
| 927 | if ((flags & TRACE_GRAPH_PRINT_IRQS) || | ||
| 928 | (!data)) | ||
| 929 | return 0; | ||
| 930 | |||
| 931 | depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); | ||
| 932 | |||
| 933 | /* | ||
| 934 | * We are inside the irq code | ||
| 935 | */ | ||
| 936 | if (*depth_irq >= 0) | ||
| 937 | return 1; | ||
| 938 | |||
| 939 | if ((addr < (unsigned long)__irqentry_text_start) || | ||
| 940 | (addr >= (unsigned long)__irqentry_text_end)) | ||
| 941 | return 0; | ||
| 942 | |||
| 943 | /* | ||
| 944 | * We are entering irq code. | ||
| 945 | */ | ||
| 946 | *depth_irq = depth; | ||
| 947 | return 1; | ||
| 948 | } | ||
| 949 | |||
| 950 | /* | ||
| 951 | * Return check for irq code | ||
| 952 | * | ||
| 953 | * returns 1 if | ||
| 954 | * - we are inside irq code | ||
| 955 | * - we just left irq code | ||
| 956 | * | ||
| 957 | * returns 0 if | ||
| 958 | * - funcgraph-interrupts option is set | ||
| 959 | * - we are not inside irq code | ||
| 960 | */ | ||
| 961 | static int | ||
| 962 | check_irq_return(struct trace_iterator *iter, u32 flags, int depth) | ||
| 963 | { | ||
| 964 | int cpu = iter->cpu; | ||
| 965 | int *depth_irq; | ||
| 966 | struct fgraph_data *data = iter->private; | ||
| 967 | |||
| 968 | /* | ||
| 969 | * If we are either displaying irqs, or we got called as | ||
| 970 | * a graph event and private data does not exist, | ||
| 971 | * then we bypass the irq check. | ||
| 972 | */ | ||
| 973 | if ((flags & TRACE_GRAPH_PRINT_IRQS) || | ||
| 974 | (!data)) | ||
| 975 | return 0; | ||
| 976 | |||
| 977 | depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); | ||
| 978 | |||
| 979 | /* | ||
| 980 | * We are not inside the irq code. | ||
| 981 | */ | ||
| 982 | if (*depth_irq == -1) | ||
| 983 | return 0; | ||
| 984 | |||
| 985 | /* | ||
| 986 | * We are inside the irq code, and this is returning entry. | ||
| 987 | * Let's not trace it and clear the entry depth, since | ||
| 988 | * we are out of irq code. | ||
| 989 | * | ||
| 990 | * This condition ensures that we 'leave the irq code' once | ||
| 991 | * we are out of the entry depth. Thus protecting us from | ||
| 992 | * the RETURN entry loss. | ||
| 993 | */ | ||
| 994 | if (*depth_irq >= depth) { | ||
| 995 | *depth_irq = -1; | ||
| 996 | return 1; | ||
| 997 | } | ||
| 998 | |||
| 999 | /* | ||
| 1000 | * We are inside the irq code, and this is not the entry. | ||
| 1001 | */ | ||
| 1002 | return 1; | ||
| 1003 | } | ||
| 1004 | |||
| 858 | static enum print_line_t | 1005 | static enum print_line_t |
| 859 | print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, | 1006 | print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, |
| 860 | struct trace_iterator *iter, u32 flags) | 1007 | struct trace_iterator *iter, u32 flags) |
| @@ -865,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, | |||
| 865 | static enum print_line_t ret; | 1012 | static enum print_line_t ret; |
| 866 | int cpu = iter->cpu; | 1013 | int cpu = iter->cpu; |
| 867 | 1014 | ||
| 1015 | if (check_irq_entry(iter, flags, call->func, call->depth)) | ||
| 1016 | return TRACE_TYPE_HANDLED; | ||
| 1017 | |||
| 868 | if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) | 1018 | if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) |
| 869 | return TRACE_TYPE_PARTIAL_LINE; | 1019 | return TRACE_TYPE_PARTIAL_LINE; |
| 870 | 1020 | ||
| @@ -902,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
| 902 | int ret; | 1052 | int ret; |
| 903 | int i; | 1053 | int i; |
| 904 | 1054 | ||
| 1055 | if (check_irq_return(iter, flags, trace->depth)) | ||
| 1056 | return TRACE_TYPE_HANDLED; | ||
| 1057 | |||
| 905 | if (data) { | 1058 | if (data) { |
| 906 | struct fgraph_cpu_data *cpu_data; | 1059 | struct fgraph_cpu_data *cpu_data; |
| 907 | int cpu = iter->cpu; | 1060 | int cpu = iter->cpu; |
| @@ -1054,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
| 1054 | 1207 | ||
| 1055 | 1208 | ||
| 1056 | enum print_line_t | 1209 | enum print_line_t |
| 1057 | print_graph_function_flags(struct trace_iterator *iter, u32 flags) | 1210 | __print_graph_function_flags(struct trace_iterator *iter, u32 flags) |
| 1058 | { | 1211 | { |
| 1059 | struct ftrace_graph_ent_entry *field; | 1212 | struct ftrace_graph_ent_entry *field; |
| 1060 | struct fgraph_data *data = iter->private; | 1213 | struct fgraph_data *data = iter->private; |
| @@ -1117,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) | |||
| 1117 | static enum print_line_t | 1270 | static enum print_line_t |
| 1118 | print_graph_function(struct trace_iterator *iter) | 1271 | print_graph_function(struct trace_iterator *iter) |
| 1119 | { | 1272 | { |
| 1120 | return print_graph_function_flags(iter, tracer_flags.val); | 1273 | return __print_graph_function_flags(iter, tracer_flags.val); |
| 1274 | } | ||
| 1275 | |||
| 1276 | enum print_line_t print_graph_function_flags(struct trace_iterator *iter, | ||
| 1277 | u32 flags) | ||
| 1278 | { | ||
| 1279 | if (trace_flags & TRACE_ITER_LATENCY_FMT) | ||
| 1280 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
| 1281 | else | ||
| 1282 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
| 1283 | |||
| 1284 | return __print_graph_function_flags(iter, flags); | ||
| 1121 | } | 1285 | } |
| 1122 | 1286 | ||
| 1123 | static enum print_line_t | 1287 | static enum print_line_t |
| @@ -1149,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) | |||
| 1149 | seq_printf(s, "#%.*s|||| / \n", size, spaces); | 1313 | seq_printf(s, "#%.*s|||| / \n", size, spaces); |
| 1150 | } | 1314 | } |
| 1151 | 1315 | ||
| 1152 | void print_graph_headers_flags(struct seq_file *s, u32 flags) | 1316 | static void __print_graph_headers_flags(struct seq_file *s, u32 flags) |
| 1153 | { | 1317 | { |
| 1154 | int lat = trace_flags & TRACE_ITER_LATENCY_FMT; | 1318 | int lat = trace_flags & TRACE_ITER_LATENCY_FMT; |
| 1155 | 1319 | ||
| @@ -1190,6 +1354,23 @@ void print_graph_headers(struct seq_file *s) | |||
| 1190 | print_graph_headers_flags(s, tracer_flags.val); | 1354 | print_graph_headers_flags(s, tracer_flags.val); |
| 1191 | } | 1355 | } |
| 1192 | 1356 | ||
| 1357 | void print_graph_headers_flags(struct seq_file *s, u32 flags) | ||
| 1358 | { | ||
| 1359 | struct trace_iterator *iter = s->private; | ||
| 1360 | |||
| 1361 | if (trace_flags & TRACE_ITER_LATENCY_FMT) { | ||
| 1362 | /* print nothing if the buffers are empty */ | ||
| 1363 | if (trace_empty(iter)) | ||
| 1364 | return; | ||
| 1365 | |||
| 1366 | print_trace_header(s, iter); | ||
| 1367 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
| 1368 | } else | ||
| 1369 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
| 1370 | |||
| 1371 | __print_graph_headers_flags(s, flags); | ||
| 1372 | } | ||
| 1373 | |||
| 1193 | void graph_trace_open(struct trace_iterator *iter) | 1374 | void graph_trace_open(struct trace_iterator *iter) |
| 1194 | { | 1375 | { |
| 1195 | /* pid and depth on the last trace processed */ | 1376 | /* pid and depth on the last trace processed */ |
| @@ -1210,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter) | |||
| 1210 | pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); | 1391 | pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); |
| 1211 | int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); | 1392 | int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); |
| 1212 | int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); | 1393 | int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); |
| 1394 | int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); | ||
| 1395 | |||
| 1213 | *pid = -1; | 1396 | *pid = -1; |
| 1214 | *depth = 0; | 1397 | *depth = 0; |
| 1215 | *ignore = 0; | 1398 | *ignore = 0; |
| 1399 | *depth_irq = -1; | ||
| 1216 | } | 1400 | } |
| 1217 | 1401 | ||
| 1218 | iter->private = data; | 1402 | iter->private = data; |
| @@ -1235,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter) | |||
| 1235 | } | 1419 | } |
| 1236 | } | 1420 | } |
| 1237 | 1421 | ||
| 1422 | static int func_graph_set_flag(u32 old_flags, u32 bit, int set) | ||
| 1423 | { | ||
| 1424 | if (bit == TRACE_GRAPH_PRINT_IRQS) | ||
| 1425 | ftrace_graph_skip_irqs = !set; | ||
| 1426 | |||
| 1427 | return 0; | ||
| 1428 | } | ||
| 1429 | |||
| 1238 | static struct trace_event_functions graph_functions = { | 1430 | static struct trace_event_functions graph_functions = { |
| 1239 | .trace = print_graph_function_event, | 1431 | .trace = print_graph_function_event, |
| 1240 | }; | 1432 | }; |
| @@ -1261,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = { | |||
| 1261 | .print_line = print_graph_function, | 1453 | .print_line = print_graph_function, |
| 1262 | .print_header = print_graph_headers, | 1454 | .print_header = print_graph_headers, |
| 1263 | .flags = &tracer_flags, | 1455 | .flags = &tracer_flags, |
| 1456 | .set_flag = func_graph_set_flag, | ||
| 1264 | #ifdef CONFIG_FTRACE_SELFTEST | 1457 | #ifdef CONFIG_FTRACE_SELFTEST |
| 1265 | .selftest = trace_selftest_startup_function_graph, | 1458 | .selftest = trace_selftest_startup_function_graph, |
| 1266 | #endif | 1459 | #endif |
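The new funcgraph-irqs option filters interrupts at two levels: at record time, ftrace_graph_ignore_irqs() drops entries while in_irq() is true, and at output time check_irq_entry()/check_irq_return() suppress the trace from the moment a function in the irq-entry text section is hit until the matching return. A minimal sketch of the underlying address-range test (the __irqentry_text_* symbols are the real linker-provided section bounds; the helper name is made up):

  extern char __irqentry_text_start[], __irqentry_text_end[];

  /* Non-zero if @addr points into the irq-entry text section. */
  static int in_irqentry_text(unsigned long addr)
  {
          return addr >= (unsigned long)__irqentry_text_start &&
                 addr <  (unsigned long)__irqentry_text_end;
  }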
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 73a6b0601f2e..5cf8c602b880 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
| @@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence; | |||
| 87 | 87 | ||
| 88 | #ifdef CONFIG_FUNCTION_TRACER | 88 | #ifdef CONFIG_FUNCTION_TRACER |
| 89 | /* | 89 | /* |
| 90 | * irqsoff uses its own tracer function to keep the overhead down: | 90 | * Prologue for the preempt and irqs off function tracers. |
| 91 | * | ||
| 92 | * Returns 1 if it is OK to continue, and data->disabled is | ||
| 93 | * incremented. | ||
| 94 | * 0 if the trace is to be ignored, and data->disabled | ||
| 95 | * is kept the same. | ||
| 96 | * | ||
| 97 | * Note, this function is also used outside this ifdef but | ||
| 98 | * inside the #ifdef of the function graph tracer below. | ||
| 99 | * This is OK, since the function graph tracer is | ||
| 100 | * dependent on the function tracer. | ||
| 91 | */ | 101 | */ |
| 92 | static void | 102 | static int func_prolog_dec(struct trace_array *tr, |
| 93 | irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | 103 | struct trace_array_cpu **data, |
| 104 | unsigned long *flags) | ||
| 94 | { | 105 | { |
| 95 | struct trace_array *tr = irqsoff_trace; | ||
| 96 | struct trace_array_cpu *data; | ||
| 97 | unsigned long flags; | ||
| 98 | long disabled; | 106 | long disabled; |
| 99 | int cpu; | 107 | int cpu; |
| 100 | 108 | ||
| @@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
| 106 | */ | 114 | */ |
| 107 | cpu = raw_smp_processor_id(); | 115 | cpu = raw_smp_processor_id(); |
| 108 | if (likely(!per_cpu(tracing_cpu, cpu))) | 116 | if (likely(!per_cpu(tracing_cpu, cpu))) |
| 109 | return; | 117 | return 0; |
| 110 | 118 | ||
| 111 | local_save_flags(flags); | 119 | local_save_flags(*flags); |
| 112 | /* slight chance to get a false positive on tracing_cpu */ | 120 | /* slight chance to get a false positive on tracing_cpu */ |
| 113 | if (!irqs_disabled_flags(flags)) | 121 | if (!irqs_disabled_flags(*flags)) |
| 114 | return; | 122 | return 0; |
| 115 | 123 | ||
| 116 | data = tr->data[cpu]; | 124 | *data = tr->data[cpu]; |
| 117 | disabled = atomic_inc_return(&data->disabled); | 125 | disabled = atomic_inc_return(&(*data)->disabled); |
| 118 | 126 | ||
| 119 | if (likely(disabled == 1)) | 127 | if (likely(disabled == 1)) |
| 120 | trace_function(tr, ip, parent_ip, flags, preempt_count()); | 128 | return 1; |
| 129 | |||
| 130 | atomic_dec(&(*data)->disabled); | ||
| 131 | |||
| 132 | return 0; | ||
| 133 | } | ||
| 134 | |||
| 135 | /* | ||
| 136 | * irqsoff uses its own tracer function to keep the overhead down: | ||
| 137 | */ | ||
| 138 | static void | ||
| 139 | irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | ||
| 140 | { | ||
| 141 | struct trace_array *tr = irqsoff_trace; | ||
| 142 | struct trace_array_cpu *data; | ||
| 143 | unsigned long flags; | ||
| 144 | |||
| 145 | if (!func_prolog_dec(tr, &data, &flags)) | ||
| 146 | return; | ||
| 147 | |||
| 148 | trace_function(tr, ip, parent_ip, flags, preempt_count()); | ||
| 121 | 149 | ||
| 122 | atomic_dec(&data->disabled); | 150 | atomic_dec(&data->disabled); |
| 123 | } | 151 | } |
| @@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) | |||
| 155 | struct trace_array *tr = irqsoff_trace; | 183 | struct trace_array *tr = irqsoff_trace; |
| 156 | struct trace_array_cpu *data; | 184 | struct trace_array_cpu *data; |
| 157 | unsigned long flags; | 185 | unsigned long flags; |
| 158 | long disabled; | ||
| 159 | int ret; | 186 | int ret; |
| 160 | int cpu; | ||
| 161 | int pc; | 187 | int pc; |
| 162 | 188 | ||
| 163 | cpu = raw_smp_processor_id(); | 189 | if (!func_prolog_dec(tr, &data, &flags)) |
| 164 | if (likely(!per_cpu(tracing_cpu, cpu))) | ||
| 165 | return 0; | 190 | return 0; |
| 166 | 191 | ||
| 167 | local_save_flags(flags); | 192 | pc = preempt_count(); |
| 168 | /* slight chance to get a false positive on tracing_cpu */ | 193 | ret = __trace_graph_entry(tr, trace, flags, pc); |
| 169 | if (!irqs_disabled_flags(flags)) | ||
| 170 | return 0; | ||
| 171 | |||
| 172 | data = tr->data[cpu]; | ||
| 173 | disabled = atomic_inc_return(&data->disabled); | ||
| 174 | |||
| 175 | if (likely(disabled == 1)) { | ||
| 176 | pc = preempt_count(); | ||
| 177 | ret = __trace_graph_entry(tr, trace, flags, pc); | ||
| 178 | } else | ||
| 179 | ret = 0; | ||
| 180 | |||
| 181 | atomic_dec(&data->disabled); | 194 | atomic_dec(&data->disabled); |
| 195 | |||
| 182 | return ret; | 196 | return ret; |
| 183 | } | 197 | } |
| 184 | 198 | ||
| @@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) | |||
| 187 | struct trace_array *tr = irqsoff_trace; | 201 | struct trace_array *tr = irqsoff_trace; |
| 188 | struct trace_array_cpu *data; | 202 | struct trace_array_cpu *data; |
| 189 | unsigned long flags; | 203 | unsigned long flags; |
| 190 | long disabled; | ||
| 191 | int cpu; | ||
| 192 | int pc; | 204 | int pc; |
| 193 | 205 | ||
| 194 | cpu = raw_smp_processor_id(); | 206 | if (!func_prolog_dec(tr, &data, &flags)) |
| 195 | if (likely(!per_cpu(tracing_cpu, cpu))) | ||
| 196 | return; | 207 | return; |
| 197 | 208 | ||
| 198 | local_save_flags(flags); | 209 | pc = preempt_count(); |
| 199 | /* slight chance to get a false positive on tracing_cpu */ | 210 | __trace_graph_return(tr, trace, flags, pc); |
| 200 | if (!irqs_disabled_flags(flags)) | ||
| 201 | return; | ||
| 202 | |||
| 203 | data = tr->data[cpu]; | ||
| 204 | disabled = atomic_inc_return(&data->disabled); | ||
| 205 | |||
| 206 | if (likely(disabled == 1)) { | ||
| 207 | pc = preempt_count(); | ||
| 208 | __trace_graph_return(tr, trace, flags, pc); | ||
| 209 | } | ||
| 210 | |||
| 211 | atomic_dec(&data->disabled); | 211 | atomic_dec(&data->disabled); |
| 212 | } | 212 | } |
| 213 | 213 | ||
| @@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter) | |||
| 229 | 229 | ||
| 230 | static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) | 230 | static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) |
| 231 | { | 231 | { |
| 232 | u32 flags = GRAPH_TRACER_FLAGS; | ||
| 233 | |||
| 234 | if (trace_flags & TRACE_ITER_LATENCY_FMT) | ||
| 235 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
| 236 | else | ||
| 237 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
| 238 | |||
| 239 | /* | 232 | /* |
| 240 | * In graph mode call the graph tracer output function, | 233 | * In graph mode call the graph tracer output function, |
| 241 | * otherwise go with the TRACE_FN event handler | 234 | * otherwise go with the TRACE_FN event handler |
| 242 | */ | 235 | */ |
| 243 | if (is_graph()) | 236 | if (is_graph()) |
| 244 | return print_graph_function_flags(iter, flags); | 237 | return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); |
| 245 | 238 | ||
| 246 | return TRACE_TYPE_UNHANDLED; | 239 | return TRACE_TYPE_UNHANDLED; |
| 247 | } | 240 | } |
| 248 | 241 | ||
| 249 | static void irqsoff_print_header(struct seq_file *s) | 242 | static void irqsoff_print_header(struct seq_file *s) |
| 250 | { | 243 | { |
| 251 | if (is_graph()) { | 244 | if (is_graph()) |
| 252 | struct trace_iterator *iter = s->private; | 245 | print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); |
| 253 | u32 flags = GRAPH_TRACER_FLAGS; | 246 | else |
| 254 | |||
| 255 | if (trace_flags & TRACE_ITER_LATENCY_FMT) { | ||
| 256 | /* print nothing if the buffers are empty */ | ||
| 257 | if (trace_empty(iter)) | ||
| 258 | return; | ||
| 259 | |||
| 260 | print_trace_header(s, iter); | ||
| 261 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
| 262 | } else | ||
| 263 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
| 264 | |||
| 265 | print_graph_headers_flags(s, flags); | ||
| 266 | } else | ||
| 267 | trace_default_header(s); | 247 | trace_default_header(s); |
| 268 | } | 248 | } |
| 269 | 249 | ||
| 270 | static void | 250 | static void |
| 271 | trace_graph_function(struct trace_array *tr, | ||
| 272 | unsigned long ip, unsigned long flags, int pc) | ||
| 273 | { | ||
| 274 | u64 time = trace_clock_local(); | ||
| 275 | struct ftrace_graph_ent ent = { | ||
| 276 | .func = ip, | ||
| 277 | .depth = 0, | ||
| 278 | }; | ||
| 279 | struct ftrace_graph_ret ret = { | ||
| 280 | .func = ip, | ||
| 281 | .depth = 0, | ||
| 282 | .calltime = time, | ||
| 283 | .rettime = time, | ||
| 284 | }; | ||
| 285 | |||
| 286 | __trace_graph_entry(tr, &ent, flags, pc); | ||
| 287 | __trace_graph_return(tr, &ret, flags, pc); | ||
| 288 | } | ||
| 289 | |||
| 290 | static void | ||
| 291 | __trace_function(struct trace_array *tr, | 251 | __trace_function(struct trace_array *tr, |
| 292 | unsigned long ip, unsigned long parent_ip, | 252 | unsigned long ip, unsigned long parent_ip, |
| 293 | unsigned long flags, int pc) | 253 | unsigned long flags, int pc) |
| 294 | { | 254 | { |
| 295 | if (!is_graph()) | 255 | if (is_graph()) |
| 256 | trace_graph_function(tr, ip, parent_ip, flags, pc); | ||
| 257 | else | ||
| 296 | trace_function(tr, ip, parent_ip, flags, pc); | 258 | trace_function(tr, ip, parent_ip, flags, pc); |
| 297 | else { | ||
| 298 | trace_graph_function(tr, parent_ip, flags, pc); | ||
| 299 | trace_graph_function(tr, ip, flags, pc); | ||
| 300 | } | ||
| 301 | } | 259 | } |
| 302 | 260 | ||
| 303 | #else | 261 | #else |
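func_prolog_dec() above folds the repeated per-cpu checks into one place, but the core idiom is unchanged: a 'disabled' counter where only the first increment (atomic_inc_return() == 1) may record, and every path decrements again on the way out. A stripped-down illustration of that gate (names are made up and the single global counter is a simplification; the real tracers keep the counter per cpu in trace_array_cpu):

  #include <asm/atomic.h>        /* <linux/atomic.h> in later trees */

  static atomic_t example_disabled;

  static void example_trace_hook(void)
  {
          long disabled = atomic_inc_return(&example_disabled);

          if (disabled == 1) {
                  /* Safe to record the event here. */
          }

          /* Always rebalance, whether or not anything was recorded. */
          atomic_dec(&example_disabled);
  }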
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8b27c9849b42..544301d29dee 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -514,8 +514,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); | |||
| 514 | static int kretprobe_dispatcher(struct kretprobe_instance *ri, | 514 | static int kretprobe_dispatcher(struct kretprobe_instance *ri, |
| 515 | struct pt_regs *regs); | 515 | struct pt_regs *regs); |
| 516 | 516 | ||
| 517 | /* Check the name is good for event/group */ | 517 | /* Check the name is good for event/group/fields */ |
| 518 | static int check_event_name(const char *name) | 518 | static int is_good_name(const char *name) |
| 519 | { | 519 | { |
| 520 | if (!isalpha(*name) && *name != '_') | 520 | if (!isalpha(*name) && *name != '_') |
| 521 | return 0; | 521 | return 0; |
| @@ -557,7 +557,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, | |||
| 557 | else | 557 | else |
| 558 | tp->rp.kp.pre_handler = kprobe_dispatcher; | 558 | tp->rp.kp.pre_handler = kprobe_dispatcher; |
| 559 | 559 | ||
| 560 | if (!event || !check_event_name(event)) { | 560 | if (!event || !is_good_name(event)) { |
| 561 | ret = -EINVAL; | 561 | ret = -EINVAL; |
| 562 | goto error; | 562 | goto error; |
| 563 | } | 563 | } |
| @@ -567,7 +567,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, | |||
| 567 | if (!tp->call.name) | 567 | if (!tp->call.name) |
| 568 | goto error; | 568 | goto error; |
| 569 | 569 | ||
| 570 | if (!group || !check_event_name(group)) { | 570 | if (!group || !is_good_name(group)) { |
| 571 | ret = -EINVAL; | 571 | ret = -EINVAL; |
| 572 | goto error; | 572 | goto error; |
| 573 | } | 573 | } |
| @@ -883,7 +883,7 @@ static int create_trace_probe(int argc, char **argv) | |||
| 883 | int i, ret = 0; | 883 | int i, ret = 0; |
| 884 | int is_return = 0, is_delete = 0; | 884 | int is_return = 0, is_delete = 0; |
| 885 | char *symbol = NULL, *event = NULL, *group = NULL; | 885 | char *symbol = NULL, *event = NULL, *group = NULL; |
| 886 | char *arg, *tmp; | 886 | char *arg; |
| 887 | unsigned long offset = 0; | 887 | unsigned long offset = 0; |
| 888 | void *addr = NULL; | 888 | void *addr = NULL; |
| 889 | char buf[MAX_EVENT_NAME_LEN]; | 889 | char buf[MAX_EVENT_NAME_LEN]; |
| @@ -992,26 +992,36 @@ static int create_trace_probe(int argc, char **argv) | |||
| 992 | /* parse arguments */ | 992 | /* parse arguments */ |
| 993 | ret = 0; | 993 | ret = 0; |
| 994 | for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { | 994 | for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { |
| 995 | /* Increment count for freeing args in error case */ | ||
| 996 | tp->nr_args++; | ||
| 997 | |||
| 995 | /* Parse argument name */ | 998 | /* Parse argument name */ |
| 996 | arg = strchr(argv[i], '='); | 999 | arg = strchr(argv[i], '='); |
| 997 | if (arg) | 1000 | if (arg) { |
| 998 | *arg++ = '\0'; | 1001 | *arg++ = '\0'; |
| 999 | else | 1002 | tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); |
| 1003 | } else { | ||
| 1000 | arg = argv[i]; | 1004 | arg = argv[i]; |
| 1005 | /* If argument name is omitted, set "argN" */ | ||
| 1006 | snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); | ||
| 1007 | tp->args[i].name = kstrdup(buf, GFP_KERNEL); | ||
| 1008 | } | ||
| 1001 | 1009 | ||
| 1002 | tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); | ||
| 1003 | if (!tp->args[i].name) { | 1010 | if (!tp->args[i].name) { |
| 1004 | pr_info("Failed to allocate argument%d name '%s'.\n", | 1011 | pr_info("Failed to allocate argument[%d] name.\n", i); |
| 1005 | i, argv[i]); | ||
| 1006 | ret = -ENOMEM; | 1012 | ret = -ENOMEM; |
| 1007 | goto error; | 1013 | goto error; |
| 1008 | } | 1014 | } |
| 1009 | tmp = strchr(tp->args[i].name, ':'); | 1015 | |
| 1010 | if (tmp) | 1016 | if (!is_good_name(tp->args[i].name)) { |
| 1011 | *tmp = '_'; /* convert : to _ */ | 1017 | pr_info("Invalid argument[%d] name: %s\n", |
| 1018 | i, tp->args[i].name); | ||
| 1019 | ret = -EINVAL; | ||
| 1020 | goto error; | ||
| 1021 | } | ||
| 1012 | 1022 | ||
| 1013 | if (conflict_field_name(tp->args[i].name, tp->args, i)) { | 1023 | if (conflict_field_name(tp->args[i].name, tp->args, i)) { |
| 1014 | pr_info("Argument%d name '%s' conflicts with " | 1024 | pr_info("Argument[%d] name '%s' conflicts with " |
| 1015 | "another field.\n", i, argv[i]); | 1025 | "another field.\n", i, argv[i]); |
| 1016 | ret = -EINVAL; | 1026 | ret = -EINVAL; |
| 1017 | goto error; | 1027 | goto error; |
| @@ -1020,12 +1030,9 @@ static int create_trace_probe(int argc, char **argv) | |||
| 1020 | /* Parse fetch argument */ | 1030 | /* Parse fetch argument */ |
| 1021 | ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); | 1031 | ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); |
| 1022 | if (ret) { | 1032 | if (ret) { |
| 1023 | pr_info("Parse error at argument%d. (%d)\n", i, ret); | 1033 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); |
| 1024 | kfree(tp->args[i].name); | ||
| 1025 | goto error; | 1034 | goto error; |
| 1026 | } | 1035 | } |
| 1027 | |||
| 1028 | tp->nr_args++; | ||
| 1029 | } | 1036 | } |
| 1030 | 1037 | ||
| 1031 | ret = register_trace_probe(tp); | 1038 | ret = register_trace_probe(tp); |
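With this change is_good_name() guards argument names as well as event and group names, so every user-supplied name must look like a C identifier, and omitted names default to arg1, arg2, and so on. A plain-C equivalent of the check, written with <ctype.h> so it can be tried outside the kernel (the function name here is made up):

  #include <ctype.h>

  /* Same shape as is_good_name(): [A-Za-z_] followed by [A-Za-z0-9_]*. */
  static int is_identifier_like(const char *name)
  {
          if (!isalpha((unsigned char)*name) && *name != '_')
                  return 0;
          while (*++name != '\0') {
                  if (!isalnum((unsigned char)*name) && *name != '_')
                          return 0;
          }
          return 1;
  }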
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4086eae6e81b..7319559ed59f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
| @@ -31,48 +31,98 @@ static int wakeup_rt; | |||
| 31 | static arch_spinlock_t wakeup_lock = | 31 | static arch_spinlock_t wakeup_lock = |
| 32 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 32 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
| 33 | 33 | ||
| 34 | static void wakeup_reset(struct trace_array *tr); | ||
| 34 | static void __wakeup_reset(struct trace_array *tr); | 35 | static void __wakeup_reset(struct trace_array *tr); |
| 36 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace); | ||
| 37 | static void wakeup_graph_return(struct ftrace_graph_ret *trace); | ||
| 35 | 38 | ||
| 36 | static int save_lat_flag; | 39 | static int save_lat_flag; |
| 37 | 40 | ||
| 41 | #define TRACE_DISPLAY_GRAPH 1 | ||
| 42 | |||
| 43 | static struct tracer_opt trace_opts[] = { | ||
| 44 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | ||
| 45 | /* display latency trace as call graph */ | ||
| 46 | { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, | ||
| 47 | #endif | ||
| 48 | { } /* Empty entry */ | ||
| 49 | }; | ||
| 50 | |||
| 51 | static struct tracer_flags tracer_flags = { | ||
| 52 | .val = 0, | ||
| 53 | .opts = trace_opts, | ||
| 54 | }; | ||
| 55 | |||
| 56 | #define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) | ||
| 57 | |||
| 38 | #ifdef CONFIG_FUNCTION_TRACER | 58 | #ifdef CONFIG_FUNCTION_TRACER |
| 59 | |||
| 39 | /* | 60 | /* |
| 40 | * irqsoff uses its own tracer function to keep the overhead down: | 61 | * Prologue for the wakeup function tracers. |
| 62 | * | ||
| 63 | * Returns 1 if it is OK to continue, and preemption | ||
| 64 | * is disabled and data->disabled is incremented. | ||
| 65 | * 0 if the trace is to be ignored, and preemption | ||
| 66 | * is not disabled and data->disabled is | ||
| 67 | * kept the same. | ||
| 68 | * | ||
| 69 | * Note, this function is also used outside this ifdef but | ||
| 70 | * inside the #ifdef of the function graph tracer below. | ||
| 71 | * This is OK, since the function graph tracer is | ||
| 72 | * dependent on the function tracer. | ||
| 41 | */ | 73 | */ |
| 42 | static void | 74 | static int |
| 43 | wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | 75 | func_prolog_preempt_disable(struct trace_array *tr, |
| 76 | struct trace_array_cpu **data, | ||
| 77 | int *pc) | ||
| 44 | { | 78 | { |
| 45 | struct trace_array *tr = wakeup_trace; | ||
| 46 | struct trace_array_cpu *data; | ||
| 47 | unsigned long flags; | ||
| 48 | long disabled; | 79 | long disabled; |
| 49 | int cpu; | 80 | int cpu; |
| 50 | int pc; | ||
| 51 | 81 | ||
| 52 | if (likely(!wakeup_task)) | 82 | if (likely(!wakeup_task)) |
| 53 | return; | 83 | return 0; |
| 54 | 84 | ||
| 55 | pc = preempt_count(); | 85 | *pc = preempt_count(); |
| 56 | preempt_disable_notrace(); | 86 | preempt_disable_notrace(); |
| 57 | 87 | ||
| 58 | cpu = raw_smp_processor_id(); | 88 | cpu = raw_smp_processor_id(); |
| 59 | if (cpu != wakeup_current_cpu) | 89 | if (cpu != wakeup_current_cpu) |
| 60 | goto out_enable; | 90 | goto out_enable; |
| 61 | 91 | ||
| 62 | data = tr->data[cpu]; | 92 | *data = tr->data[cpu]; |
| 63 | disabled = atomic_inc_return(&data->disabled); | 93 | disabled = atomic_inc_return(&(*data)->disabled); |
| 64 | if (unlikely(disabled != 1)) | 94 | if (unlikely(disabled != 1)) |
| 65 | goto out; | 95 | goto out; |
| 66 | 96 | ||
| 67 | local_irq_save(flags); | 97 | return 1; |
| 68 | 98 | ||
| 69 | trace_function(tr, ip, parent_ip, flags, pc); | 99 | out: |
| 100 | atomic_dec(&(*data)->disabled); | ||
| 101 | |||
| 102 | out_enable: | ||
| 103 | preempt_enable_notrace(); | ||
| 104 | return 0; | ||
| 105 | } | ||
| 70 | 106 | ||
| 107 | /* | ||
| 108 | * wakeup uses its own tracer function to keep the overhead down: | ||
| 109 | */ | ||
| 110 | static void | ||
| 111 | wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | ||
| 112 | { | ||
| 113 | struct trace_array *tr = wakeup_trace; | ||
| 114 | struct trace_array_cpu *data; | ||
| 115 | unsigned long flags; | ||
| 116 | int pc; | ||
| 117 | |||
| 118 | if (!func_prolog_preempt_disable(tr, &data, &pc)) | ||
| 119 | return; | ||
| 120 | |||
| 121 | local_irq_save(flags); | ||
| 122 | trace_function(tr, ip, parent_ip, flags, pc); | ||
| 71 | local_irq_restore(flags); | 123 | local_irq_restore(flags); |
| 72 | 124 | ||
| 73 | out: | ||
| 74 | atomic_dec(&data->disabled); | 125 | atomic_dec(&data->disabled); |
| 75 | out_enable: | ||
| 76 | preempt_enable_notrace(); | 126 | preempt_enable_notrace(); |
| 77 | } | 127 | } |
| 78 | 128 | ||
| @@ -82,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly = | |||
| 82 | }; | 132 | }; |
| 83 | #endif /* CONFIG_FUNCTION_TRACER */ | 133 | #endif /* CONFIG_FUNCTION_TRACER */ |
| 84 | 134 | ||
| 135 | static int start_func_tracer(int graph) | ||
| 136 | { | ||
| 137 | int ret; | ||
| 138 | |||
| 139 | if (!graph) | ||
| 140 | ret = register_ftrace_function(&trace_ops); | ||
| 141 | else | ||
| 142 | ret = register_ftrace_graph(&wakeup_graph_return, | ||
| 143 | &wakeup_graph_entry); | ||
| 144 | |||
| 145 | if (!ret && tracing_is_enabled()) | ||
| 146 | tracer_enabled = 1; | ||
| 147 | else | ||
| 148 | tracer_enabled = 0; | ||
| 149 | |||
| 150 | return ret; | ||
| 151 | } | ||
| 152 | |||
| 153 | static void stop_func_tracer(int graph) | ||
| 154 | { | ||
| 155 | tracer_enabled = 0; | ||
| 156 | |||
| 157 | if (!graph) | ||
| 158 | unregister_ftrace_function(&trace_ops); | ||
| 159 | else | ||
| 160 | unregister_ftrace_graph(); | ||
| 161 | } | ||
| 162 | |||
| 163 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | ||
| 164 | static int wakeup_set_flag(u32 old_flags, u32 bit, int set) | ||
| 165 | { | ||
| 166 | |||
| 167 | if (!(bit & TRACE_DISPLAY_GRAPH)) | ||
| 168 | return -EINVAL; | ||
| 169 | |||
| 170 | if (!(is_graph() ^ set)) | ||
| 171 | return 0; | ||
| 172 | |||
| 173 | stop_func_tracer(!set); | ||
| 174 | |||
| 175 | wakeup_reset(wakeup_trace); | ||
| 176 | tracing_max_latency = 0; | ||
| 177 | |||
| 178 | return start_func_tracer(set); | ||
| 179 | } | ||
| 180 | |||
| 181 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace) | ||
| 182 | { | ||
| 183 | struct trace_array *tr = wakeup_trace; | ||
| 184 | struct trace_array_cpu *data; | ||
| 185 | unsigned long flags; | ||
| 186 | int pc, ret = 0; | ||
| 187 | |||
| 188 | if (!func_prolog_preempt_disable(tr, &data, &pc)) | ||
| 189 | return 0; | ||
| 190 | |||
| 191 | local_save_flags(flags); | ||
| 192 | ret = __trace_graph_entry(tr, trace, flags, pc); | ||
| 193 | atomic_dec(&data->disabled); | ||
| 194 | preempt_enable_notrace(); | ||
| 195 | |||
| 196 | return ret; | ||
| 197 | } | ||
| 198 | |||
| 199 | static void wakeup_graph_return(struct ftrace_graph_ret *trace) | ||
| 200 | { | ||
| 201 | struct trace_array *tr = wakeup_trace; | ||
| 202 | struct trace_array_cpu *data; | ||
| 203 | unsigned long flags; | ||
| 204 | int pc; | ||
| 205 | |||
| 206 | if (!func_prolog_preempt_disable(tr, &data, &pc)) | ||
| 207 | return; | ||
| 208 | |||
| 209 | local_save_flags(flags); | ||
| 210 | __trace_graph_return(tr, trace, flags, pc); | ||
| 211 | atomic_dec(&data->disabled); | ||
| 212 | |||
| 213 | preempt_enable_notrace(); | ||
| 214 | return; | ||
| 215 | } | ||
| 216 | |||
| 217 | static void wakeup_trace_open(struct trace_iterator *iter) | ||
| 218 | { | ||
| 219 | if (is_graph()) | ||
| 220 | graph_trace_open(iter); | ||
| 221 | } | ||
| 222 | |||
| 223 | static void wakeup_trace_close(struct trace_iterator *iter) | ||
| 224 | { | ||
| 225 | if (iter->private) | ||
| 226 | graph_trace_close(iter); | ||
| 227 | } | ||
| 228 | |||
| 229 | #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) | ||
| 230 | |||
| 231 | static enum print_line_t wakeup_print_line(struct trace_iterator *iter) | ||
| 232 | { | ||
| 233 | /* | ||
| 234 | * In graph mode call the graph tracer output function, | ||
| 235 | * otherwise go with the TRACE_FN event handler | ||
| 236 | */ | ||
| 237 | if (is_graph()) | ||
| 238 | return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); | ||
| 239 | |||
| 240 | return TRACE_TYPE_UNHANDLED; | ||
| 241 | } | ||
| 242 | |||
| 243 | static void wakeup_print_header(struct seq_file *s) | ||
| 244 | { | ||
| 245 | if (is_graph()) | ||
| 246 | print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); | ||
| 247 | else | ||
| 248 | trace_default_header(s); | ||
| 249 | } | ||
| 250 | |||
| 251 | static void | ||
| 252 | __trace_function(struct trace_array *tr, | ||
| 253 | unsigned long ip, unsigned long parent_ip, | ||
| 254 | unsigned long flags, int pc) | ||
| 255 | { | ||
| 256 | if (is_graph()) | ||
| 257 | trace_graph_function(tr, ip, parent_ip, flags, pc); | ||
| 258 | else | ||
| 259 | trace_function(tr, ip, parent_ip, flags, pc); | ||
| 260 | } | ||
| 261 | #else | ||
| 262 | #define __trace_function trace_function | ||
| 263 | |||
| 264 | static int wakeup_set_flag(u32 old_flags, u32 bit, int set) | ||
| 265 | { | ||
| 266 | return -EINVAL; | ||
| 267 | } | ||
| 268 | |||
| 269 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace) | ||
| 270 | { | ||
| 271 | return -1; | ||
| 272 | } | ||
| 273 | |||
| 274 | static enum print_line_t wakeup_print_line(struct trace_iterator *iter) | ||
| 275 | { | ||
| 276 | return TRACE_TYPE_UNHANDLED; | ||
| 277 | } | ||
| 278 | |||
| 279 | static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } | ||
| 280 | static void wakeup_print_header(struct seq_file *s) { } | ||
| 281 | static void wakeup_trace_open(struct trace_iterator *iter) { } | ||
| 282 | static void wakeup_trace_close(struct trace_iterator *iter) { } | ||
| 283 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | ||
| 284 | |||
| 85 | /* | 285 | /* |
| 86 | * Should this new latency be reported/recorded? | 286 | * Should this new latency be reported/recorded? |
| 87 | */ | 287 | */ |
| @@ -152,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore, | |||
| 152 | /* The task we are waiting for is waking up */ | 352 | /* The task we are waiting for is waking up */ |
| 153 | data = wakeup_trace->data[wakeup_cpu]; | 353 | data = wakeup_trace->data[wakeup_cpu]; |
| 154 | 354 | ||
| 155 | trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); | 355 | __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); |
| 156 | tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); | 356 | tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); |
| 157 | 357 | ||
| 158 | T0 = data->preempt_timestamp; | 358 | T0 = data->preempt_timestamp; |
| @@ -252,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) | |||
| 252 | * is not called by an assembly function (whereas schedule is) | 452 | * is not called by an assembly function (whereas schedule is) |
| 253 | * it should be safe to use it here. | 453 | * it should be safe to use it here. |
| 254 | */ | 454 | */ |
| 255 | trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); | 455 | __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); |
| 256 | 456 | ||
| 257 | out_locked: | 457 | out_locked: |
| 258 | arch_spin_unlock(&wakeup_lock); | 458 | arch_spin_unlock(&wakeup_lock); |
| @@ -303,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr) | |||
| 303 | */ | 503 | */ |
| 304 | smp_wmb(); | 504 | smp_wmb(); |
| 305 | 505 | ||
| 306 | register_ftrace_function(&trace_ops); | 506 | if (start_func_tracer(is_graph())) |
| 307 | 507 | printk(KERN_ERR "failed to start wakeup tracer\n"); | |
| 308 | if (tracing_is_enabled()) | ||
| 309 | tracer_enabled = 1; | ||
| 310 | else | ||
| 311 | tracer_enabled = 0; | ||
| 312 | 508 | ||
| 313 | return; | 509 | return; |
| 314 | fail_deprobe_wake_new: | 510 | fail_deprobe_wake_new: |
| @@ -320,7 +516,7 @@ fail_deprobe: | |||
| 320 | static void stop_wakeup_tracer(struct trace_array *tr) | 516 | static void stop_wakeup_tracer(struct trace_array *tr) |
| 321 | { | 517 | { |
| 322 | tracer_enabled = 0; | 518 | tracer_enabled = 0; |
| 323 | unregister_ftrace_function(&trace_ops); | 519 | stop_func_tracer(is_graph()); |
| 324 | unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); | 520 | unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); |
| 325 | unregister_trace_sched_wakeup_new(probe_wakeup, NULL); | 521 | unregister_trace_sched_wakeup_new(probe_wakeup, NULL); |
| 326 | unregister_trace_sched_wakeup(probe_wakeup, NULL); | 522 | unregister_trace_sched_wakeup(probe_wakeup, NULL); |
| @@ -379,9 +575,15 @@ static struct tracer wakeup_tracer __read_mostly = | |||
| 379 | .start = wakeup_tracer_start, | 575 | .start = wakeup_tracer_start, |
| 380 | .stop = wakeup_tracer_stop, | 576 | .stop = wakeup_tracer_stop, |
| 381 | .print_max = 1, | 577 | .print_max = 1, |
| 578 | .print_header = wakeup_print_header, | ||
| 579 | .print_line = wakeup_print_line, | ||
| 580 | .flags = &tracer_flags, | ||
| 581 | .set_flag = wakeup_set_flag, | ||
| 382 | #ifdef CONFIG_FTRACE_SELFTEST | 582 | #ifdef CONFIG_FTRACE_SELFTEST |
| 383 | .selftest = trace_selftest_startup_wakeup, | 583 | .selftest = trace_selftest_startup_wakeup, |
| 384 | #endif | 584 | #endif |
| 585 | .open = wakeup_trace_open, | ||
| 586 | .close = wakeup_trace_close, | ||
| 385 | .use_max_tr = 1, | 587 | .use_max_tr = 1, |
| 386 | }; | 588 | }; |
| 387 | 589 | ||
| @@ -394,9 +596,15 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
| 394 | .stop = wakeup_tracer_stop, | 596 | .stop = wakeup_tracer_stop, |
| 395 | .wait_pipe = poll_wait_pipe, | 597 | .wait_pipe = poll_wait_pipe, |
| 396 | .print_max = 1, | 598 | .print_max = 1, |
| 599 | .print_header = wakeup_print_header, | ||
| 600 | .print_line = wakeup_print_line, | ||
| 601 | .flags = &tracer_flags, | ||
| 602 | .set_flag = wakeup_set_flag, | ||
| 397 | #ifdef CONFIG_FTRACE_SELFTEST | 603 | #ifdef CONFIG_FTRACE_SELFTEST |
| 398 | .selftest = trace_selftest_startup_wakeup, | 604 | .selftest = trace_selftest_startup_wakeup, |
| 399 | #endif | 605 | #endif |
| 606 | .open = wakeup_trace_open, | ||
| 607 | .close = wakeup_trace_close, | ||
| 400 | .use_max_tr = 1, | 608 | .use_max_tr = 1, |
| 401 | }; | 609 | }; |
| 402 | 610 | ||
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index a7cc3793baf6..209b379a4721 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c | |||
| @@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void) | |||
| 263 | { | 263 | { |
| 264 | int ret, cpu; | 264 | int ret, cpu; |
| 265 | 265 | ||
| 266 | for_each_possible_cpu(cpu) { | ||
| 267 | spin_lock_init(&workqueue_cpu_stat(cpu)->lock); | ||
| 268 | INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); | ||
| 269 | } | ||
| 270 | |||
| 266 | ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); | 271 | ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); |
| 267 | if (ret) | 272 | if (ret) |
| 268 | goto out; | 273 | goto out; |
| @@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void) | |||
| 279 | if (ret) | 284 | if (ret) |
| 280 | goto no_creation; | 285 | goto no_creation; |
| 281 | 286 | ||
| 282 | for_each_possible_cpu(cpu) { | ||
| 283 | spin_lock_init(&workqueue_cpu_stat(cpu)->lock); | ||
| 284 | INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); | ||
| 285 | } | ||
| 286 | |||
| 287 | return 0; | 287 | return 0; |
| 288 | 288 | ||
| 289 | no_creation: | 289 | no_creation: |
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index c77f3eceea25..e95ee7f31d43 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | #include <linux/err.h> | 25 | #include <linux/err.h> |
| 26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
| 27 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
| 28 | #include <linux/jump_label.h> | ||
| 28 | 29 | ||
| 29 | extern struct tracepoint __start___tracepoints[]; | 30 | extern struct tracepoint __start___tracepoints[]; |
| 30 | extern struct tracepoint __stop___tracepoints[]; | 31 | extern struct tracepoint __stop___tracepoints[]; |
| @@ -263,7 +264,13 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
| 263 | * is used. | 264 | * is used. |
| 264 | */ | 265 | */ |
| 265 | rcu_assign_pointer(elem->funcs, (*entry)->funcs); | 266 | rcu_assign_pointer(elem->funcs, (*entry)->funcs); |
| 266 | elem->state = active; | 267 | if (!elem->state && active) { |
| 268 | jump_label_enable(&elem->state); | ||
| 269 | elem->state = active; | ||
| 270 | } else if (elem->state && !active) { | ||
| 271 | jump_label_disable(&elem->state); | ||
| 272 | elem->state = active; | ||
| 273 | } | ||
| 267 | } | 274 | } |
| 268 | 275 | ||
| 269 | /* | 276 | /* |
| @@ -277,7 +284,10 @@ static void disable_tracepoint(struct tracepoint *elem) | |||
| 277 | if (elem->unregfunc && elem->state) | 284 | if (elem->unregfunc && elem->state) |
| 278 | elem->unregfunc(); | 285 | elem->unregfunc(); |
| 279 | 286 | ||
| 280 | elem->state = 0; | 287 | if (elem->state) { |
| 288 | jump_label_disable(&elem->state); | ||
| 289 | elem->state = 0; | ||
| 290 | } | ||
| 281 | rcu_assign_pointer(elem->funcs, NULL); | 291 | rcu_assign_pointer(elem->funcs, NULL); |
| 282 | } | 292 | } |
| 283 | 293 | ||
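The tracepoint.c hunk above pairs the existing integer state with jump-label patching, flipping the key only on a real off/on transition so the enable and disable calls stay balanced. A condensed restatement of that transition logic (the helper is hypothetical; 'state' stands in for elem->state, and jump_label_enable()/jump_label_disable() are the helpers provided by this era's <linux/jump_label.h>):

  #include <linux/jump_label.h>

  /* Flip the jump label only when the enabled state actually changes. */
  static void example_set_state(int *state, int active)
  {
          if (!*state && active) {
                  jump_label_enable(state);
                  *state = active;
          } else if (*state && !active) {
                  jump_label_disable(state);
                  *state = active;
          }
  }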
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 0d53c8e853b1..bafba687a6d8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -43,7 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); | |||
| 43 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | 43 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); |
| 44 | #endif | 44 | #endif |
| 45 | 45 | ||
| 46 | static int __read_mostly did_panic; | ||
| 47 | static int __initdata no_watchdog; | 46 | static int __initdata no_watchdog; |
| 48 | 47 | ||
| 49 | 48 | ||
| @@ -122,7 +121,7 @@ static void __touch_watchdog(void) | |||
| 122 | 121 | ||
| 123 | void touch_softlockup_watchdog(void) | 122 | void touch_softlockup_watchdog(void) |
| 124 | { | 123 | { |
| 125 | __get_cpu_var(watchdog_touch_ts) = 0; | 124 | __raw_get_cpu_var(watchdog_touch_ts) = 0; |
| 126 | } | 125 | } |
| 127 | EXPORT_SYMBOL(touch_softlockup_watchdog); | 126 | EXPORT_SYMBOL(touch_softlockup_watchdog); |
| 128 | 127 | ||
| @@ -142,7 +141,14 @@ void touch_all_softlockup_watchdogs(void) | |||
| 142 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 141 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
| 143 | void touch_nmi_watchdog(void) | 142 | void touch_nmi_watchdog(void) |
| 144 | { | 143 | { |
| 145 | __get_cpu_var(watchdog_nmi_touch) = true; | 144 | if (watchdog_enabled) { |
| 145 | unsigned cpu; | ||
| 146 | |||
| 147 | for_each_present_cpu(cpu) { | ||
| 148 | if (per_cpu(watchdog_nmi_touch, cpu) != true) | ||
| 149 | per_cpu(watchdog_nmi_touch, cpu) = true; | ||
| 150 | } | ||
| 151 | } | ||
| 146 | touch_softlockup_watchdog(); | 152 | touch_softlockup_watchdog(); |
| 147 | } | 153 | } |
| 148 | EXPORT_SYMBOL(touch_nmi_watchdog); | 154 | EXPORT_SYMBOL(touch_nmi_watchdog); |
| @@ -180,18 +186,6 @@ static int is_softlockup(unsigned long touch_ts) | |||
| 180 | return 0; | 186 | return 0; |
| 181 | } | 187 | } |
| 182 | 188 | ||
| 183 | static int | ||
| 184 | watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr) | ||
| 185 | { | ||
| 186 | did_panic = 1; | ||
| 187 | |||
| 188 | return NOTIFY_DONE; | ||
| 189 | } | ||
| 190 | |||
| 191 | static struct notifier_block panic_block = { | ||
| 192 | .notifier_call = watchdog_panic, | ||
| 193 | }; | ||
| 194 | |||
| 195 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 189 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
| 196 | static struct perf_event_attr wd_hw_attr = { | 190 | static struct perf_event_attr wd_hw_attr = { |
| 197 | .type = PERF_TYPE_HARDWARE, | 191 | .type = PERF_TYPE_HARDWARE, |
| @@ -202,7 +196,7 @@ static struct perf_event_attr wd_hw_attr = { | |||
| 202 | }; | 196 | }; |
| 203 | 197 | ||
| 204 | /* Callback function for perf event subsystem */ | 198 | /* Callback function for perf event subsystem */ |
| 205 | void watchdog_overflow_callback(struct perf_event *event, int nmi, | 199 | static void watchdog_overflow_callback(struct perf_event *event, int nmi, |
| 206 | struct perf_sample_data *data, | 200 | struct perf_sample_data *data, |
| 207 | struct pt_regs *regs) | 201 | struct pt_regs *regs) |
| 208 | { | 202 | { |
| @@ -364,14 +358,14 @@ static int watchdog_nmi_enable(int cpu) | |||
| 364 | /* Try to register using hardware perf events */ | 358 | /* Try to register using hardware perf events */ |
| 365 | wd_attr = &wd_hw_attr; | 359 | wd_attr = &wd_hw_attr; |
| 366 | wd_attr->sample_period = hw_nmi_get_sample_period(); | 360 | wd_attr->sample_period = hw_nmi_get_sample_period(); |
| 367 | event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); | 361 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); |
| 368 | if (!IS_ERR(event)) { | 362 | if (!IS_ERR(event)) { |
| 369 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); | 363 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); |
| 370 | goto out_save; | 364 | goto out_save; |
| 371 | } | 365 | } |
| 372 | 366 | ||
| 373 | printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); | 367 | printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); |
| 374 | return -1; | 368 | return PTR_ERR(event); |
| 375 | 369 | ||
| 376 | /* success path */ | 370 | /* success path */ |
| 377 | out_save: | 371 | out_save: |
| @@ -415,17 +409,19 @@ static int watchdog_prepare_cpu(int cpu) | |||
| 415 | static int watchdog_enable(int cpu) | 409 | static int watchdog_enable(int cpu) |
| 416 | { | 410 | { |
| 417 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); | 411 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); |
| 412 | int err; | ||
| 418 | 413 | ||
| 419 | /* enable the perf event */ | 414 | /* enable the perf event */ |
| 420 | if (watchdog_nmi_enable(cpu) != 0) | 415 | err = watchdog_nmi_enable(cpu); |
| 421 | return -1; | 416 | if (err) |
| 417 | return err; | ||
| 422 | 418 | ||
| 423 | /* create the watchdog thread */ | 419 | /* create the watchdog thread */ |
| 424 | if (!p) { | 420 | if (!p) { |
| 425 | p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); | 421 | p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); |
| 426 | if (IS_ERR(p)) { | 422 | if (IS_ERR(p)) { |
| 427 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); | 423 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); |
| 428 | return -1; | 424 | return PTR_ERR(p); |
| 429 | } | 425 | } |
| 430 | kthread_bind(p, cpu); | 426 | kthread_bind(p, cpu); |
| 431 | per_cpu(watchdog_touch_ts, cpu) = 0; | 427 | per_cpu(watchdog_touch_ts, cpu) = 0; |
| @@ -433,6 +429,9 @@ static int watchdog_enable(int cpu) | |||
| 433 | wake_up_process(p); | 429 | wake_up_process(p); |
| 434 | } | 430 | } |
| 435 | 431 | ||
| 432 | /* if any cpu succeeds, watchdog is considered enabled for the system */ | ||
| 433 | watchdog_enabled = 1; | ||
| 434 | |||
| 436 | return 0; | 435 | return 0; |
| 437 | } | 436 | } |
| 438 | 437 | ||
| @@ -455,9 +454,6 @@ static void watchdog_disable(int cpu) | |||
| 455 | per_cpu(softlockup_watchdog, cpu) = NULL; | 454 | per_cpu(softlockup_watchdog, cpu) = NULL; |
| 456 | kthread_stop(p); | 455 | kthread_stop(p); |
| 457 | } | 456 | } |
| 458 | |||
| 459 | /* if any cpu succeeds, watchdog is considered enabled for the system */ | ||
| 460 | watchdog_enabled = 1; | ||
| 461 | } | 457 | } |
| 462 | 458 | ||
| 463 | static void watchdog_enable_all_cpus(void) | 459 | static void watchdog_enable_all_cpus(void) |
| @@ -477,6 +473,9 @@ static void watchdog_disable_all_cpus(void) | |||
| 477 | { | 473 | { |
| 478 | int cpu; | 474 | int cpu; |
| 479 | 475 | ||
| 476 | if (no_watchdog) | ||
| 477 | return; | ||
| 478 | |||
| 480 | for_each_online_cpu(cpu) | 479 | for_each_online_cpu(cpu) |
| 481 | watchdog_disable(cpu); | 480 | watchdog_disable(cpu); |
| 482 | 481 | ||
| @@ -519,17 +518,16 @@ static int __cpuinit | |||
| 519 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | 518 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) |
| 520 | { | 519 | { |
| 521 | int hotcpu = (unsigned long)hcpu; | 520 | int hotcpu = (unsigned long)hcpu; |
| 521 | int err = 0; | ||
| 522 | 522 | ||
| 523 | switch (action) { | 523 | switch (action) { |
| 524 | case CPU_UP_PREPARE: | 524 | case CPU_UP_PREPARE: |
| 525 | case CPU_UP_PREPARE_FROZEN: | 525 | case CPU_UP_PREPARE_FROZEN: |
| 526 | if (watchdog_prepare_cpu(hotcpu)) | 526 | err = watchdog_prepare_cpu(hotcpu); |
| 527 | return NOTIFY_BAD; | ||
| 528 | break; | 527 | break; |
| 529 | case CPU_ONLINE: | 528 | case CPU_ONLINE: |
| 530 | case CPU_ONLINE_FROZEN: | 529 | case CPU_ONLINE_FROZEN: |
| 531 | if (watchdog_enable(hotcpu)) | 530 | err = watchdog_enable(hotcpu); |
| 532 | return NOTIFY_BAD; | ||
| 533 | break; | 531 | break; |
| 534 | #ifdef CONFIG_HOTPLUG_CPU | 532 | #ifdef CONFIG_HOTPLUG_CPU |
| 535 | case CPU_UP_CANCELED: | 533 | case CPU_UP_CANCELED: |
| @@ -542,7 +540,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 542 | break; | 540 | break; |
| 543 | #endif /* CONFIG_HOTPLUG_CPU */ | 541 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 544 | } | 542 | } |
| 545 | return NOTIFY_OK; | 543 | return notifier_from_errno(err); |
| 546 | } | 544 | } |
| 547 | 545 | ||
| 548 | static struct notifier_block __cpuinitdata cpu_nfb = { | 546 | static struct notifier_block __cpuinitdata cpu_nfb = { |
| @@ -558,13 +556,11 @@ static int __init spawn_watchdog_task(void) | |||
| 558 | return 0; | 556 | return 0; |
| 559 | 557 | ||
| 560 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | 558 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); |
| 561 | WARN_ON(err == NOTIFY_BAD); | 559 | WARN_ON(notifier_to_errno(err)); |
| 562 | 560 | ||
| 563 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | 561 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); |
| 564 | register_cpu_notifier(&cpu_nfb); | 562 | register_cpu_notifier(&cpu_nfb); |
| 565 | 563 | ||
| 566 | atomic_notifier_chain_register(&panic_notifier_list, &panic_block); | ||
| 567 | |||
| 568 | return 0; | 564 | return 0; |
| 569 | } | 565 | } |
| 570 | early_initcall(spawn_watchdog_task); | 566 | early_initcall(spawn_watchdog_task); |
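The last two hunks push real error codes through the CPU notifier instead of raw NOTIFY_BAD: the callback wraps its errno with notifier_from_errno(), and the early-init caller unwraps it again with notifier_to_errno() before the WARN_ON(). A compact sketch of that round trip, with example_prepare(), example_cpu_callback() and example_nb as hypothetical stand-ins for the watchdog code:

#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/smp.h>

static int example_prepare(unsigned long cpu)
{
        return 0;                       /* or a negative errno on failure */
}

static int example_cpu_callback(struct notifier_block *nfb,
                                unsigned long action, void *hcpu)
{
        int err = 0;

        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
                err = example_prepare((unsigned long)hcpu);
                break;
        }
        /* 0 becomes NOTIFY_OK; a negative errno is encoded so that
         * notifier_to_errno() can recover it at the call site. */
        return notifier_from_errno(err);
}

static struct notifier_block example_nb = {
        .notifier_call = example_cpu_callback,
};

static int __init example_spawn(void)
{
        void *cpu = (void *)(unsigned long)smp_processor_id();
        int ret = example_cpu_callback(&example_nb, CPU_UP_PREPARE, cpu);

        WARN_ON(notifier_to_errno(ret));        /* warns only on a real error */
        register_cpu_notifier(&example_nb);
        return 0;
}
early_initcall(example_spawn);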
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 727f24e563ae..f77afd939229 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -1,19 +1,26 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * linux/kernel/workqueue.c | 2 | * kernel/workqueue.c - generic async execution with shared worker pool |
| 3 | * | 3 | * |
| 4 | * Generic mechanism for defining kernel helper threads for running | 4 | * Copyright (C) 2002 Ingo Molnar |
| 5 | * arbitrary tasks in process context. | ||
| 6 | * | 5 | * |
| 7 | * Started by Ingo Molnar, Copyright (C) 2002 | 6 | * Derived from the taskqueue/keventd code by: |
| 7 | * David Woodhouse <dwmw2@infradead.org> | ||
| 8 | * Andrew Morton | ||
| 9 | * Kai Petzke <wpp@marie.physik.tu-berlin.de> | ||
| 10 | * Theodore Ts'o <tytso@mit.edu> | ||
| 8 | * | 11 | * |
| 9 | * Derived from the taskqueue/keventd code by: | 12 | * Made to use alloc_percpu by Christoph Lameter. |
| 10 | * | 13 | * |
| 11 | * David Woodhouse <dwmw2@infradead.org> | 14 | * Copyright (C) 2010 SUSE Linux Products GmbH |
| 12 | * Andrew Morton | 15 | * Copyright (C) 2010 Tejun Heo <tj@kernel.org> |
| 13 | * Kai Petzke <wpp@marie.physik.tu-berlin.de> | ||
| 14 | * Theodore Ts'o <tytso@mit.edu> | ||
| 15 | * | 16 | * |
| 16 | * Made to use alloc_percpu by Christoph Lameter. | 17 | * This is the generic async execution mechanism. Work items are |
| 18 | * executed in process context. The worker pool is shared and | ||
| 19 | * automatically managed. There is one worker pool for each CPU and | ||
| 20 | * one extra for works which are better served by workers which are | ||
| 21 | * not bound to any specific CPU. | ||
| 22 | * | ||
| 23 | * Please read Documentation/workqueue.txt for details. | ||
| 17 | */ | 24 | */ |
| 18 | 25 | ||
| 19 | #include <linux/module.h> | 26 | #include <linux/module.h> |
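The rewritten header above summarizes the new shared-pool model: one worker pool per CPU plus one extra pool for work items that need not run on any particular CPU. A minimal usage sketch consistent with that description; example_fn, example_work and example_queue are hypothetical and not part of this diff.

#include <linux/workqueue.h>

static void example_fn(struct work_struct *work)
{
        /* runs later, in process context, on a worker from the shared pool */
}

static DECLARE_WORK(example_work, example_fn);

static void example_queue(bool unbound)
{
        if (!unbound)
                schedule_work(&example_work);   /* default per-CPU pool */
        else
                queue_work(system_unbound_wq, &example_work); /* unbound pool */
}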
