aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c17
-rw-r--r--kernel/cgroup.c62
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/kexec.c6
-rw-r--r--kernel/lockdep.c84
-rw-r--r--kernel/lockdep_internals.h72
-rw-r--r--kernel/lockdep_proc.c58
-rw-r--r--kernel/perf_event.c389
-rw-r--r--kernel/profile.c4
-rw-r--r--kernel/ptrace.c11
-rw-r--r--kernel/rcupdate.c30
-rw-r--r--kernel/rcutiny.c35
-rw-r--r--kernel/rcutiny_plugin.h39
-rw-r--r--kernel/rcutorture.c2
-rw-r--r--kernel/rcutree.c131
-rw-r--r--kernel/rcutree.h2
-rw-r--r--kernel/rcutree_plugin.h69
-rw-r--r--kernel/rcutree_trace.c4
-rw-r--r--kernel/sched.c2
-rw-r--r--kernel/sched_debug.c2
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/stop_machine.c5
-rw-r--r--kernel/trace/trace_event_perf.c190
-rw-r--r--kernel/trace/trace_kprobe.c17
-rw-r--r--kernel/trace/trace_syscalls.c17
25 files changed, 762 insertions, 489 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 24f8c81fc48d..e4c0e1fee9b0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -353,17 +353,18 @@ restart:
353 353
354void acct_exit_ns(struct pid_namespace *ns) 354void acct_exit_ns(struct pid_namespace *ns)
355{ 355{
356 struct bsd_acct_struct *acct; 356 struct bsd_acct_struct *acct = ns->bacct;
357 357
358 spin_lock(&acct_lock); 358 if (acct == NULL)
359 acct = ns->bacct; 359 return;
360 if (acct != NULL) {
361 if (acct->file != NULL)
362 acct_file_reopen(acct, NULL, NULL);
363 360
364 kfree(acct); 361 del_timer_sync(&acct->timer);
365 } 362 spin_lock(&acct_lock);
363 if (acct->file != NULL)
364 acct_file_reopen(acct, NULL, NULL);
366 spin_unlock(&acct_lock); 365 spin_unlock(&acct_lock);
366
367 kfree(acct);
367} 368}
368 369
369/* 370/*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4a07d057a265..e9ec642932ee 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1646,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
1646int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1646int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1647{ 1647{
1648 char *start; 1648 char *start;
1649 struct dentry *dentry = rcu_dereference(cgrp->dentry); 1649 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1650 rcu_read_lock_held() ||
1651 cgroup_lock_is_held());
1650 1652
1651 if (!dentry || cgrp == dummytop) { 1653 if (!dentry || cgrp == dummytop) {
1652 /* 1654 /*
@@ -1662,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1662 *--start = '\0'; 1664 *--start = '\0';
1663 for (;;) { 1665 for (;;) {
1664 int len = dentry->d_name.len; 1666 int len = dentry->d_name.len;
1667
1665 if ((start -= len) < buf) 1668 if ((start -= len) < buf)
1666 return -ENAMETOOLONG; 1669 return -ENAMETOOLONG;
1667 memcpy(start, cgrp->dentry->d_name.name, len); 1670 memcpy(start, dentry->d_name.name, len);
1668 cgrp = cgrp->parent; 1671 cgrp = cgrp->parent;
1669 if (!cgrp) 1672 if (!cgrp)
1670 break; 1673 break;
1671 dentry = rcu_dereference(cgrp->dentry); 1674
1675 dentry = rcu_dereference_check(cgrp->dentry,
1676 rcu_read_lock_held() ||
1677 cgroup_lock_is_held());
1672 if (!cgrp->parent) 1678 if (!cgrp->parent)
1673 continue; 1679 continue;
1674 if (--start < buf) 1680 if (--start < buf)
@@ -4429,7 +4435,15 @@ __setup("cgroup_disable=", cgroup_disable);
4429 */ 4435 */
4430unsigned short css_id(struct cgroup_subsys_state *css) 4436unsigned short css_id(struct cgroup_subsys_state *css)
4431{ 4437{
4432 struct css_id *cssid = rcu_dereference(css->id); 4438 struct css_id *cssid;
4439
4440 /*
4441 * This css_id() can return correct value when somone has refcnt
4442 * on this or this is under rcu_read_lock(). Once css->id is allocated,
4443 * it's unchanged until freed.
4444 */
4445 cssid = rcu_dereference_check(css->id,
4446 rcu_read_lock_held() || atomic_read(&css->refcnt));
4433 4447
4434 if (cssid) 4448 if (cssid)
4435 return cssid->id; 4449 return cssid->id;
@@ -4439,7 +4453,10 @@ EXPORT_SYMBOL_GPL(css_id);
4439 4453
4440unsigned short css_depth(struct cgroup_subsys_state *css) 4454unsigned short css_depth(struct cgroup_subsys_state *css)
4441{ 4455{
4442 struct css_id *cssid = rcu_dereference(css->id); 4456 struct css_id *cssid;
4457
4458 cssid = rcu_dereference_check(css->id,
4459 rcu_read_lock_held() || atomic_read(&css->refcnt));
4443 4460
4444 if (cssid) 4461 if (cssid)
4445 return cssid->depth; 4462 return cssid->depth;
@@ -4447,15 +4464,36 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
4447} 4464}
4448EXPORT_SYMBOL_GPL(css_depth); 4465EXPORT_SYMBOL_GPL(css_depth);
4449 4466
4467/**
4468 * css_is_ancestor - test "root" css is an ancestor of "child"
4469 * @child: the css to be tested.
4470 * @root: the css supporsed to be an ancestor of the child.
4471 *
4472 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
4473 * this function reads css->id, this use rcu_dereference() and rcu_read_lock().
4474 * But, considering usual usage, the csses should be valid objects after test.
4475 * Assuming that the caller will do some action to the child if this returns
4476 * returns true, the caller must take "child";s reference count.
4477 * If "child" is valid object and this returns true, "root" is valid, too.
4478 */
4479
4450bool css_is_ancestor(struct cgroup_subsys_state *child, 4480bool css_is_ancestor(struct cgroup_subsys_state *child,
4451 const struct cgroup_subsys_state *root) 4481 const struct cgroup_subsys_state *root)
4452{ 4482{
4453 struct css_id *child_id = rcu_dereference(child->id); 4483 struct css_id *child_id;
4454 struct css_id *root_id = rcu_dereference(root->id); 4484 struct css_id *root_id;
4485 bool ret = true;
4455 4486
4456 if (!child_id || !root_id || (child_id->depth < root_id->depth)) 4487 rcu_read_lock();
4457 return false; 4488 child_id = rcu_dereference(child->id);
4458 return child_id->stack[root_id->depth] == root_id->id; 4489 root_id = rcu_dereference(root->id);
4490 if (!child_id
4491 || !root_id
4492 || (child_id->depth < root_id->depth)
4493 || (child_id->stack[root_id->depth] != root_id->id))
4494 ret = false;
4495 rcu_read_unlock();
4496 return ret;
4459} 4497}
4460 4498
4461static void __free_css_id_cb(struct rcu_head *head) 4499static void __free_css_id_cb(struct rcu_head *head)
@@ -4555,13 +4593,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
4555{ 4593{
4556 int subsys_id, i, depth = 0; 4594 int subsys_id, i, depth = 0;
4557 struct cgroup_subsys_state *parent_css, *child_css; 4595 struct cgroup_subsys_state *parent_css, *child_css;
4558 struct css_id *child_id, *parent_id = NULL; 4596 struct css_id *child_id, *parent_id;
4559 4597
4560 subsys_id = ss->subsys_id; 4598 subsys_id = ss->subsys_id;
4561 parent_css = parent->subsys[subsys_id]; 4599 parent_css = parent->subsys[subsys_id];
4562 child_css = child->subsys[subsys_id]; 4600 child_css = child->subsys[subsys_id];
4563 depth = css_depth(parent_css) + 1;
4564 parent_id = parent_css->id; 4601 parent_id = parent_css->id;
4602 depth = parent_id->depth;
4565 4603
4566 child_id = get_new_cssid(ss, depth); 4604 child_id = get_new_cssid(ss, depth);
4567 if (IS_ERR(child_id)) 4605 if (IS_ERR(child_id))
diff --git a/kernel/fork.c b/kernel/fork.c
index 5d3592deaf71..4d57d9e3a6e9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1111,7 +1111,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1111 p->memcg_batch.do_batch = 0; 1111 p->memcg_batch.do_batch = 0;
1112 p->memcg_batch.memcg = NULL; 1112 p->memcg_batch.memcg = NULL;
1113#endif 1113#endif
1114 p->stack_start = stack_start;
1115 1114
1116 /* Perform scheduler related setup. Assign this task to a CPU. */ 1115 /* Perform scheduler related setup. Assign this task to a CPU. */
1117 sched_fork(p, clone_flags); 1116 sched_fork(p, clone_flags);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 87ebe8adc474..474a84715eac 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1134,11 +1134,9 @@ int crash_shrink_memory(unsigned long new_size)
1134 1134
1135 free_reserved_phys_range(end, crashk_res.end); 1135 free_reserved_phys_range(end, crashk_res.end);
1136 1136
1137 if (start == end) { 1137 if (start == end)
1138 crashk_res.end = end;
1139 release_resource(&crashk_res); 1138 release_resource(&crashk_res);
1140 } else 1139 crashk_res.end = end - 1;
1141 crashk_res.end = end - 1;
1142 1140
1143unlock: 1141unlock:
1144 mutex_unlock(&kexec_mutex); 1142 mutex_unlock(&kexec_mutex);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e9c759f06c1d..ec21304856d1 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -431,20 +431,7 @@ static struct stack_trace lockdep_init_trace = {
431/* 431/*
432 * Various lockdep statistics: 432 * Various lockdep statistics:
433 */ 433 */
434atomic_t chain_lookup_hits; 434DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
435atomic_t chain_lookup_misses;
436atomic_t hardirqs_on_events;
437atomic_t hardirqs_off_events;
438atomic_t redundant_hardirqs_on;
439atomic_t redundant_hardirqs_off;
440atomic_t softirqs_on_events;
441atomic_t softirqs_off_events;
442atomic_t redundant_softirqs_on;
443atomic_t redundant_softirqs_off;
444atomic_t nr_unused_locks;
445atomic_t nr_cyclic_checks;
446atomic_t nr_find_usage_forwards_checks;
447atomic_t nr_find_usage_backwards_checks;
448#endif 435#endif
449 436
450/* 437/*
@@ -748,7 +735,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
748 return NULL; 735 return NULL;
749 } 736 }
750 class = lock_classes + nr_lock_classes++; 737 class = lock_classes + nr_lock_classes++;
751 debug_atomic_inc(&nr_unused_locks); 738 debug_atomic_inc(nr_unused_locks);
752 class->key = key; 739 class->key = key;
753 class->name = lock->name; 740 class->name = lock->name;
754 class->subclass = subclass; 741 class->subclass = subclass;
@@ -818,7 +805,8 @@ static struct lock_list *alloc_list_entry(void)
818 * Add a new dependency to the head of the list: 805 * Add a new dependency to the head of the list:
819 */ 806 */
820static int add_lock_to_list(struct lock_class *class, struct lock_class *this, 807static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
821 struct list_head *head, unsigned long ip, int distance) 808 struct list_head *head, unsigned long ip,
809 int distance, struct stack_trace *trace)
822{ 810{
823 struct lock_list *entry; 811 struct lock_list *entry;
824 /* 812 /*
@@ -829,11 +817,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
829 if (!entry) 817 if (!entry)
830 return 0; 818 return 0;
831 819
832 if (!save_trace(&entry->trace))
833 return 0;
834
835 entry->class = this; 820 entry->class = this;
836 entry->distance = distance; 821 entry->distance = distance;
822 entry->trace = *trace;
837 /* 823 /*
838 * Since we never remove from the dependency list, the list can 824 * Since we never remove from the dependency list, the list can
839 * be walked lockless by other CPUs, it's only allocation 825 * be walked lockless by other CPUs, it's only allocation
@@ -1205,7 +1191,7 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
1205{ 1191{
1206 int result; 1192 int result;
1207 1193
1208 debug_atomic_inc(&nr_cyclic_checks); 1194 debug_atomic_inc(nr_cyclic_checks);
1209 1195
1210 result = __bfs_forwards(root, target, class_equal, target_entry); 1196 result = __bfs_forwards(root, target, class_equal, target_entry);
1211 1197
@@ -1242,7 +1228,7 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1242{ 1228{
1243 int result; 1229 int result;
1244 1230
1245 debug_atomic_inc(&nr_find_usage_forwards_checks); 1231 debug_atomic_inc(nr_find_usage_forwards_checks);
1246 1232
1247 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); 1233 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
1248 1234
@@ -1265,7 +1251,7 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
1265{ 1251{
1266 int result; 1252 int result;
1267 1253
1268 debug_atomic_inc(&nr_find_usage_backwards_checks); 1254 debug_atomic_inc(nr_find_usage_backwards_checks);
1269 1255
1270 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); 1256 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
1271 1257
@@ -1635,12 +1621,20 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1635 */ 1621 */
1636static int 1622static int
1637check_prev_add(struct task_struct *curr, struct held_lock *prev, 1623check_prev_add(struct task_struct *curr, struct held_lock *prev,
1638 struct held_lock *next, int distance) 1624 struct held_lock *next, int distance, int trylock_loop)
1639{ 1625{
1640 struct lock_list *entry; 1626 struct lock_list *entry;
1641 int ret; 1627 int ret;
1642 struct lock_list this; 1628 struct lock_list this;
1643 struct lock_list *uninitialized_var(target_entry); 1629 struct lock_list *uninitialized_var(target_entry);
1630 /*
1631 * Static variable, serialized by the graph_lock().
1632 *
1633 * We use this static variable to save the stack trace in case
1634 * we call into this function multiple times due to encountering
1635 * trylocks in the held lock stack.
1636 */
1637 static struct stack_trace trace;
1644 1638
1645 /* 1639 /*
1646 * Prove that the new <prev> -> <next> dependency would not 1640 * Prove that the new <prev> -> <next> dependency would not
@@ -1688,20 +1682,23 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1688 } 1682 }
1689 } 1683 }
1690 1684
1685 if (!trylock_loop && !save_trace(&trace))
1686 return 0;
1687
1691 /* 1688 /*
1692 * Ok, all validations passed, add the new lock 1689 * Ok, all validations passed, add the new lock
1693 * to the previous lock's dependency list: 1690 * to the previous lock's dependency list:
1694 */ 1691 */
1695 ret = add_lock_to_list(hlock_class(prev), hlock_class(next), 1692 ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
1696 &hlock_class(prev)->locks_after, 1693 &hlock_class(prev)->locks_after,
1697 next->acquire_ip, distance); 1694 next->acquire_ip, distance, &trace);
1698 1695
1699 if (!ret) 1696 if (!ret)
1700 return 0; 1697 return 0;
1701 1698
1702 ret = add_lock_to_list(hlock_class(next), hlock_class(prev), 1699 ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
1703 &hlock_class(next)->locks_before, 1700 &hlock_class(next)->locks_before,
1704 next->acquire_ip, distance); 1701 next->acquire_ip, distance, &trace);
1705 if (!ret) 1702 if (!ret)
1706 return 0; 1703 return 0;
1707 1704
@@ -1731,6 +1728,7 @@ static int
1731check_prevs_add(struct task_struct *curr, struct held_lock *next) 1728check_prevs_add(struct task_struct *curr, struct held_lock *next)
1732{ 1729{
1733 int depth = curr->lockdep_depth; 1730 int depth = curr->lockdep_depth;
1731 int trylock_loop = 0;
1734 struct held_lock *hlock; 1732 struct held_lock *hlock;
1735 1733
1736 /* 1734 /*
@@ -1756,7 +1754,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1756 * added: 1754 * added:
1757 */ 1755 */
1758 if (hlock->read != 2) { 1756 if (hlock->read != 2) {
1759 if (!check_prev_add(curr, hlock, next, distance)) 1757 if (!check_prev_add(curr, hlock, next,
1758 distance, trylock_loop))
1760 return 0; 1759 return 0;
1761 /* 1760 /*
1762 * Stop after the first non-trylock entry, 1761 * Stop after the first non-trylock entry,
@@ -1779,6 +1778,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1779 if (curr->held_locks[depth].irq_context != 1778 if (curr->held_locks[depth].irq_context !=
1780 curr->held_locks[depth-1].irq_context) 1779 curr->held_locks[depth-1].irq_context)
1781 break; 1780 break;
1781 trylock_loop = 1;
1782 } 1782 }
1783 return 1; 1783 return 1;
1784out_bug: 1784out_bug:
@@ -1825,7 +1825,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1825 list_for_each_entry(chain, hash_head, entry) { 1825 list_for_each_entry(chain, hash_head, entry) {
1826 if (chain->chain_key == chain_key) { 1826 if (chain->chain_key == chain_key) {
1827cache_hit: 1827cache_hit:
1828 debug_atomic_inc(&chain_lookup_hits); 1828 debug_atomic_inc(chain_lookup_hits);
1829 if (very_verbose(class)) 1829 if (very_verbose(class))
1830 printk("\nhash chain already cached, key: " 1830 printk("\nhash chain already cached, key: "
1831 "%016Lx tail class: [%p] %s\n", 1831 "%016Lx tail class: [%p] %s\n",
@@ -1890,7 +1890,7 @@ cache_hit:
1890 chain_hlocks[chain->base + j] = class - lock_classes; 1890 chain_hlocks[chain->base + j] = class - lock_classes;
1891 } 1891 }
1892 list_add_tail_rcu(&chain->entry, hash_head); 1892 list_add_tail_rcu(&chain->entry, hash_head);
1893 debug_atomic_inc(&chain_lookup_misses); 1893 debug_atomic_inc(chain_lookup_misses);
1894 inc_chains(); 1894 inc_chains();
1895 1895
1896 return 1; 1896 return 1;
@@ -2311,7 +2311,12 @@ void trace_hardirqs_on_caller(unsigned long ip)
2311 return; 2311 return;
2312 2312
2313 if (unlikely(curr->hardirqs_enabled)) { 2313 if (unlikely(curr->hardirqs_enabled)) {
2314 debug_atomic_inc(&redundant_hardirqs_on); 2314 /*
2315 * Neither irq nor preemption are disabled here
2316 * so this is racy by nature but loosing one hit
2317 * in a stat is not a big deal.
2318 */
2319 __debug_atomic_inc(redundant_hardirqs_on);
2315 return; 2320 return;
2316 } 2321 }
2317 /* we'll do an OFF -> ON transition: */ 2322 /* we'll do an OFF -> ON transition: */
@@ -2338,7 +2343,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2338 2343
2339 curr->hardirq_enable_ip = ip; 2344 curr->hardirq_enable_ip = ip;
2340 curr->hardirq_enable_event = ++curr->irq_events; 2345 curr->hardirq_enable_event = ++curr->irq_events;
2341 debug_atomic_inc(&hardirqs_on_events); 2346 debug_atomic_inc(hardirqs_on_events);
2342} 2347}
2343EXPORT_SYMBOL(trace_hardirqs_on_caller); 2348EXPORT_SYMBOL(trace_hardirqs_on_caller);
2344 2349
@@ -2370,9 +2375,9 @@ void trace_hardirqs_off_caller(unsigned long ip)
2370 curr->hardirqs_enabled = 0; 2375 curr->hardirqs_enabled = 0;
2371 curr->hardirq_disable_ip = ip; 2376 curr->hardirq_disable_ip = ip;
2372 curr->hardirq_disable_event = ++curr->irq_events; 2377 curr->hardirq_disable_event = ++curr->irq_events;
2373 debug_atomic_inc(&hardirqs_off_events); 2378 debug_atomic_inc(hardirqs_off_events);
2374 } else 2379 } else
2375 debug_atomic_inc(&redundant_hardirqs_off); 2380 debug_atomic_inc(redundant_hardirqs_off);
2376} 2381}
2377EXPORT_SYMBOL(trace_hardirqs_off_caller); 2382EXPORT_SYMBOL(trace_hardirqs_off_caller);
2378 2383
@@ -2396,7 +2401,7 @@ void trace_softirqs_on(unsigned long ip)
2396 return; 2401 return;
2397 2402
2398 if (curr->softirqs_enabled) { 2403 if (curr->softirqs_enabled) {
2399 debug_atomic_inc(&redundant_softirqs_on); 2404 debug_atomic_inc(redundant_softirqs_on);
2400 return; 2405 return;
2401 } 2406 }
2402 2407
@@ -2406,7 +2411,7 @@ void trace_softirqs_on(unsigned long ip)
2406 curr->softirqs_enabled = 1; 2411 curr->softirqs_enabled = 1;
2407 curr->softirq_enable_ip = ip; 2412 curr->softirq_enable_ip = ip;
2408 curr->softirq_enable_event = ++curr->irq_events; 2413 curr->softirq_enable_event = ++curr->irq_events;
2409 debug_atomic_inc(&softirqs_on_events); 2414 debug_atomic_inc(softirqs_on_events);
2410 /* 2415 /*
2411 * We are going to turn softirqs on, so set the 2416 * We are going to turn softirqs on, so set the
2412 * usage bit for all held locks, if hardirqs are 2417 * usage bit for all held locks, if hardirqs are
@@ -2436,10 +2441,10 @@ void trace_softirqs_off(unsigned long ip)
2436 curr->softirqs_enabled = 0; 2441 curr->softirqs_enabled = 0;
2437 curr->softirq_disable_ip = ip; 2442 curr->softirq_disable_ip = ip;
2438 curr->softirq_disable_event = ++curr->irq_events; 2443 curr->softirq_disable_event = ++curr->irq_events;
2439 debug_atomic_inc(&softirqs_off_events); 2444 debug_atomic_inc(softirqs_off_events);
2440 DEBUG_LOCKS_WARN_ON(!softirq_count()); 2445 DEBUG_LOCKS_WARN_ON(!softirq_count());
2441 } else 2446 } else
2442 debug_atomic_inc(&redundant_softirqs_off); 2447 debug_atomic_inc(redundant_softirqs_off);
2443} 2448}
2444 2449
2445static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) 2450static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
@@ -2644,7 +2649,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2644 return 0; 2649 return 0;
2645 break; 2650 break;
2646 case LOCK_USED: 2651 case LOCK_USED:
2647 debug_atomic_dec(&nr_unused_locks); 2652 debug_atomic_dec(nr_unused_locks);
2648 break; 2653 break;
2649 default: 2654 default:
2650 if (!debug_locks_off_graph_unlock()) 2655 if (!debug_locks_off_graph_unlock())
@@ -2750,7 +2755,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2750 if (!class) 2755 if (!class)
2751 return 0; 2756 return 0;
2752 } 2757 }
2753 debug_atomic_inc((atomic_t *)&class->ops); 2758 atomic_inc((atomic_t *)&class->ops);
2754 if (very_verbose(class)) { 2759 if (very_verbose(class)) {
2755 printk("\nacquire class [%p] %s", class->key, class->name); 2760 printk("\nacquire class [%p] %s", class->key, class->name);
2756 if (class->name_version > 1) 2761 if (class->name_version > 1)
@@ -3801,8 +3806,11 @@ void lockdep_rcu_dereference(const char *file, const int line)
3801{ 3806{
3802 struct task_struct *curr = current; 3807 struct task_struct *curr = current;
3803 3808
3809#ifndef CONFIG_PROVE_RCU_REPEATEDLY
3804 if (!debug_locks_off()) 3810 if (!debug_locks_off())
3805 return; 3811 return;
3812#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
3813 /* Note: the following can be executed concurrently, so be careful. */
3806 printk("\n===================================================\n"); 3814 printk("\n===================================================\n");
3807 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); 3815 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
3808 printk( "---------------------------------------------------\n"); 3816 printk( "---------------------------------------------------\n");
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index a2ee95ad1313..4f560cfedc8f 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -110,30 +110,60 @@ lockdep_count_backward_deps(struct lock_class *class)
110#endif 110#endif
111 111
112#ifdef CONFIG_DEBUG_LOCKDEP 112#ifdef CONFIG_DEBUG_LOCKDEP
113
114#include <asm/local.h>
113/* 115/*
114 * Various lockdep statistics: 116 * Various lockdep statistics.
117 * We want them per cpu as they are often accessed in fast path
118 * and we want to avoid too much cache bouncing.
115 */ 119 */
116extern atomic_t chain_lookup_hits; 120struct lockdep_stats {
117extern atomic_t chain_lookup_misses; 121 int chain_lookup_hits;
118extern atomic_t hardirqs_on_events; 122 int chain_lookup_misses;
119extern atomic_t hardirqs_off_events; 123 int hardirqs_on_events;
120extern atomic_t redundant_hardirqs_on; 124 int hardirqs_off_events;
121extern atomic_t redundant_hardirqs_off; 125 int redundant_hardirqs_on;
122extern atomic_t softirqs_on_events; 126 int redundant_hardirqs_off;
123extern atomic_t softirqs_off_events; 127 int softirqs_on_events;
124extern atomic_t redundant_softirqs_on; 128 int softirqs_off_events;
125extern atomic_t redundant_softirqs_off; 129 int redundant_softirqs_on;
126extern atomic_t nr_unused_locks; 130 int redundant_softirqs_off;
127extern atomic_t nr_cyclic_checks; 131 int nr_unused_locks;
128extern atomic_t nr_cyclic_check_recursions; 132 int nr_cyclic_checks;
129extern atomic_t nr_find_usage_forwards_checks; 133 int nr_cyclic_check_recursions;
130extern atomic_t nr_find_usage_forwards_recursions; 134 int nr_find_usage_forwards_checks;
131extern atomic_t nr_find_usage_backwards_checks; 135 int nr_find_usage_forwards_recursions;
132extern atomic_t nr_find_usage_backwards_recursions; 136 int nr_find_usage_backwards_checks;
133# define debug_atomic_inc(ptr) atomic_inc(ptr) 137 int nr_find_usage_backwards_recursions;
134# define debug_atomic_dec(ptr) atomic_dec(ptr) 138};
135# define debug_atomic_read(ptr) atomic_read(ptr) 139
140DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
141
142#define __debug_atomic_inc(ptr) \
143 this_cpu_inc(lockdep_stats.ptr);
144
145#define debug_atomic_inc(ptr) { \
146 WARN_ON_ONCE(!irqs_disabled()); \
147 __this_cpu_inc(lockdep_stats.ptr); \
148}
149
150#define debug_atomic_dec(ptr) { \
151 WARN_ON_ONCE(!irqs_disabled()); \
152 __this_cpu_dec(lockdep_stats.ptr); \
153}
154
155#define debug_atomic_read(ptr) ({ \
156 struct lockdep_stats *__cpu_lockdep_stats; \
157 unsigned long long __total = 0; \
158 int __cpu; \
159 for_each_possible_cpu(__cpu) { \
160 __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \
161 __total += __cpu_lockdep_stats->ptr; \
162 } \
163 __total; \
164})
136#else 165#else
166# define __debug_atomic_inc(ptr) do { } while (0)
137# define debug_atomic_inc(ptr) do { } while (0) 167# define debug_atomic_inc(ptr) do { } while (0)
138# define debug_atomic_dec(ptr) do { } while (0) 168# define debug_atomic_dec(ptr) do { } while (0)
139# define debug_atomic_read(ptr) 0 169# define debug_atomic_read(ptr) 0
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d4aba4f3584c..59b76c8ce9d7 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -184,34 +184,34 @@ static const struct file_operations proc_lockdep_chains_operations = {
184static void lockdep_stats_debug_show(struct seq_file *m) 184static void lockdep_stats_debug_show(struct seq_file *m)
185{ 185{
186#ifdef CONFIG_DEBUG_LOCKDEP 186#ifdef CONFIG_DEBUG_LOCKDEP
187 unsigned int hi1 = debug_atomic_read(&hardirqs_on_events), 187 unsigned long long hi1 = debug_atomic_read(hardirqs_on_events),
188 hi2 = debug_atomic_read(&hardirqs_off_events), 188 hi2 = debug_atomic_read(hardirqs_off_events),
189 hr1 = debug_atomic_read(&redundant_hardirqs_on), 189 hr1 = debug_atomic_read(redundant_hardirqs_on),
190 hr2 = debug_atomic_read(&redundant_hardirqs_off), 190 hr2 = debug_atomic_read(redundant_hardirqs_off),
191 si1 = debug_atomic_read(&softirqs_on_events), 191 si1 = debug_atomic_read(softirqs_on_events),
192 si2 = debug_atomic_read(&softirqs_off_events), 192 si2 = debug_atomic_read(softirqs_off_events),
193 sr1 = debug_atomic_read(&redundant_softirqs_on), 193 sr1 = debug_atomic_read(redundant_softirqs_on),
194 sr2 = debug_atomic_read(&redundant_softirqs_off); 194 sr2 = debug_atomic_read(redundant_softirqs_off);
195 195
196 seq_printf(m, " chain lookup misses: %11u\n", 196 seq_printf(m, " chain lookup misses: %11llu\n",
197 debug_atomic_read(&chain_lookup_misses)); 197 debug_atomic_read(chain_lookup_misses));
198 seq_printf(m, " chain lookup hits: %11u\n", 198 seq_printf(m, " chain lookup hits: %11llu\n",
199 debug_atomic_read(&chain_lookup_hits)); 199 debug_atomic_read(chain_lookup_hits));
200 seq_printf(m, " cyclic checks: %11u\n", 200 seq_printf(m, " cyclic checks: %11llu\n",
201 debug_atomic_read(&nr_cyclic_checks)); 201 debug_atomic_read(nr_cyclic_checks));
202 seq_printf(m, " find-mask forwards checks: %11u\n", 202 seq_printf(m, " find-mask forwards checks: %11llu\n",
203 debug_atomic_read(&nr_find_usage_forwards_checks)); 203 debug_atomic_read(nr_find_usage_forwards_checks));
204 seq_printf(m, " find-mask backwards checks: %11u\n", 204 seq_printf(m, " find-mask backwards checks: %11llu\n",
205 debug_atomic_read(&nr_find_usage_backwards_checks)); 205 debug_atomic_read(nr_find_usage_backwards_checks));
206 206
207 seq_printf(m, " hardirq on events: %11u\n", hi1); 207 seq_printf(m, " hardirq on events: %11llu\n", hi1);
208 seq_printf(m, " hardirq off events: %11u\n", hi2); 208 seq_printf(m, " hardirq off events: %11llu\n", hi2);
209 seq_printf(m, " redundant hardirq ons: %11u\n", hr1); 209 seq_printf(m, " redundant hardirq ons: %11llu\n", hr1);
210 seq_printf(m, " redundant hardirq offs: %11u\n", hr2); 210 seq_printf(m, " redundant hardirq offs: %11llu\n", hr2);
211 seq_printf(m, " softirq on events: %11u\n", si1); 211 seq_printf(m, " softirq on events: %11llu\n", si1);
212 seq_printf(m, " softirq off events: %11u\n", si2); 212 seq_printf(m, " softirq off events: %11llu\n", si2);
213 seq_printf(m, " redundant softirq ons: %11u\n", sr1); 213 seq_printf(m, " redundant softirq ons: %11llu\n", sr1);
214 seq_printf(m, " redundant softirq offs: %11u\n", sr2); 214 seq_printf(m, " redundant softirq offs: %11llu\n", sr2);
215#endif 215#endif
216} 216}
217 217
@@ -263,7 +263,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
263#endif 263#endif
264 } 264 }
265#ifdef CONFIG_DEBUG_LOCKDEP 265#ifdef CONFIG_DEBUG_LOCKDEP
266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); 266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
267#endif 267#endif
268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n", 268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
269 nr_lock_classes, MAX_LOCKDEP_KEYS); 269 nr_lock_classes, MAX_LOCKDEP_KEYS);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a4fa381db3c2..e099650cd249 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2297,11 +2297,6 @@ unlock:
2297 rcu_read_unlock(); 2297 rcu_read_unlock();
2298} 2298}
2299 2299
2300static unsigned long perf_data_size(struct perf_mmap_data *data)
2301{
2302 return data->nr_pages << (PAGE_SHIFT + data->data_order);
2303}
2304
2305#ifndef CONFIG_PERF_USE_VMALLOC 2300#ifndef CONFIG_PERF_USE_VMALLOC
2306 2301
2307/* 2302/*
@@ -2320,6 +2315,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2320 return virt_to_page(data->data_pages[pgoff - 1]); 2315 return virt_to_page(data->data_pages[pgoff - 1]);
2321} 2316}
2322 2317
2318static void *perf_mmap_alloc_page(int cpu)
2319{
2320 struct page *page;
2321 int node;
2322
2323 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
2324 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
2325 if (!page)
2326 return NULL;
2327
2328 return page_address(page);
2329}
2330
2323static struct perf_mmap_data * 2331static struct perf_mmap_data *
2324perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2332perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2325{ 2333{
@@ -2336,17 +2344,16 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2336 if (!data) 2344 if (!data)
2337 goto fail; 2345 goto fail;
2338 2346
2339 data->user_page = (void *)get_zeroed_page(GFP_KERNEL); 2347 data->user_page = perf_mmap_alloc_page(event->cpu);
2340 if (!data->user_page) 2348 if (!data->user_page)
2341 goto fail_user_page; 2349 goto fail_user_page;
2342 2350
2343 for (i = 0; i < nr_pages; i++) { 2351 for (i = 0; i < nr_pages; i++) {
2344 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); 2352 data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
2345 if (!data->data_pages[i]) 2353 if (!data->data_pages[i])
2346 goto fail_data_pages; 2354 goto fail_data_pages;
2347 } 2355 }
2348 2356
2349 data->data_order = 0;
2350 data->nr_pages = nr_pages; 2357 data->nr_pages = nr_pages;
2351 2358
2352 return data; 2359 return data;
@@ -2382,6 +2389,11 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2382 kfree(data); 2389 kfree(data);
2383} 2390}
2384 2391
2392static inline int page_order(struct perf_mmap_data *data)
2393{
2394 return 0;
2395}
2396
2385#else 2397#else
2386 2398
2387/* 2399/*
@@ -2390,10 +2402,15 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2390 * Required for architectures that have d-cache aliasing issues. 2402 * Required for architectures that have d-cache aliasing issues.
2391 */ 2403 */
2392 2404
2405static inline int page_order(struct perf_mmap_data *data)
2406{
2407 return data->page_order;
2408}
2409
2393static struct page * 2410static struct page *
2394perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2411perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2395{ 2412{
2396 if (pgoff > (1UL << data->data_order)) 2413 if (pgoff > (1UL << page_order(data)))
2397 return NULL; 2414 return NULL;
2398 2415
2399 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); 2416 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
@@ -2413,7 +2430,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
2413 int i, nr; 2430 int i, nr;
2414 2431
2415 data = container_of(work, struct perf_mmap_data, work); 2432 data = container_of(work, struct perf_mmap_data, work);
2416 nr = 1 << data->data_order; 2433 nr = 1 << page_order(data);
2417 2434
2418 base = data->user_page; 2435 base = data->user_page;
2419 for (i = 0; i < nr + 1; i++) 2436 for (i = 0; i < nr + 1; i++)
@@ -2452,7 +2469,7 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2452 2469
2453 data->user_page = all_buf; 2470 data->user_page = all_buf;
2454 data->data_pages[0] = all_buf + PAGE_SIZE; 2471 data->data_pages[0] = all_buf + PAGE_SIZE;
2455 data->data_order = ilog2(nr_pages); 2472 data->page_order = ilog2(nr_pages);
2456 data->nr_pages = 1; 2473 data->nr_pages = 1;
2457 2474
2458 return data; 2475 return data;
@@ -2466,6 +2483,11 @@ fail:
2466 2483
2467#endif 2484#endif
2468 2485
2486static unsigned long perf_data_size(struct perf_mmap_data *data)
2487{
2488 return data->nr_pages << (PAGE_SHIFT + page_order(data));
2489}
2490
2469static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2491static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2470{ 2492{
2471 struct perf_event *event = vma->vm_file->private_data; 2493 struct perf_event *event = vma->vm_file->private_data;
@@ -2506,8 +2528,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2506{ 2528{
2507 long max_size = perf_data_size(data); 2529 long max_size = perf_data_size(data);
2508 2530
2509 atomic_set(&data->lock, -1);
2510
2511 if (event->attr.watermark) { 2531 if (event->attr.watermark) {
2512 data->watermark = min_t(long, max_size, 2532 data->watermark = min_t(long, max_size,
2513 event->attr.wakeup_watermark); 2533 event->attr.wakeup_watermark);
@@ -2580,6 +2600,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2580 long user_extra, extra; 2600 long user_extra, extra;
2581 int ret = 0; 2601 int ret = 0;
2582 2602
2603 /*
2604 * Don't allow mmap() of inherited per-task counters. This would
2605 * create a performance issue due to all children writing to the
2606 * same buffer.
2607 */
2608 if (event->cpu == -1 && event->attr.inherit)
2609 return -EINVAL;
2610
2583 if (!(vma->vm_flags & VM_SHARED)) 2611 if (!(vma->vm_flags & VM_SHARED))
2584 return -EINVAL; 2612 return -EINVAL;
2585 2613
@@ -2885,120 +2913,80 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2885} 2913}
2886 2914
2887/* 2915/*
2888 * Curious locking construct.
2889 *
2890 * We need to ensure a later event_id doesn't publish a head when a former 2916 * We need to ensure a later event_id doesn't publish a head when a former
2891 * event_id isn't done writing. However since we need to deal with NMIs we 2917 * event isn't done writing. However since we need to deal with NMIs we
2892 * cannot fully serialize things. 2918 * cannot fully serialize things.
2893 * 2919 *
2894 * What we do is serialize between CPUs so we only have to deal with NMI
2895 * nesting on a single CPU.
2896 *
2897 * We only publish the head (and generate a wakeup) when the outer-most 2920 * We only publish the head (and generate a wakeup) when the outer-most
2898 * event_id completes. 2921 * event completes.
2899 */ 2922 */
2900static void perf_output_lock(struct perf_output_handle *handle) 2923static void perf_output_get_handle(struct perf_output_handle *handle)
2901{ 2924{
2902 struct perf_mmap_data *data = handle->data; 2925 struct perf_mmap_data *data = handle->data;
2903 int cur, cpu = get_cpu();
2904
2905 handle->locked = 0;
2906
2907 for (;;) {
2908 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2909 if (cur == -1) {
2910 handle->locked = 1;
2911 break;
2912 }
2913 if (cur == cpu)
2914 break;
2915 2926
2916 cpu_relax(); 2927 preempt_disable();
2917 } 2928 local_inc(&data->nest);
2929 handle->wakeup = local_read(&data->wakeup);
2918} 2930}
2919 2931
2920static void perf_output_unlock(struct perf_output_handle *handle) 2932static void perf_output_put_handle(struct perf_output_handle *handle)
2921{ 2933{
2922 struct perf_mmap_data *data = handle->data; 2934 struct perf_mmap_data *data = handle->data;
2923 unsigned long head; 2935 unsigned long head;
2924 int cpu;
2925
2926 data->done_head = data->head;
2927
2928 if (!handle->locked)
2929 goto out;
2930 2936
2931again: 2937again:
2932 /* 2938 head = local_read(&data->head);
2933 * The xchg implies a full barrier that ensures all writes are done
2934 * before we publish the new head, matched by a rmb() in userspace when
2935 * reading this position.
2936 */
2937 while ((head = atomic_long_xchg(&data->done_head, 0)))
2938 data->user_page->data_head = head;
2939 2939
2940 /* 2940 /*
2941 * NMI can happen here, which means we can miss a done_head update. 2941 * IRQ/NMI can happen here, which means we can miss a head update.
2942 */ 2942 */
2943 2943
2944 cpu = atomic_xchg(&data->lock, -1); 2944 if (!local_dec_and_test(&data->nest))
2945 WARN_ON_ONCE(cpu != smp_processor_id()); 2945 goto out;
2946 2946
2947 /* 2947 /*
2948 * Therefore we have to validate we did not indeed do so. 2948 * Publish the known good head. Rely on the full barrier implied
2949 * by atomic_dec_and_test() order the data->head read and this
2950 * write.
2949 */ 2951 */
2950 if (unlikely(atomic_long_read(&data->done_head))) { 2952 data->user_page->data_head = head;
2951 /*
2952 * Since we had it locked, we can lock it again.
2953 */
2954 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2955 cpu_relax();
2956 2953
2954 /*
2955 * Now check if we missed an update, rely on the (compiler)
2956 * barrier in atomic_dec_and_test() to re-read data->head.
2957 */
2958 if (unlikely(head != local_read(&data->head))) {
2959 local_inc(&data->nest);
2957 goto again; 2960 goto again;
2958 } 2961 }
2959 2962
2960 if (atomic_xchg(&data->wakeup, 0)) 2963 if (handle->wakeup != local_read(&data->wakeup))
2961 perf_output_wakeup(handle); 2964 perf_output_wakeup(handle);
2962out: 2965
2963 put_cpu(); 2966 out:
2967 preempt_enable();
2964} 2968}
2965 2969
2966void perf_output_copy(struct perf_output_handle *handle, 2970__always_inline void perf_output_copy(struct perf_output_handle *handle,
2967 const void *buf, unsigned int len) 2971 const void *buf, unsigned int len)
2968{ 2972{
2969 unsigned int pages_mask;
2970 unsigned long offset;
2971 unsigned int size;
2972 void **pages;
2973
2974 offset = handle->offset;
2975 pages_mask = handle->data->nr_pages - 1;
2976 pages = handle->data->data_pages;
2977
2978 do { 2973 do {
2979 unsigned long page_offset; 2974 unsigned long size = min_t(unsigned long, handle->size, len);
2980 unsigned long page_size;
2981 int nr;
2982 2975
2983 nr = (offset >> PAGE_SHIFT) & pages_mask; 2976 memcpy(handle->addr, buf, size);
2984 page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
2985 page_offset = offset & (page_size - 1);
2986 size = min_t(unsigned int, page_size - page_offset, len);
2987 2977
2988 memcpy(pages[nr] + page_offset, buf, size); 2978 len -= size;
2979 handle->addr += size;
2980 handle->size -= size;
2981 if (!handle->size) {
2982 struct perf_mmap_data *data = handle->data;
2989 2983
2990 len -= size; 2984 handle->page++;
2991 buf += size; 2985 handle->page &= data->nr_pages - 1;
2992 offset += size; 2986 handle->addr = data->data_pages[handle->page];
2987 handle->size = PAGE_SIZE << page_order(data);
2988 }
2993 } while (len); 2989 } while (len);
2994
2995 handle->offset = offset;
2996
2997 /*
2998 * Check we didn't copy past our reservation window, taking the
2999 * possible unsigned int wrap into account.
3000 */
3001 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
3002} 2990}
3003 2991
3004int perf_output_begin(struct perf_output_handle *handle, 2992int perf_output_begin(struct perf_output_handle *handle,
@@ -3036,13 +3024,13 @@ int perf_output_begin(struct perf_output_handle *handle,
3036 handle->sample = sample; 3024 handle->sample = sample;
3037 3025
3038 if (!data->nr_pages) 3026 if (!data->nr_pages)
3039 goto fail; 3027 goto out;
3040 3028
3041 have_lost = atomic_read(&data->lost); 3029 have_lost = local_read(&data->lost);
3042 if (have_lost) 3030 if (have_lost)
3043 size += sizeof(lost_event); 3031 size += sizeof(lost_event);
3044 3032
3045 perf_output_lock(handle); 3033 perf_output_get_handle(handle);
3046 3034
3047 do { 3035 do {
3048 /* 3036 /*
@@ -3052,24 +3040,28 @@ int perf_output_begin(struct perf_output_handle *handle,
3052 */ 3040 */
3053 tail = ACCESS_ONCE(data->user_page->data_tail); 3041 tail = ACCESS_ONCE(data->user_page->data_tail);
3054 smp_rmb(); 3042 smp_rmb();
3055 offset = head = atomic_long_read(&data->head); 3043 offset = head = local_read(&data->head);
3056 head += size; 3044 head += size;
3057 if (unlikely(!perf_output_space(data, tail, offset, head))) 3045 if (unlikely(!perf_output_space(data, tail, offset, head)))
3058 goto fail; 3046 goto fail;
3059 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); 3047 } while (local_cmpxchg(&data->head, offset, head) != offset);
3060 3048
3061 handle->offset = offset; 3049 if (head - local_read(&data->wakeup) > data->watermark)
3062 handle->head = head; 3050 local_add(data->watermark, &data->wakeup);
3063 3051
3064 if (head - tail > data->watermark) 3052 handle->page = offset >> (PAGE_SHIFT + page_order(data));
3065 atomic_set(&data->wakeup, 1); 3053 handle->page &= data->nr_pages - 1;
3054 handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1);
3055 handle->addr = data->data_pages[handle->page];
3056 handle->addr += handle->size;
3057 handle->size = (PAGE_SIZE << page_order(data)) - handle->size;
3066 3058
3067 if (have_lost) { 3059 if (have_lost) {
3068 lost_event.header.type = PERF_RECORD_LOST; 3060 lost_event.header.type = PERF_RECORD_LOST;
3069 lost_event.header.misc = 0; 3061 lost_event.header.misc = 0;
3070 lost_event.header.size = sizeof(lost_event); 3062 lost_event.header.size = sizeof(lost_event);
3071 lost_event.id = event->id; 3063 lost_event.id = event->id;
3072 lost_event.lost = atomic_xchg(&data->lost, 0); 3064 lost_event.lost = local_xchg(&data->lost, 0);
3073 3065
3074 perf_output_put(handle, lost_event); 3066 perf_output_put(handle, lost_event);
3075 } 3067 }
@@ -3077,8 +3069,8 @@ int perf_output_begin(struct perf_output_handle *handle,
3077 return 0; 3069 return 0;
3078 3070
3079fail: 3071fail:
3080 atomic_inc(&data->lost); 3072 local_inc(&data->lost);
3081 perf_output_unlock(handle); 3073 perf_output_put_handle(handle);
3082out: 3074out:
3083 rcu_read_unlock(); 3075 rcu_read_unlock();
3084 3076
@@ -3093,14 +3085,14 @@ void perf_output_end(struct perf_output_handle *handle)
3093 int wakeup_events = event->attr.wakeup_events; 3085 int wakeup_events = event->attr.wakeup_events;
3094 3086
3095 if (handle->sample && wakeup_events) { 3087 if (handle->sample && wakeup_events) {
3096 int events = atomic_inc_return(&data->events); 3088 int events = local_inc_return(&data->events);
3097 if (events >= wakeup_events) { 3089 if (events >= wakeup_events) {
3098 atomic_sub(wakeup_events, &data->events); 3090 local_sub(wakeup_events, &data->events);
3099 atomic_set(&data->wakeup, 1); 3091 local_inc(&data->wakeup);
3100 } 3092 }
3101 } 3093 }
3102 3094
3103 perf_output_unlock(handle); 3095 perf_output_put_handle(handle);
3104 rcu_read_unlock(); 3096 rcu_read_unlock();
3105} 3097}
3106 3098
@@ -3436,22 +3428,13 @@ static void perf_event_task_output(struct perf_event *event,
3436{ 3428{
3437 struct perf_output_handle handle; 3429 struct perf_output_handle handle;
3438 struct task_struct *task = task_event->task; 3430 struct task_struct *task = task_event->task;
3439 unsigned long flags;
3440 int size, ret; 3431 int size, ret;
3441 3432
3442 /*
3443 * If this CPU attempts to acquire an rq lock held by a CPU spinning
3444 * in perf_output_lock() from interrupt context, it's game over.
3445 */
3446 local_irq_save(flags);
3447
3448 size = task_event->event_id.header.size; 3433 size = task_event->event_id.header.size;
3449 ret = perf_output_begin(&handle, event, size, 0, 0); 3434 ret = perf_output_begin(&handle, event, size, 0, 0);
3450 3435
3451 if (ret) { 3436 if (ret)
3452 local_irq_restore(flags);
3453 return; 3437 return;
3454 }
3455 3438
3456 task_event->event_id.pid = perf_event_pid(event, task); 3439 task_event->event_id.pid = perf_event_pid(event, task);
3457 task_event->event_id.ppid = perf_event_pid(event, current); 3440 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3462,7 +3445,6 @@ static void perf_event_task_output(struct perf_event *event,
3462 perf_output_put(&handle, task_event->event_id); 3445 perf_output_put(&handle, task_event->event_id);
3463 3446
3464 perf_output_end(&handle); 3447 perf_output_end(&handle);
3465 local_irq_restore(flags);
3466} 3448}
3467 3449
3468static int perf_event_task_match(struct perf_event *event) 3450static int perf_event_task_match(struct perf_event *event)
@@ -4020,9 +4002,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
4020 perf_swevent_overflow(event, 0, nmi, data, regs); 4002 perf_swevent_overflow(event, 0, nmi, data, regs);
4021} 4003}
4022 4004
4023static int perf_tp_event_match(struct perf_event *event,
4024 struct perf_sample_data *data);
4025
4026static int perf_exclude_event(struct perf_event *event, 4005static int perf_exclude_event(struct perf_event *event,
4027 struct pt_regs *regs) 4006 struct pt_regs *regs)
4028{ 4007{
@@ -4052,10 +4031,6 @@ static int perf_swevent_match(struct perf_event *event,
4052 if (perf_exclude_event(event, regs)) 4031 if (perf_exclude_event(event, regs))
4053 return 0; 4032 return 0;
4054 4033
4055 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
4056 !perf_tp_event_match(event, data))
4057 return 0;
4058
4059 return 1; 4034 return 1;
4060} 4035}
4061 4036
@@ -4066,19 +4041,46 @@ static inline u64 swevent_hash(u64 type, u32 event_id)
4066 return hash_64(val, SWEVENT_HLIST_BITS); 4041 return hash_64(val, SWEVENT_HLIST_BITS);
4067} 4042}
4068 4043
4069static struct hlist_head * 4044static inline struct hlist_head *
4070find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id) 4045__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4071{ 4046{
4072 u64 hash; 4047 u64 hash = swevent_hash(type, event_id);
4073 struct swevent_hlist *hlist; 4048
4049 return &hlist->heads[hash];
4050}
4074 4051
4075 hash = swevent_hash(type, event_id); 4052/* For the read side: events when they trigger */
4053static inline struct hlist_head *
4054find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4055{
4056 struct swevent_hlist *hlist;
4076 4057
4077 hlist = rcu_dereference(ctx->swevent_hlist); 4058 hlist = rcu_dereference(ctx->swevent_hlist);
4078 if (!hlist) 4059 if (!hlist)
4079 return NULL; 4060 return NULL;
4080 4061
4081 return &hlist->heads[hash]; 4062 return __find_swevent_head(hlist, type, event_id);
4063}
4064
4065/* For the event head insertion and removal in the hlist */
4066static inline struct hlist_head *
4067find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4068{
4069 struct swevent_hlist *hlist;
4070 u32 event_id = event->attr.config;
4071 u64 type = event->attr.type;
4072
4073 /*
4074 * Event scheduling is always serialized against hlist allocation
4075 * and release. Which makes the protected version suitable here.
4076 * The context lock guarantees that.
4077 */
4078 hlist = rcu_dereference_protected(ctx->swevent_hlist,
4079 lockdep_is_held(&event->ctx->lock));
4080 if (!hlist)
4081 return NULL;
4082
4083 return __find_swevent_head(hlist, type, event_id);
4082} 4084}
4083 4085
4084static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 4086static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
@@ -4095,7 +4097,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4095 4097
4096 rcu_read_lock(); 4098 rcu_read_lock();
4097 4099
4098 head = find_swevent_head(cpuctx, type, event_id); 4100 head = find_swevent_head_rcu(cpuctx, type, event_id);
4099 4101
4100 if (!head) 4102 if (!head)
4101 goto end; 4103 goto end;
@@ -4110,7 +4112,7 @@ end:
4110 4112
4111int perf_swevent_get_recursion_context(void) 4113int perf_swevent_get_recursion_context(void)
4112{ 4114{
4113 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 4115 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4114 int rctx; 4116 int rctx;
4115 4117
4116 if (in_nmi()) 4118 if (in_nmi())
@@ -4122,10 +4124,8 @@ int perf_swevent_get_recursion_context(void)
4122 else 4124 else
4123 rctx = 0; 4125 rctx = 0;
4124 4126
4125 if (cpuctx->recursion[rctx]) { 4127 if (cpuctx->recursion[rctx])
4126 put_cpu_var(perf_cpu_context);
4127 return -1; 4128 return -1;
4128 }
4129 4129
4130 cpuctx->recursion[rctx]++; 4130 cpuctx->recursion[rctx]++;
4131 barrier(); 4131 barrier();
@@ -4139,7 +4139,6 @@ void perf_swevent_put_recursion_context(int rctx)
4139 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4139 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4140 barrier(); 4140 barrier();
4141 cpuctx->recursion[rctx]--; 4141 cpuctx->recursion[rctx]--;
4142 put_cpu_var(perf_cpu_context);
4143} 4142}
4144EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); 4143EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4145 4144
@@ -4150,6 +4149,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4150 struct perf_sample_data data; 4149 struct perf_sample_data data;
4151 int rctx; 4150 int rctx;
4152 4151
4152 preempt_disable_notrace();
4153 rctx = perf_swevent_get_recursion_context(); 4153 rctx = perf_swevent_get_recursion_context();
4154 if (rctx < 0) 4154 if (rctx < 0)
4155 return; 4155 return;
@@ -4159,6 +4159,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4159 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4159 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4160 4160
4161 perf_swevent_put_recursion_context(rctx); 4161 perf_swevent_put_recursion_context(rctx);
4162 preempt_enable_notrace();
4162} 4163}
4163 4164
4164static void perf_swevent_read(struct perf_event *event) 4165static void perf_swevent_read(struct perf_event *event)
@@ -4178,7 +4179,7 @@ static int perf_swevent_enable(struct perf_event *event)
4178 perf_swevent_set_period(event); 4179 perf_swevent_set_period(event);
4179 } 4180 }
4180 4181
4181 head = find_swevent_head(cpuctx, event->attr.type, event->attr.config); 4182 head = find_swevent_head(cpuctx, event);
4182 if (WARN_ON_ONCE(!head)) 4183 if (WARN_ON_ONCE(!head))
4183 return -EINVAL; 4184 return -EINVAL;
4184 4185
@@ -4366,6 +4367,14 @@ static const struct pmu perf_ops_task_clock = {
4366 .read = task_clock_perf_event_read, 4367 .read = task_clock_perf_event_read,
4367}; 4368};
4368 4369
4370/* Deref the hlist from the update side */
4371static inline struct swevent_hlist *
4372swevent_hlist_deref(struct perf_cpu_context *cpuctx)
4373{
4374 return rcu_dereference_protected(cpuctx->swevent_hlist,
4375 lockdep_is_held(&cpuctx->hlist_mutex));
4376}
4377
4369static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) 4378static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4370{ 4379{
4371 struct swevent_hlist *hlist; 4380 struct swevent_hlist *hlist;
@@ -4376,12 +4385,11 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4376 4385
4377static void swevent_hlist_release(struct perf_cpu_context *cpuctx) 4386static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
4378{ 4387{
4379 struct swevent_hlist *hlist; 4388 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
4380 4389
4381 if (!cpuctx->swevent_hlist) 4390 if (!hlist)
4382 return; 4391 return;
4383 4392
4384 hlist = cpuctx->swevent_hlist;
4385 rcu_assign_pointer(cpuctx->swevent_hlist, NULL); 4393 rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
4386 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); 4394 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4387} 4395}
@@ -4418,7 +4426,7 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4418 4426
4419 mutex_lock(&cpuctx->hlist_mutex); 4427 mutex_lock(&cpuctx->hlist_mutex);
4420 4428
4421 if (!cpuctx->swevent_hlist && cpu_online(cpu)) { 4429 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
4422 struct swevent_hlist *hlist; 4430 struct swevent_hlist *hlist;
4423 4431
4424 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); 4432 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -4467,10 +4475,46 @@ static int swevent_hlist_get(struct perf_event *event)
4467 4475
4468#ifdef CONFIG_EVENT_TRACING 4476#ifdef CONFIG_EVENT_TRACING
4469 4477
4470void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4478static const struct pmu perf_ops_tracepoint = {
4471 int entry_size, struct pt_regs *regs) 4479 .enable = perf_trace_enable,
4480 .disable = perf_trace_disable,
4481 .read = perf_swevent_read,
4482 .unthrottle = perf_swevent_unthrottle,
4483};
4484
4485static int perf_tp_filter_match(struct perf_event *event,
4486 struct perf_sample_data *data)
4487{
4488 void *record = data->raw->data;
4489
4490 if (likely(!event->filter) || filter_match_preds(event->filter, record))
4491 return 1;
4492 return 0;
4493}
4494
4495static int perf_tp_event_match(struct perf_event *event,
4496 struct perf_sample_data *data,
4497 struct pt_regs *regs)
4498{
4499 /*
4500 * All tracepoints are from kernel-space.
4501 */
4502 if (event->attr.exclude_kernel)
4503 return 0;
4504
4505 if (!perf_tp_filter_match(event, data))
4506 return 0;
4507
4508 return 1;
4509}
4510
4511void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4512 struct pt_regs *regs, struct hlist_head *head)
4472{ 4513{
4473 struct perf_sample_data data; 4514 struct perf_sample_data data;
4515 struct perf_event *event;
4516 struct hlist_node *node;
4517
4474 struct perf_raw_record raw = { 4518 struct perf_raw_record raw = {
4475 .size = entry_size, 4519 .size = entry_size,
4476 .data = record, 4520 .data = record,
@@ -4479,26 +4523,18 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4479 perf_sample_data_init(&data, addr); 4523 perf_sample_data_init(&data, addr);
4480 data.raw = &raw; 4524 data.raw = &raw;
4481 4525
4482 /* Trace events already protected against recursion */ 4526 rcu_read_lock();
4483 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4527 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4484 &data, regs); 4528 if (perf_tp_event_match(event, &data, regs))
4529 perf_swevent_add(event, count, 1, &data, regs);
4530 }
4531 rcu_read_unlock();
4485} 4532}
4486EXPORT_SYMBOL_GPL(perf_tp_event); 4533EXPORT_SYMBOL_GPL(perf_tp_event);
4487 4534
4488static int perf_tp_event_match(struct perf_event *event,
4489 struct perf_sample_data *data)
4490{
4491 void *record = data->raw->data;
4492
4493 if (likely(!event->filter) || filter_match_preds(event->filter, record))
4494 return 1;
4495 return 0;
4496}
4497
4498static void tp_perf_event_destroy(struct perf_event *event) 4535static void tp_perf_event_destroy(struct perf_event *event)
4499{ 4536{
4500 perf_trace_disable(event->attr.config); 4537 perf_trace_destroy(event);
4501 swevent_hlist_put(event);
4502} 4538}
4503 4539
4504static const struct pmu *tp_perf_event_init(struct perf_event *event) 4540static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4514,17 +4550,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4514 !capable(CAP_SYS_ADMIN)) 4550 !capable(CAP_SYS_ADMIN))
4515 return ERR_PTR(-EPERM); 4551 return ERR_PTR(-EPERM);
4516 4552
4517 if (perf_trace_enable(event->attr.config)) 4553 err = perf_trace_init(event);
4554 if (err)
4518 return NULL; 4555 return NULL;
4519 4556
4520 event->destroy = tp_perf_event_destroy; 4557 event->destroy = tp_perf_event_destroy;
4521 err = swevent_hlist_get(event);
4522 if (err) {
4523 perf_trace_disable(event->attr.config);
4524 return ERR_PTR(err);
4525 }
4526 4558
4527 return &perf_ops_generic; 4559 return &perf_ops_tracepoint;
4528} 4560}
4529 4561
4530static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4562static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4552,12 +4584,6 @@ static void perf_event_free_filter(struct perf_event *event)
4552 4584
4553#else 4585#else
4554 4586
4555static int perf_tp_event_match(struct perf_event *event,
4556 struct perf_sample_data *data)
4557{
4558 return 1;
4559}
4560
4561static const struct pmu *tp_perf_event_init(struct perf_event *event) 4587static const struct pmu *tp_perf_event_init(struct perf_event *event)
4562{ 4588{
4563 return NULL; 4589 return NULL;
@@ -4894,6 +4920,13 @@ static int perf_event_set_output(struct perf_event *event, int output_fd)
4894 int fput_needed = 0; 4920 int fput_needed = 0;
4895 int ret = -EINVAL; 4921 int ret = -EINVAL;
4896 4922
4923 /*
4924 * Don't allow output of inherited per-task events. This would
4925 * create performance issues due to cross cpu access.
4926 */
4927 if (event->cpu == -1 && event->attr.inherit)
4928 return -EINVAL;
4929
4897 if (!output_fd) 4930 if (!output_fd)
4898 goto set; 4931 goto set;
4899 4932
@@ -4914,6 +4947,18 @@ static int perf_event_set_output(struct perf_event *event, int output_fd)
4914 if (event->data) 4947 if (event->data)
4915 goto out; 4948 goto out;
4916 4949
4950 /*
4951 * Don't allow cross-cpu buffers
4952 */
4953 if (output_event->cpu != event->cpu)
4954 goto out;
4955
4956 /*
4957 * If its not a per-cpu buffer, it must be the same task.
4958 */
4959 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
4960 goto out;
4961
4917 atomic_long_inc(&output_file->f_count); 4962 atomic_long_inc(&output_file->f_count);
4918 4963
4919set: 4964set:
diff --git a/kernel/profile.c b/kernel/profile.c
index a55d3a367ae8..dfadc5b729f1 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -127,8 +127,10 @@ int __ref profile_init(void)
127 return 0; 127 return 0;
128 128
129 prof_buffer = vmalloc(buffer_bytes); 129 prof_buffer = vmalloc(buffer_bytes);
130 if (prof_buffer) 130 if (prof_buffer) {
131 memset(prof_buffer, 0, buffer_bytes);
131 return 0; 132 return 0;
133 }
132 134
133 free_cpumask_var(prof_cpu_mask); 135 free_cpumask_var(prof_cpu_mask);
134 return -ENOMEM; 136 return -ENOMEM;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 9fb51237b18c..6af9cdd558b7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -14,7 +14,6 @@
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/highmem.h> 15#include <linux/highmem.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18#include <linux/ptrace.h> 17#include <linux/ptrace.h>
19#include <linux/security.h> 18#include <linux/security.h>
20#include <linux/signal.h> 19#include <linux/signal.h>
@@ -665,10 +664,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
665 struct task_struct *child; 664 struct task_struct *child;
666 long ret; 665 long ret;
667 666
668 /*
669 * This lock_kernel fixes a subtle race with suid exec
670 */
671 lock_kernel();
672 if (request == PTRACE_TRACEME) { 667 if (request == PTRACE_TRACEME) {
673 ret = ptrace_traceme(); 668 ret = ptrace_traceme();
674 if (!ret) 669 if (!ret)
@@ -702,7 +697,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
702 out_put_task_struct: 697 out_put_task_struct:
703 put_task_struct(child); 698 put_task_struct(child);
704 out: 699 out:
705 unlock_kernel();
706 return ret; 700 return ret;
707} 701}
708 702
@@ -812,10 +806,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
812 struct task_struct *child; 806 struct task_struct *child;
813 long ret; 807 long ret;
814 808
815 /*
816 * This lock_kernel fixes a subtle race with suid exec
817 */
818 lock_kernel();
819 if (request == PTRACE_TRACEME) { 809 if (request == PTRACE_TRACEME) {
820 ret = ptrace_traceme(); 810 ret = ptrace_traceme();
821 goto out; 811 goto out;
@@ -845,7 +835,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
845 out_put_task_struct: 835 out_put_task_struct:
846 put_task_struct(child); 836 put_task_struct(child);
847 out: 837 out:
848 unlock_kernel();
849 return ret; 838 return ret;
850} 839}
851#endif /* CONFIG_COMPAT */ 840#endif /* CONFIG_COMPAT */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 03a7ea1579f6..72a8dc9567f5 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,7 +44,6 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
48#include <linux/hardirq.h> 47#include <linux/hardirq.h>
49 48
50#ifdef CONFIG_DEBUG_LOCK_ALLOC 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -64,9 +63,6 @@ struct lockdep_map rcu_sched_lock_map =
64EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 63EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
65#endif 64#endif
66 65
67int rcu_scheduler_active __read_mostly;
68EXPORT_SYMBOL_GPL(rcu_scheduler_active);
69
70#ifdef CONFIG_DEBUG_LOCK_ALLOC 66#ifdef CONFIG_DEBUG_LOCK_ALLOC
71 67
72int debug_lockdep_rcu_enabled(void) 68int debug_lockdep_rcu_enabled(void)
@@ -97,21 +93,6 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
97#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 93#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
98 94
99/* 95/*
100 * This function is invoked towards the end of the scheduler's initialization
101 * process. Before this is called, the idle task might contain
102 * RCU read-side critical sections (during which time, this idle
103 * task is booting the system). After this function is called, the
104 * idle tasks are prohibited from containing RCU read-side critical
105 * sections.
106 */
107void rcu_scheduler_starting(void)
108{
109 WARN_ON(num_online_cpus() != 1);
110 WARN_ON(nr_context_switches() > 0);
111 rcu_scheduler_active = 1;
112}
113
114/*
115 * Awaken the corresponding synchronize_rcu() instance now that a 96 * Awaken the corresponding synchronize_rcu() instance now that a
116 * grace period has elapsed. 97 * grace period has elapsed.
117 */ 98 */
@@ -122,3 +103,14 @@ void wakeme_after_rcu(struct rcu_head *head)
122 rcu = container_of(head, struct rcu_synchronize, head); 103 rcu = container_of(head, struct rcu_synchronize, head);
123 complete(&rcu->completion); 104 complete(&rcu->completion);
124} 105}
106
107#ifdef CONFIG_PROVE_RCU
108/*
109 * wrapper function to avoid #include problems.
110 */
111int rcu_my_thread_group_empty(void)
112{
113 return thread_group_empty(current);
114}
115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
116#endif /* #ifdef CONFIG_PROVE_RCU */
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 9f6d9ff2572c..38729d3cd236 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -44,9 +44,9 @@ struct rcu_ctrlblk {
44}; 44};
45 45
46/* Definition for rcupdate control block. */ 46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_ctrlblk = { 47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_ctrlblk.rcucblist, 48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_ctrlblk.rcucblist, 49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50}; 50};
51 51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = { 52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
@@ -54,6 +54,11 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
54 .curtail = &rcu_bh_ctrlblk.rcucblist, 54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55}; 55};
56 56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61
57#ifdef CONFIG_NO_HZ 62#ifdef CONFIG_NO_HZ
58 63
59static long rcu_dynticks_nesting = 1; 64static long rcu_dynticks_nesting = 1;
@@ -108,7 +113,8 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
108 */ 113 */
109void rcu_sched_qs(int cpu) 114void rcu_sched_qs(int cpu)
110{ 115{
111 if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) 116 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
117 rcu_qsctr_help(&rcu_bh_ctrlblk))
112 raise_softirq(RCU_SOFTIRQ); 118 raise_softirq(RCU_SOFTIRQ);
113} 119}
114 120
@@ -173,7 +179,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
173 */ 179 */
174static void rcu_process_callbacks(struct softirq_action *unused) 180static void rcu_process_callbacks(struct softirq_action *unused)
175{ 181{
176 __rcu_process_callbacks(&rcu_ctrlblk); 182 __rcu_process_callbacks(&rcu_sched_ctrlblk);
177 __rcu_process_callbacks(&rcu_bh_ctrlblk); 183 __rcu_process_callbacks(&rcu_bh_ctrlblk);
178} 184}
179 185
@@ -187,7 +193,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
187 * 193 *
188 * Cool, huh? (Due to Josh Triplett.) 194 * Cool, huh? (Due to Josh Triplett.)
189 * 195 *
190 * But we want to make this a static inline later. 196 * But we want to make this a static inline later. The cond_resched()
197 * currently makes this problematic.
191 */ 198 */
192void synchronize_sched(void) 199void synchronize_sched(void)
193{ 200{
@@ -195,12 +202,6 @@ void synchronize_sched(void)
195} 202}
196EXPORT_SYMBOL_GPL(synchronize_sched); 203EXPORT_SYMBOL_GPL(synchronize_sched);
197 204
198void synchronize_rcu_bh(void)
199{
200 synchronize_sched();
201}
202EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
203
204/* 205/*
205 * Helper function for call_rcu() and call_rcu_bh(). 206 * Helper function for call_rcu() and call_rcu_bh().
206 */ 207 */
@@ -226,7 +227,7 @@ static void __call_rcu(struct rcu_head *head,
226 */ 227 */
227void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 228void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
228{ 229{
229 __call_rcu(head, func, &rcu_ctrlblk); 230 __call_rcu(head, func, &rcu_sched_ctrlblk);
230} 231}
231EXPORT_SYMBOL_GPL(call_rcu); 232EXPORT_SYMBOL_GPL(call_rcu);
232 233
@@ -244,11 +245,13 @@ void rcu_barrier(void)
244{ 245{
245 struct rcu_synchronize rcu; 246 struct rcu_synchronize rcu;
246 247
248 init_rcu_head_on_stack(&rcu.head);
247 init_completion(&rcu.completion); 249 init_completion(&rcu.completion);
248 /* Will wake me after RCU finished. */ 250 /* Will wake me after RCU finished. */
249 call_rcu(&rcu.head, wakeme_after_rcu); 251 call_rcu(&rcu.head, wakeme_after_rcu);
250 /* Wait for it. */ 252 /* Wait for it. */
251 wait_for_completion(&rcu.completion); 253 wait_for_completion(&rcu.completion);
254 destroy_rcu_head_on_stack(&rcu.head);
252} 255}
253EXPORT_SYMBOL_GPL(rcu_barrier); 256EXPORT_SYMBOL_GPL(rcu_barrier);
254 257
@@ -256,11 +259,13 @@ void rcu_barrier_bh(void)
256{ 259{
257 struct rcu_synchronize rcu; 260 struct rcu_synchronize rcu;
258 261
262 init_rcu_head_on_stack(&rcu.head);
259 init_completion(&rcu.completion); 263 init_completion(&rcu.completion);
260 /* Will wake me after RCU finished. */ 264 /* Will wake me after RCU finished. */
261 call_rcu_bh(&rcu.head, wakeme_after_rcu); 265 call_rcu_bh(&rcu.head, wakeme_after_rcu);
262 /* Wait for it. */ 266 /* Wait for it. */
263 wait_for_completion(&rcu.completion); 267 wait_for_completion(&rcu.completion);
268 destroy_rcu_head_on_stack(&rcu.head);
264} 269}
265EXPORT_SYMBOL_GPL(rcu_barrier_bh); 270EXPORT_SYMBOL_GPL(rcu_barrier_bh);
266 271
@@ -268,11 +273,13 @@ void rcu_barrier_sched(void)
268{ 273{
269 struct rcu_synchronize rcu; 274 struct rcu_synchronize rcu;
270 275
276 init_rcu_head_on_stack(&rcu.head);
271 init_completion(&rcu.completion); 277 init_completion(&rcu.completion);
272 /* Will wake me after RCU finished. */ 278 /* Will wake me after RCU finished. */
273 call_rcu_sched(&rcu.head, wakeme_after_rcu); 279 call_rcu_sched(&rcu.head, wakeme_after_rcu);
274 /* Wait for it. */ 280 /* Wait for it. */
275 wait_for_completion(&rcu.completion); 281 wait_for_completion(&rcu.completion);
282 destroy_rcu_head_on_stack(&rcu.head);
276} 283}
277EXPORT_SYMBOL_GPL(rcu_barrier_sched); 284EXPORT_SYMBOL_GPL(rcu_barrier_sched);
278 285
@@ -280,3 +287,5 @@ void __init rcu_init(void)
280{ 287{
281 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 288 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
282} 289}
290
291#include "rcutiny_plugin.h"
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
new file mode 100644
index 000000000000..d223a92bc742
--- /dev/null
+++ b/kernel/rcutiny_plugin.h
@@ -0,0 +1,39 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright IBM Corporation, 2009
21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#ifdef CONFIG_DEBUG_LOCK_ALLOC
26
27#include <linux/kernel_stat.h>
28
29/*
30 * During boot, we forgive RCU lockdep issues. After this function is
31 * invoked, we start taking RCU lockdep issues seriously.
32 */
33void rcu_scheduler_starting(void)
34{
35 WARN_ON(nr_context_switches() > 0);
36 rcu_scheduler_active = 1;
37}
38
39#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 2b676f3a0f26..6535ac8bc6a5 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -464,9 +464,11 @@ static void rcu_bh_torture_synchronize(void)
464{ 464{
465 struct rcu_bh_torture_synchronize rcu; 465 struct rcu_bh_torture_synchronize rcu;
466 466
467 init_rcu_head_on_stack(&rcu.head);
467 init_completion(&rcu.completion); 468 init_completion(&rcu.completion);
468 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); 469 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
469 wait_for_completion(&rcu.completion); 470 wait_for_completion(&rcu.completion);
471 destroy_rcu_head_on_stack(&rcu.head);
470} 472}
471 473
472static struct rcu_torture_ops rcu_bh_ops = { 474static struct rcu_torture_ops rcu_bh_ops = {
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 3ec8160fc75f..d4437345706f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,6 +46,7 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
49 50
50#include "rcutree.h" 51#include "rcutree.h"
51 52
@@ -53,8 +54,8 @@
53 54
54static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
55 56
56#define RCU_STATE_INITIALIZER(name) { \ 57#define RCU_STATE_INITIALIZER(structname) { \
57 .level = { &name.node[0] }, \ 58 .level = { &structname.node[0] }, \
58 .levelcnt = { \ 59 .levelcnt = { \
59 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 60 NUM_RCU_LVL_0, /* root of hierarchy. */ \
60 NUM_RCU_LVL_1, \ 61 NUM_RCU_LVL_1, \
@@ -65,13 +66,14 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
65 .signaled = RCU_GP_IDLE, \ 66 .signaled = RCU_GP_IDLE, \
66 .gpnum = -300, \ 67 .gpnum = -300, \
67 .completed = -300, \ 68 .completed = -300, \
68 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \ 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
69 .orphan_cbs_list = NULL, \ 70 .orphan_cbs_list = NULL, \
70 .orphan_cbs_tail = &name.orphan_cbs_list, \ 71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
71 .orphan_qlen = 0, \ 72 .orphan_qlen = 0, \
72 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \ 73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
73 .n_force_qs = 0, \ 74 .n_force_qs = 0, \
74 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
76 .name = #structname, \
75} 77}
76 78
77struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); 79struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
@@ -80,6 +82,9 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
80struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
81DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
82 84
85int rcu_scheduler_active __read_mostly;
86EXPORT_SYMBOL_GPL(rcu_scheduler_active);
87
83/* 88/*
84 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 89 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
85 * permit this function to be invoked without holding the root rcu_node 90 * permit this function to be invoked without holding the root rcu_node
@@ -97,25 +102,32 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
97 */ 102 */
98void rcu_sched_qs(int cpu) 103void rcu_sched_qs(int cpu)
99{ 104{
100 struct rcu_data *rdp; 105 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
101 106
102 rdp = &per_cpu(rcu_sched_data, cpu);
103 rdp->passed_quiesc_completed = rdp->gpnum - 1; 107 rdp->passed_quiesc_completed = rdp->gpnum - 1;
104 barrier(); 108 barrier();
105 rdp->passed_quiesc = 1; 109 rdp->passed_quiesc = 1;
106 rcu_preempt_note_context_switch(cpu);
107} 110}
108 111
109void rcu_bh_qs(int cpu) 112void rcu_bh_qs(int cpu)
110{ 113{
111 struct rcu_data *rdp; 114 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
112 115
113 rdp = &per_cpu(rcu_bh_data, cpu);
114 rdp->passed_quiesc_completed = rdp->gpnum - 1; 116 rdp->passed_quiesc_completed = rdp->gpnum - 1;
115 barrier(); 117 barrier();
116 rdp->passed_quiesc = 1; 118 rdp->passed_quiesc = 1;
117} 119}
118 120
121/*
122 * Note a context switch. This is a quiescent state for RCU-sched,
123 * and requires special handling for preemptible RCU.
124 */
125void rcu_note_context_switch(int cpu)
126{
127 rcu_sched_qs(cpu);
128 rcu_preempt_note_context_switch(cpu);
129}
130
119#ifdef CONFIG_NO_HZ 131#ifdef CONFIG_NO_HZ
120DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 132DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
121 .dynticks_nesting = 1, 133 .dynticks_nesting = 1,
@@ -438,6 +450,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
438 450
439#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 451#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
440 452
453int rcu_cpu_stall_panicking __read_mostly;
454
441static void record_gp_stall_check_time(struct rcu_state *rsp) 455static void record_gp_stall_check_time(struct rcu_state *rsp)
442{ 456{
443 rsp->gp_start = jiffies; 457 rsp->gp_start = jiffies;
@@ -470,7 +484,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
470 484
471 /* OK, time to rat on our buddy... */ 485 /* OK, time to rat on our buddy... */
472 486
473 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 487 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
488 rsp->name);
474 rcu_for_each_leaf_node(rsp, rnp) { 489 rcu_for_each_leaf_node(rsp, rnp) {
475 raw_spin_lock_irqsave(&rnp->lock, flags); 490 raw_spin_lock_irqsave(&rnp->lock, flags);
476 rcu_print_task_stall(rnp); 491 rcu_print_task_stall(rnp);
@@ -481,7 +496,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
481 if (rnp->qsmask & (1UL << cpu)) 496 if (rnp->qsmask & (1UL << cpu))
482 printk(" %d", rnp->grplo + cpu); 497 printk(" %d", rnp->grplo + cpu);
483 } 498 }
484 printk(" (detected by %d, t=%ld jiffies)\n", 499 printk("} (detected by %d, t=%ld jiffies)\n",
485 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 500 smp_processor_id(), (long)(jiffies - rsp->gp_start));
486 trigger_all_cpu_backtrace(); 501 trigger_all_cpu_backtrace();
487 502
@@ -497,8 +512,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
497 unsigned long flags; 512 unsigned long flags;
498 struct rcu_node *rnp = rcu_get_root(rsp); 513 struct rcu_node *rnp = rcu_get_root(rsp);
499 514
500 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", 515 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
501 smp_processor_id(), jiffies - rsp->gp_start); 516 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
502 trigger_all_cpu_backtrace(); 517 trigger_all_cpu_backtrace();
503 518
504 raw_spin_lock_irqsave(&rnp->lock, flags); 519 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -515,6 +530,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
515 long delta; 530 long delta;
516 struct rcu_node *rnp; 531 struct rcu_node *rnp;
517 532
533 if (rcu_cpu_stall_panicking)
534 return;
518 delta = jiffies - rsp->jiffies_stall; 535 delta = jiffies - rsp->jiffies_stall;
519 rnp = rdp->mynode; 536 rnp = rdp->mynode;
520 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { 537 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
@@ -529,6 +546,21 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
529 } 546 }
530} 547}
531 548
549static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
550{
551 rcu_cpu_stall_panicking = 1;
552 return NOTIFY_DONE;
553}
554
555static struct notifier_block rcu_panic_block = {
556 .notifier_call = rcu_panic,
557};
558
559static void __init check_cpu_stall_init(void)
560{
561 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
562}
563
532#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 564#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
533 565
534static void record_gp_stall_check_time(struct rcu_state *rsp) 566static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -539,6 +571,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
539{ 571{
540} 572}
541 573
574static void __init check_cpu_stall_init(void)
575{
576}
577
542#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 578#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
543 579
544/* 580/*
@@ -1125,8 +1161,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1125 */ 1161 */
1126void rcu_check_callbacks(int cpu, int user) 1162void rcu_check_callbacks(int cpu, int user)
1127{ 1163{
1128 if (!rcu_pending(cpu))
1129 return; /* if nothing for RCU to do. */
1130 if (user || 1164 if (user ||
1131 (idle_cpu(cpu) && rcu_scheduler_active && 1165 (idle_cpu(cpu) && rcu_scheduler_active &&
1132 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1166 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1158,7 +1192,8 @@ void rcu_check_callbacks(int cpu, int user)
1158 rcu_bh_qs(cpu); 1192 rcu_bh_qs(cpu);
1159 } 1193 }
1160 rcu_preempt_check_callbacks(cpu); 1194 rcu_preempt_check_callbacks(cpu);
1161 raise_softirq(RCU_SOFTIRQ); 1195 if (rcu_pending(cpu))
1196 raise_softirq(RCU_SOFTIRQ);
1162} 1197}
1163 1198
1164#ifdef CONFIG_SMP 1199#ifdef CONFIG_SMP
@@ -1236,11 +1271,11 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1236 break; /* grace period idle or initializing, ignore. */ 1271 break; /* grace period idle or initializing, ignore. */
1237 1272
1238 case RCU_SAVE_DYNTICK: 1273 case RCU_SAVE_DYNTICK:
1239
1240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1241 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1274 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1242 break; /* So gcc recognizes the dead code. */ 1275 break; /* So gcc recognizes the dead code. */
1243 1276
1277 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1278
1244 /* Record dyntick-idle state. */ 1279 /* Record dyntick-idle state. */
1245 force_qs_rnp(rsp, dyntick_save_progress_counter); 1280 force_qs_rnp(rsp, dyntick_save_progress_counter);
1246 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1281 raw_spin_lock(&rnp->lock); /* irqs already disabled */
@@ -1449,11 +1484,13 @@ void synchronize_sched(void)
1449 if (rcu_blocking_is_gp()) 1484 if (rcu_blocking_is_gp())
1450 return; 1485 return;
1451 1486
1487 init_rcu_head_on_stack(&rcu.head);
1452 init_completion(&rcu.completion); 1488 init_completion(&rcu.completion);
1453 /* Will wake me after RCU finished. */ 1489 /* Will wake me after RCU finished. */
1454 call_rcu_sched(&rcu.head, wakeme_after_rcu); 1490 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1455 /* Wait for it. */ 1491 /* Wait for it. */
1456 wait_for_completion(&rcu.completion); 1492 wait_for_completion(&rcu.completion);
1493 destroy_rcu_head_on_stack(&rcu.head);
1457} 1494}
1458EXPORT_SYMBOL_GPL(synchronize_sched); 1495EXPORT_SYMBOL_GPL(synchronize_sched);
1459 1496
@@ -1473,11 +1510,13 @@ void synchronize_rcu_bh(void)
1473 if (rcu_blocking_is_gp()) 1510 if (rcu_blocking_is_gp())
1474 return; 1511 return;
1475 1512
1513 init_rcu_head_on_stack(&rcu.head);
1476 init_completion(&rcu.completion); 1514 init_completion(&rcu.completion);
1477 /* Will wake me after RCU finished. */ 1515 /* Will wake me after RCU finished. */
1478 call_rcu_bh(&rcu.head, wakeme_after_rcu); 1516 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1479 /* Wait for it. */ 1517 /* Wait for it. */
1480 wait_for_completion(&rcu.completion); 1518 wait_for_completion(&rcu.completion);
1519 destroy_rcu_head_on_stack(&rcu.head);
1481} 1520}
1482EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1521EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1483 1522
@@ -1498,8 +1537,20 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1498 check_cpu_stall(rsp, rdp); 1537 check_cpu_stall(rsp, rdp);
1499 1538
1500 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1539 /* Is the RCU core waiting for a quiescent state from this CPU? */
1501 if (rdp->qs_pending) { 1540 if (rdp->qs_pending && !rdp->passed_quiesc) {
1541
1542 /*
1543 * If force_quiescent_state() coming soon and this CPU
1544 * needs a quiescent state, and this is either RCU-sched
1545 * or RCU-bh, force a local reschedule.
1546 */
1502 rdp->n_rp_qs_pending++; 1547 rdp->n_rp_qs_pending++;
1548 if (!rdp->preemptable &&
1549 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1550 jiffies))
1551 set_need_resched();
1552 } else if (rdp->qs_pending && rdp->passed_quiesc) {
1553 rdp->n_rp_report_qs++;
1503 return 1; 1554 return 1;
1504 } 1555 }
1505 1556
@@ -1767,6 +1818,21 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1767} 1818}
1768 1819
1769/* 1820/*
1821 * This function is invoked towards the end of the scheduler's initialization
1822 * process. Before this is called, the idle task might contain
1823 * RCU read-side critical sections (during which time, this idle
1824 * task is booting the system). After this function is called, the
1825 * idle tasks are prohibited from containing RCU read-side critical
1826 * sections. This function also enables RCU lockdep checking.
1827 */
1828void rcu_scheduler_starting(void)
1829{
1830 WARN_ON(num_online_cpus() != 1);
1831 WARN_ON(nr_context_switches() > 0);
1832 rcu_scheduler_active = 1;
1833}
1834
1835/*
1770 * Compute the per-level fanout, either using the exact fanout specified 1836 * Compute the per-level fanout, either using the exact fanout specified
1771 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. 1837 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
1772 */ 1838 */
@@ -1849,6 +1915,14 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1849 INIT_LIST_HEAD(&rnp->blocked_tasks[3]); 1915 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1850 } 1916 }
1851 } 1917 }
1918
1919 rnp = rsp->level[NUM_RCU_LVLS - 1];
1920 for_each_possible_cpu(i) {
1921 while (i > rnp->grphi)
1922 rnp++;
1923 rsp->rda[i]->mynode = rnp;
1924 rcu_boot_init_percpu_data(i, rsp);
1925 }
1852} 1926}
1853 1927
1854/* 1928/*
@@ -1859,19 +1933,11 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1859#define RCU_INIT_FLAVOR(rsp, rcu_data) \ 1933#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1860do { \ 1934do { \
1861 int i; \ 1935 int i; \
1862 int j; \
1863 struct rcu_node *rnp; \
1864 \ 1936 \
1865 rcu_init_one(rsp); \
1866 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1867 j = 0; \
1868 for_each_possible_cpu(i) { \ 1937 for_each_possible_cpu(i) { \
1869 if (i > rnp[j].grphi) \
1870 j++; \
1871 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1872 (rsp)->rda[i] = &per_cpu(rcu_data, i); \ 1938 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1873 rcu_boot_init_percpu_data(i, rsp); \
1874 } \ 1939 } \
1940 rcu_init_one(rsp); \
1875} while (0) 1941} while (0)
1876 1942
1877void __init rcu_init(void) 1943void __init rcu_init(void)
@@ -1879,12 +1945,6 @@ void __init rcu_init(void)
1879 int cpu; 1945 int cpu;
1880 1946
1881 rcu_bootup_announce(); 1947 rcu_bootup_announce();
1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1883 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1884#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1885#if NUM_RCU_LVL_4 != 0
1886 printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
1887#endif /* #if NUM_RCU_LVL_4 != 0 */
1888 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1948 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1889 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1949 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1890 __rcu_init_preempt(); 1950 __rcu_init_preempt();
@@ -1898,6 +1958,7 @@ void __init rcu_init(void)
1898 cpu_notifier(rcu_cpu_notify, 0); 1958 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(cpu) 1959 for_each_online_cpu(cpu)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 1960 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1961 check_cpu_stall_init();
1901} 1962}
1902 1963
1903#include "rcutree_plugin.h" 1964#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4a525a30e08e..14c040b18ed0 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -223,6 +223,7 @@ struct rcu_data {
223 /* 5) __rcu_pending() statistics. */ 223 /* 5) __rcu_pending() statistics. */
224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
225 unsigned long n_rp_qs_pending; 225 unsigned long n_rp_qs_pending;
226 unsigned long n_rp_report_qs;
226 unsigned long n_rp_cb_ready; 227 unsigned long n_rp_cb_ready;
227 unsigned long n_rp_cpu_needs_gp; 228 unsigned long n_rp_cpu_needs_gp;
228 unsigned long n_rp_gp_completed; 229 unsigned long n_rp_gp_completed;
@@ -326,6 +327,7 @@ struct rcu_state {
326 unsigned long jiffies_stall; /* Time at which to check */ 327 unsigned long jiffies_stall; /* Time at which to check */
327 /* for CPU stalls. */ 328 /* for CPU stalls. */
328#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 329#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
330 char *name; /* Name of structure. */
329}; 331};
330 332
331/* Return values for rcu_preempt_offline_tasks(). */ 333/* Return values for rcu_preempt_offline_tasks(). */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 79b53bda8943..0e4f420245d9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -26,6 +26,45 @@
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28 28
29/*
30 * Check the RCU kernel configuration parameters and print informative
31 * messages about anything out of the ordinary. If you like #ifdef, you
32 * will love this function.
33 */
34static void __init rcu_bootup_announce_oddness(void)
35{
36#ifdef CONFIG_RCU_TRACE
37 printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n");
38#endif
39#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
40 printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
41 CONFIG_RCU_FANOUT);
42#endif
43#ifdef CONFIG_RCU_FANOUT_EXACT
44 printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n");
45#endif
46#ifdef CONFIG_RCU_FAST_NO_HZ
47 printk(KERN_INFO
48 "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
49#endif
50#ifdef CONFIG_PROVE_RCU
51 printk(KERN_INFO "\tRCU lockdep checking is enabled.\n");
52#endif
53#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
54 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
55#endif
56#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
57 printk(KERN_INFO
58 "\tRCU-based detection of stalled CPUs is disabled.\n");
59#endif
60#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
62#endif
63#if NUM_RCU_LVL_4 != 0
64 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
65#endif
66}
67
29#ifdef CONFIG_TREE_PREEMPT_RCU 68#ifdef CONFIG_TREE_PREEMPT_RCU
30 69
31struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 70struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
@@ -38,8 +77,8 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
38 */ 77 */
39static void __init rcu_bootup_announce(void) 78static void __init rcu_bootup_announce(void)
40{ 79{
41 printk(KERN_INFO 80 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n");
42 "Experimental preemptable hierarchical RCU implementation.\n"); 81 rcu_bootup_announce_oddness();
43} 82}
44 83
45/* 84/*
@@ -75,13 +114,19 @@ EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
75 * that this just means that the task currently running on the CPU is 114 * that this just means that the task currently running on the CPU is
76 * not in a quiescent state. There might be any number of tasks blocked 115 * not in a quiescent state. There might be any number of tasks blocked
77 * while in an RCU read-side critical section. 116 * while in an RCU read-side critical section.
117 *
118 * Unlike the other rcu_*_qs() functions, callers to this function
119 * must disable irqs in order to protect the assignment to
120 * ->rcu_read_unlock_special.
78 */ 121 */
79static void rcu_preempt_qs(int cpu) 122static void rcu_preempt_qs(int cpu)
80{ 123{
81 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 124 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
125
82 rdp->passed_quiesc_completed = rdp->gpnum - 1; 126 rdp->passed_quiesc_completed = rdp->gpnum - 1;
83 barrier(); 127 barrier();
84 rdp->passed_quiesc = 1; 128 rdp->passed_quiesc = 1;
129 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
85} 130}
86 131
87/* 132/*
@@ -144,9 +189,8 @@ static void rcu_preempt_note_context_switch(int cpu)
144 * grace period, then the fact that the task has been enqueued 189 * grace period, then the fact that the task has been enqueued
145 * means that we continue to block the current grace period. 190 * means that we continue to block the current grace period.
146 */ 191 */
147 rcu_preempt_qs(cpu);
148 local_irq_save(flags); 192 local_irq_save(flags);
149 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 193 rcu_preempt_qs(cpu);
150 local_irq_restore(flags); 194 local_irq_restore(flags);
151} 195}
152 196
@@ -236,7 +280,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
236 */ 280 */
237 special = t->rcu_read_unlock_special; 281 special = t->rcu_read_unlock_special;
238 if (special & RCU_READ_UNLOCK_NEED_QS) { 282 if (special & RCU_READ_UNLOCK_NEED_QS) {
239 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
240 rcu_preempt_qs(smp_processor_id()); 283 rcu_preempt_qs(smp_processor_id());
241 } 284 }
242 285
@@ -473,7 +516,6 @@ static void rcu_preempt_check_callbacks(int cpu)
473 struct task_struct *t = current; 516 struct task_struct *t = current;
474 517
475 if (t->rcu_read_lock_nesting == 0) { 518 if (t->rcu_read_lock_nesting == 0) {
476 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
477 rcu_preempt_qs(cpu); 519 rcu_preempt_qs(cpu);
478 return; 520 return;
479 } 521 }
@@ -515,11 +557,13 @@ void synchronize_rcu(void)
515 if (!rcu_scheduler_active) 557 if (!rcu_scheduler_active)
516 return; 558 return;
517 559
560 init_rcu_head_on_stack(&rcu.head);
518 init_completion(&rcu.completion); 561 init_completion(&rcu.completion);
519 /* Will wake me after RCU finished. */ 562 /* Will wake me after RCU finished. */
520 call_rcu(&rcu.head, wakeme_after_rcu); 563 call_rcu(&rcu.head, wakeme_after_rcu);
521 /* Wait for it. */ 564 /* Wait for it. */
522 wait_for_completion(&rcu.completion); 565 wait_for_completion(&rcu.completion);
566 destroy_rcu_head_on_stack(&rcu.head);
523} 567}
524EXPORT_SYMBOL_GPL(synchronize_rcu); 568EXPORT_SYMBOL_GPL(synchronize_rcu);
525 569
@@ -754,6 +798,7 @@ void exit_rcu(void)
754static void __init rcu_bootup_announce(void) 798static void __init rcu_bootup_announce(void)
755{ 799{
756 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 800 printk(KERN_INFO "Hierarchical RCU implementation.\n");
801 rcu_bootup_announce_oddness();
757} 802}
758 803
759/* 804/*
@@ -1008,6 +1053,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1008int rcu_needs_cpu(int cpu) 1053int rcu_needs_cpu(int cpu)
1009{ 1054{
1010 int c = 0; 1055 int c = 0;
1056 int snap;
1057 int snap_nmi;
1011 int thatcpu; 1058 int thatcpu;
1012 1059
1013 /* Check for being in the holdoff period. */ 1060 /* Check for being in the holdoff period. */
@@ -1015,12 +1062,18 @@ int rcu_needs_cpu(int cpu)
1015 return rcu_needs_cpu_quick_check(cpu); 1062 return rcu_needs_cpu_quick_check(cpu);
1016 1063
1017 /* Don't bother unless we are the last non-dyntick-idle CPU. */ 1064 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1018 for_each_cpu_not(thatcpu, nohz_cpu_mask) 1065 for_each_online_cpu(thatcpu) {
1019 if (thatcpu != cpu) { 1066 if (thatcpu == cpu)
1067 continue;
1068 snap = per_cpu(rcu_dynticks, thatcpu).dynticks;
1069 snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi;
1070 smp_mb(); /* Order sampling of snap with end of grace period. */
1071 if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
1020 per_cpu(rcu_dyntick_drain, cpu) = 0; 1072 per_cpu(rcu_dyntick_drain, cpu) = 0;
1021 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 1073 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1022 return rcu_needs_cpu_quick_check(cpu); 1074 return rcu_needs_cpu_quick_check(cpu);
1023 } 1075 }
1076 }
1024 1077
1025 /* Check and update the rcu_dyntick_drain sequencing. */ 1078 /* Check and update the rcu_dyntick_drain sequencing. */
1026 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 1079 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d45db2e35d27..36c95b45738e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -241,11 +241,13 @@ static const struct file_operations rcugp_fops = {
241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
242{ 242{
243 seq_printf(m, "%3d%cnp=%ld " 243 seq_printf(m, "%3d%cnp=%ld "
244 "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", 244 "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
245 "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
245 rdp->cpu, 246 rdp->cpu,
246 cpu_is_offline(rdp->cpu) ? '!' : ' ', 247 cpu_is_offline(rdp->cpu) ? '!' : ' ',
247 rdp->n_rcu_pending, 248 rdp->n_rcu_pending,
248 rdp->n_rp_qs_pending, 249 rdp->n_rp_qs_pending,
250 rdp->n_rp_report_qs,
249 rdp->n_rp_cb_ready, 251 rdp->n_rp_cb_ready,
250 rdp->n_rp_cpu_needs_gp, 252 rdp->n_rp_cpu_needs_gp,
251 rdp->n_rp_gp_completed, 253 rdp->n_rp_gp_completed,
diff --git a/kernel/sched.c b/kernel/sched.c
index 78554dd0d1a4..1d93cd0ae4d3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3608,7 +3608,7 @@ need_resched:
3608 preempt_disable(); 3608 preempt_disable();
3609 cpu = smp_processor_id(); 3609 cpu = smp_processor_id();
3610 rq = cpu_rq(cpu); 3610 rq = cpu_rq(cpu);
3611 rcu_sched_qs(cpu); 3611 rcu_note_context_switch(cpu);
3612 prev = rq->curr; 3612 prev = rq->curr;
3613 switch_count = &prev->nivcsw; 3613 switch_count = &prev->nivcsw;
3614 3614
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 9cf1baf6616a..87a330a7185f 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
114 { 114 {
115 char path[64]; 115 char path[64];
116 116
117 rcu_read_lock();
117 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); 118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
118 SEQ_printf(m, " %s", path); 120 SEQ_printf(m, " %s", path);
119 } 121 }
120#endif 122#endif
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7c1a67ef0274..0db913a5c60f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -716,7 +716,7 @@ static int run_ksoftirqd(void * __bind_cpu)
716 preempt_enable_no_resched(); 716 preempt_enable_no_resched();
717 cond_resched(); 717 cond_resched();
718 preempt_disable(); 718 preempt_disable();
719 rcu_sched_qs((long)__bind_cpu); 719 rcu_note_context_switch((long)__bind_cpu);
720 } 720 }
721 preempt_enable(); 721 preempt_enable();
722 set_current_state(TASK_INTERRUPTIBLE); 722 set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index ef51d1fcf5e6..b4e7431e7c78 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -294,7 +294,6 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu; 295 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct cpu_stop_work *work;
298 struct task_struct *p; 297 struct task_struct *p;
299 298
300 switch (action & ~CPU_TASKS_FROZEN) { 299 switch (action & ~CPU_TASKS_FROZEN) {
@@ -323,6 +322,9 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
323#ifdef CONFIG_HOTPLUG_CPU 322#ifdef CONFIG_HOTPLUG_CPU
324 case CPU_UP_CANCELED: 323 case CPU_UP_CANCELED:
325 case CPU_DEAD: 324 case CPU_DEAD:
325 {
326 struct cpu_stop_work *work;
327
326 /* kill the stopper */ 328 /* kill the stopper */
327 kthread_stop(stopper->thread); 329 kthread_stop(stopper->thread);
328 /* drain remaining works */ 330 /* drain remaining works */
@@ -335,6 +337,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
335 put_task_struct(stopper->thread); 337 put_task_struct(stopper->thread);
336 stopper->thread = NULL; 338 stopper->thread = NULL;
337 break; 339 break;
340 }
338#endif 341#endif
339 } 342 }
340 343
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0a47e8d6b491..26b8607a0abc 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,13 +9,9 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
13EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
14
15EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); 12EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
16 13
17static char *perf_trace_buf; 14static char *perf_trace_buf[4];
18static char *perf_trace_buf_nmi;
19 15
20/* 16/*
21 * Force it to be aligned to unsigned long to avoid misaligned accesses 17 * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -27,63 +23,82 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
27/* Count the events in use (per event id, not per instance) */ 23/* Count the events in use (per event id, not per instance) */
28static int total_ref_count; 24static int total_ref_count;
29 25
30static int perf_trace_event_enable(struct ftrace_event_call *event) 26static int perf_trace_event_init(struct ftrace_event_call *tp_event,
27 struct perf_event *p_event)
31{ 28{
32 char *buf; 29 struct hlist_head *list;
33 int ret = -ENOMEM; 30 int ret = -ENOMEM;
31 int cpu;
34 32
35 if (event->perf_refcount++ > 0) 33 p_event->tp_event = tp_event;
34 if (tp_event->perf_refcount++ > 0)
36 return 0; 35 return 0;
37 36
38 if (!total_ref_count) { 37 list = alloc_percpu(struct hlist_head);
39 buf = (char *)alloc_percpu(perf_trace_t); 38 if (!list)
40 if (!buf) 39 goto fail;
41 goto fail_buf; 40
41 for_each_possible_cpu(cpu)
42 INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
42 43
43 rcu_assign_pointer(perf_trace_buf, buf); 44 tp_event->perf_events = list;
45
46 if (!total_ref_count) {
47 char *buf;
48 int i;
44 49
45 buf = (char *)alloc_percpu(perf_trace_t); 50 for (i = 0; i < 4; i++) {
46 if (!buf) 51 buf = (char *)alloc_percpu(perf_trace_t);
47 goto fail_buf_nmi; 52 if (!buf)
53 goto fail;
48 54
49 rcu_assign_pointer(perf_trace_buf_nmi, buf); 55 perf_trace_buf[i] = buf;
56 }
50 } 57 }
51 58
52 if (event->class->reg) 59 if (tp_event->class->reg)
53 ret = event->class->reg(event, TRACE_REG_PERF_REGISTER); 60 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
54 else 61 else
55 ret = tracepoint_probe_register(event->name, 62 ret = tracepoint_probe_register(tp_event->name,
56 event->class->perf_probe, 63 tp_event->class->perf_probe,
57 event); 64 tp_event);
58 if (!ret) { 65
59 total_ref_count++; 66 if (ret)
60 return 0; 67 goto fail;
61 }
62 68
63fail_buf_nmi: 69 total_ref_count++;
70 return 0;
71
72fail:
64 if (!total_ref_count) { 73 if (!total_ref_count) {
65 free_percpu(perf_trace_buf_nmi); 74 int i;
66 free_percpu(perf_trace_buf); 75
67 perf_trace_buf_nmi = NULL; 76 for (i = 0; i < 4; i++) {
68 perf_trace_buf = NULL; 77 free_percpu(perf_trace_buf[i]);
78 perf_trace_buf[i] = NULL;
79 }
80 }
81
82 if (!--tp_event->perf_refcount) {
83 free_percpu(tp_event->perf_events);
84 tp_event->perf_events = NULL;
69 } 85 }
70fail_buf:
71 event->perf_refcount--;
72 86
73 return ret; 87 return ret;
74} 88}
75 89
76int perf_trace_enable(int event_id) 90int perf_trace_init(struct perf_event *p_event)
77{ 91{
78 struct ftrace_event_call *event; 92 struct ftrace_event_call *tp_event;
93 int event_id = p_event->attr.config;
79 int ret = -EINVAL; 94 int ret = -EINVAL;
80 95
81 mutex_lock(&event_mutex); 96 mutex_lock(&event_mutex);
82 list_for_each_entry(event, &ftrace_events, list) { 97 list_for_each_entry(tp_event, &ftrace_events, list) {
83 if (event->event.type == event_id && 98 if (tp_event->event.type == event_id &&
84 event->class && event->class->perf_probe && 99 tp_event->class && tp_event->class->perf_probe &&
85 try_module_get(event->mod)) { 100 try_module_get(tp_event->mod)) {
86 ret = perf_trace_event_enable(event); 101 ret = perf_trace_event_init(tp_event, p_event);
87 break; 102 break;
88 } 103 }
89 } 104 }
@@ -92,93 +107,76 @@ int perf_trace_enable(int event_id)
92 return ret; 107 return ret;
93} 108}
94 109
95static void perf_trace_event_disable(struct ftrace_event_call *event) 110int perf_trace_enable(struct perf_event *p_event)
96{ 111{
97 char *buf, *nmi_buf; 112 struct ftrace_event_call *tp_event = p_event->tp_event;
98 113 struct hlist_head *list;
99 if (--event->perf_refcount > 0)
100 return;
101
102 if (event->class->reg)
103 event->class->reg(event, TRACE_REG_PERF_UNREGISTER);
104 else
105 tracepoint_probe_unregister(event->name, event->class->perf_probe, event);
106 114
107 if (!--total_ref_count) { 115 list = tp_event->perf_events;
108 buf = perf_trace_buf; 116 if (WARN_ON_ONCE(!list))
109 rcu_assign_pointer(perf_trace_buf, NULL); 117 return -EINVAL;
110 118
111 nmi_buf = perf_trace_buf_nmi; 119 list = per_cpu_ptr(list, smp_processor_id());
112 rcu_assign_pointer(perf_trace_buf_nmi, NULL); 120 hlist_add_head_rcu(&p_event->hlist_entry, list);
113 121
114 /* 122 return 0;
115 * Ensure every events in profiling have finished before 123}
116 * releasing the buffers
117 */
118 synchronize_sched();
119 124
120 free_percpu(buf); 125void perf_trace_disable(struct perf_event *p_event)
121 free_percpu(nmi_buf); 126{
122 } 127 hlist_del_rcu(&p_event->hlist_entry);
123} 128}
124 129
125void perf_trace_disable(int event_id) 130void perf_trace_destroy(struct perf_event *p_event)
126{ 131{
127 struct ftrace_event_call *event; 132 struct ftrace_event_call *tp_event = p_event->tp_event;
133 int i;
128 134
129 mutex_lock(&event_mutex); 135 if (--tp_event->perf_refcount > 0)
130 list_for_each_entry(event, &ftrace_events, list) { 136 return;
131 if (event->event.type == event_id) { 137
132 perf_trace_event_disable(event); 138 if (tp_event->class->reg)
133 module_put(event->mod); 139 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
134 break; 140 else
141 tracepoint_probe_unregister(tp_event->name,
142 tp_event->class->perf_probe,
143 tp_event);
144
145 free_percpu(tp_event->perf_events);
146 tp_event->perf_events = NULL;
147
148 if (!--total_ref_count) {
149 for (i = 0; i < 4; i++) {
150 free_percpu(perf_trace_buf[i]);
151 perf_trace_buf[i] = NULL;
135 } 152 }
136 } 153 }
137 mutex_unlock(&event_mutex);
138} 154}
139 155
140__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, 156__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
141 int *rctxp, unsigned long *irq_flags) 157 struct pt_regs *regs, int *rctxp)
142{ 158{
143 struct trace_entry *entry; 159 struct trace_entry *entry;
144 char *trace_buf, *raw_data; 160 char *raw_data;
145 int pc, cpu; 161 int pc;
146 162
147 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); 163 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
148 164
149 pc = preempt_count(); 165 pc = preempt_count();
150 166
151 /* Protect the per cpu buffer, begin the rcu read side */
152 local_irq_save(*irq_flags);
153
154 *rctxp = perf_swevent_get_recursion_context(); 167 *rctxp = perf_swevent_get_recursion_context();
155 if (*rctxp < 0) 168 if (*rctxp < 0)
156 goto err_recursion; 169 return NULL;
157
158 cpu = smp_processor_id();
159
160 if (in_nmi())
161 trace_buf = rcu_dereference_sched(perf_trace_buf_nmi);
162 else
163 trace_buf = rcu_dereference_sched(perf_trace_buf);
164
165 if (!trace_buf)
166 goto err;
167 170
168 raw_data = per_cpu_ptr(trace_buf, cpu); 171 raw_data = per_cpu_ptr(perf_trace_buf[*rctxp], smp_processor_id());
169 172
170 /* zero the dead bytes from align to not leak stack to user */ 173 /* zero the dead bytes from align to not leak stack to user */
171 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); 174 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
172 175
173 entry = (struct trace_entry *)raw_data; 176 entry = (struct trace_entry *)raw_data;
174 tracing_generic_entry_update(entry, *irq_flags, pc); 177 tracing_generic_entry_update(entry, regs->flags, pc);
175 entry->type = type; 178 entry->type = type;
176 179
177 return raw_data; 180 return raw_data;
178err:
179 perf_swevent_put_recursion_context(*rctxp);
180err_recursion:
181 local_irq_restore(*irq_flags);
182 return NULL;
183} 181}
184EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); 182EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9a082bba9537..faf7cefd15da 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1338,9 +1338,9 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1338 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1338 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1339 struct ftrace_event_call *call = &tp->call; 1339 struct ftrace_event_call *call = &tp->call;
1340 struct kprobe_trace_entry_head *entry; 1340 struct kprobe_trace_entry_head *entry;
1341 struct hlist_head *head;
1341 u8 *data; 1342 u8 *data;
1342 int size, __size, i; 1343 int size, __size, i;
1343 unsigned long irq_flags;
1344 int rctx; 1344 int rctx;
1345 1345
1346 __size = sizeof(*entry) + tp->size; 1346 __size = sizeof(*entry) + tp->size;
@@ -1350,8 +1350,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1350 "profile buffer not large enough")) 1350 "profile buffer not large enough"))
1351 return; 1351 return;
1352 1352
1353 entry = perf_trace_buf_prepare(size, call->event.type, 1353 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1354 &rctx, &irq_flags);
1355 if (!entry) 1354 if (!entry)
1356 return; 1355 return;
1357 1356
@@ -1360,7 +1359,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1360 for (i = 0; i < tp->nr_args; i++) 1359 for (i = 0; i < tp->nr_args; i++)
1361 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); 1360 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1362 1361
1363 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); 1362 head = per_cpu_ptr(call->perf_events, smp_processor_id());
1363 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
1364} 1364}
1365 1365
1366/* Kretprobe profile handler */ 1366/* Kretprobe profile handler */
@@ -1370,9 +1370,9 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1370 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1370 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1371 struct ftrace_event_call *call = &tp->call; 1371 struct ftrace_event_call *call = &tp->call;
1372 struct kretprobe_trace_entry_head *entry; 1372 struct kretprobe_trace_entry_head *entry;
1373 struct hlist_head *head;
1373 u8 *data; 1374 u8 *data;
1374 int size, __size, i; 1375 int size, __size, i;
1375 unsigned long irq_flags;
1376 int rctx; 1376 int rctx;
1377 1377
1378 __size = sizeof(*entry) + tp->size; 1378 __size = sizeof(*entry) + tp->size;
@@ -1382,8 +1382,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1382 "profile buffer not large enough")) 1382 "profile buffer not large enough"))
1383 return; 1383 return;
1384 1384
1385 entry = perf_trace_buf_prepare(size, call->event.type, 1385 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1386 &rctx, &irq_flags);
1387 if (!entry) 1386 if (!entry)
1388 return; 1387 return;
1389 1388
@@ -1393,8 +1392,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1393 for (i = 0; i < tp->nr_args; i++) 1392 for (i = 0; i < tp->nr_args; i++)
1394 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); 1393 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1395 1394
1396 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, 1395 head = per_cpu_ptr(call->perf_events, smp_processor_id());
1397 irq_flags, regs); 1396 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
1398} 1397}
1399 1398
1400static int probe_perf_enable(struct ftrace_event_call *call) 1399static int probe_perf_enable(struct ftrace_event_call *call)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 9d358301ae3e..d2c859cec9ea 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -488,7 +488,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
488{ 488{
489 struct syscall_metadata *sys_data; 489 struct syscall_metadata *sys_data;
490 struct syscall_trace_enter *rec; 490 struct syscall_trace_enter *rec;
491 unsigned long flags; 491 struct hlist_head *head;
492 int syscall_nr; 492 int syscall_nr;
493 int rctx; 493 int rctx;
494 int size; 494 int size;
@@ -511,15 +511,16 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
511 return; 511 return;
512 512
513 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 513 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
514 sys_data->enter_event->event.type, 514 sys_data->enter_event->event.type, regs, &rctx);
515 &rctx, &flags);
516 if (!rec) 515 if (!rec)
517 return; 516 return;
518 517
519 rec->nr = syscall_nr; 518 rec->nr = syscall_nr;
520 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 519 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
521 (unsigned long *)&rec->args); 520 (unsigned long *)&rec->args);
522 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); 521
522 head = per_cpu_ptr(sys_data->enter_event->perf_events, smp_processor_id());
523 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
523} 524}
524 525
525int perf_sysenter_enable(struct ftrace_event_call *call) 526int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -561,7 +562,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
561{ 562{
562 struct syscall_metadata *sys_data; 563 struct syscall_metadata *sys_data;
563 struct syscall_trace_exit *rec; 564 struct syscall_trace_exit *rec;
564 unsigned long flags; 565 struct hlist_head *head;
565 int syscall_nr; 566 int syscall_nr;
566 int rctx; 567 int rctx;
567 int size; 568 int size;
@@ -587,15 +588,15 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
587 return; 588 return;
588 589
589 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 590 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
590 sys_data->exit_event->event.type, 591 sys_data->exit_event->event.type, regs, &rctx);
591 &rctx, &flags);
592 if (!rec) 592 if (!rec)
593 return; 593 return;
594 594
595 rec->nr = syscall_nr; 595 rec->nr = syscall_nr;
596 rec->ret = syscall_get_return_value(current, regs); 596 rec->ret = syscall_get_return_value(current, regs);
597 597
598 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); 598 head = per_cpu_ptr(sys_data->exit_event->perf_events, smp_processor_id());
599 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
599} 600}
600 601
601int perf_sysexit_enable(struct ftrace_event_call *call) 602int perf_sysexit_enable(struct ftrace_event_call *call)