Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 8
-rw-r--r--  kernel/acct.c | 27
-rw-r--r--  kernel/async.c | 1
-rw-r--r--  kernel/audit.c | 3
-rw-r--r--  kernel/audit_tree.c | 101
-rw-r--r--  kernel/audit_watch.c | 1
-rw-r--r--  kernel/auditfilter.c | 1
-rw-r--r--  kernel/auditsc.c | 10
-rw-r--r--  kernel/capability.c | 5
-rw-r--r--  kernel/cgroup.c | 756
-rw-r--r--  kernel/cgroup_freezer.c | 15
-rw-r--r--  kernel/compat.c | 1
-rw-r--r--  kernel/cpu.c | 29
-rw-r--r--  kernel/cpuset.c | 173
-rw-r--r--  kernel/cred-internals.h | 21
-rw-r--r--  kernel/cred.c | 14
-rw-r--r--  kernel/early_res.c | 584
-rw-r--r--  kernel/elfcore.c | 28
-rw-r--r--  kernel/exit.c | 9
-rw-r--r--  kernel/fork.c | 73
-rw-r--r--  kernel/hw_breakpoint.c | 207
-rw-r--r--  kernel/irq/chip.c | 89
-rw-r--r--  kernel/irq/devres.c | 4
-rw-r--r--  kernel/irq/handle.c | 58
-rw-r--r--  kernel/irq/internals.h | 6
-rw-r--r--  kernel/irq/manage.c | 32
-rw-r--r--  kernel/irq/numa_migrate.c | 5
-rw-r--r--  kernel/irq/proc.c | 1
-rw-r--r--  kernel/kallsyms.c | 1
-rw-r--r--  kernel/kexec.c | 8
-rw-r--r--  kernel/kgdb.c | 205
-rw-r--r--  kernel/kprobes.c | 770
-rw-r--r--  kernel/ksysfs.c | 2
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/latencytop.c | 1
-rw-r--r--  kernel/lockdep.c | 118
-rw-r--r--  kernel/lockdep_internals.h | 72
-rw-r--r--  kernel/lockdep_proc.c | 58
-rw-r--r--  kernel/module.c | 181
-rw-r--r--  kernel/nsproxy.c | 14
-rw-r--r--  kernel/padata.c | 9
-rw-r--r--  kernel/panic.c | 46
-rw-r--r--  kernel/params.c | 12
-rw-r--r--  kernel/perf_event.c | 529
-rw-r--r--  kernel/pid.c | 6
-rw-r--r--  kernel/pid_namespace.c | 8
-rw-r--r--  kernel/posix-cpu-timers.c | 46
-rw-r--r--  kernel/power/hibernate.c | 10
-rw-r--r--  kernel/power/hibernate_nvs.c | 1
-rw-r--r--  kernel/power/process.c | 5
-rw-r--r--  kernel/power/snapshot.c | 1
-rw-r--r--  kernel/power/suspend.c | 4
-rw-r--r--  kernel/power/swap.c | 1
-rw-r--r--  kernel/power/user.c | 2
-rw-r--r--  kernel/printk.c | 55
-rw-r--r--  kernel/profile.c | 4
-rw-r--r--  kernel/ptrace.c | 12
-rw-r--r--  kernel/range.c | 163
-rw-r--r--  kernel/rcupdate.c | 50
-rw-r--r--  kernel/rcutiny.c | 35
-rw-r--r--  kernel/rcutiny_plugin.h | 39
-rw-r--r--  kernel/rcutorture.c | 12
-rw-r--r--  kernel/rcutree.c | 131
-rw-r--r--  kernel/rcutree.h | 23
-rw-r--r--  kernel/rcutree_plugin.h | 77
-rw-r--r--  kernel/rcutree_trace.c | 4
-rw-r--r--  kernel/relay.c | 5
-rw-r--r--  kernel/res_counter.c | 1
-rw-r--r--  kernel/resource.c | 53
-rw-r--r--  kernel/sched.c | 818
-rw-r--r--  kernel/sched_cpupri.c | 5
-rw-r--r--  kernel/sched_debug.c | 114
-rw-r--r--  kernel/sched_fair.c | 352
-rw-r--r--  kernel/sched_features.h | 55
-rw-r--r--  kernel/sched_idletask.c | 8
-rw-r--r--  kernel/sched_rt.c | 27
-rw-r--r--  kernel/signal.c | 45
-rw-r--r--  kernel/slow-work.c | 2
-rw-r--r--  kernel/slow-work.h | 8
-rw-r--r--  kernel/smp.c | 1
-rw-r--r--  kernel/softirq.c | 2
-rw-r--r--  kernel/softlockup.c | 4
-rw-r--r--  kernel/srcu.c | 1
-rw-r--r--  kernel/stop_machine.c | 537
-rw-r--r--  kernel/sys.c | 71
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 51
-rw-r--r--  kernel/sysctl_binary.c | 8
-rw-r--r--  kernel/taskstats.c | 7
-rw-r--r--  kernel/time.c | 1
-rw-r--r--  kernel/time/clocksource.c | 4
-rw-r--r--  kernel/time/tick-oneshot.c | 52
-rw-r--r--  kernel/time/tick-sched.c | 84
-rw-r--r--  kernel/time/timecompare.c | 1
-rw-r--r--  kernel/time/timekeeping.c | 3
-rw-r--r--  kernel/time/timer_list.c | 4
-rw-r--r--  kernel/timer.c | 2
-rw-r--r--  kernel/trace/Kconfig | 11
-rw-r--r--  kernel/trace/Makefile | 3
-rw-r--r--  kernel/trace/blktrace.c | 1
-rw-r--r--  kernel/trace/ftrace.c | 64
-rw-r--r--  kernel/trace/power-traces.c | 1
-rw-r--r--  kernel/trace/ring_buffer.c | 219
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 6
-rw-r--r--  kernel/trace/trace.c | 184
-rw-r--r--  kernel/trace/trace.h | 52
-rw-r--r--  kernel/trace/trace_clock.c | 5
-rw-r--r--  kernel/trace/trace_entries.h | 12
-rw-r--r--  kernel/trace/trace_event_perf.c (renamed from kernel/trace/trace_event_profile.c) | 63
-rw-r--r--  kernel/trace/trace_events.c | 3
-rw-r--r--  kernel/trace/trace_events_filter.c | 3
-rw-r--r--  kernel/trace/trace_functions_graph.c | 201
-rw-r--r--  kernel/trace/trace_hw_branches.c | 312
-rw-r--r--  kernel/trace/trace_irqsoff.c | 271
-rw-r--r--  kernel/trace/trace_kprobe.c | 564
-rw-r--r--  kernel/trace/trace_ksym.c | 27
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 1
-rw-r--r--  kernel/trace/trace_output.c | 2
-rw-r--r--  kernel/trace/trace_sched_switch.c | 5
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 5
-rw-r--r--  kernel/trace/trace_selftest.c | 65
-rw-r--r--  kernel/trace/trace_stat.c | 1
-rw-r--r--  kernel/trace/trace_syscalls.c | 73
-rw-r--r--  kernel/trace/trace_workqueue.c | 1
-rw-r--r--  kernel/tsacct.c | 1
-rw-r--r--  kernel/user.c | 11
-rw-r--r--  kernel/workqueue.c | 2
127 files changed, 6206 insertions, 3269 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6aebdeb2aa34..149e18ef1ab1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
-	    async.o
+	    async.o range.o
+obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
 obj-y += groups.o
 
 ifdef CONFIG_FUNCTION_TRACER
@@ -67,7 +68,7 @@ obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
-obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_SMP) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
@@ -90,6 +91,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
diff --git a/kernel/acct.c b/kernel/acct.c
index a6605ca921b6..e4c0e1fee9b0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -353,17 +353,18 @@ restart:
 
 void acct_exit_ns(struct pid_namespace *ns)
 {
-	struct bsd_acct_struct *acct;
+	struct bsd_acct_struct *acct = ns->bacct;
 
-	spin_lock(&acct_lock);
-	acct = ns->bacct;
-	if (acct != NULL) {
-		if (acct->file != NULL)
-			acct_file_reopen(acct, NULL, NULL);
+	if (acct == NULL)
+		return;
 
-		kfree(acct);
-	}
+	del_timer_sync(&acct->timer);
+	spin_lock(&acct_lock);
+	if (acct->file != NULL)
+		acct_file_reopen(acct, NULL, NULL);
 	spin_unlock(&acct_lock);
+
+	kfree(acct);
 }
 
 /*
@@ -588,16 +589,6 @@ out:
 }
 
 /**
- * acct_init_pacct - initialize a new pacct_struct
- * @pacct: per-process accounting info struct to initialize
- */
-void acct_init_pacct(struct pacct_struct *pacct)
-{
-	memset(pacct, 0, sizeof(struct pacct_struct));
-	pacct->ac_utime = pacct->ac_stime = cputime_zero;
-}
-
-/**
  * acct_collect - collect accounting information into pacct_struct
  * @exitcode: task exit code
  * @group_dead: not 0, if this thread is the last one in the process.
diff --git a/kernel/async.c b/kernel/async.c
index 27235f5de198..15319d6c18fe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -56,6 +56,7 @@ asynchronous and synchronous parts of the kernel.
 #include <linux/init.h>
 #include <linux/kthread.h>
 #include <linux/delay.h>
+#include <linux/slab.h>
 #include <asm/atomic.h>
 
 static async_cookie_t next_cookie = 1;
diff --git a/kernel/audit.c b/kernel/audit.c
index 5feed232be9d..c71bd26631a2 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -46,6 +46,7 @@
 #include <asm/atomic.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/kthread.h>
 
@@ -398,7 +399,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
 	skb_get(skb);
 	err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
 	if (err < 0) {
-		BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
+		BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
 		printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
 		audit_log_lost("auditd dissapeared\n");
 		audit_pid = 0;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 4b05bd9479db..46a57b57a335 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -3,6 +3,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/kthread.h>
+#include <linux/slab.h>
 
 struct audit_tree;
 struct audit_chunk;
@@ -548,6 +549,11 @@ int audit_remove_tree_rule(struct audit_krule *rule)
 	return 0;
 }
 
+static int compare_root(struct vfsmount *mnt, void *arg)
+{
+	return mnt->mnt_root->d_inode == arg;
+}
+
 void audit_trim_trees(void)
 {
 	struct list_head cursor;
@@ -559,7 +565,6 @@ void audit_trim_trees(void)
 		struct path path;
 		struct vfsmount *root_mnt;
 		struct node *node;
-		struct list_head list;
 		int err;
 
 		tree = container_of(cursor.next, struct audit_tree, list);
@@ -577,24 +582,16 @@ void audit_trim_trees(void)
 		if (!root_mnt)
 			goto skip_it;
 
-		list_add_tail(&list, &root_mnt->mnt_list);
 		spin_lock(&hash_lock);
 		list_for_each_entry(node, &tree->chunks, list) {
-			struct audit_chunk *chunk = find_chunk(node);
-			struct inode *inode = chunk->watch.inode;
-			struct vfsmount *mnt;
+			struct inode *inode = find_chunk(node)->watch.inode;
 			node->index |= 1U<<31;
-			list_for_each_entry(mnt, &list, mnt_list) {
-				if (mnt->mnt_root->d_inode == inode) {
-					node->index &= ~(1U<<31);
-					break;
-				}
-			}
+			if (iterate_mounts(compare_root, inode, root_mnt))
+				node->index &= ~(1U<<31);
 		}
 		spin_unlock(&hash_lock);
 		trim_marked(tree);
 		put_tree(tree);
-		list_del_init(&list);
 		drop_collected_mounts(root_mnt);
 skip_it:
 		mutex_lock(&audit_filter_mutex);
@@ -603,22 +600,6 @@ skip_it:
 	mutex_unlock(&audit_filter_mutex);
 }
 
-static int is_under(struct vfsmount *mnt, struct dentry *dentry,
-		    struct path *path)
-{
-	if (mnt != path->mnt) {
-		for (;;) {
-			if (mnt->mnt_parent == mnt)
-				return 0;
-			if (mnt->mnt_parent == path->mnt)
-				break;
-			mnt = mnt->mnt_parent;
-		}
-		dentry = mnt->mnt_mountpoint;
-	}
-	return is_subdir(dentry, path->dentry);
-}
-
 int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
 {
 
@@ -638,13 +619,17 @@ void audit_put_tree(struct audit_tree *tree)
 	put_tree(tree);
 }
 
+static int tag_mount(struct vfsmount *mnt, void *arg)
+{
+	return tag_chunk(mnt->mnt_root->d_inode, arg);
+}
+
 /* called with audit_filter_mutex */
 int audit_add_tree_rule(struct audit_krule *rule)
 {
 	struct audit_tree *seed = rule->tree, *tree;
 	struct path path;
-	struct vfsmount *mnt, *p;
-	struct list_head list;
+	struct vfsmount *mnt;
 	int err;
 
 	list_for_each_entry(tree, &tree_list, list) {
@@ -670,16 +655,9 @@ int audit_add_tree_rule(struct audit_krule *rule)
 		err = -ENOMEM;
 		goto Err;
 	}
-	list_add_tail(&list, &mnt->mnt_list);
 
 	get_tree(tree);
-	list_for_each_entry(p, &list, mnt_list) {
-		err = tag_chunk(p->mnt_root->d_inode, tree);
-		if (err)
-			break;
-	}
-
-	list_del(&list);
+	err = iterate_mounts(tag_mount, tree, mnt);
 	drop_collected_mounts(mnt);
 
 	if (!err) {
@@ -714,31 +692,23 @@ int audit_tag_tree(char *old, char *new)
 {
 	struct list_head cursor, barrier;
 	int failed = 0;
-	struct path path;
+	struct path path1, path2;
 	struct vfsmount *tagged;
-	struct list_head list;
-	struct vfsmount *mnt;
-	struct dentry *dentry;
 	int err;
 
-	err = kern_path(new, 0, &path);
+	err = kern_path(new, 0, &path2);
 	if (err)
 		return err;
-	tagged = collect_mounts(&path);
-	path_put(&path);
+	tagged = collect_mounts(&path2);
+	path_put(&path2);
 	if (!tagged)
 		return -ENOMEM;
 
-	err = kern_path(old, 0, &path);
+	err = kern_path(old, 0, &path1);
 	if (err) {
 		drop_collected_mounts(tagged);
 		return err;
 	}
-	mnt = mntget(path.mnt);
-	dentry = dget(path.dentry);
-	path_put(&path);
-
-	list_add_tail(&list, &tagged->mnt_list);
 
 	mutex_lock(&audit_filter_mutex);
 	list_add(&barrier, &tree_list);
@@ -746,7 +716,7 @@ int audit_tag_tree(char *old, char *new)
 
 	while (cursor.next != &tree_list) {
 		struct audit_tree *tree;
-		struct vfsmount *p;
+		int good_one = 0;
 
 		tree = container_of(cursor.next, struct audit_tree, list);
 		get_tree(tree);
@@ -754,30 +724,19 @@ int audit_tag_tree(char *old, char *new)
 		list_add(&cursor, &tree->list);
 		mutex_unlock(&audit_filter_mutex);
 
-		err = kern_path(tree->pathname, 0, &path);
-		if (err) {
-			put_tree(tree);
-			mutex_lock(&audit_filter_mutex);
-			continue;
+		err = kern_path(tree->pathname, 0, &path2);
+		if (!err) {
+			good_one = path_is_under(&path1, &path2);
+			path_put(&path2);
 		}
 
-		spin_lock(&vfsmount_lock);
-		if (!is_under(mnt, dentry, &path)) {
-			spin_unlock(&vfsmount_lock);
-			path_put(&path);
+		if (!good_one) {
 			put_tree(tree);
 			mutex_lock(&audit_filter_mutex);
 			continue;
 		}
-		spin_unlock(&vfsmount_lock);
-		path_put(&path);
-
-		list_for_each_entry(p, &list, mnt_list) {
-			failed = tag_chunk(p->mnt_root->d_inode, tree);
-			if (failed)
-				break;
-		}
 
+		failed = iterate_mounts(tag_mount, tree, tagged);
 		if (failed) {
 			put_tree(tree);
 			mutex_lock(&audit_filter_mutex);
@@ -818,10 +777,8 @@ int audit_tag_tree(char *old, char *new)
 	}
 	list_del(&barrier);
 	list_del(&cursor);
-	list_del(&list);
 	mutex_unlock(&audit_filter_mutex);
-	dput(dentry);
-	mntput(mnt);
+	path_put(&path1);
 	drop_collected_mounts(tagged);
 	return failed;
 }
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index cc7e87936cbc..8df43696f4ba 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -27,6 +27,7 @@
 #include <linux/namei.h>
 #include <linux/netlink.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/inotify.h>
 #include <linux/security.h>
 #include "audit.h"
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a70604047f3c..ce08041f578d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,6 +27,7 @@
 #include <linux/namei.h>
 #include <linux/netlink.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/security.h>
 #include "audit.h"
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fc0f928167e7..3828ad5fb8f1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -49,6 +49,7 @@
 #include <linux/namei.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/socket.h>
 #include <linux/mqueue.h>
@@ -1893,7 +1894,7 @@ static int audit_inc_name_count(struct audit_context *context,
 {
 	if (context->name_count >= AUDIT_NAMES) {
 		if (inode)
-			printk(KERN_DEBUG "name_count maxed, losing inode data: "
+			printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
 			       "dev=%02x:%02x, inode=%lu\n",
 			       MAJOR(inode->i_sb->s_dev),
 			       MINOR(inode->i_sb->s_dev),
@@ -1988,7 +1989,6 @@ void __audit_inode(const char *name, const struct dentry *dentry)
 
 /**
  * audit_inode_child - collect inode info for created/removed objects
- * @dname: inode's dentry name
  * @dentry: dentry being audited
  * @parent: inode of dentry parent
  *
@@ -2000,13 +2000,14 @@ void __audit_inode(const char *name, const struct dentry *dentry)
  * must be hooked prior, in order to capture the target inode during
  * unsuccessful attempts.
  */
-void __audit_inode_child(const char *dname, const struct dentry *dentry,
+void __audit_inode_child(const struct dentry *dentry,
 			 const struct inode *parent)
 {
 	int idx;
 	struct audit_context *context = current->audit_context;
 	const char *found_parent = NULL, *found_child = NULL;
 	const struct inode *inode = dentry->d_inode;
+	const char *dname = dentry->d_name.name;
 	int dirlen = 0;
 
 	if (!context->in_syscall)
@@ -2014,9 +2015,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
 
 	if (inode)
 		handle_one(inode);
-	/* determine matching parent */
-	if (!dname)
-		goto add_names;
 
 	/* parent is more likely, look for it first */
 	for (idx = 0; idx < context->name_count; idx++) {
diff --git a/kernel/capability.c b/kernel/capability.c
index 7f876e60521f..2f05303715a5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -15,7 +15,6 @@
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
 #include <asm/uaccess.h>
-#include "cred-internals.h"
 
 /*
  * Leveraged for setting/resetting capabilities
@@ -135,7 +134,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
 	if (pid && (pid != task_pid_vnr(current))) {
 		struct task_struct *target;
 
-		read_lock(&tasklist_lock);
+		rcu_read_lock();
 
 		target = find_task_by_vpid(pid);
 		if (!target)
@@ -143,7 +142,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
 		else
 			ret = security_capget(target, pEp, pIp, pPp);
 
-		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 	} else
 		ret = security_capget(current, pEp, pIp, pPp);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4fd90e129772..e9ec642932ee 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
  *  Based originally on the cpuset system, extracted by Paul Menage
  *  Copyright (C) 2006 Google, Inc
  *
+ *  Notifications support
+ *  Copyright (C) 2009 Nokia Corporation
+ *  Author: Kirill A. Shutemov
+ *
  *  Copyright notices from the original cpuset code:
  *  --------------------------------------------------
  *  Copyright (C) 2003 BULL SA.
@@ -23,7 +27,6 @@
  */
 
 #include <linux/cgroup.h>
-#include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
@@ -44,6 +47,7 @@
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
+#include <linux/module.h>
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
 #include <linux/hash.h>
@@ -52,15 +56,21 @@
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
+#include <linux/eventfd.h>
+#include <linux/poll.h>
 
 #include <asm/atomic.h>
 
 static DEFINE_MUTEX(cgroup_mutex);
 
-/* Generate an array of cgroup subsystem pointers */
+/*
+ * Generate an array of cgroup subsystem pointers. At boot time, this is
+ * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
+ * registered after that. The mutable section of this array is protected by
+ * cgroup_mutex.
+ */
 #define SUBSYS(_x) &_x ## _subsys,
-
-static struct cgroup_subsys *subsys[] = {
+static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
 #include <linux/cgroup_subsys.h>
 };
 
@@ -147,6 +157,35 @@ struct css_id {
 	unsigned short stack[0]; /* Array of Length (depth+1) */
 };
 
+/*
+ * cgroup_event represents events which userspace want to recieve.
+ */
+struct cgroup_event {
+	/*
+	 * Cgroup which the event belongs to.
+	 */
+	struct cgroup *cgrp;
+	/*
+	 * Control file which the event associated.
+	 */
+	struct cftype *cft;
+	/*
+	 * eventfd to signal userspace about the event.
+	 */
+	struct eventfd_ctx *eventfd;
+	/*
+	 * Each of these stored in a list by the cgroup.
+	 */
+	struct list_head list;
+	/*
+	 * All fields below needed to unregister event when
+	 * userspace closes eventfd.
+	 */
+	poll_table pt;
+	wait_queue_head_t *wqh;
+	wait_queue_t wait;
+	struct work_struct remove;
+};
 
 /* The list of hierarchy roots */
 
@@ -250,7 +289,8 @@ struct cg_cgroup_link {
 static struct css_set init_css_set;
 static struct cg_cgroup_link init_css_set_link;
 
-static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
+static int cgroup_init_idr(struct cgroup_subsys *ss,
+			   struct cgroup_subsys_state *css);
 
 /* css_set_lock protects the list of css_set objects, and the
  * chain of tasks off each css_set. Nests outside task->alloc_lock
@@ -448,8 +488,11 @@ static struct css_set *find_existing_css_set(
 	struct hlist_node *node;
 	struct css_set *cg;
 
-	/* Built the set of subsystem state objects that we want to
-	 * see in the new css_set */
+	/*
+	 * Build the set of subsystem state objects that we want to see in the
+	 * new css_set. while subsystems can change globally, the entries here
+	 * won't change, so no need for locking.
+	 */
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		if (root->subsys_bits & (1UL << i)) {
 			/* Subsystem is in this hierarchy. So we want
@@ -696,6 +739,7 @@ void cgroup_lock(void)
 {
 	mutex_lock(&cgroup_mutex);
 }
+EXPORT_SYMBOL_GPL(cgroup_lock);
 
 /**
  * cgroup_unlock - release lock on cgroup changes
@@ -706,6 +750,7 @@ void cgroup_unlock(void)
 {
 	mutex_unlock(&cgroup_mutex);
 }
+EXPORT_SYMBOL_GPL(cgroup_unlock);
 
 /*
  * A couple of forward declarations required, due to cyclic reference loop:
@@ -757,6 +802,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 		if (ret)
 			break;
 	}
+
 	return ret;
 }
 
@@ -884,7 +930,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
 	css_put(css);
 }
 
-
+/*
+ * Call with cgroup_mutex held. Drops reference counts on modules, including
+ * any duplicate ones that parse_cgroupfs_options took. If this function
+ * returns an error, no reference counts are touched.
+ */
 static int rebind_subsystems(struct cgroupfs_root *root,
 			     unsigned long final_bits)
 {
@@ -892,6 +942,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 	struct cgroup *cgrp = &root->top_cgroup;
 	int i;
 
+	BUG_ON(!mutex_is_locked(&cgroup_mutex));
+
 	removed_bits = root->actual_subsys_bits & ~final_bits;
 	added_bits = final_bits & ~root->actual_subsys_bits;
 	/* Check that any added subsystems are currently free */
@@ -900,6 +952,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 		struct cgroup_subsys *ss = subsys[i];
 		if (!(bit & added_bits))
 			continue;
+		/*
+		 * Nobody should tell us to do a subsys that doesn't exist:
+		 * parse_cgroupfs_options should catch that case and refcounts
+		 * ensure that subsystems won't disappear once selected.
+		 */
+		BUG_ON(ss == NULL);
 		if (ss->root != &rootnode) {
 			/* Subsystem isn't free */
 			return -EBUSY;
@@ -919,6 +977,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 		unsigned long bit = 1UL << i;
 		if (bit & added_bits) {
 			/* We're binding this subsystem to this hierarchy */
+			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i]);
 			BUG_ON(!dummytop->subsys[i]);
 			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -930,8 +989,10 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			if (ss->bind)
 				ss->bind(ss, cgrp);
 			mutex_unlock(&ss->hierarchy_mutex);
+			/* refcount was already taken, and we're keeping it */
 		} else if (bit & removed_bits) {
 			/* We're removing this subsystem */
+			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
 			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
 			mutex_lock(&ss->hierarchy_mutex);
@@ -942,9 +1003,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			subsys[i]->root = &rootnode;
 			list_move(&ss->sibling, &rootnode.subsys_list);
 			mutex_unlock(&ss->hierarchy_mutex);
+			/* subsystem is now free - drop reference on module */
+			module_put(ss->module);
 		} else if (bit & final_bits) {
 			/* Subsystem state should already exist */
+			BUG_ON(ss == NULL);
 			BUG_ON(!cgrp->subsys[i]);
+			/*
+			 * a refcount was taken, but we already had one, so
+			 * drop the extra reference.
+			 */
+			module_put(ss->module);
+#ifdef CONFIG_MODULE_UNLOAD
+			BUG_ON(ss->module && !module_refcount(ss->module));
+#endif
 		} else {
 			/* Subsystem state shouldn't exist */
 			BUG_ON(cgrp->subsys[i]);
@@ -986,13 +1058,20 @@ struct cgroup_sb_opts {
 
 };
 
-/* Convert a hierarchy specifier into a bitmask of subsystems and
- * flags. */
-static int parse_cgroupfs_options(char *data,
-				     struct cgroup_sb_opts *opts)
+/*
+ * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
+ * with cgroup_mutex held to protect the subsys[] array. This function takes
+ * refcounts on subsystems to be used, unless it returns error, in which case
+ * no refcounts are taken.
+ */
+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
 	char *token, *o = data ?: "all";
 	unsigned long mask = (unsigned long)-1;
+	int i;
+	bool module_pin_failed = false;
+
+	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 
 #ifdef CONFIG_CPUSETS
 	mask = ~(1UL << cpuset_subsys_id);
@@ -1005,10 +1084,11 @@ static int parse_cgroupfs_options(char *data,
 			return -EINVAL;
 		if (!strcmp(token, "all")) {
 			/* Add all non-disabled subsystems */
-			int i;
 			opts->subsys_bits = 0;
 			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 				struct cgroup_subsys *ss = subsys[i];
+				if (ss == NULL)
+					continue;
 				if (!ss->disabled)
 					opts->subsys_bits |= 1ul << i;
 			}
@@ -1026,7 +1106,6 @@ static int parse_cgroupfs_options(char *data,
 			if (!opts->release_agent)
 				return -ENOMEM;
 		} else if (!strncmp(token, "name=", 5)) {
-			int i;
 			const char *name = token + 5;
 			/* Can't specify an empty name */
 			if (!strlen(name))
@@ -1050,9 +1129,10 @@ static int parse_cgroupfs_options(char *data,
 				return -ENOMEM;
 		} else {
 			struct cgroup_subsys *ss;
-			int i;
 			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 				ss = subsys[i];
+				if (ss == NULL)
+					continue;
 				if (!strcmp(token, ss->name)) {
 					if (!ss->disabled)
 						set_bit(i, &opts->subsys_bits);
@@ -1087,9 +1167,54 @@ static int parse_cgroupfs_options(char *data,
 	if (!opts->subsys_bits && !opts->name)
 		return -EINVAL;
 
+	/*
+	 * Grab references on all the modules we'll need, so the subsystems
+	 * don't dance around before rebind_subsystems attaches them. This may
+	 * take duplicate reference counts on a subsystem that's already used,
+	 * but rebind_subsystems handles this case.
+	 */
+	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+		unsigned long bit = 1UL << i;
+
+		if (!(bit & opts->subsys_bits))
+			continue;
+		if (!try_module_get(subsys[i]->module)) {
+			module_pin_failed = true;
+			break;
+		}
+	}
+	if (module_pin_failed) {
+		/*
+		 * oops, one of the modules was going away. this means that we
+		 * raced with a module_delete call, and to the user this is
+		 * essentially a "subsystem doesn't exist" case.
+		 */
+		for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
+			/* drop refcounts only on the ones we took */
+			unsigned long bit = 1UL << i;
+
+			if (!(bit & opts->subsys_bits))
+				continue;
+			module_put(subsys[i]->module);
+		}
+		return -ENOENT;
+	}
+
 	return 0;
 }
 
+static void drop_parsed_module_refcounts(unsigned long subsys_bits)
+{
+	int i;
+	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+		unsigned long bit = 1UL << i;
+
+		if (!(bit & subsys_bits))
+			continue;
+		module_put(subsys[i]->module);
+	}
+}
+
 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 {
 	int ret = 0;
@@ -1106,21 +1231,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		goto out_unlock;
 
-	/* Don't allow flags to change at remount */
-	if (opts.flags != root->flags) {
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-
-	/* Don't allow name to change at remount */
-	if (opts.name && strcmp(opts.name, root->name)) {
+	/* Don't allow flags or name to change at remount */
+	if (opts.flags != root->flags ||
+	    (opts.name && strcmp(opts.name, root->name))) {
 		ret = -EINVAL;
+		drop_parsed_module_refcounts(opts.subsys_bits);
 		goto out_unlock;
 	}
 
 	ret = rebind_subsystems(root, opts.subsys_bits);
-	if (ret)
+	if (ret) {
+		drop_parsed_module_refcounts(opts.subsys_bits);
 		goto out_unlock;
+	}
 
 	/* (re)populate subsystem files */
 	cgroup_populate_dir(cgrp);
@@ -1151,6 +1274,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
+	INIT_LIST_HEAD(&cgrp->event_list);
+	spin_lock_init(&cgrp->event_list_lock);
 }
 
 static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1306,7 +1431,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	struct cgroupfs_root *new_root;
 
 	/* First find the desired set of subsystems */
+	mutex_lock(&cgroup_mutex);
 	ret = parse_cgroupfs_options(data, &opts);
+	mutex_unlock(&cgroup_mutex);
 	if (ret)
 		goto out_err;
 
@@ -1317,7 +1444,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	new_root = cgroup_root_from_opts(&opts);
 	if (IS_ERR(new_root)) {
 		ret = PTR_ERR(new_root);
-		goto out_err;
+		goto drop_modules;
 	}
 	opts.new_root = new_root;
 
@@ -1326,7 +1453,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	if (IS_ERR(sb)) {
 		ret = PTR_ERR(sb);
 		cgroup_drop_root(opts.new_root);
-		goto out_err;
+		goto drop_modules;
 	}
 
 	root = sb->s_fs_info;
@@ -1382,6 +1509,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 			free_cg_links(&tmp_cg_links);
 			goto drop_new_super;
 		}
+		/*
+		 * There must be no failure case after here, since rebinding
+		 * takes care of subsystems' refcounts, which are explicitly
+		 * dropped in the failure exit path.
+		 */
 
 		/* EBUSY should be the only error here */
 		BUG_ON(ret);
@@ -1420,6 +1552,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 		 * any) is not needed
 		 */
 		cgroup_drop_root(opts.new_root);
+		/* no subsys rebinding, so refcounts don't change */
+		drop_parsed_module_refcounts(opts.subsys_bits);
 	}
 
 	simple_set_mnt(mnt, sb);
@@ -1429,6 +1563,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 
  drop_new_super:
 	deactivate_locked_super(sb);
+ drop_modules:
+	drop_parsed_module_refcounts(opts.subsys_bits);
  out_err:
 	kfree(opts.release_agent);
 	kfree(opts.name);
@@ -1510,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 {
 	char *start;
-	struct dentry *dentry = rcu_dereference(cgrp->dentry);
+	struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
+						      rcu_read_lock_held() ||
+						      cgroup_lock_is_held());
 
 	if (!dentry || cgrp == dummytop) {
 		/*
@@ -1526,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 	*--start = '\0';
 	for (;;) {
 		int len = dentry->d_name.len;
+
 		if ((start -= len) < buf)
 			return -ENAMETOOLONG;
-		memcpy(start, cgrp->dentry->d_name.name, len);
+		memcpy(start, dentry->d_name.name, len);
 		cgrp = cgrp->parent;
 		if (!cgrp)
 			break;
-		dentry = rcu_dereference(cgrp->dentry);
+
+		dentry = rcu_dereference_check(cgrp->dentry,
+					       rcu_read_lock_held() ||
+					       cgroup_lock_is_held());
 		if (!cgrp->parent)
 			continue;
 		if (--start < buf)
@@ -1542,6 +1684,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 	memmove(buf, start, buf + buflen - start);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(cgroup_path);
 
 /**
  * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
@@ -1554,7 +1697,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
 	int retval = 0;
-	struct cgroup_subsys *ss;
+	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroup *oldcgrp;
 	struct css_set *cg;
 	struct css_set *newcg;
@@ -1568,8 +1711,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
 			retval = ss->can_attach(ss, cgrp, tsk, false);
-			if (retval)
-				return retval;
+			if (retval) {
+				/*
+				 * Remember on which subsystem the can_attach()
+				 * failed, so that we only call cancel_attach()
+				 * against the subsystems whose can_attach()
+				 * succeeded. (See below)
+				 */
+				failed_ss = ss;
+				goto out;
+			}
 		}
 	}
 
@@ -1583,14 +1734,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	 */
 	newcg = find_css_set(cg, cgrp);
 	put_css_set(cg);
-	if (!newcg)
-		return -ENOMEM;
+	if (!newcg) {
+		retval = -ENOMEM;
+		goto out;
+	}
 
 	task_lock(tsk);
 	if (tsk->flags & PF_EXITING) {
 		task_unlock(tsk);
 		put_css_set(newcg);
-		return -ESRCH;
+		retval = -ESRCH;
+		goto out;
 	}
 	rcu_assign_pointer(tsk->cgroups, newcg);
 	task_unlock(tsk);
@@ -1616,7 +1770,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	 * is no longer empty.
 	 */
 	cgroup_wakeup_rmdir_waiter(cgrp);
-	return 0;
+out:
+	if (retval) {
+		for_each_subsys(root, ss) {
+			if (ss == failed_ss)
+				/*
+				 * This subsystem was the one that failed the
+				 * can_attach() check earlier, so we don't need
+				 * to call cancel_attach() against it or any
+				 * remaining subsystems.
+				 */
+				break;
+			if (ss->cancel_attach)
+				ss->cancel_attach(ss, cgrp, tsk, false);
+		}
+	}
+	return retval;
 }
 
 /*
@@ -1682,6 +1851,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp)
 	}
 	return true;
 }
+EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
 
 static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 				      const char *buffer)
@@ -1950,6 +2120,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
 	.rename = cgroup_rename,
 };
 
+/*
+ * Check if a file is a control file
+ */
+static inline struct cftype *__file_cft(struct file *file)
+{
+	if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
+		return ERR_PTR(-EINVAL);
+	return __d_cft(file->f_dentry);
+}
+
 static int cgroup_create_file(struct dentry *dentry, mode_t mode,
 			      struct super_block *sb)
 {
@@ -2069,6 +2249,7 @@ int cgroup_add_file(struct cgroup *cgrp,
 	error = PTR_ERR(dentry);
 	return error;
 }
+EXPORT_SYMBOL_GPL(cgroup_add_file);
 
 int cgroup_add_files(struct cgroup *cgrp,
 		     struct cgroup_subsys *subsys,
@@ -2083,6 +2264,7 @@ int cgroup_add_files(struct cgroup *cgrp,
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(cgroup_add_files);
 
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2468,7 +2650,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
 {
 	struct cgroup_pidlist *l;
 	/* don't need task_nsproxy() if we're looking at ourself */
-	struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
+	struct pid_namespace *ns = current->nsproxy->pid_ns;
+
 	/*
 	 * We can't drop the pidlist_mutex before taking the l->mutex in case
 	 * the last ref-holder is trying to remove l from the list at the same
@@ -2478,8 +2661,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
 	mutex_lock(&cgrp->pidlist_mutex);
 	list_for_each_entry(l, &cgrp->pidlists, links) {
 		if (l->key.type == type && l->key.ns == ns) {
-			/* found a matching list - drop the extra refcount */
-			put_pid_ns(ns);
 			/* make sure l doesn't vanish out from under us */
 			down_write(&l->mutex);
 			mutex_unlock(&cgrp->pidlist_mutex);
@@ -2490,13 +2671,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
 	l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
 	if (!l) {
 		mutex_unlock(&cgrp->pidlist_mutex);
-		put_pid_ns(ns);
 		return l;
 	}
 	init_rwsem(&l->mutex);
 	down_write(&l->mutex);
 	l->key.type = type;
-	l->key.ns = ns;
+	l->key.ns = get_pid_ns(ns);
 	l->use_count = 0; /* don't increment here */
 	l->list = NULL;
 	l->owner = cgrp;
@@ -2804,6 +2984,174 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
 }
 
 /*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void cgroup_event_remove(struct work_struct *work)
+{
+	struct cgroup_event *event = container_of(work, struct cgroup_event,
+			remove);
+	struct cgroup *cgrp = event->cgrp;
+
+	/* TODO: check return code */
+	event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+
+	eventfd_ctx_put(event->eventfd);
+	kfree(event);
+	dput(cgrp->dentry);
+}
+
+/*
+ * Gets called on POLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
+		int sync, void *key)
+{
+	struct cgroup_event *event = container_of(wait,
+			struct cgroup_event, wait);
+	struct cgroup *cgrp = event->cgrp;
+	unsigned long flags = (unsigned long)key;
+
+	if (flags & POLLHUP) {
+		__remove_wait_queue(event->wqh, &event->wait);
+		spin_lock(&cgrp->event_list_lock);
+		list_del(&event->list);
+		spin_unlock(&cgrp->event_list_lock);
+		/*
+		 * We are in atomic context, but cgroup_event_remove() may
+		 * sleep, so we have to call it in workqueue.
+		 */
+		schedule_work(&event->remove);
+	}
+
+	return 0;
+}
+
+static void cgroup_event_ptable_queue_proc(struct file *file,
+		wait_queue_head_t *wqh, poll_table *pt)
+{
+	struct cgroup_event *event = container_of(pt,
+			struct cgroup_event, pt);
+
+	event->wqh = wqh;
+	add_wait_queue(wqh, &event->wait);
+}
+
+/*
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
+				      const char *buffer)
+{
+	struct cgroup_event *event = NULL;
+	unsigned int efd, cfd;
+	struct file *efile = NULL;
+	struct file *cfile = NULL;
+	char *endp;
+	int ret;
+
+	efd = simple_strtoul(buffer, &endp, 10);
+	if (*endp != ' ')
+		return -EINVAL;
+	buffer = endp + 1;
+
+	cfd = simple_strtoul(buffer, &endp, 10);
+	if ((*endp != ' ') && (*endp != '\0'))
+		return -EINVAL;
+	buffer = endp + 1;
+
+	event = kzalloc(sizeof(*event), GFP_KERNEL);
+	if (!event)
+		return -ENOMEM;
+	event->cgrp = cgrp;
+	INIT_LIST_HEAD(&event->list);
+	init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
+	init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
+	INIT_WORK(&event->remove, cgroup_event_remove);
+
+	efile = eventfd_fget(efd);
+	if (IS_ERR(efile)) {
+		ret = PTR_ERR(efile);
+		goto fail;
+	}
+
+	event->eventfd = eventfd_ctx_fileget(efile);
+	if (IS_ERR(event->eventfd)) {
+		ret = PTR_ERR(event->eventfd);
+		goto fail;
+	}
+
+	cfile = fget(cfd);
+	if (!cfile) {
+		ret = -EBADF;
+		goto fail;
+	}
+
+	/* the process need read permission on control file */
+	ret = file_permission(cfile, MAY_READ);
+	if (ret < 0)
+		goto fail;
+
+	event->cft = __file_cft(cfile);
+	if (IS_ERR(event->cft)) {
+		ret = PTR_ERR(event->cft);
+		goto fail;
+	}
+
+	if (!event->cft->register_event || !event->cft->unregister_event) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	ret = event->cft->register_event(cgrp, event->cft,
+			event->eventfd, buffer);
+	if (ret)
+		goto fail;
+
+	if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
+		event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+		ret = 0;
+		goto fail;
+	}
+
+	/*
+	 * Events should be removed after rmdir of cgroup directory, but before
+	 * destroying subsystem state objects. Let's take reference to cgroup
+	 * directory dentry to do that.
+	 */
+	dget(cgrp->dentry);
+
+	spin_lock(&cgrp->event_list_lock);
+	list_add(&event->list, &cgrp->event_list);
+	spin_unlock(&cgrp->event_list_lock);
+
+	fput(cfile);
+	fput(efile);
+
+	return 0;
+
+fail:
+	if (cfile)
+		fput(cfile);
+
+	if (event && event->eventfd && !IS_ERR(event->eventfd))
+		eventfd_ctx_put(event->eventfd);
+
+	if (!IS_ERR_OR_NULL(efile))
+		fput(efile);
+
+	kfree(event);
+
+	return ret;
+}
+
+/*
  * for the common functions, 'private' gives the type of file
  */
 /* for hysterical raisins, we can't put this on the older files */
@@ -2828,6 +3176,11 @@ static struct cftype files[] = {
 		.read_u64 = cgroup_read_notify_on_release,
 		.write_u64 = cgroup_write_notify_on_release,
 	},
+	{
+		.name = CGROUP_FILE_GENERIC_PREFIX "event_control",
+		.write_string = cgroup_write_event_control,
+		.mode = S_IWUGO,
+	},
 };
 
 static struct cftype cft_release_agent = {
@@ -2892,8 +3245,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
 	/* We need to take each hierarchy_mutex in a consistent order */
 	int i;
 
+	/*
+	 * No worry about a race with rebind_subsystems that might mess up the
+	 * locking order, since both parties are under cgroup_mutex.
+	 */
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
+		if (ss == NULL)
+			continue;
 		if (ss->root == root)
 			mutex_lock(&ss->hierarchy_mutex);
 	}
@@ -2905,6 +3264,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
 
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
+		if (ss == NULL)
+			continue;
 		if (ss->root == root)
 			mutex_unlock(&ss->hierarchy_mutex);
 	}
@@ -3028,11 +3389,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
 	 * synchronization other than RCU, and the subsystem linked
 	 * list isn't RCU-safe */
 	int i;
+	/*
+	 * We won't need to lock the subsys array, because the subsystems
+	 * we're concerned about aren't going anywhere since our cgroup root
+	 * has a reference on them.
+	 */
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 		struct cgroup_subsys_state *css;
-		/* Skip subsystems not in this hierarchy */
-		if (ss->root != cgrp->root)
+		/* Skip subsystems not present or not in this hierarchy */
+		if (ss == NULL || ss->root != cgrp->root)
 			continue;
 		css = cgrp->subsys[ss->subsys_id];
 		/* When called from check_for_release() it's possible
@@ -3106,6 +3472,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
3106 struct dentry *d; 3472 struct dentry *d;
3107 struct cgroup *parent; 3473 struct cgroup *parent;
3108 DEFINE_WAIT(wait); 3474 DEFINE_WAIT(wait);
3475 struct cgroup_event *event, *tmp;
3109 int ret; 3476 int ret;
3110 3477
3111 /* the vfs holds both inode->i_mutex already */ 3478 /* the vfs holds both inode->i_mutex already */
@@ -3189,6 +3556,20 @@ again:
3189 set_bit(CGRP_RELEASABLE, &parent->flags); 3556 set_bit(CGRP_RELEASABLE, &parent->flags);
3190 check_for_release(parent); 3557 check_for_release(parent);
3191 3558
3559 /*
3560 * Unregister events and notify userspace.
3561	 * Notify userspace about cgroup removal only after rmdir of the cgroup
3562	 * directory, to avoid a race between userspace and kernelspace.
3563 */
3564 spin_lock(&cgrp->event_list_lock);
3565 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
3566 list_del(&event->list);
3567 remove_wait_queue(event->wqh, &event->wait);
3568 eventfd_signal(event->eventfd, 1);
3569 schedule_work(&event->remove);
3570 }
3571 spin_unlock(&cgrp->event_list_lock);
3572
3192 mutex_unlock(&cgroup_mutex); 3573 mutex_unlock(&cgroup_mutex);
3193 return 0; 3574 return 0;
3194} 3575}
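Continuing the userspace sketch from earlier: on rmdir the kernel signals each registered eventfd once and unregisters it, so a blocked read() returns. How a reader then distinguishes removal from an ordinary event is an assumption about typical usage, e.g. by probing the cgroup directory:

	uint64_t count;

	read(efd, &count, sizeof(count));    /* woken by eventfd_signal() */
	if (access("/cgroup/foo", F_OK) < 0)
		printf("cgroup removed; event was torn down by the kernel\n");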
@@ -3223,7 +3604,196 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
3223 mutex_init(&ss->hierarchy_mutex); 3604 mutex_init(&ss->hierarchy_mutex);
3224 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); 3605 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3225 ss->active = 1; 3606 ss->active = 1;
3607
3608 /* this function shouldn't be used with modular subsystems, since they
3609 * need to register a subsys_id, among other things */
3610 BUG_ON(ss->module);
3611}
3612
3613/**
3614 * cgroup_load_subsys: load and register a modular subsystem at runtime
3615 * @ss: the subsystem to load
3616 *
3617 * This function should be called in a modular subsystem's initcall. If the
3618 * subsystem is built as a module, it will be assigned a new subsys_id and set
3619 * up for use. If the subsystem is built-in anyway, work is delegated to the
3620 * simpler cgroup_init_subsys.
3621 */
3622int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
3623{
3624 int i;
3625 struct cgroup_subsys_state *css;
3626
3627 /* check name and function validity */
3628 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
3629 ss->create == NULL || ss->destroy == NULL)
3630 return -EINVAL;
3631
3632 /*
3633 * we don't support callbacks in modular subsystems. this check is
3634 * before the ss->module check for consistency; a subsystem that could
3635 * be a module should still have no callbacks even if the user isn't
3636 * compiling it as one.
3637 */
3638 if (ss->fork || ss->exit)
3639 return -EINVAL;
3640
3641 /*
3642 * an optionally modular subsystem is built-in: we want to do nothing,
3643 * since cgroup_init_subsys will have already taken care of it.
3644 */
3645 if (ss->module == NULL) {
3646 /* a few sanity checks */
3647 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
3648 BUG_ON(subsys[ss->subsys_id] != ss);
3649 return 0;
3650 }
3651
3652 /*
3653 * need to register a subsys id before anything else - for example,
3654 * init_cgroup_css needs it.
3655 */
3656 mutex_lock(&cgroup_mutex);
3657 /* find the first empty slot in the array */
3658 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
3659 if (subsys[i] == NULL)
3660 break;
3661 }
3662 if (i == CGROUP_SUBSYS_COUNT) {
3663 /* maximum number of subsystems already registered! */
3664 mutex_unlock(&cgroup_mutex);
3665 return -EBUSY;
3666 }
3667 /* assign ourselves the subsys_id */
3668 ss->subsys_id = i;
3669 subsys[i] = ss;
3670
3671 /*
3672 * no ss->create seems to need anything important in the ss struct, so
3673 * this can happen first (i.e. before the rootnode attachment).
3674 */
3675 css = ss->create(ss, dummytop);
3676 if (IS_ERR(css)) {
3677 /* failure case - need to deassign the subsys[] slot. */
3678 subsys[i] = NULL;
3679 mutex_unlock(&cgroup_mutex);
3680 return PTR_ERR(css);
3681 }
3682
3683 list_add(&ss->sibling, &rootnode.subsys_list);
3684 ss->root = &rootnode;
3685
3686 /* our new subsystem will be attached to the dummy hierarchy. */
3687 init_cgroup_css(css, ss, dummytop);
3688 /* init_idr must be after init_cgroup_css because it sets css->id. */
3689 if (ss->use_id) {
3690 int ret = cgroup_init_idr(ss, css);
3691 if (ret) {
3692 dummytop->subsys[ss->subsys_id] = NULL;
3693 ss->destroy(ss, dummytop);
3694 subsys[i] = NULL;
3695 mutex_unlock(&cgroup_mutex);
3696 return ret;
3697 }
3698 }
3699
3700 /*
3701 * Now we need to entangle the css into the existing css_sets. unlike
3702 * in cgroup_init_subsys, there are now multiple css_sets, so each one
3703 * will need a new pointer to it; done by iterating the css_set_table.
3704 * furthermore, modifying the existing css_sets will corrupt the hash
3705 * table state, so each changed css_set will need its hash recomputed.
3706 * this is all done under the css_set_lock.
3707 */
3708 write_lock(&css_set_lock);
3709 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
3710 struct css_set *cg;
3711 struct hlist_node *node, *tmp;
3712 struct hlist_head *bucket = &css_set_table[i], *new_bucket;
3713
3714 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
3715 /* skip entries that we already rehashed */
3716 if (cg->subsys[ss->subsys_id])
3717 continue;
3718 /* remove existing entry */
3719 hlist_del(&cg->hlist);
3720 /* set new value */
3721 cg->subsys[ss->subsys_id] = css;
3722 /* recompute hash and restore entry */
3723 new_bucket = css_set_hash(cg->subsys);
3724 hlist_add_head(&cg->hlist, new_bucket);
3725 }
3726 }
3727 write_unlock(&css_set_lock);
3728
3729 mutex_init(&ss->hierarchy_mutex);
3730 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3731 ss->active = 1;
3732
3733 /* success! */
3734 mutex_unlock(&cgroup_mutex);
3735 return 0;
3226} 3736}
3737EXPORT_SYMBOL_GPL(cgroup_load_subsys);
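A hedged sketch of the intended caller: a modular subsystem's initcall, loosely modelled on the checks above. The names my_subsys, my_create and my_destroy are hypothetical; a real subsystem would also populate its control files.

#include <linux/module.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/err.h>

static struct cgroup_subsys_state *my_create(struct cgroup_subsys *ss,
					     struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

	return css ? css : ERR_PTR(-ENOMEM);
}

static void my_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	kfree(cgrp->subsys[ss->subsys_id]);
}

struct cgroup_subsys my_subsys = {
	.name		= "my_ss",
	.create		= my_create,
	.destroy	= my_destroy,
	.module		= THIS_MODULE,	/* marks the subsystem as modular */
};

static int __init my_init(void)
{
	/* assigns a runtime subsys_id and attaches to the dummy hierarchy */
	return cgroup_load_subsys(&my_subsys);
}
module_init(my_init);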
3738
3739/**
3740 * cgroup_unload_subsys: unload a modular subsystem
3741 * @ss: the subsystem to unload
3742 *
3743 * This function should be called in a modular subsystem's exitcall. When this
3744 * function is invoked, the refcount on the subsystem's module will be 0, so
3745 * the subsystem will not be attached to any hierarchy.
3746 */
3747void cgroup_unload_subsys(struct cgroup_subsys *ss)
3748{
3749 struct cg_cgroup_link *link;
3750 struct hlist_head *hhead;
3751
3752 BUG_ON(ss->module == NULL);
3753
3754 /*
3755 * we shouldn't be called if the subsystem is in use, and the use of
3756 * try_module_get in parse_cgroupfs_options should ensure that it
3757 * doesn't start being used while we're killing it off.
3758 */
3759 BUG_ON(ss->root != &rootnode);
3760
3761 mutex_lock(&cgroup_mutex);
3762 /* deassign the subsys_id */
3763 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
3764 subsys[ss->subsys_id] = NULL;
3765
3766 /* remove subsystem from rootnode's list of subsystems */
3767 list_del(&ss->sibling);
3768
3769 /*
3770 * disentangle the css from all css_sets attached to the dummytop. as
3771 * in loading, we need to pay our respects to the hashtable gods.
3772 */
3773 write_lock(&css_set_lock);
3774 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
3775 struct css_set *cg = link->cg;
3776
3777 hlist_del(&cg->hlist);
3778 BUG_ON(!cg->subsys[ss->subsys_id]);
3779 cg->subsys[ss->subsys_id] = NULL;
3780 hhead = css_set_hash(cg->subsys);
3781 hlist_add_head(&cg->hlist, hhead);
3782 }
3783 write_unlock(&css_set_lock);
3784
3785 /*
3786 * remove subsystem's css from the dummytop and free it - need to free
3787 * before marking as null because ss->destroy needs the cgrp->subsys
3788 * pointer to find their state. note that this also takes care of
3789 * freeing the css_id.
3790 */
3791 ss->destroy(ss, dummytop);
3792 dummytop->subsys[ss->subsys_id] = NULL;
3793
3794 mutex_unlock(&cgroup_mutex);
3795}
3796EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
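And the matching exitcall, again with hypothetical names; per the contract above, the module refcount is already zero here, so no hierarchy can still hold the subsystem:

static void __exit my_exit(void)
{
	cgroup_unload_subsys(&my_subsys);  /* releases the runtime subsys_id */
}
module_exit(my_exit);
MODULE_LICENSE("GPL");                     /* required for the _GPL exports */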
3227 3797
3228/** 3798/**
3229 * cgroup_init_early - cgroup initialization at system boot 3799 * cgroup_init_early - cgroup initialization at system boot
@@ -3253,7 +3823,8 @@ int __init cgroup_init_early(void)
3253 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 3823 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
3254 INIT_HLIST_HEAD(&css_set_table[i]); 3824 INIT_HLIST_HEAD(&css_set_table[i]);
3255 3825
3256 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3826 /* at bootup time, we don't worry about modular subsystems */
3827 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3257 struct cgroup_subsys *ss = subsys[i]; 3828 struct cgroup_subsys *ss = subsys[i];
3258 3829
3259 BUG_ON(!ss->name); 3830 BUG_ON(!ss->name);
@@ -3288,12 +3859,13 @@ int __init cgroup_init(void)
3288 if (err) 3859 if (err)
3289 return err; 3860 return err;
3290 3861
3291 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3862 /* at bootup time, we don't worry about modular subsystems */
3863 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3292 struct cgroup_subsys *ss = subsys[i]; 3864 struct cgroup_subsys *ss = subsys[i];
3293 if (!ss->early_init) 3865 if (!ss->early_init)
3294 cgroup_init_subsys(ss); 3866 cgroup_init_subsys(ss);
3295 if (ss->use_id) 3867 if (ss->use_id)
3296 cgroup_subsys_init_idr(ss); 3868 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
3297 } 3869 }
3298 3870
3299 /* Add init_css_set to the hash table */ 3871 /* Add init_css_set to the hash table */
@@ -3397,9 +3969,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3397 int i; 3969 int i;
3398 3970
3399 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 3971 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
3972 /*
3973 * ideally we don't want subsystems moving around while we do this.
3974 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
3975 * subsys/hierarchy state.
3976 */
3400 mutex_lock(&cgroup_mutex); 3977 mutex_lock(&cgroup_mutex);
3401 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3978 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3402 struct cgroup_subsys *ss = subsys[i]; 3979 struct cgroup_subsys *ss = subsys[i];
3980 if (ss == NULL)
3981 continue;
3403 seq_printf(m, "%s\t%d\t%d\t%d\n", 3982 seq_printf(m, "%s\t%d\t%d\t%d\n",
3404 ss->name, ss->root->hierarchy_id, 3983 ss->name, ss->root->hierarchy_id,
3405 ss->root->number_of_cgroups, !ss->disabled); 3984 ss->root->number_of_cgroups, !ss->disabled);
@@ -3457,7 +4036,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
3457{ 4036{
3458 if (need_forkexit_callback) { 4037 if (need_forkexit_callback) {
3459 int i; 4038 int i;
3460 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4039 /*
4040 * forkexit callbacks are only supported for builtin
4041 * subsystems, and the builtin section of the subsys array is
4042 * immutable, so we don't need to lock the subsys array here.
4043 */
4044 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3461 struct cgroup_subsys *ss = subsys[i]; 4045 struct cgroup_subsys *ss = subsys[i];
3462 if (ss->fork) 4046 if (ss->fork)
3463 ss->fork(ss, child); 4047 ss->fork(ss, child);
@@ -3526,7 +4110,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
3526 struct css_set *cg; 4110 struct css_set *cg;
3527 4111
3528 if (run_callbacks && need_forkexit_callback) { 4112 if (run_callbacks && need_forkexit_callback) {
3529 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4113 /*
4114 * modular subsystems can't use callbacks, so no need to lock
4115 * the subsys array
4116 */
4117 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3530 struct cgroup_subsys *ss = subsys[i]; 4118 struct cgroup_subsys *ss = subsys[i];
3531 if (ss->exit) 4119 if (ss->exit)
3532 ss->exit(ss, tsk); 4120 ss->exit(ss, tsk);
@@ -3720,12 +4308,13 @@ static void check_for_release(struct cgroup *cgrp)
3720 } 4308 }
3721} 4309}
3722 4310
3723void __css_put(struct cgroup_subsys_state *css) 4311/* Caller must verify that the css is not for root cgroup */
4312void __css_put(struct cgroup_subsys_state *css, int count)
3724{ 4313{
3725 struct cgroup *cgrp = css->cgroup; 4314 struct cgroup *cgrp = css->cgroup;
3726 int val; 4315 int val;
3727 rcu_read_lock(); 4316 rcu_read_lock();
3728 val = atomic_dec_return(&css->refcnt); 4317 val = atomic_sub_return(count, &css->refcnt);
3729 if (val == 1) { 4318 if (val == 1) {
3730 if (notify_on_release(cgrp)) { 4319 if (notify_on_release(cgrp)) {
3731 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4320 set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3736,6 +4325,7 @@ void __css_put(struct cgroup_subsys_state *css)
3736 rcu_read_unlock(); 4325 rcu_read_unlock();
3737 WARN_ON_ONCE(val < 1); 4326 WARN_ON_ONCE(val < 1);
3738} 4327}
4328EXPORT_SYMBOL_GPL(__css_put);
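The "caller must verify" rule is naturally enforced by an inline wrapper in the header rather than by every call site; a sketch of what such a wrapper would look like, assuming a CSS_ROOT flag bit marks root csses:

static inline void css_put(struct cgroup_subsys_state *css)
{
	/* root csses are never put; everyone else drops one reference */
	if (!test_bit(CSS_ROOT, &css->flags))
		__css_put(css, 1);
}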
3739 4329
3740/* 4330/*
3741 * Notify userspace when a cgroup is released, by running the 4331 * Notify userspace when a cgroup is released, by running the
@@ -3817,8 +4407,11 @@ static int __init cgroup_disable(char *str)
3817 while ((token = strsep(&str, ",")) != NULL) { 4407 while ((token = strsep(&str, ",")) != NULL) {
3818 if (!*token) 4408 if (!*token)
3819 continue; 4409 continue;
3820 4410 /*
3821 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4411 * cgroup_disable, being at boot time, can't know about module
4412 * subsystems, so we don't worry about them.
4413 */
4414 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3822 struct cgroup_subsys *ss = subsys[i]; 4415 struct cgroup_subsys *ss = subsys[i];
3823 4416
3824 if (!strcmp(token, ss->name)) { 4417 if (!strcmp(token, ss->name)) {
@@ -3842,31 +4435,65 @@ __setup("cgroup_disable=", cgroup_disable);
3842 */ 4435 */
3843unsigned short css_id(struct cgroup_subsys_state *css) 4436unsigned short css_id(struct cgroup_subsys_state *css)
3844{ 4437{
3845 struct css_id *cssid = rcu_dereference(css->id); 4438 struct css_id *cssid;
4439
4440 /*
4441	 * css_id() can return a correct value when someone holds a refcnt
4442	 * on the css or is inside rcu_read_lock(). Once css->id is allocated,
4443 * it's unchanged until freed.
4444 */
4445 cssid = rcu_dereference_check(css->id,
4446 rcu_read_lock_held() || atomic_read(&css->refcnt));
3846 4447
3847 if (cssid) 4448 if (cssid)
3848 return cssid->id; 4449 return cssid->id;
3849 return 0; 4450 return 0;
3850} 4451}
4452EXPORT_SYMBOL_GPL(css_id);
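A brief usage sketch of the contract described above: holding either an RCU read-side critical section or a reference on the css keeps the returned id stable.

	unsigned short id;

	rcu_read_lock();             /* satisfies rcu_dereference_check() */
	id = css_id(css);
	rcu_read_unlock();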
3851 4453
3852unsigned short css_depth(struct cgroup_subsys_state *css) 4454unsigned short css_depth(struct cgroup_subsys_state *css)
3853{ 4455{
3854 struct css_id *cssid = rcu_dereference(css->id); 4456 struct css_id *cssid;
4457
4458 cssid = rcu_dereference_check(css->id,
4459 rcu_read_lock_held() || atomic_read(&css->refcnt));
3855 4460
3856 if (cssid) 4461 if (cssid)
3857 return cssid->depth; 4462 return cssid->depth;
3858 return 0; 4463 return 0;
3859} 4464}
4465EXPORT_SYMBOL_GPL(css_depth);
4466
4467/**
4468 * css_is_ancestor - test "root" css is an ancestor of "child"
4469 * @child: the css to be tested.
4470 * @root: the css supposed to be an ancestor of the child.
4471 *
4472 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
4473 * this function reads css->id, it uses rcu_dereference() and rcu_read_lock().
4474 * But, considering usual usage, the csses should be valid objects after the
4475 * test. Assuming that the caller will act on the child if this returns true,
4476 * the caller must hold a reference count on "child".
4477 * If "child" is a valid object and this returns true, "root" is valid, too.
4478 */
3860 4479
3861bool css_is_ancestor(struct cgroup_subsys_state *child, 4480bool css_is_ancestor(struct cgroup_subsys_state *child,
3862 const struct cgroup_subsys_state *root) 4481 const struct cgroup_subsys_state *root)
3863{ 4482{
3864 struct css_id *child_id = rcu_dereference(child->id); 4483 struct css_id *child_id;
3865 struct css_id *root_id = rcu_dereference(root->id); 4484 struct css_id *root_id;
4485 bool ret = true;
3866 4486
3867 if (!child_id || !root_id || (child_id->depth < root_id->depth)) 4487 rcu_read_lock();
3868 return false; 4488 child_id = rcu_dereference(child->id);
3869 return child_id->stack[root_id->depth] == root_id->id; 4489 root_id = rcu_dereference(root->id);
4490 if (!child_id
4491 || !root_id
4492 || (child_id->depth < root_id->depth)
4493 || (child_id->stack[root_id->depth] != root_id->id))
4494 ret = false;
4495 rcu_read_unlock();
4496 return ret;
3870} 4497}
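A usage sketch of that contract, assuming the css_tryget() helper from the cgroup header; act_on_child() is a hypothetical caller action:

	if (css_tryget(child)) {            /* pin "child" across the test */
		if (css_is_ancestor(child, root))
			act_on_child(child);
		css_put(child);             /* "root" was valid throughout */
	}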
3871 4498
3872static void __free_css_id_cb(struct rcu_head *head) 4499static void __free_css_id_cb(struct rcu_head *head)
@@ -3893,6 +4520,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
3893 spin_unlock(&ss->id_lock); 4520 spin_unlock(&ss->id_lock);
3894 call_rcu(&id->rcu_head, __free_css_id_cb); 4521 call_rcu(&id->rcu_head, __free_css_id_cb);
3895} 4522}
4523EXPORT_SYMBOL_GPL(free_css_id);
3896 4524
3897/* 4525/*
3898 * This is called by init or create(). Then, calls to this function are 4526 * This is called by init or create(). Then, calls to this function are
@@ -3942,15 +4570,14 @@ err_out:
3942 4570
3943} 4571}
3944 4572
3945static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) 4573static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4574 struct cgroup_subsys_state *rootcss)
3946{ 4575{
3947 struct css_id *newid; 4576 struct css_id *newid;
3948 struct cgroup_subsys_state *rootcss;
3949 4577
3950 spin_lock_init(&ss->id_lock); 4578 spin_lock_init(&ss->id_lock);
3951 idr_init(&ss->idr); 4579 idr_init(&ss->idr);
3952 4580
3953 rootcss = init_css_set.subsys[ss->subsys_id];
3954 newid = get_new_cssid(ss, 0); 4581 newid = get_new_cssid(ss, 0);
3955 if (IS_ERR(newid)) 4582 if (IS_ERR(newid))
3956 return PTR_ERR(newid); 4583 return PTR_ERR(newid);
@@ -3966,13 +4593,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
3966{ 4593{
3967 int subsys_id, i, depth = 0; 4594 int subsys_id, i, depth = 0;
3968 struct cgroup_subsys_state *parent_css, *child_css; 4595 struct cgroup_subsys_state *parent_css, *child_css;
3969 struct css_id *child_id, *parent_id = NULL; 4596 struct css_id *child_id, *parent_id;
3970 4597
3971 subsys_id = ss->subsys_id; 4598 subsys_id = ss->subsys_id;
3972 parent_css = parent->subsys[subsys_id]; 4599 parent_css = parent->subsys[subsys_id];
3973 child_css = child->subsys[subsys_id]; 4600 child_css = child->subsys[subsys_id];
3974 depth = css_depth(parent_css) + 1;
3975 parent_id = parent_css->id; 4601 parent_id = parent_css->id;
4602 depth = parent_id->depth;
3976 4603
3977 child_id = get_new_cssid(ss, depth); 4604 child_id = get_new_cssid(ss, depth);
3978 if (IS_ERR(child_id)) 4605 if (IS_ERR(child_id))
@@ -4010,6 +4637,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
4010 4637
4011 return rcu_dereference(cssid->css); 4638 return rcu_dereference(cssid->css);
4012} 4639}
4640EXPORT_SYMBOL_GPL(css_lookup);
4013 4641
4014/** 4642/**
4015 * css_get_next - lookup next cgroup under specified hierarchy. 4643 * css_get_next - lookup next cgroup under specified hierarchy.
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 59e9ef6aab40..e5c0244962b0 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -15,6 +15,7 @@
15 */ 15 */
16 16
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/slab.h>
18#include <linux/cgroup.h> 19#include <linux/cgroup.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
20#include <linux/uaccess.h> 21#include <linux/uaccess.h>
@@ -47,17 +48,20 @@ static inline struct freezer *task_freezer(struct task_struct *task)
47 struct freezer, css); 48 struct freezer, css);
48} 49}
49 50
50int cgroup_frozen(struct task_struct *task) 51int cgroup_freezing_or_frozen(struct task_struct *task)
51{ 52{
52 struct freezer *freezer; 53 struct freezer *freezer;
53 enum freezer_state state; 54 enum freezer_state state;
54 55
55 task_lock(task); 56 task_lock(task);
56 freezer = task_freezer(task); 57 freezer = task_freezer(task);
57 state = freezer->state; 58 if (!freezer->css.cgroup->parent)
59 state = CGROUP_THAWED; /* root cgroup can't be frozen */
60 else
61 state = freezer->state;
58 task_unlock(task); 62 task_unlock(task);
59 63
60 return state == CGROUP_FROZEN; 64 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
61} 65}
62 66
63/* 67/*
@@ -201,9 +205,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
201 * No lock is needed, since the task isn't on tasklist yet, 205 * No lock is needed, since the task isn't on tasklist yet,
202 * so it can't be moved to another cgroup, which means the 206 * so it can't be moved to another cgroup, which means the
203 * freezer won't be removed and will be valid during this 207 * freezer won't be removed and will be valid during this
204 * function call. 208 * function call. Nevertheless, apply RCU read-side critical
209 * section to suppress RCU lockdep false positives.
205 */ 210 */
211 rcu_read_lock();
206 freezer = task_freezer(task); 212 freezer = task_freezer(task);
213 rcu_read_unlock();
207 214
208 /* 215 /*
209 * The root cgroup is non-freezable, so we can skip the 216 * The root cgroup is non-freezable, so we can skip the
diff --git a/kernel/compat.c b/kernel/compat.c
index f6c204f07ea6..7f40e9275fd9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -25,6 +25,7 @@
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/times.h> 26#include <linux/times.h>
27#include <linux/ptrace.h> 27#include <linux/ptrace.h>
28#include <linux/gfp.h>
28 29
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30 31
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 677f25376a38..545777574779 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -14,6 +14,7 @@
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/gfp.h>
17 18
18#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
19/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 20/* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -163,6 +164,7 @@ static inline void check_for_tasks(int cpu)
163} 164}
164 165
165struct take_cpu_down_param { 166struct take_cpu_down_param {
167 struct task_struct *caller;
166 unsigned long mod; 168 unsigned long mod;
167 void *hcpu; 169 void *hcpu;
168}; 170};
@@ -171,6 +173,7 @@ struct take_cpu_down_param {
171static int __ref take_cpu_down(void *_param) 173static int __ref take_cpu_down(void *_param)
172{ 174{
173 struct take_cpu_down_param *param = _param; 175 struct take_cpu_down_param *param = _param;
176 unsigned int cpu = (unsigned long)param->hcpu;
174 int err; 177 int err;
175 178
176 /* Ensure this CPU doesn't handle any more interrupts. */ 179 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -181,6 +184,8 @@ static int __ref take_cpu_down(void *_param)
181 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, 184 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
182 param->hcpu); 185 param->hcpu);
183 186
187 if (task_cpu(param->caller) == cpu)
188 move_task_off_dead_cpu(cpu, param->caller);
184 /* Force idle task to run as soon as we yield: it should 189 /* Force idle task to run as soon as we yield: it should
185 immediately notice cpu is offline and die quickly. */ 190 immediately notice cpu is offline and die quickly. */
186 sched_idle_next(); 191 sched_idle_next();
@@ -191,10 +196,10 @@ static int __ref take_cpu_down(void *_param)
191static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 196static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
192{ 197{
193 int err, nr_calls = 0; 198 int err, nr_calls = 0;
194 cpumask_var_t old_allowed;
195 void *hcpu = (void *)(long)cpu; 199 void *hcpu = (void *)(long)cpu;
196 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 200 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
197 struct take_cpu_down_param tcd_param = { 201 struct take_cpu_down_param tcd_param = {
202 .caller = current,
198 .mod = mod, 203 .mod = mod,
199 .hcpu = hcpu, 204 .hcpu = hcpu,
200 }; 205 };
@@ -205,9 +210,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
205 if (!cpu_online(cpu)) 210 if (!cpu_online(cpu))
206 return -EINVAL; 211 return -EINVAL;
207 212
208 if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
209 return -ENOMEM;
210
211 cpu_hotplug_begin(); 213 cpu_hotplug_begin();
212 set_cpu_active(cpu, false); 214 set_cpu_active(cpu, false);
213 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 215 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
@@ -224,10 +226,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
224 goto out_release; 226 goto out_release;
225 } 227 }
226 228
227 /* Ensure that we are not runnable on dying cpu */
228 cpumask_copy(old_allowed, &current->cpus_allowed);
229 set_cpus_allowed_ptr(current, cpu_active_mask);
230
231 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 229 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
232 if (err) { 230 if (err) {
233 set_cpu_active(cpu, true); 231 set_cpu_active(cpu, true);
@@ -236,7 +234,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
236 hcpu) == NOTIFY_BAD) 234 hcpu) == NOTIFY_BAD)
237 BUG(); 235 BUG();
238 236
239 goto out_allowed; 237 goto out_release;
240 } 238 }
241 BUG_ON(cpu_online(cpu)); 239 BUG_ON(cpu_online(cpu));
242 240
@@ -254,8 +252,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
254 252
255 check_for_tasks(cpu); 253 check_for_tasks(cpu);
256 254
257out_allowed:
258 set_cpus_allowed_ptr(current, old_allowed);
259out_release: 255out_release:
260 cpu_hotplug_done(); 256 cpu_hotplug_done();
261 if (!err) { 257 if (!err) {
@@ -263,7 +259,6 @@ out_release:
263 hcpu) == NOTIFY_BAD) 259 hcpu) == NOTIFY_BAD)
264 BUG(); 260 BUG();
265 } 261 }
266 free_cpumask_var(old_allowed);
267 return err; 262 return err;
268} 263}
269 264
@@ -271,9 +266,6 @@ int __ref cpu_down(unsigned int cpu)
271{ 266{
272 int err; 267 int err;
273 268
274 err = stop_machine_create();
275 if (err)
276 return err;
277 cpu_maps_update_begin(); 269 cpu_maps_update_begin();
278 270
279 if (cpu_hotplug_disabled) { 271 if (cpu_hotplug_disabled) {
@@ -285,7 +277,6 @@ int __ref cpu_down(unsigned int cpu)
285 277
286out: 278out:
287 cpu_maps_update_done(); 279 cpu_maps_update_done();
288 stop_machine_destroy();
289 return err; 280 return err;
290} 281}
291EXPORT_SYMBOL(cpu_down); 282EXPORT_SYMBOL(cpu_down);
@@ -338,7 +329,7 @@ int __cpuinit cpu_up(unsigned int cpu)
338 if (!cpu_possible(cpu)) { 329 if (!cpu_possible(cpu)) {
339 printk(KERN_ERR "can't online cpu %d because it is not " 330 printk(KERN_ERR "can't online cpu %d because it is not "
340 "configured as may-hotadd at boot time\n", cpu); 331 "configured as may-hotadd at boot time\n", cpu);
341#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 332#if defined(CONFIG_IA64)
342 printk(KERN_ERR "please check additional_cpus= boot " 333 printk(KERN_ERR "please check additional_cpus= boot "
343 "parameter\n"); 334 "parameter\n");
344#endif 335#endif
@@ -366,9 +357,6 @@ int disable_nonboot_cpus(void)
366{ 357{
367 int cpu, first_cpu, error; 358 int cpu, first_cpu, error;
368 359
369 error = stop_machine_create();
370 if (error)
371 return error;
372 cpu_maps_update_begin(); 360 cpu_maps_update_begin();
373 first_cpu = cpumask_first(cpu_online_mask); 361 first_cpu = cpumask_first(cpu_online_mask);
374 /* 362 /*
@@ -399,7 +387,6 @@ int disable_nonboot_cpus(void)
399 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 387 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
400 } 388 }
401 cpu_maps_update_done(); 389 cpu_maps_update_done();
402 stop_machine_destroy();
403 return error; 390 return error;
404} 391}
405 392
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ba401fab459f..9a50c5f6e727 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 * call to guarantee_online_mems(), as we know no one is changing 920 * call to guarantee_online_mems(), as we know no one is changing
921 * our task's cpuset. 921 * our task's cpuset.
922 * 922 *
923 * Hold callback_mutex around the two modifications of our tasks
924 * mems_allowed to synchronize with cpuset_mems_allowed().
925 *
926 * While the mm_struct we are migrating is typically from some 923 * While the mm_struct we are migrating is typically from some
927 * other task, the task_struct mems_allowed that we are hacking 924 * other task, the task_struct mems_allowed that we are hacking
928 * is for our current task, which must allocate new pages for that 925 * is for our current task, which must allocate new pages for that
@@ -973,15 +970,20 @@ static void cpuset_change_nodemask(struct task_struct *p,
973 struct cpuset *cs; 970 struct cpuset *cs;
974 int migrate; 971 int migrate;
975 const nodemask_t *oldmem = scan->data; 972 const nodemask_t *oldmem = scan->data;
976 nodemask_t newmems; 973 NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
974
975 if (!newmems)
976 return;
977 977
978 cs = cgroup_cs(scan->cg); 978 cs = cgroup_cs(scan->cg);
979 guarantee_online_mems(cs, &newmems); 979 guarantee_online_mems(cs, newmems);
980 980
981 task_lock(p); 981 task_lock(p);
982 cpuset_change_task_nodemask(p, &newmems); 982 cpuset_change_task_nodemask(p, newmems);
983 task_unlock(p); 983 task_unlock(p);
984 984
985 NODEMASK_FREE(newmems);
986
985 mm = get_task_mm(p); 987 mm = get_task_mm(p);
986 if (!mm) 988 if (!mm)
987 return; 989 return;
@@ -1051,16 +1053,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1051static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 1053static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1052 const char *buf) 1054 const char *buf)
1053{ 1055{
1054 nodemask_t oldmem; 1056 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1055 int retval; 1057 int retval;
1056 struct ptr_heap heap; 1058 struct ptr_heap heap;
1057 1059
1060 if (!oldmem)
1061 return -ENOMEM;
1062
1058 /* 1063 /*
1059 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1064 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
1060 * it's read-only 1065 * it's read-only
1061 */ 1066 */
1062 if (cs == &top_cpuset) 1067 if (cs == &top_cpuset) {
1063 return -EACCES; 1068 retval = -EACCES;
1069 goto done;
1070 }
1064 1071
1065 /* 1072 /*
1066 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 1073 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
@@ -1076,11 +1083,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1076 goto done; 1083 goto done;
1077 1084
1078 if (!nodes_subset(trialcs->mems_allowed, 1085 if (!nodes_subset(trialcs->mems_allowed,
1079 node_states[N_HIGH_MEMORY])) 1086 node_states[N_HIGH_MEMORY])) {
1080 return -EINVAL; 1087 retval = -EINVAL;
1088 goto done;
1089 }
1081 } 1090 }
1082 oldmem = cs->mems_allowed; 1091 *oldmem = cs->mems_allowed;
1083 if (nodes_equal(oldmem, trialcs->mems_allowed)) { 1092 if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
1084 retval = 0; /* Too easy - nothing to do */ 1093 retval = 0; /* Too easy - nothing to do */
1085 goto done; 1094 goto done;
1086 } 1095 }
@@ -1096,10 +1105,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1096 cs->mems_allowed = trialcs->mems_allowed; 1105 cs->mems_allowed = trialcs->mems_allowed;
1097 mutex_unlock(&callback_mutex); 1106 mutex_unlock(&callback_mutex);
1098 1107
1099 update_tasks_nodemask(cs, &oldmem, &heap); 1108 update_tasks_nodemask(cs, oldmem, &heap);
1100 1109
1101 heap_free(&heap); 1110 heap_free(&heap);
1102done: 1111done:
1112 NODEMASK_FREE(oldmem);
1103 return retval; 1113 return retval;
1104} 1114}
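The hunks above all follow the same pattern: an on-stack nodemask_t is traded for a heap allocation, because nodemask_t grows with CONFIG_NODES_SHIFT and can blow the stack on large-node configs. A minimal sketch of the pattern, with cs standing in for any cpuset pointer:

	NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);

	if (mask == NULL)
		return -ENOMEM;

	*mask = cs->mems_allowed;    /* copy by value through the pointer */
	/* ... operate on *mask where the old code used a local mask ... */
	NODEMASK_FREE(mask);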
1105 1115
@@ -1384,40 +1394,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1384 struct cgroup *oldcont, struct task_struct *tsk, 1394 struct cgroup *oldcont, struct task_struct *tsk,
1385 bool threadgroup) 1395 bool threadgroup)
1386{ 1396{
1387 nodemask_t from, to;
1388 struct mm_struct *mm; 1397 struct mm_struct *mm;
1389 struct cpuset *cs = cgroup_cs(cont); 1398 struct cpuset *cs = cgroup_cs(cont);
1390 struct cpuset *oldcs = cgroup_cs(oldcont); 1399 struct cpuset *oldcs = cgroup_cs(oldcont);
1400 NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
1401 NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
1402
1403 if (from == NULL || to == NULL)
1404 goto alloc_fail;
1391 1405
1392 if (cs == &top_cpuset) { 1406 if (cs == &top_cpuset) {
1393 cpumask_copy(cpus_attach, cpu_possible_mask); 1407 cpumask_copy(cpus_attach, cpu_possible_mask);
1394 to = node_possible_map;
1395 } else { 1408 } else {
1396 guarantee_online_cpus(cs, cpus_attach); 1409 guarantee_online_cpus(cs, cpus_attach);
1397 guarantee_online_mems(cs, &to);
1398 } 1410 }
1411 guarantee_online_mems(cs, to);
1399 1412
1400 /* do per-task migration stuff possibly for each in the threadgroup */ 1413 /* do per-task migration stuff possibly for each in the threadgroup */
1401 cpuset_attach_task(tsk, &to, cs); 1414 cpuset_attach_task(tsk, to, cs);
1402 if (threadgroup) { 1415 if (threadgroup) {
1403 struct task_struct *c; 1416 struct task_struct *c;
1404 rcu_read_lock(); 1417 rcu_read_lock();
1405 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1418 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1406 cpuset_attach_task(c, &to, cs); 1419 cpuset_attach_task(c, to, cs);
1407 } 1420 }
1408 rcu_read_unlock(); 1421 rcu_read_unlock();
1409 } 1422 }
1410 1423
1411 /* change mm; only needs to be done once even if threadgroup */ 1424 /* change mm; only needs to be done once even if threadgroup */
1412 from = oldcs->mems_allowed; 1425 *from = oldcs->mems_allowed;
1413 to = cs->mems_allowed; 1426 *to = cs->mems_allowed;
1414 mm = get_task_mm(tsk); 1427 mm = get_task_mm(tsk);
1415 if (mm) { 1428 if (mm) {
1416 mpol_rebind_mm(mm, &to); 1429 mpol_rebind_mm(mm, to);
1417 if (is_memory_migrate(cs)) 1430 if (is_memory_migrate(cs))
1418 cpuset_migrate_mm(mm, &from, &to); 1431 cpuset_migrate_mm(mm, from, to);
1419 mmput(mm); 1432 mmput(mm);
1420 } 1433 }
1434
1435alloc_fail:
1436 NODEMASK_FREE(from);
1437 NODEMASK_FREE(to);
1421} 1438}
1422 1439
1423/* The various types of files and directories in a cpuset file system */ 1440/* The various types of files and directories in a cpuset file system */
@@ -1562,13 +1579,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1562 1579
1563static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1580static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1564{ 1581{
1565 nodemask_t mask; 1582 NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
1583 int retval;
1584
1585 if (mask == NULL)
1586 return -ENOMEM;
1566 1587
1567 mutex_lock(&callback_mutex); 1588 mutex_lock(&callback_mutex);
1568 mask = cs->mems_allowed; 1589 *mask = cs->mems_allowed;
1569 mutex_unlock(&callback_mutex); 1590 mutex_unlock(&callback_mutex);
1570 1591
1571 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1592 retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
1593
1594 NODEMASK_FREE(mask);
1595
1596 return retval;
1572} 1597}
1573 1598
1574static ssize_t cpuset_common_file_read(struct cgroup *cont, 1599static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1997,7 +2022,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1997 struct cpuset *cp; /* scans cpusets being updated */ 2022 struct cpuset *cp; /* scans cpusets being updated */
1998 struct cpuset *child; /* scans child cpusets of cp */ 2023 struct cpuset *child; /* scans child cpusets of cp */
1999 struct cgroup *cont; 2024 struct cgroup *cont;
2000 nodemask_t oldmems; 2025 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2026
2027 if (oldmems == NULL)
2028 return;
2001 2029
2002 list_add_tail((struct list_head *)&root->stack_list, &queue); 2030 list_add_tail((struct list_head *)&root->stack_list, &queue);
2003 2031
@@ -2014,7 +2042,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2042 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2015 continue; 2043 continue;
2016 2044
2017 oldmems = cp->mems_allowed; 2045 *oldmems = cp->mems_allowed;
2018 2046
2019 /* Remove offline cpus and mems from this cpuset. */ 2047 /* Remove offline cpus and mems from this cpuset. */
2020 mutex_lock(&callback_mutex); 2048 mutex_lock(&callback_mutex);
@@ -2030,9 +2058,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2030 remove_tasks_in_empty_cpuset(cp); 2058 remove_tasks_in_empty_cpuset(cp);
2031 else { 2059 else {
2032 update_tasks_cpumask(cp, NULL); 2060 update_tasks_cpumask(cp, NULL);
2033 update_tasks_nodemask(cp, &oldmems, NULL); 2061 update_tasks_nodemask(cp, oldmems, NULL);
2034 } 2062 }
2035 } 2063 }
2064 NODEMASK_FREE(oldmems);
2036} 2065}
2037 2066
2038/* 2067/*
@@ -2090,20 +2119,33 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2090static int cpuset_track_online_nodes(struct notifier_block *self, 2119static int cpuset_track_online_nodes(struct notifier_block *self,
2091 unsigned long action, void *arg) 2120 unsigned long action, void *arg)
2092{ 2121{
2122 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2123
2124 if (oldmems == NULL)
2125 return NOTIFY_DONE;
2126
2093 cgroup_lock(); 2127 cgroup_lock();
2094 switch (action) { 2128 switch (action) {
2095 case MEM_ONLINE: 2129 case MEM_ONLINE:
2096 case MEM_OFFLINE: 2130 *oldmems = top_cpuset.mems_allowed;
2097 mutex_lock(&callback_mutex); 2131 mutex_lock(&callback_mutex);
2098 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2132 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2099 mutex_unlock(&callback_mutex); 2133 mutex_unlock(&callback_mutex);
2100 if (action == MEM_OFFLINE) 2134 update_tasks_nodemask(&top_cpuset, oldmems, NULL);
2101 scan_for_empty_cpusets(&top_cpuset); 2135 break;
2136 case MEM_OFFLINE:
2137 /*
2138 * needn't update top_cpuset.mems_allowed explicitly because
2139 * scan_for_empty_cpusets() will update it.
2140 */
2141 scan_for_empty_cpusets(&top_cpuset);
2102 break; 2142 break;
2103 default: 2143 default:
2104 break; 2144 break;
2105 } 2145 }
2106 cgroup_unlock(); 2146 cgroup_unlock();
2147
2148 NODEMASK_FREE(oldmems);
2107 return NOTIFY_OK; 2149 return NOTIFY_OK;
2108} 2150}
2109#endif 2151#endif
@@ -2140,19 +2182,52 @@ void __init cpuset_init_smp(void)
2140void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2182void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2141{ 2183{
2142 mutex_lock(&callback_mutex); 2184 mutex_lock(&callback_mutex);
2143 cpuset_cpus_allowed_locked(tsk, pmask); 2185 task_lock(tsk);
2186 guarantee_online_cpus(task_cs(tsk), pmask);
2187 task_unlock(tsk);
2144 mutex_unlock(&callback_mutex); 2188 mutex_unlock(&callback_mutex);
2145} 2189}
2146 2190
2147/** 2191int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2148 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
2149 * Must be called with callback_mutex held.
2150 **/
2151void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
2152{ 2192{
2153 task_lock(tsk); 2193 const struct cpuset *cs;
2154 guarantee_online_cpus(task_cs(tsk), pmask); 2194 int cpu;
2155 task_unlock(tsk); 2195
2196 rcu_read_lock();
2197 cs = task_cs(tsk);
2198 if (cs)
2199 cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
2200 rcu_read_unlock();
2201
2202 /*
2203 * We own tsk->cpus_allowed, nobody can change it under us.
2204 *
2205 * But we used cs && cs->cpus_allowed lockless and thus can
2206 * race with cgroup_attach_task() or update_cpumask() and get
2207 * the wrong tsk->cpus_allowed. However, both cases imply the
2208 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
2209 * which takes task_rq_lock().
2210 *
2211 * If we are called after it dropped the lock we must see all
2212 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
2213 * set any mask even if it is not right from task_cs() pov,
2214 * the pending set_cpus_allowed_ptr() will fix things.
2215 */
2216
2217 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
2218 if (cpu >= nr_cpu_ids) {
2219 /*
2220 * Either tsk->cpus_allowed is wrong (see above) or it
2221 * is actually empty. The latter case is only possible
2222 * if we are racing with remove_tasks_in_empty_cpuset().
2223 * Like above we can temporary set any mask and rely on
2224 * set_cpus_allowed_ptr() as synchronization point.
2225 */
2226 cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
2227 cpu = cpumask_any(cpu_active_mask);
2228 }
2229
2230 return cpu;
2156} 2231}
2157 2232
2158void cpuset_init_current_mems_allowed(void) 2233void cpuset_init_current_mems_allowed(void)
@@ -2341,22 +2416,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2341} 2416}
2342 2417
2343/** 2418/**
2344 * cpuset_lock - lock out any changes to cpuset structures
2345 *
2346 * The out of memory (oom) code needs to mutex_lock cpusets
2347 * from being changed while it scans the tasklist looking for a
2348 * task in an overlapping cpuset. Expose callback_mutex via this
2349 * cpuset_lock() routine, so the oom code can lock it, before
2350 * locking the task list. The tasklist_lock is a spinlock, so
2351 * must be taken inside callback_mutex.
2352 */
2353
2354void cpuset_lock(void)
2355{
2356 mutex_lock(&callback_mutex);
2357}
2358
2359/**
2360 * cpuset_unlock - release lock on cpuset changes 2419 * cpuset_unlock - release lock on cpuset changes
2361 * 2420 *
2362 * Undo the lock taken in a previous cpuset_lock() call. 2421 * Undo the lock taken in a previous cpuset_lock() call.
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
deleted file mode 100644
index 2dc4fc2d0bf1..000000000000
--- a/kernel/cred-internals.h
+++ /dev/null
@@ -1,21 +0,0 @@
1/* Internal credentials stuff
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12/*
13 * user.c
14 */
15static inline void sched_switch_user(struct task_struct *p)
16{
17#ifdef CONFIG_USER_SCHED
18 sched_move_task(p);
19#endif /* CONFIG_USER_SCHED */
20}
21
diff --git a/kernel/cred.c b/kernel/cred.c
index 1ed8ca18790c..8f3672a58a1e 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -10,13 +10,13 @@
10 */ 10 */
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/cred.h> 12#include <linux/cred.h>
13#include <linux/slab.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/key.h> 15#include <linux/key.h>
15#include <linux/keyctl.h> 16#include <linux/keyctl.h>
16#include <linux/init_task.h> 17#include <linux/init_task.h>
17#include <linux/security.h> 18#include <linux/security.h>
18#include <linux/cn_proc.h> 19#include <linux/cn_proc.h>
19#include "cred-internals.h"
20 20
21#if 0 21#if 0
22#define kdebug(FMT, ...) \ 22#define kdebug(FMT, ...) \
@@ -364,7 +364,7 @@ struct cred *prepare_usermodehelper_creds(void)
364 364
365 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC); 365 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
366 if (!new) 366 if (!new)
367 return NULL; 367 goto free_tgcred;
368 368
369 kdebug("prepare_usermodehelper_creds() alloc %p", new); 369 kdebug("prepare_usermodehelper_creds() alloc %p", new);
370 370
@@ -398,6 +398,12 @@ struct cred *prepare_usermodehelper_creds(void)
398error: 398error:
399 put_cred(new); 399 put_cred(new);
400 return NULL; 400 return NULL;
401
402free_tgcred:
403#ifdef CONFIG_KEYS
404 kfree(tgcred);
405#endif
406 return NULL;
401} 407}
402 408
403/* 409/*
@@ -553,8 +559,6 @@ int commit_creds(struct cred *new)
553 atomic_dec(&old->user->processes); 559 atomic_dec(&old->user->processes);
554 alter_cred_subscribers(old, -2); 560 alter_cred_subscribers(old, -2);
555 561
556 sched_switch_user(task);
557
558 /* send notifications */ 562 /* send notifications */
559 if (new->uid != old->uid || 563 if (new->uid != old->uid ||
560 new->euid != old->euid || 564 new->euid != old->euid ||
@@ -786,8 +790,6 @@ bool creds_are_invalid(const struct cred *cred)
786{ 790{
787 if (cred->magic != CRED_MAGIC) 791 if (cred->magic != CRED_MAGIC)
788 return true; 792 return true;
789 if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
790 return true;
791#ifdef CONFIG_SECURITY_SELINUX 793#ifdef CONFIG_SECURITY_SELINUX
792 if (selinux_is_enabled()) { 794 if (selinux_is_enabled()) {
793 if ((unsigned long) cred->security < PAGE_SIZE) 795 if ((unsigned long) cred->security < PAGE_SIZE)
diff --git a/kernel/early_res.c b/kernel/early_res.c
new file mode 100644
index 000000000000..31aa9332ef3f
--- /dev/null
+++ b/kernel/early_res.c
@@ -0,0 +1,584 @@
1/*
2 * early_res, could be used to replace bootmem
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/bootmem.h>
8#include <linux/mm.h>
9#include <linux/early_res.h>
10
11/*
12 * Early reserved memory areas.
13 */
14/*
15 * need to make sure this one is big enough before
16 * find_fw_memmap_area can be used
17 */
18#define MAX_EARLY_RES_X 32
19
20struct early_res {
21 u64 start, end;
22 char name[15];
23 char overlap_ok;
24};
25static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
26
27static int max_early_res __initdata = MAX_EARLY_RES_X;
28static struct early_res *early_res __initdata = &early_res_x[0];
29static int early_res_count __initdata;
30
31static int __init find_overlapped_early(u64 start, u64 end)
32{
33 int i;
34 struct early_res *r;
35
36 for (i = 0; i < max_early_res && early_res[i].end; i++) {
37 r = &early_res[i];
38 if (end > r->start && start < r->end)
39 break;
40 }
41
42 return i;
43}
44
45/*
46 * Drop the i-th range from the early reservation map,
47 * by copying any higher ranges down one over it, and
48 * clearing what had been the last slot.
49 */
50static void __init drop_range(int i)
51{
52 int j;
53
54 for (j = i + 1; j < max_early_res && early_res[j].end; j++)
55 ;
56
57 memmove(&early_res[i], &early_res[i + 1],
58 (j - 1 - i) * sizeof(struct early_res));
59
60 early_res[j - 1].end = 0;
61 early_res_count--;
62}
63
64static void __init drop_range_partial(int i, u64 start, u64 end)
65{
66 u64 common_start, common_end;
67 u64 old_start, old_end;
68
69 old_start = early_res[i].start;
70 old_end = early_res[i].end;
71 common_start = max(old_start, start);
72 common_end = min(old_end, end);
73
74 /* no overlap ? */
75 if (common_start >= common_end)
76 return;
77
78 if (old_start < common_start) {
79 /* make head segment */
80 early_res[i].end = common_start;
81 if (old_end > common_end) {
82 char name[15];
83
84 /*
85 * Save a local copy of the name, since the
86 * early_res array could get resized inside
87 * reserve_early_without_check() ->
88 * __check_and_double_early_res(), which would
89 * make the current name pointer invalid.
90 */
91 strncpy(name, early_res[i].name,
92 sizeof(early_res[i].name) - 1);
93 /* add another for left over on tail */
94 reserve_early_without_check(common_end, old_end, name);
95 }
96 return;
97 } else {
98 if (old_end > common_end) {
99 /* reuse the entry for tail left */
100 early_res[i].start = common_end;
101 return;
102 }
103 /* all covered */
104 drop_range(i);
105 }
106}
107
108/*
109 * Split any existing ranges that:
110 * 1) are marked 'overlap_ok', and
111 * 2) overlap with the stated range [start, end)
112 * into whatever portion (if any) of the existing range is entirely
113 * below or entirely above the stated range. Drop the portion
114 * of the existing range that overlaps with the stated range,
115 * which will allow the caller of this routine to then add that
116 * stated range without conflicting with any existing range.
117 */
118static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
119{
120 int i;
121 struct early_res *r;
122 u64 lower_start, lower_end;
123 u64 upper_start, upper_end;
124 char name[15];
125
126 for (i = 0; i < max_early_res && early_res[i].end; i++) {
127 r = &early_res[i];
128
129 /* Continue past non-overlapping ranges */
130 if (end <= r->start || start >= r->end)
131 continue;
132
133 /*
134 * Leave non-ok overlaps as is; let caller
135 * panic "Overlapping early reservations"
136 * when it hits this overlap.
137 */
138 if (!r->overlap_ok)
139 return;
140
141 /*
142 * We have an ok overlap. We will drop it from the early
143 * reservation map, and add back in any non-overlapping
144 * portions (lower or upper) as separate, overlap_ok,
145 * non-overlapping ranges.
146 */
147
148 /* 1. Note any non-overlapping (lower or upper) ranges. */
149 strncpy(name, r->name, sizeof(name) - 1);
150
151 lower_start = lower_end = 0;
152 upper_start = upper_end = 0;
153 if (r->start < start) {
154 lower_start = r->start;
155 lower_end = start;
156 }
157 if (r->end > end) {
158 upper_start = end;
159 upper_end = r->end;
160 }
161
162 /* 2. Drop the original ok overlapping range */
163 drop_range(i);
164
165 i--; /* resume for-loop on copied down entry */
166
167 /* 3. Add back in any non-overlapping ranges. */
168 if (lower_end)
169 reserve_early_overlap_ok(lower_start, lower_end, name);
170 if (upper_end)
171 reserve_early_overlap_ok(upper_start, upper_end, name);
172 }
173}
174
175static void __init __reserve_early(u64 start, u64 end, char *name,
176 int overlap_ok)
177{
178 int i;
179 struct early_res *r;
180
181 i = find_overlapped_early(start, end);
182 if (i >= max_early_res)
183 panic("Too many early reservations");
184 r = &early_res[i];
185 if (r->end)
186 panic("Overlapping early reservations "
187 "%llx-%llx %s to %llx-%llx %s\n",
188 start, end - 1, name ? name : "", r->start,
189 r->end - 1, r->name);
190 r->start = start;
191 r->end = end;
192 r->overlap_ok = overlap_ok;
193 if (name)
194 strncpy(r->name, name, sizeof(r->name) - 1);
195 early_res_count++;
196}
197
198/*
198 * A few early reservations come here.
200 *
201 * The 'overlap_ok' in the name of this routine does -not- mean it
202 * is ok for these reservations to overlap an earlier reservation.
203 * Rather it means that it is ok for subsequent reservations to
204 * overlap this one.
205 *
206 * Use this entry point to reserve early ranges when you are doing
207 * so out of "Paranoia", reserving perhaps more memory than you need,
208 * just in case, and don't mind a subsequent overlapping reservation
209 * that is known to be needed.
210 *
211 * The drop_overlaps_that_are_ok() call here isn't really needed.
212 * It would be needed if we had two colliding 'overlap_ok'
213 * reservations, so that the second such would not panic on the
214 * overlap with the first. We don't have any such as of this
215 * writing, but might as well tolerate such if it happens in
216 * the future.
217 */
218void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
219{
220 drop_overlaps_that_are_ok(start, end);
221 __reserve_early(start, end, name, 1);
222}
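A small usage sketch of the semantics spelled out above; the addresses and names are illustrative. The overlap_ok reservation tolerates a later, overlapping precise reservation, whereas the reverse order would panic:

	/* paranoid, possibly oversized reservation; later overlaps are ok */
	reserve_early_overlap_ok(0x6000, 0xa000, "TRAMPOLINE AREA");

	/* precise reservation inside it: drop_overlaps_that_are_ok() trims
	 * the overlap_ok range around this one, so no panic */
	reserve_early(0x7000, 0x8000, "TRAMPOLINE");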
223
224static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
225{
226 u64 start, end, size, mem;
227 struct early_res *new;
228
229 /* do we have enough slots left ? */
230 if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
231 return;
232
233 /* double it */
234 mem = -1ULL;
235 size = sizeof(struct early_res) * max_early_res * 2;
236 if (early_res == early_res_x)
237 start = 0;
238 else
239 start = early_res[0].end;
240 end = ex_start;
241 if (start + size < end)
242 mem = find_fw_memmap_area(start, end, size,
243 sizeof(struct early_res));
244 if (mem == -1ULL) {
245 start = ex_end;
246 end = get_max_mapped();
247 if (start + size < end)
248 mem = find_fw_memmap_area(start, end, size,
249 sizeof(struct early_res));
250 }
251 if (mem == -1ULL)
252 panic("can not find more space for early_res array");
253
254 new = __va(mem);
255 /* save the first one for own */
256 new[0].start = mem;
257 new[0].end = mem + size;
258 new[0].overlap_ok = 0;
259 /* copy old to new */
260 if (early_res == early_res_x) {
261 memcpy(&new[1], &early_res[0],
262 sizeof(struct early_res) * max_early_res);
263 memset(&new[max_early_res+1], 0,
264 sizeof(struct early_res) * (max_early_res - 1));
265 early_res_count++;
266 } else {
267 memcpy(&new[1], &early_res[1],
268 sizeof(struct early_res) * (max_early_res - 1));
269 memset(&new[max_early_res], 0,
270 sizeof(struct early_res) * max_early_res);
271 }
272 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
273 early_res = new;
274 max_early_res *= 2;
275 printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
276 max_early_res, mem, mem + size - 1);
277}
278
279/*
280 * Most early reservations come here.
281 *
282 * We first have drop_overlaps_that_are_ok() drop any pre-existing
283 * 'overlap_ok' ranges, so that we can then reserve this memory
284 * range without risk of panic'ing on an overlapping overlap_ok
285 * early reservation.
286 */
287void __init reserve_early(u64 start, u64 end, char *name)
288{
289 if (start >= end)
290 return;
291
292 __check_and_double_early_res(start, end);
293
294 drop_overlaps_that_are_ok(start, end);
295 __reserve_early(start, end, name, 0);
296}
297
298void __init reserve_early_without_check(u64 start, u64 end, char *name)
299{
300 struct early_res *r;
301
302 if (start >= end)
303 return;
304
305 __check_and_double_early_res(start, end);
306
307 r = &early_res[early_res_count];
308
309 r->start = start;
310 r->end = end;
311 r->overlap_ok = 0;
312 if (name)
313 strncpy(r->name, name, sizeof(r->name) - 1);
314 early_res_count++;
315}
316
317void __init free_early(u64 start, u64 end)
318{
319 struct early_res *r;
320 int i;
321
322 i = find_overlapped_early(start, end);
323 r = &early_res[i];
324 if (i >= max_early_res || r->end != end || r->start != start)
325 panic("free_early on not reserved area: %llx-%llx!",
326 start, end - 1);
327
328 drop_range(i);
329}
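A hedged sketch of the basic pairing this API expects; free_early() must be handed exactly the range that was reserved, or it panics. The range and name are illustrative:

	u64 start = 0x200000, end = 0x240000;

	reserve_early(start, end, "MYDATA");   /* record the reservation */
	/* ... early boot code uses [start, end) ... */
	free_early(start, end);                /* must match exactly */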
330
331void __init free_early_partial(u64 start, u64 end)
332{
333 struct early_res *r;
334 int i;
335
336 if (start == end)
337 return;
338
339 if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end))
340 return;
341
342try_next:
343 i = find_overlapped_early(start, end);
344 if (i >= max_early_res)
345 return;
346
347 r = &early_res[i];
348 /* hole ? */
349 if (r->end >= end && r->start <= start) {
350 drop_range_partial(i, start, end);
351 return;
352 }
353
354 drop_range_partial(i, start, end);
355 goto try_next;
356}
357
358#ifdef CONFIG_NO_BOOTMEM
359static void __init subtract_early_res(struct range *range, int az)
360{
361 int i, count;
362 u64 final_start, final_end;
363 int idx = 0;
364
365 count = 0;
366 for (i = 0; i < max_early_res && early_res[i].end; i++)
367 count++;
368
369	/* need to skip the first one? */
370 if (early_res != early_res_x)
371 idx = 1;
372
373#define DEBUG_PRINT_EARLY_RES 1
374
375#if DEBUG_PRINT_EARLY_RES
376 printk(KERN_INFO "Subtract (%d early reservations)\n", count);
377#endif
378 for (i = idx; i < count; i++) {
379 struct early_res *r = &early_res[i];
380#if DEBUG_PRINT_EARLY_RES
381 printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
382 r->start, r->end, r->name);
383#endif
384 final_start = PFN_DOWN(r->start);
385 final_end = PFN_UP(r->end);
386 if (final_start >= final_end)
387 continue;
388 subtract_range(range, az, final_start, final_end);
389 }
390
391}
392
393int __init get_free_all_memory_range(struct range **rangep, int nodeid)
394{
395 int i, count;
396 u64 start = 0, end;
397 u64 size;
398 u64 mem;
399 struct range *range;
400 int nr_range;
401
402 count = 0;
403 for (i = 0; i < max_early_res && early_res[i].end; i++)
404 count++;
405
406 count *= 2;
407
408 size = sizeof(struct range) * count;
409 end = get_max_mapped();
410#ifdef MAX_DMA32_PFN
411 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
412 start = MAX_DMA32_PFN << PAGE_SHIFT;
413#endif
414 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
415 if (mem == -1ULL)
416		panic("cannot find more space for range free");
417
418 range = __va(mem);
419 /* use early_node_map[] and early_res to get range array at first */
420 memset(range, 0, size);
421 nr_range = 0;
422
423 /* need to go over early_node_map to find out good range for node */
424 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
425#ifdef CONFIG_X86_32
426 subtract_range(range, count, max_low_pfn, -1ULL);
427#endif
428 subtract_early_res(range, count);
429 nr_range = clean_sort_range(range, count);
430
431	/* need to clear it? */
432 if (nodeid == MAX_NUMNODES) {
433 memset(&early_res[0], 0,
434 sizeof(struct early_res) * max_early_res);
435 early_res = NULL;
436 max_early_res = 0;
437 }
438
439 *rangep = range;
440 return nr_range;
441}
442#else
443void __init early_res_to_bootmem(u64 start, u64 end)
444{
445 int i, count;
446 u64 final_start, final_end;
447 int idx = 0;
448
449 count = 0;
450 for (i = 0; i < max_early_res && early_res[i].end; i++)
451 count++;
452
453	/* need to skip the first one? */
454 if (early_res != early_res_x)
455 idx = 1;
456
457 printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
458 count - idx, max_early_res, start, end);
459 for (i = idx; i < count; i++) {
460 struct early_res *r = &early_res[i];
461 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
462 r->start, r->end, r->name);
463 final_start = max(start, r->start);
464 final_end = min(end, r->end);
465 if (final_start >= final_end) {
466 printk(KERN_CONT "\n");
467 continue;
468 }
469 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
470 final_start, final_end);
471 reserve_bootmem_generic(final_start, final_end - final_start,
472 BOOTMEM_DEFAULT);
473 }
474 /* clear them */
475 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
476 early_res = NULL;
477 max_early_res = 0;
478 early_res_count = 0;
479}
480#endif
481
482/* Check for already reserved areas */
483static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
484{
485 int i;
486 u64 addr = *addrp;
487 int changed = 0;
488 struct early_res *r;
489again:
490 i = find_overlapped_early(addr, addr + size);
491 r = &early_res[i];
492 if (i < max_early_res && r->end) {
493 *addrp = addr = round_up(r->end, align);
494 changed = 1;
495 goto again;
496 }
497 return changed;
498}
499
500/* Check for already reserved areas */
501static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
502{
503 int i;
504 u64 addr = *addrp, last;
505 u64 size = *sizep;
506 int changed = 0;
507again:
508 last = addr + size;
509 for (i = 0; i < max_early_res && early_res[i].end; i++) {
510 struct early_res *r = &early_res[i];
511 if (last > r->start && addr < r->start) {
512 size = r->start - addr;
513 changed = 1;
514 goto again;
515 }
516 if (last > r->end && addr < r->end) {
517 addr = round_up(r->end, align);
518 size = last - addr;
519 changed = 1;
520 goto again;
521 }
522 if (last <= r->end && addr >= r->start) {
523 (*sizep)++;
524 return 0;
525 }
526 }
527 if (changed) {
528 *addrp = addr;
529 *sizep = size;
530 }
531 return changed;
532}
533
534/*
535 * Find a free area with specified alignment in a specific range.
536 * Only the area between start and end is an active range from early_node_map,
537 * so it is known to be usable RAM.
538 */
539u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
540 u64 size, u64 align)
541{
542 u64 addr, last;
543
544 addr = round_up(ei_start, align);
545 if (addr < start)
546 addr = round_up(start, align);
547 if (addr >= ei_last)
548 goto out;
549 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
550 ;
551 last = addr + size;
552 if (last > ei_last)
553 goto out;
554 if (last > end)
555 goto out;
556
557 return addr;
558
559out:
560 return -1ULL;
561}
562
563u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
564 u64 *sizep, u64 align)
565{
566 u64 addr, last;
567
568 addr = round_up(ei_start, align);
569 if (addr < start)
570 addr = round_up(start, align);
571 if (addr >= ei_last)
572 goto out;
573 *sizep = ei_last - addr;
574 while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
575 ;
576 last = addr + *sizep;
577 if (last > ei_last)
578 goto out;
579
580 return addr;
581
582out:
583 return -1ULL;
584}
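
The early_res code above grows its reservation array by doubling whenever it is nearly full, copying the old entries into a freshly found block. A minimal userspace sketch of that grow-by-doubling pattern follows; calloc() stands in for find_fw_memmap_area() and all names are illustrative, not the kernel API.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct res { uint64_t start, end; };

static struct res *res_arr;
static int res_cap = 4;		/* stand-in for max_early_res */
static int res_count;

/* Double the array when only one free slot remains, mirroring
 * __check_and_double_early_res() above. */
static void check_and_double(void)
{
	struct res *new_arr;

	if (res_count < res_cap - 1)
		return;

	new_arr = calloc(res_cap * 2, sizeof(*new_arr));
	if (!new_arr) {
		fprintf(stderr, "out of memory\n");
		exit(1);
	}
	memcpy(new_arr, res_arr, sizeof(*new_arr) * res_count);
	free(res_arr);
	res_arr = new_arr;
	res_cap *= 2;
	printf("res array doubled to %d\n", res_cap);
}

static void reserve(uint64_t start, uint64_t end)
{
	if (start >= end)
		return;
	check_and_double();
	res_arr[res_count].start = start;
	res_arr[res_count].end = end;
	res_count++;
}

int main(void)
{
	res_arr = calloc(res_cap, sizeof(*res_arr));
	for (uint64_t i = 0; i < 10; i++)
		reserve(i * 0x1000, i * 0x1000 + 0x800);
	printf("%d reservations, capacity %d\n", res_count, res_cap);
	return 0;
}
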
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
new file mode 100644
index 000000000000..ff915efef66d
--- /dev/null
+++ b/kernel/elfcore.c
@@ -0,0 +1,28 @@
1#include <linux/elf.h>
2#include <linux/fs.h>
3#include <linux/mm.h>
4
5#include <asm/elf.h>
6
7
8Elf_Half __weak elf_core_extra_phdrs(void)
9{
10 return 0;
11}
12
13int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
14 unsigned long limit)
15{
16 return 1;
17}
18
19int __weak elf_core_write_extra_data(struct file *file, size_t *size,
20 unsigned long limit)
21{
22 return 1;
23}
24
25size_t __weak elf_core_extra_data_size(void)
26{
27 return 0;
28}
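
The new elfcore.c relies on weak linkage: the stubs above are defaults, and an architecture that needs extra ELF core phdrs provides strong definitions of the same symbols that win at link time. A standalone illustration of the mechanism, using the GCC/Clang weak attribute (the function name here is made up):

#include <stdio.h>

/* Weak default: used unless some other object file links in a strong
 * definition of the same symbol, exactly like the __weak stubs above. */
__attribute__((weak)) int extra_phdrs(void)
{
	return 0;
}

int main(void)
{
	/* Prints 0 here; an object file defining a strong extra_phdrs()
	 * would silently take over at link time. */
	printf("extra phdrs: %d\n", extra_phdrs());
	return 0;
}
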
diff --git a/kernel/exit.c b/kernel/exit.c
index 45ed043b8bf5..eabca5a73a85 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,7 +55,6 @@
55#include <asm/unistd.h> 55#include <asm/unistd.h>
56#include <asm/pgtable.h> 56#include <asm/pgtable.h>
57#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
58#include "cred-internals.h"
59 58
60static void exit_mm(struct task_struct * tsk); 59static void exit_mm(struct task_struct * tsk);
61 60
@@ -87,7 +86,7 @@ static void __exit_signal(struct task_struct *tsk)
87 86
88 sighand = rcu_dereference_check(tsk->sighand, 87 sighand = rcu_dereference_check(tsk->sighand,
89 rcu_read_lock_held() || 88 rcu_read_lock_held() ||
90 lockdep_is_held(&tasklist_lock)); 89 lockdep_tasklist_lock_is_held());
91 spin_lock(&sighand->siglock); 90 spin_lock(&sighand->siglock);
92 91
93 posix_cpu_timers_exit(tsk); 92 posix_cpu_timers_exit(tsk);
@@ -952,7 +951,9 @@ NORET_TYPE void do_exit(long code)
952 preempt_count()); 951 preempt_count());
953 952
954 acct_update_integrals(tsk); 953 acct_update_integrals(tsk);
955 954 /* sync mm's RSS info before statistics gathering */
955 if (tsk->mm)
956 sync_mm_rss(tsk, tsk->mm);
956 group_dead = atomic_dec_and_test(&tsk->signal->live); 957 group_dead = atomic_dec_and_test(&tsk->signal->live);
957 if (group_dead) { 958 if (group_dead) {
958 hrtimer_cancel(&tsk->signal->real_timer); 959 hrtimer_cancel(&tsk->signal->real_timer);
@@ -1188,7 +1189,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1188 1189
1189 if (unlikely(wo->wo_flags & WNOWAIT)) { 1190 if (unlikely(wo->wo_flags & WNOWAIT)) {
1190 int exit_code = p->exit_code; 1191 int exit_code = p->exit_code;
1191 int why, status; 1192 int why;
1192 1193
1193 get_task_struct(p); 1194 get_task_struct(p);
1194 read_unlock(&tasklist_lock); 1195 read_unlock(&tasklist_lock);
diff --git a/kernel/fork.c b/kernel/fork.c
index 17bbf093356d..4d57d9e3a6e9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -86,7 +86,14 @@ int max_threads; /* tunable limit on nr_threads */
86DEFINE_PER_CPU(unsigned long, process_counts) = 0; 86DEFINE_PER_CPU(unsigned long, process_counts) = 0;
87 87
88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
89EXPORT_SYMBOL_GPL(tasklist_lock); 89
90#ifdef CONFIG_PROVE_RCU
91int lockdep_tasklist_lock_is_held(void)
92{
93 return lockdep_is_held(&tasklist_lock);
94}
95EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
96#endif /* #ifdef CONFIG_PROVE_RCU */
90 97
91int nr_processes(void) 98int nr_processes(void)
92{ 99{
@@ -329,15 +336,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
329 if (!tmp) 336 if (!tmp)
330 goto fail_nomem; 337 goto fail_nomem;
331 *tmp = *mpnt; 338 *tmp = *mpnt;
339 INIT_LIST_HEAD(&tmp->anon_vma_chain);
332 pol = mpol_dup(vma_policy(mpnt)); 340 pol = mpol_dup(vma_policy(mpnt));
333 retval = PTR_ERR(pol); 341 retval = PTR_ERR(pol);
334 if (IS_ERR(pol)) 342 if (IS_ERR(pol))
335 goto fail_nomem_policy; 343 goto fail_nomem_policy;
336 vma_set_policy(tmp, pol); 344 vma_set_policy(tmp, pol);
345 if (anon_vma_fork(tmp, mpnt))
346 goto fail_nomem_anon_vma_fork;
337 tmp->vm_flags &= ~VM_LOCKED; 347 tmp->vm_flags &= ~VM_LOCKED;
338 tmp->vm_mm = mm; 348 tmp->vm_mm = mm;
339 tmp->vm_next = NULL; 349 tmp->vm_next = NULL;
340 anon_vma_link(tmp);
341 file = tmp->vm_file; 350 file = tmp->vm_file;
342 if (file) { 351 if (file) {
343 struct inode *inode = file->f_path.dentry->d_inode; 352 struct inode *inode = file->f_path.dentry->d_inode;
@@ -392,6 +401,8 @@ out:
392 flush_tlb_mm(oldmm); 401 flush_tlb_mm(oldmm);
393 up_write(&oldmm->mmap_sem); 402 up_write(&oldmm->mmap_sem);
394 return retval; 403 return retval;
404fail_nomem_anon_vma_fork:
405 mpol_put(pol);
395fail_nomem_policy: 406fail_nomem_policy:
396 kmem_cache_free(vm_area_cachep, tmp); 407 kmem_cache_free(vm_area_cachep, tmp);
397fail_nomem: 408fail_nomem:
@@ -455,8 +466,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
455 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; 466 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
456 mm->core_state = NULL; 467 mm->core_state = NULL;
457 mm->nr_ptes = 0; 468 mm->nr_ptes = 0;
458 set_mm_counter(mm, file_rss, 0); 469 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
459 set_mm_counter(mm, anon_rss, 0);
460 spin_lock_init(&mm->page_table_lock); 470 spin_lock_init(&mm->page_table_lock);
461 mm->free_area_cache = TASK_UNMAPPED_BASE; 471 mm->free_area_cache = TASK_UNMAPPED_BASE;
462 mm->cached_hole_size = ~0UL; 472 mm->cached_hole_size = ~0UL;
@@ -825,23 +835,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
825 */ 835 */
826static void posix_cpu_timers_init_group(struct signal_struct *sig) 836static void posix_cpu_timers_init_group(struct signal_struct *sig)
827{ 837{
838 unsigned long cpu_limit;
839
828 /* Thread group counters. */ 840 /* Thread group counters. */
829 thread_group_cputime_init(sig); 841 thread_group_cputime_init(sig);
830 842
831 /* Expiration times and increments. */ 843 cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
832 sig->it[CPUCLOCK_PROF].expires = cputime_zero; 844 if (cpu_limit != RLIM_INFINITY) {
833 sig->it[CPUCLOCK_PROF].incr = cputime_zero; 845 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
834 sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
835 sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
836
837 /* Cached expiration times. */
838 sig->cputime_expires.prof_exp = cputime_zero;
839 sig->cputime_expires.virt_exp = cputime_zero;
840 sig->cputime_expires.sched_exp = 0;
841
842 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
843 sig->cputime_expires.prof_exp =
844 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
845 sig->cputimer.running = 1; 846 sig->cputimer.running = 1;
846 } 847 }
847 848
@@ -858,7 +859,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
858 if (clone_flags & CLONE_THREAD) 859 if (clone_flags & CLONE_THREAD)
859 return 0; 860 return 0;
860 861
861 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 862 sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
862 tsk->signal = sig; 863 tsk->signal = sig;
863 if (!sig) 864 if (!sig)
864 return -ENOMEM; 865 return -ENOMEM;
@@ -866,46 +867,21 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
866 atomic_set(&sig->count, 1); 867 atomic_set(&sig->count, 1);
867 atomic_set(&sig->live, 1); 868 atomic_set(&sig->live, 1);
868 init_waitqueue_head(&sig->wait_chldexit); 869 init_waitqueue_head(&sig->wait_chldexit);
869 sig->flags = 0;
870 if (clone_flags & CLONE_NEWPID) 870 if (clone_flags & CLONE_NEWPID)
871 sig->flags |= SIGNAL_UNKILLABLE; 871 sig->flags |= SIGNAL_UNKILLABLE;
872 sig->group_exit_code = 0;
873 sig->group_exit_task = NULL;
874 sig->group_stop_count = 0;
875 sig->curr_target = tsk; 872 sig->curr_target = tsk;
876 init_sigpending(&sig->shared_pending); 873 init_sigpending(&sig->shared_pending);
877 INIT_LIST_HEAD(&sig->posix_timers); 874 INIT_LIST_HEAD(&sig->posix_timers);
878 875
879 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 876 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
880 sig->it_real_incr.tv64 = 0;
881 sig->real_timer.function = it_real_fn; 877 sig->real_timer.function = it_real_fn;
882 878
883 sig->leader = 0; /* session leadership doesn't inherit */
884 sig->tty_old_pgrp = NULL;
885 sig->tty = NULL;
886
887 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
888 sig->gtime = cputime_zero;
889 sig->cgtime = cputime_zero;
890#ifndef CONFIG_VIRT_CPU_ACCOUNTING
891 sig->prev_utime = sig->prev_stime = cputime_zero;
892#endif
893 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
894 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
895 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
896 sig->maxrss = sig->cmaxrss = 0;
897 task_io_accounting_init(&sig->ioac);
898 sig->sum_sched_runtime = 0;
899 taskstats_tgid_init(sig);
900
901 task_lock(current->group_leader); 879 task_lock(current->group_leader);
902 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 880 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
903 task_unlock(current->group_leader); 881 task_unlock(current->group_leader);
904 882
905 posix_cpu_timers_init_group(sig); 883 posix_cpu_timers_init_group(sig);
906 884
907 acct_init_pacct(&sig->pacct);
908
909 tty_audit_fork(sig); 885 tty_audit_fork(sig);
910 886
911 sig->oom_adj = current->signal->oom_adj; 887 sig->oom_adj = current->signal->oom_adj;
@@ -1034,7 +1010,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1034#endif 1010#endif
1035 retval = -EAGAIN; 1011 retval = -EAGAIN;
1036 if (atomic_read(&p->real_cred->user->processes) >= 1012 if (atomic_read(&p->real_cred->user->processes) >=
1037 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 1013 task_rlimit(p, RLIMIT_NPROC)) {
1038 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1014 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
1039 p->real_cred->user != INIT_USER) 1015 p->real_cred->user != INIT_USER)
1040 goto bad_fork_free; 1016 goto bad_fork_free;
@@ -1076,6 +1052,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1076 p->prev_utime = cputime_zero; 1052 p->prev_utime = cputime_zero;
1077 p->prev_stime = cputime_zero; 1053 p->prev_stime = cputime_zero;
1078#endif 1054#endif
1055#if defined(SPLIT_RSS_COUNTING)
1056 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
1057#endif
1079 1058
1080 p->default_timer_slack_ns = current->timer_slack_ns; 1059 p->default_timer_slack_ns = current->timer_slack_ns;
1081 1060
@@ -1133,10 +1112,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1133 p->memcg_batch.memcg = NULL; 1112 p->memcg_batch.memcg = NULL;
1134#endif 1113#endif
1135 1114
1136 p->bts = NULL;
1137
1138 p->stack_start = stack_start;
1139
1140 /* Perform scheduler related setup. Assign this task to a CPU. */ 1115 /* Perform scheduler related setup. Assign this task to a CPU. */
1141 sched_fork(p, clone_flags); 1116 sched_fork(p, clone_flags);
1142 1117
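
One detail in the fork.c hunk above: posix_cpu_timers_init_group() now reads RLIMIT_CPU exactly once through ACCESS_ONCE(), so the comparison against RLIM_INFINITY and the secs_to_cputime() conversion cannot observe two different values if another thread changes the limit concurrently. A userspace sketch of the same read-once pattern, using a C11 atomic in place of the kernel's volatile-cast ACCESS_ONCE() (names are illustrative):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define LIMIT_INFINITY UINT64_MAX	/* stand-in for RLIM_INFINITY */

static _Atomic uint64_t cpu_limit_secs = LIMIT_INFINITY;

static void init_group_expiry(void)
{
	/* One load; the test and the use both see the same snapshot,
	 * even if another thread stores a new limit in between. */
	uint64_t limit = atomic_load_explicit(&cpu_limit_secs,
					      memory_order_relaxed);

	if (limit != LIMIT_INFINITY)
		printf("prof expiry armed at %llu secs\n",
		       (unsigned long long)limit);
	else
		printf("no CPU limit, timer stays off\n");
}

int main(void)
{
	init_group_expiry();
	atomic_store(&cpu_limit_secs, 60);
	init_group_expiry();
	return 0;
}
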
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 967e66143e11..7a56b22e0602 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,23 +40,29 @@
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/slab.h>
43#include <linux/cpu.h> 44#include <linux/cpu.h>
44#include <linux/smp.h> 45#include <linux/smp.h>
45 46
46#include <linux/hw_breakpoint.h> 47#include <linux/hw_breakpoint.h>
47 48
49
48/* 50/*
49 * Constraints data 51 * Constraints data
50 */ 52 */
51 53
52/* Number of pinned cpu breakpoints in a cpu */ 54/* Number of pinned cpu breakpoints in a cpu */
53static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); 55static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
54 56
55/* Number of pinned task breakpoints in a cpu */ 57/* Number of pinned task breakpoints in a cpu */
56static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); 58static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
57 59
58/* Number of non-pinned cpu/task breakpoints in a cpu */ 60/* Number of non-pinned cpu/task breakpoints in a cpu */
59static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); 61static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
62
63static int nr_slots[TYPE_MAX];
64
65static int constraints_initialized;
60 66
61/* Gather the number of total pinned and un-pinned bp in a cpuset */ 67/* Gather the number of total pinned and un-pinned bp in a cpuset */
62struct bp_busy_slots { 68struct bp_busy_slots {
@@ -67,16 +73,29 @@ struct bp_busy_slots {
67/* Serialize accesses to the above constraints */ 73/* Serialize accesses to the above constraints */
68static DEFINE_MUTEX(nr_bp_mutex); 74static DEFINE_MUTEX(nr_bp_mutex);
69 75
76__weak int hw_breakpoint_weight(struct perf_event *bp)
77{
78 return 1;
79}
80
81static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
82{
83 if (bp->attr.bp_type & HW_BREAKPOINT_RW)
84 return TYPE_DATA;
85
86 return TYPE_INST;
87}
88
70/* 89/*
71 * Report the maximum number of pinned breakpoints a task 90 * Report the maximum number of pinned breakpoints a task
 72 * can have on this cpu 91
73 */ 92 */
74static unsigned int max_task_bp_pinned(int cpu) 93static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
75{ 94{
76 int i; 95 int i;
77 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); 96 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
78 97
79 for (i = HBP_NUM -1; i >= 0; i--) { 98 for (i = nr_slots[type] - 1; i >= 0; i--) {
80 if (tsk_pinned[i] > 0) 99 if (tsk_pinned[i] > 0)
81 return i + 1; 100 return i + 1;
82 } 101 }
@@ -84,7 +103,7 @@ static unsigned int max_task_bp_pinned(int cpu)
84 return 0; 103 return 0;
85} 104}
86 105
87static int task_bp_pinned(struct task_struct *tsk) 106static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type)
88{ 107{
89 struct perf_event_context *ctx = tsk->perf_event_ctxp; 108 struct perf_event_context *ctx = tsk->perf_event_ctxp;
90 struct list_head *list; 109 struct list_head *list;
@@ -105,7 +124,8 @@ static int task_bp_pinned(struct task_struct *tsk)
105 */ 124 */
106 list_for_each_entry(bp, list, event_entry) { 125 list_for_each_entry(bp, list, event_entry) {
107 if (bp->attr.type == PERF_TYPE_BREAKPOINT) 126 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
108 count++; 127 if (find_slot_idx(bp) == type)
128 count += hw_breakpoint_weight(bp);
109 } 129 }
110 130
111 raw_spin_unlock_irqrestore(&ctx->lock, flags); 131 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -118,18 +138,19 @@ static int task_bp_pinned(struct task_struct *tsk)
118 * a given cpu (cpu > -1) or in all of them (cpu = -1). 138 * a given cpu (cpu > -1) or in all of them (cpu = -1).
119 */ 139 */
120static void 140static void
121fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) 141fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
142 enum bp_type_idx type)
122{ 143{
123 int cpu = bp->cpu; 144 int cpu = bp->cpu;
124 struct task_struct *tsk = bp->ctx->task; 145 struct task_struct *tsk = bp->ctx->task;
125 146
126 if (cpu >= 0) { 147 if (cpu >= 0) {
127 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); 148 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
128 if (!tsk) 149 if (!tsk)
129 slots->pinned += max_task_bp_pinned(cpu); 150 slots->pinned += max_task_bp_pinned(cpu, type);
130 else 151 else
131 slots->pinned += task_bp_pinned(tsk); 152 slots->pinned += task_bp_pinned(tsk, type);
132 slots->flexible = per_cpu(nr_bp_flexible, cpu); 153 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
133 154
134 return; 155 return;
135 } 156 }
@@ -137,16 +158,16 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
137 for_each_online_cpu(cpu) { 158 for_each_online_cpu(cpu) {
138 unsigned int nr; 159 unsigned int nr;
139 160
140 nr = per_cpu(nr_cpu_bp_pinned, cpu); 161 nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
141 if (!tsk) 162 if (!tsk)
142 nr += max_task_bp_pinned(cpu); 163 nr += max_task_bp_pinned(cpu, type);
143 else 164 else
144 nr += task_bp_pinned(tsk); 165 nr += task_bp_pinned(tsk, type);
145 166
146 if (nr > slots->pinned) 167 if (nr > slots->pinned)
147 slots->pinned = nr; 168 slots->pinned = nr;
148 169
149 nr = per_cpu(nr_bp_flexible, cpu); 170 nr = per_cpu(nr_bp_flexible[type], cpu);
150 171
151 if (nr > slots->flexible) 172 if (nr > slots->flexible)
152 slots->flexible = nr; 173 slots->flexible = nr;
@@ -154,31 +175,49 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
154} 175}
155 176
156/* 177/*
178 * For now, continue to consider flexible as pinned, until we can
179 * ensure no flexible event can ever be scheduled before a pinned event
 180 * on the same cpu.
181 */
182static void
183fetch_this_slot(struct bp_busy_slots *slots, int weight)
184{
185 slots->pinned += weight;
186}
187
188/*
157 * Add a pinned breakpoint for the given task in our constraint table 189 * Add a pinned breakpoint for the given task in our constraint table
158 */ 190 */
159static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) 191static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
192 enum bp_type_idx type, int weight)
160{ 193{
161 unsigned int *tsk_pinned; 194 unsigned int *tsk_pinned;
162 int count = 0; 195 int old_count = 0;
196 int old_idx = 0;
197 int idx = 0;
163 198
164 count = task_bp_pinned(tsk); 199 old_count = task_bp_pinned(tsk, type);
200 old_idx = old_count - 1;
201 idx = old_idx + weight;
165 202
166 tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); 203 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
167 if (enable) { 204 if (enable) {
168 tsk_pinned[count]++; 205 tsk_pinned[idx]++;
169 if (count > 0) 206 if (old_count > 0)
170 tsk_pinned[count-1]--; 207 tsk_pinned[old_idx]--;
171 } else { 208 } else {
172 tsk_pinned[count]--; 209 tsk_pinned[idx]--;
173 if (count > 0) 210 if (old_count > 0)
174 tsk_pinned[count-1]++; 211 tsk_pinned[old_idx]++;
175 } 212 }
176} 213}
177 214
178/* 215/*
179 * Add/remove the given breakpoint in our constraint table 216 * Add/remove the given breakpoint in our constraint table
180 */ 217 */
181static void toggle_bp_slot(struct perf_event *bp, bool enable) 218static void
219toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
220 int weight)
182{ 221{
183 int cpu = bp->cpu; 222 int cpu = bp->cpu;
184 struct task_struct *tsk = bp->ctx->task; 223 struct task_struct *tsk = bp->ctx->task;
@@ -186,20 +225,20 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
186 /* Pinned counter task profiling */ 225 /* Pinned counter task profiling */
187 if (tsk) { 226 if (tsk) {
188 if (cpu >= 0) { 227 if (cpu >= 0) {
189 toggle_bp_task_slot(tsk, cpu, enable); 228 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
190 return; 229 return;
191 } 230 }
192 231
193 for_each_online_cpu(cpu) 232 for_each_online_cpu(cpu)
194 toggle_bp_task_slot(tsk, cpu, enable); 233 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
195 return; 234 return;
196 } 235 }
197 236
198 /* Pinned counter cpu profiling */ 237 /* Pinned counter cpu profiling */
199 if (enable) 238 if (enable)
200 per_cpu(nr_cpu_bp_pinned, bp->cpu)++; 239 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
201 else 240 else
202 per_cpu(nr_cpu_bp_pinned, bp->cpu)--; 241 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
203} 242}
204 243
205/* 244/*
@@ -246,14 +285,29 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
246static int __reserve_bp_slot(struct perf_event *bp) 285static int __reserve_bp_slot(struct perf_event *bp)
247{ 286{
248 struct bp_busy_slots slots = {0}; 287 struct bp_busy_slots slots = {0};
288 enum bp_type_idx type;
289 int weight;
249 290
250 fetch_bp_busy_slots(&slots, bp); 291 /* We couldn't initialize breakpoint constraints on boot */
292 if (!constraints_initialized)
293 return -ENOMEM;
294
295 /* Basic checks */
296 if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
297 bp->attr.bp_type == HW_BREAKPOINT_INVALID)
298 return -EINVAL;
299
300 type = find_slot_idx(bp);
301 weight = hw_breakpoint_weight(bp);
302
303 fetch_bp_busy_slots(&slots, bp, type);
304 fetch_this_slot(&slots, weight);
251 305
252 /* Flexible counters need to keep at least one slot */ 306 /* Flexible counters need to keep at least one slot */
253 if (slots.pinned + (!!slots.flexible) == HBP_NUM) 307 if (slots.pinned + (!!slots.flexible) > nr_slots[type])
254 return -ENOSPC; 308 return -ENOSPC;
255 309
256 toggle_bp_slot(bp, true); 310 toggle_bp_slot(bp, true, type, weight);
257 311
258 return 0; 312 return 0;
259} 313}
@@ -273,7 +327,12 @@ int reserve_bp_slot(struct perf_event *bp)
273 327
274static void __release_bp_slot(struct perf_event *bp) 328static void __release_bp_slot(struct perf_event *bp)
275{ 329{
276 toggle_bp_slot(bp, false); 330 enum bp_type_idx type;
331 int weight;
332
333 type = find_slot_idx(bp);
334 weight = hw_breakpoint_weight(bp);
335 toggle_bp_slot(bp, false, type, weight);
277} 336}
278 337
279void release_bp_slot(struct perf_event *bp) 338void release_bp_slot(struct perf_event *bp)
@@ -308,6 +367,28 @@ int dbg_release_bp_slot(struct perf_event *bp)
308 return 0; 367 return 0;
309} 368}
310 369
370static int validate_hw_breakpoint(struct perf_event *bp)
371{
372 int ret;
373
374 ret = arch_validate_hwbkpt_settings(bp);
375 if (ret)
376 return ret;
377
378 if (arch_check_bp_in_kernelspace(bp)) {
379 if (bp->attr.exclude_kernel)
380 return -EINVAL;
381 /*
382 * Don't let unprivileged users set a breakpoint in the trap
383 * path to avoid trap recursion attacks.
384 */
385 if (!capable(CAP_SYS_ADMIN))
386 return -EPERM;
387 }
388
389 return 0;
390}
391
311int register_perf_hw_breakpoint(struct perf_event *bp) 392int register_perf_hw_breakpoint(struct perf_event *bp)
312{ 393{
313 int ret; 394 int ret;
@@ -316,17 +397,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
316 if (ret) 397 if (ret)
317 return ret; 398 return ret;
318 399
319 /* 400 ret = validate_hw_breakpoint(bp);
320 * Ptrace breakpoints can be temporary perf events only
321 * meant to reserve a slot. In this case, it is created disabled and
322 * we don't want to check the params right now (as we put a null addr)
323 * But perf tools create events as disabled and we want to check
324 * the params for them.
325 * This is a quick hack that will be removed soon, once we remove
326 * the tmp breakpoints from ptrace
327 */
328 if (!bp->attr.disabled || !bp->overflow_handler)
329 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
330 401
331 /* if arch_validate_hwbkpt_settings() fails then release bp slot */ 402 /* if arch_validate_hwbkpt_settings() fails then release bp slot */
332 if (ret) 403 if (ret)
@@ -373,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
373 if (attr->disabled) 444 if (attr->disabled)
374 goto end; 445 goto end;
375 446
376 err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); 447 err = validate_hw_breakpoint(bp);
377 if (!err) 448 if (!err)
378 perf_event_enable(bp); 449 perf_event_enable(bp);
379 450
@@ -413,17 +484,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
413 * 484 *
414 * @return a set of per_cpu pointers to perf events 485 * @return a set of per_cpu pointers to perf events
415 */ 486 */
416struct perf_event ** 487struct perf_event * __percpu *
417register_wide_hw_breakpoint(struct perf_event_attr *attr, 488register_wide_hw_breakpoint(struct perf_event_attr *attr,
418 perf_overflow_handler_t triggered) 489 perf_overflow_handler_t triggered)
419{ 490{
420 struct perf_event **cpu_events, **pevent, *bp; 491 struct perf_event * __percpu *cpu_events, **pevent, *bp;
421 long err; 492 long err;
422 int cpu; 493 int cpu;
423 494
424 cpu_events = alloc_percpu(typeof(*cpu_events)); 495 cpu_events = alloc_percpu(typeof(*cpu_events));
425 if (!cpu_events) 496 if (!cpu_events)
426 return ERR_PTR(-ENOMEM); 497 return (void __percpu __force *)ERR_PTR(-ENOMEM);
427 498
428 get_online_cpus(); 499 get_online_cpus();
429 for_each_online_cpu(cpu) { 500 for_each_online_cpu(cpu) {
@@ -451,7 +522,7 @@ fail:
451 put_online_cpus(); 522 put_online_cpus();
452 523
453 free_percpu(cpu_events); 524 free_percpu(cpu_events);
454 return ERR_PTR(err); 525 return (void __percpu __force *)ERR_PTR(err);
455} 526}
456EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); 527EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
457 528
@@ -459,7 +530,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
459 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel 530 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
460 * @cpu_events: the per cpu set of events to unregister 531 * @cpu_events: the per cpu set of events to unregister
461 */ 532 */
462void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) 533void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
463{ 534{
464 int cpu; 535 int cpu;
465 struct perf_event **pevent; 536 struct perf_event **pevent;
@@ -480,7 +551,36 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
480 551
481static int __init init_hw_breakpoint(void) 552static int __init init_hw_breakpoint(void)
482{ 553{
554 unsigned int **task_bp_pinned;
555 int cpu, err_cpu;
556 int i;
557
558 for (i = 0; i < TYPE_MAX; i++)
559 nr_slots[i] = hw_breakpoint_slots(i);
560
561 for_each_possible_cpu(cpu) {
562 for (i = 0; i < TYPE_MAX; i++) {
563 task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu);
564 *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i],
565 GFP_KERNEL);
566 if (!*task_bp_pinned)
567 goto err_alloc;
568 }
569 }
570
571 constraints_initialized = 1;
572
483 return register_die_notifier(&hw_breakpoint_exceptions_nb); 573 return register_die_notifier(&hw_breakpoint_exceptions_nb);
574
575 err_alloc:
576 for_each_possible_cpu(err_cpu) {
577 if (err_cpu == cpu)
578 break;
579 for (i = 0; i < TYPE_MAX; i++)
580 kfree(per_cpu(nr_task_bp_pinned[i], cpu));
581 }
582
583 return -ENOMEM;
484} 584}
485core_initcall(init_hw_breakpoint); 585core_initcall(init_hw_breakpoint);
486 586
@@ -489,5 +589,4 @@ struct pmu perf_ops_bp = {
489 .enable = arch_install_hw_breakpoint, 589 .enable = arch_install_hw_breakpoint,
490 .disable = arch_uninstall_hw_breakpoint, 590 .disable = arch_uninstall_hw_breakpoint,
491 .read = hw_breakpoint_pmu_read, 591 .read = hw_breakpoint_pmu_read,
492 .unthrottle = hw_breakpoint_pmu_unthrottle
493}; 592};
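
The tsk_pinned[] arrays that hw_breakpoint.c now allocates per type are histograms: tsk_pinned[i] counts how many tasks currently hold i + 1 breakpoint slots on that cpu, and toggle_bp_task_slot() moves a task between buckets by the weight of the breakpoint being added or removed. A small userspace model of that bookkeeping (fixed slot count, purely illustrative):

#include <stdio.h>

#define NR_SLOTS 4	/* stand-in for nr_slots[type] */

/* tsk_pinned[i]: how many tasks hold i + 1 slots on this cpu. */
static int tsk_pinned[NR_SLOTS];

static int max_task_bp_pinned(void)
{
	for (int i = NR_SLOTS - 1; i >= 0; i--)
		if (tsk_pinned[i] > 0)
			return i + 1;
	return 0;
}

/* Move one task from old_count slots to old_count + weight slots,
 * the same bucket shuffle as toggle_bp_task_slot() above. */
static void toggle_slot(int old_count, int weight, int enable)
{
	int idx = old_count - 1 + weight;

	if (enable) {
		tsk_pinned[idx]++;
		if (old_count > 0)
			tsk_pinned[old_count - 1]--;
	} else {
		tsk_pinned[idx]--;
		if (old_count > 0)
			tsk_pinned[old_count - 1]++;
	}
}

int main(void)
{
	toggle_slot(0, 1, 1);	/* a task pins its first slot */
	toggle_slot(1, 2, 1);	/* the same task adds a weight-2 breakpoint */
	printf("max pinned by one task: %d\n", max_task_bp_pinned()); /* 3 */
	return 0;
}
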
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ecc3fa28f666..b7091d5ca2f8 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,11 +18,7 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22 * dynamic_irq_init - initialize a dynamically allocated irq
23 * @irq: irq number to initialize
24 */
25void dynamic_irq_init(unsigned int irq)
26{ 22{
27 struct irq_desc *desc; 23 struct irq_desc *desc;
28 unsigned long flags; 24 unsigned long flags;
@@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq)
41 desc->depth = 1; 37 desc->depth = 1;
42 desc->msi_desc = NULL; 38 desc->msi_desc = NULL;
43 desc->handler_data = NULL; 39 desc->handler_data = NULL;
44 desc->chip_data = NULL; 40 if (!keep_chip_data)
41 desc->chip_data = NULL;
45 desc->action = NULL; 42 desc->action = NULL;
46 desc->irq_count = 0; 43 desc->irq_count = 0;
47 desc->irqs_unhandled = 0; 44 desc->irqs_unhandled = 0;
@@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq)
55} 52}
56 53
57/** 54/**
58 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 55 * dynamic_irq_init - initialize a dynamically allocated irq
59 * @irq: irq number to initialize 56 * @irq: irq number to initialize
60 */ 57 */
61void dynamic_irq_cleanup(unsigned int irq) 58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
62{ 75{
63 struct irq_desc *desc = irq_to_desc(irq); 76 struct irq_desc *desc = irq_to_desc(irq);
64 unsigned long flags; 77 unsigned long flags;
@@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq)
77 } 90 }
78 desc->msi_desc = NULL; 91 desc->msi_desc = NULL;
79 desc->handler_data = NULL; 92 desc->handler_data = NULL;
80 desc->chip_data = NULL; 93 if (!keep_chip_data)
94 desc->chip_data = NULL;
81 desc->handle_irq = handle_bad_irq; 95 desc->handle_irq = handle_bad_irq;
82 desc->chip = &no_irq_chip; 96 desc->chip = &no_irq_chip;
83 desc->name = NULL; 97 desc->name = NULL;
@@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq)
85 raw_spin_unlock_irqrestore(&desc->lock, flags); 99 raw_spin_unlock_irqrestore(&desc->lock, flags);
86} 100}
87 101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to clean up
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to clean up
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
120}
121
88 122
89/** 123/**
90 * set_irq_chip - set the irq chip for an irq 124 * set_irq_chip - set the irq chip for an irq
@@ -325,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
325 if (desc->chip->ack) 359 if (desc->chip->ack)
326 desc->chip->ack(irq); 360 desc->chip->ack(irq);
327 } 361 }
362 desc->status |= IRQ_MASKED;
363}
364
365static inline void mask_irq(struct irq_desc *desc, int irq)
366{
367 if (desc->chip->mask) {
368 desc->chip->mask(irq);
369 desc->status |= IRQ_MASKED;
370 }
371}
372
373static inline void unmask_irq(struct irq_desc *desc, int irq)
374{
375 if (desc->chip->unmask) {
376 desc->chip->unmask(irq);
377 desc->status &= ~IRQ_MASKED;
378 }
328} 379}
329 380
330/* 381/*
@@ -450,10 +501,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
450 raw_spin_lock(&desc->lock); 501 raw_spin_lock(&desc->lock);
451 desc->status &= ~IRQ_INPROGRESS; 502 desc->status &= ~IRQ_INPROGRESS;
452 503
453 if (unlikely(desc->status & IRQ_ONESHOT)) 504 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
454 desc->status |= IRQ_MASKED; 505 unmask_irq(desc, irq);
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
456 desc->chip->unmask(irq);
457out_unlock: 506out_unlock:
458 raw_spin_unlock(&desc->lock); 507 raw_spin_unlock(&desc->lock);
459} 508}
@@ -490,8 +539,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
490 action = desc->action; 539 action = desc->action;
491 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 540 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
492 desc->status |= IRQ_PENDING; 541 desc->status |= IRQ_PENDING;
493 if (desc->chip->mask) 542 mask_irq(desc, irq);
494 desc->chip->mask(irq);
495 goto out; 543 goto out;
496 } 544 }
497 545
@@ -520,7 +568,7 @@ out:
 520 * signal. The occurrence is latched into the irq controller hardware 568 * signal. The occurrence is latched into the irq controller hardware
521 * and must be acked in order to be reenabled. After the ack another 569 * and must be acked in order to be reenabled. After the ack another
522 * interrupt can happen on the same source even before the first one 570 * interrupt can happen on the same source even before the first one
523 * is handled by the assosiacted event handler. If this happens it 571 * is handled by the associated event handler. If this happens it
524 * might be necessary to disable (mask) the interrupt depending on the 572 * might be necessary to disable (mask) the interrupt depending on the
525 * controller hardware. This requires to reenable the interrupt inside 573 * controller hardware. This requires to reenable the interrupt inside
526 * of the loop which handles the interrupts which have arrived while 574 * of the loop which handles the interrupts which have arrived while
@@ -559,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
559 irqreturn_t action_ret; 607 irqreturn_t action_ret;
560 608
561 if (unlikely(!action)) { 609 if (unlikely(!action)) {
562 desc->chip->mask(irq); 610 mask_irq(desc, irq);
563 goto out_unlock; 611 goto out_unlock;
564 } 612 }
565 613
@@ -571,8 +619,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
571 if (unlikely((desc->status & 619 if (unlikely((desc->status &
572 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 620 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
573 (IRQ_PENDING | IRQ_MASKED))) { 621 (IRQ_PENDING | IRQ_MASKED))) {
574 desc->chip->unmask(irq); 622 unmask_irq(desc, irq);
575 desc->status &= ~IRQ_MASKED;
576 } 623 }
577 624
578 desc->status &= ~IRQ_PENDING; 625 desc->status &= ~IRQ_PENDING;
@@ -682,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
682 __set_irq_handler(irq, handle, 0, name); 729 __set_irq_handler(irq, handle, 0, name);
683} 730}
684 731
685void __init set_irq_noprobe(unsigned int irq) 732void set_irq_noprobe(unsigned int irq)
686{ 733{
687 struct irq_desc *desc = irq_to_desc(irq); 734 struct irq_desc *desc = irq_to_desc(irq);
688 unsigned long flags; 735 unsigned long flags;
@@ -697,7 +744,7 @@ void __init set_irq_noprobe(unsigned int irq)
697 raw_spin_unlock_irqrestore(&desc->lock, flags); 744 raw_spin_unlock_irqrestore(&desc->lock, flags);
698} 745}
699 746
700void __init set_irq_probe(unsigned int irq) 747void set_irq_probe(unsigned int irq)
701{ 748{
702 struct irq_desc *desc = irq_to_desc(irq); 749 struct irq_desc *desc = irq_to_desc(irq);
703 unsigned long flags; 750 unsigned long flags;
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index d06df9c41cba..1ef4ffcdfa55 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
42 * automatically freed on driver detach. 42 * automatically freed on driver detach.
43 * 43 *
44 * If an IRQ allocated with this function needs to be freed 44 * If an IRQ allocated with this function needs to be freed
45 * separately, dev_free_irq() must be used. 45 * separately, devm_free_irq() must be used.
46 */ 46 */
47int devm_request_threaded_irq(struct device *dev, unsigned int irq, 47int devm_request_threaded_irq(struct device *dev, unsigned int irq,
48 irq_handler_t handler, irq_handler_t thread_fn, 48 irq_handler_t handler, irq_handler_t thread_fn,
@@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
81 * Except for the extra @dev argument, this function takes the 81 * Except for the extra @dev argument, this function takes the
82 * same arguments and performs the same function as free_irq(). 82 * same arguments and performs the same function as free_irq().
83 * This function instead of free_irq() should be used to manually 83 * This function instead of free_irq() should be used to manually
84 * free IRQs allocated with dev_request_irq(). 84 * free IRQs allocated with devm_request_irq().
85 */ 85 */
86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) 86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
87{ 87{
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 814940e7f485..76d5a671bfe1 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -19,7 +19,7 @@
19#include <linux/kernel_stat.h> 19#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 20#include <linux/rculist.h>
21#include <linux/hash.h> 21#include <linux/hash.h>
22#include <linux/bootmem.h> 22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 23#include <trace/events/irq.h>
24 24
25#include "internals.h" 25#include "internals.h"
@@ -87,12 +87,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{ 87{
88 void *ptr; 88 void *ptr;
89 89
90 if (slab_is_available()) 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), 91 GFP_ATOMIC, node);
92 GFP_ATOMIC, node);
93 else
94 ptr = alloc_bootmem_node(NODE_DATA(node),
95 nr * sizeof(*desc->kstat_irqs));
96 92
97 /* 93 /*
 98 * don't overwrite if we cannot get a new one 94
@@ -132,7 +128,26 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
132 */ 128 */
133DEFINE_RAW_SPINLOCK(sparse_irq_lock); 129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
134 130
135struct irq_desc **irq_desc_ptrs __read_mostly; 131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
136 151
137static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { 152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
138 [0 ... NR_IRQS_LEGACY-1] = { 153 [0 ... NR_IRQS_LEGACY-1] = {
@@ -164,9 +179,6 @@ int __init early_irq_init(void)
164 legacy_count = ARRAY_SIZE(irq_desc_legacy); 179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
165 node = first_online_node; 180 node = first_online_node;
166 181
167 /* allocate irq_desc_ptrs array based on nr_irqs */
168 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
169
170 /* allocate based on nr_cpu_ids */ 182 /* allocate based on nr_cpu_ids */
171 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * 183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
172 sizeof(int), GFP_NOWAIT, node); 184 sizeof(int), GFP_NOWAIT, node);
@@ -180,23 +192,12 @@ int __init early_irq_init(void)
180 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
181 alloc_desc_masks(&desc[i], node, true); 193 alloc_desc_masks(&desc[i], node, true);
182 init_desc_masks(&desc[i]); 194 init_desc_masks(&desc[i]);
183 irq_desc_ptrs[i] = desc + i; 195 set_irq_desc(i, &desc[i]);
184 } 196 }
185 197
186 for (i = legacy_count; i < nr_irqs; i++)
187 irq_desc_ptrs[i] = NULL;
188
189 return arch_early_irq_init(); 198 return arch_early_irq_init();
190} 199}
191 200
192struct irq_desc *irq_to_desc(unsigned int irq)
193{
194 if (irq_desc_ptrs && irq < nr_irqs)
195 return irq_desc_ptrs[irq];
196
197 return NULL;
198}
199
200struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) 201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
201{ 202{
202 struct irq_desc *desc; 203 struct irq_desc *desc;
@@ -208,21 +209,18 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
208 return NULL; 209 return NULL;
209 } 210 }
210 211
211 desc = irq_desc_ptrs[irq]; 212 desc = irq_to_desc(irq);
212 if (desc) 213 if (desc)
213 return desc; 214 return desc;
214 215
215 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
216 217
217 /* We have to check it to avoid races with another CPU */ 218 /* We have to check it to avoid races with another CPU */
218 desc = irq_desc_ptrs[irq]; 219 desc = irq_to_desc(irq);
219 if (desc) 220 if (desc)
220 goto out_unlock; 221 goto out_unlock;
221 222
222 if (slab_is_available()) 223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224 else
225 desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
226 224
227 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); 225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
228 if (!desc) { 226 if (!desc) {
@@ -231,7 +229,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
231 } 229 }
232 init_one_irq_desc(irq, desc, node); 230 init_one_irq_desc(irq, desc, node);
233 231
234 irq_desc_ptrs[irq] = desc; 232 set_irq_desc(irq, desc);
235 233
236out_unlock: 234out_unlock:
237 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
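
The handle.c change replaces the flat irq_desc_ptrs[] array with a radix tree, so sparse interrupt numbers no longer cost a pointer slot each; the kernel calls involved are radix_tree_insert(), radix_tree_lookup() and radix_tree_replace_slot(). A rough userspace stand-in using a tiny chained hash map shows the same insert/lookup/replace surface (the map itself is illustrative, not the kernel's radix tree):

#include <stdio.h>
#include <stdlib.h>

#define BUCKETS 16

struct node {
	unsigned int key;
	void *val;
	struct node *next;
};

static struct node *buckets[BUCKETS];

static void map_insert(unsigned int key, void *val)
{
	struct node *n = malloc(sizeof(*n));

	if (!n)
		abort();
	n->key = key;
	n->val = val;
	n->next = buckets[key % BUCKETS];
	buckets[key % BUCKETS] = n;
}

static void *map_lookup(unsigned int key)
{
	for (struct node *n = buckets[key % BUCKETS]; n; n = n->next)
		if (n->key == key)
			return n->val;
	return NULL;
}

/* Analogue of replace_irq_desc(): swap the stored pointer in place. */
static void map_replace(unsigned int key, void *val)
{
	for (struct node *n = buckets[key % BUCKETS]; n; n = n->next)
		if (n->key == key) {
			n->val = val;
			return;
		}
}

int main(void)
{
	int a = 1, b = 2;

	map_insert(4096, &a);	/* sparse key: no 4096-entry array needed */
	printf("%d\n", *(int *)map_lookup(4096));
	map_replace(4096, &b);	/* e.g. after a NUMA migration */
	printf("%d\n", *(int *)map_lookup(4096));
	return 0;
}
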
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b2821f070a3d..c63f3bc88f0b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -21,11 +21,7 @@ extern void clear_kstat_irqs(struct irq_desc *desc);
21extern raw_spinlock_t sparse_irq_lock; 21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */ 24void replace_irq_desc(unsigned int irq, struct irq_desc *desc);
25extern struct irq_desc **irq_desc_ptrs;
26#else
27/* irq_desc_ptrs is a fixed size array */
28extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
29#endif 25#endif
30 26
31#ifdef CONFIG_PROC_FS 27#ifdef CONFIG_PROC_FS
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index eb6078ca60c7..704e488730a5 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -382,6 +382,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
382{ 382{
383 struct irq_desc *desc = irq_to_desc(irq); 383 struct irq_desc *desc = irq_to_desc(irq);
384 struct irqaction *action; 384 struct irqaction *action;
385 unsigned long flags;
385 386
386 if (!desc) 387 if (!desc)
387 return 0; 388 return 0;
@@ -389,11 +390,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
389 if (desc->status & IRQ_NOREQUEST) 390 if (desc->status & IRQ_NOREQUEST)
390 return 0; 391 return 0;
391 392
393 raw_spin_lock_irqsave(&desc->lock, flags);
392 action = desc->action; 394 action = desc->action;
393 if (action) 395 if (action)
394 if (irqflags & action->flags & IRQF_SHARED) 396 if (irqflags & action->flags & IRQF_SHARED)
395 action = NULL; 397 action = NULL;
396 398
399 raw_spin_unlock_irqrestore(&desc->lock, flags);
400
397 return !action; 401 return !action;
398} 402}
399 403
@@ -483,8 +487,26 @@ static int irq_wait_for_interrupt(struct irqaction *action)
483 */ 487 */
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 488static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{ 489{
490again:
486 chip_bus_lock(irq, desc); 491 chip_bus_lock(irq, desc);
487 raw_spin_lock_irq(&desc->lock); 492 raw_spin_lock_irq(&desc->lock);
493
494 /*
 495	 * Implausible though it may be, we need to protect ourselves
 496	 * against the following scenario:
 497	 *
 498	 * The irq thread finishes before the hard interrupt handler
 499	 * on the other CPU. If we unmask the irq line now, the
 500	 * interrupt can come in again, mask the line, and then bail out
 501	 * due to IRQ_INPROGRESS, leaving the irq line masked forever.
502 */
503 if (unlikely(desc->status & IRQ_INPROGRESS)) {
504 raw_spin_unlock_irq(&desc->lock);
505 chip_bus_sync_unlock(irq, desc);
506 cpu_relax();
507 goto again;
508 }
509
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 510 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED; 511 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq); 512 desc->chip->unmask(irq);
@@ -735,6 +757,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
735 if (new->flags & IRQF_ONESHOT) 757 if (new->flags & IRQF_ONESHOT)
736 desc->status |= IRQ_ONESHOT; 758 desc->status |= IRQ_ONESHOT;
737 759
760 /*
761 * Force MSI interrupts to run with interrupts
762 * disabled. The multi vector cards can cause stack
763 * overflows due to nested interrupts when enough of
764 * them are directed to a core and fire at the same
765 * time.
766 */
767 if (desc->msi_desc)
768 new->flags |= IRQF_DISABLED;
769
738 if (!(desc->status & IRQ_NOAUTOEN)) { 770 if (!(desc->status & IRQ_NOAUTOEN)) {
739 desc->depth = 0; 771 desc->depth = 0;
740 desc->status &= ~IRQ_DISABLED; 772 desc->status &= ~IRQ_DISABLED;
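
The irq_finalize_oneshot() loop added above resolves the race by dropping both locks, relaxing the cpu, and retrying until the hard handler on the other CPU has cleared IRQ_INPROGRESS; only then is unmasking safe. A compact userspace sketch of that drop-and-retry control flow (single-threaded, with C11 atomics standing in for the irq status word):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool in_progress;		/* IRQ_INPROGRESS stand-in */
static atomic_bool masked;		/* IRQ_MASKED stand-in */

static void finalize_oneshot(void)
{
again:
	/* locks would be taken here */
	if (atomic_load(&in_progress)) {
		/* drop the locks, give the other CPU time (the
		 * cpu_relax() step), then start over from the top */
		goto again;
	}
	if (atomic_load(&masked)) {
		atomic_store(&masked, false);	/* the unmask_irq() step */
		puts("irq line unmasked");
	}
	/* locks released here */
}

int main(void)
{
	atomic_store(&masked, true);
	atomic_store(&in_progress, false);	/* hard handler already done */
	finalize_oneshot();
	return 0;
}
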
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 26bac9d8f860..65d3845665ac 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -6,6 +6,7 @@
6 */ 6 */
7 7
8#include <linux/irq.h> 8#include <linux/irq.h>
9#include <linux/slab.h>
9#include <linux/module.h> 10#include <linux/module.h>
10#include <linux/random.h> 11#include <linux/random.h>
11#include <linux/interrupt.h> 12#include <linux/interrupt.h>
@@ -70,7 +71,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
70 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 71 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
71 72
72 /* We have to check it to avoid races with another CPU */ 73 /* We have to check it to avoid races with another CPU */
73 desc = irq_desc_ptrs[irq]; 74 desc = irq_to_desc(irq);
74 75
75 if (desc && old_desc != desc) 76 if (desc && old_desc != desc)
76 goto out_unlock; 77 goto out_unlock;
@@ -90,7 +91,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
90 goto out_unlock; 91 goto out_unlock;
91 } 92 }
92 93
93 irq_desc_ptrs[irq] = desc; 94 replace_irq_desc(irq, desc);
94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 95 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
95 96
96 /* free the old one */ 97 /* free the old one */
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6f50eccc79c0..7a6eb04ef6b5 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/gfp.h>
10#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
11#include <linux/seq_file.h> 12#include <linux/seq_file.h>
12#include <linux/interrupt.h> 13#include <linux/interrupt.h>
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8e5288a8a355..13aff293f4de 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -21,6 +21,7 @@
21#include <linux/sched.h> /* for cond_resched */ 21#include <linux/sched.h> /* for cond_resched */
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/ctype.h> 23#include <linux/ctype.h>
24#include <linux/slab.h>
24 25
25#include <asm/sections.h> 26#include <asm/sections.h>
26 27
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ef077fb73155..474a84715eac 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -41,7 +41,7 @@
41#include <asm/sections.h> 41#include <asm/sections.h>
42 42
43/* Per cpu memory for storing cpu states in case of system crash. */ 43/* Per cpu memory for storing cpu states in case of system crash. */
44note_buf_t* crash_notes; 44note_buf_t __percpu *crash_notes;
45 45
46/* vmcoreinfo stuff */ 46/* vmcoreinfo stuff */
47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
@@ -1134,11 +1134,9 @@ int crash_shrink_memory(unsigned long new_size)
1134 1134
1135 free_reserved_phys_range(end, crashk_res.end); 1135 free_reserved_phys_range(end, crashk_res.end);
1136 1136
1137 if (start == end) { 1137 if (start == end)
1138 crashk_res.end = end;
1139 release_resource(&crashk_res); 1138 release_resource(&crashk_res);
1140 } else 1139 crashk_res.end = end - 1;
1141 crashk_res.end = end - 1;
1142 1140
1143unlock: 1141unlock:
1144 mutex_unlock(&kexec_mutex); 1142 mutex_unlock(&kexec_mutex);
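
The crash_shrink_memory() fix above hinges on struct resource ranges being inclusive: a region of size bytes starting at start ends at start + size - 1, and when the region shrinks to nothing the resource is released before the end is adjusted. A tiny worked check of the inclusive-end arithmetic (illustrative types, not the kernel's resource API):

#include <stdint.h>
#include <stdio.h>

struct resource { uint64_t start, end; };	/* end is inclusive */

static void shrink(struct resource *res, uint64_t new_size)
{
	uint64_t end = res->start + new_size;

	if (res->start == end)
		puts("shrunk to zero: release the resource");
	res->end = end - 1;	/* inclusive end, as in the fixed code */
}

int main(void)
{
	struct resource crashk = { .start = 0x1000000, .end = 0x1ffffff };

	shrink(&crashk, 0x800000);
	printf("new end: %#llx\n", (unsigned long long)crashk.end);
	/* prints 0x17fffff: the last byte of an 8 MiB region at 16 MiB */
	return 0;
}
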
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 761fdd2b3034..11f3515ca83f 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -69,9 +69,16 @@ struct kgdb_state {
69 struct pt_regs *linux_regs; 69 struct pt_regs *linux_regs;
70}; 70};
71 71
72/* Exception state values */
73#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
74#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
 75#define DCPU_IS_SLAVE 0x4 /* Slave cpu entered the exception */
76#define DCPU_SSTEP 0x8 /* CPU is single stepping */
77
72static struct debuggerinfo_struct { 78static struct debuggerinfo_struct {
73 void *debuggerinfo; 79 void *debuggerinfo;
74 struct task_struct *task; 80 struct task_struct *task;
81 int exception_state;
75} kgdb_info[NR_CPUS]; 82} kgdb_info[NR_CPUS];
76 83
77/** 84/**
@@ -391,27 +398,22 @@ int kgdb_mem2hex(char *mem, char *buf, int count)
391 398
392/* 399/*
393 * Copy the binary array pointed to by buf into mem. Fix $, #, and 400 * Copy the binary array pointed to by buf into mem. Fix $, #, and
394 * 0x7d escaped with 0x7d. Return a pointer to the character after 401 * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
 395 * the last byte written. 402 * The input buf is overwritten with the result to write to mem.
396 */ 403 */
397static int kgdb_ebin2mem(char *buf, char *mem, int count) 404static int kgdb_ebin2mem(char *buf, char *mem, int count)
398{ 405{
399 int err = 0; 406 int size = 0;
400 char c; 407 char *c = buf;
401 408
402 while (count-- > 0) { 409 while (count-- > 0) {
403 c = *buf++; 410 c[size] = *buf++;
404 if (c == 0x7d) 411 if (c[size] == 0x7d)
405 c = *buf++ ^ 0x20; 412 c[size] = *buf++ ^ 0x20;
406 413 size++;
407 err = probe_kernel_write(mem, &c, 1);
408 if (err)
409 break;
410
411 mem++;
412 } 414 }
413 415
414 return err; 416 return probe_kernel_write(mem, c, size);
415} 417}
416 418
417/* 419/*
@@ -563,49 +565,6 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
563} 565}
564 566
565/* 567/*
566 * CPU debug state control:
567 */
568
569#ifdef CONFIG_SMP
570static void kgdb_wait(struct pt_regs *regs)
571{
572 unsigned long flags;
573 int cpu;
574
575 local_irq_save(flags);
576 cpu = raw_smp_processor_id();
577 kgdb_info[cpu].debuggerinfo = regs;
578 kgdb_info[cpu].task = current;
579 /*
580 * Make sure the above info reaches the primary CPU before
581 * our cpu_in_kgdb[] flag setting does:
582 */
583 smp_wmb();
584 atomic_set(&cpu_in_kgdb[cpu], 1);
585
586 /* Disable any cpu specific hw breakpoints */
587 kgdb_disable_hw_debug(regs);
588
589 /* Wait till primary CPU is done with debugging */
590 while (atomic_read(&passive_cpu_wait[cpu]))
591 cpu_relax();
592
593 kgdb_info[cpu].debuggerinfo = NULL;
594 kgdb_info[cpu].task = NULL;
595
596 /* fix up hardware debug registers on local cpu */
597 if (arch_kgdb_ops.correct_hw_break)
598 arch_kgdb_ops.correct_hw_break();
599
600 /* Signal the primary CPU that we are done: */
601 atomic_set(&cpu_in_kgdb[cpu], 0);
602 touch_softlockup_watchdog_sync();
603 clocksource_touch_watchdog();
604 local_irq_restore(flags);
605}
606#endif
607
608/*
609 * Some architectures need cache flushes when we set/clear a 568 * Some architectures need cache flushes when we set/clear a
610 * breakpoint: 569 * breakpoint:
611 */ 570 */
@@ -1400,34 +1359,13 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
1400 return 1; 1359 return 1;
1401} 1360}
1402 1361
1403/* 1362static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
1404 * kgdb_handle_exception() - main entry point from a kernel exception
1405 *
1406 * Locking hierarchy:
1407 * interface locks, if any (begin_session)
1408 * kgdb lock (kgdb_active)
1409 */
1410int
1411kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1412{ 1363{
1413 struct kgdb_state kgdb_var;
1414 struct kgdb_state *ks = &kgdb_var;
1415 unsigned long flags; 1364 unsigned long flags;
1416 int sstep_tries = 100; 1365 int sstep_tries = 100;
1417 int error = 0; 1366 int error = 0;
1418 int i, cpu; 1367 int i, cpu;
1419 1368 int trace_on = 0;
1420 ks->cpu = raw_smp_processor_id();
1421 ks->ex_vector = evector;
1422 ks->signo = signo;
1423 ks->ex_vector = evector;
1424 ks->err_code = ecode;
1425 ks->kgdb_usethreadid = 0;
1426 ks->linux_regs = regs;
1427
1428 if (kgdb_reenter_check(ks))
1429 return 0; /* Ouch, double exception ! */
1430
1431acquirelock: 1369acquirelock:
1432 /* 1370 /*
1433 * Interrupts will be restored by the 'trap return' code, except when 1371 * Interrupts will be restored by the 'trap return' code, except when
@@ -1435,13 +1373,43 @@ acquirelock:
1435 */ 1373 */
1436 local_irq_save(flags); 1374 local_irq_save(flags);
1437 1375
1438 cpu = raw_smp_processor_id(); 1376 cpu = ks->cpu;
1377 kgdb_info[cpu].debuggerinfo = regs;
1378 kgdb_info[cpu].task = current;
1379 /*
1380 * Make sure the above info reaches the primary CPU before
1381 * our cpu_in_kgdb[] flag setting does:
1382 */
1383 atomic_inc(&cpu_in_kgdb[cpu]);
1439 1384
1440 /* 1385 /*
1441 * Acquire the kgdb_active lock: 1386 * CPU will loop if it is a slave or request to become a kgdb
1387 * master cpu and acquire the kgdb_active lock:
1442 */ 1388 */
1443 while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1) 1389 while (1) {
1390 if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
1391 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
1392 break;
1393 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
1394 if (!atomic_read(&passive_cpu_wait[cpu]))
1395 goto return_normal;
1396 } else {
1397return_normal:
1398 /* Return to normal operation by executing any
1399 * hw breakpoint fixup.
1400 */
1401 if (arch_kgdb_ops.correct_hw_break)
1402 arch_kgdb_ops.correct_hw_break();
1403 if (trace_on)
1404 tracing_on();
1405 atomic_dec(&cpu_in_kgdb[cpu]);
1406 touch_softlockup_watchdog_sync();
1407 clocksource_touch_watchdog();
1408 local_irq_restore(flags);
1409 return 0;
1410 }
1444 cpu_relax(); 1411 cpu_relax();
1412 }
1445 1413
1446 /* 1414 /*
1447 * For single stepping, try to only enter on the processor 1415 * For single stepping, try to only enter on the processor
@@ -1475,9 +1443,6 @@ acquirelock:
1475 if (kgdb_io_ops->pre_exception) 1443 if (kgdb_io_ops->pre_exception)
1476 kgdb_io_ops->pre_exception(); 1444 kgdb_io_ops->pre_exception();
1477 1445
1478 kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs;
1479 kgdb_info[ks->cpu].task = current;
1480
1481 kgdb_disable_hw_debug(ks->linux_regs); 1446 kgdb_disable_hw_debug(ks->linux_regs);
1482 1447
1483 /* 1448 /*
@@ -1486,15 +1451,9 @@ acquirelock:
1486 */ 1451 */
1487 if (!kgdb_single_step) { 1452 if (!kgdb_single_step) {
1488 for (i = 0; i < NR_CPUS; i++) 1453 for (i = 0; i < NR_CPUS; i++)
1489 atomic_set(&passive_cpu_wait[i], 1); 1454 atomic_inc(&passive_cpu_wait[i]);
1490 } 1455 }
1491 1456
1492 /*
1493 * spin_lock code is good enough as a barrier so we don't
1494 * need one here:
1495 */
1496 atomic_set(&cpu_in_kgdb[ks->cpu], 1);
1497
1498#ifdef CONFIG_SMP 1457#ifdef CONFIG_SMP
1499 /* Signal the other CPUs to enter kgdb_wait() */ 1458 /* Signal the other CPUs to enter kgdb_wait() */
1500 if ((!kgdb_single_step) && kgdb_do_roundup) 1459 if ((!kgdb_single_step) && kgdb_do_roundup)
@@ -1518,6 +1477,9 @@ acquirelock:
1518 kgdb_single_step = 0; 1477 kgdb_single_step = 0;
1519 kgdb_contthread = current; 1478 kgdb_contthread = current;
1520 exception_level = 0; 1479 exception_level = 0;
1480 trace_on = tracing_is_on();
1481 if (trace_on)
1482 tracing_off();
1521 1483
1522 /* Talk to debugger with gdbserial protocol */ 1484 /* Talk to debugger with gdbserial protocol */
1523 error = gdb_serial_stub(ks); 1485 error = gdb_serial_stub(ks);
@@ -1526,13 +1488,11 @@ acquirelock:
1526 if (kgdb_io_ops->post_exception) 1488 if (kgdb_io_ops->post_exception)
1527 kgdb_io_ops->post_exception(); 1489 kgdb_io_ops->post_exception();
1528 1490
1529 kgdb_info[ks->cpu].debuggerinfo = NULL; 1491 atomic_dec(&cpu_in_kgdb[ks->cpu]);
1530 kgdb_info[ks->cpu].task = NULL;
1531 atomic_set(&cpu_in_kgdb[ks->cpu], 0);
1532 1492
1533 if (!kgdb_single_step) { 1493 if (!kgdb_single_step) {
1534 for (i = NR_CPUS-1; i >= 0; i--) 1494 for (i = NR_CPUS-1; i >= 0; i--)
1535 atomic_set(&passive_cpu_wait[i], 0); 1495 atomic_dec(&passive_cpu_wait[i]);
1536 /* 1496 /*
1537 * Wait till all the CPUs have quit 1497 * Wait till all the CPUs have quit
1538 * from the debugger. 1498 * from the debugger.
@@ -1551,6 +1511,8 @@ kgdb_restore:
1551 else 1511 else
1552 kgdb_sstep_pid = 0; 1512 kgdb_sstep_pid = 0;
1553 } 1513 }
1514 if (trace_on)
1515 tracing_on();
1554 /* Free kgdb_active */ 1516 /* Free kgdb_active */
1555 atomic_set(&kgdb_active, -1); 1517 atomic_set(&kgdb_active, -1);
1556 touch_softlockup_watchdog_sync(); 1518 touch_softlockup_watchdog_sync();
@@ -1560,13 +1522,52 @@ kgdb_restore:
1560 return error; 1522 return error;
1561} 1523}
1562 1524
1525/*
1526 * kgdb_handle_exception() - main entry point from a kernel exception
1527 *
1528 * Locking hierarchy:
1529 * interface locks, if any (begin_session)
1530 * kgdb lock (kgdb_active)
1531 */
1532int
1533kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1534{
1535 struct kgdb_state kgdb_var;
1536 struct kgdb_state *ks = &kgdb_var;
1537 int ret;
1538
1539 ks->cpu = raw_smp_processor_id();
1540 ks->ex_vector = evector;
1541 ks->signo = signo;
1542 ks->ex_vector = evector;
1543 ks->err_code = ecode;
1544 ks->kgdb_usethreadid = 0;
1545 ks->linux_regs = regs;
1546
1547 if (kgdb_reenter_check(ks))
1548 return 0; /* Ouch, double exception ! */
1549 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
1550 ret = kgdb_cpu_enter(ks, regs);
1551 kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER;
1552 return ret;
1553}
1554
1563int kgdb_nmicallback(int cpu, void *regs) 1555int kgdb_nmicallback(int cpu, void *regs)
1564{ 1556{
1565#ifdef CONFIG_SMP 1557#ifdef CONFIG_SMP
1558 struct kgdb_state kgdb_var;
1559 struct kgdb_state *ks = &kgdb_var;
1560
1561 memset(ks, 0, sizeof(struct kgdb_state));
1562 ks->cpu = cpu;
1563 ks->linux_regs = regs;
1564
1566 if (!atomic_read(&cpu_in_kgdb[cpu]) && 1565 if (!atomic_read(&cpu_in_kgdb[cpu]) &&
1567 atomic_read(&kgdb_active) != cpu && 1566 atomic_read(&kgdb_active) != -1 &&
1568 atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) { 1567 atomic_read(&kgdb_active) != cpu) {
1569 kgdb_wait((struct pt_regs *)regs); 1568 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
1569 kgdb_cpu_enter(ks, regs);
1570 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
1570 return 0; 1571 return 0;
1571 } 1572 }
1572#endif 1573#endif
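With kgdb_wait() gone, both roles funnel through kgdb_cpu_enter(): a CPU that took the exception sets DCPU_WANT_MASTER and races for kgdb_active, while a CPU herded in via NMI sets DCPU_IS_SLAVE and parks until passive_cpu_wait[cpu] drops. A compressed userspace model of that rendezvous, with C11 atomics and pthreads standing in for per-CPU exception context (the two-thread setup and all names are illustrative):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    #define NCPUS          2
    #define WANT_MASTER    0x1
    #define IS_SLAVE       0x2

    static atomic_int active = -1;             /* id of the master, or -1 */
    static atomic_int passive_wait[NCPUS];     /* nonzero: slaves stay parked */
    static int exception_state[NCPUS];

    /* Single entry point for both roles, like the new kgdb_cpu_enter(). */
    static void cpu_enter(int cpu)
    {
            for (;;) {
                    if (exception_state[cpu] & WANT_MASTER) {
                            int expect = -1;
                            if (atomic_compare_exchange_strong(&active,
                                                               &expect, cpu))
                                    break;             /* became master */
                    } else if (exception_state[cpu] & IS_SLAVE) {
                            if (!atomic_load(&passive_wait[cpu]))
                                    return;            /* released */
                    }
            }
            for (int i = 0; i < NCPUS; i++)            /* park everyone */
                    atomic_fetch_add(&passive_wait[i], 1);
            printf("cpu%d: master, talking to the debugger\n", cpu);
            for (int i = 0; i < NCPUS; i++)            /* release them */
                    atomic_fetch_sub(&passive_wait[i], 1);
            atomic_store(&active, -1);
    }

    static void *cpu_thread(void *arg)
    {
            int cpu = (int)(long)arg;

            exception_state[cpu] = cpu ? IS_SLAVE : WANT_MASTER;
            cpu_enter(cpu);
            printf("cpu%d: back to normal operation\n", cpu);
            return NULL;
    }

    int main(void)
    {
            pthread_t t[NCPUS];

            for (long i = 0; i < NCPUS; i++)
                    pthread_create(&t[i], NULL, cpu_thread, (void *)i);
            for (int i = 0; i < NCPUS; i++)
                    pthread_join(t[i], NULL);
            return 0;
    }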
@@ -1742,11 +1743,11 @@ EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
1742 */ 1743 */
1743void kgdb_breakpoint(void) 1744void kgdb_breakpoint(void)
1744{ 1745{
1745 atomic_set(&kgdb_setting_breakpoint, 1); 1746 atomic_inc(&kgdb_setting_breakpoint);
1746 wmb(); /* Sync point before breakpoint */ 1747 wmb(); /* Sync point before breakpoint */
1747 arch_kgdb_breakpoint(); 1748 arch_kgdb_breakpoint();
1748 wmb(); /* Sync point after breakpoint */ 1749 wmb(); /* Sync point after breakpoint */
1749 atomic_set(&kgdb_setting_breakpoint, 0); 1750 atomic_dec(&kgdb_setting_breakpoint);
1750} 1751}
1751EXPORT_SYMBOL_GPL(kgdb_breakpoint); 1752EXPORT_SYMBOL_GPL(kgdb_breakpoint);
1752 1753
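The kgdb_breakpoint() hunk is the same boolean-to-counter conversion already applied to cpu_in_kgdb[] and passive_cpu_wait[]: atomic_inc()/atomic_dec() nest correctly, whereas atomic_set(1)/atomic_set(0) lets one exiting user clear the flag out from under another still inside. A minimal sketch of the difference, with C11 atomics standing in for the kernel's atomic_t:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int in_breakpoint;        /* a counter, like the patched code */

    static void enter(void) { atomic_fetch_add(&in_breakpoint, 1); }
    static void leave(void) { atomic_fetch_sub(&in_breakpoint, 1); }
    static bool active(void) { return atomic_load(&in_breakpoint) != 0; }

    int main(void)
    {
            enter();                        /* first user */
            enter();                        /* nested or concurrent second user */
            leave();                        /* second user leaves */
            /* with atomic_set(flag, 0) this would already read "inactive"
             * even though the first user is still inside */
            printf("still active: %d\n", active());
            leave();
            printf("still active: %d\n", active());
            return 0;
    }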
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ccec774c716d..282035f3ae96 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -42,9 +42,11 @@
42#include <linux/freezer.h> 42#include <linux/freezer.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/sysctl.h>
45#include <linux/kdebug.h> 46#include <linux/kdebug.h>
46#include <linux/memory.h> 47#include <linux/memory.h>
47#include <linux/ftrace.h> 48#include <linux/ftrace.h>
49#include <linux/cpu.h>
48 50
49#include <asm-generic/sections.h> 51#include <asm-generic/sections.h>
50#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
@@ -105,57 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
105 * stepping on the instruction on a vmalloced/kmalloced/data page 107 * stepping on the instruction on a vmalloced/kmalloced/data page
106 * is a recipe for disaster 108 * is a recipe for disaster
107 */ 109 */
108#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
109
110struct kprobe_insn_page { 110struct kprobe_insn_page {
111 struct list_head list; 111 struct list_head list;
112 kprobe_opcode_t *insns; /* Page of instruction slots */ 112 kprobe_opcode_t *insns; /* Page of instruction slots */
113 char slot_used[INSNS_PER_PAGE];
114 int nused; 113 int nused;
115 int ngarbage; 114 int ngarbage;
115 char slot_used[];
116};
117
118#define KPROBE_INSN_PAGE_SIZE(slots) \
119 (offsetof(struct kprobe_insn_page, slot_used) + \
120 (sizeof(char) * (slots)))
121
122struct kprobe_insn_cache {
123 struct list_head pages; /* list of kprobe_insn_page */
124 size_t insn_size; /* size of instruction slot */
125 int nr_garbage;
116}; 126};
117 127
128static int slots_per_page(struct kprobe_insn_cache *c)
129{
130 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
131}
132
118enum kprobe_slot_state { 133enum kprobe_slot_state {
119 SLOT_CLEAN = 0, 134 SLOT_CLEAN = 0,
120 SLOT_DIRTY = 1, 135 SLOT_DIRTY = 1,
121 SLOT_USED = 2, 136 SLOT_USED = 2,
122}; 137};
123 138
124static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 139static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */
125static LIST_HEAD(kprobe_insn_pages); 140static struct kprobe_insn_cache kprobe_insn_slots = {
126static int kprobe_garbage_slots; 141 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
127static int collect_garbage_slots(void); 142 .insn_size = MAX_INSN_SIZE,
143 .nr_garbage = 0,
144};
145static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
128 146
129/** 147/**
130 * __get_insn_slot() - Find a slot on an executable page for an instruction. 148 * __get_insn_slot() - Find a slot on an executable page for an instruction.
131 * We allocate an executable page if there's no room on existing ones. 149 * We allocate an executable page if there's no room on existing ones.
132 */ 150 */
133static kprobe_opcode_t __kprobes *__get_insn_slot(void) 151static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
134{ 152{
135 struct kprobe_insn_page *kip; 153 struct kprobe_insn_page *kip;
136 154
137 retry: 155 retry:
138 list_for_each_entry(kip, &kprobe_insn_pages, list) { 156 list_for_each_entry(kip, &c->pages, list) {
139 if (kip->nused < INSNS_PER_PAGE) { 157 if (kip->nused < slots_per_page(c)) {
140 int i; 158 int i;
141 for (i = 0; i < INSNS_PER_PAGE; i++) { 159 for (i = 0; i < slots_per_page(c); i++) {
142 if (kip->slot_used[i] == SLOT_CLEAN) { 160 if (kip->slot_used[i] == SLOT_CLEAN) {
143 kip->slot_used[i] = SLOT_USED; 161 kip->slot_used[i] = SLOT_USED;
144 kip->nused++; 162 kip->nused++;
145 return kip->insns + (i * MAX_INSN_SIZE); 163 return kip->insns + (i * c->insn_size);
146 } 164 }
147 } 165 }
148 /* Surprise! No unused slots. Fix kip->nused. */ 166 /* kip->nused is broken. Fix it. */
149 kip->nused = INSNS_PER_PAGE; 167 kip->nused = slots_per_page(c);
168 WARN_ON(1);
150 } 169 }
151 } 170 }
152 171
153 /* If there are any garbage slots, collect them and try again. */ 172 /* If there are any garbage slots, collect them and try again. */
154 if (kprobe_garbage_slots && collect_garbage_slots() == 0) { 173 if (c->nr_garbage && collect_garbage_slots(c) == 0)
155 goto retry; 174 goto retry;
156 } 175
157 /* All out of space. Need to allocate a new page. Use slot 0. */ 176 /* All out of space. Need to allocate a new page. */
158 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 177 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
159 if (!kip) 178 if (!kip)
160 return NULL; 179 return NULL;
161 180
@@ -170,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
170 return NULL; 189 return NULL;
171 } 190 }
172 INIT_LIST_HEAD(&kip->list); 191 INIT_LIST_HEAD(&kip->list);
173 list_add(&kip->list, &kprobe_insn_pages); 192 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
174 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
175 kip->slot_used[0] = SLOT_USED; 193 kip->slot_used[0] = SLOT_USED;
176 kip->nused = 1; 194 kip->nused = 1;
177 kip->ngarbage = 0; 195 kip->ngarbage = 0;
196 list_add(&kip->list, &c->pages);
178 return kip->insns; 197 return kip->insns;
179} 198}
180 199
200
181kprobe_opcode_t __kprobes *get_insn_slot(void) 201kprobe_opcode_t __kprobes *get_insn_slot(void)
182{ 202{
183 kprobe_opcode_t *ret; 203 kprobe_opcode_t *ret = NULL;
204
184 mutex_lock(&kprobe_insn_mutex); 205 mutex_lock(&kprobe_insn_mutex);
185 ret = __get_insn_slot(); 206 ret = __get_insn_slot(&kprobe_insn_slots);
186 mutex_unlock(&kprobe_insn_mutex); 207 mutex_unlock(&kprobe_insn_mutex);
208
187 return ret; 209 return ret;
188} 210}
189 211
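struct kprobe_insn_page now ends in a C99 flexible array member, so slot_used[] can be sized per cache at allocation time, and KPROBE_INSN_PAGE_SIZE() computes the allocation as offsetof() of the array plus one byte per slot. The same allocation pattern in isolation (the struct and sizes here are illustrative):

    #include <stdio.h>
    #include <stdlib.h>
    #include <stddef.h>
    #include <string.h>

    struct slot_page {
            int  nused;
            char slot_used[];       /* flexible array member, sized at alloc time */
    };

    #define SLOT_PAGE_SIZE(slots) \
            (offsetof(struct slot_page, slot_used) + sizeof(char) * (slots))

    int main(void)
    {
            int slots = 16;
            struct slot_page *p = malloc(SLOT_PAGE_SIZE(slots));

            if (!p)
                    return 1;
            memset(p->slot_used, 0, slots);
            p->nused = 0;
            printf("header %zu bytes, total %zu bytes for %d slots\n",
                   offsetof(struct slot_page, slot_used),
                   SLOT_PAGE_SIZE(slots), slots);
            free(p);
            return 0;
    }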
@@ -199,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
199 * so as not to have to set it up again the 221 * so as not to have to set it up again the
200 * next time somebody inserts a probe. 222 * next time somebody inserts a probe.
201 */ 223 */
202 if (!list_is_singular(&kprobe_insn_pages)) { 224 if (!list_is_singular(&kip->list)) {
203 list_del(&kip->list); 225 list_del(&kip->list);
204 module_free(NULL, kip->insns); 226 module_free(NULL, kip->insns);
205 kfree(kip); 227 kfree(kip);
@@ -209,51 +231,85 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
209 return 0; 231 return 0;
210} 232}
211 233
212static int __kprobes collect_garbage_slots(void) 234static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
213{ 235{
214 struct kprobe_insn_page *kip, *next; 236 struct kprobe_insn_page *kip, *next;
215 237
216 /* Ensure no-one is interrupted on the garbages */ 238 /* Ensure no-one is interrupted on the garbages */
217 synchronize_sched(); 239 synchronize_sched();
218 240
219 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { 241 list_for_each_entry_safe(kip, next, &c->pages, list) {
220 int i; 242 int i;
221 if (kip->ngarbage == 0) 243 if (kip->ngarbage == 0)
222 continue; 244 continue;
223 kip->ngarbage = 0; /* we will collect all garbage slots */ 245 kip->ngarbage = 0; /* we will collect all garbage slots */
224 for (i = 0; i < INSNS_PER_PAGE; i++) { 246 for (i = 0; i < slots_per_page(c); i++) {
225 if (kip->slot_used[i] == SLOT_DIRTY && 247 if (kip->slot_used[i] == SLOT_DIRTY &&
226 collect_one_slot(kip, i)) 248 collect_one_slot(kip, i))
227 break; 249 break;
228 } 250 }
229 } 251 }
230 kprobe_garbage_slots = 0; 252 c->nr_garbage = 0;
231 return 0; 253 return 0;
232} 254}
233 255
234void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 256static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
257 kprobe_opcode_t *slot, int dirty)
235{ 258{
236 struct kprobe_insn_page *kip; 259 struct kprobe_insn_page *kip;
237 260
238 mutex_lock(&kprobe_insn_mutex); 261 list_for_each_entry(kip, &c->pages, list) {
239 list_for_each_entry(kip, &kprobe_insn_pages, list) { 262 long idx = ((long)slot - (long)kip->insns) /
240 if (kip->insns <= slot && 263 (c->insn_size * sizeof(kprobe_opcode_t));
241 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 264 if (idx >= 0 && idx < slots_per_page(c)) {
242 int i = (slot - kip->insns) / MAX_INSN_SIZE; 265 WARN_ON(kip->slot_used[idx] != SLOT_USED);
243 if (dirty) { 266 if (dirty) {
244 kip->slot_used[i] = SLOT_DIRTY; 267 kip->slot_used[idx] = SLOT_DIRTY;
245 kip->ngarbage++; 268 kip->ngarbage++;
269 if (++c->nr_garbage > slots_per_page(c))
270 collect_garbage_slots(c);
246 } else 271 } else
247 collect_one_slot(kip, i); 272 collect_one_slot(kip, idx);
248 break; 273 return;
249 } 274 }
250 } 275 }
276 /* Could not free this slot. */
277 WARN_ON(1);
278}
251 279
252 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) 280void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
253 collect_garbage_slots(); 281{
254 282 mutex_lock(&kprobe_insn_mutex);
283 __free_insn_slot(&kprobe_insn_slots, slot, dirty);
255 mutex_unlock(&kprobe_insn_mutex); 284 mutex_unlock(&kprobe_insn_mutex);
256} 285}
286#ifdef CONFIG_OPTPROBES
287/* For optimized_kprobe buffer */
288static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */
289static struct kprobe_insn_cache kprobe_optinsn_slots = {
290 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
291 /* .insn_size is initialized later */
292 .nr_garbage = 0,
293};
294/* Get a slot for optimized_kprobe buffer */
295kprobe_opcode_t __kprobes *get_optinsn_slot(void)
296{
297 kprobe_opcode_t *ret = NULL;
298
299 mutex_lock(&kprobe_optinsn_mutex);
300 ret = __get_insn_slot(&kprobe_optinsn_slots);
301 mutex_unlock(&kprobe_optinsn_mutex);
302
303 return ret;
304}
305
306void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
307{
308 mutex_lock(&kprobe_optinsn_mutex);
309 __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
310 mutex_unlock(&kprobe_optinsn_mutex);
311}
312#endif
257#endif 313#endif
258 314
259/* We have preemption disabled.. so it is safe to use __ versions */ 315/* We have preemption disabled.. so it is safe to use __ versions */
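In the reworked __free_insn_slot() above, the owning page is found by converting the slot pointer into an index ((slot - kip->insns) / slot size) and range-checking it against slots_per_page(), instead of comparing the pointer against both page bounds. The arithmetic in isolation, with made-up sizes:

    #include <stdio.h>

    #define INSN_SIZE       16      /* slot size in opcode words; illustrative */
    #define SLOTS_PER_PAGE  8

    /* Map a slot pointer back to its index within the page, or -1. */
    static long slot_index(const unsigned int *page, const unsigned int *slot)
    {
            long idx = ((long)slot - (long)page) /
                       (INSN_SIZE * (long)sizeof(unsigned int));

            return (idx >= 0 && idx < SLOTS_PER_PAGE) ? idx : -1;
    }

    int main(void)
    {
            unsigned int page[INSN_SIZE * SLOTS_PER_PAGE];

            printf("%ld\n", slot_index(page, &page[0]));                /* 0 */
            printf("%ld\n", slot_index(page, &page[3 * INSN_SIZE]));    /* 3 */
            printf("%ld\n", slot_index(page,
                            page + INSN_SIZE * SLOTS_PER_PAGE));        /* -1 */
            return 0;
    }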
@@ -284,23 +340,401 @@ struct kprobe __kprobes *get_kprobe(void *addr)
284 if (p->addr == addr) 340 if (p->addr == addr)
285 return p; 341 return p;
286 } 342 }
343
287 return NULL; 344 return NULL;
288} 345}
289 346
347static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
348
349/* Return true if the kprobe is an aggregator */
350static inline int kprobe_aggrprobe(struct kprobe *p)
351{
352 return p->pre_handler == aggr_pre_handler;
353}
354
355/*
356 * Keep all fields in the kprobe consistent
357 */
358static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
359{
360 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
361 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
362}
363
364#ifdef CONFIG_OPTPROBES
365/* NOTE: change this value only with kprobe_mutex held */
366static bool kprobes_allow_optimization;
367
368/*
369 * Call all pre_handlers on the list, but ignore their return values.
370 * This must be called from the arch-dependent optimized caller.
371 */
372void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
373{
374 struct kprobe *kp;
375
376 list_for_each_entry_rcu(kp, &p->list, list) {
377 if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
378 set_kprobe_instance(kp);
379 kp->pre_handler(kp, regs);
380 }
381 reset_kprobe_instance();
382 }
383}
384
385/* Return true(!0) if the kprobe is ready for optimization. */
386static inline int kprobe_optready(struct kprobe *p)
387{
388 struct optimized_kprobe *op;
389
390 if (kprobe_aggrprobe(p)) {
391 op = container_of(p, struct optimized_kprobe, kp);
392 return arch_prepared_optinsn(&op->optinsn);
393 }
394
395 return 0;
396}
397
398/*
399 * Return an optimized kprobe whose optimizing code replaces
400 * instructions including addr (excluding the breakpoint itself).
401 */
402struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
403{
404 int i;
405 struct kprobe *p = NULL;
406 struct optimized_kprobe *op;
407
408 /* Don't check i == 0, since that is a breakpoint case. */
409 for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
410 p = get_kprobe((void *)(addr - i));
411
412 if (p && kprobe_optready(p)) {
413 op = container_of(p, struct optimized_kprobe, kp);
414 if (arch_within_optimized_kprobe(op, addr))
415 return p;
416 }
417
418 return NULL;
419}
420
421/* Optimization staging list, protected by kprobe_mutex */
422static LIST_HEAD(optimizing_list);
423
424static void kprobe_optimizer(struct work_struct *work);
425static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
426#define OPTIMIZE_DELAY 5
427
428/* Kprobe jump optimizer */
429static __kprobes void kprobe_optimizer(struct work_struct *work)
430{
431 struct optimized_kprobe *op, *tmp;
432
433 /* Lock modules while optimizing kprobes */
434 mutex_lock(&module_mutex);
435 mutex_lock(&kprobe_mutex);
436 if (kprobes_all_disarmed || !kprobes_allow_optimization)
437 goto end;
438
439 /*
440 * Wait for quiesence period to ensure all running interrupts
441 * are done. Because optprobe may modify multiple instructions
442 * there is a chance that Nth instruction is interrupted. In that
443 * case, running interrupt can return to 2nd-Nth byte of jump
444 * instruction. This wait is for avoiding it.
445 */
446 synchronize_sched();
447
448 /*
449 * The optimization/unoptimization refers to online_cpus via
450 * stop_machine(), while cpu-hotplug modifies online_cpus.
451 * At the same time, text_mutex is held both in cpu-hotplug and here.
452 * This combination can cause a deadlock (cpu-hotplug tries to lock
453 * text_mutex, but stop_machine() cannot proceed because online_cpus
454 * has changed).
455 * To avoid this deadlock, we call get_online_cpus() to prevent
456 * cpu-hotplug while text_mutex is held.
457 */
458 get_online_cpus();
459 mutex_lock(&text_mutex);
460 list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
461 WARN_ON(kprobe_disabled(&op->kp));
462 if (arch_optimize_kprobe(op) < 0)
463 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
464 list_del_init(&op->list);
465 }
466 mutex_unlock(&text_mutex);
467 put_online_cpus();
468end:
469 mutex_unlock(&kprobe_mutex);
470 mutex_unlock(&module_mutex);
471}
472
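kprobe_optimizer() drains optimizing_list from delayed work, and optimize_kprobe() below schedules that work only when none is pending, so a burst of registrations is coalesced into one pass after OPTIMIZE_DELAY jiffies. A kernel-style sketch of that coalescing pattern, with an illustrative list and lock in place of kprobe_mutex:

    #include <linux/workqueue.h>
    #include <linux/list.h>
    #include <linux/mutex.h>

    struct item {
            struct list_head list;
    };

    static LIST_HEAD(pending_list);         /* protected by pending_mutex */
    static DEFINE_MUTEX(pending_mutex);

    static void batch_worker(struct work_struct *work);
    static DECLARE_DELAYED_WORK(batch_work, batch_worker);
    #define BATCH_DELAY 5                   /* jiffies; illustrative */

    static void batch_worker(struct work_struct *work)
    {
            struct item *it, *tmp;

            mutex_lock(&pending_mutex);
            list_for_each_entry_safe(it, tmp, &pending_list, list)
                    list_del_init(&it->list);       /* process one batch */
            mutex_unlock(&pending_mutex);
    }

    static void queue_item(struct item *it)
    {
            mutex_lock(&pending_mutex);
            list_add(&it->list, &pending_list);
            /* coalesce: at most one delayed work instance in flight */
            if (!delayed_work_pending(&batch_work))
                    schedule_delayed_work(&batch_work, BATCH_DELAY);
            mutex_unlock(&pending_mutex);
    }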
473/* Optimize kprobe if p is ready to be optimized */
474static __kprobes void optimize_kprobe(struct kprobe *p)
475{
476 struct optimized_kprobe *op;
477
478 /* Check if the kprobe is disabled or not ready for optimization. */
479 if (!kprobe_optready(p) || !kprobes_allow_optimization ||
480 (kprobe_disabled(p) || kprobes_all_disarmed))
481 return;
482
483 /* Both of break_handler and post_handler are not supported. */
484 if (p->break_handler || p->post_handler)
485 return;
486
487 op = container_of(p, struct optimized_kprobe, kp);
488
489 /* Check that there are no other kprobes at the optimized instructions */
490 if (arch_check_optimized_kprobe(op) < 0)
491 return;
492
493 /* Check if it is already optimized. */
494 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
495 return;
496
497 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
498 list_add(&op->list, &optimizing_list);
499 if (!delayed_work_pending(&optimizing_work))
500 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
501}
502
503/* Unoptimize a kprobe if p is optimized */
504static __kprobes void unoptimize_kprobe(struct kprobe *p)
505{
506 struct optimized_kprobe *op;
507
508 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
509 op = container_of(p, struct optimized_kprobe, kp);
510 if (!list_empty(&op->list))
511 /* Dequeue from the optimization queue */
512 list_del_init(&op->list);
513 else
514 /* Replace jump with break */
515 arch_unoptimize_kprobe(op);
516 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
517 }
518}
519
520/* Remove optimized instructions */
521static void __kprobes kill_optimized_kprobe(struct kprobe *p)
522{
523 struct optimized_kprobe *op;
524
525 op = container_of(p, struct optimized_kprobe, kp);
526 if (!list_empty(&op->list)) {
527 /* Dequeue from the optimization queue */
528 list_del_init(&op->list);
529 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
530 }
531 /* Don't unoptimize, because the target code will be freed. */
532 arch_remove_optimized_kprobe(op);
533}
534
535/* Try to prepare optimized instructions */
536static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
537{
538 struct optimized_kprobe *op;
539
540 op = container_of(p, struct optimized_kprobe, kp);
541 arch_prepare_optimized_kprobe(op);
542}
543
544/* Free optimized instructions and optimized_kprobe */
545static __kprobes void free_aggr_kprobe(struct kprobe *p)
546{
547 struct optimized_kprobe *op;
548
549 op = container_of(p, struct optimized_kprobe, kp);
550 arch_remove_optimized_kprobe(op);
551 kfree(op);
552}
553
554/* Allocate new optimized_kprobe and try to prepare optimized instructions */
555static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
556{
557 struct optimized_kprobe *op;
558
559 op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
560 if (!op)
561 return NULL;
562
563 INIT_LIST_HEAD(&op->list);
564 op->kp.addr = p->addr;
565 arch_prepare_optimized_kprobe(op);
566
567 return &op->kp;
568}
569
570static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
571
572/*
573 * Prepare an optimized_kprobe and optimize it
574 * NOTE: p must be a normal registered kprobe
575 */
576static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
577{
578 struct kprobe *ap;
579 struct optimized_kprobe *op;
580
581 ap = alloc_aggr_kprobe(p);
582 if (!ap)
583 return;
584
585 op = container_of(ap, struct optimized_kprobe, kp);
586 if (!arch_prepared_optinsn(&op->optinsn)) {
587 /* If setting up the optimization failed, fall back to a plain kprobe */
588 free_aggr_kprobe(ap);
589 return;
590 }
591
592 init_aggr_kprobe(ap, p);
593 optimize_kprobe(ap);
594}
595
596#ifdef CONFIG_SYSCTL
597static void __kprobes optimize_all_kprobes(void)
598{
599 struct hlist_head *head;
600 struct hlist_node *node;
601 struct kprobe *p;
602 unsigned int i;
603
604 /* If optimization is already allowed, just return */
605 if (kprobes_allow_optimization)
606 return;
607
608 kprobes_allow_optimization = true;
609 mutex_lock(&text_mutex);
610 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
611 head = &kprobe_table[i];
612 hlist_for_each_entry_rcu(p, node, head, hlist)
613 if (!kprobe_disabled(p))
614 optimize_kprobe(p);
615 }
616 mutex_unlock(&text_mutex);
617 printk(KERN_INFO "Kprobes globally optimized\n");
618}
619
620static void __kprobes unoptimize_all_kprobes(void)
621{
622 struct hlist_head *head;
623 struct hlist_node *node;
624 struct kprobe *p;
625 unsigned int i;
626
627 /* If optimization is already prohibited, just return */
628 if (!kprobes_allow_optimization)
629 return;
630
631 kprobes_allow_optimization = false;
632 printk(KERN_INFO "Kprobes globally unoptimized\n");
633 get_online_cpus(); /* For avoiding text_mutex deadlock */
634 mutex_lock(&text_mutex);
635 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
636 head = &kprobe_table[i];
637 hlist_for_each_entry_rcu(p, node, head, hlist) {
638 if (!kprobe_disabled(p))
639 unoptimize_kprobe(p);
640 }
641 }
642
643 mutex_unlock(&text_mutex);
644 put_online_cpus();
645 /* Allow all currently running kprobes to complete */
646 synchronize_sched();
647}
648
649int sysctl_kprobes_optimization;
650int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
651 void __user *buffer, size_t *length,
652 loff_t *ppos)
653{
654 int ret;
655
656 mutex_lock(&kprobe_mutex);
657 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
658 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
659
660 if (sysctl_kprobes_optimization)
661 optimize_all_kprobes();
662 else
663 unoptimize_all_kprobes();
664 mutex_unlock(&kprobe_mutex);
665
666 return ret;
667}
668#endif /* CONFIG_SYSCTL */
669
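proc_kprobes_optimization_handler() snapshots kprobes_allow_optimization into sysctl_kprobes_optimization, lets proc_dointvec_minmax() apply the write, then flips global optimization to match. The matching ctl_table entry lives in kernel/sysctl.c (see the diffstat); a hedged sketch of what such an entry looks like, assuming min/max bounds of 0 and 1 for the knob:

    #include <linux/sysctl.h>

    static int zero;
    static int one = 1;

    static struct ctl_table kprobes_sysctl_entry[] = {
            {
                    .procname     = "kprobes-optimization",
                    .data         = &sysctl_kprobes_optimization,
                    .maxlen       = sizeof(int),
                    .mode         = 0644,
                    .proc_handler = proc_kprobes_optimization_handler,
                    .extra1       = &zero,  /* min for proc_dointvec_minmax */
                    .extra2       = &one,   /* max */
            },
            { }
    };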
670static void __kprobes __arm_kprobe(struct kprobe *p)
671{
672 struct kprobe *old_p;
673
674 /* Check collision with other optimized kprobes */
675 old_p = get_optimized_kprobe((unsigned long)p->addr);
676 if (unlikely(old_p))
677 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
678
679 arch_arm_kprobe(p);
680 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
681}
682
683static void __kprobes __disarm_kprobe(struct kprobe *p)
684{
685 struct kprobe *old_p;
686
687 unoptimize_kprobe(p); /* Try to unoptimize */
688 arch_disarm_kprobe(p);
689
690 /* If another kprobe was blocked, optimize it. */
691 old_p = get_optimized_kprobe((unsigned long)p->addr);
692 if (unlikely(old_p))
693 optimize_kprobe(old_p);
694}
695
696#else /* !CONFIG_OPTPROBES */
697
698#define optimize_kprobe(p) do {} while (0)
699#define unoptimize_kprobe(p) do {} while (0)
700#define kill_optimized_kprobe(p) do {} while (0)
701#define prepare_optimized_kprobe(p) do {} while (0)
702#define try_to_optimize_kprobe(p) do {} while (0)
703#define __arm_kprobe(p) arch_arm_kprobe(p)
704#define __disarm_kprobe(p) arch_disarm_kprobe(p)
705
706static __kprobes void free_aggr_kprobe(struct kprobe *p)
707{
708 kfree(p);
709}
710
711static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
712{
713 return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
714}
715#endif /* CONFIG_OPTPROBES */
716
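With CONFIG_OPTPROBES off, the hooks above collapse into do {} while (0) stubs. That idiom expands to exactly one statement, so the stub plus its trailing semicolon parses the same way a real call would; a {}-style stub followed by ';' would instead close an unbraced if early and orphan its else. A tiny illustration:

    #include <stdio.h>

    /* stub as in the !CONFIG_OPTPROBES branch above */
    #define optimize_kprobe(p) do {} while (0)

    int main(void)
    {
            int cond = 1;

            if (cond)
                    optimize_kprobe(NULL);  /* one statement + ';' parses cleanly */
            else
                    printf("not optimizing\n");
            /* had the stub been "{}", the expansion "{};" would end the
             * unbraced if and leave the else with nothing to attach to */
            return 0;
    }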
290/* Arm a kprobe with text_mutex */ 717/* Arm a kprobe with text_mutex */
291static void __kprobes arm_kprobe(struct kprobe *kp) 718static void __kprobes arm_kprobe(struct kprobe *kp)
292{ 719{
720 /*
721 * Here, since __arm_kprobe() doesn't use stop_machine(),
722 * this doesn't cause deadlock on text_mutex. So, we don't
723 * need get_online_cpus().
724 */
293 mutex_lock(&text_mutex); 725 mutex_lock(&text_mutex);
294 arch_arm_kprobe(kp); 726 __arm_kprobe(kp);
295 mutex_unlock(&text_mutex); 727 mutex_unlock(&text_mutex);
296} 728}
297 729
298/* Disarm a kprobe with text_mutex */ 730/* Disarm a kprobe with text_mutex */
299static void __kprobes disarm_kprobe(struct kprobe *kp) 731static void __kprobes disarm_kprobe(struct kprobe *kp)
300{ 732{
733 get_online_cpus(); /* For avoiding text_mutex deadlock */
301 mutex_lock(&text_mutex); 734 mutex_lock(&text_mutex);
302 arch_disarm_kprobe(kp); 735 __disarm_kprobe(kp);
303 mutex_unlock(&text_mutex); 736 mutex_unlock(&text_mutex);
737 put_online_cpus();
304} 738}
305 739
306/* 740/*
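disarm_kprobe() now takes get_online_cpus() before text_mutex, the same order the optimizer and the unoptimize paths use, so no thread can hold text_mutex while waiting out CPU hotplug. A userspace model of why one global acquisition order prevents the cycle (get_online_cpus() is really a reader refcount, modelled here as a plain mutex for simplicity):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t text_mutex   = PTHREAD_MUTEX_INITIALIZER;

    /* Both paths take hotplug_lock before text_mutex; with a consistent
     * order no wait cycle can form, so neither thread can deadlock. */
    static void *patcher(void *name)
    {
            for (int i = 0; i < 1000; i++) {
                    pthread_mutex_lock(&hotplug_lock);   /* get_online_cpus() */
                    pthread_mutex_lock(&text_mutex);
                    /* ... patch text ... */
                    pthread_mutex_unlock(&text_mutex);
                    pthread_mutex_unlock(&hotplug_lock); /* put_online_cpus() */
            }
            printf("%s done\n", (char *)name);
            return NULL;
    }

    int main(void)
    {
            pthread_t a, b;

            pthread_create(&a, NULL, patcher, "kprobes path");
            pthread_create(&b, NULL, patcher, "hotplug path");
            pthread_join(a, NULL);
            pthread_join(b, NULL);
            return 0;
    }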
@@ -369,7 +803,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
369void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) 803void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
370{ 804{
371 struct kprobe *kp; 805 struct kprobe *kp;
372 if (p->pre_handler != aggr_pre_handler) { 806 if (!kprobe_aggrprobe(p)) {
373 p->nmissed++; 807 p->nmissed++;
374 } else { 808 } else {
375 list_for_each_entry_rcu(kp, &p->list, list) 809 list_for_each_entry_rcu(kp, &p->list, list)
@@ -493,21 +927,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
493} 927}
494 928
495/* 929/*
496 * Keep all fields in the kprobe consistent
497 */
498static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
499{
500 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
501 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
502}
503
504/*
505* Add the new probe to ap->list. Fail if this is the 930* Add the new probe to ap->list. Fail if this is the
506* second jprobe at the address - two jprobes can't coexist 931* second jprobe at the address - two jprobes can't coexist
507*/ 932*/
508static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) 933static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
509{ 934{
510 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 935 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
936
937 if (p->break_handler || p->post_handler)
938 unoptimize_kprobe(ap); /* Fall back to normal kprobe */
939
511 if (p->break_handler) { 940 if (p->break_handler) {
512 if (ap->break_handler) 941 if (ap->break_handler)
513 return -EEXIST; 942 return -EEXIST;
@@ -522,7 +951,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
522 ap->flags &= ~KPROBE_FLAG_DISABLED; 951 ap->flags &= ~KPROBE_FLAG_DISABLED;
523 if (!kprobes_all_disarmed) 952 if (!kprobes_all_disarmed)
524 /* Arm the breakpoint again. */ 953 /* Arm the breakpoint again. */
525 arm_kprobe(ap); 954 __arm_kprobe(ap);
526 } 955 }
527 return 0; 956 return 0;
528} 957}
@@ -531,12 +960,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
531 * Fill in the required fields of the "manager kprobe". Replace the 960 * Fill in the required fields of the "manager kprobe". Replace the
532 * earlier kprobe in the hlist with the manager kprobe 961 * earlier kprobe in the hlist with the manager kprobe
533 */ 962 */
534static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 963static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
535{ 964{
965 /* Copy p's insn slot to ap */
536 copy_kprobe(p, ap); 966 copy_kprobe(p, ap);
537 flush_insn_slot(ap); 967 flush_insn_slot(ap);
538 ap->addr = p->addr; 968 ap->addr = p->addr;
539 ap->flags = p->flags; 969 ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
540 ap->pre_handler = aggr_pre_handler; 970 ap->pre_handler = aggr_pre_handler;
541 ap->fault_handler = aggr_fault_handler; 971 ap->fault_handler = aggr_fault_handler;
542 /* We don't care about a kprobe which has gone. */ 972 /* We don't care about a kprobe which has gone. */
@@ -546,8 +976,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
546 ap->break_handler = aggr_break_handler; 976 ap->break_handler = aggr_break_handler;
547 977
548 INIT_LIST_HEAD(&ap->list); 978 INIT_LIST_HEAD(&ap->list);
549 list_add_rcu(&p->list, &ap->list); 979 INIT_HLIST_NODE(&ap->hlist);
550 980
981 list_add_rcu(&p->list, &ap->list);
551 hlist_replace_rcu(&p->hlist, &ap->hlist); 982 hlist_replace_rcu(&p->hlist, &ap->hlist);
552} 983}
553 984
@@ -561,12 +992,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
561 int ret = 0; 992 int ret = 0;
562 struct kprobe *ap = old_p; 993 struct kprobe *ap = old_p;
563 994
564 if (old_p->pre_handler != aggr_pre_handler) { 995 if (!kprobe_aggrprobe(old_p)) {
565 /* If old_p is not an aggr_probe, create new aggr_kprobe. */ 996 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
566 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 997 ap = alloc_aggr_kprobe(old_p);
567 if (!ap) 998 if (!ap)
568 return -ENOMEM; 999 return -ENOMEM;
569 add_aggr_kprobe(ap, old_p); 1000 init_aggr_kprobe(ap, old_p);
570 } 1001 }
571 1002
572 if (kprobe_gone(ap)) { 1003 if (kprobe_gone(ap)) {
@@ -585,6 +1016,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
585 */ 1016 */
586 return ret; 1017 return ret;
587 1018
1019 /* Prepare optimized instructions if possible. */
1020 prepare_optimized_kprobe(ap);
1021
588 /* 1022 /*
589 * Clear gone flag to prevent allocating new slot again, and 1023 * Clear gone flag to prevent allocating new slot again, and
590 * set disabled flag because it is not armed yet. 1024 * set disabled flag because it is not armed yet.
@@ -593,6 +1027,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
593 | KPROBE_FLAG_DISABLED; 1027 | KPROBE_FLAG_DISABLED;
594 } 1028 }
595 1029
1030 /* Copy ap's insn slot to p */
596 copy_kprobe(ap, p); 1031 copy_kprobe(ap, p);
597 return add_new_kprobe(ap, p); 1032 return add_new_kprobe(ap, p);
598} 1033}
@@ -743,27 +1178,34 @@ int __kprobes register_kprobe(struct kprobe *p)
743 p->nmissed = 0; 1178 p->nmissed = 0;
744 INIT_LIST_HEAD(&p->list); 1179 INIT_LIST_HEAD(&p->list);
745 mutex_lock(&kprobe_mutex); 1180 mutex_lock(&kprobe_mutex);
1181
1182 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1183 mutex_lock(&text_mutex);
1184
746 old_p = get_kprobe(p->addr); 1185 old_p = get_kprobe(p->addr);
747 if (old_p) { 1186 if (old_p) {
1187 /* Since this may unoptimize old_p, text_mutex is held here. */
748 ret = register_aggr_kprobe(old_p, p); 1188 ret = register_aggr_kprobe(old_p, p);
749 goto out; 1189 goto out;
750 } 1190 }
751 1191
752 mutex_lock(&text_mutex);
753 ret = arch_prepare_kprobe(p); 1192 ret = arch_prepare_kprobe(p);
754 if (ret) 1193 if (ret)
755 goto out_unlock_text; 1194 goto out;
756 1195
757 INIT_HLIST_NODE(&p->hlist); 1196 INIT_HLIST_NODE(&p->hlist);
758 hlist_add_head_rcu(&p->hlist, 1197 hlist_add_head_rcu(&p->hlist,
759 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 1198 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
760 1199
761 if (!kprobes_all_disarmed && !kprobe_disabled(p)) 1200 if (!kprobes_all_disarmed && !kprobe_disabled(p))
762 arch_arm_kprobe(p); 1201 __arm_kprobe(p);
1202
1203 /* Try to optimize kprobe */
1204 try_to_optimize_kprobe(p);
763 1205
764out_unlock_text:
765 mutex_unlock(&text_mutex);
766out: 1206out:
1207 mutex_unlock(&text_mutex);
1208 put_online_cpus();
767 mutex_unlock(&kprobe_mutex); 1209 mutex_unlock(&kprobe_mutex);
768 1210
769 if (probed_mod) 1211 if (probed_mod)
@@ -785,7 +1227,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
785 return -EINVAL; 1227 return -EINVAL;
786 1228
787 if (old_p == p || 1229 if (old_p == p ||
788 (old_p->pre_handler == aggr_pre_handler && 1230 (kprobe_aggrprobe(old_p) &&
789 list_is_singular(&old_p->list))) { 1231 list_is_singular(&old_p->list))) {
790 /* 1232 /*
791 * Only probe on the hash list. Disarm only if kprobes are 1233 * Only probe on the hash list. Disarm only if kprobes are
@@ -793,7 +1235,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
793 * already have been removed. We save on flushing icache. 1235 * already have been removed. We save on flushing icache.
794 */ 1236 */
795 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1237 if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
796 disarm_kprobe(p); 1238 disarm_kprobe(old_p);
797 hlist_del_rcu(&old_p->hlist); 1239 hlist_del_rcu(&old_p->hlist);
798 } else { 1240 } else {
799 if (p->break_handler && !kprobe_gone(p)) 1241 if (p->break_handler && !kprobe_gone(p))
@@ -809,8 +1251,13 @@ noclean:
809 list_del_rcu(&p->list); 1251 list_del_rcu(&p->list);
810 if (!kprobe_disabled(old_p)) { 1252 if (!kprobe_disabled(old_p)) {
811 try_to_disable_aggr_kprobe(old_p); 1253 try_to_disable_aggr_kprobe(old_p);
812 if (!kprobes_all_disarmed && kprobe_disabled(old_p)) 1254 if (!kprobes_all_disarmed) {
813 disarm_kprobe(old_p); 1255 if (kprobe_disabled(old_p))
1256 disarm_kprobe(old_p);
1257 else
1258 /* Try to optimize this probe again */
1259 optimize_kprobe(old_p);
1260 }
814 } 1261 }
815 } 1262 }
816 return 0; 1263 return 0;
@@ -827,7 +1274,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
827 old_p = list_entry(p->list.next, struct kprobe, list); 1274 old_p = list_entry(p->list.next, struct kprobe, list);
828 list_del(&p->list); 1275 list_del(&p->list);
829 arch_remove_kprobe(old_p); 1276 arch_remove_kprobe(old_p);
830 kfree(old_p); 1277 free_aggr_kprobe(old_p);
831 } 1278 }
832} 1279}
833 1280
@@ -1123,7 +1570,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1123 struct kprobe *kp; 1570 struct kprobe *kp;
1124 1571
1125 p->flags |= KPROBE_FLAG_GONE; 1572 p->flags |= KPROBE_FLAG_GONE;
1126 if (p->pre_handler == aggr_pre_handler) { 1573 if (kprobe_aggrprobe(p)) {
1127 /* 1574 /*
1128 * If this is an aggr_kprobe, we have to list all the 1575 * If this is an aggr_kprobe, we have to list all the
1129 * chained probes and mark them GONE. 1576 * chained probes and mark them GONE.
@@ -1132,6 +1579,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1132 kp->flags |= KPROBE_FLAG_GONE; 1579 kp->flags |= KPROBE_FLAG_GONE;
1133 p->post_handler = NULL; 1580 p->post_handler = NULL;
1134 p->break_handler = NULL; 1581 p->break_handler = NULL;
1582 kill_optimized_kprobe(p);
1135 } 1583 }
1136 /* 1584 /*
1137 * Here, we can remove insn_slot safely, because no thread calls 1585 * Here, we can remove insn_slot safely, because no thread calls
@@ -1140,6 +1588,72 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1140 arch_remove_kprobe(p); 1588 arch_remove_kprobe(p);
1141} 1589}
1142 1590
1591/* Disable one kprobe */
1592int __kprobes disable_kprobe(struct kprobe *kp)
1593{
1594 int ret = 0;
1595 struct kprobe *p;
1596
1597 mutex_lock(&kprobe_mutex);
1598
1599 /* Check whether specified probe is valid. */
1600 p = __get_valid_kprobe(kp);
1601 if (unlikely(p == NULL)) {
1602 ret = -EINVAL;
1603 goto out;
1604 }
1605
1606 /* If the probe is already disabled (or gone), just return */
1607 if (kprobe_disabled(kp))
1608 goto out;
1609
1610 kp->flags |= KPROBE_FLAG_DISABLED;
1611 if (p != kp)
1612 /* When kp != p, p is always enabled. */
1613 try_to_disable_aggr_kprobe(p);
1614
1615 if (!kprobes_all_disarmed && kprobe_disabled(p))
1616 disarm_kprobe(p);
1617out:
1618 mutex_unlock(&kprobe_mutex);
1619 return ret;
1620}
1621EXPORT_SYMBOL_GPL(disable_kprobe);
1622
1623/* Enable one kprobe */
1624int __kprobes enable_kprobe(struct kprobe *kp)
1625{
1626 int ret = 0;
1627 struct kprobe *p;
1628
1629 mutex_lock(&kprobe_mutex);
1630
1631 /* Check whether specified probe is valid. */
1632 p = __get_valid_kprobe(kp);
1633 if (unlikely(p == NULL)) {
1634 ret = -EINVAL;
1635 goto out;
1636 }
1637
1638 if (kprobe_gone(kp)) {
1639 /* This kprobe has gone; we can't enable it. */
1640 ret = -EINVAL;
1641 goto out;
1642 }
1643
1644 if (p != kp)
1645 kp->flags &= ~KPROBE_FLAG_DISABLED;
1646
1647 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1648 p->flags &= ~KPROBE_FLAG_DISABLED;
1649 arm_kprobe(p);
1650 }
1651out:
1652 mutex_unlock(&kprobe_mutex);
1653 return ret;
1654}
1655EXPORT_SYMBOL_GPL(enable_kprobe);
1656
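disable_kprobe()/enable_kprobe(), moved here out of the CONFIG_DEBUG_FS block so they are available unconditionally, let a module keep a probe registered while toggling whether it fires. A minimal module-style usage sketch; the probed symbol and handler are illustrative:

    #include <linux/module.h>
    #include <linux/kprobes.h>

    static int handler_pre(struct kprobe *p, struct pt_regs *regs)
    {
            pr_info("hit %p\n", p->addr);
            return 0;
    }

    static struct kprobe kp = {
            .symbol_name = "do_fork",       /* illustrative target */
            .pre_handler = handler_pre,
    };

    static int __init example_init(void)
    {
            int ret = register_kprobe(&kp);

            if (ret < 0)
                    return ret;
            disable_kprobe(&kp);    /* stays registered but stops firing */
            enable_kprobe(&kp);     /* firing again */
            return 0;
    }

    static void __exit example_exit(void)
    {
            unregister_kprobe(&kp);
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");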
1143void __kprobes dump_kprobe(struct kprobe *kp) 1657void __kprobes dump_kprobe(struct kprobe *kp)
1144{ 1658{
1145 printk(KERN_WARNING "Dumping kprobe:\n"); 1659 printk(KERN_WARNING "Dumping kprobe:\n");
@@ -1241,6 +1755,15 @@ static int __init init_kprobes(void)
1241 } 1755 }
1242 } 1756 }
1243 1757
1758#if defined(CONFIG_OPTPROBES)
1759#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
1760 /* Init kprobe_optinsn_slots */
1761 kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
1762#endif
1763 /* By default, kprobes can be optimized */
1764 kprobes_allow_optimization = true;
1765#endif
1766
1244 /* By default, kprobes are armed */ 1767 /* By default, kprobes are armed */
1245 kprobes_all_disarmed = false; 1768 kprobes_all_disarmed = false;
1246 1769
@@ -1259,7 +1782,7 @@ static int __init init_kprobes(void)
1259 1782
1260#ifdef CONFIG_DEBUG_FS 1783#ifdef CONFIG_DEBUG_FS
1261static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, 1784static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1262 const char *sym, int offset,char *modname) 1785 const char *sym, int offset, char *modname, struct kprobe *pp)
1263{ 1786{
1264 char *kprobe_type; 1787 char *kprobe_type;
1265 1788
@@ -1269,19 +1792,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1269 kprobe_type = "j"; 1792 kprobe_type = "j";
1270 else 1793 else
1271 kprobe_type = "k"; 1794 kprobe_type = "k";
1795
1272 if (sym) 1796 if (sym)
1273 seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", 1797 seq_printf(pi, "%p %s %s+0x%x %s ",
1274 p->addr, kprobe_type, sym, offset, 1798 p->addr, kprobe_type, sym, offset,
1275 (modname ? modname : " "), 1799 (modname ? modname : " "));
1276 (kprobe_gone(p) ? "[GONE]" : ""),
1277 ((kprobe_disabled(p) && !kprobe_gone(p)) ?
1278 "[DISABLED]" : ""));
1279 else 1800 else
1280 seq_printf(pi, "%p %s %p %s%s\n", 1801 seq_printf(pi, "%p %s %p ",
1281 p->addr, kprobe_type, p->addr, 1802 p->addr, kprobe_type, p->addr);
1282 (kprobe_gone(p) ? "[GONE]" : ""), 1803
1283 ((kprobe_disabled(p) && !kprobe_gone(p)) ? 1804 if (!pp)
1284 "[DISABLED]" : "")); 1805 pp = p;
1806 seq_printf(pi, "%s%s%s\n",
1807 (kprobe_gone(p) ? "[GONE]" : ""),
1808 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
1809 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
1285} 1810}
1286 1811
1287static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1812static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1317,11 +1842,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
1317 hlist_for_each_entry_rcu(p, node, head, hlist) { 1842 hlist_for_each_entry_rcu(p, node, head, hlist) {
1318 sym = kallsyms_lookup((unsigned long)p->addr, NULL, 1843 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
1319 &offset, &modname, namebuf); 1844 &offset, &modname, namebuf);
1320 if (p->pre_handler == aggr_pre_handler) { 1845 if (kprobe_aggrprobe(p)) {
1321 list_for_each_entry_rcu(kp, &p->list, list) 1846 list_for_each_entry_rcu(kp, &p->list, list)
1322 report_probe(pi, kp, sym, offset, modname); 1847 report_probe(pi, kp, sym, offset, modname, p);
1323 } else 1848 } else
1324 report_probe(pi, p, sym, offset, modname); 1849 report_probe(pi, p, sym, offset, modname, NULL);
1325 } 1850 }
1326 preempt_enable(); 1851 preempt_enable();
1327 return 0; 1852 return 0;
@@ -1346,71 +1871,6 @@ static const struct file_operations debugfs_kprobes_operations = {
1346 .release = seq_release, 1871 .release = seq_release,
1347}; 1872};
1348 1873
1349/* Disable one kprobe */
1350int __kprobes disable_kprobe(struct kprobe *kp)
1351{
1352 int ret = 0;
1353 struct kprobe *p;
1354
1355 mutex_lock(&kprobe_mutex);
1356
1357 /* Check whether specified probe is valid. */
1358 p = __get_valid_kprobe(kp);
1359 if (unlikely(p == NULL)) {
1360 ret = -EINVAL;
1361 goto out;
1362 }
1363
1364 /* If the probe is already disabled (or gone), just return */
1365 if (kprobe_disabled(kp))
1366 goto out;
1367
1368 kp->flags |= KPROBE_FLAG_DISABLED;
1369 if (p != kp)
1370 /* When kp != p, p is always enabled. */
1371 try_to_disable_aggr_kprobe(p);
1372
1373 if (!kprobes_all_disarmed && kprobe_disabled(p))
1374 disarm_kprobe(p);
1375out:
1376 mutex_unlock(&kprobe_mutex);
1377 return ret;
1378}
1379EXPORT_SYMBOL_GPL(disable_kprobe);
1380
1381/* Enable one kprobe */
1382int __kprobes enable_kprobe(struct kprobe *kp)
1383{
1384 int ret = 0;
1385 struct kprobe *p;
1386
1387 mutex_lock(&kprobe_mutex);
1388
1389 /* Check whether specified probe is valid. */
1390 p = __get_valid_kprobe(kp);
1391 if (unlikely(p == NULL)) {
1392 ret = -EINVAL;
1393 goto out;
1394 }
1395
1396 if (kprobe_gone(kp)) {
1397 /* This kprobe has gone, we couldn't enable it. */
1398 ret = -EINVAL;
1399 goto out;
1400 }
1401
1402 if (!kprobes_all_disarmed && kprobe_disabled(p))
1403 arm_kprobe(p);
1404
1405 p->flags &= ~KPROBE_FLAG_DISABLED;
1406 if (p != kp)
1407 kp->flags &= ~KPROBE_FLAG_DISABLED;
1408out:
1409 mutex_unlock(&kprobe_mutex);
1410 return ret;
1411}
1412EXPORT_SYMBOL_GPL(enable_kprobe);
1413
1414static void __kprobes arm_all_kprobes(void) 1874static void __kprobes arm_all_kprobes(void)
1415{ 1875{
1416 struct hlist_head *head; 1876 struct hlist_head *head;
@@ -1424,12 +1884,13 @@ static void __kprobes arm_all_kprobes(void)
1424 if (!kprobes_all_disarmed) 1884 if (!kprobes_all_disarmed)
1425 goto already_enabled; 1885 goto already_enabled;
1426 1886
1887 /* Arming kprobes doesn't optimize kprobe itself */
1427 mutex_lock(&text_mutex); 1888 mutex_lock(&text_mutex);
1428 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1889 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1429 head = &kprobe_table[i]; 1890 head = &kprobe_table[i];
1430 hlist_for_each_entry_rcu(p, node, head, hlist) 1891 hlist_for_each_entry_rcu(p, node, head, hlist)
1431 if (!kprobe_disabled(p)) 1892 if (!kprobe_disabled(p))
1432 arch_arm_kprobe(p); 1893 __arm_kprobe(p);
1433 } 1894 }
1434 mutex_unlock(&text_mutex); 1895 mutex_unlock(&text_mutex);
1435 1896
@@ -1456,16 +1917,23 @@ static void __kprobes disarm_all_kprobes(void)
1456 1917
1457 kprobes_all_disarmed = true; 1918 kprobes_all_disarmed = true;
1458 printk(KERN_INFO "Kprobes globally disabled\n"); 1919 printk(KERN_INFO "Kprobes globally disabled\n");
1920
1921 /*
1922 * Here we call get_online_cpus() to avoid a text_mutex deadlock,
1923 * because disarming may also unoptimize kprobes.
1924 */
1925 get_online_cpus();
1459 mutex_lock(&text_mutex); 1926 mutex_lock(&text_mutex);
1460 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1927 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1461 head = &kprobe_table[i]; 1928 head = &kprobe_table[i];
1462 hlist_for_each_entry_rcu(p, node, head, hlist) { 1929 hlist_for_each_entry_rcu(p, node, head, hlist) {
1463 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 1930 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1464 arch_disarm_kprobe(p); 1931 __disarm_kprobe(p);
1465 } 1932 }
1466 } 1933 }
1467 1934
1468 mutex_unlock(&text_mutex); 1935 mutex_unlock(&text_mutex);
1936 put_online_cpus();
1469 mutex_unlock(&kprobe_mutex); 1937 mutex_unlock(&kprobe_mutex);
1470 /* Allow all currently running kprobes to complete */ 1938 /* Allow all currently running kprobes to complete */
1471 synchronize_sched(); 1939 synchronize_sched();
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6b1ccc3f0205..21fe3c426948 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
33} 33}
34KERNEL_ATTR_RO(uevent_seqnum); 34KERNEL_ATTR_RO(uevent_seqnum);
35 35
36/* uevent helper program, used during early boo */ 36/* uevent helper program, used during early boot */
37static ssize_t uevent_helper_show(struct kobject *kobj, 37static ssize_t uevent_helper_show(struct kobject *kobj,
38 struct kobj_attribute *attr, char *buf) 38 struct kobj_attribute *attr, char *buf)
39{ 39{
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 82ed0ea15194..83911c780175 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -219,7 +219,7 @@ int kthreadd(void *unused)
219 set_task_comm(tsk, "kthreadd"); 219 set_task_comm(tsk, "kthreadd");
220 ignore_signals(tsk); 220 ignore_signals(tsk);
221 set_cpus_allowed_ptr(tsk, cpu_all_mask); 221 set_cpus_allowed_ptr(tsk, cpu_all_mask);
222 set_mems_allowed(node_possible_map); 222 set_mems_allowed(node_states[N_HIGH_MEMORY]);
223 223
224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
225 225
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index ca07c5c0c914..877fb306d415 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -56,7 +56,6 @@
56#include <linux/module.h> 56#include <linux/module.h>
57#include <linux/sched.h> 57#include <linux/sched.h>
58#include <linux/list.h> 58#include <linux/list.h>
59#include <linux/slab.h>
60#include <linux/stacktrace.h> 59#include <linux/stacktrace.h>
61 60
62static DEFINE_SPINLOCK(latency_lock); 61static DEFINE_SPINLOCK(latency_lock);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0c30d0455de1..ec21304856d1 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -43,6 +43,7 @@
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h> 45#include <linux/bitops.h>
46#include <linux/gfp.h>
46 47
47#include <asm/sections.h> 48#include <asm/sections.h>
48 49
@@ -430,20 +431,7 @@ static struct stack_trace lockdep_init_trace = {
430/* 431/*
431 * Various lockdep statistics: 432 * Various lockdep statistics:
432 */ 433 */
433atomic_t chain_lookup_hits; 434DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
434atomic_t chain_lookup_misses;
435atomic_t hardirqs_on_events;
436atomic_t hardirqs_off_events;
437atomic_t redundant_hardirqs_on;
438atomic_t redundant_hardirqs_off;
439atomic_t softirqs_on_events;
440atomic_t softirqs_off_events;
441atomic_t redundant_softirqs_on;
442atomic_t redundant_softirqs_off;
443atomic_t nr_unused_locks;
444atomic_t nr_cyclic_checks;
445atomic_t nr_find_usage_forwards_checks;
446atomic_t nr_find_usage_backwards_checks;
447#endif 435#endif
448 436
449/* 437/*
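The block of global atomic_t counters becomes a single DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats), so debug_atomic_inc() turns into a cheap local per-CPU increment and readers fold the copies together at report time. The shape of that pattern as a kernel-style sketch (the struct fields and names are illustrative):

    #include <linux/percpu.h>
    #include <linux/cpumask.h>

    struct my_stats {
            unsigned long hits;
            unsigned long misses;
    };

    static DEFINE_PER_CPU(struct my_stats, my_stats);

    /* writer side: plain per-CPU increment, no global cache-line bouncing */
    #define my_stats_inc(field)     this_cpu_inc(my_stats.field)

    /* reader side: fold the per-CPU copies together at report time */
    static unsigned long my_stats_sum_hits(void)
    {
            unsigned long sum = 0;
            int cpu;

            for_each_possible_cpu(cpu)
                    sum += per_cpu(my_stats, cpu).hits;
            return sum;
    }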
@@ -582,9 +570,6 @@ static int static_obj(void *obj)
582 unsigned long start = (unsigned long) &_stext, 570 unsigned long start = (unsigned long) &_stext,
583 end = (unsigned long) &_end, 571 end = (unsigned long) &_end,
584 addr = (unsigned long) obj; 572 addr = (unsigned long) obj;
585#ifdef CONFIG_SMP
586 int i;
587#endif
588 573
589 /* 574 /*
590 * static variable? 575 * static variable?
@@ -595,24 +580,16 @@ static int static_obj(void *obj)
595 if (arch_is_kernel_data(addr)) 580 if (arch_is_kernel_data(addr))
596 return 1; 581 return 1;
597 582
598#ifdef CONFIG_SMP
599 /* 583 /*
600 * percpu var? 584 * in-kernel percpu var?
601 */ 585 */
602 for_each_possible_cpu(i) { 586 if (is_kernel_percpu_address(addr))
603 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); 587 return 1;
604 end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
605 + per_cpu_offset(i);
606
607 if ((addr >= start) && (addr < end))
608 return 1;
609 }
610#endif
611 588
612 /* 589 /*
613 * module var? 590 * module static or percpu var?
614 */ 591 */
615 return is_module_address(addr); 592 return is_module_address(addr) || is_module_percpu_address(addr);
616} 593}
617 594
618/* 595/*
@@ -758,7 +735,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
758 return NULL; 735 return NULL;
759 } 736 }
760 class = lock_classes + nr_lock_classes++; 737 class = lock_classes + nr_lock_classes++;
761 debug_atomic_inc(&nr_unused_locks); 738 debug_atomic_inc(nr_unused_locks);
762 class->key = key; 739 class->key = key;
763 class->name = lock->name; 740 class->name = lock->name;
764 class->subclass = subclass; 741 class->subclass = subclass;
@@ -828,7 +805,8 @@ static struct lock_list *alloc_list_entry(void)
828 * Add a new dependency to the head of the list: 805 * Add a new dependency to the head of the list:
829 */ 806 */
830static int add_lock_to_list(struct lock_class *class, struct lock_class *this, 807static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
831 struct list_head *head, unsigned long ip, int distance) 808 struct list_head *head, unsigned long ip,
809 int distance, struct stack_trace *trace)
832{ 810{
833 struct lock_list *entry; 811 struct lock_list *entry;
834 /* 812 /*
@@ -839,11 +817,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
839 if (!entry) 817 if (!entry)
840 return 0; 818 return 0;
841 819
842 if (!save_trace(&entry->trace))
843 return 0;
844
845 entry->class = this; 820 entry->class = this;
846 entry->distance = distance; 821 entry->distance = distance;
822 entry->trace = *trace;
847 /* 823 /*
848 * Since we never remove from the dependency list, the list can 824 * Since we never remove from the dependency list, the list can
849 * be walked lockless by other CPUs, it's only allocation 825 * be walked lockless by other CPUs, it's only allocation
@@ -1215,7 +1191,7 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
1215{ 1191{
1216 int result; 1192 int result;
1217 1193
1218 debug_atomic_inc(&nr_cyclic_checks); 1194 debug_atomic_inc(nr_cyclic_checks);
1219 1195
1220 result = __bfs_forwards(root, target, class_equal, target_entry); 1196 result = __bfs_forwards(root, target, class_equal, target_entry);
1221 1197
@@ -1252,7 +1228,7 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1252{ 1228{
1253 int result; 1229 int result;
1254 1230
1255 debug_atomic_inc(&nr_find_usage_forwards_checks); 1231 debug_atomic_inc(nr_find_usage_forwards_checks);
1256 1232
1257 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); 1233 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
1258 1234
@@ -1275,7 +1251,7 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
1275{ 1251{
1276 int result; 1252 int result;
1277 1253
1278 debug_atomic_inc(&nr_find_usage_backwards_checks); 1254 debug_atomic_inc(nr_find_usage_backwards_checks);
1279 1255
1280 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); 1256 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
1281 1257
@@ -1645,12 +1621,20 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1645 */ 1621 */
1646static int 1622static int
1647check_prev_add(struct task_struct *curr, struct held_lock *prev, 1623check_prev_add(struct task_struct *curr, struct held_lock *prev,
1648 struct held_lock *next, int distance) 1624 struct held_lock *next, int distance, int trylock_loop)
1649{ 1625{
1650 struct lock_list *entry; 1626 struct lock_list *entry;
1651 int ret; 1627 int ret;
1652 struct lock_list this; 1628 struct lock_list this;
1653 struct lock_list *uninitialized_var(target_entry); 1629 struct lock_list *uninitialized_var(target_entry);
1630 /*
1631 * Static variable, serialized by the graph_lock().
1632 *
1633 * We use this static variable to save the stack trace in case
1634 * we call into this function multiple times due to encountering
1635 * trylocks in the held lock stack.
1636 */
1637 static struct stack_trace trace;
1654 1638
1655 /* 1639 /*
1656 * Prove that the new <prev> -> <next> dependency would not 1640 * Prove that the new <prev> -> <next> dependency would not
@@ -1698,20 +1682,23 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1698 } 1682 }
1699 } 1683 }
1700 1684
1685 if (!trylock_loop && !save_trace(&trace))
1686 return 0;
1687
1701 /* 1688 /*
1702 * Ok, all validations passed, add the new lock 1689 * Ok, all validations passed, add the new lock
1703 * to the previous lock's dependency list: 1690 * to the previous lock's dependency list:
1704 */ 1691 */
1705 ret = add_lock_to_list(hlock_class(prev), hlock_class(next), 1692 ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
1706 &hlock_class(prev)->locks_after, 1693 &hlock_class(prev)->locks_after,
1707 next->acquire_ip, distance); 1694 next->acquire_ip, distance, &trace);
1708 1695
1709 if (!ret) 1696 if (!ret)
1710 return 0; 1697 return 0;
1711 1698
1712 ret = add_lock_to_list(hlock_class(next), hlock_class(prev), 1699 ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
1713 &hlock_class(next)->locks_before, 1700 &hlock_class(next)->locks_before,
1714 next->acquire_ip, distance); 1701 next->acquire_ip, distance, &trace);
1715 if (!ret) 1702 if (!ret)
1716 return 0; 1703 return 0;
1717 1704
@@ -1741,6 +1728,7 @@ static int
1741check_prevs_add(struct task_struct *curr, struct held_lock *next) 1728check_prevs_add(struct task_struct *curr, struct held_lock *next)
1742{ 1729{
1743 int depth = curr->lockdep_depth; 1730 int depth = curr->lockdep_depth;
1731 int trylock_loop = 0;
1744 struct held_lock *hlock; 1732 struct held_lock *hlock;
1745 1733
1746 /* 1734 /*
@@ -1766,7 +1754,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1766 * added: 1754 * added:
1767 */ 1755 */
1768 if (hlock->read != 2) { 1756 if (hlock->read != 2) {
1769 if (!check_prev_add(curr, hlock, next, distance)) 1757 if (!check_prev_add(curr, hlock, next,
1758 distance, trylock_loop))
1770 return 0; 1759 return 0;
1771 /* 1760 /*
1772 * Stop after the first non-trylock entry, 1761 * Stop after the first non-trylock entry,
@@ -1789,6 +1778,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1789 if (curr->held_locks[depth].irq_context != 1778 if (curr->held_locks[depth].irq_context !=
1790 curr->held_locks[depth-1].irq_context) 1779 curr->held_locks[depth-1].irq_context)
1791 break; 1780 break;
1781 trylock_loop = 1;
1792 } 1782 }
1793 return 1; 1783 return 1;
1794out_bug: 1784out_bug:
@@ -1835,7 +1825,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1835 list_for_each_entry(chain, hash_head, entry) { 1825 list_for_each_entry(chain, hash_head, entry) {
1836 if (chain->chain_key == chain_key) { 1826 if (chain->chain_key == chain_key) {
1837cache_hit: 1827cache_hit:
1838 debug_atomic_inc(&chain_lookup_hits); 1828 debug_atomic_inc(chain_lookup_hits);
1839 if (very_verbose(class)) 1829 if (very_verbose(class))
1840 printk("\nhash chain already cached, key: " 1830 printk("\nhash chain already cached, key: "
1841 "%016Lx tail class: [%p] %s\n", 1831 "%016Lx tail class: [%p] %s\n",
@@ -1900,7 +1890,7 @@ cache_hit:
1900 chain_hlocks[chain->base + j] = class - lock_classes; 1890 chain_hlocks[chain->base + j] = class - lock_classes;
1901 } 1891 }
1902 list_add_tail_rcu(&chain->entry, hash_head); 1892 list_add_tail_rcu(&chain->entry, hash_head);
1903 debug_atomic_inc(&chain_lookup_misses); 1893 debug_atomic_inc(chain_lookup_misses);
1904 inc_chains(); 1894 inc_chains();
1905 1895
1906 return 1; 1896 return 1;
@@ -2321,7 +2311,12 @@ void trace_hardirqs_on_caller(unsigned long ip)
2321 return; 2311 return;
2322 2312
2323 if (unlikely(curr->hardirqs_enabled)) { 2313 if (unlikely(curr->hardirqs_enabled)) {
2324 debug_atomic_inc(&redundant_hardirqs_on); 2314 /*
 2315 * Neither irq nor preemption is disabled here
 2316 * so this is racy by nature but losing one hit
2317 * in a stat is not a big deal.
2318 */
2319 __debug_atomic_inc(redundant_hardirqs_on);
2325 return; 2320 return;
2326 } 2321 }
2327 /* we'll do an OFF -> ON transition: */ 2322 /* we'll do an OFF -> ON transition: */
@@ -2348,7 +2343,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2348 2343
2349 curr->hardirq_enable_ip = ip; 2344 curr->hardirq_enable_ip = ip;
2350 curr->hardirq_enable_event = ++curr->irq_events; 2345 curr->hardirq_enable_event = ++curr->irq_events;
2351 debug_atomic_inc(&hardirqs_on_events); 2346 debug_atomic_inc(hardirqs_on_events);
2352} 2347}
2353EXPORT_SYMBOL(trace_hardirqs_on_caller); 2348EXPORT_SYMBOL(trace_hardirqs_on_caller);
2354 2349
@@ -2380,9 +2375,9 @@ void trace_hardirqs_off_caller(unsigned long ip)
2380 curr->hardirqs_enabled = 0; 2375 curr->hardirqs_enabled = 0;
2381 curr->hardirq_disable_ip = ip; 2376 curr->hardirq_disable_ip = ip;
2382 curr->hardirq_disable_event = ++curr->irq_events; 2377 curr->hardirq_disable_event = ++curr->irq_events;
2383 debug_atomic_inc(&hardirqs_off_events); 2378 debug_atomic_inc(hardirqs_off_events);
2384 } else 2379 } else
2385 debug_atomic_inc(&redundant_hardirqs_off); 2380 debug_atomic_inc(redundant_hardirqs_off);
2386} 2381}
2387EXPORT_SYMBOL(trace_hardirqs_off_caller); 2382EXPORT_SYMBOL(trace_hardirqs_off_caller);
2388 2383
@@ -2406,7 +2401,7 @@ void trace_softirqs_on(unsigned long ip)
2406 return; 2401 return;
2407 2402
2408 if (curr->softirqs_enabled) { 2403 if (curr->softirqs_enabled) {
2409 debug_atomic_inc(&redundant_softirqs_on); 2404 debug_atomic_inc(redundant_softirqs_on);
2410 return; 2405 return;
2411 } 2406 }
2412 2407
@@ -2416,7 +2411,7 @@ void trace_softirqs_on(unsigned long ip)
2416 curr->softirqs_enabled = 1; 2411 curr->softirqs_enabled = 1;
2417 curr->softirq_enable_ip = ip; 2412 curr->softirq_enable_ip = ip;
2418 curr->softirq_enable_event = ++curr->irq_events; 2413 curr->softirq_enable_event = ++curr->irq_events;
2419 debug_atomic_inc(&softirqs_on_events); 2414 debug_atomic_inc(softirqs_on_events);
2420 /* 2415 /*
2421 * We are going to turn softirqs on, so set the 2416 * We are going to turn softirqs on, so set the
2422 * usage bit for all held locks, if hardirqs are 2417 * usage bit for all held locks, if hardirqs are
@@ -2446,10 +2441,10 @@ void trace_softirqs_off(unsigned long ip)
2446 curr->softirqs_enabled = 0; 2441 curr->softirqs_enabled = 0;
2447 curr->softirq_disable_ip = ip; 2442 curr->softirq_disable_ip = ip;
2448 curr->softirq_disable_event = ++curr->irq_events; 2443 curr->softirq_disable_event = ++curr->irq_events;
2449 debug_atomic_inc(&softirqs_off_events); 2444 debug_atomic_inc(softirqs_off_events);
2450 DEBUG_LOCKS_WARN_ON(!softirq_count()); 2445 DEBUG_LOCKS_WARN_ON(!softirq_count());
2451 } else 2446 } else
2452 debug_atomic_inc(&redundant_softirqs_off); 2447 debug_atomic_inc(redundant_softirqs_off);
2453} 2448}
2454 2449
2455static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) 2450static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
@@ -2654,7 +2649,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2654 return 0; 2649 return 0;
2655 break; 2650 break;
2656 case LOCK_USED: 2651 case LOCK_USED:
2657 debug_atomic_dec(&nr_unused_locks); 2652 debug_atomic_dec(nr_unused_locks);
2658 break; 2653 break;
2659 default: 2654 default:
2660 if (!debug_locks_off_graph_unlock()) 2655 if (!debug_locks_off_graph_unlock())
@@ -2760,7 +2755,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2760 if (!class) 2755 if (!class)
2761 return 0; 2756 return 0;
2762 } 2757 }
2763 debug_atomic_inc((atomic_t *)&class->ops); 2758 atomic_inc((atomic_t *)&class->ops);
2764 if (very_verbose(class)) { 2759 if (very_verbose(class)) {
2765 printk("\nacquire class [%p] %s", class->key, class->name); 2760 printk("\nacquire class [%p] %s", class->key, class->name);
2766 if (class->name_version > 1) 2761 if (class->name_version > 1)
@@ -3211,8 +3206,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3211{ 3206{
3212 unsigned long flags; 3207 unsigned long flags;
3213 3208
3214 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
3215
3216 if (unlikely(current->lockdep_recursion)) 3209 if (unlikely(current->lockdep_recursion))
3217 return; 3210 return;
3218 3211
@@ -3220,6 +3213,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3220 check_flags(flags); 3213 check_flags(flags);
3221 3214
3222 current->lockdep_recursion = 1; 3215 current->lockdep_recursion = 1;
3216 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
3223 __lock_acquire(lock, subclass, trylock, read, check, 3217 __lock_acquire(lock, subclass, trylock, read, check,
3224 irqs_disabled_flags(flags), nest_lock, ip, 0); 3218 irqs_disabled_flags(flags), nest_lock, ip, 0);
3225 current->lockdep_recursion = 0; 3219 current->lockdep_recursion = 0;
@@ -3232,14 +3226,13 @@ void lock_release(struct lockdep_map *lock, int nested,
3232{ 3226{
3233 unsigned long flags; 3227 unsigned long flags;
3234 3228
3235 trace_lock_release(lock, nested, ip);
3236
3237 if (unlikely(current->lockdep_recursion)) 3229 if (unlikely(current->lockdep_recursion))
3238 return; 3230 return;
3239 3231
3240 raw_local_irq_save(flags); 3232 raw_local_irq_save(flags);
3241 check_flags(flags); 3233 check_flags(flags);
3242 current->lockdep_recursion = 1; 3234 current->lockdep_recursion = 1;
3235 trace_lock_release(lock, ip);
3243 __lock_release(lock, nested, ip); 3236 __lock_release(lock, nested, ip);
3244 current->lockdep_recursion = 0; 3237 current->lockdep_recursion = 0;
3245 raw_local_irq_restore(flags); 3238 raw_local_irq_restore(flags);
@@ -3392,7 +3385,7 @@ found_it:
3392 hlock->holdtime_stamp = now; 3385 hlock->holdtime_stamp = now;
3393 } 3386 }
3394 3387
3395 trace_lock_acquired(lock, ip, waittime); 3388 trace_lock_acquired(lock, ip);
3396 3389
3397 stats = get_lock_stats(hlock_class(hlock)); 3390 stats = get_lock_stats(hlock_class(hlock));
3398 if (waittime) { 3391 if (waittime) {
@@ -3413,8 +3406,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3413{ 3406{
3414 unsigned long flags; 3407 unsigned long flags;
3415 3408
3416 trace_lock_contended(lock, ip);
3417
3418 if (unlikely(!lock_stat)) 3409 if (unlikely(!lock_stat))
3419 return; 3410 return;
3420 3411
@@ -3424,6 +3415,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3424 raw_local_irq_save(flags); 3415 raw_local_irq_save(flags);
3425 check_flags(flags); 3416 check_flags(flags);
3426 current->lockdep_recursion = 1; 3417 current->lockdep_recursion = 1;
3418 trace_lock_contended(lock, ip);
3427 __lock_contended(lock, ip); 3419 __lock_contended(lock, ip);
3428 current->lockdep_recursion = 0; 3420 current->lockdep_recursion = 0;
3429 raw_local_irq_restore(flags); 3421 raw_local_irq_restore(flags);
@@ -3814,14 +3806,18 @@ void lockdep_rcu_dereference(const char *file, const int line)
3814{ 3806{
3815 struct task_struct *curr = current; 3807 struct task_struct *curr = current;
3816 3808
3809#ifndef CONFIG_PROVE_RCU_REPEATEDLY
3817 if (!debug_locks_off()) 3810 if (!debug_locks_off())
3818 return; 3811 return;
 3812#endif /* #ifndef CONFIG_PROVE_RCU_REPEATEDLY */
3813 /* Note: the following can be executed concurrently, so be careful. */
3819 printk("\n===================================================\n"); 3814 printk("\n===================================================\n");
3820 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); 3815 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
3821 printk( "---------------------------------------------------\n"); 3816 printk( "---------------------------------------------------\n");
3822 printk("%s:%d invoked rcu_dereference_check() without protection!\n", 3817 printk("%s:%d invoked rcu_dereference_check() without protection!\n",
3823 file, line); 3818 file, line);
3824 printk("\nother info that might help us debug this:\n\n"); 3819 printk("\nother info that might help us debug this:\n\n");
3820 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
3825 lockdep_print_held_locks(curr); 3821 lockdep_print_held_locks(curr);
3826 printk("\nstack backtrace:\n"); 3822 printk("\nstack backtrace:\n");
3827 dump_stack(); 3823 dump_stack();
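A note on the check_prev_add() hunks above: walking the held-lock stack past consecutive trylocks can invoke check_prev_add() several times while validating a single new lock, and each invocation used to capture its own (identical) stack trace. The new trylock_loop flag makes the first iteration save the trace into a static buffer, which is safe because every caller holds graph_lock(), and later iterations reuse it. Below is a condensed, illustrative sketch of the loop in check_prevs_add(), not the verbatim kernel code; the distance argument is pinned to 1 for brevity.

    static int check_prevs_add_sketch(struct task_struct *curr,
                                      struct held_lock *next)
    {
            int depth = curr->lockdep_depth;
            int trylock_loop = 0;

            for (;;) {
                    struct held_lock *hlock = curr->held_locks + depth - 1;

                    /*
                     * check_prev_add() calls save_trace() only when
                     * trylock_loop == 0, then reuses the static copy
                     * on every later pass.
                     */
                    if (hlock->read != 2 &&
                        !check_prev_add(curr, hlock, next, 1, trylock_loop))
                            return 0;
                    if (!hlock->trylock)
                            break;          /* first non-trylock entry: done */
                    if (--depth <= 0)
                            break;
                    trylock_loop = 1;
            }
            return 1;
    }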
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index a2ee95ad1313..4f560cfedc8f 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -110,30 +110,60 @@ lockdep_count_backward_deps(struct lock_class *class)
110#endif 110#endif
111 111
112#ifdef CONFIG_DEBUG_LOCKDEP 112#ifdef CONFIG_DEBUG_LOCKDEP
113
114#include <asm/local.h>
113/* 115/*
114 * Various lockdep statistics: 116 * Various lockdep statistics.
 117 * We want them per cpu as they are often accessed in the fast path
118 * and we want to avoid too much cache bouncing.
115 */ 119 */
116extern atomic_t chain_lookup_hits; 120struct lockdep_stats {
117extern atomic_t chain_lookup_misses; 121 int chain_lookup_hits;
118extern atomic_t hardirqs_on_events; 122 int chain_lookup_misses;
119extern atomic_t hardirqs_off_events; 123 int hardirqs_on_events;
120extern atomic_t redundant_hardirqs_on; 124 int hardirqs_off_events;
121extern atomic_t redundant_hardirqs_off; 125 int redundant_hardirqs_on;
122extern atomic_t softirqs_on_events; 126 int redundant_hardirqs_off;
123extern atomic_t softirqs_off_events; 127 int softirqs_on_events;
124extern atomic_t redundant_softirqs_on; 128 int softirqs_off_events;
125extern atomic_t redundant_softirqs_off; 129 int redundant_softirqs_on;
126extern atomic_t nr_unused_locks; 130 int redundant_softirqs_off;
127extern atomic_t nr_cyclic_checks; 131 int nr_unused_locks;
128extern atomic_t nr_cyclic_check_recursions; 132 int nr_cyclic_checks;
129extern atomic_t nr_find_usage_forwards_checks; 133 int nr_cyclic_check_recursions;
130extern atomic_t nr_find_usage_forwards_recursions; 134 int nr_find_usage_forwards_checks;
131extern atomic_t nr_find_usage_backwards_checks; 135 int nr_find_usage_forwards_recursions;
132extern atomic_t nr_find_usage_backwards_recursions; 136 int nr_find_usage_backwards_checks;
133# define debug_atomic_inc(ptr) atomic_inc(ptr) 137 int nr_find_usage_backwards_recursions;
134# define debug_atomic_dec(ptr) atomic_dec(ptr) 138};
135# define debug_atomic_read(ptr) atomic_read(ptr) 139
140DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
141
142#define __debug_atomic_inc(ptr) \
143 this_cpu_inc(lockdep_stats.ptr);
144
145#define debug_atomic_inc(ptr) { \
146 WARN_ON_ONCE(!irqs_disabled()); \
147 __this_cpu_inc(lockdep_stats.ptr); \
148}
149
150#define debug_atomic_dec(ptr) { \
151 WARN_ON_ONCE(!irqs_disabled()); \
152 __this_cpu_dec(lockdep_stats.ptr); \
153}
154
155#define debug_atomic_read(ptr) ({ \
156 struct lockdep_stats *__cpu_lockdep_stats; \
157 unsigned long long __total = 0; \
158 int __cpu; \
159 for_each_possible_cpu(__cpu) { \
160 __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \
161 __total += __cpu_lockdep_stats->ptr; \
162 } \
163 __total; \
164})
136#else 165#else
166# define __debug_atomic_inc(ptr) do { } while (0)
137# define debug_atomic_inc(ptr) do { } while (0) 167# define debug_atomic_inc(ptr) do { } while (0)
138# define debug_atomic_dec(ptr) do { } while (0) 168# define debug_atomic_dec(ptr) do { } while (0)
139# define debug_atomic_read(ptr) 0 169# define debug_atomic_read(ptr) 0
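This hunk replaces sixteen global atomic_t counters with one per-cpu structure: the hot-path increment now touches only the local CPU's cache line, and the rare readers pay the cost of summing over all CPUs instead. That summation is also why lockdep_proc.c below switches its format strings to %llu. A minimal sketch of the same scheme with made-up names; DEFINE_PER_CPU, __this_cpu_inc, per_cpu and for_each_possible_cpu are the real primitives.

    #include <linux/percpu.h>

    struct my_stats {
            int fast_path_hits;
    };
    static DEFINE_PER_CPU(struct my_stats, my_stats);

    static void fast_path(void)
    {
            /*
             * Lock-free and bounce-free. The double-underscore variant
             * assumes irqs/preemption are already off, which is exactly
             * what the new debug_atomic_inc() asserts via WARN_ON_ONCE().
             */
            __this_cpu_inc(my_stats.fast_path_hits);
    }

    static unsigned long long read_hits(void)
    {
            unsigned long long total = 0;
            int cpu;

            for_each_possible_cpu(cpu)
                    total += per_cpu(my_stats, cpu).fast_path_hits;
            return total;           /* a snapshot; CPUs keep counting */
    }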
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d4aba4f3584c..59b76c8ce9d7 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -184,34 +184,34 @@ static const struct file_operations proc_lockdep_chains_operations = {
184static void lockdep_stats_debug_show(struct seq_file *m) 184static void lockdep_stats_debug_show(struct seq_file *m)
185{ 185{
186#ifdef CONFIG_DEBUG_LOCKDEP 186#ifdef CONFIG_DEBUG_LOCKDEP
187 unsigned int hi1 = debug_atomic_read(&hardirqs_on_events), 187 unsigned long long hi1 = debug_atomic_read(hardirqs_on_events),
188 hi2 = debug_atomic_read(&hardirqs_off_events), 188 hi2 = debug_atomic_read(hardirqs_off_events),
189 hr1 = debug_atomic_read(&redundant_hardirqs_on), 189 hr1 = debug_atomic_read(redundant_hardirqs_on),
190 hr2 = debug_atomic_read(&redundant_hardirqs_off), 190 hr2 = debug_atomic_read(redundant_hardirqs_off),
191 si1 = debug_atomic_read(&softirqs_on_events), 191 si1 = debug_atomic_read(softirqs_on_events),
192 si2 = debug_atomic_read(&softirqs_off_events), 192 si2 = debug_atomic_read(softirqs_off_events),
193 sr1 = debug_atomic_read(&redundant_softirqs_on), 193 sr1 = debug_atomic_read(redundant_softirqs_on),
194 sr2 = debug_atomic_read(&redundant_softirqs_off); 194 sr2 = debug_atomic_read(redundant_softirqs_off);
195 195
196 seq_printf(m, " chain lookup misses: %11u\n", 196 seq_printf(m, " chain lookup misses: %11llu\n",
197 debug_atomic_read(&chain_lookup_misses)); 197 debug_atomic_read(chain_lookup_misses));
198 seq_printf(m, " chain lookup hits: %11u\n", 198 seq_printf(m, " chain lookup hits: %11llu\n",
199 debug_atomic_read(&chain_lookup_hits)); 199 debug_atomic_read(chain_lookup_hits));
200 seq_printf(m, " cyclic checks: %11u\n", 200 seq_printf(m, " cyclic checks: %11llu\n",
201 debug_atomic_read(&nr_cyclic_checks)); 201 debug_atomic_read(nr_cyclic_checks));
202 seq_printf(m, " find-mask forwards checks: %11u\n", 202 seq_printf(m, " find-mask forwards checks: %11llu\n",
203 debug_atomic_read(&nr_find_usage_forwards_checks)); 203 debug_atomic_read(nr_find_usage_forwards_checks));
204 seq_printf(m, " find-mask backwards checks: %11u\n", 204 seq_printf(m, " find-mask backwards checks: %11llu\n",
205 debug_atomic_read(&nr_find_usage_backwards_checks)); 205 debug_atomic_read(nr_find_usage_backwards_checks));
206 206
207 seq_printf(m, " hardirq on events: %11u\n", hi1); 207 seq_printf(m, " hardirq on events: %11llu\n", hi1);
208 seq_printf(m, " hardirq off events: %11u\n", hi2); 208 seq_printf(m, " hardirq off events: %11llu\n", hi2);
209 seq_printf(m, " redundant hardirq ons: %11u\n", hr1); 209 seq_printf(m, " redundant hardirq ons: %11llu\n", hr1);
210 seq_printf(m, " redundant hardirq offs: %11u\n", hr2); 210 seq_printf(m, " redundant hardirq offs: %11llu\n", hr2);
211 seq_printf(m, " softirq on events: %11u\n", si1); 211 seq_printf(m, " softirq on events: %11llu\n", si1);
212 seq_printf(m, " softirq off events: %11u\n", si2); 212 seq_printf(m, " softirq off events: %11llu\n", si2);
213 seq_printf(m, " redundant softirq ons: %11u\n", sr1); 213 seq_printf(m, " redundant softirq ons: %11llu\n", sr1);
214 seq_printf(m, " redundant softirq offs: %11u\n", sr2); 214 seq_printf(m, " redundant softirq offs: %11llu\n", sr2);
215#endif 215#endif
216} 216}
217 217
@@ -263,7 +263,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
263#endif 263#endif
264 } 264 }
265#ifdef CONFIG_DEBUG_LOCKDEP 265#ifdef CONFIG_DEBUG_LOCKDEP
266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); 266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
267#endif 267#endif
268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n", 268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
269 nr_lock_classes, MAX_LOCKDEP_KEYS); 269 nr_lock_classes, MAX_LOCKDEP_KEYS);
diff --git a/kernel/module.c b/kernel/module.c
index f82386bd9ee9..e2564580f3f1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -59,8 +59,6 @@
59#define CREATE_TRACE_POINTS 59#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 60#include <trace/events/module.h>
61 61
62EXPORT_TRACEPOINT_SYMBOL(module_get);
63
64#if 0 62#if 0
65#define DEBUGP printk 63#define DEBUGP printk
66#else 64#else
@@ -370,27 +368,33 @@ EXPORT_SYMBOL_GPL(find_module);
370 368
371#ifdef CONFIG_SMP 369#ifdef CONFIG_SMP
372 370
373static void *percpu_modalloc(unsigned long size, unsigned long align, 371static inline void __percpu *mod_percpu(struct module *mod)
374 const char *name)
375{ 372{
376 void *ptr; 373 return mod->percpu;
374}
377 375
376static int percpu_modalloc(struct module *mod,
377 unsigned long size, unsigned long align)
378{
378 if (align > PAGE_SIZE) { 379 if (align > PAGE_SIZE) {
379 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 380 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
380 name, align, PAGE_SIZE); 381 mod->name, align, PAGE_SIZE);
381 align = PAGE_SIZE; 382 align = PAGE_SIZE;
382 } 383 }
383 384
384 ptr = __alloc_reserved_percpu(size, align); 385 mod->percpu = __alloc_reserved_percpu(size, align);
385 if (!ptr) 386 if (!mod->percpu) {
386 printk(KERN_WARNING 387 printk(KERN_WARNING
387 "Could not allocate %lu bytes percpu data\n", size); 388 "Could not allocate %lu bytes percpu data\n", size);
388 return ptr; 389 return -ENOMEM;
390 }
391 mod->percpu_size = size;
392 return 0;
389} 393}
390 394
391static void percpu_modfree(void *freeme) 395static void percpu_modfree(struct module *mod)
392{ 396{
393 free_percpu(freeme); 397 free_percpu(mod->percpu);
394} 398}
395 399
396static unsigned int find_pcpusec(Elf_Ehdr *hdr, 400static unsigned int find_pcpusec(Elf_Ehdr *hdr,
@@ -400,24 +404,62 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
400 return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); 404 return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
401} 405}
402 406
403static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) 407static void percpu_modcopy(struct module *mod,
408 const void *from, unsigned long size)
404{ 409{
405 int cpu; 410 int cpu;
406 411
407 for_each_possible_cpu(cpu) 412 for_each_possible_cpu(cpu)
408 memcpy(pcpudest + per_cpu_offset(cpu), from, size); 413 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
414}
415
416/**
417 * is_module_percpu_address - test whether address is from module static percpu
418 * @addr: address to test
419 *
420 * Test whether @addr belongs to module static percpu area.
421 *
422 * RETURNS:
423 * %true if @addr is from module static percpu area
424 */
425bool is_module_percpu_address(unsigned long addr)
426{
427 struct module *mod;
428 unsigned int cpu;
429
430 preempt_disable();
431
432 list_for_each_entry_rcu(mod, &modules, list) {
433 if (!mod->percpu_size)
434 continue;
435 for_each_possible_cpu(cpu) {
436 void *start = per_cpu_ptr(mod->percpu, cpu);
437
438 if ((void *)addr >= start &&
439 (void *)addr < start + mod->percpu_size) {
440 preempt_enable();
441 return true;
442 }
443 }
444 }
445
446 preempt_enable();
447 return false;
409} 448}
410 449
411#else /* ... !CONFIG_SMP */ 450#else /* ... !CONFIG_SMP */
412 451
413static inline void *percpu_modalloc(unsigned long size, unsigned long align, 452static inline void __percpu *mod_percpu(struct module *mod)
414 const char *name)
415{ 453{
416 return NULL; 454 return NULL;
417} 455}
418static inline void percpu_modfree(void *pcpuptr) 456static inline int percpu_modalloc(struct module *mod,
457 unsigned long size, unsigned long align)
458{
459 return -ENOMEM;
460}
461static inline void percpu_modfree(struct module *mod)
419{ 462{
420 BUG();
421} 463}
422static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, 464static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
423 Elf_Shdr *sechdrs, 465 Elf_Shdr *sechdrs,
@@ -425,12 +467,16 @@ static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
425{ 467{
426 return 0; 468 return 0;
427} 469}
428static inline void percpu_modcopy(void *pcpudst, const void *src, 470static inline void percpu_modcopy(struct module *mod,
429 unsigned long size) 471 const void *from, unsigned long size)
430{ 472{
431 /* pcpusec should be 0, and size of that section should be 0. */ 473 /* pcpusec should be 0, and size of that section should be 0. */
432 BUG_ON(size != 0); 474 BUG_ON(size != 0);
433} 475}
476bool is_module_percpu_address(unsigned long addr)
477{
478 return false;
479}
434 480
435#endif /* CONFIG_SMP */ 481#endif /* CONFIG_SMP */
436 482
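The new is_module_percpu_address() lets code that holds nothing but an address decide whether it points into some module's static per-cpu area; lockdep's static_obj() is the kind of consumer this enables. A hedged example of such a caller, pairing it with is_kernel_percpu_address(), the matching helper added alongside it for the core kernel's static percpu section:

    static int is_static_storage(unsigned long addr)
    {
            if (is_kernel_percpu_address(addr))
                    return 1;       /* core kernel static percpu area */
            return is_module_percpu_address(addr);
    }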
@@ -467,16 +513,22 @@ MODINFO_ATTR(srcversion);
467static char last_unloaded_module[MODULE_NAME_LEN+1]; 513static char last_unloaded_module[MODULE_NAME_LEN+1];
468 514
469#ifdef CONFIG_MODULE_UNLOAD 515#ifdef CONFIG_MODULE_UNLOAD
516
517EXPORT_TRACEPOINT_SYMBOL(module_get);
518
470/* Init the unload section of the module. */ 519/* Init the unload section of the module. */
471static void module_unload_init(struct module *mod) 520static void module_unload_init(struct module *mod)
472{ 521{
473 int cpu; 522 int cpu;
474 523
475 INIT_LIST_HEAD(&mod->modules_which_use_me); 524 INIT_LIST_HEAD(&mod->modules_which_use_me);
476 for_each_possible_cpu(cpu) 525 for_each_possible_cpu(cpu) {
477 local_set(__module_ref_addr(mod, cpu), 0); 526 per_cpu_ptr(mod->refptr, cpu)->incs = 0;
527 per_cpu_ptr(mod->refptr, cpu)->decs = 0;
528 }
529
478 /* Hold reference count during initialization. */ 530 /* Hold reference count during initialization. */
479 local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); 531 __this_cpu_write(mod->refptr->incs, 1);
480 /* Backwards compatibility macros put refcount during init. */ 532 /* Backwards compatibility macros put refcount during init. */
481 mod->waiter = current; 533 mod->waiter = current;
482} 534}
@@ -615,12 +667,28 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
615 667
616unsigned int module_refcount(struct module *mod) 668unsigned int module_refcount(struct module *mod)
617{ 669{
618 unsigned int total = 0; 670 unsigned int incs = 0, decs = 0;
619 int cpu; 671 int cpu;
620 672
621 for_each_possible_cpu(cpu) 673 for_each_possible_cpu(cpu)
622 total += local_read(__module_ref_addr(mod, cpu)); 674 decs += per_cpu_ptr(mod->refptr, cpu)->decs;
623 return total; 675 /*
676 * ensure the incs are added up after the decs.
677 * module_put ensures incs are visible before decs with smp_wmb.
678 *
679 * This 2-count scheme avoids the situation where the refcount
680 * for CPU0 is read, then CPU0 increments the module refcount,
681 * then CPU1 drops that refcount, then the refcount for CPU1 is
682 * read. We would record a decrement but not its corresponding
683 * increment so we would see a low count (disaster).
684 *
685 * Rare situation? But module_refcount can be preempted, and we
686 * might be tallying up 4096+ CPUs. So it is not impossible.
687 */
688 smp_rmb();
689 for_each_possible_cpu(cpu)
690 incs += per_cpu_ptr(mod->refptr, cpu)->incs;
691 return incs - decs;
624} 692}
625EXPORT_SYMBOL(module_refcount); 693EXPORT_SYMBOL(module_refcount);
626 694
@@ -656,16 +724,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
656 return -EFAULT; 724 return -EFAULT;
657 name[MODULE_NAME_LEN-1] = '\0'; 725 name[MODULE_NAME_LEN-1] = '\0';
658 726
659 /* Create stop_machine threads since free_module relies on 727 if (mutex_lock_interruptible(&module_mutex) != 0)
660 * a non-failing stop_machine call. */ 728 return -EINTR;
661 ret = stop_machine_create();
662 if (ret)
663 return ret;
664
665 if (mutex_lock_interruptible(&module_mutex) != 0) {
666 ret = -EINTR;
667 goto out_stop;
668 }
669 729
670 mod = find_module(name); 730 mod = find_module(name);
671 if (!mod) { 731 if (!mod) {
@@ -725,8 +785,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
725 785
726 out: 786 out:
727 mutex_unlock(&module_mutex); 787 mutex_unlock(&module_mutex);
728out_stop:
729 stop_machine_destroy();
730 return ret; 788 return ret;
731} 789}
732 790
@@ -796,14 +854,15 @@ static struct module_attribute refcnt = {
796void module_put(struct module *module) 854void module_put(struct module *module)
797{ 855{
798 if (module) { 856 if (module) {
799 unsigned int cpu = get_cpu(); 857 preempt_disable();
800 local_dec(__module_ref_addr(module, cpu)); 858 smp_wmb(); /* see comment in module_refcount */
801 trace_module_put(module, _RET_IP_, 859 __this_cpu_inc(module->refptr->decs);
802 local_read(__module_ref_addr(module, cpu))); 860
861 trace_module_put(module, _RET_IP_);
803 /* Maybe they're waiting for us to drop reference? */ 862 /* Maybe they're waiting for us to drop reference? */
804 if (unlikely(!module_is_live(module))) 863 if (unlikely(!module_is_live(module)))
805 wake_up_process(module->waiter); 864 wake_up_process(module->waiter);
806 put_cpu(); 865 preempt_enable();
807 } 866 }
808} 867}
809EXPORT_SYMBOL(module_put); 868EXPORT_SYMBOL(module_put);
@@ -1083,6 +1142,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1083 if (sattr->name == NULL) 1142 if (sattr->name == NULL)
1084 goto out; 1143 goto out;
1085 sect_attrs->nsections++; 1144 sect_attrs->nsections++;
1145 sysfs_attr_init(&sattr->mattr.attr);
1086 sattr->mattr.show = module_sect_show; 1146 sattr->mattr.show = module_sect_show;
1087 sattr->mattr.store = NULL; 1147 sattr->mattr.store = NULL;
1088 sattr->mattr.attr.name = sattr->name; 1148 sattr->mattr.attr.name = sattr->name;
@@ -1178,6 +1238,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1178 if (sect_empty(&sechdrs[i])) 1238 if (sect_empty(&sechdrs[i]))
1179 continue; 1239 continue;
1180 if (sechdrs[i].sh_type == SHT_NOTE) { 1240 if (sechdrs[i].sh_type == SHT_NOTE) {
1241 sysfs_bin_attr_init(nattr);
1181 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1242 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
1182 nattr->attr.mode = S_IRUGO; 1243 nattr->attr.mode = S_IRUGO;
1183 nattr->size = sechdrs[i].sh_size; 1244 nattr->size = sechdrs[i].sh_size;
@@ -1250,6 +1311,7 @@ int module_add_modinfo_attrs(struct module *mod)
1250 if (!attr->test || 1311 if (!attr->test ||
1251 (attr->test && attr->test(mod))) { 1312 (attr->test && attr->test(mod))) {
1252 memcpy(temp_attr, attr, sizeof(*temp_attr)); 1313 memcpy(temp_attr, attr, sizeof(*temp_attr));
1314 sysfs_attr_init(&temp_attr->attr);
1253 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); 1315 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
1254 ++temp_attr; 1316 ++temp_attr;
1255 } 1317 }
@@ -1395,11 +1457,10 @@ static void free_module(struct module *mod)
1395 /* This may be NULL, but that's OK */ 1457 /* This may be NULL, but that's OK */
1396 module_free(mod, mod->module_init); 1458 module_free(mod, mod->module_init);
1397 kfree(mod->args); 1459 kfree(mod->args);
1398 if (mod->percpu) 1460 percpu_modfree(mod);
1399 percpu_modfree(mod->percpu); 1461#if defined(CONFIG_MODULE_UNLOAD)
1400#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
1401 if (mod->refptr) 1462 if (mod->refptr)
1402 percpu_modfree(mod->refptr); 1463 free_percpu(mod->refptr);
1403#endif 1464#endif
1404 /* Free lock-classes: */ 1465 /* Free lock-classes: */
1405 lockdep_free_key_range(mod->module_core, mod->core_size); 1466 lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -1515,7 +1576,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1515 default: 1576 default:
1516 /* Divert to percpu allocation if a percpu var. */ 1577 /* Divert to percpu allocation if a percpu var. */
1517 if (sym[i].st_shndx == pcpuindex) 1578 if (sym[i].st_shndx == pcpuindex)
1518 secbase = (unsigned long)mod->percpu; 1579 secbase = (unsigned long)mod_percpu(mod);
1519 else 1580 else
1520 secbase = sechdrs[sym[i].st_shndx].sh_addr; 1581 secbase = sechdrs[sym[i].st_shndx].sh_addr;
1521 sym[i].st_value += secbase; 1582 sym[i].st_value += secbase;
@@ -1949,7 +2010,7 @@ static noinline struct module *load_module(void __user *umod,
1949 unsigned int modindex, versindex, infoindex, pcpuindex; 2010 unsigned int modindex, versindex, infoindex, pcpuindex;
1950 struct module *mod; 2011 struct module *mod;
1951 long err = 0; 2012 long err = 0;
1952 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 2013 void *ptr = NULL; /* Stops spurious gcc warning */
1953 unsigned long symoffs, stroffs, *strmap; 2014 unsigned long symoffs, stroffs, *strmap;
1954 2015
1955 mm_segment_t old_fs; 2016 mm_segment_t old_fs;
@@ -2089,15 +2150,11 @@ static noinline struct module *load_module(void __user *umod,
2089 2150
2090 if (pcpuindex) { 2151 if (pcpuindex) {
2091 /* We have a special allocation for this section. */ 2152 /* We have a special allocation for this section. */
2092 percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, 2153 err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size,
2093 sechdrs[pcpuindex].sh_addralign, 2154 sechdrs[pcpuindex].sh_addralign);
2094 mod->name); 2155 if (err)
2095 if (!percpu) {
2096 err = -ENOMEM;
2097 goto free_mod; 2156 goto free_mod;
2098 }
2099 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2157 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2100 mod->percpu = percpu;
2101 } 2158 }
2102 2159
2103 /* Determine total sizes, and put offsets in sh_entsize. For now 2160 /* Determine total sizes, and put offsets in sh_entsize. For now
@@ -2162,9 +2219,8 @@ static noinline struct module *load_module(void __user *umod,
2162 mod = (void *)sechdrs[modindex].sh_addr; 2219 mod = (void *)sechdrs[modindex].sh_addr;
2163 kmemleak_load_module(mod, hdr, sechdrs, secstrings); 2220 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2164 2221
2165#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2222#if defined(CONFIG_MODULE_UNLOAD)
2166 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), 2223 mod->refptr = alloc_percpu(struct module_ref);
2167 mod->name);
2168 if (!mod->refptr) { 2224 if (!mod->refptr) {
2169 err = -ENOMEM; 2225 err = -ENOMEM;
2170 goto free_init; 2226 goto free_init;
@@ -2313,7 +2369,7 @@ static noinline struct module *load_module(void __user *umod,
2313 sort_extable(mod->extable, mod->extable + mod->num_exentries); 2369 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2314 2370
2315 /* Finally, copy percpu area over. */ 2371 /* Finally, copy percpu area over. */
2316 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 2372 percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
2317 sechdrs[pcpuindex].sh_size); 2373 sechdrs[pcpuindex].sh_size);
2318 2374
2319 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, 2375 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
@@ -2396,8 +2452,8 @@ static noinline struct module *load_module(void __user *umod,
2396 kobject_put(&mod->mkobj.kobj); 2452 kobject_put(&mod->mkobj.kobj);
2397 free_unload: 2453 free_unload:
2398 module_unload_free(mod); 2454 module_unload_free(mod);
2399#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2455#if defined(CONFIG_MODULE_UNLOAD)
2400 percpu_modfree(mod->refptr); 2456 free_percpu(mod->refptr);
2401 free_init: 2457 free_init:
2402#endif 2458#endif
2403 module_free(mod, mod->module_init); 2459 module_free(mod, mod->module_init);
@@ -2405,8 +2461,7 @@ static noinline struct module *load_module(void __user *umod,
2405 module_free(mod, mod->module_core); 2461 module_free(mod, mod->module_core);
2406 /* mod will be freed with core. Don't access it beyond this line! */ 2462 /* mod will be freed with core. Don't access it beyond this line! */
2407 free_percpu: 2463 free_percpu:
2408 if (percpu) 2464 percpu_modfree(mod);
2409 percpu_modfree(percpu);
2410 free_mod: 2465 free_mod:
2411 kfree(args); 2466 kfree(args);
2412 kfree(strmap); 2467 kfree(strmap);
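The module_refcount() rewrite deserves a second look: the old code kept one local_t per CPU, while the new code keeps separate incs and decs tallies so a reader can sum all decrements strictly before all increments. Together with the smp_wmb() in module_put(), this guarantees the reader can never count a decrement while missing its matching increment, so the computed count errs high rather than low. The whole protocol in one place, as a minimal sketch over an illustrative per-cpu structure (get_ref/put_ref/read_ref are made-up names):

    #include <linux/percpu.h>

    struct ref_sketch {
            unsigned int incs;
            unsigned int decs;
    };

    static void get_ref(struct ref_sketch __percpu *ref)
    {
            __this_cpu_inc(ref->incs);      /* caller disabled preemption */
    }

    static void put_ref(struct ref_sketch __percpu *ref)
    {
            smp_wmb();      /* make our earlier incs visible first */
            __this_cpu_inc(ref->decs);
    }

    static unsigned int read_ref(struct ref_sketch __percpu *ref)
    {
            unsigned int incs = 0, decs = 0;
            int cpu;

            for_each_possible_cpu(cpu)
                    decs += per_cpu_ptr(ref, cpu)->decs;
            smp_rmb();      /* pairs with the smp_wmb() in put_ref() */
            for_each_possible_cpu(cpu)
                    incs += per_cpu_ptr(ref, cpu)->incs;
            return incs - decs;
    }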
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 09b4ff9711b2..f74e6c00e26d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -13,6 +13,7 @@
13 * Pavel Emelianov <xemul@openvz.org> 13 * Pavel Emelianov <xemul@openvz.org>
14 */ 14 */
15 15
16#include <linux/slab.h>
16#include <linux/module.h> 17#include <linux/module.h>
17#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
18#include <linux/init_task.h> 19#include <linux/init_task.h>
@@ -24,7 +25,18 @@
24 25
25static struct kmem_cache *nsproxy_cachep; 26static struct kmem_cache *nsproxy_cachep;
26 27
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 28struct nsproxy init_nsproxy = {
29 .count = ATOMIC_INIT(1),
30 .uts_ns = &init_uts_ns,
31#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
32 .ipc_ns = &init_ipc_ns,
33#endif
34 .mnt_ns = NULL,
35 .pid_ns = &init_pid_ns,
36#ifdef CONFIG_NET
37 .net_ns = &init_net,
38#endif
39};
28 40
29static inline struct nsproxy *create_nsproxy(void) 41static inline struct nsproxy *create_nsproxy(void)
30{ 42{
diff --git a/kernel/padata.c b/kernel/padata.c
index 6f9bcb8313d6..fd03513c7327 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -25,6 +25,7 @@
25#include <linux/padata.h> 25#include <linux/padata.h>
26#include <linux/mutex.h> 26#include <linux/mutex.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/slab.h>
28#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
29 30
30#define MAX_SEQ_NR INT_MAX - NR_CPUS 31#define MAX_SEQ_NR INT_MAX - NR_CPUS
@@ -642,6 +643,9 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
642 if (!pd) 643 if (!pd)
643 goto err_free_inst; 644 goto err_free_inst;
644 645
646 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
647 goto err_free_pd;
648
645 rcu_assign_pointer(pinst->pd, pd); 649 rcu_assign_pointer(pinst->pd, pd);
646 650
647 pinst->wq = wq; 651 pinst->wq = wq;
@@ -654,12 +658,14 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
654 pinst->cpu_notifier.priority = 0; 658 pinst->cpu_notifier.priority = 0;
655 err = register_hotcpu_notifier(&pinst->cpu_notifier); 659 err = register_hotcpu_notifier(&pinst->cpu_notifier);
656 if (err) 660 if (err)
657 goto err_free_pd; 661 goto err_free_cpumask;
658 662
659 mutex_init(&pinst->lock); 663 mutex_init(&pinst->lock);
660 664
661 return pinst; 665 return pinst;
662 666
667err_free_cpumask:
668 free_cpumask_var(pinst->cpumask);
663err_free_pd: 669err_free_pd:
664 padata_free_pd(pd); 670 padata_free_pd(pd);
665err_free_inst: 671err_free_inst:
@@ -685,6 +691,7 @@ void padata_free(struct padata_instance *pinst)
685 691
686 unregister_hotcpu_notifier(&pinst->cpu_notifier); 692 unregister_hotcpu_notifier(&pinst->cpu_notifier);
687 padata_free_pd(pinst->pd); 693 padata_free_pd(pinst->pd);
694 free_cpumask_var(pinst->cpumask);
688 kfree(pinst); 695 kfree(pinst);
689} 696}
690EXPORT_SYMBOL(padata_free); 697EXPORT_SYMBOL(padata_free);
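The padata fix is a textbook case of kernel error unwinding: resources are released in the reverse order of acquisition, each through a goto label named for the newest resource it frees, so adding an allocation in the middle costs exactly one new label. The shape of the fixed function, trimmed to the allocations that matter here:

    #include <linux/slab.h>
    #include <linux/cpu.h>
    #include <linux/padata.h>

    static struct padata_instance *alloc_sketch(struct workqueue_struct *wq)
    {
            struct padata_instance *pinst;

            pinst = kzalloc(sizeof(*pinst), GFP_KERNEL);
            if (!pinst)
                    goto err;
            if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
                    goto err_free_inst;
            if (register_hotcpu_notifier(&pinst->cpu_notifier))
                    goto err_free_cpumask;  /* newest resource freed first */
            pinst->wq = wq;
            return pinst;

    err_free_cpumask:
            free_cpumask_var(pinst->cpumask);
    err_free_inst:
            kfree(pinst);
    err:
            return NULL;
    }

Note that padata_free() gains the matching free_cpumask_var(), so the mask is also released on the normal teardown path.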
diff --git a/kernel/panic.c b/kernel/panic.c
index c787333282b8..13d966b4c14a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -36,15 +36,36 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
36 36
37EXPORT_SYMBOL(panic_notifier_list); 37EXPORT_SYMBOL(panic_notifier_list);
38 38
39static long no_blink(long time)
40{
41 return 0;
42}
43
44/* Returns how long it waited in ms */ 39/* Returns how long it waited in ms */
45long (*panic_blink)(long time); 40long (*panic_blink)(long time);
46EXPORT_SYMBOL(panic_blink); 41EXPORT_SYMBOL(panic_blink);
47 42
43static void panic_blink_one_second(void)
44{
45 static long i = 0, end;
46
47 if (panic_blink) {
48 end = i + MSEC_PER_SEC;
49
50 while (i < end) {
51 i += panic_blink(i);
52 mdelay(1);
53 i++;
54 }
55 } else {
56 /*
57 * When running under a hypervisor a small mdelay may get
58 * rounded up to the hypervisor timeslice. For example, with
59 * a 1ms in 10ms hypervisor timeslice we might inflate a
60 * mdelay(1) loop by 10x.
61 *
62 * If we have nothing to blink, spin on 1 second calls to
63 * mdelay to avoid this.
64 */
65 mdelay(MSEC_PER_SEC);
66 }
67}
68
48/** 69/**
49 * panic - halt the system 70 * panic - halt the system
50 * @fmt: The text string to print 71 * @fmt: The text string to print
@@ -95,9 +116,6 @@ NORET_TYPE void panic(const char * fmt, ...)
95 116
96 bust_spinlocks(0); 117 bust_spinlocks(0);
97 118
98 if (!panic_blink)
99 panic_blink = no_blink;
100
101 if (panic_timeout > 0) { 119 if (panic_timeout > 0) {
102 /* 120 /*
103 * Delay timeout seconds before rebooting the machine. 121 * Delay timeout seconds before rebooting the machine.
@@ -105,11 +123,9 @@ NORET_TYPE void panic(const char * fmt, ...)
105 */ 123 */
106 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); 124 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
107 125
108 for (i = 0; i < panic_timeout*1000; ) { 126 for (i = 0; i < panic_timeout; i++) {
109 touch_nmi_watchdog(); 127 touch_nmi_watchdog();
110 i += panic_blink(i); 128 panic_blink_one_second();
111 mdelay(1);
112 i++;
113 } 129 }
114 /* 130 /*
115 * This will not be a clean reboot, with everything 131 * This will not be a clean reboot, with everything
@@ -135,11 +151,9 @@ NORET_TYPE void panic(const char * fmt, ...)
135 } 151 }
136#endif 152#endif
137 local_irq_enable(); 153 local_irq_enable();
138 for (i = 0; ; ) { 154 while (1) {
139 touch_softlockup_watchdog(); 155 touch_softlockup_watchdog();
140 i += panic_blink(i); 156 panic_blink_one_second();
141 mdelay(1);
142 i++;
143 } 157 }
144} 158}
145 159
diff --git a/kernel/params.c b/kernel/params.c
index cf1b69183127..0b30ecd53a52 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,7 +24,6 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27#include <linux/string.h>
28 27
29#if 0 28#if 0
30#define DEBUGP printk 29#define DEBUGP printk
@@ -402,8 +401,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
402} 401}
403 402
404/* sysfs output in /sys/modules/XYZ/parameters/ */ 403/* sysfs output in /sys/modules/XYZ/parameters/ */
405#define to_module_attr(n) container_of(n, struct module_attribute, attr); 404#define to_module_attr(n) container_of(n, struct module_attribute, attr)
406#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); 405#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
407 406
408extern struct kernel_param __start___param[], __stop___param[]; 407extern struct kernel_param __start___param[], __stop___param[];
409 408
@@ -421,7 +420,7 @@ struct module_param_attrs
421}; 420};
422 421
423#ifdef CONFIG_SYSFS 422#ifdef CONFIG_SYSFS
424#define to_param_attr(n) container_of(n, struct param_attribute, mattr); 423#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
425 424
426static ssize_t param_attr_show(struct module_attribute *mattr, 425static ssize_t param_attr_show(struct module_attribute *mattr,
427 struct module *mod, char *buf) 426 struct module *mod, char *buf)
@@ -517,6 +516,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
517 new->grp.attrs = attrs; 516 new->grp.attrs = attrs;
518 517
519 /* Tack new one on the end. */ 518 /* Tack new one on the end. */
519 sysfs_attr_init(&new->attrs[num].mattr.attr);
520 new->attrs[num].param = kp; 520 new->attrs[num].param = kp;
521 new->attrs[num].mattr.show = param_attr_show; 521 new->attrs[num].mattr.show = param_attr_show;
522 new->attrs[num].mattr.store = param_attr_store; 522 new->attrs[num].mattr.store = param_attr_store;
@@ -723,7 +723,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
723 return ret; 723 return ret;
724} 724}
725 725
726static struct sysfs_ops module_sysfs_ops = { 726static const struct sysfs_ops module_sysfs_ops = {
727 .show = module_attr_show, 727 .show = module_attr_show,
728 .store = module_attr_store, 728 .store = module_attr_store,
729}; 729};
@@ -737,7 +737,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj)
737 return 0; 737 return 0;
738} 738}
739 739
740static struct kset_uevent_ops module_uevent_ops = { 740static const struct kset_uevent_ops module_uevent_ops = {
741 .filter = uevent_filter, 741 .filter = uevent_filter,
742}; 742};
743 743
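The one-character params.c change (dropping the trailing semicolons baked into the container_of() wrapper macros) fixes a classic macro pitfall: with the semicolon, the macro expands cleanly only in statement position, and any expression use fails to parse. An illustrative fragment, imagined inside a sysfs show handler; kattr, mod and buf are assumed locals:

    #define to_attr_bad(n) container_of(n, struct module_attribute, attr);
    #define to_attr_ok(n)  container_of(n, struct module_attribute, attr)

    /* Statement position masks the bug: the stray ';' merely
     * becomes an empty statement. */
    struct module_attribute *a = to_attr_bad(kattr);

    /*
     * Expression position does not. This would expand to
     * "container_of(...);->show(...)" and fail to compile:
     *
     *      return to_attr_bad(kattr)->show(a, mod, buf);
     */
    return to_attr_ok(kattr)->show(a, mod, buf);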
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a661e7991865..a4fa381db3c2 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -15,6 +15,8 @@
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/file.h> 16#include <linux/file.h>
17#include <linux/poll.h> 17#include <linux/poll.h>
18#include <linux/slab.h>
19#include <linux/hash.h>
18#include <linux/sysfs.h> 20#include <linux/sysfs.h>
19#include <linux/dcache.h> 21#include <linux/dcache.h>
20#include <linux/percpu.h> 22#include <linux/percpu.h>
@@ -56,21 +58,6 @@ static atomic_t nr_task_events __read_mostly;
56 */ 58 */
57int sysctl_perf_event_paranoid __read_mostly = 1; 59int sysctl_perf_event_paranoid __read_mostly = 1;
58 60
59static inline bool perf_paranoid_tracepoint_raw(void)
60{
61 return sysctl_perf_event_paranoid > -1;
62}
63
64static inline bool perf_paranoid_cpu(void)
65{
66 return sysctl_perf_event_paranoid > 0;
67}
68
69static inline bool perf_paranoid_kernel(void)
70{
71 return sysctl_perf_event_paranoid > 1;
72}
73
74int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 61int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
75 62
76/* 63/*
@@ -96,41 +83,19 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
96void __weak hw_perf_disable(void) { barrier(); } 83void __weak hw_perf_disable(void) { barrier(); }
97void __weak hw_perf_enable(void) { barrier(); } 84void __weak hw_perf_enable(void) { barrier(); }
98 85
99void __weak hw_perf_event_setup(int cpu) { barrier(); }
100void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
101void __weak hw_perf_event_setup_offline(int cpu) { barrier(); }
102
103int __weak
104hw_perf_group_sched_in(struct perf_event *group_leader,
105 struct perf_cpu_context *cpuctx,
106 struct perf_event_context *ctx)
107{
108 return 0;
109}
110
111void __weak perf_event_print_debug(void) { } 86void __weak perf_event_print_debug(void) { }
112 87
113static DEFINE_PER_CPU(int, perf_disable_count); 88static DEFINE_PER_CPU(int, perf_disable_count);
114 89
115void __perf_disable(void)
116{
117 __get_cpu_var(perf_disable_count)++;
118}
119
120bool __perf_enable(void)
121{
122 return !--__get_cpu_var(perf_disable_count);
123}
124
125void perf_disable(void) 90void perf_disable(void)
126{ 91{
127 __perf_disable(); 92 if (!__get_cpu_var(perf_disable_count)++)
128 hw_perf_disable(); 93 hw_perf_disable();
129} 94}
130 95
131void perf_enable(void) 96void perf_enable(void)
132{ 97{
133 if (__perf_enable()) 98 if (!--__get_cpu_var(perf_disable_count))
134 hw_perf_enable(); 99 hw_perf_enable();
135} 100}
136 101
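Folding __perf_disable()/__perf_enable() into their only callers leaves the classic per-cpu nesting counter in plain sight: the hardware hook runs only on the 0 -> 1 transition of perf_disable() and the 1 -> 0 transition of perf_enable(), so nested critical sections cost a single increment. The bare pattern, with hw_disable()/hw_enable() standing in for the weak hw_perf_* hooks:

    static DEFINE_PER_CPU(int, disable_count);

    static void my_disable(void)
    {
            if (!__get_cpu_var(disable_count)++)
                    hw_disable();           /* outermost caller only */
    }

    static void my_enable(void)
    {
            if (!--__get_cpu_var(disable_count))
                    hw_enable();            /* last one out re-enables */
    }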
@@ -290,6 +255,18 @@ static void update_event_times(struct perf_event *event)
290 event->total_time_running = run_end - event->tstamp_running; 255 event->total_time_running = run_end - event->tstamp_running;
291} 256}
292 257
258/*
259 * Update total_time_enabled and total_time_running for all events in a group.
260 */
261static void update_group_times(struct perf_event *leader)
262{
263 struct perf_event *event;
264
265 update_event_times(leader);
266 list_for_each_entry(event, &leader->sibling_list, group_entry)
267 update_event_times(event);
268}
269
293static struct list_head * 270static struct list_head *
294ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) 271ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
295{ 272{
@@ -343,8 +320,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
343static void 320static void
344list_del_event(struct perf_event *event, struct perf_event_context *ctx) 321list_del_event(struct perf_event *event, struct perf_event_context *ctx)
345{ 322{
346 struct perf_event *sibling, *tmp;
347
348 if (list_empty(&event->group_entry)) 323 if (list_empty(&event->group_entry))
349 return; 324 return;
350 ctx->nr_events--; 325 ctx->nr_events--;
@@ -357,7 +332,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
357 if (event->group_leader != event) 332 if (event->group_leader != event)
358 event->group_leader->nr_siblings--; 333 event->group_leader->nr_siblings--;
359 334
360 update_event_times(event); 335 update_group_times(event);
361 336
362 /* 337 /*
363 * If event was in error state, then keep it 338 * If event was in error state, then keep it
@@ -368,6 +343,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
368 */ 343 */
369 if (event->state > PERF_EVENT_STATE_OFF) 344 if (event->state > PERF_EVENT_STATE_OFF)
370 event->state = PERF_EVENT_STATE_OFF; 345 event->state = PERF_EVENT_STATE_OFF;
346}
347
348static void
349perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx)
350{
351 struct perf_event *sibling, *tmp;
371 352
372 /* 353 /*
373 * If this was a group event with sibling events then 354 * If this was a group event with sibling events then
@@ -533,18 +514,6 @@ retry:
533} 514}
534 515
535/* 516/*
536 * Update total_time_enabled and total_time_running for all events in a group.
537 */
538static void update_group_times(struct perf_event *leader)
539{
540 struct perf_event *event;
541
542 update_event_times(leader);
543 list_for_each_entry(event, &leader->sibling_list, group_entry)
544 update_event_times(event);
545}
546
547/*
548 * Cross CPU call to disable a performance event 517 * Cross CPU call to disable a performance event
549 */ 518 */
550static void __perf_event_disable(void *info) 519static void __perf_event_disable(void *info)
@@ -668,15 +637,20 @@ group_sched_in(struct perf_event *group_event,
668 struct perf_cpu_context *cpuctx, 637 struct perf_cpu_context *cpuctx,
669 struct perf_event_context *ctx) 638 struct perf_event_context *ctx)
670{ 639{
671 struct perf_event *event, *partial_group; 640 struct perf_event *event, *partial_group = NULL;
641 const struct pmu *pmu = group_event->pmu;
642 bool txn = false;
672 int ret; 643 int ret;
673 644
674 if (group_event->state == PERF_EVENT_STATE_OFF) 645 if (group_event->state == PERF_EVENT_STATE_OFF)
675 return 0; 646 return 0;
676 647
 677 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); 648 /* Check if group transaction available */
678 if (ret) 649 if (pmu->start_txn)
679 return ret < 0 ? ret : 0; 650 txn = true;
651
652 if (txn)
653 pmu->start_txn(pmu);
680 654
681 if (event_sched_in(group_event, cpuctx, ctx)) 655 if (event_sched_in(group_event, cpuctx, ctx))
682 return -EAGAIN; 656 return -EAGAIN;
@@ -691,9 +665,19 @@ group_sched_in(struct perf_event *group_event,
691 } 665 }
692 } 666 }
693 667
694 return 0; 668 if (!txn)
669 return 0;
670
671 ret = pmu->commit_txn(pmu);
672 if (!ret) {
673 pmu->cancel_txn(pmu);
674 return 0;
675 }
695 676
696group_error: 677group_error:
678 if (txn)
679 pmu->cancel_txn(pmu);
680
697 /* 681 /*
698 * Groups can be scheduled in as one unit only, so undo any 682 * Groups can be scheduled in as one unit only, so undo any
699 * partial group before returning: 683 * partial group before returning:
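These hunks introduce the optional start_txn/commit_txn/cancel_txn triple: a PMU that implements it can queue the whole group and answer one schedulability question at commit time rather than one per event. Note that at this stage of the API, commit_txn() apparently only runs the check and does not retire the transaction, which is why the code above calls cancel_txn() even after a successful commit; later cleanups folded that into commit_txn(). The control flow, sketched with a hypothetical one-argument event_sched_in():

    static int group_sched_in_sketch(struct perf_event *leader)
    {
            const struct pmu *pmu = leader->pmu;
            bool txn = pmu->start_txn != NULL;
            struct perf_event *ev;

            if (txn)
                    pmu->start_txn(pmu);    /* queue, don't schedule yet */

            if (event_sched_in(leader))
                    goto fail;
            list_for_each_entry(ev, &leader->sibling_list, group_entry)
                    if (event_sched_in(ev))
                            goto fail;

            if (!txn)
                    return 0;
            if (!pmu->commit_txn(pmu)) {    /* 0 == the group fits */
                    pmu->cancel_txn(pmu);   /* clears the txn state */
                    return 0;
            }
    fail:
            if (txn)
                    pmu->cancel_txn(pmu);
            /* ...then undo the partially scheduled group, as above... */
            return -EAGAIN;
    }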
@@ -1193,11 +1177,9 @@ void perf_event_task_sched_out(struct task_struct *task,
1193 struct perf_event_context *ctx = task->perf_event_ctxp; 1177 struct perf_event_context *ctx = task->perf_event_ctxp;
1194 struct perf_event_context *next_ctx; 1178 struct perf_event_context *next_ctx;
1195 struct perf_event_context *parent; 1179 struct perf_event_context *parent;
1196 struct pt_regs *regs;
1197 int do_switch = 1; 1180 int do_switch = 1;
1198 1181
1199 regs = task_pt_regs(task); 1182 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1200 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1201 1183
1202 if (likely(!ctx || !cpuctx->task_ctx)) 1184 if (likely(!ctx || !cpuctx->task_ctx))
1203 return; 1185 return;
@@ -1397,6 +1379,8 @@ void perf_event_task_sched_in(struct task_struct *task)
1397 if (cpuctx->task_ctx == ctx) 1379 if (cpuctx->task_ctx == ctx)
1398 return; 1380 return;
1399 1381
1382 perf_disable();
1383
1400 /* 1384 /*
1401 * We want to keep the following priority order: 1385 * We want to keep the following priority order:
1402 * cpu pinned (that don't need to move), task pinned, 1386 * cpu pinned (that don't need to move), task pinned,
@@ -1409,6 +1393,8 @@ void perf_event_task_sched_in(struct task_struct *task)
1409 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 1393 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1410 1394
1411 cpuctx->task_ctx = ctx; 1395 cpuctx->task_ctx = ctx;
1396
1397 perf_enable();
1412} 1398}
1413 1399
1414#define MAX_INTERRUPTS (~0ULL) 1400#define MAX_INTERRUPTS (~0ULL)
@@ -1553,12 +1539,15 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1553 */ 1539 */
1554 if (interrupts == MAX_INTERRUPTS) { 1540 if (interrupts == MAX_INTERRUPTS) {
1555 perf_log_throttle(event, 1); 1541 perf_log_throttle(event, 1);
1542 perf_disable();
1556 event->pmu->unthrottle(event); 1543 event->pmu->unthrottle(event);
1544 perf_enable();
1557 } 1545 }
1558 1546
1559 if (!event->attr.freq || !event->attr.sample_freq) 1547 if (!event->attr.freq || !event->attr.sample_freq)
1560 continue; 1548 continue;
1561 1549
1550 perf_disable();
1562 event->pmu->read(event); 1551 event->pmu->read(event);
1563 now = atomic64_read(&event->count); 1552 now = atomic64_read(&event->count);
1564 delta = now - hwc->freq_count_stamp; 1553 delta = now - hwc->freq_count_stamp;
@@ -1566,6 +1555,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1566 1555
1567 if (delta > 0) 1556 if (delta > 0)
1568 perf_adjust_period(event, TICK_NSEC, delta); 1557 perf_adjust_period(event, TICK_NSEC, delta);
1558 perf_enable();
1569 } 1559 }
1570 raw_spin_unlock(&ctx->lock); 1560 raw_spin_unlock(&ctx->lock);
1571} 1561}
@@ -1575,9 +1565,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1575 */ 1565 */
1576static void rotate_ctx(struct perf_event_context *ctx) 1566static void rotate_ctx(struct perf_event_context *ctx)
1577{ 1567{
1578 if (!ctx->nr_events)
1579 return;
1580
1581 raw_spin_lock(&ctx->lock); 1568 raw_spin_lock(&ctx->lock);
1582 1569
1583 /* Rotate the first entry last of non-pinned groups */ 1570 /* Rotate the first entry last of non-pinned groups */
@@ -1590,19 +1577,28 @@ void perf_event_task_tick(struct task_struct *curr)
1590{ 1577{
1591 struct perf_cpu_context *cpuctx; 1578 struct perf_cpu_context *cpuctx;
1592 struct perf_event_context *ctx; 1579 struct perf_event_context *ctx;
1580 int rotate = 0;
1593 1581
1594 if (!atomic_read(&nr_events)) 1582 if (!atomic_read(&nr_events))
1595 return; 1583 return;
1596 1584
1597 cpuctx = &__get_cpu_var(perf_cpu_context); 1585 cpuctx = &__get_cpu_var(perf_cpu_context);
1598 ctx = curr->perf_event_ctxp; 1586 if (cpuctx->ctx.nr_events &&
1587 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1588 rotate = 1;
1599 1589
1600 perf_disable(); 1590 ctx = curr->perf_event_ctxp;
1591 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
1592 rotate = 1;
1601 1593
1602 perf_ctx_adjust_freq(&cpuctx->ctx); 1594 perf_ctx_adjust_freq(&cpuctx->ctx);
1603 if (ctx) 1595 if (ctx)
1604 perf_ctx_adjust_freq(ctx); 1596 perf_ctx_adjust_freq(ctx);
1605 1597
1598 if (!rotate)
1599 return;
1600
1601 perf_disable();
1606 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 1602 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1607 if (ctx) 1603 if (ctx)
1608 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 1604 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1614,7 +1610,6 @@ void perf_event_task_tick(struct task_struct *curr)
1614 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 1610 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1615 if (ctx) 1611 if (ctx)
1616 task_ctx_sched_in(curr, EVENT_FLEXIBLE); 1612 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1617
1618 perf_enable(); 1613 perf_enable();
1619} 1614}
1620 1615
@@ -1877,9 +1872,30 @@ int perf_event_release_kernel(struct perf_event *event)
1877{ 1872{
1878 struct perf_event_context *ctx = event->ctx; 1873 struct perf_event_context *ctx = event->ctx;
1879 1874
1875 /*
1876 * Remove from the PMU, can't get re-enabled since we got
1877 * here because the last ref went.
1878 */
1879 perf_event_disable(event);
1880
1880 WARN_ON_ONCE(ctx->parent_ctx); 1881 WARN_ON_ONCE(ctx->parent_ctx);
1881 mutex_lock(&ctx->mutex); 1882 /*
1882 perf_event_remove_from_context(event); 1883 * There are two ways this annotation is useful:
1884 *
1885 * 1) there is a lock recursion from perf_event_exit_task
1886 * see the comment there.
1887 *
1888 * 2) there is a lock-inversion with mmap_sem through
1889 * perf_event_read_group(), which takes faults while
1890 * holding ctx->mutex, however this is called after
1891 * the last filedesc died, so there is no possibility
1892 * to trigger the AB-BA case.
1893 */
1894 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
1895 raw_spin_lock_irq(&ctx->lock);
1896 list_del_event(event, ctx);
1897 perf_destroy_group(event, ctx);
1898 raw_spin_unlock_irq(&ctx->lock);
1883 mutex_unlock(&ctx->mutex); 1899 mutex_unlock(&ctx->mutex);
1884 1900
1885 mutex_lock(&event->owner->perf_event_mutex); 1901 mutex_lock(&event->owner->perf_event_mutex);
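The mutex_lock_nested() here is a lockdep annotation, not a different lock: the very same ctx->mutex is taken, but the validator is told this acquisition sits at nesting level SINGLE_DEPTH_NESTING, which suppresses exactly the two false positives enumerated in the comment. The idiom in isolation:

    /* Same mutex, annotated as one nesting level down. */
    mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
    /* ... tear-down that cannot deadlock against the outer level ... */
    mutex_unlock(&ctx->mutex);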
@@ -2610,7 +2626,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2610 if (user_locked > user_lock_limit) 2626 if (user_locked > user_lock_limit)
2611 extra = user_locked - user_lock_limit; 2627 extra = user_locked - user_lock_limit;
2612 2628
2613 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 2629 lock_limit = rlimit(RLIMIT_MEMLOCK);
2614 lock_limit >>= PAGE_SHIFT; 2630 lock_limit >>= PAGE_SHIFT;
2615 locked = vma->vm_mm->locked_vm + extra; 2631 locked = vma->vm_mm->locked_vm + extra;
2616 2632
@@ -2663,6 +2679,7 @@ static int perf_fasync(int fd, struct file *filp, int on)
2663} 2679}
2664 2680
2665static const struct file_operations perf_fops = { 2681static const struct file_operations perf_fops = {
2682 .llseek = no_llseek,
2666 .release = perf_release, 2683 .release = perf_release,
2667 .read = perf_read, 2684 .read = perf_read,
2668 .poll = perf_poll, 2685 .poll = perf_poll,
@@ -2806,6 +2823,33 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2806 return NULL; 2823 return NULL;
2807} 2824}
2808 2825
2826__weak
2827void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2828{
2829}
2830
2831
2832/*
2833 * We assume there is only KVM supporting the callbacks.
2834 * Later on, we might change it to a list if there is
2835 * another virtualization implementation supporting the callbacks.
2836 */
2837struct perf_guest_info_callbacks *perf_guest_cbs;
2838
2839int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
2840{
2841 perf_guest_cbs = cbs;
2842 return 0;
2843}
2844EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
2845
2846int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
2847{
2848 perf_guest_cbs = NULL;
2849 return 0;
2850}
2851EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
2852
2809/* 2853/*
2810 * Output 2854 * Output
2811 */ 2855 */
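perf_guest_cbs is a single registration slot; as the comment says, KVM is assumed to be the only user for now. A hedged sketch of how a hypervisor module would plug in; the three callback names come from struct perf_guest_info_callbacks as added in this series, and the my_* implementations are placeholders:

    static struct perf_guest_info_callbacks my_guest_cbs = {
            .is_in_guest    = my_is_in_guest,
            .is_user_mode   = my_is_guest_user_mode,
            .get_guest_ip   = my_get_guest_ip,
    };

    static int __init my_guest_init(void)
    {
            return perf_register_guest_info_callbacks(&my_guest_cbs);
    }

    static void __exit my_guest_exit(void)
    {
            perf_unregister_guest_info_callbacks(&my_guest_cbs);
    }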
@@ -3391,15 +3435,23 @@ static void perf_event_task_output(struct perf_event *event,
3391 struct perf_task_event *task_event) 3435 struct perf_task_event *task_event)
3392{ 3436{
3393 struct perf_output_handle handle; 3437 struct perf_output_handle handle;
3394 int size;
3395 struct task_struct *task = task_event->task; 3438 struct task_struct *task = task_event->task;
3396 int ret; 3439 unsigned long flags;
3440 int size, ret;
3441
3442 /*
3443 * If this CPU attempts to acquire an rq lock held by a CPU spinning
3444 * in perf_output_lock() from interrupt context, it's game over.
3445 */
3446 local_irq_save(flags);
3397 3447
3398 size = task_event->event_id.header.size; 3448 size = task_event->event_id.header.size;
3399 ret = perf_output_begin(&handle, event, size, 0, 0); 3449 ret = perf_output_begin(&handle, event, size, 0, 0);
3400 3450
3401 if (ret) 3451 if (ret) {
3452 local_irq_restore(flags);
3402 return; 3453 return;
3454 }
3403 3455
3404 task_event->event_id.pid = perf_event_pid(event, task); 3456 task_event->event_id.pid = perf_event_pid(event, task);
3405 task_event->event_id.ppid = perf_event_pid(event, current); 3457 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3410,6 +3462,7 @@ static void perf_event_task_output(struct perf_event *event,
3410 perf_output_put(&handle, task_event->event_id); 3462 perf_output_put(&handle, task_event->event_id);
3411 3463
3412 perf_output_end(&handle); 3464 perf_output_end(&handle);
3465 local_irq_restore(flags);
3413} 3466}
3414 3467
3415static int perf_event_task_match(struct perf_event *event) 3468static int perf_event_task_match(struct perf_event *event)
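
[Note: the new local_irq_save()/local_irq_restore() bracket keeps this path from being interrupted while a remote CPU spins in perf_output_lock(); observe that the early-return path restores flags too. The general shape, with a hypothetical failure check:]

    static void output_something(void)
    {
            unsigned long flags;

            local_irq_save(flags);
            if (begin_failed()) {                /* hypothetical early exit */
                    local_irq_restore(flags);    /* every return must restore */
                    return;
            }
            /* ... emit record ... */
            local_irq_restore(flags);
    }
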
@@ -3749,7 +3802,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3749 .event_id = { 3802 .event_id = {
3750 .header = { 3803 .header = {
3751 .type = PERF_RECORD_MMAP, 3804 .type = PERF_RECORD_MMAP,
3752 .misc = 0, 3805 .misc = PERF_RECORD_MISC_USER,
3753 /* .size */ 3806 /* .size */
3754 }, 3807 },
3755 /* .pid */ 3808 /* .pid */
@@ -3967,36 +4020,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3967 perf_swevent_overflow(event, 0, nmi, data, regs); 4020 perf_swevent_overflow(event, 0, nmi, data, regs);
3968} 4021}
3969 4022
3970static int perf_swevent_is_counting(struct perf_event *event)
3971{
3972 /*
3973 * The event is active, we're good!
3974 */
3975 if (event->state == PERF_EVENT_STATE_ACTIVE)
3976 return 1;
3977
3978 /*
3979 * The event is off/error, not counting.
3980 */
3981 if (event->state != PERF_EVENT_STATE_INACTIVE)
3982 return 0;
3983
3984 /*
3985 * The event is inactive, if the context is active
3986 * we're part of a group that didn't make it on the 'pmu',
3987 * not counting.
3988 */
3989 if (event->ctx->is_active)
3990 return 0;
3991
3992 /*
3993 * We're inactive and the context is too, this means the
3994 * task is scheduled out, we're counting events that happen
3995 * to us, like migration events.
3996 */
3997 return 1;
3998}
3999
4000static int perf_tp_event_match(struct perf_event *event, 4023static int perf_tp_event_match(struct perf_event *event,
4001 struct perf_sample_data *data); 4024 struct perf_sample_data *data);
4002 4025
@@ -4020,12 +4043,6 @@ static int perf_swevent_match(struct perf_event *event,
4020 struct perf_sample_data *data, 4043 struct perf_sample_data *data,
4021 struct pt_regs *regs) 4044 struct pt_regs *regs)
4022{ 4045{
4023 if (event->cpu != -1 && event->cpu != smp_processor_id())
4024 return 0;
4025
4026 if (!perf_swevent_is_counting(event))
4027 return 0;
4028
4029 if (event->attr.type != type) 4046 if (event->attr.type != type)
4030 return 0; 4047 return 0;
4031 4048
@@ -4042,18 +4059,53 @@ static int perf_swevent_match(struct perf_event *event,
4042 return 1; 4059 return 1;
4043} 4060}
4044 4061
4045static void perf_swevent_ctx_event(struct perf_event_context *ctx, 4062static inline u64 swevent_hash(u64 type, u32 event_id)
4046 enum perf_type_id type, 4063{
4047 u32 event_id, u64 nr, int nmi, 4064 u64 val = event_id | (type << 32);
4048 struct perf_sample_data *data, 4065
4049 struct pt_regs *regs) 4066 return hash_64(val, SWEVENT_HLIST_BITS);
4067}
4068
4069static struct hlist_head *
4070find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4071{
4072 u64 hash;
4073 struct swevent_hlist *hlist;
4074
4075 hash = swevent_hash(type, event_id);
4076
4077 hlist = rcu_dereference(ctx->swevent_hlist);
4078 if (!hlist)
4079 return NULL;
4080
4081 return &hlist->heads[hash];
4082}
4083
4084static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4085 u64 nr, int nmi,
4086 struct perf_sample_data *data,
4087 struct pt_regs *regs)
4050{ 4088{
4089 struct perf_cpu_context *cpuctx;
4051 struct perf_event *event; 4090 struct perf_event *event;
4091 struct hlist_node *node;
4092 struct hlist_head *head;
4052 4093
4053 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 4094 cpuctx = &__get_cpu_var(perf_cpu_context);
4095
4096 rcu_read_lock();
4097
4098 head = find_swevent_head(cpuctx, type, event_id);
4099
4100 if (!head)
4101 goto end;
4102
4103 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4054 if (perf_swevent_match(event, type, event_id, data, regs)) 4104 if (perf_swevent_match(event, type, event_id, data, regs))
4055 perf_swevent_add(event, nr, nmi, data, regs); 4105 perf_swevent_add(event, nr, nmi, data, regs);
4056 } 4106 }
4107end:
4108 rcu_read_unlock();
4057} 4109}
4058 4110
4059int perf_swevent_get_recursion_context(void) 4111int perf_swevent_get_recursion_context(void)
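
[Note: find_swevent_head() buckets events by (type, event_id): the id occupies the low 32 bits, the type the high bits, and hash_64() folds that into a small index. A worked sketch, assuming SWEVENT_HLIST_BITS is 8, i.e. 256 buckets:]

    #include <linux/hash.h>

    #define SWEVENT_HLIST_BITS 8                     /* assumed value */

    static u64 bucket_for(u64 type, u32 event_id)
    {
            u64 val = event_id | (type << 32);       /* id low, type high */

            return hash_64(val, SWEVENT_HLIST_BITS); /* 0 .. 255 */
    }
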
@@ -4091,27 +4143,6 @@ void perf_swevent_put_recursion_context(int rctx)
4091} 4143}
4092EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); 4144EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4093 4145
4094static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4095 u64 nr, int nmi,
4096 struct perf_sample_data *data,
4097 struct pt_regs *regs)
4098{
4099 struct perf_cpu_context *cpuctx;
4100 struct perf_event_context *ctx;
4101
4102 cpuctx = &__get_cpu_var(perf_cpu_context);
4103 rcu_read_lock();
4104 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
4105 nr, nmi, data, regs);
4106 /*
4107 * doesn't really matter which of the child contexts the
4108 * events ends up in.
4109 */
4110 ctx = rcu_dereference(current->perf_event_ctxp);
4111 if (ctx)
4112 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
4113 rcu_read_unlock();
4114}
4115 4146
4116void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4147void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4117 struct pt_regs *regs, u64 addr) 4148 struct pt_regs *regs, u64 addr)
@@ -4123,8 +4154,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4123 if (rctx < 0) 4154 if (rctx < 0)
4124 return; 4155 return;
4125 4156
4126 data.addr = addr; 4157 perf_sample_data_init(&data, addr);
4127 data.raw = NULL;
4128 4158
4129 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4159 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4130 4160
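
[Note: perf_sample_data_init() centralizes sample initialization so fields added later get their defaults in one place instead of at every call site. At this point its shape is roughly:]

    static inline void
    perf_sample_data_init(struct perf_sample_data *data, u64 addr)
    {
            data->addr = addr;
            data->raw  = NULL;
    }
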
@@ -4138,16 +4168,28 @@ static void perf_swevent_read(struct perf_event *event)
4138static int perf_swevent_enable(struct perf_event *event) 4168static int perf_swevent_enable(struct perf_event *event)
4139{ 4169{
4140 struct hw_perf_event *hwc = &event->hw; 4170 struct hw_perf_event *hwc = &event->hw;
4171 struct perf_cpu_context *cpuctx;
4172 struct hlist_head *head;
4173
4174 cpuctx = &__get_cpu_var(perf_cpu_context);
4141 4175
4142 if (hwc->sample_period) { 4176 if (hwc->sample_period) {
4143 hwc->last_period = hwc->sample_period; 4177 hwc->last_period = hwc->sample_period;
4144 perf_swevent_set_period(event); 4178 perf_swevent_set_period(event);
4145 } 4179 }
4180
4181 head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
4182 if (WARN_ON_ONCE(!head))
4183 return -EINVAL;
4184
4185 hlist_add_head_rcu(&event->hlist_entry, head);
4186
4146 return 0; 4187 return 0;
4147} 4188}
4148 4189
4149static void perf_swevent_disable(struct perf_event *event) 4190static void perf_swevent_disable(struct perf_event *event)
4150{ 4191{
4192 hlist_del_rcu(&event->hlist_entry);
4151} 4193}
4152 4194
4153static const struct pmu perf_ops_generic = { 4195static const struct pmu perf_ops_generic = {
@@ -4169,22 +4211,14 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4169 struct perf_event *event; 4211 struct perf_event *event;
4170 u64 period; 4212 u64 period;
4171 4213
4172 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 4214 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4173 event->pmu->read(event); 4215 event->pmu->read(event);
4174 4216
4175 data.addr = 0; 4217 perf_sample_data_init(&data, 0);
4176 data.raw = NULL;
4177 data.period = event->hw.last_period; 4218 data.period = event->hw.last_period;
4178 regs = get_irq_regs(); 4219 regs = get_irq_regs();
4179 /*
4180 * In case we exclude kernel IPs or are somehow not in interrupt
4181 * context, provide the next best thing, the user IP.
4182 */
4183 if ((event->attr.exclude_kernel || !regs) &&
4184 !event->attr.exclude_user)
4185 regs = task_pt_regs(current);
4186 4220
4187 if (regs) { 4221 if (regs && !perf_exclude_event(event, regs)) {
4188 if (!(event->attr.exclude_idle && current->pid == 0)) 4222 if (!(event->attr.exclude_idle && current->pid == 0))
4189 if (perf_event_overflow(event, 0, &data, regs)) 4223 if (perf_event_overflow(event, 0, &data, regs))
4190 ret = HRTIMER_NORESTART; 4224 ret = HRTIMER_NORESTART;
@@ -4332,29 +4366,122 @@ static const struct pmu perf_ops_task_clock = {
4332 .read = task_clock_perf_event_read, 4366 .read = task_clock_perf_event_read,
4333}; 4367};
4334 4368
4369static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4370{
4371 struct swevent_hlist *hlist;
4372
4373 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
4374 kfree(hlist);
4375}
4376
4377static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
4378{
4379 struct swevent_hlist *hlist;
4380
4381 if (!cpuctx->swevent_hlist)
4382 return;
4383
4384 hlist = cpuctx->swevent_hlist;
4385 rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
4386 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4387}
4388
4389static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4390{
4391 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4392
4393 mutex_lock(&cpuctx->hlist_mutex);
4394
4395 if (!--cpuctx->hlist_refcount)
4396 swevent_hlist_release(cpuctx);
4397
4398 mutex_unlock(&cpuctx->hlist_mutex);
4399}
4400
4401static void swevent_hlist_put(struct perf_event *event)
4402{
4403 int cpu;
4404
4405 if (event->cpu != -1) {
4406 swevent_hlist_put_cpu(event, event->cpu);
4407 return;
4408 }
4409
4410 for_each_possible_cpu(cpu)
4411 swevent_hlist_put_cpu(event, cpu);
4412}
4413
4414static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4415{
4416 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4417 int err = 0;
4418
4419 mutex_lock(&cpuctx->hlist_mutex);
4420
4421 if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
4422 struct swevent_hlist *hlist;
4423
4424 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
4425 if (!hlist) {
4426 err = -ENOMEM;
4427 goto exit;
4428 }
4429 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
4430 }
4431 cpuctx->hlist_refcount++;
4432 exit:
4433 mutex_unlock(&cpuctx->hlist_mutex);
4434
4435 return err;
4436}
4437
4438static int swevent_hlist_get(struct perf_event *event)
4439{
4440 int err;
4441 int cpu, failed_cpu;
4442
4443 if (event->cpu != -1)
4444 return swevent_hlist_get_cpu(event, event->cpu);
4445
4446 get_online_cpus();
4447 for_each_possible_cpu(cpu) {
4448 err = swevent_hlist_get_cpu(event, cpu);
4449 if (err) {
4450 failed_cpu = cpu;
4451 goto fail;
4452 }
4453 }
4454 put_online_cpus();
4455
4456 return 0;
4457 fail:
4458 for_each_possible_cpu(cpu) {
4459 if (cpu == failed_cpu)
4460 break;
4461 swevent_hlist_put_cpu(event, cpu);
4462 }
4463
4464 put_online_cpus();
4465 return err;
4466}
4467
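
[Note: the hlist lifetime above is the classic RCU publish/retire pattern: writers serialize on hlist_mutex and refcount per CPU, while the lockless readers in do_perf_sw_event() are drained by call_rcu() before the memory is freed. A generic sketch, names hypothetical:]

    struct blob {
            struct rcu_head rcu;
            /* payload */
    };

    static struct blob *gp;                 /* published pointer */

    static void blob_free_rcu(struct rcu_head *head)
    {
            kfree(container_of(head, struct blob, rcu));
    }

    static void blob_retire(void)
    {
            struct blob *old = gp;

            rcu_assign_pointer(gp, NULL);   /* unpublish */
            if (old)                        /* free once readers drain */
                    call_rcu(&old->rcu, blob_free_rcu);
    }
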
4335#ifdef CONFIG_EVENT_TRACING 4468#ifdef CONFIG_EVENT_TRACING
4336 4469
4337void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4470void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4338 int entry_size) 4471 int entry_size, struct pt_regs *regs)
4339{ 4472{
4473 struct perf_sample_data data;
4340 struct perf_raw_record raw = { 4474 struct perf_raw_record raw = {
4341 .size = entry_size, 4475 .size = entry_size,
4342 .data = record, 4476 .data = record,
4343 }; 4477 };
4344 4478
4345 struct perf_sample_data data = { 4479 perf_sample_data_init(&data, addr);
4346 .addr = addr, 4480 data.raw = &raw;
4347 .raw = &raw,
4348 };
4349
4350 struct pt_regs *regs = get_irq_regs();
4351
4352 if (!regs)
4353 regs = task_pt_regs(current);
4354 4481
4355 /* Trace events already protected against recursion */ 4482 /* Trace events already protected against recursion */
4356 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4483 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4357 &data, regs); 4484 &data, regs);
4358} 4485}
4359EXPORT_SYMBOL_GPL(perf_tp_event); 4486EXPORT_SYMBOL_GPL(perf_tp_event);
4360 4487
@@ -4370,11 +4497,14 @@ static int perf_tp_event_match(struct perf_event *event,
4370 4497
4371static void tp_perf_event_destroy(struct perf_event *event) 4498static void tp_perf_event_destroy(struct perf_event *event)
4372{ 4499{
4373 ftrace_profile_disable(event->attr.config); 4500 perf_trace_disable(event->attr.config);
4501 swevent_hlist_put(event);
4374} 4502}
4375 4503
4376static const struct pmu *tp_perf_event_init(struct perf_event *event) 4504static const struct pmu *tp_perf_event_init(struct perf_event *event)
4377{ 4505{
4506 int err;
4507
4378 /* 4508 /*
4379 * Raw tracepoint data is a severe data leak, only allow root to 4509 * Raw tracepoint data is a severe data leak, only allow root to
4380 * have these. 4510 * have these.
@@ -4384,10 +4514,15 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4384 !capable(CAP_SYS_ADMIN)) 4514 !capable(CAP_SYS_ADMIN))
4385 return ERR_PTR(-EPERM); 4515 return ERR_PTR(-EPERM);
4386 4516
4387 if (ftrace_profile_enable(event->attr.config)) 4517 if (perf_trace_enable(event->attr.config))
4388 return NULL; 4518 return NULL;
4389 4519
4390 event->destroy = tp_perf_event_destroy; 4520 event->destroy = tp_perf_event_destroy;
4521 err = swevent_hlist_get(event);
4522 if (err) {
4523 perf_trace_disable(event->attr.config);
4524 return ERR_PTR(err);
4525 }
4391 4526
4392 return &perf_ops_generic; 4527 return &perf_ops_generic;
4393} 4528}
@@ -4463,8 +4598,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
4463 struct perf_sample_data sample; 4598 struct perf_sample_data sample;
4464 struct pt_regs *regs = data; 4599 struct pt_regs *regs = data;
4465 4600
4466 sample.raw = NULL; 4601 perf_sample_data_init(&sample, bp->attr.bp_addr);
4467 sample.addr = bp->attr.bp_addr;
4468 4602
4469 if (!perf_exclude_event(bp, regs)) 4603 if (!perf_exclude_event(bp, regs))
4470 perf_swevent_add(bp, 1, 1, &sample, regs); 4604 perf_swevent_add(bp, 1, 1, &sample, regs);
@@ -4489,6 +4623,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
4489 WARN_ON(event->parent); 4623 WARN_ON(event->parent);
4490 4624
4491 atomic_dec(&perf_swevent_enabled[event_id]); 4625 atomic_dec(&perf_swevent_enabled[event_id]);
4626 swevent_hlist_put(event);
4492} 4627}
4493 4628
4494static const struct pmu *sw_perf_event_init(struct perf_event *event) 4629static const struct pmu *sw_perf_event_init(struct perf_event *event)
@@ -4527,6 +4662,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4527 case PERF_COUNT_SW_ALIGNMENT_FAULTS: 4662 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4528 case PERF_COUNT_SW_EMULATION_FAULTS: 4663 case PERF_COUNT_SW_EMULATION_FAULTS:
4529 if (!event->parent) { 4664 if (!event->parent) {
4665 int err;
4666
4667 err = swevent_hlist_get(event);
4668 if (err)
4669 return ERR_PTR(err);
4670
4530 atomic_inc(&perf_swevent_enabled[event_id]); 4671 atomic_inc(&perf_swevent_enabled[event_id]);
4531 event->destroy = sw_perf_event_destroy; 4672 event->destroy = sw_perf_event_destroy;
4532 } 4673 }
@@ -4912,7 +5053,7 @@ err_fput_free_put_context:
4912 5053
4913err_free_put_context: 5054err_free_put_context:
4914 if (err < 0) 5055 if (err < 0)
4915 kfree(event); 5056 free_event(event);
4916 5057
4917err_put_context: 5058err_put_context:
4918 if (err < 0) 5059 if (err < 0)
@@ -5191,7 +5332,7 @@ void perf_event_exit_task(struct task_struct *child)
5191 * 5332 *
5192 * But since its the parent context it won't be the same instance. 5333 * But since its the parent context it won't be the same instance.
5193 */ 5334 */
5194 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); 5335 mutex_lock(&child_ctx->mutex);
5195 5336
5196again: 5337again:
5197 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, 5338 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
@@ -5392,18 +5533,37 @@ int perf_event_init_task(struct task_struct *child)
5392 return ret; 5533 return ret;
5393} 5534}
5394 5535
5536static void __init perf_event_init_all_cpus(void)
5537{
5538 int cpu;
5539 struct perf_cpu_context *cpuctx;
5540
5541 for_each_possible_cpu(cpu) {
5542 cpuctx = &per_cpu(perf_cpu_context, cpu);
5543 mutex_init(&cpuctx->hlist_mutex);
5544 __perf_event_init_context(&cpuctx->ctx, NULL);
5545 }
5546}
5547
5395static void __cpuinit perf_event_init_cpu(int cpu) 5548static void __cpuinit perf_event_init_cpu(int cpu)
5396{ 5549{
5397 struct perf_cpu_context *cpuctx; 5550 struct perf_cpu_context *cpuctx;
5398 5551
5399 cpuctx = &per_cpu(perf_cpu_context, cpu); 5552 cpuctx = &per_cpu(perf_cpu_context, cpu);
5400 __perf_event_init_context(&cpuctx->ctx, NULL);
5401 5553
5402 spin_lock(&perf_resource_lock); 5554 spin_lock(&perf_resource_lock);
5403 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 5555 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5404 spin_unlock(&perf_resource_lock); 5556 spin_unlock(&perf_resource_lock);
5405 5557
5406 hw_perf_event_setup(cpu); 5558 mutex_lock(&cpuctx->hlist_mutex);
5559 if (cpuctx->hlist_refcount > 0) {
5560 struct swevent_hlist *hlist;
5561
5562 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5563 WARN_ON_ONCE(!hlist);
5564 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
5565 }
5566 mutex_unlock(&cpuctx->hlist_mutex);
5407} 5567}
5408 5568
5409#ifdef CONFIG_HOTPLUG_CPU 5569#ifdef CONFIG_HOTPLUG_CPU
@@ -5423,6 +5583,10 @@ static void perf_event_exit_cpu(int cpu)
5423 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 5583 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5424 struct perf_event_context *ctx = &cpuctx->ctx; 5584 struct perf_event_context *ctx = &cpuctx->ctx;
5425 5585
5586 mutex_lock(&cpuctx->hlist_mutex);
5587 swevent_hlist_release(cpuctx);
5588 mutex_unlock(&cpuctx->hlist_mutex);
5589
5426 mutex_lock(&ctx->mutex); 5590 mutex_lock(&ctx->mutex);
5427 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); 5591 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5428 mutex_unlock(&ctx->mutex); 5592 mutex_unlock(&ctx->mutex);
@@ -5443,20 +5607,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5443 perf_event_init_cpu(cpu); 5607 perf_event_init_cpu(cpu);
5444 break; 5608 break;
5445 5609
5446 case CPU_ONLINE:
5447 case CPU_ONLINE_FROZEN:
5448 hw_perf_event_setup_online(cpu);
5449 break;
5450
5451 case CPU_DOWN_PREPARE: 5610 case CPU_DOWN_PREPARE:
5452 case CPU_DOWN_PREPARE_FROZEN: 5611 case CPU_DOWN_PREPARE_FROZEN:
5453 perf_event_exit_cpu(cpu); 5612 perf_event_exit_cpu(cpu);
5454 break; 5613 break;
5455 5614
5456 case CPU_DEAD:
5457 hw_perf_event_setup_offline(cpu);
5458 break;
5459
5460 default: 5615 default:
5461 break; 5616 break;
5462 } 5617 }
@@ -5474,6 +5629,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = {
5474 5629
5475void __init perf_event_init(void) 5630void __init perf_event_init(void)
5476{ 5631{
5632 perf_event_init_all_cpus();
5477 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 5633 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5478 (void *)(long)smp_processor_id()); 5634 (void *)(long)smp_processor_id());
5479 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 5635 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
@@ -5481,13 +5637,16 @@ void __init perf_event_init(void)
5481 register_cpu_notifier(&perf_cpu_nb); 5637 register_cpu_notifier(&perf_cpu_nb);
5482} 5638}
5483 5639
5484static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) 5640static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5641 struct sysdev_class_attribute *attr,
5642 char *buf)
5485{ 5643{
5486 return sprintf(buf, "%d\n", perf_reserved_percpu); 5644 return sprintf(buf, "%d\n", perf_reserved_percpu);
5487} 5645}
5488 5646
5489static ssize_t 5647static ssize_t
5490perf_set_reserve_percpu(struct sysdev_class *class, 5648perf_set_reserve_percpu(struct sysdev_class *class,
5649 struct sysdev_class_attribute *attr,
5491 const char *buf, 5650 const char *buf,
5492 size_t count) 5651 size_t count)
5493{ 5652{
@@ -5516,13 +5675,17 @@ perf_set_reserve_percpu(struct sysdev_class *class,
5516 return count; 5675 return count;
5517} 5676}
5518 5677
5519static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) 5678static ssize_t perf_show_overcommit(struct sysdev_class *class,
5679 struct sysdev_class_attribute *attr,
5680 char *buf)
5520{ 5681{
5521 return sprintf(buf, "%d\n", perf_overcommit); 5682 return sprintf(buf, "%d\n", perf_overcommit);
5522} 5683}
5523 5684
5524static ssize_t 5685static ssize_t
5525perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) 5686perf_set_overcommit(struct sysdev_class *class,
5687 struct sysdev_class_attribute *attr,
5688 const char *buf, size_t count)
5526{ 5689{
5527 unsigned long val; 5690 unsigned long val;
5528 int err; 5691 int err;
diff --git a/kernel/pid.c b/kernel/pid.c
index b08e697cd83f..aebb30d9c233 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -367,7 +367,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
367 struct task_struct *result = NULL; 367 struct task_struct *result = NULL;
368 if (pid) { 368 if (pid) {
369 struct hlist_node *first; 369 struct hlist_node *first;
370 first = rcu_dereference_check(pid->tasks[type].first, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock)); 370 first = rcu_dereference_check(pid->tasks[type].first,
371 rcu_read_lock_held() ||
372 lockdep_tasklist_lock_is_held());
371 if (first) 373 if (first)
372 result = hlist_entry(first, struct task_struct, pids[(type)].node); 374 result = hlist_entry(first, struct task_struct, pids[(type)].node);
373 } 375 }
@@ -376,7 +378,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
376EXPORT_SYMBOL(pid_task); 378EXPORT_SYMBOL(pid_task);
377 379
378/* 380/*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 381 * Must be called under rcu_read_lock().
380 */ 382 */
381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 383struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382{ 384{
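
[Note: rcu_dereference_check() lets the caller state exactly which conditions make a lockless dereference legal; lockdep then complains only when none of them holds. A hedged usage sketch, with gp and my_lock hypothetical:]

    struct foo *p;

    rcu_read_lock();
    p = rcu_dereference_check(gp,
                              rcu_read_lock_held() ||
                              lockdep_is_held(&my_lock));
    /* ... use p ... */
    rcu_read_unlock();
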
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 86b3796b0436..a5aff94e1f0b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -13,6 +13,7 @@
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h>
16 17
17#define BITS_PER_PAGE (PAGE_SIZE*8) 18#define BITS_PER_PAGE (PAGE_SIZE*8)
18 19
@@ -161,13 +162,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
161 rcu_read_lock(); 162 rcu_read_lock();
162 163
163 /* 164 /*
164 * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring 165 * Any nested-container's init processes won't ignore the
165 * any nested-container's init processes don't ignore the 166 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
166 * signal
167 */ 167 */
168 task = pid_task(find_vpid(nr), PIDTYPE_PID); 168 task = pid_task(find_vpid(nr), PIDTYPE_PID);
169 if (task) 169 if (task)
170 force_sig(SIGKILL, task); 170 send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
171 171
172 rcu_read_unlock(); 172 rcu_read_unlock();
173 173
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 438ff4523513..bc7704b3a443 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -982,6 +982,7 @@ static void check_thread_timers(struct task_struct *tsk,
982 int maxfire; 982 int maxfire;
983 struct list_head *timers = tsk->cpu_timers; 983 struct list_head *timers = tsk->cpu_timers;
984 struct signal_struct *const sig = tsk->signal; 984 struct signal_struct *const sig = tsk->signal;
985 unsigned long soft;
985 986
986 maxfire = 20; 987 maxfire = 20;
987 tsk->cputime_expires.prof_exp = cputime_zero; 988 tsk->cputime_expires.prof_exp = cputime_zero;
@@ -1030,9 +1031,10 @@ static void check_thread_timers(struct task_struct *tsk,
1030 /* 1031 /*
1031 * Check for the special case thread timers. 1032 * Check for the special case thread timers.
1032 */ 1033 */
1033 if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { 1034 soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
1034 unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; 1035 if (soft != RLIM_INFINITY) {
1035 unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; 1036 unsigned long hard =
1037 ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
1036 1038
1037 if (hard != RLIM_INFINITY && 1039 if (hard != RLIM_INFINITY &&
1038 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { 1040 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -1043,14 +1045,13 @@ static void check_thread_timers(struct task_struct *tsk,
1043 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1045 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1044 return; 1046 return;
1045 } 1047 }
1046 if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { 1048 if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
1047 /* 1049 /*
1048 * At the soft limit, send a SIGXCPU every second. 1050 * At the soft limit, send a SIGXCPU every second.
1049 */ 1051 */
1050 if (sig->rlim[RLIMIT_RTTIME].rlim_cur 1052 if (soft < hard) {
1051 < sig->rlim[RLIMIT_RTTIME].rlim_max) { 1053 soft += USEC_PER_SEC;
1052 sig->rlim[RLIMIT_RTTIME].rlim_cur += 1054 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
1053 USEC_PER_SEC;
1054 } 1055 }
1055 printk(KERN_INFO 1056 printk(KERN_INFO
1056 "RT Watchdog Timeout: %s[%d]\n", 1057 "RT Watchdog Timeout: %s[%d]\n",
@@ -1060,9 +1061,9 @@ static void check_thread_timers(struct task_struct *tsk,
1060 } 1061 }
1061} 1062}
1062 1063
1063static void stop_process_timers(struct task_struct *tsk) 1064static void stop_process_timers(struct signal_struct *sig)
1064{ 1065{
1065 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 1066 struct thread_group_cputimer *cputimer = &sig->cputimer;
1066 unsigned long flags; 1067 unsigned long flags;
1067 1068
1068 if (!cputimer->running) 1069 if (!cputimer->running)
@@ -1071,6 +1072,10 @@ static void stop_process_timers(struct task_struct *tsk)
1071 spin_lock_irqsave(&cputimer->lock, flags); 1072 spin_lock_irqsave(&cputimer->lock, flags);
1072 cputimer->running = 0; 1073 cputimer->running = 0;
1073 spin_unlock_irqrestore(&cputimer->lock, flags); 1074 spin_unlock_irqrestore(&cputimer->lock, flags);
1075
1076 sig->cputime_expires.prof_exp = cputime_zero;
1077 sig->cputime_expires.virt_exp = cputime_zero;
1078 sig->cputime_expires.sched_exp = 0;
1074} 1079}
1075 1080
1076static u32 onecputick; 1081static u32 onecputick;
@@ -1121,6 +1126,7 @@ static void check_process_timers(struct task_struct *tsk,
1121 unsigned long long sum_sched_runtime, sched_expires; 1126 unsigned long long sum_sched_runtime, sched_expires;
1122 struct list_head *timers = sig->cpu_timers; 1127 struct list_head *timers = sig->cpu_timers;
1123 struct task_cputime cputime; 1128 struct task_cputime cputime;
1129 unsigned long soft;
1124 1130
1125 /* 1131 /*
1126 * Don't sample the current process CPU clocks if there are no timers. 1132 * Don't sample the current process CPU clocks if there are no timers.
@@ -1131,7 +1137,7 @@ static void check_process_timers(struct task_struct *tsk,
1131 list_empty(&timers[CPUCLOCK_VIRT]) && 1137 list_empty(&timers[CPUCLOCK_VIRT]) &&
1132 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && 1138 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1133 list_empty(&timers[CPUCLOCK_SCHED])) { 1139 list_empty(&timers[CPUCLOCK_SCHED])) {
1134 stop_process_timers(tsk); 1140 stop_process_timers(sig);
1135 return; 1141 return;
1136 } 1142 }
1137 1143
@@ -1193,11 +1199,13 @@ static void check_process_timers(struct task_struct *tsk,
1193 SIGPROF); 1199 SIGPROF);
1194 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, 1200 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1195 SIGVTALRM); 1201 SIGVTALRM);
1196 1202 soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1197 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 1203 if (soft != RLIM_INFINITY) {
1198 unsigned long psecs = cputime_to_secs(ptime); 1204 unsigned long psecs = cputime_to_secs(ptime);
1205 unsigned long hard =
1206 ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
1199 cputime_t x; 1207 cputime_t x;
1200 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { 1208 if (psecs >= hard) {
1201 /* 1209 /*
1202 * At the hard limit, we just die. 1210 * At the hard limit, we just die.
1203 * No need to calculate anything else now. 1211 * No need to calculate anything else now.
@@ -1205,17 +1213,17 @@ static void check_process_timers(struct task_struct *tsk,
1205 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1213 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1206 return; 1214 return;
1207 } 1215 }
1208 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { 1216 if (psecs >= soft) {
1209 /* 1217 /*
1210 * At the soft limit, send a SIGXCPU every second. 1218 * At the soft limit, send a SIGXCPU every second.
1211 */ 1219 */
1212 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 1220 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1213 if (sig->rlim[RLIMIT_CPU].rlim_cur 1221 if (soft < hard) {
1214 < sig->rlim[RLIMIT_CPU].rlim_max) { 1222 soft++;
1215 sig->rlim[RLIMIT_CPU].rlim_cur++; 1223 sig->rlim[RLIMIT_CPU].rlim_cur = soft;
1216 } 1224 }
1217 } 1225 }
1218 x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 1226 x = secs_to_cputime(soft);
1219 if (cputime_eq(prof_expires, cputime_zero) || 1227 if (cputime_eq(prof_expires, cputime_zero) ||
1220 cputime_lt(x, prof_expires)) { 1228 cputime_lt(x, prof_expires)) {
1221 prof_expires = x; 1229 prof_expires = x;
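
[Note: both hunks snapshot the soft and hard limits once with ACCESS_ONCE() so every comparison and the follow-up arithmetic see the same value even if another thread resizes the rlimit concurrently. The idiom, with 'usage' standing in for the accumulated time:]

    static void check_limits(struct signal_struct *sig, unsigned long usage)
    {
            unsigned long soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
            unsigned long hard = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);

            if (hard != RLIM_INFINITY && usage >= hard)
                    /* hard-limit action */;
            if (soft != RLIM_INFINITY && usage >= soft)
                    /* soft-limit action; same snapshot as the test */;
    }
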
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index bbfe472d7524..aa9e916da4d5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -22,6 +22,7 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/gfp.h>
25#include <scsi/scsi_scan.h> 26#include <scsi/scsi_scan.h>
26#include <asm/suspend.h> 27#include <asm/suspend.h>
27 28
@@ -323,6 +324,7 @@ static int create_image(int platform_mode)
323int hibernation_snapshot(int platform_mode) 324int hibernation_snapshot(int platform_mode)
324{ 325{
325 int error; 326 int error;
327 gfp_t saved_mask;
326 328
327 error = platform_begin(platform_mode); 329 error = platform_begin(platform_mode);
328 if (error) 330 if (error)
@@ -334,6 +336,7 @@ int hibernation_snapshot(int platform_mode)
334 goto Close; 336 goto Close;
335 337
336 suspend_console(); 338 suspend_console();
339 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
337 error = dpm_suspend_start(PMSG_FREEZE); 340 error = dpm_suspend_start(PMSG_FREEZE);
338 if (error) 341 if (error)
339 goto Recover_platform; 342 goto Recover_platform;
@@ -351,6 +354,7 @@ int hibernation_snapshot(int platform_mode)
351 354
352 dpm_resume_end(in_suspend ? 355 dpm_resume_end(in_suspend ?
353 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 356 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
357 set_gfp_allowed_mask(saved_mask);
354 resume_console(); 358 resume_console();
355 Close: 359 Close:
356 platform_end(platform_mode); 360 platform_end(platform_mode);
@@ -445,14 +449,17 @@ static int resume_target_kernel(bool platform_mode)
445int hibernation_restore(int platform_mode) 449int hibernation_restore(int platform_mode)
446{ 450{
447 int error; 451 int error;
452 gfp_t saved_mask;
448 453
449 pm_prepare_console(); 454 pm_prepare_console();
450 suspend_console(); 455 suspend_console();
456 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
451 error = dpm_suspend_start(PMSG_QUIESCE); 457 error = dpm_suspend_start(PMSG_QUIESCE);
452 if (!error) { 458 if (!error) {
453 error = resume_target_kernel(platform_mode); 459 error = resume_target_kernel(platform_mode);
454 dpm_resume_end(PMSG_RECOVER); 460 dpm_resume_end(PMSG_RECOVER);
455 } 461 }
462 set_gfp_allowed_mask(saved_mask);
456 resume_console(); 463 resume_console();
457 pm_restore_console(); 464 pm_restore_console();
458 return error; 465 return error;
@@ -466,6 +473,7 @@ int hibernation_restore(int platform_mode)
466int hibernation_platform_enter(void) 473int hibernation_platform_enter(void)
467{ 474{
468 int error; 475 int error;
476 gfp_t saved_mask;
469 477
470 if (!hibernation_ops) 478 if (!hibernation_ops)
471 return -ENOSYS; 479 return -ENOSYS;
@@ -481,6 +489,7 @@ int hibernation_platform_enter(void)
481 489
482 entering_platform_hibernation = true; 490 entering_platform_hibernation = true;
483 suspend_console(); 491 suspend_console();
492 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
484 error = dpm_suspend_start(PMSG_HIBERNATE); 493 error = dpm_suspend_start(PMSG_HIBERNATE);
485 if (error) { 494 if (error) {
486 if (hibernation_ops->recover) 495 if (hibernation_ops->recover)
@@ -518,6 +527,7 @@ int hibernation_platform_enter(void)
518 Resume_devices: 527 Resume_devices:
519 entering_platform_hibernation = false; 528 entering_platform_hibernation = false;
520 dpm_resume_end(PMSG_RESTORE); 529 dpm_resume_end(PMSG_RESTORE);
530 set_gfp_allowed_mask(saved_mask);
521 resume_console(); 531 resume_console();
522 532
523 Close: 533 Close:
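
[Note: every suspend/resume path now brackets dpm_suspend_start() .. dpm_resume_end() with clear_gfp_allowed_mask(GFP_IOFS) / set_gfp_allowed_mask(), so no allocation can recurse into block I/O or the filesystem while devices are quiesced. A sketch of what the helpers plausibly do; the real ones live in mm/page_alloc.c:]

    gfp_t gfp_allowed_mask = __GFP_BITS_MASK;

    gfp_t clear_gfp_allowed_mask(gfp_t mask)
    {
            gfp_t ret = gfp_allowed_mask;

            gfp_allowed_mask &= ~mask;      /* e.g. drop __GFP_IO|__GFP_FS */
            return ret;
    }

    void set_gfp_allowed_mask(gfp_t mask)
    {
            gfp_allowed_mask = mask;        /* restore the saved mask */
    }
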
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c
index 39ac698ef836..fdcad9ed5a7b 100644
--- a/kernel/power/hibernate_nvs.c
+++ b/kernel/power/hibernate_nvs.c
@@ -10,6 +10,7 @@
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/list.h> 11#include <linux/list.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/slab.h>
13#include <linux/suspend.h> 14#include <linux/suspend.h>
14 15
15/* 16/*
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5ade1bdcf366..71ae29052ab6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -88,12 +88,11 @@ static int try_to_freeze_tasks(bool sig_only)
88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " 88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
89 "(%d tasks refusing to freeze):\n", 89 "(%d tasks refusing to freeze):\n",
90 elapsed_csecs / 100, elapsed_csecs % 100, todo); 90 elapsed_csecs / 100, elapsed_csecs % 100, todo);
91 show_state();
92 read_lock(&tasklist_lock); 91 read_lock(&tasklist_lock);
93 do_each_thread(g, p) { 92 do_each_thread(g, p) {
94 task_lock(p); 93 task_lock(p);
95 if (freezing(p) && !freezer_should_skip(p)) 94 if (freezing(p) && !freezer_should_skip(p))
96 printk(KERN_ERR " %s\n", p->comm); 95 sched_show_task(p);
97 cancel_freezing(p); 96 cancel_freezing(p);
98 task_unlock(p); 97 task_unlock(p);
99 } while_each_thread(g, p); 98 } while_each_thread(g, p);
@@ -145,7 +144,7 @@ static void thaw_tasks(bool nosig_only)
145 if (nosig_only && should_send_signal(p)) 144 if (nosig_only && should_send_signal(p))
146 continue; 145 continue;
147 146
148 if (cgroup_frozen(p)) 147 if (cgroup_freezing_or_frozen(p))
149 continue; 148 continue;
150 149
151 thaw_process(p); 150 thaw_process(p);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 830cadecbdfc..be861c26dda7 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -26,6 +26,7 @@
26#include <linux/console.h> 26#include <linux/console.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/slab.h>
29 30
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/mmu_context.h> 32#include <asm/mmu_context.h>
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6f10dfc2d3e9..56e7dbb8b996 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -15,6 +15,7 @@
15#include <linux/console.h> 15#include <linux/console.h>
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/gfp.h>
18 19
19#include "power.h" 20#include "power.h"
20 21
@@ -189,6 +190,7 @@ static int suspend_enter(suspend_state_t state)
189int suspend_devices_and_enter(suspend_state_t state) 190int suspend_devices_and_enter(suspend_state_t state)
190{ 191{
191 int error; 192 int error;
193 gfp_t saved_mask;
192 194
193 if (!suspend_ops) 195 if (!suspend_ops)
194 return -ENOSYS; 196 return -ENOSYS;
@@ -199,6 +201,7 @@ int suspend_devices_and_enter(suspend_state_t state)
199 goto Close; 201 goto Close;
200 } 202 }
201 suspend_console(); 203 suspend_console();
204 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
202 suspend_test_start(); 205 suspend_test_start();
203 error = dpm_suspend_start(PMSG_SUSPEND); 206 error = dpm_suspend_start(PMSG_SUSPEND);
204 if (error) { 207 if (error) {
@@ -215,6 +218,7 @@ int suspend_devices_and_enter(suspend_state_t state)
215 suspend_test_start(); 218 suspend_test_start();
216 dpm_resume_end(PMSG_RESUME); 219 dpm_resume_end(PMSG_RESUME);
217 suspend_test_finish("resume devices"); 220 suspend_test_finish("resume devices");
221 set_gfp_allowed_mask(saved_mask);
218 resume_console(); 222 resume_console();
219 Close: 223 Close:
220 if (suspend_ops->end) 224 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 1d575733d4e1..66824d71983a 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -23,6 +23,7 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/swapops.h> 24#include <linux/swapops.h>
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/slab.h>
26 27
27#include "power.h" 28#include "power.h"
28 29
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 4d2289626a84..a8c96212bc1b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -420,7 +420,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
420 * User space encodes device types as two-byte values, 420 * User space encodes device types as two-byte values,
421 * so we need to recode them 421 * so we need to recode them
422 */ 422 */
423 swdev = old_decode_dev(swap_area.dev); 423 swdev = new_decode_dev(swap_area.dev);
424 if (swdev) { 424 if (swdev) {
425 offset = swap_area.offset; 425 offset = swap_area.offset;
426 data->swap = swap_type_of(swdev, offset, NULL); 426 data->swap = swap_type_of(swdev, offset, NULL);
diff --git a/kernel/printk.c b/kernel/printk.c
index 1751c456b71f..75077ad0b537 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -35,6 +35,7 @@
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h> 36#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h> 37#include <linux/kmsg_dump.h>
38#include <linux/syslog.h>
38 39
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40 41
@@ -69,8 +70,6 @@ int console_printk[4] = {
69 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 70 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
70}; 71};
71 72
72static int saved_console_loglevel = -1;
73
74/* 73/*
75 * Low level drivers may need that to know if they can schedule in 74 * Low level drivers may need that to know if they can schedule in
76 * their unblank() callback or not. So let's export it. 75 * their unblank() callback or not. So let's export it.
@@ -145,6 +144,7 @@ static char __log_buf[__LOG_BUF_LEN];
145static char *log_buf = __log_buf; 144static char *log_buf = __log_buf;
146static int log_buf_len = __LOG_BUF_LEN; 145static int log_buf_len = __LOG_BUF_LEN;
147static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 146static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
147static int saved_console_loglevel = -1;
148 148
149#ifdef CONFIG_KEXEC 149#ifdef CONFIG_KEXEC
150/* 150/*
@@ -258,38 +258,23 @@ static inline void boot_delay_msec(void)
258} 258}
259#endif 259#endif
260 260
261/* 261int do_syslog(int type, char __user *buf, int len, bool from_file)
262 * Commands to do_syslog:
263 *
264 * 0 -- Close the log. Currently a NOP.
265 * 1 -- Open the log. Currently a NOP.
266 * 2 -- Read from the log.
267 * 3 -- Read all messages remaining in the ring buffer.
268 * 4 -- Read and clear all messages remaining in the ring buffer
269 * 5 -- Clear ring buffer.
270 * 6 -- Disable printk's to console
271 * 7 -- Enable printk's to console
272 * 8 -- Set level of messages printed to console
273 * 9 -- Return number of unread characters in the log buffer
274 * 10 -- Return size of the log buffer
275 */
276int do_syslog(int type, char __user *buf, int len)
277{ 262{
278 unsigned i, j, limit, count; 263 unsigned i, j, limit, count;
279 int do_clear = 0; 264 int do_clear = 0;
280 char c; 265 char c;
281 int error = 0; 266 int error = 0;
282 267
283 error = security_syslog(type); 268 error = security_syslog(type, from_file);
284 if (error) 269 if (error)
285 return error; 270 return error;
286 271
287 switch (type) { 272 switch (type) {
288 case 0: /* Close log */ 273 case SYSLOG_ACTION_CLOSE: /* Close log */
289 break; 274 break;
290 case 1: /* Open log */ 275 case SYSLOG_ACTION_OPEN: /* Open log */
291 break; 276 break;
292 case 2: /* Read from log */ 277 case SYSLOG_ACTION_READ: /* Read from log */
293 error = -EINVAL; 278 error = -EINVAL;
294 if (!buf || len < 0) 279 if (!buf || len < 0)
295 goto out; 280 goto out;
@@ -320,10 +305,12 @@ int do_syslog(int type, char __user *buf, int len)
320 if (!error) 305 if (!error)
321 error = i; 306 error = i;
322 break; 307 break;
323 case 4: /* Read/clear last kernel messages */ 308 /* Read/clear last kernel messages */
309 case SYSLOG_ACTION_READ_CLEAR:
324 do_clear = 1; 310 do_clear = 1;
325 /* FALL THRU */ 311 /* FALL THRU */
326 case 3: /* Read last kernel messages */ 312 /* Read last kernel messages */
313 case SYSLOG_ACTION_READ_ALL:
327 error = -EINVAL; 314 error = -EINVAL;
328 if (!buf || len < 0) 315 if (!buf || len < 0)
329 goto out; 316 goto out;
@@ -376,21 +363,25 @@ int do_syslog(int type, char __user *buf, int len)
376 } 363 }
377 } 364 }
378 break; 365 break;
379 case 5: /* Clear ring buffer */ 366 /* Clear ring buffer */
367 case SYSLOG_ACTION_CLEAR:
380 logged_chars = 0; 368 logged_chars = 0;
381 break; 369 break;
382 case 6: /* Disable logging to console */ 370 /* Disable logging to console */
371 case SYSLOG_ACTION_CONSOLE_OFF:
383 if (saved_console_loglevel == -1) 372 if (saved_console_loglevel == -1)
384 saved_console_loglevel = console_loglevel; 373 saved_console_loglevel = console_loglevel;
385 console_loglevel = minimum_console_loglevel; 374 console_loglevel = minimum_console_loglevel;
386 break; 375 break;
387 case 7: /* Enable logging to console */ 376 /* Enable logging to console */
377 case SYSLOG_ACTION_CONSOLE_ON:
388 if (saved_console_loglevel != -1) { 378 if (saved_console_loglevel != -1) {
389 console_loglevel = saved_console_loglevel; 379 console_loglevel = saved_console_loglevel;
390 saved_console_loglevel = -1; 380 saved_console_loglevel = -1;
391 } 381 }
392 break; 382 break;
393 case 8: /* Set level of messages printed to console */ 383 /* Set level of messages printed to console */
384 case SYSLOG_ACTION_CONSOLE_LEVEL:
394 error = -EINVAL; 385 error = -EINVAL;
395 if (len < 1 || len > 8) 386 if (len < 1 || len > 8)
396 goto out; 387 goto out;
@@ -401,10 +392,12 @@ int do_syslog(int type, char __user *buf, int len)
401 saved_console_loglevel = -1; 392 saved_console_loglevel = -1;
402 error = 0; 393 error = 0;
403 break; 394 break;
404 case 9: /* Number of chars in the log buffer */ 395 /* Number of chars in the log buffer */
396 case SYSLOG_ACTION_SIZE_UNREAD:
405 error = log_end - log_start; 397 error = log_end - log_start;
406 break; 398 break;
407 case 10: /* Size of the log buffer */ 399 /* Size of the log buffer */
400 case SYSLOG_ACTION_SIZE_BUFFER:
408 error = log_buf_len; 401 error = log_buf_len;
409 break; 402 break;
410 default: 403 default:
@@ -417,7 +410,7 @@ out:
417 410
418SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 411SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
419{ 412{
420 return do_syslog(type, buf, len); 413 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
421} 414}
422 415
423/* 416/*
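
[Note: the magic case numbers become the SYSLOG_ACTION_* constants from the new <linux/syslog.h>; the values are unchanged, so the syscall ABI is untouched. The mapping, as listed in the comment removed above:]

    enum {
            SYSLOG_ACTION_CLOSE         = 0,
            SYSLOG_ACTION_OPEN          = 1,
            SYSLOG_ACTION_READ          = 2,
            SYSLOG_ACTION_READ_ALL      = 3,
            SYSLOG_ACTION_READ_CLEAR    = 4,
            SYSLOG_ACTION_CLEAR         = 5,
            SYSLOG_ACTION_CONSOLE_OFF   = 6,
            SYSLOG_ACTION_CONSOLE_ON    = 7,
            SYSLOG_ACTION_CONSOLE_LEVEL = 8,
            SYSLOG_ACTION_SIZE_UNREAD   = 9,
            SYSLOG_ACTION_SIZE_BUFFER   = 10,
    };
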
diff --git a/kernel/profile.c b/kernel/profile.c
index a55d3a367ae8..dfadc5b729f1 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -127,8 +127,10 @@ int __ref profile_init(void)
127 return 0; 127 return 0;
128 128
129 prof_buffer = vmalloc(buffer_bytes); 129 prof_buffer = vmalloc(buffer_bytes);
130 if (prof_buffer) 130 if (prof_buffer) {
131 memset(prof_buffer, 0, buffer_bytes);
131 return 0; 132 return 0;
133 }
132 134
133 free_cpumask_var(prof_cpu_mask); 135 free_cpumask_var(prof_cpu_mask);
134 return -ENOMEM; 136 return -ENOMEM;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 42ad8ae729a0..6af9cdd558b7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -14,7 +14,6 @@
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/highmem.h> 15#include <linux/highmem.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18#include <linux/ptrace.h> 17#include <linux/ptrace.h>
19#include <linux/security.h> 18#include <linux/security.h>
20#include <linux/signal.h> 19#include <linux/signal.h>
@@ -76,7 +75,6 @@ void __ptrace_unlink(struct task_struct *child)
76 child->parent = child->real_parent; 75 child->parent = child->real_parent;
77 list_del_init(&child->ptrace_entry); 76 list_del_init(&child->ptrace_entry);
78 77
79 arch_ptrace_untrace(child);
80 if (task_is_traced(child)) 78 if (task_is_traced(child))
81 ptrace_untrace(child); 79 ptrace_untrace(child);
82} 80}
@@ -666,10 +664,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
666 struct task_struct *child; 664 struct task_struct *child;
667 long ret; 665 long ret;
668 666
669 /*
670 * This lock_kernel fixes a subtle race with suid exec
671 */
672 lock_kernel();
673 if (request == PTRACE_TRACEME) { 667 if (request == PTRACE_TRACEME) {
674 ret = ptrace_traceme(); 668 ret = ptrace_traceme();
675 if (!ret) 669 if (!ret)
@@ -703,7 +697,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
703 out_put_task_struct: 697 out_put_task_struct:
704 put_task_struct(child); 698 put_task_struct(child);
705 out: 699 out:
706 unlock_kernel();
707 return ret; 700 return ret;
708} 701}
709 702
@@ -813,10 +806,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
813 struct task_struct *child; 806 struct task_struct *child;
814 long ret; 807 long ret;
815 808
816 /*
817 * This lock_kernel fixes a subtle race with suid exec
818 */
819 lock_kernel();
820 if (request == PTRACE_TRACEME) { 809 if (request == PTRACE_TRACEME) {
821 ret = ptrace_traceme(); 810 ret = ptrace_traceme();
822 goto out; 811 goto out;
@@ -846,7 +835,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
846 out_put_task_struct: 835 out_put_task_struct:
847 put_task_struct(child); 836 put_task_struct(child);
848 out: 837 out:
849 unlock_kernel();
850 return ret; 838 return ret;
851} 839}
852#endif /* CONFIG_COMPAT */ 840#endif /* CONFIG_COMPAT */
diff --git a/kernel/range.c b/kernel/range.c
new file mode 100644
index 000000000000..74e2e6114927
--- /dev/null
+++ b/kernel/range.c
@@ -0,0 +1,163 @@
1/*
2 * Range add and subtract
3 */
4#include <linux/module.h>
5#include <linux/init.h>
6#include <linux/sort.h>
7
8#include <linux/range.h>
9
10#ifndef ARRAY_SIZE
11#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
12#endif
13
14int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
15{
16 if (start >= end)
17 return nr_range;
18
19 /* Out of slots: */
20 if (nr_range >= az)
21 return nr_range;
22
23 range[nr_range].start = start;
24 range[nr_range].end = end;
25
26 nr_range++;
27
28 return nr_range;
29}
30
31int add_range_with_merge(struct range *range, int az, int nr_range,
32 u64 start, u64 end)
33{
34 int i;
35
36 if (start >= end)
37 return nr_range;
38
39	/* Try to merge it with an existing one: */
40 for (i = 0; i < nr_range; i++) {
41 u64 final_start, final_end;
42 u64 common_start, common_end;
43
44 if (!range[i].end)
45 continue;
46
47 common_start = max(range[i].start, start);
48 common_end = min(range[i].end, end);
49 if (common_start > common_end)
50 continue;
51
52 final_start = min(range[i].start, start);
53 final_end = max(range[i].end, end);
54
55 range[i].start = final_start;
56 range[i].end = final_end;
57 return nr_range;
58 }
59
60 /* Need to add it: */
61 return add_range(range, az, nr_range, start, end);
62}
63
64void subtract_range(struct range *range, int az, u64 start, u64 end)
65{
66 int i, j;
67
68 if (start >= end)
69 return;
70
71 for (j = 0; j < az; j++) {
72 if (!range[j].end)
73 continue;
74
75 if (start <= range[j].start && end >= range[j].end) {
76 range[j].start = 0;
77 range[j].end = 0;
78 continue;
79 }
80
81 if (start <= range[j].start && end < range[j].end &&
82 range[j].start < end) {
83 range[j].start = end;
84 continue;
85 }
86
87
88 if (start > range[j].start && end >= range[j].end &&
89 range[j].end > start) {
90 range[j].end = start;
91 continue;
92 }
93
94 if (start > range[j].start && end < range[j].end) {
95 /* Find the new spare: */
96 for (i = 0; i < az; i++) {
97 if (range[i].end == 0)
98 break;
99 }
100 if (i < az) {
101 range[i].end = range[j].end;
102 range[i].start = end;
103 } else {
104			printk(KERN_ERR "ran out of slots in ranges\n");
105 }
106 range[j].end = start;
107 continue;
108 }
109 }
110}
111
112static int cmp_range(const void *x1, const void *x2)
113{
114 const struct range *r1 = x1;
115 const struct range *r2 = x2;
116 s64 start1, start2;
117
118 start1 = r1->start;
119 start2 = r2->start;
120
121 return start1 - start2;
122}
123
124int clean_sort_range(struct range *range, int az)
125{
126 int i, j, k = az - 1, nr_range = 0;
127
128 for (i = 0; i < k; i++) {
129 if (range[i].end)
130 continue;
131 for (j = k; j > i; j--) {
132 if (range[j].end) {
133 k = j;
134 break;
135 }
136 }
137 if (j == i)
138 break;
139 range[i].start = range[k].start;
140 range[i].end = range[k].end;
141 range[k].start = 0;
142 range[k].end = 0;
143 k--;
144 }
145 /* count it */
146 for (i = 0; i < az; i++) {
147 if (!range[i].end) {
148 nr_range = i;
149 break;
150 }
151 }
152
153 /* sort them */
154 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
155
156 return nr_range;
157}
158
159void sort_range(struct range *range, int nr_range)
160{
161 /* sort them */
162 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
163}
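
[Note: hypothetical usage of the new helpers: ranges merge on insert, subtraction can split one range into two (consuming a spare slot), and clean_sort_range() compacts the zeroed holes and sorts by start:]

    #include <linux/range.h>

    #define MAX_RANGES 8

    static struct range r[MAX_RANGES];

    static int build(void)
    {
            int nr = 0;

            nr = add_range_with_merge(r, MAX_RANGES, nr, 0x1000, 0x3000);
            nr = add_range_with_merge(r, MAX_RANGES, nr, 0x2000, 0x5000);
            /* merged: one range [0x1000, 0x5000) */
            subtract_range(r, MAX_RANGES, 0x2000, 0x2800);
            /* split: [0x1000, 0x2000) and [0x2800, 0x5000) */
            return clean_sort_range(r, MAX_RANGES);  /* returns 2 */
    }
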
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f1125c1a6321..72a8dc9567f5 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,7 +44,7 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h> 47#include <linux/hardirq.h>
48 48
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 50static struct lock_class_key rcu_lock_key;
@@ -63,23 +63,34 @@ struct lockdep_map rcu_sched_lock_map =
63EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 63EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
64#endif 64#endif
65 65
66int rcu_scheduler_active __read_mostly; 66#ifdef CONFIG_DEBUG_LOCK_ALLOC
67EXPORT_SYMBOL_GPL(rcu_scheduler_active);
68 67
69/* 68int debug_lockdep_rcu_enabled(void)
70 * This function is invoked towards the end of the scheduler's initialization 69{
71 * process. Before this is called, the idle task might contain 70 return rcu_scheduler_active && debug_locks &&
72 * RCU read-side critical sections (during which time, this idle 71 current->lockdep_recursion == 0;
73 * task is booting the system). After this function is called, the 72}
74 * idle tasks are prohibited from containing RCU read-side critical 73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
75 * sections. 74
75/**
76 * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
77 *
78 * Check for bottom half being disabled, which covers both the
79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses
80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
81 * will show the situation.
82 *
83 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
76 */ 84 */
77void rcu_scheduler_starting(void) 85int rcu_read_lock_bh_held(void)
78{ 86{
79 WARN_ON(num_online_cpus() != 1); 87 if (!debug_lockdep_rcu_enabled())
80 WARN_ON(nr_context_switches() > 0); 88 return 1;
81 rcu_scheduler_active = 1; 89 return in_softirq();
82} 90}
91EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
92
93#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
83 94
84/* 95/*
85 * Awaken the corresponding synchronize_rcu() instance now that a 96 * Awaken the corresponding synchronize_rcu() instance now that a
@@ -92,3 +103,14 @@ void wakeme_after_rcu(struct rcu_head *head)
92 rcu = container_of(head, struct rcu_synchronize, head); 103 rcu = container_of(head, struct rcu_synchronize, head);
93 complete(&rcu->completion); 104 complete(&rcu->completion);
94} 105}
106
107#ifdef CONFIG_PROVE_RCU
108/*
109 * wrapper function to avoid #include problems.
110 */
111int rcu_my_thread_group_empty(void)
112{
113 return thread_group_empty(current);
114}
115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
116#endif /* #ifdef CONFIG_PROVE_RCU */
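
[Note: rcu_read_lock_bh_held() gives rcu_dereference_check() callers a bh-side condition to assert; when lockdep is unavailable it conservatively returns 1. A hedged usage sketch, gp hypothetical:]

    struct foo *p;

    rcu_read_lock_bh();
    p = rcu_dereference_check(gp, rcu_read_lock_bh_held());
    /* ... use p under bh protection ... */
    rcu_read_unlock_bh();
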
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 9f6d9ff2572c..38729d3cd236 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -44,9 +44,9 @@ struct rcu_ctrlblk {
44}; 44};
45 45
46/* Definition for rcupdate control block. */ 46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_ctrlblk = { 47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_ctrlblk.rcucblist, 48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_ctrlblk.rcucblist, 49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50}; 50};
51 51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = { 52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
@@ -54,6 +54,11 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
54 .curtail = &rcu_bh_ctrlblk.rcucblist, 54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55}; 55};
56 56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61
57#ifdef CONFIG_NO_HZ 62#ifdef CONFIG_NO_HZ
58 63
59static long rcu_dynticks_nesting = 1; 64static long rcu_dynticks_nesting = 1;
@@ -108,7 +113,8 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
108 */ 113 */
109void rcu_sched_qs(int cpu) 114void rcu_sched_qs(int cpu)
110{ 115{
111 if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) 116 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
117 rcu_qsctr_help(&rcu_bh_ctrlblk))
112 raise_softirq(RCU_SOFTIRQ); 118 raise_softirq(RCU_SOFTIRQ);
113} 119}
114 120
@@ -173,7 +179,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
173 */ 179 */
174static void rcu_process_callbacks(struct softirq_action *unused) 180static void rcu_process_callbacks(struct softirq_action *unused)
175{ 181{
176 __rcu_process_callbacks(&rcu_ctrlblk); 182 __rcu_process_callbacks(&rcu_sched_ctrlblk);
177 __rcu_process_callbacks(&rcu_bh_ctrlblk); 183 __rcu_process_callbacks(&rcu_bh_ctrlblk);
178} 184}
179 185
@@ -187,7 +193,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
187 * 193 *
188 * Cool, huh? (Due to Josh Triplett.) 194 * Cool, huh? (Due to Josh Triplett.)
189 * 195 *
190 * But we want to make this a static inline later. 196 * But we want to make this a static inline later. The cond_resched()
197 * currently makes this problematic.
191 */ 198 */
192void synchronize_sched(void) 199void synchronize_sched(void)
193{ 200{
@@ -195,12 +202,6 @@ void synchronize_sched(void)
195} 202}
196EXPORT_SYMBOL_GPL(synchronize_sched); 203EXPORT_SYMBOL_GPL(synchronize_sched);
197 204
198void synchronize_rcu_bh(void)
199{
200 synchronize_sched();
201}
202EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
203
204/* 205/*
205 * Helper function for call_rcu() and call_rcu_bh(). 206 * Helper function for call_rcu() and call_rcu_bh().
206 */ 207 */
@@ -226,7 +227,7 @@ static void __call_rcu(struct rcu_head *head,
226 */ 227 */
227void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 228void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
228{ 229{
229 __call_rcu(head, func, &rcu_ctrlblk); 230 __call_rcu(head, func, &rcu_sched_ctrlblk);
230} 231}
231EXPORT_SYMBOL_GPL(call_rcu); 232EXPORT_SYMBOL_GPL(call_rcu);
232 233
@@ -244,11 +245,13 @@ void rcu_barrier(void)
244{ 245{
245 struct rcu_synchronize rcu; 246 struct rcu_synchronize rcu;
246 247
248 init_rcu_head_on_stack(&rcu.head);
247 init_completion(&rcu.completion); 249 init_completion(&rcu.completion);
248 /* Will wake me after RCU finished. */ 250 /* Will wake me after RCU finished. */
249 call_rcu(&rcu.head, wakeme_after_rcu); 251 call_rcu(&rcu.head, wakeme_after_rcu);
250 /* Wait for it. */ 252 /* Wait for it. */
251 wait_for_completion(&rcu.completion); 253 wait_for_completion(&rcu.completion);
254 destroy_rcu_head_on_stack(&rcu.head);
252} 255}
253EXPORT_SYMBOL_GPL(rcu_barrier); 256EXPORT_SYMBOL_GPL(rcu_barrier);
254 257
@@ -256,11 +259,13 @@ void rcu_barrier_bh(void)
256{ 259{
257 struct rcu_synchronize rcu; 260 struct rcu_synchronize rcu;
258 261
262 init_rcu_head_on_stack(&rcu.head);
259 init_completion(&rcu.completion); 263 init_completion(&rcu.completion);
260 /* Will wake me after RCU finished. */ 264 /* Will wake me after RCU finished. */
261 call_rcu_bh(&rcu.head, wakeme_after_rcu); 265 call_rcu_bh(&rcu.head, wakeme_after_rcu);
262 /* Wait for it. */ 266 /* Wait for it. */
263 wait_for_completion(&rcu.completion); 267 wait_for_completion(&rcu.completion);
268 destroy_rcu_head_on_stack(&rcu.head);
264} 269}
265EXPORT_SYMBOL_GPL(rcu_barrier_bh); 270EXPORT_SYMBOL_GPL(rcu_barrier_bh);
266 271
@@ -268,11 +273,13 @@ void rcu_barrier_sched(void)
268{ 273{
269 struct rcu_synchronize rcu; 274 struct rcu_synchronize rcu;
270 275
276 init_rcu_head_on_stack(&rcu.head);
271 init_completion(&rcu.completion); 277 init_completion(&rcu.completion);
272 /* Will wake me after RCU finished. */ 278 /* Will wake me after RCU finished. */
273 call_rcu_sched(&rcu.head, wakeme_after_rcu); 279 call_rcu_sched(&rcu.head, wakeme_after_rcu);
274 /* Wait for it. */ 280 /* Wait for it. */
275 wait_for_completion(&rcu.completion); 281 wait_for_completion(&rcu.completion);
282 destroy_rcu_head_on_stack(&rcu.head);
276} 283}
277EXPORT_SYMBOL_GPL(rcu_barrier_sched); 284EXPORT_SYMBOL_GPL(rcu_barrier_sched);
278 285
@@ -280,3 +287,5 @@ void __init rcu_init(void)
280{ 287{
281 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 288 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
282} 289}
290
291#include "rcutiny_plugin.h"
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
new file mode 100644
index 000000000000..d223a92bc742
--- /dev/null
+++ b/kernel/rcutiny_plugin.h
@@ -0,0 +1,39 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tiny version)
3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright IBM Corporation, 2009
21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#ifdef CONFIG_DEBUG_LOCK_ALLOC
26
27#include <linux/kernel_stat.h>
28
29/*
30 * During boot, we forgive RCU lockdep issues. After this function is
31 * invoked, we start taking RCU lockdep issues seriously.
32 */
33void rcu_scheduler_starting(void)
34{
35 WARN_ON(nr_context_switches() > 0);
36 rcu_scheduler_active = 1;
37}
38
39#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 258cdf0a91eb..6535ac8bc6a5 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -464,9 +464,11 @@ static void rcu_bh_torture_synchronize(void)
464{ 464{
465 struct rcu_bh_torture_synchronize rcu; 465 struct rcu_bh_torture_synchronize rcu;
466 466
467 init_rcu_head_on_stack(&rcu.head);
467 init_completion(&rcu.completion); 468 init_completion(&rcu.completion);
468 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); 469 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
469 wait_for_completion(&rcu.completion); 470 wait_for_completion(&rcu.completion);
471 destroy_rcu_head_on_stack(&rcu.head);
470} 472}
471 473
472static struct rcu_torture_ops rcu_bh_ops = { 474static struct rcu_torture_ops rcu_bh_ops = {
@@ -669,7 +671,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
669 .sync = synchronize_sched_expedited, 671 .sync = synchronize_sched_expedited,
670 .cb_barrier = NULL, 672 .cb_barrier = NULL,
671 .fqs = rcu_sched_force_quiescent_state, 673 .fqs = rcu_sched_force_quiescent_state,
672 .stats = rcu_expedited_torture_stats, 674 .stats = NULL,
673 .irq_capable = 1, 675 .irq_capable = 1,
674 .name = "sched_expedited" 676 .name = "sched_expedited"
675}; 677};
@@ -818,13 +820,13 @@ static void rcu_torture_timer(unsigned long unused)
818 /* Should not happen, but... */ 820 /* Should not happen, but... */
819 pipe_count = RCU_TORTURE_PIPE_LEN; 821 pipe_count = RCU_TORTURE_PIPE_LEN;
820 } 822 }
821 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 823 __this_cpu_inc(rcu_torture_count[pipe_count]);
822 completed = cur_ops->completed() - completed; 824 completed = cur_ops->completed() - completed;
823 if (completed > RCU_TORTURE_PIPE_LEN) { 825 if (completed > RCU_TORTURE_PIPE_LEN) {
824 /* Should not happen, but... */ 826 /* Should not happen, but... */
825 completed = RCU_TORTURE_PIPE_LEN; 827 completed = RCU_TORTURE_PIPE_LEN;
826 } 828 }
827 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 829 __this_cpu_inc(rcu_torture_batch[completed]);
828 preempt_enable(); 830 preempt_enable();
829 cur_ops->readunlock(idx); 831 cur_ops->readunlock(idx);
830} 832}
@@ -877,13 +879,13 @@ rcu_torture_reader(void *arg)
877 /* Should not happen, but... */ 879 /* Should not happen, but... */
878 pipe_count = RCU_TORTURE_PIPE_LEN; 880 pipe_count = RCU_TORTURE_PIPE_LEN;
879 } 881 }
880 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 882 __this_cpu_inc(rcu_torture_count[pipe_count]);
881 completed = cur_ops->completed() - completed; 883 completed = cur_ops->completed() - completed;
882 if (completed > RCU_TORTURE_PIPE_LEN) { 884 if (completed > RCU_TORTURE_PIPE_LEN) {
883 /* Should not happen, but... */ 885 /* Should not happen, but... */
884 completed = RCU_TORTURE_PIPE_LEN; 886 completed = RCU_TORTURE_PIPE_LEN;
885 } 887 }
886 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 888 __this_cpu_inc(rcu_torture_batch[completed]);
887 preempt_enable(); 889 preempt_enable();
888 cur_ops->readunlock(idx); 890 cur_ops->readunlock(idx);
889 schedule(); 891 schedule();
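The rcutorture changes above drop the per_cpu_var() wrapper because the this_cpu operations now take the per-CPU variable name directly. A standalone sketch of the resulting idiom (my_count and my_count_stage are illustrative):

#include <linux/percpu.h>
#include <linux/preempt.h>

#define MY_PIPE_LEN 10

static DEFINE_PER_CPU(long [MY_PIPE_LEN + 1], my_count);

/* Bump this CPU's counter for one pipeline stage. */
static void my_count_stage(int stage)
{
	preempt_disable();
	__this_cpu_inc(my_count[stage]);	/* no per_cpu_var() wrapper needed */
	preempt_enable();
}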
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 3ec8160fc75f..d4437345706f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,6 +46,7 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
49 50
50#include "rcutree.h" 51#include "rcutree.h"
51 52
@@ -53,8 +54,8 @@
53 54
54static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
55 56
56#define RCU_STATE_INITIALIZER(name) { \ 57#define RCU_STATE_INITIALIZER(structname) { \
57 .level = { &name.node[0] }, \ 58 .level = { &structname.node[0] }, \
58 .levelcnt = { \ 59 .levelcnt = { \
59 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 60 NUM_RCU_LVL_0, /* root of hierarchy. */ \
60 NUM_RCU_LVL_1, \ 61 NUM_RCU_LVL_1, \
@@ -65,13 +66,14 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
65 .signaled = RCU_GP_IDLE, \ 66 .signaled = RCU_GP_IDLE, \
66 .gpnum = -300, \ 67 .gpnum = -300, \
67 .completed = -300, \ 68 .completed = -300, \
68 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \ 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
69 .orphan_cbs_list = NULL, \ 70 .orphan_cbs_list = NULL, \
70 .orphan_cbs_tail = &name.orphan_cbs_list, \ 71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
71 .orphan_qlen = 0, \ 72 .orphan_qlen = 0, \
72 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \ 73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
73 .n_force_qs = 0, \ 74 .n_force_qs = 0, \
74 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
76 .name = #structname, \
75} 77}
76 78
77struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); 79struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
@@ -80,6 +82,9 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
80struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
81DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
82 84
85int rcu_scheduler_active __read_mostly;
86EXPORT_SYMBOL_GPL(rcu_scheduler_active);
87
83/* 88/*
84 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 89 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
85 * permit this function to be invoked without holding the root rcu_node 90 * permit this function to be invoked without holding the root rcu_node
@@ -97,25 +102,32 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
97 */ 102 */
98void rcu_sched_qs(int cpu) 103void rcu_sched_qs(int cpu)
99{ 104{
100 struct rcu_data *rdp; 105 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
101 106
102 rdp = &per_cpu(rcu_sched_data, cpu);
103 rdp->passed_quiesc_completed = rdp->gpnum - 1; 107 rdp->passed_quiesc_completed = rdp->gpnum - 1;
104 barrier(); 108 barrier();
105 rdp->passed_quiesc = 1; 109 rdp->passed_quiesc = 1;
106 rcu_preempt_note_context_switch(cpu);
107} 110}
108 111
109void rcu_bh_qs(int cpu) 112void rcu_bh_qs(int cpu)
110{ 113{
111 struct rcu_data *rdp; 114 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
112 115
113 rdp = &per_cpu(rcu_bh_data, cpu);
114 rdp->passed_quiesc_completed = rdp->gpnum - 1; 116 rdp->passed_quiesc_completed = rdp->gpnum - 1;
115 barrier(); 117 barrier();
116 rdp->passed_quiesc = 1; 118 rdp->passed_quiesc = 1;
117} 119}
118 120
121/*
122 * Note a context switch. This is a quiescent state for RCU-sched,
123 * and requires special handling for preemptible RCU.
124 */
125void rcu_note_context_switch(int cpu)
126{
127 rcu_sched_qs(cpu);
128 rcu_preempt_note_context_switch(cpu);
129}
130
119#ifdef CONFIG_NO_HZ 131#ifdef CONFIG_NO_HZ
120DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 132DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
121 .dynticks_nesting = 1, 133 .dynticks_nesting = 1,
@@ -438,6 +450,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
438 450
439#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 451#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
440 452
453int rcu_cpu_stall_panicking __read_mostly;
454
441static void record_gp_stall_check_time(struct rcu_state *rsp) 455static void record_gp_stall_check_time(struct rcu_state *rsp)
442{ 456{
443 rsp->gp_start = jiffies; 457 rsp->gp_start = jiffies;
@@ -470,7 +484,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
470 484
471 /* OK, time to rat on our buddy... */ 485 /* OK, time to rat on our buddy... */
472 486
473 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 487 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
488 rsp->name);
474 rcu_for_each_leaf_node(rsp, rnp) { 489 rcu_for_each_leaf_node(rsp, rnp) {
475 raw_spin_lock_irqsave(&rnp->lock, flags); 490 raw_spin_lock_irqsave(&rnp->lock, flags);
476 rcu_print_task_stall(rnp); 491 rcu_print_task_stall(rnp);
@@ -481,7 +496,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
481 if (rnp->qsmask & (1UL << cpu)) 496 if (rnp->qsmask & (1UL << cpu))
482 printk(" %d", rnp->grplo + cpu); 497 printk(" %d", rnp->grplo + cpu);
483 } 498 }
484 printk(" (detected by %d, t=%ld jiffies)\n", 499 printk("} (detected by %d, t=%ld jiffies)\n",
485 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 500 smp_processor_id(), (long)(jiffies - rsp->gp_start));
486 trigger_all_cpu_backtrace(); 501 trigger_all_cpu_backtrace();
487 502
@@ -497,8 +512,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
497 unsigned long flags; 512 unsigned long flags;
498 struct rcu_node *rnp = rcu_get_root(rsp); 513 struct rcu_node *rnp = rcu_get_root(rsp);
499 514
500 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", 515 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
501 smp_processor_id(), jiffies - rsp->gp_start); 516 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
502 trigger_all_cpu_backtrace(); 517 trigger_all_cpu_backtrace();
503 518
504 raw_spin_lock_irqsave(&rnp->lock, flags); 519 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -515,6 +530,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
515 long delta; 530 long delta;
516 struct rcu_node *rnp; 531 struct rcu_node *rnp;
517 532
533 if (rcu_cpu_stall_panicking)
534 return;
518 delta = jiffies - rsp->jiffies_stall; 535 delta = jiffies - rsp->jiffies_stall;
519 rnp = rdp->mynode; 536 rnp = rdp->mynode;
520 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { 537 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
@@ -529,6 +546,21 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
529 } 546 }
530} 547}
531 548
549static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
550{
551 rcu_cpu_stall_panicking = 1;
552 return NOTIFY_DONE;
553}
554
555static struct notifier_block rcu_panic_block = {
556 .notifier_call = rcu_panic,
557};
558
559static void __init check_cpu_stall_init(void)
560{
561 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
562}
563
532#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 564#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
533 565
534static void record_gp_stall_check_time(struct rcu_state *rsp) 566static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -539,6 +571,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
539{ 571{
540} 572}
541 573
574static void __init check_cpu_stall_init(void)
575{
576}
577
542#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 578#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
543 579
544/* 580/*
@@ -1125,8 +1161,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1125 */ 1161 */
1126void rcu_check_callbacks(int cpu, int user) 1162void rcu_check_callbacks(int cpu, int user)
1127{ 1163{
1128 if (!rcu_pending(cpu))
1129 return; /* if nothing for RCU to do. */
1130 if (user || 1164 if (user ||
1131 (idle_cpu(cpu) && rcu_scheduler_active && 1165 (idle_cpu(cpu) && rcu_scheduler_active &&
1132 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1166 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1158,7 +1192,8 @@ void rcu_check_callbacks(int cpu, int user)
1158 rcu_bh_qs(cpu); 1192 rcu_bh_qs(cpu);
1159 } 1193 }
1160 rcu_preempt_check_callbacks(cpu); 1194 rcu_preempt_check_callbacks(cpu);
1161 raise_softirq(RCU_SOFTIRQ); 1195 if (rcu_pending(cpu))
1196 raise_softirq(RCU_SOFTIRQ);
1162} 1197}
1163 1198
1164#ifdef CONFIG_SMP 1199#ifdef CONFIG_SMP
@@ -1236,11 +1271,11 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1236 break; /* grace period idle or initializing, ignore. */ 1271 break; /* grace period idle or initializing, ignore. */
1237 1272
1238 case RCU_SAVE_DYNTICK: 1273 case RCU_SAVE_DYNTICK:
1239
1240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1241 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1274 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1242 break; /* So gcc recognizes the dead code. */ 1275 break; /* So gcc recognizes the dead code. */
1243 1276
1277 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1278
1244 /* Record dyntick-idle state. */ 1279 /* Record dyntick-idle state. */
1245 force_qs_rnp(rsp, dyntick_save_progress_counter); 1280 force_qs_rnp(rsp, dyntick_save_progress_counter);
1246 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1281 raw_spin_lock(&rnp->lock); /* irqs already disabled */
@@ -1449,11 +1484,13 @@ void synchronize_sched(void)
1449 if (rcu_blocking_is_gp()) 1484 if (rcu_blocking_is_gp())
1450 return; 1485 return;
1451 1486
1487 init_rcu_head_on_stack(&rcu.head);
1452 init_completion(&rcu.completion); 1488 init_completion(&rcu.completion);
1453 /* Will wake me after RCU finished. */ 1489 /* Will wake me after RCU finished. */
1454 call_rcu_sched(&rcu.head, wakeme_after_rcu); 1490 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1455 /* Wait for it. */ 1491 /* Wait for it. */
1456 wait_for_completion(&rcu.completion); 1492 wait_for_completion(&rcu.completion);
1493 destroy_rcu_head_on_stack(&rcu.head);
1457} 1494}
1458EXPORT_SYMBOL_GPL(synchronize_sched); 1495EXPORT_SYMBOL_GPL(synchronize_sched);
1459 1496
@@ -1473,11 +1510,13 @@ void synchronize_rcu_bh(void)
1473 if (rcu_blocking_is_gp()) 1510 if (rcu_blocking_is_gp())
1474 return; 1511 return;
1475 1512
1513 init_rcu_head_on_stack(&rcu.head);
1476 init_completion(&rcu.completion); 1514 init_completion(&rcu.completion);
1477 /* Will wake me after RCU finished. */ 1515 /* Will wake me after RCU finished. */
1478 call_rcu_bh(&rcu.head, wakeme_after_rcu); 1516 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1479 /* Wait for it. */ 1517 /* Wait for it. */
1480 wait_for_completion(&rcu.completion); 1518 wait_for_completion(&rcu.completion);
1519 destroy_rcu_head_on_stack(&rcu.head);
1481} 1520}
1482EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1521EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1483 1522
@@ -1498,8 +1537,20 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1498 check_cpu_stall(rsp, rdp); 1537 check_cpu_stall(rsp, rdp);
1499 1538
1500 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1539 /* Is the RCU core waiting for a quiescent state from this CPU? */
1501 if (rdp->qs_pending) { 1540 if (rdp->qs_pending && !rdp->passed_quiesc) {
1541
1542 /*
1542 * If force_quiescent_state() is coming soon and this CPU
1544 * needs a quiescent state, and this is either RCU-sched
1545 * or RCU-bh, force a local reschedule.
1546 */
1502 rdp->n_rp_qs_pending++; 1547 rdp->n_rp_qs_pending++;
1548 if (!rdp->preemptable &&
1549 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1550 jiffies))
1551 set_need_resched();
1552 } else if (rdp->qs_pending && rdp->passed_quiesc) {
1553 rdp->n_rp_report_qs++;
1503 return 1; 1554 return 1;
1504 } 1555 }
1505 1556
@@ -1767,6 +1818,21 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1767} 1818}
1768 1819
1769/* 1820/*
1821 * This function is invoked towards the end of the scheduler's initialization
1822 * process. Before this is called, the idle task might contain
1823 * RCU read-side critical sections (during which time, this idle
1824 * task is booting the system). After this function is called, the
1825 * idle tasks are prohibited from containing RCU read-side critical
1826 * sections. This function also enables RCU lockdep checking.
1827 */
1828void rcu_scheduler_starting(void)
1829{
1830 WARN_ON(num_online_cpus() != 1);
1831 WARN_ON(nr_context_switches() > 0);
1832 rcu_scheduler_active = 1;
1833}
1834
1835/*
1770 * Compute the per-level fanout, either using the exact fanout specified 1836 * Compute the per-level fanout, either using the exact fanout specified
1771 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. 1837 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
1772 */ 1838 */
@@ -1849,6 +1915,14 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1849 INIT_LIST_HEAD(&rnp->blocked_tasks[3]); 1915 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1850 } 1916 }
1851 } 1917 }
1918
1919 rnp = rsp->level[NUM_RCU_LVLS - 1];
1920 for_each_possible_cpu(i) {
1921 while (i > rnp->grphi)
1922 rnp++;
1923 rsp->rda[i]->mynode = rnp;
1924 rcu_boot_init_percpu_data(i, rsp);
1925 }
1852} 1926}
1853 1927
1854/* 1928/*
@@ -1859,19 +1933,11 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1859#define RCU_INIT_FLAVOR(rsp, rcu_data) \ 1933#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1860do { \ 1934do { \
1861 int i; \ 1935 int i; \
1862 int j; \
1863 struct rcu_node *rnp; \
1864 \ 1936 \
1865 rcu_init_one(rsp); \
1866 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1867 j = 0; \
1868 for_each_possible_cpu(i) { \ 1937 for_each_possible_cpu(i) { \
1869 if (i > rnp[j].grphi) \
1870 j++; \
1871 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1872 (rsp)->rda[i] = &per_cpu(rcu_data, i); \ 1938 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1873 rcu_boot_init_percpu_data(i, rsp); \
1874 } \ 1939 } \
1940 rcu_init_one(rsp); \
1875} while (0) 1941} while (0)
1876 1942
1877void __init rcu_init(void) 1943void __init rcu_init(void)
@@ -1879,12 +1945,6 @@ void __init rcu_init(void)
1879 int cpu; 1945 int cpu;
1880 1946
1881 rcu_bootup_announce(); 1947 rcu_bootup_announce();
1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1883 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1884#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1885#if NUM_RCU_LVL_4 != 0
1886 printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
1887#endif /* #if NUM_RCU_LVL_4 != 0 */
1888 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1948 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1889 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1949 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1890 __rcu_init_preempt(); 1950 __rcu_init_preempt();
@@ -1898,6 +1958,7 @@ void __init rcu_init(void)
1898 cpu_notifier(rcu_cpu_notify, 0); 1958 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(cpu) 1959 for_each_online_cpu(cpu)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 1960 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1961 check_cpu_stall_init();
1901} 1962}
1902 1963
1903#include "rcutree_plugin.h" 1964#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1439eb504c22..14c040b18ed0 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -223,6 +223,7 @@ struct rcu_data {
223 /* 5) __rcu_pending() statistics. */ 223 /* 5) __rcu_pending() statistics. */
224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
225 unsigned long n_rp_qs_pending; 225 unsigned long n_rp_qs_pending;
226 unsigned long n_rp_report_qs;
226 unsigned long n_rp_cb_ready; 227 unsigned long n_rp_cb_ready;
227 unsigned long n_rp_cpu_needs_gp; 228 unsigned long n_rp_cpu_needs_gp;
228 unsigned long n_rp_gp_completed; 229 unsigned long n_rp_gp_completed;
@@ -246,12 +247,21 @@ struct rcu_data {
246 247
247#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
248#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 249#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
249#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */
250#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */
251#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
252 /* to take at least one */
253 /* scheduling clock irq */
254 /* before ratting on them. */
250
251#ifdef CONFIG_PROVE_RCU
252#define RCU_STALL_DELAY_DELTA (5 * HZ)
253#else
254#define RCU_STALL_DELAY_DELTA 0
255#endif
256
257#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA)
258 /* for rsp->jiffies_stall */
259#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
260 /* for rsp->jiffies_stall */
261#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
262 /* to take at least one */
263 /* scheduling clock irq */
264 /* before ratting on them. */
255 265
256#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 266#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
257 267
@@ -317,6 +327,7 @@ struct rcu_state {
317 unsigned long jiffies_stall; /* Time at which to check */ 327 unsigned long jiffies_stall; /* Time at which to check */
318 /* for CPU stalls. */ 328 /* for CPU stalls. */
319#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 329#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
330 char *name; /* Name of structure. */
320}; 331};
321 332
322/* Return values for rcu_preempt_offline_tasks(). */ 333/* Return values for rcu_preempt_offline_tasks(). */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 464ad2cdee00..0e4f420245d9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -26,6 +26,45 @@
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28 28
29/*
30 * Check the RCU kernel configuration parameters and print informative
31 * messages about anything out of the ordinary. If you like #ifdef, you
32 * will love this function.
33 */
34static void __init rcu_bootup_announce_oddness(void)
35{
36#ifdef CONFIG_RCU_TRACE
37 printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n");
38#endif
39#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
40 printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
41 CONFIG_RCU_FANOUT);
42#endif
43#ifdef CONFIG_RCU_FANOUT_EXACT
44 printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n");
45#endif
46#ifdef CONFIG_RCU_FAST_NO_HZ
47 printk(KERN_INFO
48 "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
49#endif
50#ifdef CONFIG_PROVE_RCU
51 printk(KERN_INFO "\tRCU lockdep checking is enabled.\n");
52#endif
53#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
54 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
55#endif
56#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
57 printk(KERN_INFO
58 "\tRCU-based detection of stalled CPUs is disabled.\n");
59#endif
60#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
62#endif
63#if NUM_RCU_LVL_4 != 0
64 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
65#endif
66}
67
29#ifdef CONFIG_TREE_PREEMPT_RCU 68#ifdef CONFIG_TREE_PREEMPT_RCU
30 69
31struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 70struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
@@ -38,8 +77,8 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
38 */ 77 */
39static void __init rcu_bootup_announce(void) 78static void __init rcu_bootup_announce(void)
40{ 79{
41 printk(KERN_INFO 80 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n");
42 "Experimental preemptable hierarchical RCU implementation.\n"); 81 rcu_bootup_announce_oddness();
43} 82}
44 83
45/* 84/*
@@ -75,13 +114,19 @@ EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
75 * that this just means that the task currently running on the CPU is 114 * that this just means that the task currently running on the CPU is
76 * not in a quiescent state. There might be any number of tasks blocked 115 * not in a quiescent state. There might be any number of tasks blocked
77 * while in an RCU read-side critical section. 116 * while in an RCU read-side critical section.
117 *
118 * Unlike the other rcu_*_qs() functions, callers to this function
119 * must disable irqs in order to protect the assignment to
120 * ->rcu_read_unlock_special.
78 */ 121 */
79static void rcu_preempt_qs(int cpu) 122static void rcu_preempt_qs(int cpu)
80{ 123{
81 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 124 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
125
82 rdp->passed_quiesc_completed = rdp->gpnum - 1; 126 rdp->passed_quiesc_completed = rdp->gpnum - 1;
83 barrier(); 127 barrier();
84 rdp->passed_quiesc = 1; 128 rdp->passed_quiesc = 1;
129 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
85} 130}
86 131
87/* 132/*
@@ -144,9 +189,8 @@ static void rcu_preempt_note_context_switch(int cpu)
144 * grace period, then the fact that the task has been enqueued 189 * grace period, then the fact that the task has been enqueued
145 * means that we continue to block the current grace period. 190 * means that we continue to block the current grace period.
146 */ 191 */
147 rcu_preempt_qs(cpu);
148 local_irq_save(flags); 192 local_irq_save(flags);
149 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 193 rcu_preempt_qs(cpu);
150 local_irq_restore(flags); 194 local_irq_restore(flags);
151} 195}
152 196
@@ -236,7 +280,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
236 */ 280 */
237 special = t->rcu_read_unlock_special; 281 special = t->rcu_read_unlock_special;
238 if (special & RCU_READ_UNLOCK_NEED_QS) { 282 if (special & RCU_READ_UNLOCK_NEED_QS) {
239 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
240 rcu_preempt_qs(smp_processor_id()); 283 rcu_preempt_qs(smp_processor_id());
241 } 284 }
242 285
@@ -473,7 +516,6 @@ static void rcu_preempt_check_callbacks(int cpu)
473 struct task_struct *t = current; 516 struct task_struct *t = current;
474 517
475 if (t->rcu_read_lock_nesting == 0) { 518 if (t->rcu_read_lock_nesting == 0) {
476 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
477 rcu_preempt_qs(cpu); 519 rcu_preempt_qs(cpu);
478 return; 520 return;
479 } 521 }
@@ -515,11 +557,13 @@ void synchronize_rcu(void)
515 if (!rcu_scheduler_active) 557 if (!rcu_scheduler_active)
516 return; 558 return;
517 559
560 init_rcu_head_on_stack(&rcu.head);
518 init_completion(&rcu.completion); 561 init_completion(&rcu.completion);
519 /* Will wake me after RCU finished. */ 562 /* Will wake me after RCU finished. */
520 call_rcu(&rcu.head, wakeme_after_rcu); 563 call_rcu(&rcu.head, wakeme_after_rcu);
521 /* Wait for it. */ 564 /* Wait for it. */
522 wait_for_completion(&rcu.completion); 565 wait_for_completion(&rcu.completion);
566 destroy_rcu_head_on_stack(&rcu.head);
523} 567}
524EXPORT_SYMBOL_GPL(synchronize_rcu); 568EXPORT_SYMBOL_GPL(synchronize_rcu);
525 569
@@ -754,6 +798,7 @@ void exit_rcu(void)
754static void __init rcu_bootup_announce(void) 798static void __init rcu_bootup_announce(void)
755{ 799{
756 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 800 printk(KERN_INFO "Hierarchical RCU implementation.\n");
801 rcu_bootup_announce_oddness();
757} 802}
758 803
759/* 804/*
@@ -1008,15 +1053,27 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1008int rcu_needs_cpu(int cpu) 1053int rcu_needs_cpu(int cpu)
1009{ 1054{
1010 int c = 0; 1055 int c = 0;
1056 int snap;
1057 int snap_nmi;
1011 int thatcpu; 1058 int thatcpu;
1012 1059
1060 /* Check for being in the holdoff period. */
1061 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
1062 return rcu_needs_cpu_quick_check(cpu);
1063
1013 /* Don't bother unless we are the last non-dyntick-idle CPU. */ 1064 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1014 for_each_cpu_not(thatcpu, nohz_cpu_mask) 1065 for_each_online_cpu(thatcpu) {
1015 if (thatcpu != cpu) { 1066 if (thatcpu == cpu)
1067 continue;
1068 snap = per_cpu(rcu_dynticks, thatcpu).dynticks;
1069 snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi;
1070 smp_mb(); /* Order sampling of snap with end of grace period. */
1071 if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
1016 per_cpu(rcu_dyntick_drain, cpu) = 0; 1072 per_cpu(rcu_dyntick_drain, cpu) = 0;
1017 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 1073 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1018 return rcu_needs_cpu_quick_check(cpu); 1074 return rcu_needs_cpu_quick_check(cpu);
1019 } 1075 }
1076 }
1020 1077
1021 /* Check and update the rcu_dyntick_drain sequencing. */ 1078 /* Check and update the rcu_dyntick_drain sequencing. */
1022 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 1079 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
@@ -1041,10 +1098,8 @@ int rcu_needs_cpu(int cpu)
1041 } 1098 }
1042 1099
1043 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 1100 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1044 if (c) { 1101 if (c)
1045 raise_softirq(RCU_SOFTIRQ); 1102 raise_softirq(RCU_SOFTIRQ);
1046 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1047 }
1048 return c; 1103 return c;
1049} 1104}
1050 1105
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d45db2e35d27..36c95b45738e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -241,11 +241,13 @@ static const struct file_operations rcugp_fops = {
241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
242{ 242{
243 seq_printf(m, "%3d%cnp=%ld " 243 seq_printf(m, "%3d%cnp=%ld "
244 "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", 244 "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
245 "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
245 rdp->cpu, 246 rdp->cpu,
246 cpu_is_offline(rdp->cpu) ? '!' : ' ', 247 cpu_is_offline(rdp->cpu) ? '!' : ' ',
247 rdp->n_rcu_pending, 248 rdp->n_rcu_pending,
248 rdp->n_rp_qs_pending, 249 rdp->n_rp_qs_pending,
250 rdp->n_rp_report_qs,
249 rdp->n_rp_cb_ready, 251 rdp->n_rp_cb_ready,
250 rdp->n_rp_cpu_needs_gp, 252 rdp->n_rp_cpu_needs_gp,
251 rdp->n_rp_gp_completed, 253 rdp->n_rp_gp_completed,
diff --git a/kernel/relay.c b/kernel/relay.c
index c705a41b4ba3..3d97f2821611 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
1215/* 1215/*
1216 * subbuf_splice_actor - splice up to one subbuf's worth of data 1216 * subbuf_splice_actor - splice up to one subbuf's worth of data
1217 */ 1217 */
1218static int subbuf_splice_actor(struct file *in, 1218static ssize_t subbuf_splice_actor(struct file *in,
1219 loff_t *ppos, 1219 loff_t *ppos,
1220 struct pipe_inode_info *pipe, 1220 struct pipe_inode_info *pipe,
1221 size_t len, 1221 size_t len,
1222 unsigned int flags, 1222 unsigned int flags,
1223 int *nonpad_ret) 1223 int *nonpad_ret)
1224{ 1224{
1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; 1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
1226 struct rchan_buf *rbuf = in->private_data; 1226 struct rchan_buf *rbuf = in->private_data;
1227 unsigned int subbuf_size = rbuf->chan->subbuf_size; 1227 unsigned int subbuf_size = rbuf->chan->subbuf_size;
1228 uint64_t pos = (uint64_t) *ppos; 1228 uint64_t pos = (uint64_t) *ppos;
@@ -1241,6 +1241,7 @@ static int subbuf_splice_actor(struct file *in,
1241 .ops = &relay_pipe_buf_ops, 1241 .ops = &relay_pipe_buf_ops,
1242 .spd_release = relay_page_release, 1242 .spd_release = relay_page_release,
1243 }; 1243 };
1244 ssize_t ret;
1244 1245
1245 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1246 return 0; 1247 return 0;
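The int-to-ssize_t change in subbuf_splice_actor() is presumably about error propagation: 'ret' previously shared an unsigned int declaration, so on 64-bit a negative error code (e.g. from splice_to_pipe()) would surface as a huge positive byte count. A demonstration sketch of that failure mode (assumption about the motivation, not quoted from the patch):

#include <linux/errno.h>
#include <linux/types.h>

/* Demonstration only: why 'ret' must be a signed, pointer-width type. */
static ssize_t my_actor_bad(void)
{
	unsigned int ret = -ENOMEM;	/* wraps to 4294967284 */

	return ret;	/* 64-bit caller sees a bogus ~4 GB "success" count */
}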
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bcdabf37c40b..c7eaa37a768b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -10,7 +10,6 @@
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/parser.h> 11#include <linux/parser.h>
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/slab.h>
14#include <linux/res_counter.h> 13#include <linux/res_counter.h>
15#include <linux/uaccess.h> 14#include <linux/uaccess.h>
16#include <linux/mm.h> 15#include <linux/mm.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index 4e9d87fd7bc5..9c358e263534 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -219,19 +219,34 @@ void release_child_resources(struct resource *r)
219} 219}
220 220
221/** 221/**
222 * request_resource - request and reserve an I/O or memory resource 222 * request_resource_conflict - request and reserve an I/O or memory resource
223 * @root: root resource descriptor 223 * @root: root resource descriptor
224 * @new: resource descriptor desired by caller 224 * @new: resource descriptor desired by caller
225 * 225 *
226 * Returns 0 for success, negative error code on error. 226 * Returns 0 for success, conflict resource on error.
227 */ 227 */
228int request_resource(struct resource *root, struct resource *new) 228struct resource *request_resource_conflict(struct resource *root, struct resource *new)
229{ 229{
230 struct resource *conflict; 230 struct resource *conflict;
231 231
232 write_lock(&resource_lock); 232 write_lock(&resource_lock);
233 conflict = __request_resource(root, new); 233 conflict = __request_resource(root, new);
234 write_unlock(&resource_lock); 234 write_unlock(&resource_lock);
235 return conflict;
236}
237
238/**
239 * request_resource - request and reserve an I/O or memory resource
240 * @root: root resource descriptor
241 * @new: resource descriptor desired by caller
242 *
243 * Returns 0 for success, negative error code on error.
244 */
245int request_resource(struct resource *root, struct resource *new)
246{
247 struct resource *conflict;
248
249 conflict = request_resource_conflict(root, new);
235 return conflict ? -EBUSY : 0; 250 return conflict ? -EBUSY : 0;
236} 251}
237 252
@@ -304,7 +319,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
304 void *arg, int (*func)(unsigned long, unsigned long, void *)) 319 void *arg, int (*func)(unsigned long, unsigned long, void *))
305{ 320{
306 struct resource res; 321 struct resource res;
307 unsigned long pfn, len; 322 unsigned long pfn, end_pfn;
308 u64 orig_end; 323 u64 orig_end;
309 int ret = -1; 324 int ret = -1;
310 325
@@ -314,9 +329,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
314 orig_end = res.end; 329 orig_end = res.end;
315 while ((res.start < res.end) && 330 while ((res.start < res.end) &&
316 (find_next_system_ram(&res, "System RAM") >= 0)) { 331 (find_next_system_ram(&res, "System RAM") >= 0)) {
317 pfn = (unsigned long)(res.start >> PAGE_SHIFT); 332 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
318 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); 333 end_pfn = (res.end + 1) >> PAGE_SHIFT;
319 ret = (*func)(pfn, len, arg); 334 if (end_pfn > pfn)
335 ret = (*func)(pfn, end_pfn - pfn, arg);
320 if (ret) 336 if (ret)
321 break; 337 break;
322 res.start = res.end + 1; 338 res.start = res.end + 1;
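The new rounding in walk_system_ram_range() hands the callback only whole pages contained in the resource. A worked example, assuming PAGE_SIZE = 4096 (PAGE_SHIFT = 12) and a System RAM chunk spanning 0x1800..0x47ff:

/*
 * pfn     = (0x1800 + 0xfff) >> 12 = 2   (partial head page rounded up)
 * end_pfn = (0x47ff + 1)     >> 12 = 4   (partial tail page rounded down)
 *
 * func() is called for pfns [2, 4), i.e. the two fully-contained pages;
 * the old code started at the partial head page (pfn 1) instead.
 */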
@@ -473,25 +489,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
473} 489}
474 490
475/** 491/**
476 * insert_resource - Inserts a resource in the resource tree 492 * insert_resource_conflict - Inserts resource in the resource tree
477 * @parent: parent of the new resource 493 * @parent: parent of the new resource
478 * @new: new resource to insert 494 * @new: new resource to insert
479 * 495 *
480 * Returns 0 on success, -EBUSY if the resource can't be inserted. 496 * Returns 0 on success, conflict resource if the resource can't be inserted.
481 * 497 *
482 * This function is equivalent to request_resource when no conflict 498 * This function is equivalent to request_resource_conflict when no conflict
483 * happens. If a conflict happens, and the conflicting resources 499 * happens. If a conflict happens, and the conflicting resources
484 * entirely fit within the range of the new resource, then the new 500 * entirely fit within the range of the new resource, then the new
485 * resource is inserted and the conflicting resources become children of 501 * resource is inserted and the conflicting resources become children of
486 * the new resource. 502 * the new resource.
487 */ 503 */
488int insert_resource(struct resource *parent, struct resource *new) 504struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
489{ 505{
490 struct resource *conflict; 506 struct resource *conflict;
491 507
492 write_lock(&resource_lock); 508 write_lock(&resource_lock);
493 conflict = __insert_resource(parent, new); 509 conflict = __insert_resource(parent, new);
494 write_unlock(&resource_lock); 510 write_unlock(&resource_lock);
511 return conflict;
512}
513
514/**
515 * insert_resource - Inserts a resource in the resource tree
516 * @parent: parent of the new resource
517 * @new: new resource to insert
518 *
519 * Returns 0 on success, -EBUSY if the resource can't be inserted.
520 */
521int insert_resource(struct resource *parent, struct resource *new)
522{
523 struct resource *conflict;
524
525 conflict = insert_resource_conflict(parent, new);
495 return conflict ? -EBUSY : 0; 526 return conflict ? -EBUSY : 0;
496} 527}
497 528
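The new *_conflict variants above let a caller report which resource actually clashed instead of returning a bare -EBUSY. A hypothetical caller sketch (my_claim_region is illustrative):

#include <linux/ioport.h>
#include <linux/kernel.h>

/* Reserve a region and name the conflicting owner on failure. */
static int my_claim_region(struct resource *res)
{
	struct resource *conflict;

	conflict = request_resource_conflict(&iomem_resource, res);
	if (conflict) {
		printk(KERN_WARNING "my_claim_region: %pR conflicts with %s %pR\n",
		       res, conflict->name ? conflict->name : "<unnamed>",
		       conflict);
		return -EBUSY;
	}
	return 0;
}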
diff --git a/kernel/sched.c b/kernel/sched.c
index 6a212c97f523..1d93cd0ae4d3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,9 +55,9 @@
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/cpuset.h> 56#include <linux/cpuset.h>
57#include <linux/percpu.h> 57#include <linux/percpu.h>
58#include <linux/kthread.h>
59#include <linux/proc_fs.h> 58#include <linux/proc_fs.h>
60#include <linux/seq_file.h> 59#include <linux/seq_file.h>
60#include <linux/stop_machine.h>
61#include <linux/sysctl.h> 61#include <linux/sysctl.h>
62#include <linux/syscalls.h> 62#include <linux/syscalls.h>
63#include <linux/times.h> 63#include <linux/times.h>
@@ -71,6 +71,7 @@
71#include <linux/debugfs.h> 71#include <linux/debugfs.h>
72#include <linux/ctype.h> 72#include <linux/ctype.h>
73#include <linux/ftrace.h> 73#include <linux/ftrace.h>
74#include <linux/slab.h>
74 75
75#include <asm/tlb.h> 76#include <asm/tlb.h>
76#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
@@ -322,6 +323,15 @@ static inline struct task_group *task_group(struct task_struct *p)
322/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 323/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
323static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 324static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
324{ 325{
326 /*
327 * Strictly speaking this rcu_read_lock() is not needed since the
328 * task_group is tied to the cgroup, which in turn can never go away
329 * as long as there are tasks attached to it.
330 *
331 * However since task_group() uses task_subsys_state() which is an
332 * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
333 */
334 rcu_read_lock();
325#ifdef CONFIG_FAIR_GROUP_SCHED 335#ifdef CONFIG_FAIR_GROUP_SCHED
326 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 336 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
327 p->se.parent = task_group(p)->se[cpu]; 337 p->se.parent = task_group(p)->se[cpu];
@@ -331,6 +341,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
331 p->rt.rt_rq = task_group(p)->rt_rq[cpu]; 341 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
332 p->rt.parent = task_group(p)->rt_se[cpu]; 342 p->rt.parent = task_group(p)->rt_se[cpu];
333#endif 343#endif
344 rcu_read_unlock();
334} 345}
335 346
336#else 347#else
@@ -492,8 +503,11 @@ struct rq {
492 #define CPU_LOAD_IDX_MAX 5 503 #define CPU_LOAD_IDX_MAX 5
493 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 504 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
494#ifdef CONFIG_NO_HZ 505#ifdef CONFIG_NO_HZ
506 u64 nohz_stamp;
495 unsigned char in_nohz_recently; 507 unsigned char in_nohz_recently;
496#endif 508#endif
509 unsigned int skip_clock_update;
510
497 /* capture load from *all* tasks on this cpu: */ 511 /* capture load from *all* tasks on this cpu: */
498 struct load_weight load; 512 struct load_weight load;
499 unsigned long nr_load_updates; 513 unsigned long nr_load_updates;
@@ -535,15 +549,13 @@ struct rq {
535 int post_schedule; 549 int post_schedule;
536 int active_balance; 550 int active_balance;
537 int push_cpu; 551 int push_cpu;
552 struct cpu_stop_work active_balance_work;
538 /* cpu of this runqueue: */ 553 /* cpu of this runqueue: */
539 int cpu; 554 int cpu;
540 int online; 555 int online;
541 556
542 unsigned long avg_load_per_task; 557 unsigned long avg_load_per_task;
543 558
544 struct task_struct *migration_thread;
545 struct list_head migration_queue;
546
547 u64 rt_avg; 559 u64 rt_avg;
548 u64 age_stamp; 560 u64 age_stamp;
549 u64 idle_stamp; 561 u64 idle_stamp;
@@ -591,6 +603,13 @@ static inline
591void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 603void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
592{ 604{
593 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 605 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
606
607 /*
608 * A queue event has occurred, and we're going to schedule. In
609 * this case, we can save a useless back to back clock update.
610 */
611 if (test_tsk_need_resched(p))
612 rq->skip_clock_update = 1;
594} 613}
595 614
596static inline int cpu_of(struct rq *rq) 615static inline int cpu_of(struct rq *rq)
@@ -625,7 +644,8 @@ static inline int cpu_of(struct rq *rq)
625 644
626inline void update_rq_clock(struct rq *rq) 645inline void update_rq_clock(struct rq *rq)
627{ 646{
628 rq->clock = sched_clock_cpu(cpu_of(rq)); 647 if (!rq->skip_clock_update)
648 rq->clock = sched_clock_cpu(cpu_of(rq));
629} 649}
630 650
631/* 651/*
@@ -903,16 +923,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
903#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 923#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
904 924
905/* 925/*
906 * Check whether the task is waking, we use this to synchronize against 926 * Check whether the task is waking, we use this to synchronize ->cpus_allowed
907 * ttwu() so that task_cpu() reports a stable number. 927 * against ttwu().
908 *
909 * We need to make an exception for PF_STARTING tasks because the fork
910 * path might require task_rq_lock() to work, eg. it can call
911 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
912 */ 928 */
913static inline int task_is_waking(struct task_struct *p) 929static inline int task_is_waking(struct task_struct *p)
914{ 930{
915 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); 931 return unlikely(p->state == TASK_WAKING);
916} 932}
917 933
918/* 934/*
@@ -925,11 +941,9 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
925 struct rq *rq; 941 struct rq *rq;
926 942
927 for (;;) { 943 for (;;) {
928 while (task_is_waking(p))
929 cpu_relax();
930 rq = task_rq(p); 944 rq = task_rq(p);
931 raw_spin_lock(&rq->lock); 945 raw_spin_lock(&rq->lock);
932 if (likely(rq == task_rq(p) && !task_is_waking(p))) 946 if (likely(rq == task_rq(p)))
933 return rq; 947 return rq;
934 raw_spin_unlock(&rq->lock); 948 raw_spin_unlock(&rq->lock);
935 } 949 }
@@ -946,12 +960,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
946 struct rq *rq; 960 struct rq *rq;
947 961
948 for (;;) { 962 for (;;) {
949 while (task_is_waking(p))
950 cpu_relax();
951 local_irq_save(*flags); 963 local_irq_save(*flags);
952 rq = task_rq(p); 964 rq = task_rq(p);
953 raw_spin_lock(&rq->lock); 965 raw_spin_lock(&rq->lock);
954 if (likely(rq == task_rq(p) && !task_is_waking(p))) 966 if (likely(rq == task_rq(p)))
955 return rq; 967 return rq;
956 raw_spin_unlock_irqrestore(&rq->lock, *flags); 968 raw_spin_unlock_irqrestore(&rq->lock, *flags);
957 } 969 }
@@ -1228,6 +1240,17 @@ void wake_up_idle_cpu(int cpu)
1228 if (!tsk_is_polling(rq->idle)) 1240 if (!tsk_is_polling(rq->idle))
1229 smp_send_reschedule(cpu); 1241 smp_send_reschedule(cpu);
1230} 1242}
1243
1244int nohz_ratelimit(int cpu)
1245{
1246 struct rq *rq = cpu_rq(cpu);
1247 u64 diff = rq->clock - rq->nohz_stamp;
1248
1249 rq->nohz_stamp = rq->clock;
1250
1251 return diff < (NSEC_PER_SEC / HZ) >> 1;
1252}
1253
1231#endif /* CONFIG_NO_HZ */ 1254#endif /* CONFIG_NO_HZ */
1232 1255
1233static u64 sched_avg_period(void) 1256static u64 sched_avg_period(void)
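nohz_ratelimit() above returns nonzero when less than half a tick period, (NSEC_PER_SEC / HZ) >> 1 nanoseconds, has elapsed since the previous call for that CPU. Presumably the tick-sched side (changed elsewhere in this series) uses it to avoid stopping the tick on a CPU that is toggling in and out of idle rapidly; a hedged sketch of such a caller:

/* Sketch only: decline to enter nohz if this CPU re-armed very recently. */
static int my_can_stop_tick(int cpu)
{
	if (nohz_ratelimit(cpu))
		return 0;	/* toggled within the last half tick: keep ticking */
	return 1;
}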
@@ -1521,7 +1544,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1521 1544
1522#ifdef CONFIG_FAIR_GROUP_SCHED 1545#ifdef CONFIG_FAIR_GROUP_SCHED
1523 1546
1524static __read_mostly unsigned long *update_shares_data; 1547static __read_mostly unsigned long __percpu *update_shares_data;
1525 1548
1526static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1549static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1527 1550
@@ -1770,8 +1793,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1770 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 1793 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1771 } 1794 }
1772 } 1795 }
1773 update_rq_clock(rq1);
1774 update_rq_clock(rq2);
1775} 1796}
1776 1797
1777/* 1798/*
@@ -1802,7 +1823,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1802} 1823}
1803#endif 1824#endif
1804 1825
1805static void calc_load_account_active(struct rq *this_rq); 1826static void calc_load_account_idle(struct rq *this_rq);
1806static void update_sysctl(void); 1827static void update_sysctl(void);
1807static int get_update_sysctl_factor(void); 1828static int get_update_sysctl_factor(void);
1808 1829
@@ -1859,62 +1880,43 @@ static void set_load_weight(struct task_struct *p)
1859 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1880 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1860} 1881}
1861 1882
1862static void update_avg(u64 *avg, u64 sample) 1883static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1863{
1864 s64 diff = sample - *avg;
1865 *avg += diff >> 3;
1866}
1867
1868static void
1869enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1870{ 1884{
1871 if (wakeup) 1885 update_rq_clock(rq);
1872 p->se.start_runtime = p->se.sum_exec_runtime;
1873
1874 sched_info_queued(p); 1886 sched_info_queued(p);
1875 p->sched_class->enqueue_task(rq, p, wakeup, head); 1887 p->sched_class->enqueue_task(rq, p, flags);
1876 p->se.on_rq = 1; 1888 p->se.on_rq = 1;
1877} 1889}
1878 1890
1879static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1891static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1880{ 1892{
1881 if (sleep) { 1893 update_rq_clock(rq);
1882 if (p->se.last_wakeup) {
1883 update_avg(&p->se.avg_overlap,
1884 p->se.sum_exec_runtime - p->se.last_wakeup);
1885 p->se.last_wakeup = 0;
1886 } else {
1887 update_avg(&p->se.avg_wakeup,
1888 sysctl_sched_wakeup_granularity);
1889 }
1890 }
1891
1892 sched_info_dequeued(p); 1894 sched_info_dequeued(p);
1893 p->sched_class->dequeue_task(rq, p, sleep); 1895 p->sched_class->dequeue_task(rq, p, flags);
1894 p->se.on_rq = 0; 1896 p->se.on_rq = 0;
1895} 1897}
1896 1898
1897/* 1899/*
1898 * activate_task - move a task to the runqueue. 1900 * activate_task - move a task to the runqueue.
1899 */ 1901 */
1900static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 1902static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1901{ 1903{
1902 if (task_contributes_to_load(p)) 1904 if (task_contributes_to_load(p))
1903 rq->nr_uninterruptible--; 1905 rq->nr_uninterruptible--;
1904 1906
1905 enqueue_task(rq, p, wakeup, false); 1907 enqueue_task(rq, p, flags);
1906 inc_nr_running(rq); 1908 inc_nr_running(rq);
1907} 1909}
1908 1910
1909/* 1911/*
1910 * deactivate_task - remove a task from the runqueue. 1912 * deactivate_task - remove a task from the runqueue.
1911 */ 1913 */
1912static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 1914static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1913{ 1915{
1914 if (task_contributes_to_load(p)) 1916 if (task_contributes_to_load(p))
1915 rq->nr_uninterruptible++; 1917 rq->nr_uninterruptible++;
1916 1918
1917 dequeue_task(rq, p, sleep); 1919 dequeue_task(rq, p, flags);
1918 dec_nr_running(rq); 1920 dec_nr_running(rq);
1919} 1921}
1920 1922
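The enqueue/dequeue refactor above folds the old wakeup/head/sleep booleans into a single flags argument and hoists update_rq_clock() into enqueue_task()/dequeue_task(). A sketch of how the flags compose; the numeric values below are assumed, not quoted from this hunk:

#define ENQUEUE_WAKEUP	1	/* enqueue is part of a wakeup     */
#define ENQUEUE_WAKING	2	/* ->task_waking() has been called */
#define ENQUEUE_HEAD	4	/* queue at the head, not the tail */

/*
 * Call sites then compose behaviours instead of growing parameters,
 * e.g. enqueue_task(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
 */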
@@ -2043,21 +2045,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2043 __set_task_cpu(p, new_cpu); 2045 __set_task_cpu(p, new_cpu);
2044} 2046}
2045 2047
2046struct migration_req { 2048struct migration_arg {
2047 struct list_head list;
2048
2049 struct task_struct *task; 2049 struct task_struct *task;
2050 int dest_cpu; 2050 int dest_cpu;
2051
2052 struct completion done;
2053}; 2051};
2054 2052
2053static int migration_cpu_stop(void *data);
2054
2055/* 2055/*
2056 * The task's runqueue lock must be held. 2056 * The task's runqueue lock must be held.
2057 * Returns true if you have to wait for migration thread. 2057 * Returns true if you have to wait for migration thread.
2058 */ 2058 */
2059static int 2059static bool migrate_task(struct task_struct *p, int dest_cpu)
2060migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2061{ 2060{
2062 struct rq *rq = task_rq(p); 2061 struct rq *rq = task_rq(p);
2063 2062
@@ -2065,58 +2064,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2065 * If the task is not on a runqueue (and not running), then 2064 * If the task is not on a runqueue (and not running), then
2066 * the next wake-up will properly place the task. 2065 * the next wake-up will properly place the task.
2067 */ 2066 */
2068 if (!p->se.on_rq && !task_running(rq, p)) 2067 return p->se.on_rq || task_running(rq, p);
2069 return 0;
2070
2071 init_completion(&req->done);
2072 req->task = p;
2073 req->dest_cpu = dest_cpu;
2074 list_add(&req->list, &rq->migration_queue);
2075
2076 return 1;
2077}
2078
2079/*
2080 * wait_task_context_switch - wait for a thread to complete at least one
2081 * context switch.
2082 *
2083 * @p must not be current.
2084 */
2085void wait_task_context_switch(struct task_struct *p)
2086{
2087 unsigned long nvcsw, nivcsw, flags;
2088 int running;
2089 struct rq *rq;
2090
2091 nvcsw = p->nvcsw;
2092 nivcsw = p->nivcsw;
2093 for (;;) {
2094 /*
2095 * The runqueue is assigned before the actual context
2096 * switch. We need to take the runqueue lock.
2097 *
2098 * We could check initially without the lock but it is
2099 * very likely that we need to take the lock in every
2100 * iteration.
2101 */
2102 rq = task_rq_lock(p, &flags);
2103 running = task_running(rq, p);
2104 task_rq_unlock(rq, &flags);
2105
2106 if (likely(!running))
2107 break;
2108 /*
2109 * The switch count is incremented before the actual
2110 * context switch. We thus wait for two switches to be
2111 * sure at least one completed.
2112 */
2113 if ((p->nvcsw - nvcsw) > 1)
2114 break;
2115 if ((p->nivcsw - nivcsw) > 1)
2116 break;
2117
2118 cpu_relax();
2119 }
2120} 2068}
2121 2069
2122/* 2070/*
@@ -2174,7 +2122,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2174 * just go back and repeat. 2122 * just go back and repeat.
2175 */ 2123 */
2176 rq = task_rq_lock(p, &flags); 2124 rq = task_rq_lock(p, &flags);
2177 trace_sched_wait_task(rq, p); 2125 trace_sched_wait_task(p);
2178 running = task_running(rq, p); 2126 running = task_running(rq, p);
2179 on_rq = p->se.on_rq; 2127 on_rq = p->se.on_rq;
2180 ncsw = 0; 2128 ncsw = 0;
@@ -2272,6 +2220,9 @@ void task_oncpu_function_call(struct task_struct *p,
2272} 2220}
2273 2221
2274#ifdef CONFIG_SMP 2222#ifdef CONFIG_SMP
2223/*
2224 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
2225 */
2275static int select_fallback_rq(int cpu, struct task_struct *p) 2226static int select_fallback_rq(int cpu, struct task_struct *p)
2276{ 2227{
2277 int dest_cpu; 2228 int dest_cpu;
@@ -2288,12 +2239,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2288 return dest_cpu; 2239 return dest_cpu;
2289 2240
2290 /* No more Mr. Nice Guy. */ 2241 /* No more Mr. Nice Guy. */
2291 if (dest_cpu >= nr_cpu_ids) { 2242 if (unlikely(dest_cpu >= nr_cpu_ids)) {
2292 rcu_read_lock(); 2243 dest_cpu = cpuset_cpus_allowed_fallback(p);
2293 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2294 rcu_read_unlock();
2295 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2296
2297 /* 2244 /*
2298 * Don't tell them about moving exiting tasks or 2245 * Don't tell them about moving exiting tasks or
2299 * kernel threads (both mm NULL), since they never 2246 * kernel threads (both mm NULL), since they never
@@ -2310,17 +2257,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2310} 2257}
2311 2258
2312/* 2259/*
2313 * Gets called from 3 sites (exec, fork, wakeup), since it is called without 2260 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
2314 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2315 * by:
2316 *
2317 * exec: is unstable, retry loop
2318 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2319 */ 2261 */
2320static inline 2262static inline
2321int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2263int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
2322{ 2264{
2323 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2265 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
2324 2266
2325 /* 2267 /*
2326 * In order not to call set_task_cpu() on a blocking task we need 2268 * In order not to call set_task_cpu() on a blocking task we need
@@ -2338,6 +2280,12 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2338 2280
2339 return cpu; 2281 return cpu;
2340} 2282}
2283
2284static void update_avg(u64 *avg, u64 sample)
2285{
2286 s64 diff = sample - *avg;
2287 *avg += diff >> 3;
2288}
2341#endif 2289#endif
2342 2290
2343/*** 2291/***
@@ -2359,16 +2307,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2359{ 2307{
2360 int cpu, orig_cpu, this_cpu, success = 0; 2308 int cpu, orig_cpu, this_cpu, success = 0;
2361 unsigned long flags; 2309 unsigned long flags;
2362 struct rq *rq, *orig_rq; 2310 unsigned long en_flags = ENQUEUE_WAKEUP;
2363 2311 struct rq *rq;
2364 if (!sched_feat(SYNC_WAKEUPS))
2365 wake_flags &= ~WF_SYNC;
2366 2312
2367 this_cpu = get_cpu(); 2313 this_cpu = get_cpu();
2368 2314
2369 smp_wmb(); 2315 smp_wmb();
2370 rq = orig_rq = task_rq_lock(p, &flags); 2316 rq = task_rq_lock(p, &flags);
2371 update_rq_clock(rq);
2372 if (!(p->state & state)) 2317 if (!(p->state & state))
2373 goto out; 2318 goto out;
2374 2319
@@ -2388,28 +2333,26 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2388 * 2333 *
2389 * First fix up the nr_uninterruptible count: 2334 * First fix up the nr_uninterruptible count:
2390 */ 2335 */
2391 if (task_contributes_to_load(p)) 2336 if (task_contributes_to_load(p)) {
2392 rq->nr_uninterruptible--; 2337 if (likely(cpu_online(orig_cpu)))
2338 rq->nr_uninterruptible--;
2339 else
2340 this_rq()->nr_uninterruptible--;
2341 }
2393 p->state = TASK_WAKING; 2342 p->state = TASK_WAKING;
2394 2343
2395 if (p->sched_class->task_waking) 2344 if (p->sched_class->task_waking) {
2396 p->sched_class->task_waking(rq, p); 2345 p->sched_class->task_waking(rq, p);
2346 en_flags |= ENQUEUE_WAKING;
2347 }
2397 2348
2398 __task_rq_unlock(rq); 2349 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2399 2350 if (cpu != orig_cpu)
2400 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2401 if (cpu != orig_cpu) {
2402 /*
2403 * Since we migrate the task without holding any rq->lock,
2404 * we need to be careful with task_rq_lock(), since that
2405 * might end up locking an invalid rq.
2406 */
2407 set_task_cpu(p, cpu); 2351 set_task_cpu(p, cpu);
2408 } 2352 __task_rq_unlock(rq);
2409 2353
2410 rq = cpu_rq(cpu); 2354 rq = cpu_rq(cpu);
2411 raw_spin_lock(&rq->lock); 2355 raw_spin_lock(&rq->lock);
2412 update_rq_clock(rq);
2413 2356
2414 /* 2357 /*
2415 * We migrated the task without holding either rq->lock, however 2358 * We migrated the task without holding either rq->lock, however
@@ -2437,36 +2380,20 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2437 2380
2438out_activate: 2381out_activate:
2439#endif /* CONFIG_SMP */ 2382#endif /* CONFIG_SMP */
2440 schedstat_inc(p, se.nr_wakeups); 2383 schedstat_inc(p, se.statistics.nr_wakeups);
2441 if (wake_flags & WF_SYNC) 2384 if (wake_flags & WF_SYNC)
2442 schedstat_inc(p, se.nr_wakeups_sync); 2385 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2443 if (orig_cpu != cpu) 2386 if (orig_cpu != cpu)
2444 schedstat_inc(p, se.nr_wakeups_migrate); 2387 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2445 if (cpu == this_cpu) 2388 if (cpu == this_cpu)
2446 schedstat_inc(p, se.nr_wakeups_local); 2389 schedstat_inc(p, se.statistics.nr_wakeups_local);
2447 else 2390 else
2448 schedstat_inc(p, se.nr_wakeups_remote); 2391 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2449 activate_task(rq, p, 1); 2392 activate_task(rq, p, en_flags);
2450 success = 1; 2393 success = 1;
2451 2394
2452 /*
2453 * Only attribute actual wakeups done by this task.
2454 */
2455 if (!in_interrupt()) {
2456 struct sched_entity *se = &current->se;
2457 u64 sample = se->sum_exec_runtime;
2458
2459 if (se->last_wakeup)
2460 sample -= se->last_wakeup;
2461 else
2462 sample -= se->start_runtime;
2463 update_avg(&se->avg_wakeup, sample);
2464
2465 se->last_wakeup = se->sum_exec_runtime;
2466 }
2467
2468out_running: 2395out_running:
2469 trace_sched_wakeup(rq, p, success); 2396 trace_sched_wakeup(p, success);
2470 check_preempt_curr(rq, p, wake_flags); 2397 check_preempt_curr(rq, p, wake_flags);
2471 2398
2472 p->state = TASK_RUNNING; 2399 p->state = TASK_RUNNING;
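
The wakeup rework above also changes the cross-runqueue locking: the new CPU is published with set_task_cpu() while the old rq->lock is still held, the old lock is dropped, and only then is the destination runqueue locked, so the two locks are never held together. A userspace analogue of that hand-off pattern (pthread mutexes standing in for rq->lock; names are illustrative only):

#include <pthread.h>
#include <stdio.h>

struct queue { pthread_mutex_t lock; int nr; };
struct task { int qidx; };

static struct queue q[2] = {
        { PTHREAD_MUTEX_INITIALIZER, 0 },
        { PTHREAD_MUTEX_INITIALIZER, 0 },
};

static void wake_and_migrate(struct task *t, int dest)
{
        int src = t->qidx;

        pthread_mutex_lock(&q[src].lock);
        t->qidx = dest;                 /* like set_task_cpu(): publish under the old lock */
        pthread_mutex_unlock(&q[src].lock);

        pthread_mutex_lock(&q[dest].lock);
        q[dest].nr++;                   /* like activate_task() on the destination queue */
        pthread_mutex_unlock(&q[dest].lock);
}

int main(void)
{
        struct task t = { .qidx = 0 };

        wake_and_migrate(&t, 1);
        printf("task now on queue %d (nr_running=%d)\n", t.qidx, q[1].nr);
        return 0;
}
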
@@ -2526,42 +2453,9 @@ static void __sched_fork(struct task_struct *p)
2526 p->se.sum_exec_runtime = 0; 2453 p->se.sum_exec_runtime = 0;
2527 p->se.prev_sum_exec_runtime = 0; 2454 p->se.prev_sum_exec_runtime = 0;
2528 p->se.nr_migrations = 0; 2455 p->se.nr_migrations = 0;
2529 p->se.last_wakeup = 0;
2530 p->se.avg_overlap = 0;
2531 p->se.start_runtime = 0;
2532 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2533 2456
2534#ifdef CONFIG_SCHEDSTATS 2457#ifdef CONFIG_SCHEDSTATS
2535 p->se.wait_start = 0; 2458 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2536 p->se.wait_max = 0;
2537 p->se.wait_count = 0;
2538 p->se.wait_sum = 0;
2539
2540 p->se.sleep_start = 0;
2541 p->se.sleep_max = 0;
2542 p->se.sum_sleep_runtime = 0;
2543
2544 p->se.block_start = 0;
2545 p->se.block_max = 0;
2546 p->se.exec_max = 0;
2547 p->se.slice_max = 0;
2548
2549 p->se.nr_migrations_cold = 0;
2550 p->se.nr_failed_migrations_affine = 0;
2551 p->se.nr_failed_migrations_running = 0;
2552 p->se.nr_failed_migrations_hot = 0;
2553 p->se.nr_forced_migrations = 0;
2554
2555 p->se.nr_wakeups = 0;
2556 p->se.nr_wakeups_sync = 0;
2557 p->se.nr_wakeups_migrate = 0;
2558 p->se.nr_wakeups_local = 0;
2559 p->se.nr_wakeups_remote = 0;
2560 p->se.nr_wakeups_affine = 0;
2561 p->se.nr_wakeups_affine_attempts = 0;
2562 p->se.nr_wakeups_passive = 0;
2563 p->se.nr_wakeups_idle = 0;
2564
2565#endif 2459#endif
2566 2460
2567 INIT_LIST_HEAD(&p->rt.run_list); 2461 INIT_LIST_HEAD(&p->rt.run_list);
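
Collapsing the long run of per-field clears into one memset() is possible because all schedstat fields now live in a single embedded struct; any field added to that struct later is zeroed on fork automatically. A reduced sketch of the pattern (field names abbreviated, not the full kernel layout):

#include <stdio.h>
#include <string.h>

struct sched_statistics { unsigned long wait_max, sleep_max, block_max; };

struct entity {
        unsigned long sum_exec_runtime;         /* always maintained */
        struct sched_statistics statistics;     /* CONFIG_SCHEDSTATS only */
};

static void fork_init(struct entity *se)
{
        se->sum_exec_runtime = 0;
        memset(&se->statistics, 0, sizeof(se->statistics));
}

int main(void)
{
        struct entity se = { 1, { 2, 3, 4 } };

        fork_init(&se);
        printf("%lu %lu %lu\n", se.sum_exec_runtime,
               se.statistics.wait_max, se.statistics.block_max);
        return 0;
}
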
@@ -2582,11 +2476,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
2582 2476
2583 __sched_fork(p); 2477 __sched_fork(p);
2584 /* 2478 /*
2585 * We mark the process as waking here. This guarantees that 2479 * We mark the process as running here. This guarantees that
2586 * nobody will actually run it, and a signal or other external 2480 * nobody will actually run it, and a signal or other external
2587 * event cannot wake it up and insert it on the runqueue either. 2481 * event cannot wake it up and insert it on the runqueue either.
2588 */ 2482 */
2589 p->state = TASK_WAKING; 2483 p->state = TASK_RUNNING;
2590 2484
2591 /* 2485 /*
2592 * Revert to default priority/policy on fork if requested. 2486 * Revert to default priority/policy on fork if requested.
@@ -2650,34 +2544,30 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2650{ 2544{
2651 unsigned long flags; 2545 unsigned long flags;
2652 struct rq *rq; 2546 struct rq *rq;
2653 int cpu = get_cpu(); 2547 int cpu __maybe_unused = get_cpu();
2654 2548
2655#ifdef CONFIG_SMP 2549#ifdef CONFIG_SMP
2550 rq = task_rq_lock(p, &flags);
2551 p->state = TASK_WAKING;
2552
2656 /* 2553 /*
2657 * Fork balancing, do it here and not earlier because: 2554 * Fork balancing, do it here and not earlier because:
2658 * - cpus_allowed can change in the fork path 2555 * - cpus_allowed can change in the fork path
2659 * - any previously selected cpu might disappear through hotplug 2556 * - any previously selected cpu might disappear through hotplug
2660 * 2557 *
2661 * We still have TASK_WAKING but PF_STARTING is gone now, meaning 2558 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2662 * ->cpus_allowed is stable, we have preemption disabled, meaning 2559 * without people poking at ->cpus_allowed.
2663 * cpu_online_mask is stable.
2664 */ 2560 */
2665 cpu = select_task_rq(p, SD_BALANCE_FORK, 0); 2561 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
2666 set_task_cpu(p, cpu); 2562 set_task_cpu(p, cpu);
2667#endif
2668
2669 /*
2670 * Since the task is not on the rq and we still have TASK_WAKING set
2671 * nobody else will migrate this task.
2672 */
2673 rq = cpu_rq(cpu);
2674 raw_spin_lock_irqsave(&rq->lock, flags);
2675 2563
2676 BUG_ON(p->state != TASK_WAKING);
2677 p->state = TASK_RUNNING; 2564 p->state = TASK_RUNNING;
2678 update_rq_clock(rq); 2565 task_rq_unlock(rq, &flags);
2566#endif
2567
2568 rq = task_rq_lock(p, &flags);
2679 activate_task(rq, p, 0); 2569 activate_task(rq, p, 0);
2680 trace_sched_wakeup_new(rq, p, 1); 2570 trace_sched_wakeup_new(p, 1);
2681 check_preempt_curr(rq, p, WF_FORK); 2571 check_preempt_curr(rq, p, WF_FORK);
2682#ifdef CONFIG_SMP 2572#ifdef CONFIG_SMP
2683 if (p->sched_class->task_woken) 2573 if (p->sched_class->task_woken)
@@ -2897,7 +2787,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2897 struct mm_struct *mm, *oldmm; 2787 struct mm_struct *mm, *oldmm;
2898 2788
2899 prepare_task_switch(rq, prev, next); 2789 prepare_task_switch(rq, prev, next);
2900 trace_sched_switch(rq, prev, next); 2790 trace_sched_switch(prev, next);
2901 mm = next->mm; 2791 mm = next->mm;
2902 oldmm = prev->active_mm; 2792 oldmm = prev->active_mm;
2903 /* 2793 /*
@@ -3014,6 +2904,61 @@ static unsigned long calc_load_update;
3014unsigned long avenrun[3]; 2904unsigned long avenrun[3];
3015EXPORT_SYMBOL(avenrun); 2905EXPORT_SYMBOL(avenrun);
3016 2906
2907static long calc_load_fold_active(struct rq *this_rq)
2908{
2909 long nr_active, delta = 0;
2910
2911 nr_active = this_rq->nr_running;
2912 nr_active += (long) this_rq->nr_uninterruptible;
2913
2914 if (nr_active != this_rq->calc_load_active) {
2915 delta = nr_active - this_rq->calc_load_active;
2916 this_rq->calc_load_active = nr_active;
2917 }
2918
2919 return delta;
2920}
2921
2922#ifdef CONFIG_NO_HZ
2923/*
2924 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
2925 *
2926 * When making the ILB scale, we should try to pull this in as well.
2927 */
2928static atomic_long_t calc_load_tasks_idle;
2929
2930static void calc_load_account_idle(struct rq *this_rq)
2931{
2932 long delta;
2933
2934 delta = calc_load_fold_active(this_rq);
2935 if (delta)
2936 atomic_long_add(delta, &calc_load_tasks_idle);
2937}
2938
2939static long calc_load_fold_idle(void)
2940{
2941 long delta = 0;
2942
2943 /*
2944 * It's got a race; we don't care...
2945 */
2946 if (atomic_long_read(&calc_load_tasks_idle))
2947 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
2948
2949 return delta;
2950}
2951#else
2952static void calc_load_account_idle(struct rq *this_rq)
2953{
2954}
2955
2956static inline long calc_load_fold_idle(void)
2957{
2958 return 0;
2959}
2960#endif
2961
3017/** 2962/**
3018 * get_avenrun - get the load average array 2963 * get_avenrun - get the load average array
3019 * @loads: pointer to dest load array 2964 * @loads: pointer to dest load array
@@ -3060,20 +3005,22 @@ void calc_global_load(void)
3060} 3005}
3061 3006
3062/* 3007/*
3063 * Either called from update_cpu_load() or from a cpu going idle 3008 * Called from update_cpu_load() to periodically update this CPU's
3009 * active count.
3064 */ 3010 */
3065static void calc_load_account_active(struct rq *this_rq) 3011static void calc_load_account_active(struct rq *this_rq)
3066{ 3012{
3067 long nr_active, delta; 3013 long delta;
3068 3014
3069 nr_active = this_rq->nr_running; 3015 if (time_before(jiffies, this_rq->calc_load_update))
3070 nr_active += (long) this_rq->nr_uninterruptible; 3016 return;
3071 3017
3072 if (nr_active != this_rq->calc_load_active) { 3018 delta = calc_load_fold_active(this_rq);
3073 delta = nr_active - this_rq->calc_load_active; 3019 delta += calc_load_fold_idle();
3074 this_rq->calc_load_active = nr_active; 3020 if (delta)
3075 atomic_long_add(delta, &calc_load_tasks); 3021 atomic_long_add(delta, &calc_load_tasks);
3076 } 3022
3023 this_rq->calc_load_update += LOAD_FREQ;
3077} 3024}
3078 3025
3079/* 3026/*
@@ -3105,10 +3052,7 @@ static void update_cpu_load(struct rq *this_rq)
3105 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3052 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3106 } 3053 }
3107 3054
3108 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3055 calc_load_account_active(this_rq);
3109 this_rq->calc_load_update += LOAD_FREQ;
3110 calc_load_account_active(this_rq);
3111 }
3112} 3056}
3113 3057
3114#ifdef CONFIG_SMP 3058#ifdef CONFIG_SMP
@@ -3120,44 +3064,27 @@ static void update_cpu_load(struct rq *this_rq)
3120void sched_exec(void) 3064void sched_exec(void)
3121{ 3065{
3122 struct task_struct *p = current; 3066 struct task_struct *p = current;
3123 struct migration_req req;
3124 int dest_cpu, this_cpu;
3125 unsigned long flags; 3067 unsigned long flags;
3126 struct rq *rq; 3068 struct rq *rq;
3127 3069 int dest_cpu;
3128again:
3129 this_cpu = get_cpu();
3130 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3131 if (dest_cpu == this_cpu) {
3132 put_cpu();
3133 return;
3134 }
3135 3070
3136 rq = task_rq_lock(p, &flags); 3071 rq = task_rq_lock(p, &flags);
3137 put_cpu(); 3072 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3073 if (dest_cpu == smp_processor_id())
3074 goto unlock;
3138 3075
3139 /* 3076 /*
3140 * select_task_rq() can race against ->cpus_allowed 3077 * select_task_rq() can race against ->cpus_allowed
3141 */ 3078 */
3142 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3079 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3143 || unlikely(!cpu_active(dest_cpu))) { 3080 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3144 task_rq_unlock(rq, &flags); 3081 struct migration_arg arg = { p, dest_cpu };
3145 goto again;
3146 }
3147 3082
3148 /* force the process onto the specified CPU */
3149 if (migrate_task(p, dest_cpu, &req)) {
3150 /* Need to wait for migration thread (might exit: take ref). */
3151 struct task_struct *mt = rq->migration_thread;
3152
3153 get_task_struct(mt);
3154 task_rq_unlock(rq, &flags); 3083 task_rq_unlock(rq, &flags);
3155 wake_up_process(mt); 3084 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
3156 put_task_struct(mt);
3157 wait_for_completion(&req.done);
3158
3159 return; 3085 return;
3160 } 3086 }
3087unlock:
3161 task_rq_unlock(rq, &flags); 3088 task_rq_unlock(rq, &flags);
3162} 3089}
3163 3090
@@ -3629,23 +3556,9 @@ static inline void schedule_debug(struct task_struct *prev)
3629 3556
3630static void put_prev_task(struct rq *rq, struct task_struct *prev) 3557static void put_prev_task(struct rq *rq, struct task_struct *prev)
3631{ 3558{
3632 if (prev->state == TASK_RUNNING) { 3559 if (prev->se.on_rq)
3633 u64 runtime = prev->se.sum_exec_runtime; 3560 update_rq_clock(rq);
3634 3561 rq->skip_clock_update = 0;
3635 runtime -= prev->se.prev_sum_exec_runtime;
3636 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
3637
3638 /*
3639 * In order to avoid avg_overlap growing stale when we are
3640 * indeed overlapping and hence not getting put to sleep, grow
3641 * the avg_overlap on preemption.
3642 *
3643 * We use the average preemption runtime because that
3644 * correlates to the amount of cache footprint a task can
3645 * build up.
3646 */
3647 update_avg(&prev->se.avg_overlap, runtime);
3648 }
3649 prev->sched_class->put_prev_task(rq, prev); 3562 prev->sched_class->put_prev_task(rq, prev);
3650} 3563}
3651 3564
@@ -3695,7 +3608,7 @@ need_resched:
3695 preempt_disable(); 3608 preempt_disable();
3696 cpu = smp_processor_id(); 3609 cpu = smp_processor_id();
3697 rq = cpu_rq(cpu); 3610 rq = cpu_rq(cpu);
3698 rcu_sched_qs(cpu); 3611 rcu_note_context_switch(cpu);
3699 prev = rq->curr; 3612 prev = rq->curr;
3700 switch_count = &prev->nivcsw; 3613 switch_count = &prev->nivcsw;
3701 3614
@@ -3708,14 +3621,13 @@ need_resched_nonpreemptible:
3708 hrtick_clear(rq); 3621 hrtick_clear(rq);
3709 3622
3710 raw_spin_lock_irq(&rq->lock); 3623 raw_spin_lock_irq(&rq->lock);
3711 update_rq_clock(rq);
3712 clear_tsk_need_resched(prev); 3624 clear_tsk_need_resched(prev);
3713 3625
3714 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3626 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3715 if (unlikely(signal_pending_state(prev->state, prev))) 3627 if (unlikely(signal_pending_state(prev->state, prev)))
3716 prev->state = TASK_RUNNING; 3628 prev->state = TASK_RUNNING;
3717 else 3629 else
3718 deactivate_task(rq, prev, 1); 3630 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3719 switch_count = &prev->nvcsw; 3631 switch_count = &prev->nvcsw;
3720 } 3632 }
3721 3633
@@ -3779,7 +3691,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3779 * the mutex owner just released it and exited. 3691 * the mutex owner just released it and exited.
3780 */ 3692 */
3781 if (probe_kernel_address(&owner->cpu, cpu)) 3693 if (probe_kernel_address(&owner->cpu, cpu))
3782 goto out; 3694 return 0;
3783#else 3695#else
3784 cpu = owner->cpu; 3696 cpu = owner->cpu;
3785#endif 3697#endif
@@ -3789,14 +3701,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3789 * the cpu field may no longer be valid. 3701 * the cpu field may no longer be valid.
3790 */ 3702 */
3791 if (cpu >= nr_cpumask_bits) 3703 if (cpu >= nr_cpumask_bits)
3792 goto out; 3704 return 0;
3793 3705
3794 /* 3706 /*
3795 * We need to validate that we can do a 3707 * We need to validate that we can do a
3796 * get_cpu() and that we have the percpu area. 3708 * get_cpu() and that we have the percpu area.
3797 */ 3709 */
3798 if (!cpu_online(cpu)) 3710 if (!cpu_online(cpu))
3799 goto out; 3711 return 0;
3800 3712
3801 rq = cpu_rq(cpu); 3713 rq = cpu_rq(cpu);
3802 3714
@@ -3815,7 +3727,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3815 3727
3816 cpu_relax(); 3728 cpu_relax();
3817 } 3729 }
3818out: 3730
3819 return 1; 3731 return 1;
3820} 3732}
3821#endif 3733#endif
@@ -4038,8 +3950,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4038 if (!x->done) { 3950 if (!x->done) {
4039 DECLARE_WAITQUEUE(wait, current); 3951 DECLARE_WAITQUEUE(wait, current);
4040 3952
4041 wait.flags |= WQ_FLAG_EXCLUSIVE; 3953 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4042 __add_wait_queue_tail(&x->wait, &wait);
4043 do { 3954 do {
4044 if (signal_pending_state(state, current)) { 3955 if (signal_pending_state(state, current)) {
4045 timeout = -ERESTARTSYS; 3956 timeout = -ERESTARTSYS;
@@ -4265,7 +4176,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4265 BUG_ON(prio < 0 || prio > MAX_PRIO); 4176 BUG_ON(prio < 0 || prio > MAX_PRIO);
4266 4177
4267 rq = task_rq_lock(p, &flags); 4178 rq = task_rq_lock(p, &flags);
4268 update_rq_clock(rq);
4269 4179
4270 oldprio = p->prio; 4180 oldprio = p->prio;
4271 prev_class = p->sched_class; 4181 prev_class = p->sched_class;
@@ -4286,7 +4196,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4286 if (running) 4196 if (running)
4287 p->sched_class->set_curr_task(rq); 4197 p->sched_class->set_curr_task(rq);
4288 if (on_rq) { 4198 if (on_rq) {
4289 enqueue_task(rq, p, 0, oldprio < prio); 4199 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4290 4200
4291 check_class_changed(rq, p, prev_class, oldprio, running); 4201 check_class_changed(rq, p, prev_class, oldprio, running);
4292 } 4202 }
@@ -4308,7 +4218,6 @@ void set_user_nice(struct task_struct *p, long nice)
4308 * the task might be in the middle of scheduling on another CPU. 4218 * the task might be in the middle of scheduling on another CPU.
4309 */ 4219 */
4310 rq = task_rq_lock(p, &flags); 4220 rq = task_rq_lock(p, &flags);
4311 update_rq_clock(rq);
4312 /* 4221 /*
4313 * The RT priorities are set via sched_setscheduler(), but we still 4222 * The RT priorities are set via sched_setscheduler(), but we still
4314 * allow the 'normal' nice value to be set - but as expected 4223 * allow the 'normal' nice value to be set - but as expected
@@ -4330,7 +4239,7 @@ void set_user_nice(struct task_struct *p, long nice)
4330 delta = p->prio - old_prio; 4239 delta = p->prio - old_prio;
4331 4240
4332 if (on_rq) { 4241 if (on_rq) {
4333 enqueue_task(rq, p, 0, false); 4242 enqueue_task(rq, p, 0);
4334 /* 4243 /*
4335 * If the task increased its priority or is running and 4244 * If the task increased its priority or is running and
4336 * lowered its priority, then reschedule its CPU: 4245 * lowered its priority, then reschedule its CPU:
@@ -4353,7 +4262,7 @@ int can_nice(const struct task_struct *p, const int nice)
4353 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4262 /* convert nice value [19,-20] to rlimit style value [1,40] */
4354 int nice_rlim = 20 - nice; 4263 int nice_rlim = 20 - nice;
4355 4264
4356 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 4265 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4357 capable(CAP_SYS_NICE)); 4266 capable(CAP_SYS_NICE));
4358} 4267}
4359 4268
@@ -4530,7 +4439,7 @@ recheck:
4530 4439
4531 if (!lock_task_sighand(p, &flags)) 4440 if (!lock_task_sighand(p, &flags))
4532 return -ESRCH; 4441 return -ESRCH;
4533 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4442 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4534 unlock_task_sighand(p, &flags); 4443 unlock_task_sighand(p, &flags);
4535 4444
4536 /* can't set/change the rt policy */ 4445 /* can't set/change the rt policy */
@@ -4591,7 +4500,6 @@ recheck:
4591 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4500 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4592 goto recheck; 4501 goto recheck;
4593 } 4502 }
4594 update_rq_clock(rq);
4595 on_rq = p->se.on_rq; 4503 on_rq = p->se.on_rq;
4596 running = task_current(rq, p); 4504 running = task_current(rq, p);
4597 if (on_rq) 4505 if (on_rq)
@@ -4902,7 +4810,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4902 int ret; 4810 int ret;
4903 cpumask_var_t mask; 4811 cpumask_var_t mask;
4904 4812
4905 if (len < cpumask_size()) 4813 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4814 return -EINVAL;
4815 if (len & (sizeof(unsigned long)-1))
4906 return -EINVAL; 4816 return -EINVAL;
4907 4817
4908 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4818 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
@@ -4910,10 +4820,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4910 4820
4911 ret = sched_getaffinity(pid, mask); 4821 ret = sched_getaffinity(pid, mask);
4912 if (ret == 0) { 4822 if (ret == 0) {
4913 if (copy_to_user(user_mask_ptr, mask, cpumask_size())) 4823 size_t retlen = min_t(size_t, len, cpumask_size());
4824
4825 if (copy_to_user(user_mask_ptr, mask, retlen))
4914 ret = -EFAULT; 4826 ret = -EFAULT;
4915 else 4827 else
4916 ret = cpumask_size(); 4828 ret = retlen;
4917 } 4829 }
4918 free_cpumask_var(mask); 4830 free_cpumask_var(mask);
4919 4831
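
With the relaxed check, a user buffer only has to cover nr_cpu_ids bits (and be a multiple of sizeof(unsigned long)); the kernel copies back min(len, cpumask_size()) bytes, so glibc's fixed 1024-bit cpu_set_t keeps working on machines with few CPUs. Minimal userspace usage:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;          /* glibc's fixed 1024-bit mask */

        /* glibc's wrapper returns 0 on success; the raw syscall
         * returns the number of bytes it copied. */
        if (sched_getaffinity(0, sizeof(set), &set) == 0)
                printf("runnable on %d CPUs\n", CPU_COUNT(&set));
        return 0;
}
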
@@ -5324,17 +5236,15 @@ static inline void sched_init_granularity(void)
5324/* 5236/*
5325 * This is how migration works: 5237 * This is how migration works:
5326 * 5238 *
5327 * 1) we queue a struct migration_req structure in the source CPU's 5239 * 1) we invoke migration_cpu_stop() on the target CPU using
5328 * runqueue and wake up that CPU's migration thread. 5240 * stop_one_cpu().
5329 * 2) we down() the locked semaphore => thread blocks. 5241 * 2) stopper starts to run (implicitly forcing the migrated thread
5330 * 3) migration thread wakes up (implicitly it forces the migrated 5242 * off the CPU)
5331 * thread off the CPU) 5243 * 3) it checks whether the migrated task is still in the wrong runqueue.
5332 * 4) it gets the migration request and checks whether the migrated 5244 * 4) if it's in the wrong runqueue then the migration thread removes
5333 * task is still in the wrong runqueue.
5334 * 5) if it's in the wrong runqueue then the migration thread removes
5335 * it and puts it into the right queue. 5245 * it and puts it into the right queue.
5336 * 6) migration thread up()s the semaphore. 5246 * 5) stopper completes and stop_one_cpu() returns and the migration
5337 * 7) we wake up and the migration is done. 5247 * is done.
5338 */ 5248 */
5339 5249
5340/* 5250/*
@@ -5348,12 +5258,23 @@ static inline void sched_init_granularity(void)
5348 */ 5258 */
5349int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5259int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5350{ 5260{
5351 struct migration_req req;
5352 unsigned long flags; 5261 unsigned long flags;
5353 struct rq *rq; 5262 struct rq *rq;
5263 unsigned int dest_cpu;
5354 int ret = 0; 5264 int ret = 0;
5355 5265
5266 /*
5267 * Serialize against TASK_WAKING so that ttwu() and wake_up_new_task() can
5268 * drop the rq->lock and still rely on ->cpus_allowed.
5269 */
5270again:
5271 while (task_is_waking(p))
5272 cpu_relax();
5356 rq = task_rq_lock(p, &flags); 5273 rq = task_rq_lock(p, &flags);
5274 if (task_is_waking(p)) {
5275 task_rq_unlock(rq, &flags);
5276 goto again;
5277 }
5357 5278
5358 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5279 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5359 ret = -EINVAL; 5280 ret = -EINVAL;
@@ -5377,15 +5298,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5377 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5298 if (cpumask_test_cpu(task_cpu(p), new_mask))
5378 goto out; 5299 goto out;
5379 5300
5380 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { 5301 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5302 if (migrate_task(p, dest_cpu)) {
5303 struct migration_arg arg = { p, dest_cpu };
5381 /* Need help from migration thread: drop lock and wait. */ 5304 /* Need help from migration thread: drop lock and wait. */
5382 struct task_struct *mt = rq->migration_thread;
5383
5384 get_task_struct(mt);
5385 task_rq_unlock(rq, &flags); 5305 task_rq_unlock(rq, &flags);
5386 wake_up_process(rq->migration_thread); 5306 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5387 put_task_struct(mt);
5388 wait_for_completion(&req.done);
5389 tlb_migrate_finish(p->mm); 5307 tlb_migrate_finish(p->mm);
5390 return 0; 5308 return 0;
5391 } 5309 }
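
The new entry sequence is a "wait outside the lock, re-check inside" pattern: spin while the task is in TASK_WAKING, take the rq lock, then verify the state again, since it may have been set anew between the lockless read and the lock acquisition. The same idiom in a userspace sketch (an atomic flag and a pthread mutex standing in for the task state and rq->lock):

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>

static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool task_waking;

static void change_affinity(void)
{
again:
        while (atomic_load(&task_waking))
                sched_yield();                  /* cpu_relax() stand-in */

        pthread_mutex_lock(&rq_lock);
        if (atomic_load(&task_waking)) {
                /* raced: the flag was set again before we got the lock */
                pthread_mutex_unlock(&rq_lock);
                goto again;
        }
        /* ... ->cpus_allowed may be modified safely here ... */
        pthread_mutex_unlock(&rq_lock);
}

int main(void)
{
        change_affinity();
        return 0;
}
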
@@ -5443,98 +5361,49 @@ fail:
5443 return ret; 5361 return ret;
5444} 5362}
5445 5363
5446#define RCU_MIGRATION_IDLE 0
5447#define RCU_MIGRATION_NEED_QS 1
5448#define RCU_MIGRATION_GOT_QS 2
5449#define RCU_MIGRATION_MUST_SYNC 3
5450
5451/* 5364/*
5452 * migration_thread - this is a highprio system thread that performs 5365 * migration_cpu_stop - this will be executed by a highprio stopper thread
5453 * thread migration by bumping thread off CPU then 'pushing' onto 5366 * and performs thread migration by bumping thread off CPU then
5454 * another runqueue. 5367 * 'pushing' onto another runqueue.
5455 */ 5368 */
5456static int migration_thread(void *data) 5369static int migration_cpu_stop(void *data)
5457{ 5370{
5458 int badcpu; 5371 struct migration_arg *arg = data;
5459 int cpu = (long)data;
5460 struct rq *rq;
5461
5462 rq = cpu_rq(cpu);
5463 BUG_ON(rq->migration_thread != current);
5464
5465 set_current_state(TASK_INTERRUPTIBLE);
5466 while (!kthread_should_stop()) {
5467 struct migration_req *req;
5468 struct list_head *head;
5469
5470 raw_spin_lock_irq(&rq->lock);
5471
5472 if (cpu_is_offline(cpu)) {
5473 raw_spin_unlock_irq(&rq->lock);
5474 break;
5475 }
5476
5477 if (rq->active_balance) {
5478 active_load_balance(rq, cpu);
5479 rq->active_balance = 0;
5480 }
5481
5482 head = &rq->migration_queue;
5483
5484 if (list_empty(head)) {
5485 raw_spin_unlock_irq(&rq->lock);
5486 schedule();
5487 set_current_state(TASK_INTERRUPTIBLE);
5488 continue;
5489 }
5490 req = list_entry(head->next, struct migration_req, list);
5491 list_del_init(head->next);
5492
5493 if (req->task != NULL) {
5494 raw_spin_unlock(&rq->lock);
5495 __migrate_task(req->task, cpu, req->dest_cpu);
5496 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
5497 req->dest_cpu = RCU_MIGRATION_GOT_QS;
5498 raw_spin_unlock(&rq->lock);
5499 } else {
5500 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
5501 raw_spin_unlock(&rq->lock);
5502 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
5503 }
5504 local_irq_enable();
5505
5506 complete(&req->done);
5507 }
5508 __set_current_state(TASK_RUNNING);
5509
5510 return 0;
5511}
5512
5513#ifdef CONFIG_HOTPLUG_CPU
5514
5515static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
5516{
5517 int ret;
5518 5372
5373 /*
5374 * The original target cpu might have gone down and we might
5375 * be on another cpu but it doesn't matter.
5376 */
5519 local_irq_disable(); 5377 local_irq_disable();
5520 ret = __migrate_task(p, src_cpu, dest_cpu); 5378 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5521 local_irq_enable(); 5379 local_irq_enable();
5522 return ret; 5380 return 0;
5523} 5381}
5524 5382
5383#ifdef CONFIG_HOTPLUG_CPU
5525/* 5384/*
5526 * Figure out where task on dead CPU should go, use force if necessary. 5385 * Figure out where task on dead CPU should go, use force if necessary.
5527 */ 5386 */
5528static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5387void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5529{ 5388{
5530 int dest_cpu; 5389 struct rq *rq = cpu_rq(dead_cpu);
5390 int needs_cpu, uninitialized_var(dest_cpu);
5391 unsigned long flags;
5531 5392
5532again: 5393 local_irq_save(flags);
5533 dest_cpu = select_fallback_rq(dead_cpu, p);
5534 5394
5535 /* It can have affinity changed while we were choosing. */ 5395 raw_spin_lock(&rq->lock);
5536 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 5396 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
5537 goto again; 5397 if (needs_cpu)
5398 dest_cpu = select_fallback_rq(dead_cpu, p);
5399 raw_spin_unlock(&rq->lock);
5400 /*
5401 * It can only fail if we race with set_cpus_allowed(),
5402 * in which case the racer should migrate the task anyway.
5403 */
5404 if (needs_cpu)
5405 __migrate_task(p, dead_cpu, dest_cpu);
5406 local_irq_restore(flags);
5538} 5407}
5539 5408
5540/* 5409/*
@@ -5598,7 +5467,6 @@ void sched_idle_next(void)
5598 5467
5599 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5468 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5600 5469
5601 update_rq_clock(rq);
5602 activate_task(rq, p, 0); 5470 activate_task(rq, p, 0);
5603 5471
5604 raw_spin_unlock_irqrestore(&rq->lock, flags); 5472 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5653,7 +5521,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
5653 for ( ; ; ) { 5521 for ( ; ; ) {
5654 if (!rq->nr_running) 5522 if (!rq->nr_running)
5655 break; 5523 break;
5656 update_rq_clock(rq);
5657 next = pick_next_task(rq); 5524 next = pick_next_task(rq);
5658 if (!next) 5525 if (!next)
5659 break; 5526 break;
@@ -5876,35 +5743,20 @@ static void set_rq_offline(struct rq *rq)
5876static int __cpuinit 5743static int __cpuinit
5877migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5744migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5878{ 5745{
5879 struct task_struct *p;
5880 int cpu = (long)hcpu; 5746 int cpu = (long)hcpu;
5881 unsigned long flags; 5747 unsigned long flags;
5882 struct rq *rq; 5748 struct rq *rq = cpu_rq(cpu);
5883 5749
5884 switch (action) { 5750 switch (action) {
5885 5751
5886 case CPU_UP_PREPARE: 5752 case CPU_UP_PREPARE:
5887 case CPU_UP_PREPARE_FROZEN: 5753 case CPU_UP_PREPARE_FROZEN:
5888 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5889 if (IS_ERR(p))
5890 return NOTIFY_BAD;
5891 kthread_bind(p, cpu);
5892 /* Must be high prio: stop_machine expects to yield to it. */
5893 rq = task_rq_lock(p, &flags);
5894 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5895 task_rq_unlock(rq, &flags);
5896 get_task_struct(p);
5897 cpu_rq(cpu)->migration_thread = p;
5898 rq->calc_load_update = calc_load_update; 5754 rq->calc_load_update = calc_load_update;
5899 break; 5755 break;
5900 5756
5901 case CPU_ONLINE: 5757 case CPU_ONLINE:
5902 case CPU_ONLINE_FROZEN: 5758 case CPU_ONLINE_FROZEN:
5903 /* Strictly unnecessary, as first user will wake it. */
5904 wake_up_process(cpu_rq(cpu)->migration_thread);
5905
5906 /* Update our root-domain */ 5759 /* Update our root-domain */
5907 rq = cpu_rq(cpu);
5908 raw_spin_lock_irqsave(&rq->lock, flags); 5760 raw_spin_lock_irqsave(&rq->lock, flags);
5909 if (rq->rd) { 5761 if (rq->rd) {
5910 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5762 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -5915,61 +5767,24 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5915 break; 5767 break;
5916 5768
5917#ifdef CONFIG_HOTPLUG_CPU 5769#ifdef CONFIG_HOTPLUG_CPU
5918 case CPU_UP_CANCELED:
5919 case CPU_UP_CANCELED_FROZEN:
5920 if (!cpu_rq(cpu)->migration_thread)
5921 break;
5922 /* Unbind it from offline cpu so it can run. Fall thru. */
5923 kthread_bind(cpu_rq(cpu)->migration_thread,
5924 cpumask_any(cpu_online_mask));
5925 kthread_stop(cpu_rq(cpu)->migration_thread);
5926 put_task_struct(cpu_rq(cpu)->migration_thread);
5927 cpu_rq(cpu)->migration_thread = NULL;
5928 break;
5929
5930 case CPU_DEAD: 5770 case CPU_DEAD:
5931 case CPU_DEAD_FROZEN: 5771 case CPU_DEAD_FROZEN:
5932 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
5933 migrate_live_tasks(cpu); 5772 migrate_live_tasks(cpu);
5934 rq = cpu_rq(cpu);
5935 kthread_stop(rq->migration_thread);
5936 put_task_struct(rq->migration_thread);
5937 rq->migration_thread = NULL;
5938 /* Idle task back to normal (off runqueue, low prio) */ 5773 /* Idle task back to normal (off runqueue, low prio) */
5939 raw_spin_lock_irq(&rq->lock); 5774 raw_spin_lock_irq(&rq->lock);
5940 update_rq_clock(rq);
5941 deactivate_task(rq, rq->idle, 0); 5775 deactivate_task(rq, rq->idle, 0);
5942 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5776 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5943 rq->idle->sched_class = &idle_sched_class; 5777 rq->idle->sched_class = &idle_sched_class;
5944 migrate_dead_tasks(cpu); 5778 migrate_dead_tasks(cpu);
5945 raw_spin_unlock_irq(&rq->lock); 5779 raw_spin_unlock_irq(&rq->lock);
5946 cpuset_unlock();
5947 migrate_nr_uninterruptible(rq); 5780 migrate_nr_uninterruptible(rq);
5948 BUG_ON(rq->nr_running != 0); 5781 BUG_ON(rq->nr_running != 0);
5949 calc_global_load_remove(rq); 5782 calc_global_load_remove(rq);
5950 /*
5951 * No need to migrate the tasks: it was best-effort if
5952 * they didn't take sched_hotcpu_mutex. Just wake up
5953 * the requestors.
5954 */
5955 raw_spin_lock_irq(&rq->lock);
5956 while (!list_empty(&rq->migration_queue)) {
5957 struct migration_req *req;
5958
5959 req = list_entry(rq->migration_queue.next,
5960 struct migration_req, list);
5961 list_del_init(&req->list);
5962 raw_spin_unlock_irq(&rq->lock);
5963 complete(&req->done);
5964 raw_spin_lock_irq(&rq->lock);
5965 }
5966 raw_spin_unlock_irq(&rq->lock);
5967 break; 5783 break;
5968 5784
5969 case CPU_DYING: 5785 case CPU_DYING:
5970 case CPU_DYING_FROZEN: 5786 case CPU_DYING_FROZEN:
5971 /* Update our root-domain */ 5787 /* Update our root-domain */
5972 rq = cpu_rq(cpu);
5973 raw_spin_lock_irqsave(&rq->lock, flags); 5788 raw_spin_lock_irqsave(&rq->lock, flags);
5974 if (rq->rd) { 5789 if (rq->rd) {
5975 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5790 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -6300,6 +6115,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6300 struct rq *rq = cpu_rq(cpu); 6115 struct rq *rq = cpu_rq(cpu);
6301 struct sched_domain *tmp; 6116 struct sched_domain *tmp;
6302 6117
6118 for (tmp = sd; tmp; tmp = tmp->parent)
6119 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6120
6303 /* Remove the sched domains which do not contribute to scheduling. */ 6121 /* Remove the sched domains which do not contribute to scheduling. */
6304 for (tmp = sd; tmp; ) { 6122 for (tmp = sd; tmp; ) {
6305 struct sched_domain *parent = tmp->parent; 6123 struct sched_domain *parent = tmp->parent;
@@ -7406,11 +7224,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7406 7224
7407#ifdef CONFIG_SCHED_MC 7225#ifdef CONFIG_SCHED_MC
7408static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7226static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7227 struct sysdev_class_attribute *attr,
7409 char *page) 7228 char *page)
7410{ 7229{
7411 return sprintf(page, "%u\n", sched_mc_power_savings); 7230 return sprintf(page, "%u\n", sched_mc_power_savings);
7412} 7231}
7413static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7232static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7233 struct sysdev_class_attribute *attr,
7414 const char *buf, size_t count) 7234 const char *buf, size_t count)
7415{ 7235{
7416 return sched_power_savings_store(buf, count, 0); 7236 return sched_power_savings_store(buf, count, 0);
@@ -7422,11 +7242,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7422 7242
7423#ifdef CONFIG_SCHED_SMT 7243#ifdef CONFIG_SCHED_SMT
7424static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7244static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7245 struct sysdev_class_attribute *attr,
7425 char *page) 7246 char *page)
7426{ 7247{
7427 return sprintf(page, "%u\n", sched_smt_power_savings); 7248 return sprintf(page, "%u\n", sched_smt_power_savings);
7428} 7249}
7429static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7250static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7251 struct sysdev_class_attribute *attr,
7430 const char *buf, size_t count) 7252 const char *buf, size_t count)
7431{ 7253{
7432 return sched_power_savings_store(buf, count, 1); 7254 return sched_power_savings_store(buf, count, 1);
@@ -7779,10 +7601,8 @@ void __init sched_init(void)
7779 rq->push_cpu = 0; 7601 rq->push_cpu = 0;
7780 rq->cpu = i; 7602 rq->cpu = i;
7781 rq->online = 0; 7603 rq->online = 0;
7782 rq->migration_thread = NULL;
7783 rq->idle_stamp = 0; 7604 rq->idle_stamp = 0;
7784 rq->avg_idle = 2*sysctl_sched_migration_cost; 7605 rq->avg_idle = 2*sysctl_sched_migration_cost;
7785 INIT_LIST_HEAD(&rq->migration_queue);
7786 rq_attach_root(rq, &def_root_domain); 7606 rq_attach_root(rq, &def_root_domain);
7787#endif 7607#endif
7788 init_rq_hrtick(rq); 7608 init_rq_hrtick(rq);
@@ -7883,7 +7703,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7883{ 7703{
7884 int on_rq; 7704 int on_rq;
7885 7705
7886 update_rq_clock(rq);
7887 on_rq = p->se.on_rq; 7706 on_rq = p->se.on_rq;
7888 if (on_rq) 7707 if (on_rq)
7889 deactivate_task(rq, p, 0); 7708 deactivate_task(rq, p, 0);
@@ -7910,9 +7729,9 @@ void normalize_rt_tasks(void)
7910 7729
7911 p->se.exec_start = 0; 7730 p->se.exec_start = 0;
7912#ifdef CONFIG_SCHEDSTATS 7731#ifdef CONFIG_SCHEDSTATS
7913 p->se.wait_start = 0; 7732 p->se.statistics.wait_start = 0;
7914 p->se.sleep_start = 0; 7733 p->se.statistics.sleep_start = 0;
7915 p->se.block_start = 0; 7734 p->se.statistics.block_start = 0;
7916#endif 7735#endif
7917 7736
7918 if (!rt_task(p)) { 7737 if (!rt_task(p)) {
@@ -8245,8 +8064,6 @@ void sched_move_task(struct task_struct *tsk)
8245 8064
8246 rq = task_rq_lock(tsk, &flags); 8065 rq = task_rq_lock(tsk, &flags);
8247 8066
8248 update_rq_clock(rq);
8249
8250 running = task_current(rq, tsk); 8067 running = task_current(rq, tsk);
8251 on_rq = tsk->se.on_rq; 8068 on_rq = tsk->se.on_rq;
8252 8069
@@ -8265,7 +8082,7 @@ void sched_move_task(struct task_struct *tsk)
8265 if (unlikely(running)) 8082 if (unlikely(running))
8266 tsk->sched_class->set_curr_task(rq); 8083 tsk->sched_class->set_curr_task(rq);
8267 if (on_rq) 8084 if (on_rq)
8268 enqueue_task(rq, tsk, 0, false); 8085 enqueue_task(rq, tsk, 0);
8269 8086
8270 task_rq_unlock(rq, &flags); 8087 task_rq_unlock(rq, &flags);
8271} 8088}
@@ -8813,7 +8630,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8813struct cpuacct { 8630struct cpuacct {
8814 struct cgroup_subsys_state css; 8631 struct cgroup_subsys_state css;
8815 /* cpuusage holds pointer to a u64-type object on every cpu */ 8632 /* cpuusage holds pointer to a u64-type object on every cpu */
8816 u64 *cpuusage; 8633 u64 __percpu *cpuusage;
8817 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8634 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
8818 struct cpuacct *parent; 8635 struct cpuacct *parent;
8819}; 8636};
@@ -9079,43 +8896,32 @@ struct cgroup_subsys cpuacct_subsys = {
9079 8896
9080#ifndef CONFIG_SMP 8897#ifndef CONFIG_SMP
9081 8898
9082int rcu_expedited_torture_stats(char *page)
9083{
9084 return 0;
9085}
9086EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9087
9088void synchronize_sched_expedited(void) 8899void synchronize_sched_expedited(void)
9089{ 8900{
8901 barrier();
9090} 8902}
9091EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8903EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9092 8904
9093#else /* #ifndef CONFIG_SMP */ 8905#else /* #ifndef CONFIG_SMP */
9094 8906
9095static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 8907static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9096static DEFINE_MUTEX(rcu_sched_expedited_mutex);
9097
9098#define RCU_EXPEDITED_STATE_POST -2
9099#define RCU_EXPEDITED_STATE_IDLE -1
9100 8908
9101static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 8909static int synchronize_sched_expedited_cpu_stop(void *data)
9102
9103int rcu_expedited_torture_stats(char *page)
9104{ 8910{
9105 int cnt = 0; 8911 /*
9106 int cpu; 8912 * There must be a full memory barrier on each affected CPU
9107 8913 * between the time that try_stop_cpus() is called and the
9108 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 8914 * time that it returns.
9109 for_each_online_cpu(cpu) { 8915 *
9110 cnt += sprintf(&page[cnt], " %d:%d", 8916 * In the current initial implementation of cpu_stop, the
9111 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 8917 * above condition is already met when the control reaches
9112 } 8918 * above condition is already met when control reaches
9113 cnt += sprintf(&page[cnt], "\n"); 8919 * necessary. Do smp_mb() anyway for documentation and
9114 return cnt; 8920 * robustness against future implementation changes.
8921 */
8922 smp_mb(); /* See above comment block. */
8923 return 0;
9115} 8924}
9116EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9117
9118static long synchronize_sched_expedited_count;
9119 8925
9120/* 8926/*
9121 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 8927 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9129,18 +8935,14 @@ static long synchronize_sched_expedited_count;
9129 */ 8935 */
9130void synchronize_sched_expedited(void) 8936void synchronize_sched_expedited(void)
9131{ 8937{
9132 int cpu; 8938 int snap, trycount = 0;
9133 unsigned long flags;
9134 bool need_full_sync = 0;
9135 struct rq *rq;
9136 struct migration_req *req;
9137 long snap;
9138 int trycount = 0;
9139 8939
9140 smp_mb(); /* ensure prior mod happens before capturing snap. */ 8940 smp_mb(); /* ensure prior mod happens before capturing snap. */
9141 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 8941 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9142 get_online_cpus(); 8942 get_online_cpus();
9143 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 8943 while (try_stop_cpus(cpu_online_mask,
8944 synchronize_sched_expedited_cpu_stop,
8945 NULL) == -EAGAIN) {
9144 put_online_cpus(); 8946 put_online_cpus();
9145 if (trycount++ < 10) 8947 if (trycount++ < 10)
9146 udelay(trycount * num_online_cpus()); 8948 udelay(trycount * num_online_cpus());
@@ -9148,41 +8950,15 @@ void synchronize_sched_expedited(void)
9148 synchronize_sched(); 8950 synchronize_sched();
9149 return; 8951 return;
9150 } 8952 }
9151 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 8953 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9152 smp_mb(); /* ensure test happens before caller kfree */ 8954 smp_mb(); /* ensure test happens before caller kfree */
9153 return; 8955 return;
9154 } 8956 }
9155 get_online_cpus(); 8957 get_online_cpus();
9156 } 8958 }
9157 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 8959 atomic_inc(&synchronize_sched_expedited_count);
9158 for_each_online_cpu(cpu) { 8960 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9159 rq = cpu_rq(cpu);
9160 req = &per_cpu(rcu_migration_req, cpu);
9161 init_completion(&req->done);
9162 req->task = NULL;
9163 req->dest_cpu = RCU_MIGRATION_NEED_QS;
9164 raw_spin_lock_irqsave(&rq->lock, flags);
9165 list_add(&req->list, &rq->migration_queue);
9166 raw_spin_unlock_irqrestore(&rq->lock, flags);
9167 wake_up_process(rq->migration_thread);
9168 }
9169 for_each_online_cpu(cpu) {
9170 rcu_expedited_state = cpu;
9171 req = &per_cpu(rcu_migration_req, cpu);
9172 rq = cpu_rq(cpu);
9173 wait_for_completion(&req->done);
9174 raw_spin_lock_irqsave(&rq->lock, flags);
9175 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
9176 need_full_sync = 1;
9177 req->dest_cpu = RCU_MIGRATION_IDLE;
9178 raw_spin_unlock_irqrestore(&rq->lock, flags);
9179 }
9180 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9181 synchronize_sched_expedited_count++;
9182 mutex_unlock(&rcu_sched_expedited_mutex);
9183 put_online_cpus(); 8961 put_online_cpus();
9184 if (need_full_sync)
9185 synchronize_sched();
9186} 8962}
9187EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8963EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9188 8964
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index eeb3506c4834..e6871cb3fc83 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -27,6 +27,7 @@
27 * of the License. 27 * of the License.
28 */ 28 */
29 29
30#include <linux/gfp.h>
30#include "sched_cpupri.h" 31#include "sched_cpupri.h"
31 32
32/* Convert between a 140 based task->prio, and our 102 based cpupri */ 33/* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -47,7 +48,7 @@ static int convert_prio(int prio)
47} 48}
48 49
49#define for_each_cpupri_active(array, idx) \ 50#define for_each_cpupri_active(array, idx) \
50 for_each_bit(idx, array, CPUPRI_NR_PRIORITIES) 51 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
51 52
52/** 53/**
53 * cpupri_find - find the best (lowest-pri) CPU in the system 54 * cpupri_find - find the best (lowest-pri) CPU in the system
@@ -56,7 +57,7 @@ static int convert_prio(int prio)
56 * @lowest_mask: A mask to fill in with selected CPUs (or NULL) 57 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
57 * 58 *
58 * Note: This function returns the recommended CPUs as calculated during the 59 * Note: This function returns the recommended CPUs as calculated during the
59 * current invokation. By the time the call returns, the CPUs may have in 60 * current invocation. By the time the call returns, the CPUs may have in
60 * fact changed priorities any number of times. While not ideal, it is not 61 * fact changed priorities any number of times. While not ideal, it is not
61 * an issue of correctness since the normal rebalancer logic will correct 62 * an issue of correctness since the normal rebalancer logic will correct
62 * any discrepancies created by racing against the uncertainty of the current 63 * any discrepancies created by racing against the uncertainty of the current
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 67f95aada4b9..87a330a7185f 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
70 PN(se->vruntime); 70 PN(se->vruntime);
71 PN(se->sum_exec_runtime); 71 PN(se->sum_exec_runtime);
72#ifdef CONFIG_SCHEDSTATS 72#ifdef CONFIG_SCHEDSTATS
73 PN(se->wait_start); 73 PN(se->statistics.wait_start);
74 PN(se->sleep_start); 74 PN(se->statistics.sleep_start);
75 PN(se->block_start); 75 PN(se->statistics.block_start);
76 PN(se->sleep_max); 76 PN(se->statistics.sleep_max);
77 PN(se->block_max); 77 PN(se->statistics.block_max);
78 PN(se->exec_max); 78 PN(se->statistics.exec_max);
79 PN(se->slice_max); 79 PN(se->statistics.slice_max);
80 PN(se->wait_max); 80 PN(se->statistics.wait_max);
81 PN(se->wait_sum); 81 PN(se->statistics.wait_sum);
82 P(se->wait_count); 82 P(se->statistics.wait_count);
83#endif 83#endif
84 P(se->load.weight); 84 P(se->load.weight);
85#undef PN 85#undef PN
@@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
105 SPLIT_NS(p->se.vruntime), 105 SPLIT_NS(p->se.vruntime),
106 SPLIT_NS(p->se.sum_exec_runtime), 106 SPLIT_NS(p->se.sum_exec_runtime),
107 SPLIT_NS(p->se.sum_sleep_runtime)); 107 SPLIT_NS(p->se.statistics.sum_sleep_runtime));
108#else 108#else
109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
@@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
114 { 114 {
115 char path[64]; 115 char path[64];
116 116
117 rcu_read_lock();
117 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); 118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
118 SEQ_printf(m, " %s", path); 120 SEQ_printf(m, " %s", path);
119 } 121 }
120#endif 122#endif
@@ -173,11 +175,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
173 task_group_path(tg, path, sizeof(path)); 175 task_group_path(tg, path, sizeof(path));
174 176
175 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
176#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
177 {
178 uid_t uid = cfs_rq->tg->uid;
179 SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
180 }
181#else 178#else
182 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
183#endif 180#endif
@@ -407,40 +404,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
407 PN(se.exec_start); 404 PN(se.exec_start);
408 PN(se.vruntime); 405 PN(se.vruntime);
409 PN(se.sum_exec_runtime); 406 PN(se.sum_exec_runtime);
410 PN(se.avg_overlap);
411 PN(se.avg_wakeup);
412 407
413 nr_switches = p->nvcsw + p->nivcsw; 408 nr_switches = p->nvcsw + p->nivcsw;
414 409
415#ifdef CONFIG_SCHEDSTATS 410#ifdef CONFIG_SCHEDSTATS
416 PN(se.wait_start); 411 PN(se.statistics.wait_start);
417 PN(se.sleep_start); 412 PN(se.statistics.sleep_start);
418 PN(se.block_start); 413 PN(se.statistics.block_start);
419 PN(se.sleep_max); 414 PN(se.statistics.sleep_max);
420 PN(se.block_max); 415 PN(se.statistics.block_max);
421 PN(se.exec_max); 416 PN(se.statistics.exec_max);
422 PN(se.slice_max); 417 PN(se.statistics.slice_max);
423 PN(se.wait_max); 418 PN(se.statistics.wait_max);
424 PN(se.wait_sum); 419 PN(se.statistics.wait_sum);
425 P(se.wait_count); 420 P(se.statistics.wait_count);
426 PN(se.iowait_sum); 421 PN(se.statistics.iowait_sum);
427 P(se.iowait_count); 422 P(se.statistics.iowait_count);
428 P(sched_info.bkl_count); 423 P(sched_info.bkl_count);
429 P(se.nr_migrations); 424 P(se.nr_migrations);
430 P(se.nr_migrations_cold); 425 P(se.statistics.nr_migrations_cold);
431 P(se.nr_failed_migrations_affine); 426 P(se.statistics.nr_failed_migrations_affine);
432 P(se.nr_failed_migrations_running); 427 P(se.statistics.nr_failed_migrations_running);
433 P(se.nr_failed_migrations_hot); 428 P(se.statistics.nr_failed_migrations_hot);
434 P(se.nr_forced_migrations); 429 P(se.statistics.nr_forced_migrations);
435 P(se.nr_wakeups); 430 P(se.statistics.nr_wakeups);
436 P(se.nr_wakeups_sync); 431 P(se.statistics.nr_wakeups_sync);
437 P(se.nr_wakeups_migrate); 432 P(se.statistics.nr_wakeups_migrate);
438 P(se.nr_wakeups_local); 433 P(se.statistics.nr_wakeups_local);
439 P(se.nr_wakeups_remote); 434 P(se.statistics.nr_wakeups_remote);
440 P(se.nr_wakeups_affine); 435 P(se.statistics.nr_wakeups_affine);
441 P(se.nr_wakeups_affine_attempts); 436 P(se.statistics.nr_wakeups_affine_attempts);
442 P(se.nr_wakeups_passive); 437 P(se.statistics.nr_wakeups_passive);
443 P(se.nr_wakeups_idle); 438 P(se.statistics.nr_wakeups_idle);
444 439
445 { 440 {
446 u64 avg_atom, avg_per_cpu; 441 u64 avg_atom, avg_per_cpu;
@@ -491,35 +486,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
491void proc_sched_set_task(struct task_struct *p) 486void proc_sched_set_task(struct task_struct *p)
492{ 487{
493#ifdef CONFIG_SCHEDSTATS 488#ifdef CONFIG_SCHEDSTATS
494 p->se.wait_max = 0; 489 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
495 p->se.wait_sum = 0;
496 p->se.wait_count = 0;
497 p->se.iowait_sum = 0;
498 p->se.iowait_count = 0;
499 p->se.sleep_max = 0;
500 p->se.sum_sleep_runtime = 0;
501 p->se.block_max = 0;
502 p->se.exec_max = 0;
503 p->se.slice_max = 0;
504 p->se.nr_migrations = 0;
505 p->se.nr_migrations_cold = 0;
506 p->se.nr_failed_migrations_affine = 0;
507 p->se.nr_failed_migrations_running = 0;
508 p->se.nr_failed_migrations_hot = 0;
509 p->se.nr_forced_migrations = 0;
510 p->se.nr_wakeups = 0;
511 p->se.nr_wakeups_sync = 0;
512 p->se.nr_wakeups_migrate = 0;
513 p->se.nr_wakeups_local = 0;
514 p->se.nr_wakeups_remote = 0;
515 p->se.nr_wakeups_affine = 0;
516 p->se.nr_wakeups_affine_attempts = 0;
517 p->se.nr_wakeups_passive = 0;
518 p->se.nr_wakeups_idle = 0;
519 p->sched_info.bkl_count = 0;
520#endif 490#endif
521 p->se.sum_exec_runtime = 0;
522 p->se.prev_sum_exec_runtime = 0;
523 p->nvcsw = 0;
524 p->nivcsw = 0;
525} 491}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3e1fd96c6cf9..217e4a9393e4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -35,8 +35,8 @@
35 * (to see the precise effective timeslice length of your workload, 35 * (to see the precise effective timeslice length of your workload,
36 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
37 */ 37 */
38unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 6000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL; 39unsigned int normalized_sysctl_sched_latency = 6000000ULL;
40 40
41/* 41/*
42 * The initial- and re-scaling of tunables is configurable 42 * The initial- and re-scaling of tunables is configurable
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 52
53/* 53/*
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 2000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
59 59
60/* 60/*
61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
62 */ 62 */
63static unsigned int sched_nr_latency = 5; 63static unsigned int sched_nr_latency = 3;
64 64
65/* 65/*
66 * After fork, child runs first. If set to 0 (default) then 66 * After fork, child runs first. If set to 0 (default) then
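
The retuned defaults keep the documented invariant intact: sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity, i.e. 6 ms / 2 ms = 3 (the old defaults gave 5 ms / 1 ms = 5). A one-line check:

#include <stdio.h>

int main(void)
{
        unsigned int latency = 6000000, min_gran = 2000000;     /* ns, new defaults */

        printf("sched_nr_latency = %u\n", latency / min_gran);  /* prints 3 */
        return 0;
}
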
@@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
505{ 505{
506 unsigned long delta_exec_weighted; 506 unsigned long delta_exec_weighted;
507 507
508 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); 508 schedstat_set(curr->statistics.exec_max,
509 max((u64)delta_exec, curr->statistics.exec_max));
509 510
510 curr->sum_exec_runtime += delta_exec; 511 curr->sum_exec_runtime += delta_exec;
511 schedstat_add(cfs_rq, exec_clock, delta_exec); 512 schedstat_add(cfs_rq, exec_clock, delta_exec);
@@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
548static inline void 549static inline void
549update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 550update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
550{ 551{
551 schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); 552 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
552} 553}
553 554
554/* 555/*
@@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
567static void 568static void
568update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 569update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
569{ 570{
570 schedstat_set(se->wait_max, max(se->wait_max, 571 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
571 rq_of(cfs_rq)->clock - se->wait_start)); 572 rq_of(cfs_rq)->clock - se->statistics.wait_start));
572 schedstat_set(se->wait_count, se->wait_count + 1); 573 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
573 schedstat_set(se->wait_sum, se->wait_sum + 574 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
574 rq_of(cfs_rq)->clock - se->wait_start); 575 rq_of(cfs_rq)->clock - se->statistics.wait_start);
575#ifdef CONFIG_SCHEDSTATS 576#ifdef CONFIG_SCHEDSTATS
576 if (entity_is_task(se)) { 577 if (entity_is_task(se)) {
577 trace_sched_stat_wait(task_of(se), 578 trace_sched_stat_wait(task_of(se),
578 rq_of(cfs_rq)->clock - se->wait_start); 579 rq_of(cfs_rq)->clock - se->statistics.wait_start);
579 } 580 }
580#endif 581#endif
581 schedstat_set(se->wait_start, 0); 582 schedstat_set(se->statistics.wait_start, 0);
582} 583}
583 584
584static inline void 585static inline void
@@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
657 if (entity_is_task(se)) 658 if (entity_is_task(se))
658 tsk = task_of(se); 659 tsk = task_of(se);
659 660
660 if (se->sleep_start) { 661 if (se->statistics.sleep_start) {
661 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 662 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
662 663
663 if ((s64)delta < 0) 664 if ((s64)delta < 0)
664 delta = 0; 665 delta = 0;
665 666
666 if (unlikely(delta > se->sleep_max)) 667 if (unlikely(delta > se->statistics.sleep_max))
667 se->sleep_max = delta; 668 se->statistics.sleep_max = delta;
668 669
669 se->sleep_start = 0; 670 se->statistics.sleep_start = 0;
670 se->sum_sleep_runtime += delta; 671 se->statistics.sum_sleep_runtime += delta;
671 672
672 if (tsk) { 673 if (tsk) {
673 account_scheduler_latency(tsk, delta >> 10, 1); 674 account_scheduler_latency(tsk, delta >> 10, 1);
674 trace_sched_stat_sleep(tsk, delta); 675 trace_sched_stat_sleep(tsk, delta);
675 } 676 }
676 } 677 }
677 if (se->block_start) { 678 if (se->statistics.block_start) {
678 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 679 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
679 680
680 if ((s64)delta < 0) 681 if ((s64)delta < 0)
681 delta = 0; 682 delta = 0;
682 683
683 if (unlikely(delta > se->block_max)) 684 if (unlikely(delta > se->statistics.block_max))
684 se->block_max = delta; 685 se->statistics.block_max = delta;
685 686
686 se->block_start = 0; 687 se->statistics.block_start = 0;
687 se->sum_sleep_runtime += delta; 688 se->statistics.sum_sleep_runtime += delta;
688 689
689 if (tsk) { 690 if (tsk) {
690 if (tsk->in_iowait) { 691 if (tsk->in_iowait) {
691 se->iowait_sum += delta; 692 se->statistics.iowait_sum += delta;
692 se->iowait_count++; 693 se->statistics.iowait_count++;
693 trace_sched_stat_iowait(tsk, delta); 694 trace_sched_stat_iowait(tsk, delta);
694 } 695 }
695 696
@@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
737 vruntime += sched_vslice(cfs_rq, se); 738 vruntime += sched_vslice(cfs_rq, se);
738 739
739 /* sleeps up to a single latency don't count. */ 740 /* sleeps up to a single latency don't count. */
740 if (!initial && sched_feat(FAIR_SLEEPERS)) { 741 if (!initial) {
741 unsigned long thresh = sysctl_sched_latency; 742 unsigned long thresh = sysctl_sched_latency;
742 743
743 /* 744 /*
744 * Convert the sleeper threshold into virtual time.
745 * SCHED_IDLE is a special sub-class. We care about
746 * fairness only relative to other SCHED_IDLE tasks,
747 * all of which have the same weight.
748 */
749 if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
750 task_of(se)->policy != SCHED_IDLE))
751 thresh = calc_delta_fair(thresh, se);
752
753 /*
754 * Halve their sleep time's effect, to allow 745 * Halve their sleep time's effect, to allow
755 * for a gentler effect of sleepers: 746 * for a gentler effect of sleepers:
756 */ 747 */
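With the FAIR_SLEEPERS feature removed (see the sched_features.h hunk further down), the sleeper credit above becomes unconditional: a waking entity is placed up to one sysctl_sched_latency before min_vruntime, and GENTLE_FAIR_SLEEPERS halves that credit. A rough userspace sketch of the surviving placement arithmetic, assuming the clamp against the entity's old vruntime that place_entity() applies at the end (illustrative model, not the kernel function):

#include <stdint.h>
#include <stdio.h>

#define SCHED_LATENCY_NS	6000000ULL
#define GENTLE_FAIR_SLEEPERS	1

/* placement for a waking (non-initial) entity */
static uint64_t place_waking(uint64_t min_vruntime, uint64_t old_vruntime)
{
	uint64_t thresh = SCHED_LATENCY_NS;
	uint64_t vruntime = min_vruntime;

	if (GENTLE_FAIR_SLEEPERS)
		thresh >>= 1;		/* halve the sleeper's credit */
	vruntime -= thresh;

	/* an entity never gains back credit it already spent */
	return vruntime > old_vruntime ? vruntime : old_vruntime;
}

int main(void)
{
	printf("placed at %llu ns\n",
	       (unsigned long long)place_waking(100000000ULL, 0));
	return 0;
}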
@@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
766 se->vruntime = vruntime; 757 se->vruntime = vruntime;
767} 758}
768 759
769#define ENQUEUE_WAKEUP 1
770#define ENQUEUE_MIGRATE 2
771
772static void 760static void
773enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 761enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
774{ 762{
@@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
776 * Update the normalized vruntime before updating min_vruntime 764 * Update the normalized vruntime before updating min_vruntime
777 * through calling update_curr(). 765 * through calling update_curr().
778 */ 766 */
779 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) 767 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
780 se->vruntime += cfs_rq->min_vruntime; 768 se->vruntime += cfs_rq->min_vruntime;
781 769
782 /* 770 /*
@@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
812} 800}
813 801
814static void 802static void
815dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 803dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
816{ 804{
817 /* 805 /*
818 * Update run-time statistics of the 'current'. 806 * Update run-time statistics of the 'current'.
@@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
820 update_curr(cfs_rq); 808 update_curr(cfs_rq);
821 809
822 update_stats_dequeue(cfs_rq, se); 810 update_stats_dequeue(cfs_rq, se);
823 if (sleep) { 811 if (flags & DEQUEUE_SLEEP) {
824#ifdef CONFIG_SCHEDSTATS 812#ifdef CONFIG_SCHEDSTATS
825 if (entity_is_task(se)) { 813 if (entity_is_task(se)) {
826 struct task_struct *tsk = task_of(se); 814 struct task_struct *tsk = task_of(se);
827 815
828 if (tsk->state & TASK_INTERRUPTIBLE) 816 if (tsk->state & TASK_INTERRUPTIBLE)
829 se->sleep_start = rq_of(cfs_rq)->clock; 817 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
830 if (tsk->state & TASK_UNINTERRUPTIBLE) 818 if (tsk->state & TASK_UNINTERRUPTIBLE)
831 se->block_start = rq_of(cfs_rq)->clock; 819 se->statistics.block_start = rq_of(cfs_rq)->clock;
832 } 820 }
833#endif 821#endif
834 } 822 }
@@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
845 * update can refer to the ->curr item and we need to reflect this 833 * update can refer to the ->curr item and we need to reflect this
846 * movement in our normalized position. 834 * movement in our normalized position.
847 */ 835 */
848 if (!sleep) 836 if (!(flags & DEQUEUE_SLEEP))
849 se->vruntime -= cfs_rq->min_vruntime; 837 se->vruntime -= cfs_rq->min_vruntime;
850} 838}
851 839
@@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
912 * when there are only lesser-weight tasks around): 900 * when there are only lesser-weight tasks around):
913 */ 901 */
914 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 902 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
915 se->slice_max = max(se->slice_max, 903 se->statistics.slice_max = max(se->statistics.slice_max,
916 se->sum_exec_runtime - se->prev_sum_exec_runtime); 904 se->sum_exec_runtime - se->prev_sum_exec_runtime);
917 } 905 }
918#endif 906#endif
@@ -1054,16 +1042,10 @@ static inline void hrtick_update(struct rq *rq)
1054 * then put the task into the rbtree: 1042 * then put the task into the rbtree:
1055 */ 1043 */
1056static void 1044static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) 1045enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1058{ 1046{
1059 struct cfs_rq *cfs_rq; 1047 struct cfs_rq *cfs_rq;
1060 struct sched_entity *se = &p->se; 1048 struct sched_entity *se = &p->se;
1061 int flags = 0;
1062
1063 if (wakeup)
1064 flags |= ENQUEUE_WAKEUP;
1065 if (p->state == TASK_WAKING)
1066 flags |= ENQUEUE_MIGRATE;
1067 1049
1068 for_each_sched_entity(se) { 1050 for_each_sched_entity(se) {
1069 if (se->on_rq) 1051 if (se->on_rq)
@@ -1081,18 +1063,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1081 * decreased. We remove the task from the rbtree and 1063 * decreased. We remove the task from the rbtree and
1082 * update the fair scheduling stats: 1064 * update the fair scheduling stats:
1083 */ 1065 */
1084static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) 1066static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1085{ 1067{
1086 struct cfs_rq *cfs_rq; 1068 struct cfs_rq *cfs_rq;
1087 struct sched_entity *se = &p->se; 1069 struct sched_entity *se = &p->se;
1088 1070
1089 for_each_sched_entity(se) { 1071 for_each_sched_entity(se) {
1090 cfs_rq = cfs_rq_of(se); 1072 cfs_rq = cfs_rq_of(se);
1091 dequeue_entity(cfs_rq, se, sleep); 1073 dequeue_entity(cfs_rq, se, flags);
1092 /* Don't dequeue parent if it has other entities besides us */ 1074 /* Don't dequeue parent if it has other entities besides us */
1093 if (cfs_rq->load.weight) 1075 if (cfs_rq->load.weight)
1094 break; 1076 break;
1095 sleep = 1; 1077 flags |= DEQUEUE_SLEEP;
1096 } 1078 }
1097 1079
1098 hrtick_update(rq); 1080 hrtick_update(rq);
@@ -1240,7 +1222,6 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1240 1222
1241static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 1223static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1242{ 1224{
1243 struct task_struct *curr = current;
1244 unsigned long this_load, load; 1225 unsigned long this_load, load;
1245 int idx, this_cpu, prev_cpu; 1226 int idx, this_cpu, prev_cpu;
1246 unsigned long tl_per_task; 1227 unsigned long tl_per_task;
@@ -1255,18 +1236,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1255 load = source_load(prev_cpu, idx); 1236 load = source_load(prev_cpu, idx);
1256 this_load = target_load(this_cpu, idx); 1237 this_load = target_load(this_cpu, idx);
1257 1238
1258 if (sync) {
1259 if (sched_feat(SYNC_LESS) &&
1260 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1261 p->se.avg_overlap > sysctl_sched_migration_cost))
1262 sync = 0;
1263 } else {
1264 if (sched_feat(SYNC_MORE) &&
1265 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1266 p->se.avg_overlap < sysctl_sched_migration_cost))
1267 sync = 1;
1268 }
1269
1270 /* 1239 /*
1271 * If sync wakeup then subtract the (maximum possible) 1240 * If sync wakeup then subtract the (maximum possible)
1272 * effect of the currently running task from the load 1241 * effect of the currently running task from the load
@@ -1306,7 +1275,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1306 if (sync && balanced) 1275 if (sync && balanced)
1307 return 1; 1276 return 1;
1308 1277
1309 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1278 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
1310 tl_per_task = cpu_avg_load_per_task(this_cpu); 1279 tl_per_task = cpu_avg_load_per_task(this_cpu);
1311 1280
1312 if (balanced || 1281 if (balanced ||
@@ -1318,7 +1287,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1318 * there is no bad imbalance. 1287 * there is no bad imbalance.
1319 */ 1288 */
1320 schedstat_inc(sd, ttwu_move_affine); 1289 schedstat_inc(sd, ttwu_move_affine);
1321 schedstat_inc(p, se.nr_wakeups_affine); 1290 schedstat_inc(p, se.statistics.nr_wakeups_affine);
1322 1291
1323 return 1; 1292 return 1;
1324 } 1293 }
@@ -1406,29 +1375,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1406/* 1375/*
1407 * Try and locate an idle CPU in the sched_domain. 1376 * Try and locate an idle CPU in the sched_domain.
1408 */ 1377 */
1409static int 1378static int select_idle_sibling(struct task_struct *p, int target)
1410select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1411{ 1379{
1412 int cpu = smp_processor_id(); 1380 int cpu = smp_processor_id();
1413 int prev_cpu = task_cpu(p); 1381 int prev_cpu = task_cpu(p);
1382 struct sched_domain *sd;
1414 int i; 1383 int i;
1415 1384
1416 /* 1385 /*
1417 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE 1386 * If the task is going to be woken-up on this cpu and if it is
1418 * test in select_task_rq_fair) and the prev_cpu is idle then that's 1387 * already idle, then it is the right target.
1419 * always a better target than the current cpu.
1420 */ 1388 */
1421 if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) 1389 if (target == cpu && idle_cpu(cpu))
1390 return cpu;
1391
1392 /*
1393 * If the task is going to be woken-up on the cpu where it previously
1394 * ran and if it is currently idle, then it is the right target.
1395 */
1396 if (target == prev_cpu && idle_cpu(prev_cpu))
1422 return prev_cpu; 1397 return prev_cpu;
1423 1398
1424 /* 1399 /*
1425 * Otherwise, iterate the domain and find an eligible idle cpu. 1400 * Otherwise, iterate the domains and find an eligible idle cpu.
1426 */ 1401 */
1427 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 1402 for_each_domain(target, sd) {
1428 if (!cpu_rq(i)->cfs.nr_running) { 1403 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1429 target = i;
1430 break; 1404 break;
1405
1406 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1407 if (idle_cpu(i)) {
1408 target = i;
1409 break;
1410 }
1431 } 1411 }
1412
1413 /*
1414 * Let's stop looking for an idle sibling once we reach
1415 * the domain that spans the current cpu and prev_cpu.
1416 */
1417 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
1418 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1419 break;
1432 } 1420 }
1433 1421
1434 return target; 1422 return target;
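The rewritten select_idle_sibling() no longer receives a domain from the caller: it walks the target cpu's own domain hierarchy, scanning each cache-sharing level (SD_SHARE_PKG_RESOURCES) for an idle cpu, and additionally stops once a level spans both cpu and prev_cpu. A self-contained userspace sketch of the core walk, with invented arrays standing in for struct sched_domain and the cpu/prev_cpu cutoff omitted for brevity:

#include <stdbool.h>
#include <stdio.h>

#define NCPUS 4

struct domain_level {
	bool shares_cache;	/* SD_SHARE_PKG_RESOURCES analogue */
	bool span[NCPUS];	/* cpus covered at this level */
};

static int pick_idle_sibling(const struct domain_level *levels, int nlevels,
			     const bool *idle, int target)
{
	for (int l = 0; l < nlevels; l++) {
		if (!levels[l].shares_cache)
			break;			/* stop above cache domains */
		for (int i = 0; i < NCPUS; i++)
			if (levels[l].span[i] && idle[i])
				return i;	/* first idle sibling wins */
	}
	return target;				/* nothing idle: keep target */
}

int main(void)
{
	struct domain_level levels[] = {
		{ true,  { true, true, false, false } },	/* SMT pair   */
		{ false, { true, true, true,  true  } },	/* whole node */
	};
	bool idle[NCPUS] = { false, true, false, true };

	/* cpu 0 is busy, its SMT sibling 1 is idle: pick 1 */
	printf("sibling for cpu0: %d\n", pick_idle_sibling(levels, 2, idle, 0));
	return 0;
}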
@@ -1445,7 +1433,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1445 * 1433 *
1446 * preempt must be disabled. 1434 * preempt must be disabled.
1447 */ 1435 */
1448static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 1436static int
1437select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
1449{ 1438{
1450 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1439 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1451 int cpu = smp_processor_id(); 1440 int cpu = smp_processor_id();
@@ -1456,8 +1445,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1456 int sync = wake_flags & WF_SYNC; 1445 int sync = wake_flags & WF_SYNC;
1457 1446
1458 if (sd_flag & SD_BALANCE_WAKE) { 1447 if (sd_flag & SD_BALANCE_WAKE) {
1459 if (sched_feat(AFFINE_WAKEUPS) && 1448 if (cpumask_test_cpu(cpu, &p->cpus_allowed))
1460 cpumask_test_cpu(cpu, &p->cpus_allowed))
1461 want_affine = 1; 1449 want_affine = 1;
1462 new_cpu = prev_cpu; 1450 new_cpu = prev_cpu;
1463 } 1451 }
@@ -1491,34 +1479,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1491 } 1479 }
1492 1480
1493 /* 1481 /*
1494 * While iterating the domains looking for a spanning 1482 * If both cpu and prev_cpu are part of this domain,
1495 * WAKE_AFFINE domain, adjust the affine target to any idle cpu 1483 * cpu is a valid SD_WAKE_AFFINE target.
1496 * in cache sharing domains along the way.
1497 */ 1484 */
1498 if (want_affine) { 1485 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1499 int target = -1; 1486 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1500 1487 affine_sd = tmp;
1501 /* 1488 want_affine = 0;
1502 * If both cpu and prev_cpu are part of this domain,
1503 * cpu is a valid SD_WAKE_AFFINE target.
1504 */
1505 if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
1506 target = cpu;
1507
1508 /*
1509 * If there's an idle sibling in this domain, make that
1510 * the wake_affine target instead of the current cpu.
1511 */
1512 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1513 target = select_idle_sibling(p, tmp, target);
1514
1515 if (target >= 0) {
1516 if (tmp->flags & SD_WAKE_AFFINE) {
1517 affine_sd = tmp;
1518 want_affine = 0;
1519 }
1520 cpu = target;
1521 }
1522 } 1489 }
1523 1490
1524 if (!want_sd && !want_affine) 1491 if (!want_sd && !want_affine)
@@ -1531,22 +1498,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1531 sd = tmp; 1498 sd = tmp;
1532 } 1499 }
1533 1500
1501#ifdef CONFIG_FAIR_GROUP_SCHED
1534 if (sched_feat(LB_SHARES_UPDATE)) { 1502 if (sched_feat(LB_SHARES_UPDATE)) {
1535 /* 1503 /*
1536 * Pick the largest domain to update shares over 1504 * Pick the largest domain to update shares over
1537 */ 1505 */
1538 tmp = sd; 1506 tmp = sd;
1539 if (affine_sd && (!tmp || 1507 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1540 cpumask_weight(sched_domain_span(affine_sd)) >
1541 cpumask_weight(sched_domain_span(sd))))
1542 tmp = affine_sd; 1508 tmp = affine_sd;
1543 1509
1544 if (tmp) 1510 if (tmp) {
1511 raw_spin_unlock(&rq->lock);
1545 update_shares(tmp); 1512 update_shares(tmp);
1513 raw_spin_lock(&rq->lock);
1514 }
1546 } 1515 }
1516#endif
1547 1517
1548 if (affine_sd && wake_affine(affine_sd, p, sync)) 1518 if (affine_sd) {
1549 return cpu; 1519 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1520 return select_idle_sibling(p, cpu);
1521 else
1522 return select_idle_sibling(p, prev_cpu);
1523 }
1550 1524
1551 while (sd) { 1525 while (sd) {
1552 int load_idx = sd->forkexec_idx; 1526 int load_idx = sd->forkexec_idx;
@@ -1576,10 +1550,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1576 1550
1577 /* Now try balancing at a lower domain level of new_cpu */ 1551 /* Now try balancing at a lower domain level of new_cpu */
1578 cpu = new_cpu; 1552 cpu = new_cpu;
1579 weight = cpumask_weight(sched_domain_span(sd)); 1553 weight = sd->span_weight;
1580 sd = NULL; 1554 sd = NULL;
1581 for_each_domain(cpu, tmp) { 1555 for_each_domain(cpu, tmp) {
1582 if (weight <= cpumask_weight(sched_domain_span(tmp))) 1556 if (weight <= tmp->span_weight)
1583 break; 1557 break;
1584 if (tmp->flags & sd_flag) 1558 if (tmp->flags & sd_flag)
1585 sd = tmp; 1559 sd = tmp;
@@ -1591,63 +1565,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1591} 1565}
1592#endif /* CONFIG_SMP */ 1566#endif /* CONFIG_SMP */
1593 1567
1594/*
1595 * Adaptive granularity
1596 *
1597 * se->avg_wakeup gives the average time a task runs until it does a wakeup,
1598 * with the limit of wakeup_gran -- when it never does a wakeup.
1599 *
1600 * So the smaller avg_wakeup is the faster we want this task to preempt,
1601 * but we don't want to treat the preemptee unfairly and therefore allow it
1602 * to run for at least the amount of time we'd like to run.
1603 *
1604 * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
1605 *
1606 * NOTE: we use *nr_running to scale with load, this nicely matches the
1607 * degrading latency on load.
1608 */
1609static unsigned long
1610adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
1611{
1612 u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1613 u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
1614 u64 gran = 0;
1615
1616 if (this_run < expected_wakeup)
1617 gran = expected_wakeup - this_run;
1618
1619 return min_t(s64, gran, sysctl_sched_wakeup_granularity);
1620}
1621
1622static unsigned long 1568static unsigned long
1623wakeup_gran(struct sched_entity *curr, struct sched_entity *se) 1569wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1624{ 1570{
1625 unsigned long gran = sysctl_sched_wakeup_granularity; 1571 unsigned long gran = sysctl_sched_wakeup_granularity;
1626 1572
1627 if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
1628 gran = adaptive_gran(curr, se);
1629
1630 /* 1573 /*
1631 * Since it's curr running now, convert the gran from real-time 1574 * Since it's curr running now, convert the gran from real-time
1632 * to virtual-time in its units. 1575 * to virtual-time in its units.
1576 *
1577 * By using 'se' instead of 'curr' we penalize light tasks, so
1578 * they get preempted easier. That is, if 'se' < 'curr' then
1579 * the resulting gran will be larger, therefore penalizing the
1580 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1581 * be smaller, again penalizing the lighter task.
1582 *
1583 * This is especially important for buddies when the leftmost
1584 * task is higher priority than the buddy.
1633 */ 1585 */
1634 if (sched_feat(ASYM_GRAN)) { 1586 if (unlikely(se->load.weight != NICE_0_LOAD))
1635 /* 1587 gran = calc_delta_fair(gran, se);
1636 * By using 'se' instead of 'curr' we penalize light tasks, so
1637 * they get preempted easier. That is, if 'se' < 'curr' then
1638 * the resulting gran will be larger, therefore penalizing the
1639 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1640 * be smaller, again penalizing the lighter task.
1641 *
1642 * This is especially important for buddies when the leftmost
1643 * task is higher priority than the buddy.
1644 */
1645 if (unlikely(se->load.weight != NICE_0_LOAD))
1646 gran = calc_delta_fair(gran, se);
1647 } else {
1648 if (unlikely(curr->load.weight != NICE_0_LOAD))
1649 gran = calc_delta_fair(gran, curr);
1650 }
1651 1588
1652 return gran; 1589 return gran;
1653} 1590}
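With ADAPTIVE_GRAN and ASYM_GRAN gone, wakeup_gran() always converts the granularity into the wakee's virtual time: a light wakee sees a larger virtual granularity and thus preempts less easily, a heavy one the opposite. A back-of-envelope userspace model of that scaling, assuming the usual calc_delta_fair() approximation gran * NICE_0_LOAD / weight (the kernel uses shifted fixed-point internally):

#include <stdio.h>

#define NICE_0_LOAD 1024ULL

static unsigned long long vgran(unsigned long long gran_ns,
				unsigned long long wakee_weight)
{
	/* calc_delta_fair(gran, se) ~= gran * NICE_0_LOAD / weight */
	return gran_ns * NICE_0_LOAD / wakee_weight;
}

int main(void)
{
	printf("nice 0 wakee: %llu ns\n", vgran(1000000, 1024)); /* unchanged */
	printf("light wakee : %llu ns\n", vgran(1000000, 512));  /* doubled   */
	printf("heavy wakee : %llu ns\n", vgran(1000000, 2048)); /* halved    */
	return 0;
}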
@@ -1705,7 +1642,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1705 struct task_struct *curr = rq->curr; 1642 struct task_struct *curr = rq->curr;
1706 struct sched_entity *se = &curr->se, *pse = &p->se; 1643 struct sched_entity *se = &curr->se, *pse = &p->se;
1707 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1644 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1708 int sync = wake_flags & WF_SYNC;
1709 int scale = cfs_rq->nr_running >= sched_nr_latency; 1645 int scale = cfs_rq->nr_running >= sched_nr_latency;
1710 1646
1711 if (unlikely(rt_prio(p->prio))) 1647 if (unlikely(rt_prio(p->prio)))
@@ -1738,14 +1674,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1738 if (unlikely(curr->policy == SCHED_IDLE)) 1674 if (unlikely(curr->policy == SCHED_IDLE))
1739 goto preempt; 1675 goto preempt;
1740 1676
1741 if (sched_feat(WAKEUP_SYNC) && sync)
1742 goto preempt;
1743
1744 if (sched_feat(WAKEUP_OVERLAP) &&
1745 se->avg_overlap < sysctl_sched_migration_cost &&
1746 pse->avg_overlap < sysctl_sched_migration_cost)
1747 goto preempt;
1748
1749 if (!sched_feat(WAKEUP_PREEMPT)) 1677 if (!sched_feat(WAKEUP_PREEMPT))
1750 return; 1678 return;
1751 1679
@@ -1844,13 +1772,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1844 * 3) are cache-hot on their current CPU. 1772 * 3) are cache-hot on their current CPU.
1845 */ 1773 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 1774 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine); 1775 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
1848 return 0; 1776 return 0;
1849 } 1777 }
1850 *all_pinned = 0; 1778 *all_pinned = 0;
1851 1779
1852 if (task_running(rq, p)) { 1780 if (task_running(rq, p)) {
1853 schedstat_inc(p, se.nr_failed_migrations_running); 1781 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
1854 return 0; 1782 return 0;
1855 } 1783 }
1856 1784
@@ -1866,14 +1794,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1866#ifdef CONFIG_SCHEDSTATS 1794#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) { 1795 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]); 1796 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations); 1797 schedstat_inc(p, se.statistics.nr_forced_migrations);
1870 } 1798 }
1871#endif 1799#endif
1872 return 1; 1800 return 1;
1873 } 1801 }
1874 1802
1875 if (tsk_cache_hot) { 1803 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot); 1804 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
1877 return 0; 1805 return 0;
1878 } 1806 }
1879 return 1; 1807 return 1;
@@ -2311,7 +2239,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2311 2239
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 2240unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{ 2241{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 2242 unsigned long weight = sd->span_weight;
2315 unsigned long smt_gain = sd->smt_gain; 2243 unsigned long smt_gain = sd->smt_gain;
2316 2244
2317 smt_gain /= weight; 2245 smt_gain /= weight;
@@ -2344,7 +2272,7 @@ unsigned long scale_rt_power(int cpu)
2344 2272
2345static void update_cpu_power(struct sched_domain *sd, int cpu) 2273static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{ 2274{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 2275 unsigned long weight = sd->span_weight;
2348 unsigned long power = SCHED_LOAD_SCALE; 2276 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups; 2277 struct sched_group *sdg = sd->groups;
2350 2278
@@ -2870,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 2798 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871} 2799}
2872 2800
2801static int active_load_balance_cpu_stop(void *data);
2802
2873/* 2803/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2804 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance. 2805 * tasks if there is an imbalance.
@@ -2959,8 +2889,9 @@ redo:
2959 if (need_active_balance(sd, sd_idle, idle)) { 2889 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags); 2890 raw_spin_lock_irqsave(&busiest->lock, flags);
2961 2891
2962 /* don't kick the migration_thread, if the curr 2892 /* don't kick the active_load_balance_cpu_stop,
2963 * task on busiest cpu can't be moved to this_cpu 2893 * if the curr task on busiest cpu can't be
2894 * moved to this_cpu
2964 */ 2895 */
2965 if (!cpumask_test_cpu(this_cpu, 2896 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) { 2897 &busiest->curr->cpus_allowed)) {
@@ -2970,14 +2901,22 @@ redo:
2970 goto out_one_pinned; 2901 goto out_one_pinned;
2971 } 2902 }
2972 2903
2904 /*
2905 * ->active_balance synchronizes accesses to
2906 * ->active_balance_work. Once set, it's cleared
2907 * only after active load balance is finished.
2908 */
2973 if (!busiest->active_balance) { 2909 if (!busiest->active_balance) {
2974 busiest->active_balance = 1; 2910 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu; 2911 busiest->push_cpu = this_cpu;
2976 active_balance = 1; 2912 active_balance = 1;
2977 } 2913 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags); 2914 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2915
2979 if (active_balance) 2916 if (active_balance)
2980 wake_up_process(busiest->migration_thread); 2917 stop_one_cpu_nowait(cpu_of(busiest),
2918 active_load_balance_cpu_stop, busiest,
2919 &busiest->active_balance_work);
2981 2920
2982 /* 2921 /*
2983 * We've kicked active balancing, reset the failure 2922 * We've kicked active balancing, reset the failure
@@ -3084,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3084} 3023}
3085 3024
3086/* 3025/*
3087 * active_load_balance is run by migration threads. It pushes running tasks 3026 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 3027 * running tasks off the busiest CPU onto idle CPUs. It requires at
3089 * running on each physical CPU where possible, and avoids physical / 3028 * least 1 task to be running on each physical CPU where possible, and
3090 * logical imbalances. 3029 * avoids physical / logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */ 3030 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 3031static int active_load_balance_cpu_stop(void *data)
3095{ 3032{
3033 struct rq *busiest_rq = data;
3034 int busiest_cpu = cpu_of(busiest_rq);
3096 int target_cpu = busiest_rq->push_cpu; 3035 int target_cpu = busiest_rq->push_cpu;
3036 struct rq *target_rq = cpu_rq(target_cpu);
3097 struct sched_domain *sd; 3037 struct sched_domain *sd;
3098 struct rq *target_rq; 3038
3039 raw_spin_lock_irq(&busiest_rq->lock);
3040
3041 /* make sure the requested cpu hasn't gone down in the meantime */
3042 if (unlikely(busiest_cpu != smp_processor_id() ||
3043 !busiest_rq->active_balance))
3044 goto out_unlock;
3099 3045
3100 /* Is there any task to move? */ 3046 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1) 3047 if (busiest_rq->nr_running <= 1)
3102 return; 3048 goto out_unlock;
3103
3104 target_rq = cpu_rq(target_cpu);
3105 3049
3106 /* 3050 /*
3107 * This condition is "impossible", if it occurs 3051 * This condition is "impossible", if it occurs
@@ -3112,8 +3056,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3112 3056
3113 /* move a task from busiest_rq to target_rq */ 3057 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq); 3058 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117 3059
3118 /* Search for an sd spanning us and the target CPU. */ 3060 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) { 3061 for_each_domain(target_cpu, sd) {
@@ -3132,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3132 schedstat_inc(sd, alb_failed); 3074 schedstat_inc(sd, alb_failed);
3133 } 3075 }
3134 double_unlock_balance(busiest_rq, target_rq); 3076 double_unlock_balance(busiest_rq, target_rq);
3077out_unlock:
3078 busiest_rq->active_balance = 0;
3079 raw_spin_unlock_irq(&busiest_rq->lock);
3080 return 0;
3135} 3081}
3136 3082
3137#ifdef CONFIG_NO_HZ 3083#ifdef CONFIG_NO_HZ
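The hunks above replace the old migration-thread kick with the cpu_stop machinery introduced in the kernel/stop_machine.c rewrite further down: ->active_balance serializes reuse of the rq-embedded cpu_stop_work, and stop_one_cpu_nowait() queues the callback without waiting. The same pattern in a generic, hypothetical form (my_obj, my_stop_fn and kick are invented names; only the stop_one_cpu_nowait() signature comes from this patch):

#include <linux/stop_machine.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct my_obj {
	spinlock_t lock;
	int pending;			/* guards reuse of 'work' */
	struct cpu_stop_work work;	/* untouched until fn has run */
};

static int my_stop_fn(void *data)
{
	struct my_obj *obj = data;
	unsigned long flags;

	/* runs on the target cpu with preemption disabled */
	spin_lock_irqsave(&obj->lock, flags);
	obj->pending = 0;		/* allow the next kick */
	spin_unlock_irqrestore(&obj->lock, flags);
	return 0;
}

static void kick(struct my_obj *obj, unsigned int cpu)
{
	unsigned long flags;
	bool queue = false;

	spin_lock_irqsave(&obj->lock, flags);
	if (!obj->pending) {		/* at most one in flight */
		obj->pending = 1;
		queue = true;
	}
	spin_unlock_irqrestore(&obj->lock, flags);

	if (queue)
		stop_one_cpu_nowait(cpu, my_stop_fn, obj, &obj->work);
}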
@@ -3476,7 +3422,7 @@ static void run_rebalance_domains(struct softirq_action *h)
3476 3422
3477static inline int on_null_domain(int cpu) 3423static inline int on_null_domain(int cpu)
3478{ 3424{
3479 return !rcu_dereference(cpu_rq(cpu)->sd); 3425 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
3480} 3426}
3481 3427
3482/* 3428/*
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d5059fd761d9..83c66e8ad3ee 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,11 +1,4 @@
1/* 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows 2 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart. 4 * rip the spread apart.
@@ -13,13 +6,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1)
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14 7
15/* 8/*
16 * By not normalizing the sleep time, heavy tasks get an effective
17 * longer period, and lighter task an effective shorter period they
18 * are considered running.
19 */
20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
21
22/*
23 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
24 * tasks 10 * tasks
25 */ 11 */
@@ -31,37 +17,6 @@ SCHED_FEAT(START_DEBIT, 1)
31SCHED_FEAT(WAKEUP_PREEMPT, 1) 17SCHED_FEAT(WAKEUP_PREEMPT, 1)
32 18
33/* 19/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
58 * the remote end is likely to consume the data we just wrote, and
59 * therefore has cache benefit from being placed on the same cpu, see
60 * also AFFINE_WAKEUPS.
61 */
62SCHED_FEAT(SYNC_WAKEUPS, 1)
63
64/*
65 * Based on load and program behaviour, see if it makes sense to place 20 * Based on load and program behaviour, see if it makes sense to place
66 * a newly woken task on the same cpu as the task that woke it -- 21 * a newly woken task on the same cpu as the task that woke it --
67 * improve cache locality. Typically used with SYNC wakeups as 22 * improve cache locality. Typically used with SYNC wakeups as
@@ -70,16 +25,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
70SCHED_FEAT(AFFINE_WAKEUPS, 1) 25SCHED_FEAT(AFFINE_WAKEUPS, 1)
71 26
72/* 27/*
73 * Weaken SYNC hint based on overlap
74 */
75SCHED_FEAT(SYNC_LESS, 1)
76
77/*
78 * Add SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_MORE, 0)
81
82/*
83 * Prefer to schedule the task we woke last (assuming it failed 28 * Prefer to schedule the task we woke last (assuming it failed
84 * wakeup-preemption), since its likely going to consume data we 29 * wakeup-preemption), since its likely going to consume data we
85 * touched, increases cache locality. 30 * touched, increases cache locality.
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a8a6d8a50947..9fa0f402c87c 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,8 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
10{ 11{
11 return task_cpu(p); /* IDLE tasks are never migrated */ 12 return task_cpu(p); /* IDLE tasks are never migrated */
12} 13}
@@ -22,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
22static struct task_struct *pick_next_task_idle(struct rq *rq) 23static struct task_struct *pick_next_task_idle(struct rq *rq)
23{ 24{
24 schedstat_inc(rq, sched_goidle); 25 schedstat_inc(rq, sched_goidle);
25 /* adjust the active tasks as we might go into a long sleep */ 26 calc_load_account_idle(rq);
26 calc_load_account_active(rq);
27 return rq->idle; 27 return rq->idle;
28} 28}
29 29
@@ -32,7 +32,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
32 * message if some code attempts to do it: 32 * message if some code attempts to do it:
33 */ 33 */
34static void 34static void
35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) 35dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
36{ 36{
37 raw_spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
38 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bf3e38fdbe6d..8afb953e31c6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -613,7 +613,7 @@ static void update_curr_rt(struct rq *rq)
613 if (unlikely((s64)delta_exec < 0)) 613 if (unlikely((s64)delta_exec < 0))
614 delta_exec = 0; 614 delta_exec = 0;
615 615
616 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); 616 schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
617 617
618 curr->se.sum_exec_runtime += delta_exec; 618 curr->se.sum_exec_runtime += delta_exec;
619 account_group_exec_runtime(curr, delta_exec); 619 account_group_exec_runtime(curr, delta_exec);
@@ -888,20 +888,20 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
888 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
889 */ 889 */
890static void 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) 891enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
892{ 892{
893 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
894 894
895 if (wakeup) 895 if (flags & ENQUEUE_WAKEUP)
896 rt_se->timeout = 0; 896 rt_se->timeout = 0;
897 897
898 enqueue_rt_entity(rt_se, head); 898 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
899 899
900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
901 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
902} 902}
903 903
904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
905{ 905{
906 struct sched_rt_entity *rt_se = &p->rt; 906 struct sched_rt_entity *rt_se = &p->rt;
907 907
@@ -948,10 +948,9 @@ static void yield_task_rt(struct rq *rq)
948#ifdef CONFIG_SMP 948#ifdef CONFIG_SMP
949static int find_lowest_rq(struct task_struct *task); 949static int find_lowest_rq(struct task_struct *task);
950 950
951static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 951static int
952select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
952{ 953{
953 struct rq *rq = task_rq(p);
954
955 if (sd_flag != SD_BALANCE_WAKE) 954 if (sd_flag != SD_BALANCE_WAKE)
956 return smp_processor_id(); 955 return smp_processor_id();
957 956
@@ -1146,7 +1145,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1146 if (next && next->prio < idx) 1145 if (next && next->prio < idx)
1147 continue; 1146 continue;
1148 list_for_each_entry(rt_se, array->queue + idx, run_list) { 1147 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1149 struct task_struct *p = rt_task_of(rt_se); 1148 struct task_struct *p;
1149
1150 if (!rt_entity_is_task(rt_se))
1151 continue;
1152
1153 p = rt_task_of(rt_se);
1150 if (pick_rt_task(rq, p, cpu)) { 1154 if (pick_rt_task(rq, p, cpu)) {
1151 next = p; 1155 next = p;
1152 break; 1156 break;
@@ -1662,8 +1666,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1662 if (!p->signal) 1666 if (!p->signal)
1663 return; 1667 return;
1664 1668
1665 soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; 1669 /* max may change after cur was read; this will be fixed next tick */
1666 hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; 1670 soft = task_rlimit(p, RLIMIT_RTTIME);
1671 hard = task_rlimit_max(p, RLIMIT_RTTIME);
1667 1672
1668 if (soft != RLIM_INFINITY) { 1673 if (soft != RLIM_INFINITY) {
1669 unsigned long next; 1674 unsigned long next;
diff --git a/kernel/signal.c b/kernel/signal.c
index 934ae5e687b9..dbd7fe073c55 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -159,6 +159,10 @@ void recalc_sigpending(void)
159 159
160/* Given the mask, find the first available signal that should be serviced. */ 160/* Given the mask, find the first available signal that should be serviced. */
161 161
162#define SYNCHRONOUS_MASK \
163 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
164 sigmask(SIGTRAP) | sigmask(SIGFPE))
165
162int next_signal(struct sigpending *pending, sigset_t *mask) 166int next_signal(struct sigpending *pending, sigset_t *mask)
163{ 167{
164 unsigned long i, *s, *m, x; 168 unsigned long i, *s, *m, x;
@@ -166,26 +170,39 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
166 170
167 s = pending->signal.sig; 171 s = pending->signal.sig;
168 m = mask->sig; 172 m = mask->sig;
173
174 /*
175 * Handle the first word specially: it contains the
176 * synchronous signals that need to be dequeued first.
177 */
178 x = *s &~ *m;
179 if (x) {
180 if (x & SYNCHRONOUS_MASK)
181 x &= SYNCHRONOUS_MASK;
182 sig = ffz(~x) + 1;
183 return sig;
184 }
185
169 switch (_NSIG_WORDS) { 186 switch (_NSIG_WORDS) {
170 default: 187 default:
171 for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) 188 for (i = 1; i < _NSIG_WORDS; ++i) {
172 if ((x = *s &~ *m) != 0) { 189 x = *++s &~ *++m;
173 sig = ffz(~x) + i*_NSIG_BPW + 1; 190 if (!x)
174 break; 191 continue;
175 } 192 sig = ffz(~x) + i*_NSIG_BPW + 1;
193 break;
194 }
176 break; 195 break;
177 196
178 case 2: if ((x = s[0] &~ m[0]) != 0) 197 case 2:
179 sig = 1; 198 x = s[1] &~ m[1];
180 else if ((x = s[1] &~ m[1]) != 0) 199 if (!x)
181 sig = _NSIG_BPW + 1;
182 else
183 break; 200 break;
184 sig += ffz(~x); 201 sig = ffz(~x) + _NSIG_BPW + 1;
185 break; 202 break;
186 203
187 case 1: if ((x = *s &~ *m) != 0) 204 case 1:
188 sig = ffz(~x) + 1; 205 /* Nothing to do */
189 break; 206 break;
190 } 207 }
191 208
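The rewrite above gives the first signal word special treatment: when any pending, unblocked signal lives in word 0, the synchronous fault signals (SEGV, BUS, ILL, TRAP, FPE) are dequeued ahead of everything else, and only the remaining words keep the old lowest-bit scan. A userspace model of the first-word logic (signal numbers are the x86 Linux values; __builtin_ctzl(x) + 1 plays the role of ffz(~x) + 1):

#include <stdio.h>

#define SIGMASK(sig)	(1UL << ((sig) - 1))
#define SYNC_MASK \
	(SIGMASK(11 /* SEGV */) | SIGMASK(7 /* BUS */) | \
	 SIGMASK(4  /* ILL  */) | SIGMASK(5 /* TRAP */) | \
	 SIGMASK(8  /* FPE  */))

static int first_word_signal(unsigned long pending, unsigned long blocked)
{
	unsigned long x = pending & ~blocked;

	if (!x)
		return 0;		/* nothing pending in word 0 */
	if (x & SYNC_MASK)
		x &= SYNC_MASK;		/* synchronous signals first */
	return __builtin_ctzl(x) + 1;	/* lowest set bit, 1-based */
}

int main(void)
{
	/* SIGTERM (15) and SIGSEGV (11) both pending: SEGV wins */
	printf("next = %d\n", first_word_signal(SIGMASK(15) | SIGMASK(11), 0));
	return 0;
}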
@@ -228,7 +245,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
228 245
229 if (override_rlimit || 246 if (override_rlimit ||
230 atomic_read(&user->sigpending) <= 247 atomic_read(&user->sigpending) <=
231 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { 248 task_rlimit(t, RLIMIT_SIGPENDING)) {
232 q = kmem_cache_alloc(sigqueue_cachep, flags); 249 q = kmem_cache_alloc(sigqueue_cachep, flags);
233 } else { 250 } else {
234 print_dropped_signal(sig); 251 print_dropped_signal(sig);
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 7494bbf5a270..7d3f4fa9ef4f 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -637,7 +637,7 @@ int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
637 goto cancelled; 637 goto cancelled;
638 638
639 /* the timer holds a reference whilst it is pending */ 639 /* the timer holds a reference whilst it is pending */
640 ret = work->ops->get_ref(work); 640 ret = slow_work_get_ref(work);
641 if (ret < 0) 641 if (ret < 0)
642 goto cant_get_ref; 642 goto cant_get_ref;
643 643
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
index 321f3c59d732..a29ebd1ef41d 100644
--- a/kernel/slow-work.h
+++ b/kernel/slow-work.h
@@ -43,28 +43,28 @@ extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
43 */ 43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid) 44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{ 45{
46#ifdef CONFIG_SLOW_WORK_PROC 46#ifdef CONFIG_SLOW_WORK_DEBUG
47 slow_work_pids[id] = pid; 47 slow_work_pids[id] = pid;
48#endif 48#endif
49} 49}
50 50
51static inline void slow_work_mark_time(struct slow_work *work) 51static inline void slow_work_mark_time(struct slow_work *work)
52{ 52{
53#ifdef CONFIG_SLOW_WORK_PROC 53#ifdef CONFIG_SLOW_WORK_DEBUG
54 work->mark = CURRENT_TIME; 54 work->mark = CURRENT_TIME;
55#endif 55#endif
56} 56}
57 57
58static inline void slow_work_begin_exec(int id, struct slow_work *work) 58static inline void slow_work_begin_exec(int id, struct slow_work *work)
59{ 59{
60#ifdef CONFIG_SLOW_WORK_PROC 60#ifdef CONFIG_SLOW_WORK_DEBUG
61 slow_work_execs[id] = work; 61 slow_work_execs[id] = work;
62#endif 62#endif
63} 63}
64 64
65static inline void slow_work_end_exec(int id, struct slow_work *work) 65static inline void slow_work_end_exec(int id, struct slow_work *work)
66{ 66{
67#ifdef CONFIG_SLOW_WORK_PROC 67#ifdef CONFIG_SLOW_WORK_DEBUG
68 write_lock(&slow_work_execs_lock); 68 write_lock(&slow_work_execs_lock);
69 slow_work_execs[id] = NULL; 69 slow_work_execs[id] = NULL;
70 write_unlock(&slow_work_execs_lock); 70 write_unlock(&slow_work_execs_lock);
diff --git a/kernel/smp.c b/kernel/smp.c
index 9867b6bfefce..3fc697336183 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -9,6 +9,7 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/gfp.h>
12#include <linux/smp.h> 13#include <linux/smp.h>
13#include <linux/cpu.h> 14#include <linux/cpu.h>
14 15
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7c1a67ef0274..0db913a5c60f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -716,7 +716,7 @@ static int run_ksoftirqd(void * __bind_cpu)
716 preempt_enable_no_resched(); 716 preempt_enable_no_resched();
717 cond_resched(); 717 cond_resched();
718 preempt_disable(); 718 preempt_disable();
719 rcu_sched_qs((long)__bind_cpu); 719 rcu_note_context_switch((long)__bind_cpu);
720 } 720 }
721 preempt_enable(); 721 preempt_enable();
722 set_current_state(TASK_INTERRUPTIBLE); 722 set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 0d4c7898ab80..4b493f67dcb5 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -155,11 +155,11 @@ void softlockup_tick(void)
155 * Wake up the high-prio watchdog task twice per 155 * Wake up the high-prio watchdog task twice per
156 * threshold timespan. 156 * threshold timespan.
157 */ 157 */
158 if (now > touch_ts + softlockup_thresh/2) 158 if (time_after(now - softlockup_thresh/2, touch_ts))
159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); 159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
160 160
161 /* Warn about unreasonable delays: */ 161 /* Warn about unreasonable delays: */
162 if (now <= (touch_ts + softlockup_thresh)) 162 if (time_before_eq(now - softlockup_thresh, touch_ts))
163 return; 163 return;
164 164
165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts; 165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
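The comparisons switch to time_after()/time_before_eq() because these timestamps can wrap: a plain '>' goes wrong across the wrap, while the jiffies macros compare through a signed subtraction that stays correct. A small userspace demonstration of the difference (32-bit counter assumed; the real macros live in <linux/jiffies.h>):

#include <stdio.h>

typedef unsigned int jiffies_t;

/* mirrors time_after(a, b): true if a is after b, wrap-safe */
static int my_time_after(jiffies_t a, jiffies_t b)
{
	return (int)(b - a) < 0;
}

int main(void)
{
	jiffies_t touch = 0xfffffff0u;	/* just before the wrap */
	jiffies_t now   = 0x00000010u;	/* just after the wrap  */

	printf("naive now > touch : %d\n", now > touch);		/* 0: wrong */
	printf("time_after model  : %d\n", my_time_after(now, touch));	/* 1: right */
	return 0;
}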
diff --git a/kernel/srcu.c b/kernel/srcu.c
index bde4295774c8..2980da3fd509 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -30,7 +30,6 @@
30#include <linux/preempt.h> 30#include <linux/preempt.h>
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/slab.h>
34#include <linux/smp.h> 33#include <linux/smp.h>
35#include <linux/srcu.h> 34#include <linux/srcu.h>
36 35
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 912823e2a11b..b4e7431e7c78 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,17 +1,384 @@
1/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. 1/*
2 * GPL v2 and any later version. 2 * kernel/stop_machine.c
3 *
4 * Copyright (C) 2008, 2005 IBM Corporation.
5 * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
6 * Copyright (C) 2010 SUSE Linux Products GmbH
7 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
8 *
9 * This file is released under the GPLv2 and any later version.
3 */ 10 */
11#include <linux/completion.h>
4#include <linux/cpu.h> 12#include <linux/cpu.h>
5#include <linux/err.h> 13#include <linux/init.h>
6#include <linux/kthread.h> 14#include <linux/kthread.h>
7#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/percpu.h>
8#include <linux/sched.h> 17#include <linux/sched.h>
9#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
10#include <linux/syscalls.h>
11#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h>
12 21
13#include <asm/atomic.h> 22#include <asm/atomic.h>
14#include <asm/uaccess.h> 23
24/*
25 * Structure to determine completion condition and record errors. May
26 * be shared by works on different cpus.
27 */
28struct cpu_stop_done {
29 atomic_t nr_todo; /* nr left to execute */
30 bool executed; /* actually executed? */
31 int ret; /* collected return value */
32 struct completion completion; /* fired if nr_todo reaches 0 */
33};
34
35/* the actual stopper, one per every possible cpu, enabled on online cpus */
36struct cpu_stopper {
37 spinlock_t lock;
38 struct list_head works; /* list of pending works */
39 struct task_struct *thread; /* stopper thread */
40 bool enabled; /* is this stopper enabled? */
41};
42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44
45static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
46{
47 memset(done, 0, sizeof(*done));
48 atomic_set(&done->nr_todo, nr_todo);
49 init_completion(&done->completion);
50}
51
52/* signal completion unless @done is NULL */
53static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
54{
55 if (done) {
56 if (executed)
57 done->executed = true;
58 if (atomic_dec_and_test(&done->nr_todo))
59 complete(&done->completion);
60 }
61}
62
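cpu_stop_init_done() and cpu_stop_signal_done() implement a small fan-in: one done record is shared by however many works were queued, and only the work that drops nr_todo to zero completes the waiting caller. The same idiom in a runnable C11 miniature, with an atomic flag standing in for the kernel's struct completion:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct done {
	atomic_int nr_todo;	/* works left to execute */
	atomic_bool completed;	/* stands in for complete() */
};

static void signal_done(struct done *d)
{
	if (atomic_fetch_sub(&d->nr_todo, 1) == 1)
		atomic_store(&d->completed, true);	/* last one fires */
}

int main(void)
{
	struct done d = { 3, false };

	signal_done(&d);
	signal_done(&d);
	printf("after 2 of 3: %d\n", atomic_load(&d.completed)); /* 0 */
	signal_done(&d);
	printf("after 3 of 3: %d\n", atomic_load(&d.completed)); /* 1 */
	return 0;
}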
63/* queue @work to @stopper. if offline, @work is completed immediately */
64static void cpu_stop_queue_work(struct cpu_stopper *stopper,
65 struct cpu_stop_work *work)
66{
67 unsigned long flags;
68
69 spin_lock_irqsave(&stopper->lock, flags);
70
71 if (stopper->enabled) {
72 list_add_tail(&work->list, &stopper->works);
73 wake_up_process(stopper->thread);
74 } else
75 cpu_stop_signal_done(work->done, false);
76
77 spin_unlock_irqrestore(&stopper->lock, flags);
78}
79
80/**
81 * stop_one_cpu - stop a cpu
82 * @cpu: cpu to stop
83 * @fn: function to execute
84 * @arg: argument to @fn
85 *
86 * Execute @fn(@arg) on @cpu. @fn is run in a process context with
87 * the highest priority preempting any task on the cpu and
88 * monopolizing it. This function returns after the execution is
89 * complete.
90 *
91 * This function doesn't guarantee @cpu stays online till @fn
92 * completes. If @cpu goes down in the middle, execution may happen
93 * partially or fully on different cpus. @fn should either be ready
94 * for that or the caller should ensure that @cpu stays online until
95 * this function completes.
96 *
97 * CONTEXT:
98 * Might sleep.
99 *
100 * RETURNS:
101 * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
102 * otherwise, the return value of @fn.
103 */
104int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
105{
106 struct cpu_stop_done done;
107 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
108
109 cpu_stop_init_done(&done, 1);
110 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
111 wait_for_completion(&done.completion);
112 return done.executed ? done.ret : -ENOENT;
113}
114
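A hypothetical caller of the API defined above (the callback and its argument are invented for illustration): stop_one_cpu() runs the function on the chosen cpu with everything else preempted, and hands back its return value.

#include <linux/stop_machine.h>

static int read_cpu_state(void *arg)
{
	unsigned long *out = arg;

	/* executes on the target cpu; nothing else runs there meanwhile */
	*out = 42;	/* e.g. sample a per-cpu hardware register */
	return 0;
}

static int sample_cpu(unsigned int cpu, unsigned long *val)
{
	/* may sleep; returns -ENOENT if the cpu went offline first */
	return stop_one_cpu(cpu, read_cpu_state, val);
}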
115/**
116 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
117 * @cpu: cpu to stop
118 * @fn: function to execute
119 * @arg: argument to @fn
120 *
121 * Similar to stop_one_cpu() but doesn't wait for completion. The
122 * caller is responsible for ensuring @work_buf is currently unused
123 * and will remain untouched until stopper starts executing @fn.
124 *
125 * CONTEXT:
126 * Don't care.
127 */
128void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
129 struct cpu_stop_work *work_buf)
130{
131 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
132 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
133}
134
135/* static data for stop_cpus */
136static DEFINE_MUTEX(stop_cpus_mutex);
137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
138
139int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
140{
141 struct cpu_stop_work *work;
142 struct cpu_stop_done done;
143 unsigned int cpu;
144
145 /* initialize works and done */
146 for_each_cpu(cpu, cpumask) {
147 work = &per_cpu(stop_cpus_work, cpu);
148 work->fn = fn;
149 work->arg = arg;
150 work->done = &done;
151 }
152 cpu_stop_init_done(&done, cpumask_weight(cpumask));
153
154 /*
155 * Disable preemption while queueing to avoid getting
156 * preempted by a stopper which might wait for other stoppers
157 * to enter @fn which can lead to deadlock.
158 */
159 preempt_disable();
160 for_each_cpu(cpu, cpumask)
161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
162 &per_cpu(stop_cpus_work, cpu));
163 preempt_enable();
164
165 wait_for_completion(&done.completion);
166 return done.executed ? done.ret : -ENOENT;
167}
168
169/**
170 * stop_cpus - stop multiple cpus
171 * @cpumask: cpus to stop
172 * @fn: function to execute
173 * @arg: argument to @fn
174 *
175 * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu,
176 * @fn is run in a process context with the highest priority
177 * preempting any task on the cpu and monopolizing it. This function
178 * returns after all executions are complete.
179 *
180 * This function doesn't guarantee the cpus in @cpumask stay online
181 * till @fn completes. If some cpus go down in the middle, execution
182 * on the cpu may happen partially or fully on different cpus. @fn
183 * should either be ready for that or the caller should ensure that
184 * the cpus stay online until this function completes.
185 *
186 * All stop_cpus() calls are serialized making it safe for @fn to wait
187 * for all cpus to start executing it.
188 *
189 * CONTEXT:
190 * Might sleep.
191 *
192 * RETURNS:
193 * -ENOENT if @fn(@arg) was not executed at all because all cpus in
194 * @cpumask were offline; otherwise, 0 if all executions of @fn
195 * returned 0, any non zero return value if any returned non zero.
196 */
197int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
198{
199 int ret;
200
201 /* static works are used, process one request at a time */
202 mutex_lock(&stop_cpus_mutex);
203 ret = __stop_cpus(cpumask, fn, arg);
204 mutex_unlock(&stop_cpus_mutex);
205 return ret;
206}
207
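And a sketch of a stop_cpus() caller (again an invented example, using only the signature defined above): run a callback on every online cpu, relying on the stop_cpus_mutex serialization that lets @fn rendezvous with its siblings.

#include <linux/stop_machine.h>
#include <linux/cpumask.h>

static int flush_local_state(void *unused)
{
	/* runs on each targeted cpu at the highest priority */
	return 0;
}

static int flush_all_cpus(void)
{
	/* 0 on success; -ENOENT if every cpu in the mask was offline */
	return stop_cpus(cpu_online_mask, flush_local_state, NULL);
}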
208/**
209 * try_stop_cpus - try to stop multiple cpus
210 * @cpumask: cpus to stop
211 * @fn: function to execute
212 * @arg: argument to @fn
213 *
214 * Identical to stop_cpus() except that it fails with -EAGAIN if
215 * someone else is already using the facility.
216 *
217 * CONTEXT:
218 * Might sleep.
219 *
220 * RETURNS:
221 * -EAGAIN if someone else is already stopping cpus, -ENOENT if
222 * @fn(@arg) was not executed at all because all cpus in @cpumask were
223 * offline; otherwise, 0 if all executions of @fn returned 0, any non
224 * zero return value if any returned non zero.
225 */
226int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
227{
228 int ret;
229
230 /* static works are used, process one request at a time */
231 if (!mutex_trylock(&stop_cpus_mutex))
232 return -EAGAIN;
233 ret = __stop_cpus(cpumask, fn, arg);
234 mutex_unlock(&stop_cpus_mutex);
235 return ret;
236}
237
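try_stop_cpus() differs only in failing fast when another stop_cpus() user holds the facility. A hypothetical caller that prefers to back off rather than queue behind them (illustrative pattern, not from the patch):

#include <linux/stop_machine.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

static int try_flush_all(cpu_stop_fn_t fn, void *arg)
{
	int ret = try_stop_cpus(cpu_online_mask, fn, arg);

	if (ret == -EAGAIN)
		return -EBUSY;	/* someone else is stopping cpus */
	return ret;
}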
238static int cpu_stopper_thread(void *data)
239{
240 struct cpu_stopper *stopper = data;
241 struct cpu_stop_work *work;
242 int ret;
243
244repeat:
245 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
246
247 if (kthread_should_stop()) {
248 __set_current_state(TASK_RUNNING);
249 return 0;
250 }
251
252 work = NULL;
253 spin_lock_irq(&stopper->lock);
254 if (!list_empty(&stopper->works)) {
255 work = list_first_entry(&stopper->works,
256 struct cpu_stop_work, list);
257 list_del_init(&work->list);
258 }
259 spin_unlock_irq(&stopper->lock);
260
261 if (work) {
262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN];
266
267 __set_current_state(TASK_RUNNING);
268
269 /* cpu stop callbacks are not allowed to sleep */
270 preempt_disable();
271
272 ret = fn(arg);
273 if (ret)
274 done->ret = ret;
275
276 /* restore preemption and check it's still balanced */
277 preempt_enable();
278 WARN_ONCE(preempt_count(),
279 "cpu_stop: %s(%p) leaked preempt count\n",
280 kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
281 ksym_buf), arg);
282
283 cpu_stop_signal_done(done, true);
284 } else
285 schedule();
286
287 goto repeat;
288}
289
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu)
293{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct task_struct *p;
298
299 switch (action & ~CPU_TASKS_FROZEN) {
300 case CPU_UP_PREPARE:
301 BUG_ON(stopper->thread || stopper->enabled ||
302 !list_empty(&stopper->works));
303 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
304 cpu);
305 if (IS_ERR(p))
306 return NOTIFY_BAD;
307 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
308 get_task_struct(p);
309 stopper->thread = p;
310 break;
311
312 case CPU_ONLINE:
313 kthread_bind(stopper->thread, cpu);
314 /* strictly unnecessary, as first user will wake it */
315 wake_up_process(stopper->thread);
316 /* mark enabled */
317 spin_lock_irq(&stopper->lock);
318 stopper->enabled = true;
319 spin_unlock_irq(&stopper->lock);
320 break;
321
322#ifdef CONFIG_HOTPLUG_CPU
323 case CPU_UP_CANCELED:
324 case CPU_DEAD:
325 {
326 struct cpu_stop_work *work;
327
328 /* kill the stopper */
329 kthread_stop(stopper->thread);
330 /* drain remaining works */
331 spin_lock_irq(&stopper->lock);
332 list_for_each_entry(work, &stopper->works, list)
333 cpu_stop_signal_done(work->done, false);
334 stopper->enabled = false;
335 spin_unlock_irq(&stopper->lock);
336 /* release the stopper */
337 put_task_struct(stopper->thread);
338 stopper->thread = NULL;
339 break;
340 }
341#endif
342 }
343
344 return NOTIFY_OK;
345}
346
347/*
348 * Give it a higher priority so that cpu stopper is available to other
349 * cpu notifiers. It currently shares the same priority as sched
350 * migration_notifier.
351 */
352static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
353 .notifier_call = cpu_stop_cpu_callback,
354 .priority = 10,
355};
356
357static int __init cpu_stop_init(void)
358{
359 void *bcpu = (void *)(long)smp_processor_id();
360 unsigned int cpu;
361 int err;
362
363 for_each_possible_cpu(cpu) {
364 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
365
366 spin_lock_init(&stopper->lock);
367 INIT_LIST_HEAD(&stopper->works);
368 }
369
370 /* start one for the boot cpu */
371 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
372 bcpu);
373 BUG_ON(err == NOTIFY_BAD);
374 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
375 register_cpu_notifier(&cpu_stop_cpu_notifier);
376
377 return 0;
378}
379early_initcall(cpu_stop_init);
380
381#ifdef CONFIG_STOP_MACHINE
15 382
16/* This controls the threads on each CPU. */ 383/* This controls the threads on each CPU. */
17enum stopmachine_state { 384enum stopmachine_state {
@@ -26,174 +393,94 @@ enum stopmachine_state {
26 /* Exit */ 393 /* Exit */
27 STOPMACHINE_EXIT, 394 STOPMACHINE_EXIT,
28}; 395};
29static enum stopmachine_state state;
30 396
31struct stop_machine_data { 397struct stop_machine_data {
32 int (*fn)(void *); 398 int (*fn)(void *);
33 void *data; 399 void *data;
34 int fnret; 400 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
401 unsigned int num_threads;
402 const struct cpumask *active_cpus;
403
404 enum stopmachine_state state;
405 atomic_t thread_ack;
35}; 406};
36 407
37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ 408static void set_state(struct stop_machine_data *smdata,
38static unsigned int num_threads; 409 enum stopmachine_state newstate)
39static atomic_t thread_ack;
40static DEFINE_MUTEX(lock);
41/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
42static DEFINE_MUTEX(setup_lock);
43/* Users of stop_machine. */
44static int refcount;
45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus;
48static void *stop_machine_work;
49
50static void set_state(enum stopmachine_state newstate)
51{ 410{
52 /* Reset ack counter. */ 411 /* Reset ack counter. */
53 atomic_set(&thread_ack, num_threads); 412 atomic_set(&smdata->thread_ack, smdata->num_threads);
54 smp_wmb(); 413 smp_wmb();
55 state = newstate; 414 smdata->state = newstate;
56} 415}
57 416
58/* Last one to ack a state moves to the next state. */ 417/* Last one to ack a state moves to the next state. */
59static void ack_state(void) 418static void ack_state(struct stop_machine_data *smdata)
60{ 419{
61 if (atomic_dec_and_test(&thread_ack)) 420 if (atomic_dec_and_test(&smdata->thread_ack))
62 set_state(state + 1); 421 set_state(smdata, smdata->state + 1);
63} 422}
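
How the barrier advances, traced for num_threads = 3 (a worked illustration, not code from the patch):

	/* state = STOPMACHINE_PREPARE, thread_ack = 3
	 *   cpu0: ack_state() -> thread_ack 2
	 *   cpu1: ack_state() -> thread_ack 1
	 *   cpu2: ack_state() -> thread_ack 0, so cpu2 runs
	 *         set_state(smdata, state + 1), resetting thread_ack to 3
	 * No CPU can observe state N+1 until every CPU has acked state N,
	 * which is what keeps the state machine below in lockstep.
	 */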
64 423
65/* This is the actual function which stops the CPU. It runs 424/* This is the cpu_stop function which stops the CPU. */
66 * in the context of a dedicated stopmachine workqueue. */ 425static int stop_machine_cpu_stop(void *data)
67static void stop_cpu(struct work_struct *unused)
68{ 426{
427 struct stop_machine_data *smdata = data;
69 enum stopmachine_state curstate = STOPMACHINE_NONE; 428 enum stopmachine_state curstate = STOPMACHINE_NONE;
70 struct stop_machine_data *smdata = &idle; 429 int cpu = smp_processor_id(), err = 0;
71 int cpu = smp_processor_id(); 430 bool is_active;
72 int err; 431
432 if (!smdata->active_cpus)
433 is_active = cpu == cpumask_first(cpu_online_mask);
434 else
435 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
73 436
74 if (!active_cpus) {
75 if (cpu == cpumask_first(cpu_online_mask))
76 smdata = &active;
77 } else {
78 if (cpumask_test_cpu(cpu, active_cpus))
79 smdata = &active;
80 }
81 /* Simple state machine */ 437 /* Simple state machine */
82 do { 438 do {
83 /* Chill out and ensure we re-read stopmachine_state. */ 439 /* Chill out and ensure we re-read stopmachine_state. */
84 cpu_relax(); 440 cpu_relax();
85 if (state != curstate) { 441 if (smdata->state != curstate) {
86 curstate = state; 442 curstate = smdata->state;
87 switch (curstate) { 443 switch (curstate) {
88 case STOPMACHINE_DISABLE_IRQ: 444 case STOPMACHINE_DISABLE_IRQ:
89 local_irq_disable(); 445 local_irq_disable();
90 hard_irq_disable(); 446 hard_irq_disable();
91 break; 447 break;
92 case STOPMACHINE_RUN: 448 case STOPMACHINE_RUN:
93 /* On multiple CPUs only a single error code 449 if (is_active)
94 * is needed to tell that something failed. */ 450 err = smdata->fn(smdata->data);
95 err = smdata->fn(smdata->data);
96 if (err)
97 smdata->fnret = err;
98 break; 451 break;
99 default: 452 default:
100 break; 453 break;
101 } 454 }
102 ack_state(); 455 ack_state(smdata);
103 } 456 }
104 } while (curstate != STOPMACHINE_EXIT); 457 } while (curstate != STOPMACHINE_EXIT);
105 458
106 local_irq_enable(); 459 local_irq_enable();
460 return err;
107} 461}
108 462
109/* Callback for CPUs which aren't supposed to do anything. */
110static int chill(void *unused)
111{
112 return 0;
113}
114
115int stop_machine_create(void)
116{
117 mutex_lock(&setup_lock);
118 if (refcount)
119 goto done;
120 stop_machine_wq = create_rt_workqueue("kstop");
121 if (!stop_machine_wq)
122 goto err_out;
123 stop_machine_work = alloc_percpu(struct work_struct);
124 if (!stop_machine_work)
125 goto err_out;
126done:
127 refcount++;
128 mutex_unlock(&setup_lock);
129 return 0;
130
131err_out:
132 if (stop_machine_wq)
133 destroy_workqueue(stop_machine_wq);
134 mutex_unlock(&setup_lock);
135 return -ENOMEM;
136}
137EXPORT_SYMBOL_GPL(stop_machine_create);
138
139void stop_machine_destroy(void)
140{
141 mutex_lock(&setup_lock);
142 refcount--;
143 if (refcount)
144 goto done;
145 destroy_workqueue(stop_machine_wq);
146 free_percpu(stop_machine_work);
147done:
148 mutex_unlock(&setup_lock);
149}
150EXPORT_SYMBOL_GPL(stop_machine_destroy);
151
152int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 463int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
153{ 464{
154 struct work_struct *sm_work; 465 struct stop_machine_data smdata = { .fn = fn, .data = data,
155 int i, ret; 466 .num_threads = num_online_cpus(),
156 467 .active_cpus = cpus };
157 /* Set up initial state. */ 468
158 mutex_lock(&lock); 469 /* Set the initial state and stop all online cpus. */
159 num_threads = num_online_cpus(); 470 set_state(&smdata, STOPMACHINE_PREPARE);
160 active_cpus = cpus; 471 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
161 active.fn = fn;
162 active.data = data;
163 active.fnret = 0;
164 idle.fn = chill;
165 idle.data = NULL;
166
167 set_state(STOPMACHINE_PREPARE);
168
169 /* Schedule the stop_cpu work on all cpus: hold this CPU so one
170 * doesn't hit this CPU until we're ready. */
171 get_cpu();
172 for_each_online_cpu(i) {
173 sm_work = per_cpu_ptr(stop_machine_work, i);
174 INIT_WORK(sm_work, stop_cpu);
175 queue_work_on(i, stop_machine_wq, sm_work);
176 }
177 /* This will release the thread on our CPU. */
178 put_cpu();
179 flush_workqueue(stop_machine_wq);
180 ret = active.fnret;
181 mutex_unlock(&lock);
182 return ret;
183} 472}
184 473
185int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 474int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
186{ 475{
187 int ret; 476 int ret;
188 477
189 ret = stop_machine_create();
190 if (ret)
191 return ret;
192 /* No CPUs can come up or down during this. */ 478 /* No CPUs can come up or down during this. */
193 get_online_cpus(); 479 get_online_cpus();
194 ret = __stop_machine(fn, data, cpus); 480 ret = __stop_machine(fn, data, cpus);
195 put_online_cpus(); 481 put_online_cpus();
196 stop_machine_destroy();
197 return ret; 482 return ret;
198} 483}
199EXPORT_SYMBOL_GPL(stop_machine); 484EXPORT_SYMBOL_GPL(stop_machine);
485
486#endif /* CONFIG_STOP_MACHINE */
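
Typical use of the simplified API, sketched with invented names (the callback runs on one CPU while every other online CPU spins with interrupts disabled):

	static int apply_patch(void *addr)
	{
		/* no other CPU is executing kernel code at this point */
		/* ... rewrite instructions at addr ... */
		return 0;
	}

	/* NULL cpumask: @fn runs on the first online CPU only */
	err = stop_machine(apply_patch, text_addr, NULL);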
diff --git a/kernel/sys.c b/kernel/sys.c
index 877fe4f8e05e..7cb426a58965 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -33,8 +33,10 @@
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/personality.h>
36#include <linux/ptrace.h> 37#include <linux/ptrace.h>
37#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
39#include <linux/gfp.h>
38 40
39#include <linux/compat.h> 41#include <linux/compat.h>
40#include <linux/syscalls.h> 42#include <linux/syscalls.h>
@@ -571,8 +573,7 @@ static int set_user(struct cred *new)
571 if (!new_user) 573 if (!new_user)
572 return -EAGAIN; 574 return -EAGAIN;
573 575
574 if (atomic_read(&new_user->processes) >= 576 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
575 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
576 new_user != INIT_USER) { 577 new_user != INIT_USER) {
577 free_uid(new_user); 578 free_uid(new_user);
578 return -EAGAIN; 579 return -EAGAIN;
@@ -1115,6 +1116,15 @@ out:
1115 1116
1116DECLARE_RWSEM(uts_sem); 1117DECLARE_RWSEM(uts_sem);
1117 1118
1119#ifdef COMPAT_UTS_MACHINE
1120#define override_architecture(name) \
1121 (personality(current->personality) == PER_LINUX32 && \
1122 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
1123 sizeof(COMPAT_UTS_MACHINE)))
1124#else
1125#define override_architecture(name) 0
1126#endif
1127
1118SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1128SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1119{ 1129{
1120 int errno = 0; 1130 int errno = 0;
@@ -1123,9 +1133,66 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1123 if (copy_to_user(name, utsname(), sizeof *name)) 1133 if (copy_to_user(name, utsname(), sizeof *name))
1124 errno = -EFAULT; 1134 errno = -EFAULT;
1125 up_read(&uts_sem); 1135 up_read(&uts_sem);
1136
1137 if (!errno && override_architecture(name))
1138 errno = -EFAULT;
1126 return errno; 1139 return errno;
1127} 1140}
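
The override above is what lets `setarch i386 uname -m` report a 32-bit machine string on a 64-bit kernel. A userspace illustration (the exact COMPAT_UTS_MACHINE string, e.g. "i686" on x86_64, is architecture-specific):

	#include <stdio.h>
	#include <sys/personality.h>
	#include <sys/utsname.h>

	int main(void)
	{
		struct utsname u;

		personality(PER_LINUX32);	/* request 32-bit personality */
		uname(&u);			/* kernel rewrites u.machine */
		printf("%s\n", u.machine);	/* e.g. "i686" on x86_64 */
		return 0;
	}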
1128 1141
1142#ifdef __ARCH_WANT_SYS_OLD_UNAME
1143/*
1144 * Old cruft
1145 */
1146SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
1147{
1148 int error = 0;
1149
1150 if (!name)
1151 return -EFAULT;
1152
1153 down_read(&uts_sem);
1154 if (copy_to_user(name, utsname(), sizeof(*name)))
1155 error = -EFAULT;
1156 up_read(&uts_sem);
1157
1158 if (!error && override_architecture(name))
1159 error = -EFAULT;
1160 return error;
1161}
1162
1163SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
1164{
1165 int error;
1166
1167 if (!name)
1168 return -EFAULT;
1169 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
1170 return -EFAULT;
1171
1172 down_read(&uts_sem);
1173 error = __copy_to_user(&name->sysname, &utsname()->sysname,
1174 __OLD_UTS_LEN);
1175 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
1176 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
1177 __OLD_UTS_LEN);
1178 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
1179 error |= __copy_to_user(&name->release, &utsname()->release,
1180 __OLD_UTS_LEN);
1181 error |= __put_user(0, name->release + __OLD_UTS_LEN);
1182 error |= __copy_to_user(&name->version, &utsname()->version,
1183 __OLD_UTS_LEN);
1184 error |= __put_user(0, name->version + __OLD_UTS_LEN);
1185 error |= __copy_to_user(&name->machine, &utsname()->machine,
1186 __OLD_UTS_LEN);
1187 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
1188 up_read(&uts_sem);
1189
1190 if (!error && override_architecture(name))
1191 error = -EFAULT;
1192 return error ? -EFAULT : 0;
1193}
1194#endif
1195
1129SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) 1196SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1130{ 1197{
1131 int errno; 1198 int errno;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 695384f12a7d..70f2ea758ffe 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -126,6 +126,7 @@ cond_syscall(sys_setreuid16);
126cond_syscall(sys_setuid16); 126cond_syscall(sys_setuid16);
127cond_syscall(sys_vm86old); 127cond_syscall(sys_vm86old);
128cond_syscall(sys_vm86); 128cond_syscall(sys_vm86);
129cond_syscall(sys_ipc);
129cond_syscall(compat_sys_ipc); 130cond_syscall(compat_sys_ipc);
130cond_syscall(compat_sys_sysctl); 131cond_syscall(compat_sys_sysctl);
131cond_syscall(sys_flock); 132cond_syscall(sys_flock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8a68b2448468..8686b0f5fc12 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,6 +23,7 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/signal.h>
26#include <linux/proc_fs.h> 27#include <linux/proc_fs.h>
27#include <linux/security.h> 28#include <linux/security.h>
28#include <linux/ctype.h> 29#include <linux/ctype.h>
@@ -50,6 +51,7 @@
50#include <linux/ftrace.h> 51#include <linux/ftrace.h>
51#include <linux/slow-work.h> 52#include <linux/slow-work.h>
52#include <linux/perf_event.h> 53#include <linux/perf_event.h>
54#include <linux/kprobes.h>
53 55
54#include <asm/uaccess.h> 56#include <asm/uaccess.h>
55#include <asm/processor.h> 57#include <asm/processor.h>
@@ -59,13 +61,23 @@
59#include <asm/stacktrace.h> 61#include <asm/stacktrace.h>
60#include <asm/io.h> 62#include <asm/io.h>
61#endif 63#endif
64#ifdef CONFIG_BSD_PROCESS_ACCT
65#include <linux/acct.h>
66#endif
67#ifdef CONFIG_RT_MUTEXES
68#include <linux/rtmutex.h>
69#endif
70#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
71#include <linux/lockdep.h>
72#endif
73#ifdef CONFIG_CHR_DEV_SG
74#include <scsi/sg.h>
75#endif
62 76
63 77
64#if defined(CONFIG_SYSCTL) 78#if defined(CONFIG_SYSCTL)
65 79
66/* External variables not in a header file. */ 80/* External variables not in a header file. */
67extern int C_A_D;
68extern int print_fatal_signals;
69extern int sysctl_overcommit_memory; 81extern int sysctl_overcommit_memory;
70extern int sysctl_overcommit_ratio; 82extern int sysctl_overcommit_ratio;
71extern int sysctl_panic_on_oom; 83extern int sysctl_panic_on_oom;
@@ -87,9 +99,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
87#ifndef CONFIG_MMU 99#ifndef CONFIG_MMU
88extern int sysctl_nr_trim_pages; 100extern int sysctl_nr_trim_pages;
89#endif 101#endif
90#ifdef CONFIG_RCU_TORTURE_TEST
91extern int rcutorture_runnable;
92#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
93#ifdef CONFIG_BLOCK 102#ifdef CONFIG_BLOCK
94extern int blk_iopoll_enabled; 103extern int blk_iopoll_enabled;
95#endif 104#endif
@@ -119,14 +128,6 @@ static int min_percpu_pagelist_fract = 8;
119 128
120static int ngroups_max = NGROUPS_MAX; 129static int ngroups_max = NGROUPS_MAX;
121 130
122#ifdef CONFIG_MODULES
123extern char modprobe_path[];
124extern int modules_disabled;
125#endif
126#ifdef CONFIG_CHR_DEV_SG
127extern int sg_big_buff;
128#endif
129
130#ifdef CONFIG_SPARC 131#ifdef CONFIG_SPARC
131#include <asm/system.h> 132#include <asm/system.h>
132#endif 133#endif
@@ -148,10 +149,6 @@ extern int sysctl_userprocess_debug;
148extern int spin_retry; 149extern int spin_retry;
149#endif 150#endif
150 151
151#ifdef CONFIG_BSD_PROCESS_ACCT
152extern int acct_parm[];
153#endif
154
155#ifdef CONFIG_IA64 152#ifdef CONFIG_IA64
156extern int no_unaligned_warning; 153extern int no_unaligned_warning;
157extern int unaligned_dump_stack; 154extern int unaligned_dump_stack;
@@ -159,10 +156,6 @@ extern int unaligned_dump_stack;
159 156
160extern struct ratelimit_state printk_ratelimit_state; 157extern struct ratelimit_state printk_ratelimit_state;
161 158
162#ifdef CONFIG_RT_MUTEXES
163extern int max_lock_depth;
164#endif
165
166#ifdef CONFIG_PROC_SYSCTL 159#ifdef CONFIG_PROC_SYSCTL
167static int proc_do_cad_pid(struct ctl_table *table, int write, 160static int proc_do_cad_pid(struct ctl_table *table, int write,
168 void __user *buffer, size_t *lenp, loff_t *ppos); 161 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -201,9 +194,6 @@ extern struct ctl_table epoll_table[];
201int sysctl_legacy_va_layout; 194int sysctl_legacy_va_layout;
202#endif 195#endif
203 196
204extern int prove_locking;
205extern int lock_stat;
206
207/* The default sysctl tables: */ 197/* The default sysctl tables: */
208 198
209static struct ctl_table root_table[] = { 199static struct ctl_table root_table[] = {
@@ -1441,7 +1431,7 @@ static struct ctl_table fs_table[] = {
1441}; 1431};
1442 1432
1443static struct ctl_table debug_table[] = { 1433static struct ctl_table debug_table[] = {
1444#if defined(CONFIG_X86) || defined(CONFIG_PPC) 1434#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC)
1445 { 1435 {
1446 .procname = "exception-trace", 1436 .procname = "exception-trace",
1447 .data = &show_unhandled_signals, 1437 .data = &show_unhandled_signals,
@@ -1450,6 +1440,17 @@ static struct ctl_table debug_table[] = {
1450 .proc_handler = proc_dointvec 1440 .proc_handler = proc_dointvec
1451 }, 1441 },
1452#endif 1442#endif
1443#if defined(CONFIG_OPTPROBES)
1444 {
1445 .procname = "kprobes-optimization",
1446 .data = &sysctl_kprobes_optimization,
1447 .maxlen = sizeof(int),
1448 .mode = 0644,
1449 .proc_handler = proc_kprobes_optimization_handler,
1450 .extra1 = &zero,
1451 .extra2 = &one,
1452 },
1453#endif
1453 { } 1454 { }
1454}; 1455};
1455 1456
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 8f5d16e0707a..59030570f5ca 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -13,6 +13,7 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/netdevice.h> 15#include <linux/netdevice.h>
16#include <linux/slab.h>
16 17
17#ifdef CONFIG_SYSCTL_SYSCALL 18#ifdef CONFIG_SYSCTL_SYSCALL
18 19
@@ -1331,7 +1332,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1331 ssize_t result; 1332 ssize_t result;
1332 char *pathname; 1333 char *pathname;
1333 int flags; 1334 int flags;
1334 int acc_mode, fmode; 1335 int acc_mode;
1335 1336
1336 pathname = sysctl_getname(name, nlen, &table); 1337 pathname = sysctl_getname(name, nlen, &table);
1337 result = PTR_ERR(pathname); 1338 result = PTR_ERR(pathname);
@@ -1342,15 +1343,12 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1342 if (oldval && oldlen && newval && newlen) { 1343 if (oldval && oldlen && newval && newlen) {
1343 flags = O_RDWR; 1344 flags = O_RDWR;
1344 acc_mode = MAY_READ | MAY_WRITE; 1345 acc_mode = MAY_READ | MAY_WRITE;
1345 fmode = FMODE_READ | FMODE_WRITE;
1346 } else if (newval && newlen) { 1346 } else if (newval && newlen) {
1347 flags = O_WRONLY; 1347 flags = O_WRONLY;
1348 acc_mode = MAY_WRITE; 1348 acc_mode = MAY_WRITE;
1349 fmode = FMODE_WRITE;
1350 } else if (oldval && oldlen) { 1349 } else if (oldval && oldlen) {
1351 flags = O_RDONLY; 1350 flags = O_RDONLY;
1352 acc_mode = MAY_READ; 1351 acc_mode = MAY_READ;
1353 fmode = FMODE_READ;
1354 } else { 1352 } else {
1355 result = 0; 1353 result = 0;
1356 goto out_putname; 1354 goto out_putname;
@@ -1361,7 +1359,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1361 if (result) 1359 if (result)
1362 goto out_putname; 1360 goto out_putname;
1363 1361
1364 result = may_open(&nd.path, acc_mode, fmode); 1362 result = may_open(&nd.path, acc_mode, flags);
1365 if (result) 1363 if (result)
1366 goto out_putpath; 1364 goto out_putpath;
1367 1365
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea8384d3caa7..11281d5792bd 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -22,6 +22,7 @@
22#include <linux/delayacct.h> 22#include <linux/delayacct.h>
23#include <linux/cpumask.h> 23#include <linux/cpumask.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/slab.h>
25#include <linux/cgroupstats.h> 26#include <linux/cgroupstats.h>
26#include <linux/cgroup.h> 27#include <linux/cgroup.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
@@ -46,15 +47,13 @@ static struct genl_family family = {
46 .maxattr = TASKSTATS_CMD_ATTR_MAX, 47 .maxattr = TASKSTATS_CMD_ATTR_MAX,
47}; 48};
48 49
49static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] 50static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
50__read_mostly = {
51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
55 55
56static struct nla_policy 56static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
57cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 57 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
59}; 58};
60 59
diff --git a/kernel/time.c b/kernel/time.c
index 804798005d19..656dccfe1cbb 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -35,7 +35,6 @@
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h>
39#include <linux/math64.h> 38#include <linux/math64.h>
40#include <linux/ptrace.h> 39#include <linux/ptrace.h>
41 40
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1f663d23e85e..1f5dde637457 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -592,6 +592,10 @@ static inline void clocksource_select(void) { }
592 */ 592 */
593static int __init clocksource_done_booting(void) 593static int __init clocksource_done_booting(void)
594{ 594{
595 mutex_lock(&clocksource_mutex);
596 curr_clocksource = clocksource_default_clock();
597 mutex_unlock(&clocksource_mutex);
598
595 finished_booting = 1; 599 finished_booting = 1;
596 600
597 /* 601 /*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 0a8a213016f0..aada0e52680a 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -22,6 +22,29 @@
22 22
23#include "tick-internal.h" 23#include "tick-internal.h"
24 24
25/* Limit min_delta to a jiffie */
26#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
27
28static int tick_increase_min_delta(struct clock_event_device *dev)
29{
30 /* Nothing to do if we already reached the limit */
31 if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
32 return -ETIME;
33
34 if (dev->min_delta_ns < 5000)
35 dev->min_delta_ns = 5000;
36 else
37 dev->min_delta_ns += dev->min_delta_ns >> 1;
38
39 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
40 dev->min_delta_ns = MIN_DELTA_LIMIT;
41
42 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
43 dev->name ? dev->name : "?",
44 (unsigned long long) dev->min_delta_ns);
45 return 0;
46}
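
With HZ = 1000 the clamp is 1,000,000 ns, so repeated programming failures walk min_delta_ns through a bounded 1.5x sequence (illustrative arithmetic):

	/* 0 -> 5000 -> 7500 -> 11250 -> 16875 -> ... -> 1000000 (clamped)
	 * Once clamped, the next call returns -ETIME and the caller below
	 * clears next_event and gives up instead of spinning forever.
	 */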
47
25/** 48/**
26 * tick_program_event internal worker function 49 * tick_program_event internal worker function
27 */ 50 */
@@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
37 if (!ret || !force) 60 if (!ret || !force)
38 return ret; 61 return ret;
39 62
63 dev->retries++;
40 /* 64 /*
41 * We tried 2 times to program the device with the given 65 * We tried 3 times to program the device with the given
42 * min_delta_ns. If that's not working then we double it 66 * min_delta_ns. If that's not working then we increase it
43 * and emit a warning. 67 * and emit a warning.
44 */ 68 */
45 if (++i > 2) { 69 if (++i > 2) {
46 /* Increase the min. delta and try again */ 70 /* Increase the min. delta and try again */
47 if (!dev->min_delta_ns) 71 if (tick_increase_min_delta(dev)) {
48 dev->min_delta_ns = 5000; 72 /*
49 else 73 * Get out of the loop if min_delta_ns
50 dev->min_delta_ns += dev->min_delta_ns >> 1; 74 * hit the limit already. That's
51 75 * better than staying here forever.
52 printk(KERN_WARNING 76 *
53 "CE: %s increasing min_delta_ns to %llu nsec\n", 77 * We clear next_event so we have a
54 dev->name ? dev->name : "?", 78 * chance that the box survives.
55 (unsigned long long) dev->min_delta_ns << 1); 79 */
56 80 printk(KERN_WARNING
81 "CE: Reprogramming failure. Giving up\n");
82 dev->next_event.tv64 = KTIME_MAX;
83 return -ETIME;
84 }
57 i = 0; 85 i = 0;
58 } 86 }
59 87
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f992762d7f51..1d7b9bc1c034 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,14 +150,32 @@ static void tick_nohz_update_jiffies(ktime_t now)
150 touch_softlockup_watchdog(); 150 touch_softlockup_watchdog();
151} 151}
152 152
153/*
154 * Updates the per cpu time idle statistics counters
155 */
156static void
157update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
158{
159 ktime_t delta;
160
161 if (ts->idle_active) {
162 delta = ktime_sub(now, ts->idle_entrytime);
163 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
164 if (nr_iowait_cpu() > 0)
165 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
166 ts->idle_entrytime = now;
167 }
168
169 if (last_update_time)
170 *last_update_time = ktime_to_us(now);
171
172}
173
153static void tick_nohz_stop_idle(int cpu, ktime_t now) 174static void tick_nohz_stop_idle(int cpu, ktime_t now)
154{ 175{
155 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 176 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
156 ktime_t delta;
157 177
158 delta = ktime_sub(now, ts->idle_entrytime); 178 update_ts_time_stats(ts, now, NULL);
159 ts->idle_lastupdate = now;
160 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
161 ts->idle_active = 0; 179 ts->idle_active = 0;
162 180
163 sched_clock_idle_wakeup_event(0); 181 sched_clock_idle_wakeup_event(0);
@@ -165,20 +183,32 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
165 183
166static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 184static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
167{ 185{
168 ktime_t now, delta; 186 ktime_t now;
169 187
170 now = ktime_get(); 188 now = ktime_get();
171 if (ts->idle_active) { 189
172 delta = ktime_sub(now, ts->idle_entrytime); 190 update_ts_time_stats(ts, now, NULL);
173 ts->idle_lastupdate = now; 191
174 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
175 }
176 ts->idle_entrytime = now; 192 ts->idle_entrytime = now;
177 ts->idle_active = 1; 193 ts->idle_active = 1;
178 sched_clock_idle_sleep_event(); 194 sched_clock_idle_sleep_event();
179 return now; 195 return now;
180} 196}
181 197
198/**
199 * get_cpu_idle_time_us - get the total idle time of a cpu
200 * @cpu: CPU number to query
201 * @last_update_time: variable to store update time in
202 *
 203 * Return the cumulative idle time (since boot) for a given
204 * CPU, in microseconds. The idle time returned includes
205 * the iowait time (unlike what "top" and co report).
206 *
207 * This time is measured via accounting rather than sampling,
208 * and is as accurate as ktime_get() is.
209 *
210 * This function returns -1 if NOHZ is not enabled.
211 */
182u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 212u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
183{ 213{
184 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 214 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -186,15 +216,38 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
186 if (!tick_nohz_enabled) 216 if (!tick_nohz_enabled)
187 return -1; 217 return -1;
188 218
189 if (ts->idle_active) 219 update_ts_time_stats(ts, ktime_get(), last_update_time);
190 *last_update_time = ktime_to_us(ts->idle_lastupdate);
191 else
192 *last_update_time = ktime_to_us(ktime_get());
193 220
194 return ktime_to_us(ts->idle_sleeptime); 221 return ktime_to_us(ts->idle_sleeptime);
195} 222}
196EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 223EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
197 224
225/*
226 * get_cpu_iowait_time_us - get the total iowait time of a cpu
227 * @cpu: CPU number to query
228 * @last_update_time: variable to store update time in
229 *
 230 * Return the cumulative iowait time (since boot) for a given
231 * CPU, in microseconds.
232 *
233 * This time is measured via accounting rather than sampling,
234 * and is as accurate as ktime_get() is.
235 *
236 * This function returns -1 if NOHZ is not enabled.
237 */
238u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
239{
240 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
241
242 if (!tick_nohz_enabled)
243 return -1;
244
245 update_ts_time_stats(ts, ktime_get(), last_update_time);
246
247 return ktime_to_us(ts->iowait_sleeptime);
248}
249EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
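
A hedged in-kernel sketch of how a consumer might turn the accounting into an idle percentage over a sampling interval (all names are invented; assumes <linux/math64.h> for div64_u64):

	static u64 last_idle_us, last_wall_us;	/* hypothetical sample state */

	static unsigned int cpu_idle_percent(int cpu)
	{
		u64 wall_us, idle_us, didle, dwall;

		idle_us = get_cpu_idle_time_us(cpu, &wall_us);
		if (idle_us == (u64)-1)
			return 0;		/* NOHZ disabled */

		didle = idle_us - last_idle_us;
		dwall = wall_us - last_wall_us;
		last_idle_us = idle_us;
		last_wall_us = wall_us;
		return dwall ? div64_u64(100 * didle, dwall) : 0;
	}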
250
198/** 251/**
199 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 252 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
200 * 253 *
@@ -262,6 +315,9 @@ void tick_nohz_stop_sched_tick(int inidle)
262 goto end; 315 goto end;
263 } 316 }
264 317
318 if (nohz_ratelimit(cpu))
319 goto end;
320
265 ts->idle_calls++; 321 ts->idle_calls++;
266 /* Read jiffies and the time when jiffies were updated last */ 322 /* Read jiffies and the time when jiffies were updated last */
267 do { 323 do {
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index 12f5c55090be..ac38fbb176cc 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/timecompare.h> 20#include <linux/timecompare.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/slab.h>
22#include <linux/math64.h> 23#include <linux/math64.h>
23 24
24/* 25/*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 16736379a9ca..39f6177fafac 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -818,7 +818,8 @@ void update_wall_time(void)
818 shift = min(shift, maxshift); 818 shift = min(shift, maxshift);
819 while (offset >= timekeeper.cycle_interval) { 819 while (offset >= timekeeper.cycle_interval) {
820 offset = logarithmic_accumulation(offset, shift); 820 offset = logarithmic_accumulation(offset, shift);
 821 shift--; 821 if (offset < timekeeper.cycle_interval << shift)
822 shift--;
822 } 823 }
823 824
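
The effect of the added guard, traced with illustrative numbers (cycle_interval = 1000, initial shift = 5, so each pass can accumulate 1000<<5 = 32000 cycles):

	/* offset 70000: pass 1 consumes 32000 -> 38000; 38000 >= 32000,
	 *               so shift stays 5 and pass 2 consumes another 32000.
	 * offset  6000 after pass 2: 6000 < 32000, shift now drops 4, 3...
	 * Before the guard, shift dropped on every pass, so a large offset
	 * fell back to small chunks too early and the accumulation stopped
	 * being logarithmic.
	 */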
824 /* correct the clock when NTP error is too big */ 825 /* correct the clock when NTP error is too big */
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index bdfb8dd1050c..ab8f5e33fa92 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
176 P_ns(idle_waketime); 176 P_ns(idle_waketime);
177 P_ns(idle_exittime); 177 P_ns(idle_exittime);
178 P_ns(idle_sleeptime); 178 P_ns(idle_sleeptime);
179 P_ns(iowait_sleeptime);
179 P(last_jiffies); 180 P(last_jiffies);
180 P(next_jiffies); 181 P(next_jiffies);
181 P_ns(idle_expires); 182 P_ns(idle_expires);
@@ -228,6 +229,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
228 SEQ_printf(m, " event_handler: "); 229 SEQ_printf(m, " event_handler: ");
229 print_name_offset(m, dev->event_handler); 230 print_name_offset(m, dev->event_handler);
230 SEQ_printf(m, "\n"); 231 SEQ_printf(m, "\n");
232 SEQ_printf(m, " retries: %lu\n", dev->retries);
231} 233}
232 234
233static void timer_list_show_tickdevices(struct seq_file *m) 235static void timer_list_show_tickdevices(struct seq_file *m)
@@ -257,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v)
257 u64 now = ktime_to_ns(ktime_get()); 259 u64 now = ktime_to_ns(ktime_get());
258 int cpu; 260 int cpu;
259 261
260 SEQ_printf(m, "Timer List Version: v0.5\n"); 262 SEQ_printf(m, "Timer List Version: v0.6\n");
261 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 263 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
262 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 264 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
263 265
diff --git a/kernel/timer.c b/kernel/timer.c
index c61a7949387f..aeb6a54f2771 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_event.h> 40#include <linux/perf_event.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/slab.h>
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44#include <asm/unistd.h> 45#include <asm/unistd.h>
@@ -880,6 +881,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
880 if (base->running_timer == timer) 881 if (base->running_timer == timer)
881 goto out; 882 goto out;
882 883
884 timer_stats_timer_clear_start_info(timer);
883 ret = 0; 885 ret = 0;
884 if (timer_pending(timer)) { 886 if (timer_pending(timer)) {
885 detach_timer(timer, 1); 887 detach_timer(timer, 1);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 13e13d428cd3..8b1797c4545b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -44,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD
44 help 44 help
45 See Documentation/trace/ftrace-design.txt 45 See Documentation/trace/ftrace-design.txt
46 46
47config HAVE_HW_BRANCH_TRACER
48 bool
49
50config HAVE_SYSCALL_TRACEPOINTS 47config HAVE_SYSCALL_TRACEPOINTS
51 bool 48 bool
52 help 49 help
@@ -374,14 +371,6 @@ config STACK_TRACER
374 371
375 Say N if unsure. 372 Say N if unsure.
376 373
377config HW_BRANCH_TRACER
378 depends on HAVE_HW_BRANCH_TRACER
379 bool "Trace hw branches"
380 select GENERIC_TRACER
381 help
382 This tracer records all branches on the system in a circular
383 buffer, giving access to the last N branches for each cpu.
384
385config KMEMTRACE 374config KMEMTRACE
386 bool "Trace SLAB allocations" 375 bool "Trace SLAB allocations"
387 select GENERIC_TRACER 376 select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d00c6fe23f54..ffb1a5b0550e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o 41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
45obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 44obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
46obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 45obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
47obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 46obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
@@ -52,7 +51,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events.o
52obj-$(CONFIG_EVENT_TRACING) += trace_export.o 51obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 52obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54ifeq ($(CONFIG_PERF_EVENTS),y) 53ifeq ($(CONFIG_PERF_EVENTS),y)
55obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o 54obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
56endif 55endif
57obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
58obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 57obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 07f945a99430..b3bc91a3f510 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -21,6 +21,7 @@
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h>
24#include <linux/debugfs.h> 25#include <linux/debugfs.h>
25#include <linux/smp_lock.h> 26#include <linux/smp_lock.h>
26#include <linux/time.h> 27#include <linux/time.h>
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 83783579378f..32837e19e3bd 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -24,9 +24,11 @@
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/ftrace.h> 25#include <linux/ftrace.h>
26#include <linux/sysctl.h> 26#include <linux/sysctl.h>
27#include <linux/slab.h>
27#include <linux/ctype.h> 28#include <linux/ctype.h>
28#include <linux/list.h> 29#include <linux/list.h>
29#include <linux/hash.h> 30#include <linux/hash.h>
31#include <linux/rcupdate.h>
30 32
31#include <trace/events/sched.h> 33#include <trace/events/sched.h>
32 34
@@ -84,22 +86,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
84ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 86ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
85ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 87ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
86 88
87#ifdef CONFIG_FUNCTION_GRAPH_TRACER 89/*
88static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); 90 * Traverse the ftrace_list, invoking all entries. The reason that we
89#endif 91 * can use rcu_dereference_raw() is that elements removed from this list
90 92 * are simply leaked, so there is no need to interact with a grace-period
93 * mechanism. The rcu_dereference_raw() calls are needed to handle
94 * concurrent insertions into the ftrace_list.
95 *
96 * Silly Alpha and silly pointer-speculation compiler optimizations!
97 */
91static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 98static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
92{ 99{
93 struct ftrace_ops *op = ftrace_list; 100 struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/
94
95 /* in case someone actually ports this to alpha! */
96 read_barrier_depends();
97 101
98 while (op != &ftrace_list_end) { 102 while (op != &ftrace_list_end) {
99 /* silly alpha */
100 read_barrier_depends();
101 op->func(ip, parent_ip); 103 op->func(ip, parent_ip);
102 op = op->next; 104 op = rcu_dereference_raw(op->next); /*see above*/
103 }; 105 };
104} 106}
105 107
@@ -154,8 +156,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
154 * the ops->next pointer is valid before another CPU sees 156 * the ops->next pointer is valid before another CPU sees
155 * the ops pointer included into the ftrace_list. 157 * the ops pointer included into the ftrace_list.
156 */ 158 */
157 smp_wmb(); 159 rcu_assign_pointer(ftrace_list, ops);
158 ftrace_list = ops;
159 160
160 if (ftrace_enabled) { 161 if (ftrace_enabled) {
161 ftrace_func_t func; 162 ftrace_func_t func;
@@ -263,6 +264,7 @@ struct ftrace_profile {
263 unsigned long counter; 264 unsigned long counter;
264#ifdef CONFIG_FUNCTION_GRAPH_TRACER 265#ifdef CONFIG_FUNCTION_GRAPH_TRACER
265 unsigned long long time; 266 unsigned long long time;
267 unsigned long long time_squared;
266#endif 268#endif
267}; 269};
268 270
@@ -365,9 +367,9 @@ static int function_stat_headers(struct seq_file *m)
365{ 367{
366#ifdef CONFIG_FUNCTION_GRAPH_TRACER 368#ifdef CONFIG_FUNCTION_GRAPH_TRACER
367 seq_printf(m, " Function " 369 seq_printf(m, " Function "
368 "Hit Time Avg\n" 370 "Hit Time Avg s^2\n"
369 " -------- " 371 " -------- "
370 "--- ---- ---\n"); 372 "--- ---- --- ---\n");
371#else 373#else
372 seq_printf(m, " Function Hit\n" 374 seq_printf(m, " Function Hit\n"
373 " -------- ---\n"); 375 " -------- ---\n");
@@ -383,6 +385,7 @@ static int function_stat_show(struct seq_file *m, void *v)
383 static DEFINE_MUTEX(mutex); 385 static DEFINE_MUTEX(mutex);
384 static struct trace_seq s; 386 static struct trace_seq s;
385 unsigned long long avg; 387 unsigned long long avg;
388 unsigned long long stddev;
386#endif 389#endif
387 390
388 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 391 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -393,11 +396,25 @@ static int function_stat_show(struct seq_file *m, void *v)
393 avg = rec->time; 396 avg = rec->time;
394 do_div(avg, rec->counter); 397 do_div(avg, rec->counter);
395 398
399 /* Sample standard deviation (s^2) */
400 if (rec->counter <= 1)
401 stddev = 0;
402 else {
403 stddev = rec->time_squared - rec->counter * avg * avg;
404 /*
405 * Divide only 1000 for ns^2 -> us^2 conversion.
406 * trace_print_graph_duration will divide 1000 again.
407 */
408 do_div(stddev, (rec->counter - 1) * 1000);
409 }
410
396 mutex_lock(&mutex); 411 mutex_lock(&mutex);
397 trace_seq_init(&s); 412 trace_seq_init(&s);
398 trace_print_graph_duration(rec->time, &s); 413 trace_print_graph_duration(rec->time, &s);
399 trace_seq_puts(&s, " "); 414 trace_seq_puts(&s, " ");
400 trace_print_graph_duration(avg, &s); 415 trace_print_graph_duration(avg, &s);
416 trace_seq_puts(&s, " ");
417 trace_print_graph_duration(stddev, &s);
401 trace_print_seq(m, &s); 418 trace_print_seq(m, &s);
402 mutex_unlock(&mutex); 419 mutex_unlock(&mutex);
403#endif 420#endif
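
A quick numeric check of the identity the code relies on, sum((x - avg)^2) = sum(x^2) - n*avg^2 (illustrative values in us, keeping the /1000 scaling out of the way):

	/* samples 2, 4, 6 -> counter = 3, avg = 4
	 *   time_squared         = 4 + 16 + 36 = 56
	 *   counter * avg * avg  = 3 * 16      = 48
	 *   (56 - 48) / (3 - 1)  = 4, matching
	 *   ((2-4)^2 + (4-4)^2 + (6-4)^2) / 2  = 8 / 2 = 4
	 * In the code the times are in ns, hence the extra /1000 before
	 * trace_print_graph_duration divides by 1000 once more.
	 */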
@@ -649,6 +666,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
649 if (!stat->hash || !ftrace_profile_enabled) 666 if (!stat->hash || !ftrace_profile_enabled)
650 goto out; 667 goto out;
651 668
 669 /* If the calltime was zeroed, ignore it */
670 if (!trace->calltime)
671 goto out;
672
652 calltime = trace->rettime - trace->calltime; 673 calltime = trace->rettime - trace->calltime;
653 674
654 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { 675 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
@@ -667,8 +688,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
667 } 688 }
668 689
669 rec = ftrace_find_profiled_func(stat, trace->func); 690 rec = ftrace_find_profiled_func(stat, trace->func);
670 if (rec) 691 if (rec) {
671 rec->time += calltime; 692 rec->time += calltime;
693 rec->time_squared += calltime * calltime;
694 }
672 695
673 out: 696 out:
674 local_irq_restore(flags); 697 local_irq_restore(flags);
@@ -2276,6 +2299,8 @@ __setup("ftrace_filter=", set_ftrace_filter);
2276 2299
2277#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2300#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2278static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; 2301static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
2302static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
2303
2279static int __init set_graph_function(char *str) 2304static int __init set_graph_function(char *str)
2280{ 2305{
2281 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 2306 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -3209,8 +3234,7 @@ free:
3209} 3234}
3210 3235
3211static void 3236static void
3212ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, 3237ftrace_graph_probe_sched_switch(struct task_struct *prev, struct task_struct *next)
3213 struct task_struct *next)
3214{ 3238{
3215 unsigned long long timestamp; 3239 unsigned long long timestamp;
3216 int index; 3240 int index;
@@ -3336,11 +3360,11 @@ void unregister_ftrace_graph(void)
3336 goto out; 3360 goto out;
3337 3361
3338 ftrace_graph_active--; 3362 ftrace_graph_active--;
3339 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
3340 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 3363 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3341 ftrace_graph_entry = ftrace_graph_entry_stub; 3364 ftrace_graph_entry = ftrace_graph_entry_stub;
3342 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 3365 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
3343 unregister_pm_notifier(&ftrace_suspend_notifier); 3366 unregister_pm_notifier(&ftrace_suspend_notifier);
3367 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
3344 3368
3345 out: 3369 out:
3346 mutex_unlock(&ftrace_lock); 3370 mutex_unlock(&ftrace_lock);
@@ -3351,6 +3375,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3351{ 3375{
3352 /* Make sure we do not use the parent ret_stack */ 3376 /* Make sure we do not use the parent ret_stack */
3353 t->ret_stack = NULL; 3377 t->ret_stack = NULL;
3378 t->curr_ret_stack = -1;
3354 3379
3355 if (ftrace_graph_active) { 3380 if (ftrace_graph_active) {
3356 struct ftrace_ret_stack *ret_stack; 3381 struct ftrace_ret_stack *ret_stack;
@@ -3360,7 +3385,6 @@ void ftrace_graph_init_task(struct task_struct *t)
3360 GFP_KERNEL); 3385 GFP_KERNEL);
3361 if (!ret_stack) 3386 if (!ret_stack)
3362 return; 3387 return;
3363 t->curr_ret_stack = -1;
3364 atomic_set(&t->tracing_graph_pause, 0); 3388 atomic_set(&t->tracing_graph_pause, 0);
3365 atomic_set(&t->trace_overrun, 0); 3389 atomic_set(&t->trace_overrun, 0);
3366 t->ftrace_timestamp = 0; 3390 t->ftrace_timestamp = 0;
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 9f4f565b01e6..a22582a06161 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -9,7 +9,6 @@
9#include <linux/workqueue.h> 9#include <linux/workqueue.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/slab.h>
13 12
14#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
15#include <trace/events/power.h> 14#include <trace/events/power.h>
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8c1b2d290718..7f6059c5aa94 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -14,12 +14,14 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/percpu.h> 15#include <linux/percpu.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/hash.h> 19#include <linux/hash.h>
19#include <linux/list.h> 20#include <linux/list.h>
20#include <linux/cpu.h> 21#include <linux/cpu.h>
21#include <linux/fs.h> 22#include <linux/fs.h>
22 23
24#include <asm/local.h>
23#include "trace.h" 25#include "trace.h"
24 26
25/* 27/*
@@ -206,6 +208,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
206#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
207#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ 209#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
208 210
211#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
212# define RB_FORCE_8BYTE_ALIGNMENT 0
213# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
214#else
215# define RB_FORCE_8BYTE_ALIGNMENT 1
216# define RB_ARCH_ALIGNMENT 8U
217#endif
218
209/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 219/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
210#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 220#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
211 221
@@ -309,6 +319,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
309#define TS_MASK ((1ULL << TS_SHIFT) - 1) 319#define TS_MASK ((1ULL << TS_SHIFT) - 1)
310#define TS_DELTA_TEST (~TS_MASK) 320#define TS_DELTA_TEST (~TS_MASK)
311 321
322/* Flag when events were overwritten */
323#define RB_MISSED_EVENTS (1 << 31)
324/* Missed count stored at end */
325#define RB_MISSED_STORED (1 << 30)
326
312struct buffer_data_page { 327struct buffer_data_page {
313 u64 time_stamp; /* page time stamp */ 328 u64 time_stamp; /* page time stamp */
314 local_t commit; /* write committed index */ 329 local_t commit; /* write committed index */
@@ -328,6 +343,7 @@ struct buffer_page {
328 local_t write; /* index for next write */ 343 local_t write; /* index for next write */
329 unsigned read; /* index for next read */ 344 unsigned read; /* index for next read */
330 local_t entries; /* entries on this page */ 345 local_t entries; /* entries on this page */
346 unsigned long real_end; /* real end of data */
331 struct buffer_data_page *page; /* Actual data page */ 347 struct buffer_data_page *page; /* Actual data page */
332}; 348};
333 349
@@ -407,6 +423,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
407 (unsigned int)sizeof(field.commit), 423 (unsigned int)sizeof(field.commit),
408 (unsigned int)is_signed_type(long)); 424 (unsigned int)is_signed_type(long));
409 425
426 ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
427 "offset:%u;\tsize:%u;\tsigned:%u;\n",
428 (unsigned int)offsetof(typeof(field), commit),
429 1,
430 (unsigned int)is_signed_type(long));
431
410 ret = trace_seq_printf(s, "\tfield: char data;\t" 432 ret = trace_seq_printf(s, "\tfield: char data;\t"
411 "offset:%u;\tsize:%u;\tsigned:%u;\n", 433 "offset:%u;\tsize:%u;\tsigned:%u;\n",
412 (unsigned int)offsetof(typeof(field), data), 434 (unsigned int)offsetof(typeof(field), data),
@@ -430,6 +452,8 @@ struct ring_buffer_per_cpu {
430 struct buffer_page *tail_page; /* write to tail */ 452 struct buffer_page *tail_page; /* write to tail */
431 struct buffer_page *commit_page; /* committed pages */ 453 struct buffer_page *commit_page; /* committed pages */
432 struct buffer_page *reader_page; 454 struct buffer_page *reader_page;
455 unsigned long lost_events;
456 unsigned long last_overrun;
433 local_t commit_overrun; 457 local_t commit_overrun;
434 local_t overrun; 458 local_t overrun;
435 local_t entries; 459 local_t entries;
@@ -1200,18 +1224,19 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1200 1224
1201 for (i = 0; i < nr_pages; i++) { 1225 for (i = 0; i < nr_pages; i++) {
1202 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1226 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1203 return; 1227 goto out;
1204 p = cpu_buffer->pages->next; 1228 p = cpu_buffer->pages->next;
1205 bpage = list_entry(p, struct buffer_page, list); 1229 bpage = list_entry(p, struct buffer_page, list);
1206 list_del_init(&bpage->list); 1230 list_del_init(&bpage->list);
1207 free_buffer_page(bpage); 1231 free_buffer_page(bpage);
1208 } 1232 }
1209 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1233 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1210 return; 1234 goto out;
1211 1235
1212 rb_reset_cpu(cpu_buffer); 1236 rb_reset_cpu(cpu_buffer);
1213 rb_check_pages(cpu_buffer); 1237 rb_check_pages(cpu_buffer);
1214 1238
1239out:
1215 spin_unlock_irq(&cpu_buffer->reader_lock); 1240 spin_unlock_irq(&cpu_buffer->reader_lock);
1216} 1241}
1217 1242
@@ -1228,7 +1253,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1228 1253
1229 for (i = 0; i < nr_pages; i++) { 1254 for (i = 0; i < nr_pages; i++) {
1230 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1255 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
1231 return; 1256 goto out;
1232 p = pages->next; 1257 p = pages->next;
1233 bpage = list_entry(p, struct buffer_page, list); 1258 bpage = list_entry(p, struct buffer_page, list);
1234 list_del_init(&bpage->list); 1259 list_del_init(&bpage->list);
@@ -1237,6 +1262,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1237 rb_reset_cpu(cpu_buffer); 1262 rb_reset_cpu(cpu_buffer);
1238 rb_check_pages(cpu_buffer); 1263 rb_check_pages(cpu_buffer);
1239 1264
1265out:
1240 spin_unlock_irq(&cpu_buffer->reader_lock); 1266 spin_unlock_irq(&cpu_buffer->reader_lock);
1241} 1267}
1242 1268
@@ -1546,7 +1572,7 @@ rb_update_event(struct ring_buffer_event *event,
1546 1572
1547 case 0: 1573 case 0:
1548 length -= RB_EVNT_HDR_SIZE; 1574 length -= RB_EVNT_HDR_SIZE;
1549 if (length > RB_MAX_SMALL_DATA) 1575 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1550 event->array[0] = length; 1576 event->array[0] = length;
1551 else 1577 else
1552 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1578 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
@@ -1721,11 +1747,11 @@ static unsigned rb_calculate_event_length(unsigned length)
1721 if (!length) 1747 if (!length)
1722 length = 1; 1748 length = 1;
1723 1749
1724 if (length > RB_MAX_SMALL_DATA) 1750 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1725 length += sizeof(event.array[0]); 1751 length += sizeof(event.array[0]);
1726 1752
1727 length += RB_EVNT_HDR_SIZE; 1753 length += RB_EVNT_HDR_SIZE;
1728 length = ALIGN(length, RB_ALIGNMENT); 1754 length = ALIGN(length, RB_ARCH_ALIGNMENT);
1729 1755
1730 return length; 1756 return length;
1731} 1757}
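
Worked effect of the new alignment switch on a 6-byte payload (RB_EVNT_HDR_SIZE = 4 and a 4-byte array[0] length word are assumed from the surrounding code):

	/* force flag 0 (32-bit, or efficient unaligned access):
	 *   6 <= RB_MAX_SMALL_DATA, so length = 6 + 4 = 10,
	 *   ALIGN(10, 4)  -> 12-byte event, size packed into type_len.
	 * force flag 1 (64-bit without efficient unaligned access):
	 *   always spend the array[0] word: 6 + 4 + 4 = 14,
	 *   ALIGN(14, 8)  -> 16-byte event, keeping u64 fields aligned.
	 */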
@@ -1750,6 +1776,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1750 kmemcheck_annotate_bitfield(event, bitfield); 1776 kmemcheck_annotate_bitfield(event, bitfield);
1751 1777
1752 /* 1778 /*
 1779 * Save the original length to the metadata.
 1780 * This will be used by the reader to add the lost-event
 1781 * count.
1782 */
1783 tail_page->real_end = tail;
1784
1785 /*
1753 * If this event is bigger than the minimum size, then 1786 * If this event is bigger than the minimum size, then
1754 * we need to be careful that we don't subtract the 1787 * we need to be careful that we don't subtract the
1755 * write counter enough to allow another writer to slip 1788 * write counter enough to allow another writer to slip
@@ -1967,17 +2000,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1967 u64 *ts, u64 *delta) 2000 u64 *ts, u64 *delta)
1968{ 2001{
1969 struct ring_buffer_event *event; 2002 struct ring_buffer_event *event;
1970 static int once;
1971 int ret; 2003 int ret;
1972 2004
1973 if (unlikely(*delta > (1ULL << 59) && !once++)) { 2005 WARN_ONCE(*delta > (1ULL << 59),
1974 printk(KERN_WARNING "Delta way too big! %llu" 2006 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
1975 " ts=%llu write stamp = %llu\n", 2007 (unsigned long long)*delta,
1976 (unsigned long long)*delta, 2008 (unsigned long long)*ts,
1977 (unsigned long long)*ts, 2009 (unsigned long long)cpu_buffer->write_stamp);
1978 (unsigned long long)cpu_buffer->write_stamp);
1979 WARN_ON(1);
1980 }
1981 2010
1982 /* 2011 /*
 1983 * The delta is too big, we need to add a 2012
@@ -2232,12 +2261,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2232 if (ring_buffer_flags != RB_BUFFERS_ON) 2261 if (ring_buffer_flags != RB_BUFFERS_ON)
2233 return NULL; 2262 return NULL;
2234 2263
2235 if (atomic_read(&buffer->record_disabled))
2236 return NULL;
2237
2238 /* If we are tracing schedule, we don't want to recurse */ 2264 /* If we are tracing schedule, we don't want to recurse */
2239 resched = ftrace_preempt_disable(); 2265 resched = ftrace_preempt_disable();
2240 2266
2267 if (atomic_read(&buffer->record_disabled))
2268 goto out_nocheck;
2269
2241 if (trace_recursive_lock()) 2270 if (trace_recursive_lock())
2242 goto out_nocheck; 2271 goto out_nocheck;
2243 2272
@@ -2469,11 +2498,11 @@ int ring_buffer_write(struct ring_buffer *buffer,
2469 if (ring_buffer_flags != RB_BUFFERS_ON) 2498 if (ring_buffer_flags != RB_BUFFERS_ON)
2470 return -EBUSY; 2499 return -EBUSY;
2471 2500
2472 if (atomic_read(&buffer->record_disabled))
2473 return -EBUSY;
2474
2475 resched = ftrace_preempt_disable(); 2501 resched = ftrace_preempt_disable();
2476 2502
2503 if (atomic_read(&buffer->record_disabled))
2504 goto out;
2505
2477 cpu = raw_smp_processor_id(); 2506 cpu = raw_smp_processor_id();
2478 2507
2479 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2508 if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -2541,7 +2570,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
2541 * @buffer: The ring buffer to enable writes 2570 * @buffer: The ring buffer to enable writes
2542 * 2571 *
2543 * Note, multiple disables will need the same number of enables 2572 * Note, multiple disables will need the same number of enables
2544 * to truely enable the writing (much like preempt_disable). 2573 * to truly enable the writing (much like preempt_disable).
2545 */ 2574 */
2546void ring_buffer_record_enable(struct ring_buffer *buffer) 2575void ring_buffer_record_enable(struct ring_buffer *buffer)
2547{ 2576{
@@ -2577,7 +2606,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
2577 * @cpu: The CPU to enable. 2606 * @cpu: The CPU to enable.
2578 * 2607 *
2579 * Note, multiple disables will need the same number of enables 2608 * Note, multiple disables will need the same number of enables
2580 * to truely enable the writing (much like preempt_disable). 2609 * to truly enable the writing (much like preempt_disable).
2581 */ 2610 */
2582void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) 2611void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2583{ 2612{
@@ -2826,6 +2855,7 @@ static struct buffer_page *
2826rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 2855rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2827{ 2856{
2828 struct buffer_page *reader = NULL; 2857 struct buffer_page *reader = NULL;
2858 unsigned long overwrite;
2829 unsigned long flags; 2859 unsigned long flags;
2830 int nr_loops = 0; 2860 int nr_loops = 0;
2831 int ret; 2861 int ret;
@@ -2867,6 +2897,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2867 local_set(&cpu_buffer->reader_page->write, 0); 2897 local_set(&cpu_buffer->reader_page->write, 0);
2868 local_set(&cpu_buffer->reader_page->entries, 0); 2898 local_set(&cpu_buffer->reader_page->entries, 0);
2869 local_set(&cpu_buffer->reader_page->page->commit, 0); 2899 local_set(&cpu_buffer->reader_page->page->commit, 0);
2900 cpu_buffer->reader_page->real_end = 0;
2870 2901
2871 spin: 2902 spin:
2872 /* 2903 /*
@@ -2887,6 +2918,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2887 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); 2918 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2888 2919
2889 /* 2920 /*
2921 * We want to make sure we read the overruns after we set up our
2922 * pointers to the next object. The writer side does a
2923 * cmpxchg to cross pages which acts as the mb on the writer
2924 * side. Note, the reader will constantly fail the swap
2925 * while the writer is updating the pointers, so this
2926 * guarantees that the overwrite recorded here is the one we
2927 * want to compare with the last_overrun.
2928 */
2929 smp_mb();
2930 overwrite = local_read(&(cpu_buffer->overrun));
2931
2932 /*
2890 * Here's the tricky part. 2933 * Here's the tricky part.
2891 * 2934 *
2892 * We need to move the pointer past the header page. 2935 * We need to move the pointer past the header page.
@@ -2917,6 +2960,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2917 cpu_buffer->reader_page = reader; 2960 cpu_buffer->reader_page = reader;
2918 rb_reset_reader_page(cpu_buffer); 2961 rb_reset_reader_page(cpu_buffer);
2919 2962
2963 if (overwrite != cpu_buffer->last_overrun) {
2964 cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
2965 cpu_buffer->last_overrun = overwrite;
2966 }
2967
2920 goto again; 2968 goto again;
2921 2969
2922 out: 2970 out:
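The smp_mb() pairs with the cmpxchg the writer performs when it crosses pages, so the overrun value sampled here is consistent with the page the reader has just claimed. The accounting itself is a running difference; a standalone model, assuming overrun only ever increases and relying on unsigned wraparound to keep the subtraction correct:

static unsigned long last_overrun;	/* value seen at the previous swap */
static unsigned long lost_events;	/* new losses to report to the reader */

static void account_lost(unsigned long overwrite)
{
	if (overwrite != last_overrun) {
		lost_events = overwrite - last_overrun;	/* wrap-safe */
		last_overrun = overwrite;
	}
}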
@@ -2993,8 +3041,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2993 rb_advance_iter(iter); 3041 rb_advance_iter(iter);
2994} 3042}
2995 3043
3044static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
3045{
3046 return cpu_buffer->lost_events;
3047}
3048
2996static struct ring_buffer_event * 3049static struct ring_buffer_event *
2997rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) 3050rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3051 unsigned long *lost_events)
2998{ 3052{
2999 struct ring_buffer_event *event; 3053 struct ring_buffer_event *event;
3000 struct buffer_page *reader; 3054 struct buffer_page *reader;
@@ -3046,6 +3100,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
3046 ring_buffer_normalize_time_stamp(cpu_buffer->buffer, 3100 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3047 cpu_buffer->cpu, ts); 3101 cpu_buffer->cpu, ts);
3048 } 3102 }
3103 if (lost_events)
3104 *lost_events = rb_lost_events(cpu_buffer);
3049 return event; 3105 return event;
3050 3106
3051 default: 3107 default:
@@ -3156,12 +3212,14 @@ static inline int rb_ok_to_lock(void)
3156 * @buffer: The ring buffer to read 3212 * @buffer: The ring buffer to read
 3157 * @cpu: The cpu to peek at 3213
3158 * @ts: The timestamp counter of this event. 3214 * @ts: The timestamp counter of this event.
3215 * @lost_events: a variable to store if events were lost (may be NULL)
3159 * 3216 *
3160 * This will return the event that will be read next, but does 3217 * This will return the event that will be read next, but does
3161 * not consume the data. 3218 * not consume the data.
3162 */ 3219 */
3163struct ring_buffer_event * 3220struct ring_buffer_event *
3164ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 3221ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3222 unsigned long *lost_events)
3165{ 3223{
3166 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 3224 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
3167 struct ring_buffer_event *event; 3225 struct ring_buffer_event *event;
@@ -3176,7 +3234,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3176 local_irq_save(flags); 3234 local_irq_save(flags);
3177 if (dolock) 3235 if (dolock)
3178 spin_lock(&cpu_buffer->reader_lock); 3236 spin_lock(&cpu_buffer->reader_lock);
3179 event = rb_buffer_peek(cpu_buffer, ts); 3237 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3180 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3238 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3181 rb_advance_reader(cpu_buffer); 3239 rb_advance_reader(cpu_buffer);
3182 if (dolock) 3240 if (dolock)
@@ -3218,13 +3276,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3218/** 3276/**
3219 * ring_buffer_consume - return an event and consume it 3277 * ring_buffer_consume - return an event and consume it
3220 * @buffer: The ring buffer to get the next event from 3278 * @buffer: The ring buffer to get the next event from
3279 * @cpu: the cpu to read the buffer from
3280 * @ts: a variable to store the timestamp (may be NULL)
3281 * @lost_events: a variable to store if events were lost (may be NULL)
3221 * 3282 *
3222 * Returns the next event in the ring buffer, and that event is consumed. 3283 * Returns the next event in the ring buffer, and that event is consumed.
3223 * Meaning, that sequential reads will keep returning a different event, 3284 * Meaning, that sequential reads will keep returning a different event,
3224 * and eventually empty the ring buffer if the producer is slower. 3285 * and eventually empty the ring buffer if the producer is slower.
3225 */ 3286 */
3226struct ring_buffer_event * 3287struct ring_buffer_event *
3227ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 3288ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3289 unsigned long *lost_events)
3228{ 3290{
3229 struct ring_buffer_per_cpu *cpu_buffer; 3291 struct ring_buffer_per_cpu *cpu_buffer;
3230 struct ring_buffer_event *event = NULL; 3292 struct ring_buffer_event *event = NULL;
@@ -3245,9 +3307,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3245 if (dolock) 3307 if (dolock)
3246 spin_lock(&cpu_buffer->reader_lock); 3308 spin_lock(&cpu_buffer->reader_lock);
3247 3309
3248 event = rb_buffer_peek(cpu_buffer, ts); 3310 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3249 if (event) 3311 if (event) {
3312 cpu_buffer->lost_events = 0;
3250 rb_advance_reader(cpu_buffer); 3313 rb_advance_reader(cpu_buffer);
3314 }
3251 3315
3252 if (dolock) 3316 if (dolock)
3253 spin_unlock(&cpu_buffer->reader_lock); 3317 spin_unlock(&cpu_buffer->reader_lock);
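With the new out-parameter, a consumer learns how many events were overwritten ahead of the one it is about to read, and passing NULL preserves the old behaviour. A hypothetical drain loop over the extended API:

#include <linux/kernel.h>
#include <linux/ring_buffer.h>

static void drain_cpu(struct ring_buffer *buffer, int cpu)
{
	struct ring_buffer_event *event;
	unsigned long lost;
	u64 ts;

	while ((event = ring_buffer_consume(buffer, cpu, &ts, &lost))) {
		if (lost)
			pr_info("cpu%d: %lu events lost before this one\n",
				cpu, lost);
		/* handle ring_buffer_event_data(event) ... */
	}
}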
@@ -3264,23 +3328,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3264EXPORT_SYMBOL_GPL(ring_buffer_consume); 3328EXPORT_SYMBOL_GPL(ring_buffer_consume);
3265 3329
3266/** 3330/**
3267 * ring_buffer_read_start - start a non consuming read of the buffer 3331 * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
3268 * @buffer: The ring buffer to read from 3332 * @buffer: The ring buffer to read from
3269 * @cpu: The cpu buffer to iterate over 3333 * @cpu: The cpu buffer to iterate over
3270 * 3334 *
3271 * This starts up an iteration through the buffer. It also disables 3335 * This performs the initial preparations necessary to iterate
3272 * the recording to the buffer until the reading is finished. 3336 * through the buffer. Memory is allocated, buffer recording
3273 * This prevents the reading from being corrupted. This is not 3337 * is disabled, and the iterator pointer is returned to the caller.
3274 * a consuming read, so a producer is not expected.
3275 * 3338 *
 3276 * Must be paired with ring_buffer_finish. 3339 * Disabling buffer recording prevents the reading from being
3340 * corrupted. This is not a consuming read, so a producer is not
3341 * expected.
3342 *
3343 * After a sequence of ring_buffer_read_prepare calls, the user is
3344 * expected to make at least one call to ring_buffer_prepare_sync.
3345 * Afterwards, ring_buffer_read_start is invoked to get things going
3346 * for real.
3347 *
3348 * This overall must be paired with ring_buffer_finish.
3277 */ 3349 */
3278struct ring_buffer_iter * 3350struct ring_buffer_iter *
3279ring_buffer_read_start(struct ring_buffer *buffer, int cpu) 3351ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3280{ 3352{
3281 struct ring_buffer_per_cpu *cpu_buffer; 3353 struct ring_buffer_per_cpu *cpu_buffer;
3282 struct ring_buffer_iter *iter; 3354 struct ring_buffer_iter *iter;
3283 unsigned long flags;
3284 3355
3285 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3356 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3286 return NULL; 3357 return NULL;
@@ -3294,15 +3365,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3294 iter->cpu_buffer = cpu_buffer; 3365 iter->cpu_buffer = cpu_buffer;
3295 3366
3296 atomic_inc(&cpu_buffer->record_disabled); 3367 atomic_inc(&cpu_buffer->record_disabled);
3368
3369 return iter;
3370}
3371EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
3372
3373/**
3374 * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
3375 *
3376 * All previously invoked ring_buffer_read_prepare calls to prepare
 3377 * iterators will be synchronized. Afterwards, ring_buffer_read_start
3378 * calls on those iterators are allowed.
3379 */
3380void
3381ring_buffer_read_prepare_sync(void)
3382{
3297 synchronize_sched(); 3383 synchronize_sched();
3384}
3385EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
3386
3387/**
3388 * ring_buffer_read_start - start a non consuming read of the buffer
3389 * @iter: The iterator returned by ring_buffer_read_prepare
3390 *
3391 * This finalizes the startup of an iteration through the buffer.
3392 * The iterator comes from a call to ring_buffer_read_prepare and
3393 * an intervening ring_buffer_read_prepare_sync must have been
3394 * performed.
3395 *
3396 * Must be paired with ring_buffer_finish.
3397 */
3398void
3399ring_buffer_read_start(struct ring_buffer_iter *iter)
3400{
3401 struct ring_buffer_per_cpu *cpu_buffer;
3402 unsigned long flags;
3403
3404 if (!iter)
3405 return;
3406
3407 cpu_buffer = iter->cpu_buffer;
3298 3408
3299 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3409 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3300 arch_spin_lock(&cpu_buffer->lock); 3410 arch_spin_lock(&cpu_buffer->lock);
3301 rb_iter_reset(iter); 3411 rb_iter_reset(iter);
3302 arch_spin_unlock(&cpu_buffer->lock); 3412 arch_spin_unlock(&cpu_buffer->lock);
3303 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3413 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3304
3305 return iter;
3306} 3414}
3307EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3415EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3308 3416
@@ -3396,6 +3504,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3396 cpu_buffer->write_stamp = 0; 3504 cpu_buffer->write_stamp = 0;
3397 cpu_buffer->read_stamp = 0; 3505 cpu_buffer->read_stamp = 0;
3398 3506
3507 cpu_buffer->lost_events = 0;
3508 cpu_buffer->last_overrun = 0;
3509
3399 rb_head_page_activate(cpu_buffer); 3510 rb_head_page_activate(cpu_buffer);
3400} 3511}
3401 3512
@@ -3671,6 +3782,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3671 struct ring_buffer_event *event; 3782 struct ring_buffer_event *event;
3672 struct buffer_data_page *bpage; 3783 struct buffer_data_page *bpage;
3673 struct buffer_page *reader; 3784 struct buffer_page *reader;
3785 unsigned long missed_events;
3674 unsigned long flags; 3786 unsigned long flags;
3675 unsigned int commit; 3787 unsigned int commit;
3676 unsigned int read; 3788 unsigned int read;
@@ -3707,6 +3819,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3707 read = reader->read; 3819 read = reader->read;
3708 commit = rb_page_commit(reader); 3820 commit = rb_page_commit(reader);
3709 3821
3822 /* Check if any events were dropped */
3823 missed_events = cpu_buffer->lost_events;
3824
3710 /* 3825 /*
3711 * If this page has been partially read or 3826 * If this page has been partially read or
3712 * if len is not big enough to read the rest of the page or 3827 * if len is not big enough to read the rest of the page or
@@ -3767,9 +3882,35 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3767 local_set(&reader->entries, 0); 3882 local_set(&reader->entries, 0);
3768 reader->read = 0; 3883 reader->read = 0;
3769 *data_page = bpage; 3884 *data_page = bpage;
3885
3886 /*
 3887 * Use the real_end for the data size.
3888 * This gives us a chance to store the lost events
3889 * on the page.
3890 */
3891 if (reader->real_end)
3892 local_set(&bpage->commit, reader->real_end);
3770 } 3893 }
3771 ret = read; 3894 ret = read;
3772 3895
3896 cpu_buffer->lost_events = 0;
3897 /*
3898 * Set a flag in the commit field if we lost events
3899 */
3900 if (missed_events) {
3901 commit = local_read(&bpage->commit);
3902
3903 /* If there is room at the end of the page to save the
3904 * missed events, then record it there.
3905 */
3906 if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
3907 memcpy(&bpage->data[commit], &missed_events,
3908 sizeof(missed_events));
3909 local_add(RB_MISSED_STORED, &bpage->commit);
3910 }
3911 local_add(RB_MISSED_EVENTS, &bpage->commit);
3912 }
3913
3773 out_unlock: 3914 out_unlock:
3774 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3915 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3775 3916
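On the reading side, the dropped-event count can be recovered from the commit word of a page filled by ring_buffer_read_page(): the low bits hold the data size and the top bits carry the two flags. struct buffer_data_page and the flag bit positions are private to ring_buffer.c, so treat this as a model rather than callable code; the bit values below are assumptions consistent with the 0xfffff size mask the benchmark applies further down:

#include <linux/kernel.h>
#include <linux/string.h>
#include <asm/local.h>

#define RB_MISSED_EVENTS	(1UL << 31)	/* assumed: events were dropped */
#define RB_MISSED_STORED	(1UL << 30)	/* assumed: count stored in page */

static void inspect_page(struct buffer_data_page *bpage)
{
	unsigned long commit = local_read(&bpage->commit);
	unsigned long size = commit & 0xfffff;	/* data bytes, flags masked */

	if (commit & RB_MISSED_EVENTS) {
		unsigned long missed = 0;

		/* the count, when it fit, sits just past the data */
		if (commit & RB_MISSED_STORED)
			memcpy(&missed, &bpage->data[size], sizeof(missed));
		pr_info("page dropped %lu events\n", missed);
	}
}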
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index b2477caf09c2..302f8a614635 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -8,6 +8,7 @@
8#include <linux/kthread.h> 8#include <linux/kthread.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <asm/local.h>
11 12
12struct rb_page { 13struct rb_page {
13 u64 ts; 14 u64 ts;
@@ -80,7 +81,7 @@ static enum event_status read_event(int cpu)
80 int *entry; 81 int *entry;
81 u64 ts; 82 u64 ts;
82 83
83 event = ring_buffer_consume(buffer, cpu, &ts); 84 event = ring_buffer_consume(buffer, cpu, &ts, NULL);
84 if (!event) 85 if (!event)
85 return EVENT_DROPPED; 86 return EVENT_DROPPED;
86 87
@@ -112,7 +113,8 @@ static enum event_status read_page(int cpu)
112 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); 113 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
113 if (ret >= 0) { 114 if (ret >= 0) {
114 rpage = bpage; 115 rpage = bpage;
 115 commit = local_read(&rpage->commit); 116 /* The commit may have missed-event flags set, clear them */
117 commit = local_read(&rpage->commit) & 0xfffff;
116 for (i = 0; i < commit && !kill_test; i += inc) { 118 for (i = 0; i < commit && !kill_test; i += inc) {
117 119
118 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { 120 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 032c57ca6502..756d7283318b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -33,10 +33,10 @@
33#include <linux/kdebug.h> 33#include <linux/kdebug.h>
34#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/rwsem.h> 35#include <linux/rwsem.h>
36#include <linux/slab.h>
36#include <linux/ctype.h> 37#include <linux/ctype.h>
37#include <linux/init.h> 38#include <linux/init.h>
38#include <linux/poll.h> 39#include <linux/poll.h>
39#include <linux/gfp.h>
40#include <linux/fs.h> 40#include <linux/fs.h>
41 41
42#include "trace.h" 42#include "trace.h"
@@ -92,12 +92,12 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled);
92static inline void ftrace_disable_cpu(void) 92static inline void ftrace_disable_cpu(void)
93{ 93{
94 preempt_disable(); 94 preempt_disable();
95 __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); 95 __this_cpu_inc(ftrace_cpu_disabled);
96} 96}
97 97
98static inline void ftrace_enable_cpu(void) 98static inline void ftrace_enable_cpu(void)
99{ 99{
100 __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); 100 __this_cpu_dec(ftrace_cpu_disabled);
101 preempt_enable(); 101 preempt_enable();
102} 102}
103 103
@@ -117,9 +117,12 @@ static cpumask_var_t __read_mostly tracing_buffer_mask;
117 * 117 *
 118 * It defaults to off, but you can enable it either by specifying 118
 119 * "ftrace_dump_on_oops" on the kernel command line, or by setting 119
120 * /proc/sys/kernel/ftrace_dump_on_oops to true. 120 * /proc/sys/kernel/ftrace_dump_on_oops
121 * Set 1 if you want to dump buffers of all CPUs
122 * Set 2 if you want to dump the buffer of the CPU that triggered oops
121 */ 123 */
122int ftrace_dump_on_oops; 124
125enum ftrace_dump_mode ftrace_dump_on_oops;
123 126
124static int tracing_set_tracer(const char *buf); 127static int tracing_set_tracer(const char *buf);
125 128
@@ -139,8 +142,17 @@ __setup("ftrace=", set_cmdline_ftrace);
139 142
140static int __init set_ftrace_dump_on_oops(char *str) 143static int __init set_ftrace_dump_on_oops(char *str)
141{ 144{
142 ftrace_dump_on_oops = 1; 145 if (*str++ != '=' || !*str) {
143 return 1; 146 ftrace_dump_on_oops = DUMP_ALL;
147 return 1;
148 }
149
150 if (!strcmp("orig_cpu", str)) {
151 ftrace_dump_on_oops = DUMP_ORIG;
152 return 1;
153 }
154
155 return 0;
144} 156}
145__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 157__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
146 158
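The handler shows the __setup() convention this file relies on: the callback receives whatever followed the matched prefix, so a bare "ftrace_dump_on_oops" arrives as an empty string while "ftrace_dump_on_oops=orig_cpu" arrives as "=orig_cpu"; returning 1 claims the option and 0 rejects it. A minimal sketch with a hypothetical parameter name:

#include <linux/init.h>
#include <linux/string.h>

static int __init my_flag_setup(char *str)
{
	if (*str++ != '=' || !*str)
		return 1;		/* bare flag: take the default */
	if (!strcmp("orig_cpu", str))
		return 1;		/* recognized value */
	return 0;			/* unknown value: let the kernel complain */
}
__setup("my_flag", my_flag_setup);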
@@ -374,6 +386,21 @@ static int __init set_buf_size(char *str)
374} 386}
375__setup("trace_buf_size=", set_buf_size); 387__setup("trace_buf_size=", set_buf_size);
376 388
389static int __init set_tracing_thresh(char *str)
390{
 391 unsigned long threshold;
392 int ret;
393
394 if (!str)
395 return 0;
 396 ret = strict_strtoul(str, 0, &threshold);
397 if (ret < 0)
398 return 0;
 399 tracing_thresh = threshold * 1000;
400 return 1;
401}
402__setup("tracing_thresh=", set_tracing_thresh);
403
377unsigned long nsecs_to_usecs(unsigned long nsecs) 404unsigned long nsecs_to_usecs(unsigned long nsecs)
378{ 405{
379 return nsecs / 1000; 406 return nsecs / 1000;
@@ -579,9 +606,10 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
579static arch_spinlock_t ftrace_max_lock = 606static arch_spinlock_t ftrace_max_lock =
580 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 607 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
581 608
609unsigned long __read_mostly tracing_thresh;
610
582#ifdef CONFIG_TRACER_MAX_TRACE 611#ifdef CONFIG_TRACER_MAX_TRACE
583unsigned long __read_mostly tracing_max_latency; 612unsigned long __read_mostly tracing_max_latency;
584unsigned long __read_mostly tracing_thresh;
585 613
586/* 614/*
587 * Copy the new maximum trace into the separate maximum-trace 615 * Copy the new maximum trace into the separate maximum-trace
@@ -592,7 +620,7 @@ static void
592__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 620__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
593{ 621{
594 struct trace_array_cpu *data = tr->data[cpu]; 622 struct trace_array_cpu *data = tr->data[cpu];
595 struct trace_array_cpu *max_data = tr->data[cpu]; 623 struct trace_array_cpu *max_data;
596 624
597 max_tr.cpu = cpu; 625 max_tr.cpu = cpu;
598 max_tr.time_start = data->preempt_timestamp; 626 max_tr.time_start = data->preempt_timestamp;
@@ -602,7 +630,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
602 max_data->critical_start = data->critical_start; 630 max_data->critical_start = data->critical_start;
603 max_data->critical_end = data->critical_end; 631 max_data->critical_end = data->critical_end;
604 632
605 memcpy(data->comm, tsk->comm, TASK_COMM_LEN); 633 memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
606 max_data->pid = tsk->pid; 634 max_data->pid = tsk->pid;
607 max_data->uid = task_uid(tsk); 635 max_data->uid = task_uid(tsk);
608 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; 636 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
@@ -824,10 +852,10 @@ out:
824 mutex_unlock(&trace_types_lock); 852 mutex_unlock(&trace_types_lock);
825} 853}
826 854
827static void __tracing_reset(struct trace_array *tr, int cpu) 855static void __tracing_reset(struct ring_buffer *buffer, int cpu)
828{ 856{
829 ftrace_disable_cpu(); 857 ftrace_disable_cpu();
830 ring_buffer_reset_cpu(tr->buffer, cpu); 858 ring_buffer_reset_cpu(buffer, cpu);
831 ftrace_enable_cpu(); 859 ftrace_enable_cpu();
832} 860}
833 861
@@ -839,7 +867,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
839 867
840 /* Make sure all commits have finished */ 868 /* Make sure all commits have finished */
841 synchronize_sched(); 869 synchronize_sched();
842 __tracing_reset(tr, cpu); 870 __tracing_reset(buffer, cpu);
843 871
844 ring_buffer_record_enable(buffer); 872 ring_buffer_record_enable(buffer);
845} 873}
@@ -857,7 +885,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
857 tr->time_start = ftrace_now(tr->cpu); 885 tr->time_start = ftrace_now(tr->cpu);
858 886
859 for_each_online_cpu(cpu) 887 for_each_online_cpu(cpu)
860 __tracing_reset(tr, cpu); 888 __tracing_reset(buffer, cpu);
861 889
862 ring_buffer_record_enable(buffer); 890 ring_buffer_record_enable(buffer);
863} 891}
@@ -934,6 +962,8 @@ void tracing_start(void)
934 goto out; 962 goto out;
935 } 963 }
936 964
965 /* Prevent the buffers from switching */
966 arch_spin_lock(&ftrace_max_lock);
937 967
938 buffer = global_trace.buffer; 968 buffer = global_trace.buffer;
939 if (buffer) 969 if (buffer)
@@ -943,6 +973,8 @@ void tracing_start(void)
943 if (buffer) 973 if (buffer)
944 ring_buffer_record_enable(buffer); 974 ring_buffer_record_enable(buffer);
945 975
976 arch_spin_unlock(&ftrace_max_lock);
977
946 ftrace_start(); 978 ftrace_start();
947 out: 979 out:
948 spin_unlock_irqrestore(&tracing_start_lock, flags); 980 spin_unlock_irqrestore(&tracing_start_lock, flags);
@@ -964,6 +996,9 @@ void tracing_stop(void)
964 if (trace_stop_count++) 996 if (trace_stop_count++)
965 goto out; 997 goto out;
966 998
999 /* Prevent the buffers from switching */
1000 arch_spin_lock(&ftrace_max_lock);
1001
967 buffer = global_trace.buffer; 1002 buffer = global_trace.buffer;
968 if (buffer) 1003 if (buffer)
969 ring_buffer_record_disable(buffer); 1004 ring_buffer_record_disable(buffer);
@@ -972,6 +1007,8 @@ void tracing_stop(void)
972 if (buffer) 1007 if (buffer)
973 ring_buffer_record_disable(buffer); 1008 ring_buffer_record_disable(buffer);
974 1009
1010 arch_spin_unlock(&ftrace_max_lock);
1011
975 out: 1012 out:
976 spin_unlock_irqrestore(&tracing_start_lock, flags); 1013 spin_unlock_irqrestore(&tracing_start_lock, flags);
977} 1014}
@@ -1166,7 +1203,7 @@ trace_function(struct trace_array *tr,
1166 struct ftrace_entry *entry; 1203 struct ftrace_entry *entry;
1167 1204
1168 /* If we are reading the ring buffer, don't trace */ 1205 /* If we are reading the ring buffer, don't trace */
1169 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 1206 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
1170 return; 1207 return;
1171 1208
1172 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), 1209 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1259,6 +1296,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1259 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1296 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1260 return; 1297 return;
1261 1298
1299 /*
 1300 * NMIs cannot handle page faults, even with fixups.
 1301 * Saving the user stack can (and often does) fault.
1302 */
1303 if (unlikely(in_nmi()))
1304 return;
1305
1262 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1306 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1263 sizeof(*entry), flags, pc); 1307 sizeof(*entry), flags, pc);
1264 if (!event) 1308 if (!event)
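The guard added above is a general rule: any path that can fault, and copying a user stack frequently does, must bail out in NMI context, because the page-fault handler cannot run under an NMI even with exception fixups. A hypothetical helper observing the same rule:

#include <linux/hardirq.h>
#include <linux/uaccess.h>

static int read_user_word(const void __user *addr, unsigned long *val)
{
	if (unlikely(in_nmi()))
		return -EFAULT;		/* never risk a fault from NMI */

	return __copy_from_user_inatomic(val, addr, sizeof(*val)) ?
		-EFAULT : 0;
}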
@@ -1513,7 +1557,8 @@ static void trace_iterator_increment(struct trace_iterator *iter)
1513} 1557}
1514 1558
1515static struct trace_entry * 1559static struct trace_entry *
1516peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) 1560peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1561 unsigned long *lost_events)
1517{ 1562{
1518 struct ring_buffer_event *event; 1563 struct ring_buffer_event *event;
1519 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; 1564 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
@@ -1524,7 +1569,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1524 if (buf_iter) 1569 if (buf_iter)
1525 event = ring_buffer_iter_peek(buf_iter, ts); 1570 event = ring_buffer_iter_peek(buf_iter, ts);
1526 else 1571 else
1527 event = ring_buffer_peek(iter->tr->buffer, cpu, ts); 1572 event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
1573 lost_events);
1528 1574
1529 ftrace_enable_cpu(); 1575 ftrace_enable_cpu();
1530 1576
@@ -1532,10 +1578,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1532} 1578}
1533 1579
1534static struct trace_entry * 1580static struct trace_entry *
1535__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) 1581__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1582 unsigned long *missing_events, u64 *ent_ts)
1536{ 1583{
1537 struct ring_buffer *buffer = iter->tr->buffer; 1584 struct ring_buffer *buffer = iter->tr->buffer;
1538 struct trace_entry *ent, *next = NULL; 1585 struct trace_entry *ent, *next = NULL;
1586 unsigned long lost_events = 0, next_lost = 0;
1539 int cpu_file = iter->cpu_file; 1587 int cpu_file = iter->cpu_file;
1540 u64 next_ts = 0, ts; 1588 u64 next_ts = 0, ts;
1541 int next_cpu = -1; 1589 int next_cpu = -1;
@@ -1548,7 +1596,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1548 if (cpu_file > TRACE_PIPE_ALL_CPU) { 1596 if (cpu_file > TRACE_PIPE_ALL_CPU) {
1549 if (ring_buffer_empty_cpu(buffer, cpu_file)) 1597 if (ring_buffer_empty_cpu(buffer, cpu_file))
1550 return NULL; 1598 return NULL;
1551 ent = peek_next_entry(iter, cpu_file, ent_ts); 1599 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
1552 if (ent_cpu) 1600 if (ent_cpu)
1553 *ent_cpu = cpu_file; 1601 *ent_cpu = cpu_file;
1554 1602
@@ -1560,7 +1608,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1560 if (ring_buffer_empty_cpu(buffer, cpu)) 1608 if (ring_buffer_empty_cpu(buffer, cpu))
1561 continue; 1609 continue;
1562 1610
1563 ent = peek_next_entry(iter, cpu, &ts); 1611 ent = peek_next_entry(iter, cpu, &ts, &lost_events);
1564 1612
1565 /* 1613 /*
1566 * Pick the entry with the smallest timestamp: 1614 * Pick the entry with the smallest timestamp:
@@ -1569,6 +1617,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1569 next = ent; 1617 next = ent;
1570 next_cpu = cpu; 1618 next_cpu = cpu;
1571 next_ts = ts; 1619 next_ts = ts;
1620 next_lost = lost_events;
1572 } 1621 }
1573 } 1622 }
1574 1623
@@ -1578,6 +1627,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1578 if (ent_ts) 1627 if (ent_ts)
1579 *ent_ts = next_ts; 1628 *ent_ts = next_ts;
1580 1629
1630 if (missing_events)
1631 *missing_events = next_lost;
1632
1581 return next; 1633 return next;
1582} 1634}
1583 1635
@@ -1585,13 +1637,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1585struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 1637struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1586 int *ent_cpu, u64 *ent_ts) 1638 int *ent_cpu, u64 *ent_ts)
1587{ 1639{
1588 return __find_next_entry(iter, ent_cpu, ent_ts); 1640 return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
1589} 1641}
1590 1642
1591/* Find the next real entry, and increment the iterator to the next entry */ 1643/* Find the next real entry, and increment the iterator to the next entry */
1592static void *find_next_entry_inc(struct trace_iterator *iter) 1644static void *find_next_entry_inc(struct trace_iterator *iter)
1593{ 1645{
1594 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); 1646 iter->ent = __find_next_entry(iter, &iter->cpu,
1647 &iter->lost_events, &iter->ts);
1595 1648
1596 if (iter->ent) 1649 if (iter->ent)
1597 trace_iterator_increment(iter); 1650 trace_iterator_increment(iter);
@@ -1603,7 +1656,8 @@ static void trace_consume(struct trace_iterator *iter)
1603{ 1656{
1604 /* Don't allow ftrace to trace into the ring buffers */ 1657 /* Don't allow ftrace to trace into the ring buffers */
1605 ftrace_disable_cpu(); 1658 ftrace_disable_cpu();
1606 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); 1659 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
1660 &iter->lost_events);
1607 ftrace_enable_cpu(); 1661 ftrace_enable_cpu();
1608} 1662}
1609 1663
@@ -1703,6 +1757,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1703 1757
1704 ftrace_enable_cpu(); 1758 ftrace_enable_cpu();
1705 1759
1760 iter->leftover = 0;
1706 for (p = iter; p && l < *pos; p = s_next(m, p, &l)) 1761 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1707 ; 1762 ;
1708 1763
@@ -1753,7 +1808,7 @@ static void print_func_help_header(struct seq_file *m)
1753} 1808}
1754 1809
1755 1810
1756static void 1811void
1757print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1812print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1758{ 1813{
1759 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1814 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1962,7 +2017,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1962 return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED; 2017 return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED;
1963} 2018}
1964 2019
1965static int trace_empty(struct trace_iterator *iter) 2020int trace_empty(struct trace_iterator *iter)
1966{ 2021{
1967 int cpu; 2022 int cpu;
1968 2023
@@ -1997,6 +2052,10 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
1997{ 2052{
1998 enum print_line_t ret; 2053 enum print_line_t ret;
1999 2054
2055 if (iter->lost_events)
2056 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2057 iter->cpu, iter->lost_events);
2058
2000 if (iter->trace && iter->trace->print_line) { 2059 if (iter->trace && iter->trace->print_line) {
2001 ret = iter->trace->print_line(iter); 2060 ret = iter->trace->print_line(iter);
2002 if (ret != TRACE_TYPE_UNHANDLED) 2061 if (ret != TRACE_TYPE_UNHANDLED)
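The marker is written into the same seq buffer as the events, so a consuming read now carries an annotation just ahead of the first event after a gap. Hypothetical output; the task names, timestamps and count are invented:

            <idle>-0     [001]  1234.567890: ...
CPU:1 [LOST 523 EVENTS]
             <...>-2864  [001]  1234.571234: ...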
@@ -2025,6 +2084,23 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2025 return print_trace_fmt(iter); 2084 return print_trace_fmt(iter);
2026} 2085}
2027 2086
2087void trace_default_header(struct seq_file *m)
2088{
2089 struct trace_iterator *iter = m->private;
2090
2091 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2092 /* print nothing if the buffers are empty */
2093 if (trace_empty(iter))
2094 return;
2095 print_trace_header(m, iter);
2096 if (!(trace_flags & TRACE_ITER_VERBOSE))
2097 print_lat_help_header(m);
2098 } else {
2099 if (!(trace_flags & TRACE_ITER_VERBOSE))
2100 print_func_help_header(m);
2101 }
2102}
2103
2028static int s_show(struct seq_file *m, void *v) 2104static int s_show(struct seq_file *m, void *v)
2029{ 2105{
2030 struct trace_iterator *iter = v; 2106 struct trace_iterator *iter = v;
@@ -2037,17 +2113,9 @@ static int s_show(struct seq_file *m, void *v)
2037 } 2113 }
2038 if (iter->trace && iter->trace->print_header) 2114 if (iter->trace && iter->trace->print_header)
2039 iter->trace->print_header(m); 2115 iter->trace->print_header(m);
2040 else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2116 else
2041 /* print nothing if the buffers are empty */ 2117 trace_default_header(m);
2042 if (trace_empty(iter)) 2118
2043 return 0;
2044 print_trace_header(m, iter);
2045 if (!(trace_flags & TRACE_ITER_VERBOSE))
2046 print_lat_help_header(m);
2047 } else {
2048 if (!(trace_flags & TRACE_ITER_VERBOSE))
2049 print_func_help_header(m);
2050 }
2051 } else if (iter->leftover) { 2119 } else if (iter->leftover) {
2052 /* 2120 /*
2053 * If we filled the seq_file buffer earlier, we 2121 * If we filled the seq_file buffer earlier, we
@@ -2133,15 +2201,20 @@ __tracing_open(struct inode *inode, struct file *file)
2133 2201
2134 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2202 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
2135 for_each_tracing_cpu(cpu) { 2203 for_each_tracing_cpu(cpu) {
2136
2137 iter->buffer_iter[cpu] = 2204 iter->buffer_iter[cpu] =
2138 ring_buffer_read_start(iter->tr->buffer, cpu); 2205 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2206 }
2207 ring_buffer_read_prepare_sync();
2208 for_each_tracing_cpu(cpu) {
2209 ring_buffer_read_start(iter->buffer_iter[cpu]);
2139 tracing_iter_reset(iter, cpu); 2210 tracing_iter_reset(iter, cpu);
2140 } 2211 }
2141 } else { 2212 } else {
2142 cpu = iter->cpu_file; 2213 cpu = iter->cpu_file;
2143 iter->buffer_iter[cpu] = 2214 iter->buffer_iter[cpu] =
2144 ring_buffer_read_start(iter->tr->buffer, cpu); 2215 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2216 ring_buffer_read_prepare_sync();
2217 ring_buffer_read_start(iter->buffer_iter[cpu]);
2145 tracing_iter_reset(iter, cpu); 2218 tracing_iter_reset(iter, cpu);
2146 } 2219 }
2147 2220
@@ -4248,10 +4321,10 @@ static __init int tracer_init_debugfs(void)
4248#ifdef CONFIG_TRACER_MAX_TRACE 4321#ifdef CONFIG_TRACER_MAX_TRACE
4249 trace_create_file("tracing_max_latency", 0644, d_tracer, 4322 trace_create_file("tracing_max_latency", 0644, d_tracer,
4250 &tracing_max_latency, &tracing_max_lat_fops); 4323 &tracing_max_latency, &tracing_max_lat_fops);
4324#endif
4251 4325
4252 trace_create_file("tracing_thresh", 0644, d_tracer, 4326 trace_create_file("tracing_thresh", 0644, d_tracer,
4253 &tracing_thresh, &tracing_max_lat_fops); 4327 &tracing_thresh, &tracing_max_lat_fops);
4254#endif
4255 4328
4256 trace_create_file("README", 0444, d_tracer, 4329 trace_create_file("README", 0444, d_tracer,
4257 NULL, &tracing_readme_fops); 4330 NULL, &tracing_readme_fops);
@@ -4291,7 +4364,7 @@ static int trace_panic_handler(struct notifier_block *this,
4291 unsigned long event, void *unused) 4364 unsigned long event, void *unused)
4292{ 4365{
4293 if (ftrace_dump_on_oops) 4366 if (ftrace_dump_on_oops)
4294 ftrace_dump(); 4367 ftrace_dump(ftrace_dump_on_oops);
4295 return NOTIFY_OK; 4368 return NOTIFY_OK;
4296} 4369}
4297 4370
@@ -4308,7 +4381,7 @@ static int trace_die_handler(struct notifier_block *self,
4308 switch (val) { 4381 switch (val) {
4309 case DIE_OOPS: 4382 case DIE_OOPS:
4310 if (ftrace_dump_on_oops) 4383 if (ftrace_dump_on_oops)
4311 ftrace_dump(); 4384 ftrace_dump(ftrace_dump_on_oops);
4312 break; 4385 break;
4313 default: 4386 default:
4314 break; 4387 break;
@@ -4349,7 +4422,8 @@ trace_printk_seq(struct trace_seq *s)
4349 trace_seq_init(s); 4422 trace_seq_init(s);
4350} 4423}
4351 4424
4352static void __ftrace_dump(bool disable_tracing) 4425static void
4426__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4353{ 4427{
4354 static arch_spinlock_t ftrace_dump_lock = 4428 static arch_spinlock_t ftrace_dump_lock =
4355 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 4429 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -4382,12 +4456,25 @@ static void __ftrace_dump(bool disable_tracing)
4382 /* don't look at user memory in panic mode */ 4456 /* don't look at user memory in panic mode */
4383 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 4457 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
4384 4458
4385 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4386
4387 /* Simulate the iterator */ 4459 /* Simulate the iterator */
4388 iter.tr = &global_trace; 4460 iter.tr = &global_trace;
4389 iter.trace = current_trace; 4461 iter.trace = current_trace;
4390 iter.cpu_file = TRACE_PIPE_ALL_CPU; 4462
4463 switch (oops_dump_mode) {
4464 case DUMP_ALL:
4465 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4466 break;
4467 case DUMP_ORIG:
4468 iter.cpu_file = raw_smp_processor_id();
4469 break;
4470 case DUMP_NONE:
4471 goto out_enable;
4472 default:
4473 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
4474 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4475 }
4476
4477 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4391 4478
4392 /* 4479 /*
4393 * We need to stop all tracing on all CPUS to read the 4480 * We need to stop all tracing on all CPUS to read the
@@ -4426,6 +4513,7 @@ static void __ftrace_dump(bool disable_tracing)
4426 else 4513 else
4427 printk(KERN_TRACE "---------------------------------\n"); 4514 printk(KERN_TRACE "---------------------------------\n");
4428 4515
4516 out_enable:
4429 /* Re-enable tracing if requested */ 4517 /* Re-enable tracing if requested */
4430 if (!disable_tracing) { 4518 if (!disable_tracing) {
4431 trace_flags |= old_userobj; 4519 trace_flags |= old_userobj;
@@ -4442,9 +4530,9 @@ static void __ftrace_dump(bool disable_tracing)
4442} 4530}
4443 4531
4444/* By default: disable tracing after the dump */ 4532/* By default: disable tracing after the dump */
4445void ftrace_dump(void) 4533void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4446{ 4534{
4447 __ftrace_dump(true); 4535 __ftrace_dump(true, oops_dump_mode);
4448} 4536}
4449 4537
4450__init static int tracer_alloc_buffers(void) 4538__init static int tracer_alloc_buffers(void)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fd05bcaf91b0..d1ce0bec1b3f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -34,7 +34,6 @@ enum trace_type {
34 TRACE_GRAPH_RET, 34 TRACE_GRAPH_RET,
35 TRACE_GRAPH_ENT, 35 TRACE_GRAPH_ENT,
36 TRACE_USER_STACK, 36 TRACE_USER_STACK,
37 TRACE_HW_BRANCHES,
38 TRACE_KMEM_ALLOC, 37 TRACE_KMEM_ALLOC,
39 TRACE_KMEM_FREE, 38 TRACE_KMEM_FREE,
40 TRACE_BLK, 39 TRACE_BLK,
@@ -103,29 +102,17 @@ struct syscall_trace_exit {
103 long ret; 102 long ret;
104}; 103};
105 104
106struct kprobe_trace_entry { 105struct kprobe_trace_entry_head {
107 struct trace_entry ent; 106 struct trace_entry ent;
108 unsigned long ip; 107 unsigned long ip;
109 int nargs;
110 unsigned long args[];
111}; 108};
112 109
113#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ 110struct kretprobe_trace_entry_head {
114 (offsetof(struct kprobe_trace_entry, args) + \
115 (sizeof(unsigned long) * (n)))
116
117struct kretprobe_trace_entry {
118 struct trace_entry ent; 111 struct trace_entry ent;
119 unsigned long func; 112 unsigned long func;
120 unsigned long ret_ip; 113 unsigned long ret_ip;
121 int nargs;
122 unsigned long args[];
123}; 114};
124 115
125#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
126 (offsetof(struct kretprobe_trace_entry, args) + \
127 (sizeof(unsigned long) * (n)))
128
129/* 116/*
130 * trace_flag_type is an enumeration that holds different 117 * trace_flag_type is an enumeration that holds different
131 * states when a trace occurs. These are: 118 * states when a trace occurs. These are:
@@ -229,7 +216,6 @@ extern void __ftrace_bad_type(void);
229 TRACE_GRAPH_ENT); \ 216 TRACE_GRAPH_ENT); \
230 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 217 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
231 TRACE_GRAPH_RET); \ 218 TRACE_GRAPH_RET); \
232 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
233 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ 219 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
234 TRACE_KMEM_ALLOC); \ 220 TRACE_KMEM_ALLOC); \
235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 221 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
@@ -378,6 +364,9 @@ void trace_function(struct trace_array *tr,
378 unsigned long ip, 364 unsigned long ip,
379 unsigned long parent_ip, 365 unsigned long parent_ip,
380 unsigned long flags, int pc); 366 unsigned long flags, int pc);
367void trace_default_header(struct seq_file *m);
368void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
369int trace_empty(struct trace_iterator *iter);
381 370
382void trace_graph_return(struct ftrace_graph_ret *trace); 371void trace_graph_return(struct ftrace_graph_ret *trace);
383int trace_graph_entry(struct ftrace_graph_ent *trace); 372int trace_graph_entry(struct ftrace_graph_ent *trace);
@@ -396,9 +385,10 @@ extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
396 385
397extern unsigned long nsecs_to_usecs(unsigned long nsecs); 386extern unsigned long nsecs_to_usecs(unsigned long nsecs);
398 387
388extern unsigned long tracing_thresh;
389
399#ifdef CONFIG_TRACER_MAX_TRACE 390#ifdef CONFIG_TRACER_MAX_TRACE
400extern unsigned long tracing_max_latency; 391extern unsigned long tracing_max_latency;
401extern unsigned long tracing_thresh;
402 392
403void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 393void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
404void update_max_tr_single(struct trace_array *tr, 394void update_max_tr_single(struct trace_array *tr,
@@ -466,8 +456,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
466 struct trace_array *tr); 456 struct trace_array *tr);
467extern int trace_selftest_startup_branch(struct tracer *trace, 457extern int trace_selftest_startup_branch(struct tracer *trace,
468 struct trace_array *tr); 458 struct trace_array *tr);
469extern int trace_selftest_startup_hw_branches(struct tracer *trace,
470 struct trace_array *tr);
471extern int trace_selftest_startup_ksym(struct tracer *trace, 459extern int trace_selftest_startup_ksym(struct tracer *trace,
472 struct trace_array *tr); 460 struct trace_array *tr);
473#endif /* CONFIG_FTRACE_STARTUP_TEST */ 461#endif /* CONFIG_FTRACE_STARTUP_TEST */
@@ -490,9 +478,29 @@ extern int trace_clock_id;
490 478
491/* Standard output formatting function used for function return traces */ 479/* Standard output formatting function used for function return traces */
492#ifdef CONFIG_FUNCTION_GRAPH_TRACER 480#ifdef CONFIG_FUNCTION_GRAPH_TRACER
493extern enum print_line_t print_graph_function(struct trace_iterator *iter); 481
482/* Flag options */
483#define TRACE_GRAPH_PRINT_OVERRUN 0x1
484#define TRACE_GRAPH_PRINT_CPU 0x2
485#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
486#define TRACE_GRAPH_PRINT_PROC 0x8
487#define TRACE_GRAPH_PRINT_DURATION 0x10
488#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
489
490extern enum print_line_t
491print_graph_function_flags(struct trace_iterator *iter, u32 flags);
492extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
494extern enum print_line_t 493extern enum print_line_t
495trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); 494trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
495extern void graph_trace_open(struct trace_iterator *iter);
496extern void graph_trace_close(struct trace_iterator *iter);
497extern int __trace_graph_entry(struct trace_array *tr,
498 struct ftrace_graph_ent *trace,
499 unsigned long flags, int pc);
500extern void __trace_graph_return(struct trace_array *tr,
501 struct ftrace_graph_ret *trace,
502 unsigned long flags, int pc);
503
496 504
497#ifdef CONFIG_DYNAMIC_FTRACE 505#ifdef CONFIG_DYNAMIC_FTRACE
498/* TODO: make this variable */ 506/* TODO: make this variable */
@@ -523,7 +531,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
523#endif /* CONFIG_DYNAMIC_FTRACE */ 531#endif /* CONFIG_DYNAMIC_FTRACE */
524#else /* CONFIG_FUNCTION_GRAPH_TRACER */ 532#else /* CONFIG_FUNCTION_GRAPH_TRACER */
525static inline enum print_line_t 533static inline enum print_line_t
526print_graph_function(struct trace_iterator *iter) 534print_graph_function_flags(struct trace_iterator *iter, u32 flags)
527{ 535{
528 return TRACE_TYPE_UNHANDLED; 536 return TRACE_TYPE_UNHANDLED;
529} 537}
@@ -550,7 +558,7 @@ static inline int ftrace_trace_task(struct task_struct *task)
 550 * struct trace_parser - serves for reading the user input separated by spaces 558
551 * @cont: set if the input is not complete - no final space char was found 559 * @cont: set if the input is not complete - no final space char was found
552 * @buffer: holds the parsed user input 560 * @buffer: holds the parsed user input
553 * @idx: user input lenght 561 * @idx: user input length
554 * @size: buffer size 562 * @size: buffer size
555 */ 563 */
556struct trace_parser { 564struct trace_parser {
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 84a3a7ba072a..9d589d8dcd1a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -13,6 +13,7 @@
13 * Tracer plugins will choose a default from these clocks. 13
14 */ 14 */
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/irqflags.h>
16#include <linux/hardirq.h> 17#include <linux/hardirq.h>
17#include <linux/module.h> 18#include <linux/module.h>
18#include <linux/percpu.h> 19#include <linux/percpu.h>
@@ -83,7 +84,7 @@ u64 notrace trace_clock_global(void)
83 int this_cpu; 84 int this_cpu;
84 u64 now; 85 u64 now;
85 86
86 raw_local_irq_save(flags); 87 local_irq_save(flags);
87 88
88 this_cpu = raw_smp_processor_id(); 89 this_cpu = raw_smp_processor_id();
89 now = cpu_clock(this_cpu); 90 now = cpu_clock(this_cpu);
@@ -109,7 +110,7 @@ u64 notrace trace_clock_global(void)
109 arch_spin_unlock(&trace_clock_struct.lock); 110 arch_spin_unlock(&trace_clock_struct.lock);
110 111
111 out: 112 out:
112 raw_local_irq_restore(flags); 113 local_irq_restore(flags);
113 114
114 return now; 115 return now;
115} 116}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c16a08f399df..dc008c1240da 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -318,18 +318,6 @@ FTRACE_ENTRY(branch, trace_branch,
318 __entry->func, __entry->file, __entry->correct) 318 __entry->func, __entry->file, __entry->correct)
319); 319);
320 320
321FTRACE_ENTRY(hw_branch, hw_branch_entry,
322
323 TRACE_HW_BRANCHES,
324
325 F_STRUCT(
326 __field( u64, from )
327 __field( u64, to )
328 ),
329
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331);
332
333FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, 321FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
334 322
335 TRACE_KMEM_ALLOC, 323 TRACE_KMEM_ALLOC,
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_perf.c
index f0d693005075..0565bb42566f 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_perf.c
@@ -1,32 +1,41 @@
1/* 1/*
2 * trace event based perf counter profiling 2 * trace event based perf event profiling/tracing
3 * 3 *
4 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> 4 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
5 * 5 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
13EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
14
15EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
12 16
13static char *perf_trace_buf; 17static char *perf_trace_buf;
14static char *perf_trace_buf_nmi; 18static char *perf_trace_buf_nmi;
15 19
16typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; 20/*
 21 * Force it to be aligned to unsigned long to avoid misaligned access
 22 * surprises
23 */
24typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
25 perf_trace_t;
17 26
18/* Count the events in use (per event id, not per instance) */ 27/* Count the events in use (per event id, not per instance) */
19static int total_profile_count; 28static int total_ref_count;
20 29
21static int ftrace_profile_enable_event(struct ftrace_event_call *event) 30static int perf_trace_event_enable(struct ftrace_event_call *event)
22{ 31{
23 char *buf; 32 char *buf;
24 int ret = -ENOMEM; 33 int ret = -ENOMEM;
25 34
26 if (event->profile_count++ > 0) 35 if (event->perf_refcount++ > 0)
27 return 0; 36 return 0;
28 37
29 if (!total_profile_count) { 38 if (!total_ref_count) {
30 buf = (char *)alloc_percpu(perf_trace_t); 39 buf = (char *)alloc_percpu(perf_trace_t);
31 if (!buf) 40 if (!buf)
32 goto fail_buf; 41 goto fail_buf;
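The new typedef sizes the per-cpu scratch buffer in unsigned long units so every area alloc_percpu() hands back is naturally aligned for the entries built in it, and the BUILD_BUG_ON() added later in this file catches any PERF_MAX_TRACE_SIZE that breaks the divisibility. A condensed sketch; the real constant lives in a perf header and 2048 is only an assumed value:

#include <linux/bug.h>
#include <linux/percpu.h>

#define PERF_MAX_TRACE_SIZE	2048	/* assumed value for illustration */

typedef unsigned long perf_trace_t[PERF_MAX_TRACE_SIZE / sizeof(unsigned long)];

static char *alloc_trace_buf(void)
{
	/* fails at compile time if the size stops dividing evenly */
	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));

	return (char *)alloc_percpu(perf_trace_t);
}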
@@ -40,35 +49,35 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
40 rcu_assign_pointer(perf_trace_buf_nmi, buf); 49 rcu_assign_pointer(perf_trace_buf_nmi, buf);
41 } 50 }
42 51
43 ret = event->profile_enable(event); 52 ret = event->perf_event_enable(event);
44 if (!ret) { 53 if (!ret) {
45 total_profile_count++; 54 total_ref_count++;
46 return 0; 55 return 0;
47 } 56 }
48 57
49fail_buf_nmi: 58fail_buf_nmi:
50 if (!total_profile_count) { 59 if (!total_ref_count) {
51 free_percpu(perf_trace_buf_nmi); 60 free_percpu(perf_trace_buf_nmi);
52 free_percpu(perf_trace_buf); 61 free_percpu(perf_trace_buf);
53 perf_trace_buf_nmi = NULL; 62 perf_trace_buf_nmi = NULL;
54 perf_trace_buf = NULL; 63 perf_trace_buf = NULL;
55 } 64 }
56fail_buf: 65fail_buf:
57 event->profile_count--; 66 event->perf_refcount--;
58 67
59 return ret; 68 return ret;
60} 69}
61 70
62int ftrace_profile_enable(int event_id) 71int perf_trace_enable(int event_id)
63{ 72{
64 struct ftrace_event_call *event; 73 struct ftrace_event_call *event;
65 int ret = -EINVAL; 74 int ret = -EINVAL;
66 75
67 mutex_lock(&event_mutex); 76 mutex_lock(&event_mutex);
68 list_for_each_entry(event, &ftrace_events, list) { 77 list_for_each_entry(event, &ftrace_events, list) {
69 if (event->id == event_id && event->profile_enable && 78 if (event->id == event_id && event->perf_event_enable &&
70 try_module_get(event->mod)) { 79 try_module_get(event->mod)) {
71 ret = ftrace_profile_enable_event(event); 80 ret = perf_trace_event_enable(event);
72 break; 81 break;
73 } 82 }
74 } 83 }
@@ -77,16 +86,16 @@ int ftrace_profile_enable(int event_id)
77 return ret; 86 return ret;
78} 87}
79 88
80static void ftrace_profile_disable_event(struct ftrace_event_call *event) 89static void perf_trace_event_disable(struct ftrace_event_call *event)
81{ 90{
82 char *buf, *nmi_buf; 91 char *buf, *nmi_buf;
83 92
84 if (--event->profile_count > 0) 93 if (--event->perf_refcount > 0)
85 return; 94 return;
86 95
87 event->profile_disable(event); 96 event->perf_event_disable(event);
88 97
89 if (!--total_profile_count) { 98 if (!--total_ref_count) {
90 buf = perf_trace_buf; 99 buf = perf_trace_buf;
91 rcu_assign_pointer(perf_trace_buf, NULL); 100 rcu_assign_pointer(perf_trace_buf, NULL);
92 101
@@ -104,14 +113,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
104 } 113 }
105} 114}
106 115
107void ftrace_profile_disable(int event_id) 116void perf_trace_disable(int event_id)
108{ 117{
109 struct ftrace_event_call *event; 118 struct ftrace_event_call *event;
110 119
111 mutex_lock(&event_mutex); 120 mutex_lock(&event_mutex);
112 list_for_each_entry(event, &ftrace_events, list) { 121 list_for_each_entry(event, &ftrace_events, list) {
113 if (event->id == event_id) { 122 if (event->id == event_id) {
114 ftrace_profile_disable_event(event); 123 perf_trace_event_disable(event);
115 module_put(event->mod); 124 module_put(event->mod);
116 break; 125 break;
117 } 126 }
@@ -119,13 +128,15 @@ void ftrace_profile_disable(int event_id)
119 mutex_unlock(&event_mutex); 128 mutex_unlock(&event_mutex);
120} 129}
121 130
122__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, 131__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
123 int *rctxp, unsigned long *irq_flags) 132 int *rctxp, unsigned long *irq_flags)
124{ 133{
125 struct trace_entry *entry; 134 struct trace_entry *entry;
126 char *trace_buf, *raw_data; 135 char *trace_buf, *raw_data;
127 int pc, cpu; 136 int pc, cpu;
128 137
138 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
139
129 pc = preempt_count(); 140 pc = preempt_count();
130 141
131 /* Protect the per cpu buffer, begin the rcu read side */ 142 /* Protect the per cpu buffer, begin the rcu read side */
@@ -138,9 +149,9 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
138 cpu = smp_processor_id(); 149 cpu = smp_processor_id();
139 150
140 if (in_nmi()) 151 if (in_nmi())
141 trace_buf = rcu_dereference(perf_trace_buf_nmi); 152 trace_buf = rcu_dereference_sched(perf_trace_buf_nmi);
142 else 153 else
143 trace_buf = rcu_dereference(perf_trace_buf); 154 trace_buf = rcu_dereference_sched(perf_trace_buf);
144 155
145 if (!trace_buf) 156 if (!trace_buf)
146 goto err; 157 goto err;
@@ -148,7 +159,7 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
148 raw_data = per_cpu_ptr(trace_buf, cpu); 159 raw_data = per_cpu_ptr(trace_buf, cpu);
149 160
150 /* zero the dead bytes from align to not leak stack to user */ 161 /* zero the dead bytes from align to not leak stack to user */
151 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 162 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
152 163
153 entry = (struct trace_entry *)raw_data; 164 entry = (struct trace_entry *)raw_data;
154 tracing_generic_entry_update(entry, *irq_flags, pc); 165 tracing_generic_entry_update(entry, *irq_flags, pc);
@@ -161,4 +172,4 @@ err_recursion:
161 local_irq_restore(*irq_flags); 172 local_irq_restore(*irq_flags);
162 return NULL; 173 return NULL;
163} 174}
164EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare); 175EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3f972ad98d04..c697c7043349 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,6 +15,7 @@
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/slab.h>
18#include <linux/delay.h> 19#include <linux/delay.h>
19 20
20#include <asm/setup.h> 21#include <asm/setup.h>
@@ -938,7 +939,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
938 trace_create_file("enable", 0644, call->dir, call, 939 trace_create_file("enable", 0644, call->dir, call,
939 enable); 940 enable);
940 941
941 if (call->id && call->profile_enable) 942 if (call->id && call->perf_event_enable)
942 trace_create_file("id", 0444, call->dir, call, 943 trace_create_file("id", 0444, call->dir, call,
943 id); 944 id);
944 945
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 4615f62a04f1..58092d844a1f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -22,6 +22,7 @@
22#include <linux/ctype.h> 22#include <linux/ctype.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/perf_event.h> 24#include <linux/perf_event.h>
25#include <linux/slab.h>
25 26
26#include "trace.h" 27#include "trace.h"
27#include "trace_output.h" 28#include "trace_output.h"
@@ -1397,7 +1398,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1397 } 1398 }
1398 1399
1399 err = -EINVAL; 1400 err = -EINVAL;
1400 if (!call) 1401 if (&call->list == &ftrace_events)
1401 goto out_unlock; 1402 goto out_unlock;
1402 1403
1403 err = -EEXIST; 1404 err = -EEXIST;
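The one-line filter fix encodes a classic list_for_each_entry() subtlety: the cursor is never NULL after the loop; when the traversal completes without a break it points at the memory containing the list head itself, so "not found" must be detected by comparing against the head. A standalone illustration:

#include <linux/list.h>

struct item {
	int id;
	struct list_head list;
};

static LIST_HEAD(items);

static struct item *find_item(int id)
{
	struct item *it;

	list_for_each_entry(it, &items, list)
		if (it->id == id)
			break;

	/* loop exhausted: 'it' aliases the head, not a real item */
	if (&it->list == &items)
		return NULL;

	return it;
}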
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index e998a824e9db..dd11c830eb84 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -9,6 +9,7 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <linux/slab.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13 14
14#include "trace.h" 15#include "trace.h"
@@ -39,7 +40,7 @@ struct fgraph_data {
39#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 40#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
40#define TRACE_GRAPH_PRINT_PROC 0x8 41#define TRACE_GRAPH_PRINT_PROC 0x8
41#define TRACE_GRAPH_PRINT_DURATION 0x10 42#define TRACE_GRAPH_PRINT_DURATION 0x10
42#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
43 44
44static struct tracer_opt trace_opts[] = { 45static struct tracer_opt trace_opts[] = {
45 /* Display overruns? (for self-debug purpose) */ 46 /* Display overruns? (for self-debug purpose) */
@@ -178,7 +179,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
178 return ret; 179 return ret;
179} 180}
180 181
181static int __trace_graph_entry(struct trace_array *tr, 182int __trace_graph_entry(struct trace_array *tr,
182 struct ftrace_graph_ent *trace, 183 struct ftrace_graph_ent *trace,
183 unsigned long flags, 184 unsigned long flags,
184 int pc) 185 int pc)
@@ -188,7 +189,7 @@ static int __trace_graph_entry(struct trace_array *tr,
188 struct ring_buffer *buffer = tr->buffer; 189 struct ring_buffer *buffer = tr->buffer;
189 struct ftrace_graph_ent_entry *entry; 190 struct ftrace_graph_ent_entry *entry;
190 191
191 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 192 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
192 return 0; 193 return 0;
193 194
194 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, 195 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -237,7 +238,15 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
237 return ret; 238 return ret;
238} 239}
239 240
240static void __trace_graph_return(struct trace_array *tr, 241int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
242{
243 if (tracing_thresh)
244 return 1;
245 else
246 return trace_graph_entry(trace);
247}
248
249void __trace_graph_return(struct trace_array *tr,
241 struct ftrace_graph_ret *trace, 250 struct ftrace_graph_ret *trace,
242 unsigned long flags, 251 unsigned long flags,
243 int pc) 252 int pc)
@@ -247,7 +256,7 @@ static void __trace_graph_return(struct trace_array *tr,
247 struct ring_buffer *buffer = tr->buffer; 256 struct ring_buffer *buffer = tr->buffer;
248 struct ftrace_graph_ret_entry *entry; 257 struct ftrace_graph_ret_entry *entry;
249 258
250 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 259 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
251 return; 260 return;
252 261
253 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, 262 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -290,13 +299,26 @@ void set_graph_array(struct trace_array *tr)
290 smp_mb(); 299 smp_mb();
291} 300}
292 301
302void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
303{
304 if (tracing_thresh &&
305 (trace->rettime - trace->calltime < tracing_thresh))
306 return;
307 else
308 trace_graph_return(trace);
309}
310
293static int graph_trace_init(struct trace_array *tr) 311static int graph_trace_init(struct trace_array *tr)
294{ 312{
295 int ret; 313 int ret;
296 314
297 set_graph_array(tr); 315 set_graph_array(tr);
298 ret = register_ftrace_graph(&trace_graph_return, 316 if (tracing_thresh)
299 &trace_graph_entry); 317 ret = register_ftrace_graph(&trace_graph_thresh_return,
318 &trace_graph_thresh_entry);
319 else
320 ret = register_ftrace_graph(&trace_graph_return,
321 &trace_graph_entry);
300 if (ret) 322 if (ret)
301 return ret; 323 return ret;
302 tracing_start_cmdline_record(); 324 tracing_start_cmdline_record();
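The two additions above give the graph tracer a fast path when tracing_thresh is set: entries are not recorded at all, and the return hook emits a single event only for functions whose duration reaches the threshold. A toy model of that filtering (names and the clock are illustrative, not kernel API):

    #include <stdio.h>

    static unsigned long tracing_thresh = 1000; /* ns; 0 = record everything */

    static int thresh_entry(void)
    {
        if (tracing_thresh)
            return 1;  /* keep hooking the return, but record nothing yet */
        printf("record entry\n");
        return 1;
    }

    static void thresh_return(unsigned long long calltime,
                              unsigned long long rettime)
    {
        if (tracing_thresh && rettime - calltime < tracing_thresh)
            return;    /* faster than the threshold: drop it */
        printf("record return, %llu ns\n", rettime - calltime);
    }

    int main(void)
    {
        thresh_entry();
        thresh_return(0, 500);   /* dropped */
        thresh_return(0, 2500);  /* recorded */
        return 0;
    }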
@@ -468,9 +490,10 @@ get_return_for_leaf(struct trace_iterator *iter,
468 * We need to consume the current entry to see 490 * We need to consume the current entry to see
469 * the next one. 491 * the next one.
470 */ 492 */
471 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 493 ring_buffer_consume(iter->tr->buffer, iter->cpu,
494 NULL, NULL);
472 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 495 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
473 NULL); 496 NULL, NULL);
474 } 497 }
475 498
476 if (!event) 499 if (!event)
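Judging from these call sites, ring_buffer_consume() and ring_buffer_peek() grew an extra out-parameter in this series (the ring-buffer changes elsewhere in the diffstat add dropped-event accounting); callers that don't care simply pass NULL. The calling convention, modeled with invented names:

    #include <stdio.h>

    static int consume(int *item, unsigned long *lost_events)
    {
        if (lost_events)
            *lost_events = 3; /* pretend three events were overwritten */
        *item = 42;
        return 0;
    }

    int main(void)
    {
        int v;
        unsigned long lost;

        consume(&v, NULL);   /* ignore the count, as the graph tracer does */
        consume(&v, &lost);
        printf("item=%d lost=%lu\n", v, lost);
        return 0;
    }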
@@ -504,17 +527,18 @@ get_return_for_leaf(struct trace_iterator *iter,
504 527
505/* Signal an overhead of execution time to the output */ 528
506static int 529static int
507print_graph_overhead(unsigned long long duration, struct trace_seq *s) 530print_graph_overhead(unsigned long long duration, struct trace_seq *s,
531 u32 flags)
508{ 532{
509 /* If the duration isn't displayed, we don't need anything */ 533 /* If the duration isn't displayed, we don't need anything */
510 if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) 534 if (!(flags & TRACE_GRAPH_PRINT_DURATION))
511 return 1; 535 return 1;
512 536
513 /* Non-nested entry or return */ 537
514 if (duration == -1) 538 if (duration == -1)
515 return trace_seq_printf(s, " "); 539 return trace_seq_printf(s, " ");
516 540
517 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 541 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
518 /* Duration exceeded 100 msecs */ 542 /* Duration exceeded 100 msecs */
519 if (duration > 100000ULL) 543 if (duration > 100000ULL)
520 return trace_seq_printf(s, "! "); 544 return trace_seq_printf(s, "! ");
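This hunk starts a pattern repeated through the rest of the file: every output helper stops reading the global tracer_flags.val and takes a caller-supplied u32 flags mask instead, which is what later lets the irqsoff tracer reuse the graph output with its own option set. The refactoring shape in miniature (flag values invented):

    #include <stdio.h>

    #define PRINT_DURATION 0x1

    /* before: the helper read a file-scope global;
     * after: the caller decides which columns exist */
    static int print_overhead(unsigned long long duration, unsigned int flags)
    {
        if (!(flags & PRINT_DURATION))
            return 1;  /* column not shown: nothing to print */
        return printf(duration > 100000ULL ? "! " : "  ") > 0;
    }

    int main(void)
    {
        print_overhead(200000ULL, PRINT_DURATION); /* marks a slow call */
        print_overhead(200000ULL, 0);              /* column suppressed */
        return 0;
    }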
@@ -540,7 +564,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s)
540 564
541static enum print_line_t 565static enum print_line_t
542print_graph_irq(struct trace_iterator *iter, unsigned long addr, 566print_graph_irq(struct trace_iterator *iter, unsigned long addr,
543 enum trace_type type, int cpu, pid_t pid) 567 enum trace_type type, int cpu, pid_t pid, u32 flags)
544{ 568{
545 int ret; 569 int ret;
546 struct trace_seq *s = &iter->seq; 570 struct trace_seq *s = &iter->seq;
@@ -550,21 +574,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
550 return TRACE_TYPE_UNHANDLED; 574 return TRACE_TYPE_UNHANDLED;
551 575
552 /* Absolute time */ 576 /* Absolute time */
553 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 577 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
554 ret = print_graph_abs_time(iter->ts, s); 578 ret = print_graph_abs_time(iter->ts, s);
555 if (!ret) 579 if (!ret)
556 return TRACE_TYPE_PARTIAL_LINE; 580 return TRACE_TYPE_PARTIAL_LINE;
557 } 581 }
558 582
559 /* Cpu */ 583 /* Cpu */
560 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 584 if (flags & TRACE_GRAPH_PRINT_CPU) {
561 ret = print_graph_cpu(s, cpu); 585 ret = print_graph_cpu(s, cpu);
562 if (ret == TRACE_TYPE_PARTIAL_LINE) 586 if (ret == TRACE_TYPE_PARTIAL_LINE)
563 return TRACE_TYPE_PARTIAL_LINE; 587 return TRACE_TYPE_PARTIAL_LINE;
564 } 588 }
565 589
566 /* Proc */ 590 /* Proc */
567 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 591 if (flags & TRACE_GRAPH_PRINT_PROC) {
568 ret = print_graph_proc(s, pid); 592 ret = print_graph_proc(s, pid);
569 if (ret == TRACE_TYPE_PARTIAL_LINE) 593 if (ret == TRACE_TYPE_PARTIAL_LINE)
570 return TRACE_TYPE_PARTIAL_LINE; 594 return TRACE_TYPE_PARTIAL_LINE;
@@ -574,7 +598,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
574 } 598 }
575 599
576 /* No overhead */ 600 /* No overhead */
577 ret = print_graph_overhead(-1, s); 601 ret = print_graph_overhead(-1, s, flags);
578 if (!ret) 602 if (!ret)
579 return TRACE_TYPE_PARTIAL_LINE; 603 return TRACE_TYPE_PARTIAL_LINE;
580 604
@@ -587,7 +611,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
587 return TRACE_TYPE_PARTIAL_LINE; 611 return TRACE_TYPE_PARTIAL_LINE;
588 612
589 /* Don't close the duration column if we don't have one */ 613
590 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 614 if (flags & TRACE_GRAPH_PRINT_DURATION)
591 trace_seq_printf(s, " |"); 615 trace_seq_printf(s, " |");
592 ret = trace_seq_printf(s, "\n"); 616 ret = trace_seq_printf(s, "\n");
593 617
@@ -657,7 +681,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
657static enum print_line_t 681static enum print_line_t
658print_graph_entry_leaf(struct trace_iterator *iter, 682print_graph_entry_leaf(struct trace_iterator *iter,
659 struct ftrace_graph_ent_entry *entry, 683 struct ftrace_graph_ent_entry *entry,
660 struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) 684 struct ftrace_graph_ret_entry *ret_entry,
685 struct trace_seq *s, u32 flags)
661{ 686{
662 struct fgraph_data *data = iter->private; 687 struct fgraph_data *data = iter->private;
663 struct ftrace_graph_ret *graph_ret; 688 struct ftrace_graph_ret *graph_ret;
@@ -689,12 +714,12 @@ print_graph_entry_leaf(struct trace_iterator *iter,
689 } 714 }
690 715
691 /* Overhead */ 716 /* Overhead */
692 ret = print_graph_overhead(duration, s); 717 ret = print_graph_overhead(duration, s, flags);
693 if (!ret) 718 if (!ret)
694 return TRACE_TYPE_PARTIAL_LINE; 719 return TRACE_TYPE_PARTIAL_LINE;
695 720
696 /* Duration */ 721 /* Duration */
697 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 722 if (flags & TRACE_GRAPH_PRINT_DURATION) {
698 ret = print_graph_duration(duration, s); 723 ret = print_graph_duration(duration, s);
699 if (ret == TRACE_TYPE_PARTIAL_LINE) 724 if (ret == TRACE_TYPE_PARTIAL_LINE)
700 return TRACE_TYPE_PARTIAL_LINE; 725 return TRACE_TYPE_PARTIAL_LINE;
@@ -717,7 +742,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
717static enum print_line_t 742static enum print_line_t
718print_graph_entry_nested(struct trace_iterator *iter, 743print_graph_entry_nested(struct trace_iterator *iter,
719 struct ftrace_graph_ent_entry *entry, 744 struct ftrace_graph_ent_entry *entry,
720 struct trace_seq *s, int cpu) 745 struct trace_seq *s, int cpu, u32 flags)
721{ 746{
722 struct ftrace_graph_ent *call = &entry->graph_ent; 747 struct ftrace_graph_ent *call = &entry->graph_ent;
723 struct fgraph_data *data = iter->private; 748 struct fgraph_data *data = iter->private;
@@ -737,12 +762,12 @@ print_graph_entry_nested(struct trace_iterator *iter,
737 } 762 }
738 763
739 /* No overhead */ 764 /* No overhead */
740 ret = print_graph_overhead(-1, s); 765 ret = print_graph_overhead(-1, s, flags);
741 if (!ret) 766 if (!ret)
742 return TRACE_TYPE_PARTIAL_LINE; 767 return TRACE_TYPE_PARTIAL_LINE;
743 768
744 /* No time */ 769 /* No time */
745 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 770 if (flags & TRACE_GRAPH_PRINT_DURATION) {
746 ret = trace_seq_printf(s, " | "); 771 ret = trace_seq_printf(s, " | ");
747 if (!ret) 772 if (!ret)
748 return TRACE_TYPE_PARTIAL_LINE; 773 return TRACE_TYPE_PARTIAL_LINE;
@@ -768,7 +793,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
768 793
769static enum print_line_t 794static enum print_line_t
770print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, 795print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
771 int type, unsigned long addr) 796 int type, unsigned long addr, u32 flags)
772{ 797{
773 struct fgraph_data *data = iter->private; 798 struct fgraph_data *data = iter->private;
774 struct trace_entry *ent = iter->ent; 799 struct trace_entry *ent = iter->ent;
@@ -781,27 +806,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
781 806
782 if (type) { 807 if (type) {
783 /* Interrupt */ 808 /* Interrupt */
784 ret = print_graph_irq(iter, addr, type, cpu, ent->pid); 809 ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
785 if (ret == TRACE_TYPE_PARTIAL_LINE) 810 if (ret == TRACE_TYPE_PARTIAL_LINE)
786 return TRACE_TYPE_PARTIAL_LINE; 811 return TRACE_TYPE_PARTIAL_LINE;
787 } 812 }
788 813
789 /* Absolute time */ 814 /* Absolute time */
790 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 815 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
791 ret = print_graph_abs_time(iter->ts, s); 816 ret = print_graph_abs_time(iter->ts, s);
792 if (!ret) 817 if (!ret)
793 return TRACE_TYPE_PARTIAL_LINE; 818 return TRACE_TYPE_PARTIAL_LINE;
794 } 819 }
795 820
796 /* Cpu */ 821 /* Cpu */
797 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 822 if (flags & TRACE_GRAPH_PRINT_CPU) {
798 ret = print_graph_cpu(s, cpu); 823 ret = print_graph_cpu(s, cpu);
799 if (ret == TRACE_TYPE_PARTIAL_LINE) 824 if (ret == TRACE_TYPE_PARTIAL_LINE)
800 return TRACE_TYPE_PARTIAL_LINE; 825 return TRACE_TYPE_PARTIAL_LINE;
801 } 826 }
802 827
803 /* Proc */ 828 /* Proc */
804 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 829 if (flags & TRACE_GRAPH_PRINT_PROC) {
805 ret = print_graph_proc(s, ent->pid); 830 ret = print_graph_proc(s, ent->pid);
806 if (ret == TRACE_TYPE_PARTIAL_LINE) 831 if (ret == TRACE_TYPE_PARTIAL_LINE)
807 return TRACE_TYPE_PARTIAL_LINE; 832 return TRACE_TYPE_PARTIAL_LINE;
@@ -823,7 +848,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
823 848
824static enum print_line_t 849static enum print_line_t
825print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 850print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
826 struct trace_iterator *iter) 851 struct trace_iterator *iter, u32 flags)
827{ 852{
828 struct fgraph_data *data = iter->private; 853 struct fgraph_data *data = iter->private;
829 struct ftrace_graph_ent *call = &field->graph_ent; 854 struct ftrace_graph_ent *call = &field->graph_ent;
@@ -831,14 +856,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
831 static enum print_line_t ret; 856 static enum print_line_t ret;
832 int cpu = iter->cpu; 857 int cpu = iter->cpu;
833 858
834 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 859 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
835 return TRACE_TYPE_PARTIAL_LINE; 860 return TRACE_TYPE_PARTIAL_LINE;
836 861
837 leaf_ret = get_return_for_leaf(iter, field); 862 leaf_ret = get_return_for_leaf(iter, field);
838 if (leaf_ret) 863 if (leaf_ret)
839 ret = print_graph_entry_leaf(iter, field, leaf_ret, s); 864 ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
840 else 865 else
841 ret = print_graph_entry_nested(iter, field, s, cpu); 866 ret = print_graph_entry_nested(iter, field, s, cpu, flags);
842 867
843 if (data) { 868 if (data) {
844 /* 869 /*
@@ -857,7 +882,8 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
857 882
858static enum print_line_t 883static enum print_line_t
859print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, 884print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
860 struct trace_entry *ent, struct trace_iterator *iter) 885 struct trace_entry *ent, struct trace_iterator *iter,
886 u32 flags)
861{ 887{
862 unsigned long long duration = trace->rettime - trace->calltime; 888 unsigned long long duration = trace->rettime - trace->calltime;
863 struct fgraph_data *data = iter->private; 889 struct fgraph_data *data = iter->private;
@@ -887,16 +913,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
887 } 913 }
888 } 914 }
889 915
890 if (print_graph_prologue(iter, s, 0, 0)) 916 if (print_graph_prologue(iter, s, 0, 0, flags))
891 return TRACE_TYPE_PARTIAL_LINE; 917 return TRACE_TYPE_PARTIAL_LINE;
892 918
893 /* Overhead */ 919 /* Overhead */
894 ret = print_graph_overhead(duration, s); 920 ret = print_graph_overhead(duration, s, flags);
895 if (!ret) 921 if (!ret)
896 return TRACE_TYPE_PARTIAL_LINE; 922 return TRACE_TYPE_PARTIAL_LINE;
897 923
898 /* Duration */ 924 /* Duration */
899 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 925 if (flags & TRACE_GRAPH_PRINT_DURATION) {
900 ret = print_graph_duration(duration, s); 926 ret = print_graph_duration(duration, s);
901 if (ret == TRACE_TYPE_PARTIAL_LINE) 927 if (ret == TRACE_TYPE_PARTIAL_LINE)
902 return TRACE_TYPE_PARTIAL_LINE; 928 return TRACE_TYPE_PARTIAL_LINE;
@@ -920,20 +946,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
920 if (!ret) 946 if (!ret)
921 return TRACE_TYPE_PARTIAL_LINE; 947 return TRACE_TYPE_PARTIAL_LINE;
922 } else { 948 } else {
923 ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func); 949 ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
924 if (!ret) 950 if (!ret)
925 return TRACE_TYPE_PARTIAL_LINE; 951 return TRACE_TYPE_PARTIAL_LINE;
926 } 952 }
927 953
928 /* Overrun */ 954 /* Overrun */
929 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 955 if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
930 ret = trace_seq_printf(s, " (Overruns: %lu)\n", 956 ret = trace_seq_printf(s, " (Overruns: %lu)\n",
931 trace->overrun); 957 trace->overrun);
932 if (!ret) 958 if (!ret)
933 return TRACE_TYPE_PARTIAL_LINE; 959 return TRACE_TYPE_PARTIAL_LINE;
934 } 960 }
935 961
936 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid); 962 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
963 cpu, pid, flags);
937 if (ret == TRACE_TYPE_PARTIAL_LINE) 964 if (ret == TRACE_TYPE_PARTIAL_LINE)
938 return TRACE_TYPE_PARTIAL_LINE; 965 return TRACE_TYPE_PARTIAL_LINE;
939 966
@@ -941,8 +968,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
941} 968}
942 969
943static enum print_line_t 970static enum print_line_t
944print_graph_comment(struct trace_seq *s, struct trace_entry *ent, 971print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
945 struct trace_iterator *iter) 972 struct trace_iterator *iter, u32 flags)
946{ 973{
947 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 974 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
948 struct fgraph_data *data = iter->private; 975 struct fgraph_data *data = iter->private;
@@ -954,16 +981,16 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
954 if (data) 981 if (data)
955 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; 982 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
956 983
957 if (print_graph_prologue(iter, s, 0, 0)) 984 if (print_graph_prologue(iter, s, 0, 0, flags))
958 return TRACE_TYPE_PARTIAL_LINE; 985 return TRACE_TYPE_PARTIAL_LINE;
959 986
960 /* No overhead */ 987 /* No overhead */
961 ret = print_graph_overhead(-1, s); 988 ret = print_graph_overhead(-1, s, flags);
962 if (!ret) 989 if (!ret)
963 return TRACE_TYPE_PARTIAL_LINE; 990 return TRACE_TYPE_PARTIAL_LINE;
964 991
965 /* No time */ 992 /* No time */
966 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 993 if (flags & TRACE_GRAPH_PRINT_DURATION) {
967 ret = trace_seq_printf(s, " | "); 994 ret = trace_seq_printf(s, " | ");
968 if (!ret) 995 if (!ret)
969 return TRACE_TYPE_PARTIAL_LINE; 996 return TRACE_TYPE_PARTIAL_LINE;
@@ -1018,7 +1045,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1018 1045
1019 1046
1020enum print_line_t 1047enum print_line_t
1021print_graph_function(struct trace_iterator *iter) 1048print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1022{ 1049{
1023 struct ftrace_graph_ent_entry *field; 1050 struct ftrace_graph_ent_entry *field;
1024 struct fgraph_data *data = iter->private; 1051 struct fgraph_data *data = iter->private;
@@ -1039,7 +1066,7 @@ print_graph_function(struct trace_iterator *iter)
1039 if (data && data->failed) { 1066 if (data && data->failed) {
1040 field = &data->ent; 1067 field = &data->ent;
1041 iter->cpu = data->cpu; 1068 iter->cpu = data->cpu;
1042 ret = print_graph_entry(field, s, iter); 1069 ret = print_graph_entry(field, s, iter, flags);
1043 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { 1070 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1044 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; 1071 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1045 ret = TRACE_TYPE_NO_CONSUME; 1072 ret = TRACE_TYPE_NO_CONSUME;
@@ -1059,32 +1086,49 @@ print_graph_function(struct trace_iterator *iter)
1059 struct ftrace_graph_ent_entry saved; 1086 struct ftrace_graph_ent_entry saved;
1060 trace_assign_type(field, entry); 1087 trace_assign_type(field, entry);
1061 saved = *field; 1088 saved = *field;
1062 return print_graph_entry(&saved, s, iter); 1089 return print_graph_entry(&saved, s, iter, flags);
1063 } 1090 }
1064 case TRACE_GRAPH_RET: { 1091 case TRACE_GRAPH_RET: {
1065 struct ftrace_graph_ret_entry *field; 1092 struct ftrace_graph_ret_entry *field;
1066 trace_assign_type(field, entry); 1093 trace_assign_type(field, entry);
1067 return print_graph_return(&field->ret, s, entry, iter); 1094 return print_graph_return(&field->ret, s, entry, iter, flags);
1068 } 1095 }
1096 case TRACE_STACK:
1097 case TRACE_FN:
1098 /* don't print stack and function records as comments */
1099 return TRACE_TYPE_UNHANDLED;
1100
1069 default: 1101 default:
1070 return print_graph_comment(s, entry, iter); 1102 return print_graph_comment(s, entry, iter, flags);
1071 } 1103 }
1072 1104
1073 return TRACE_TYPE_HANDLED; 1105 return TRACE_TYPE_HANDLED;
1074} 1106}
1075 1107
1076static void print_lat_header(struct seq_file *s) 1108static enum print_line_t
1109print_graph_function(struct trace_iterator *iter)
1110{
1111 return print_graph_function_flags(iter, tracer_flags.val);
1112}
1113
1114static enum print_line_t
1115print_graph_function_event(struct trace_iterator *iter, int flags)
1116{
1117 return print_graph_function(iter);
1118}
1119
1120static void print_lat_header(struct seq_file *s, u32 flags)
1077{ 1121{
1078 static const char spaces[] = " " /* 16 spaces */ 1122 static const char spaces[] = " " /* 16 spaces */
1079 " " /* 4 spaces */ 1123 " " /* 4 spaces */
1080 " "; /* 17 spaces */ 1124 " "; /* 17 spaces */
1081 int size = 0; 1125 int size = 0;
1082 1126
1083 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1127 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1084 size += 16; 1128 size += 16;
1085 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1129 if (flags & TRACE_GRAPH_PRINT_CPU)
1086 size += 4; 1130 size += 4;
1087 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1131 if (flags & TRACE_GRAPH_PRINT_PROC)
1088 size += 17; 1132 size += 17;
1089 1133
1090 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); 1134 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
@@ -1095,43 +1139,48 @@ static void print_lat_header(struct seq_file *s)
1095 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1139 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1096} 1140}
1097 1141
1098static void print_graph_headers(struct seq_file *s) 1142void print_graph_headers_flags(struct seq_file *s, u32 flags)
1099{ 1143{
1100 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1144 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1101 1145
1102 if (lat) 1146 if (lat)
1103 print_lat_header(s); 1147 print_lat_header(s, flags);
1104 1148
1105 /* 1st line */ 1149 /* 1st line */
1106 seq_printf(s, "#"); 1150 seq_printf(s, "#");
1107 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1151 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1108 seq_printf(s, " TIME "); 1152 seq_printf(s, " TIME ");
1109 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1153 if (flags & TRACE_GRAPH_PRINT_CPU)
1110 seq_printf(s, " CPU"); 1154 seq_printf(s, " CPU");
1111 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1155 if (flags & TRACE_GRAPH_PRINT_PROC)
1112 seq_printf(s, " TASK/PID "); 1156 seq_printf(s, " TASK/PID ");
1113 if (lat) 1157 if (lat)
1114 seq_printf(s, "|||||"); 1158 seq_printf(s, "|||||");
1115 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1159 if (flags & TRACE_GRAPH_PRINT_DURATION)
1116 seq_printf(s, " DURATION "); 1160 seq_printf(s, " DURATION ");
1117 seq_printf(s, " FUNCTION CALLS\n"); 1161 seq_printf(s, " FUNCTION CALLS\n");
1118 1162
1119 /* 2nd line */ 1163 /* 2nd line */
1120 seq_printf(s, "#"); 1164 seq_printf(s, "#");
1121 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1165 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1122 seq_printf(s, " | "); 1166 seq_printf(s, " | ");
1123 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1167 if (flags & TRACE_GRAPH_PRINT_CPU)
1124 seq_printf(s, " | "); 1168 seq_printf(s, " | ");
1125 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1169 if (flags & TRACE_GRAPH_PRINT_PROC)
1126 seq_printf(s, " | | "); 1170 seq_printf(s, " | | ");
1127 if (lat) 1171 if (lat)
1128 seq_printf(s, "|||||"); 1172 seq_printf(s, "|||||");
1129 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1173 if (flags & TRACE_GRAPH_PRINT_DURATION)
1130 seq_printf(s, " | | "); 1174 seq_printf(s, " | | ");
1131 seq_printf(s, " | | | |\n"); 1175 seq_printf(s, " | | | |\n");
1132} 1176}
1133 1177
1134static void graph_trace_open(struct trace_iterator *iter) 1178void print_graph_headers(struct seq_file *s)
1179{
1180 print_graph_headers_flags(s, tracer_flags.val);
1181}
1182
1183void graph_trace_open(struct trace_iterator *iter)
1135{ 1184{
1136 /* pid and depth on the last trace processed */ 1185 /* pid and depth on the last trace processed */
1137 struct fgraph_data *data; 1186 struct fgraph_data *data;
@@ -1166,7 +1215,7 @@ static void graph_trace_open(struct trace_iterator *iter)
1166 pr_warning("function graph tracer: not enough memory\n"); 1215 pr_warning("function graph tracer: not enough memory\n");
1167} 1216}
1168 1217
1169static void graph_trace_close(struct trace_iterator *iter) 1218void graph_trace_close(struct trace_iterator *iter)
1170{ 1219{
1171 struct fgraph_data *data = iter->private; 1220 struct fgraph_data *data = iter->private;
1172 1221
@@ -1176,6 +1225,16 @@ static void graph_trace_close(struct trace_iterator *iter)
1176 } 1225 }
1177} 1226}
1178 1227
1228static struct trace_event graph_trace_entry_event = {
1229 .type = TRACE_GRAPH_ENT,
1230 .trace = print_graph_function_event,
1231};
1232
1233static struct trace_event graph_trace_ret_event = {
1234 .type = TRACE_GRAPH_RET,
1235 .trace = print_graph_function_event,
1236};
1237
1179static struct tracer graph_trace __read_mostly = { 1238static struct tracer graph_trace __read_mostly = {
1180 .name = "function_graph", 1239 .name = "function_graph",
1181 .open = graph_trace_open, 1240 .open = graph_trace_open,
@@ -1197,6 +1256,16 @@ static __init int init_graph_trace(void)
1197{ 1256{
1198 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); 1257 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1199 1258
1259 if (!register_ftrace_event(&graph_trace_entry_event)) {
1260 pr_warning("Warning: could not register graph trace events\n");
1261 return 1;
1262 }
1263
1264 if (!register_ftrace_event(&graph_trace_ret_event)) {
1265 pr_warning("Warning: could not register graph trace events\n");
1266 return 1;
1267 }
1268
1200 return register_tracer(&graph_trace); 1269 return register_tracer(&graph_trace);
1201} 1270}
1202 1271
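The init hunk registers one struct trace_event per graph record type, both pointing at the same print handler; register_ftrace_event() signals failure by returning 0. A userspace stand-in for that registration shape (the dispatch table is mine, not the kernel's):

    #include <stdio.h>

    struct trace_event {
        int type;
        int (*trace)(void); /* print handler */
    };

    #define MAX_EVENTS 16
    static struct trace_event *events[MAX_EVENTS];

    static int register_event(struct trace_event *ev)
    {
        if (ev->type <= 0 || ev->type >= MAX_EVENTS || events[ev->type])
            return 0; /* 0 = failure, matching the checks in the hunk */
        events[ev->type] = ev;
        return ev->type;
    }

    static int print_graph_ent(void) { return printf("graph entry\n"); }

    int main(void)
    {
        struct trace_event ent = { .type = 1, .trace = print_graph_ent };

        if (!register_event(&ent))
            fprintf(stderr, "could not register graph trace events\n");
        else
            events[ent.type]->trace();
        return 0;
    }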
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
deleted file mode 100644
index 7b97000745f5..000000000000
--- a/kernel/trace/trace_hw_branches.c
+++ /dev/null
@@ -1,312 +0,0 @@
1/*
2 * h/w branch tracer for x86 based on BTS
3 *
4 * Copyright (C) 2008-2009 Intel Corporation.
5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
6 */
7#include <linux/kallsyms.h>
8#include <linux/debugfs.h>
9#include <linux/ftrace.h>
10#include <linux/module.h>
11#include <linux/cpu.h>
12#include <linux/smp.h>
13#include <linux/fs.h>
14
15#include <asm/ds.h>
16
17#include "trace_output.h"
18#include "trace.h"
19
20
21#define BTS_BUFFER_SIZE (1 << 13)
22
23static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
25
26#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
27
28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly;
30static struct trace_array *hw_branch_trace __read_mostly;
31
32
33static void bts_trace_init_cpu(int cpu)
34{
35 per_cpu(hwb_tracer, cpu) =
36 ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
37 BTS_BUFFER_SIZE, NULL, (size_t)-1,
38 BTS_KERNEL);
39
40 if (IS_ERR(per_cpu(hwb_tracer, cpu)))
41 per_cpu(hwb_tracer, cpu) = NULL;
42}
43
44static int bts_trace_init(struct trace_array *tr)
45{
46 int cpu;
47
48 hw_branch_trace = tr;
49 trace_hw_branches_enabled = 0;
50
51 get_online_cpus();
52 for_each_online_cpu(cpu) {
53 bts_trace_init_cpu(cpu);
54
55 if (likely(per_cpu(hwb_tracer, cpu)))
56 trace_hw_branches_enabled = 1;
57 }
58 trace_hw_branches_suspended = 0;
59 put_online_cpus();
60
61 /* If we could not enable tracing on a single cpu, we fail. */
62 return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
63}
64
65static void bts_trace_reset(struct trace_array *tr)
66{
67 int cpu;
68
69 get_online_cpus();
70 for_each_online_cpu(cpu) {
71 if (likely(per_cpu(hwb_tracer, cpu))) {
72 ds_release_bts(per_cpu(hwb_tracer, cpu));
73 per_cpu(hwb_tracer, cpu) = NULL;
74 }
75 }
76 trace_hw_branches_enabled = 0;
77 trace_hw_branches_suspended = 0;
78 put_online_cpus();
79}
80
81static void bts_trace_start(struct trace_array *tr)
82{
83 int cpu;
84
85 get_online_cpus();
86 for_each_online_cpu(cpu)
87 if (likely(per_cpu(hwb_tracer, cpu)))
88 ds_resume_bts(per_cpu(hwb_tracer, cpu));
89 trace_hw_branches_suspended = 0;
90 put_online_cpus();
91}
92
93static void bts_trace_stop(struct trace_array *tr)
94{
95 int cpu;
96
97 get_online_cpus();
98 for_each_online_cpu(cpu)
99 if (likely(per_cpu(hwb_tracer, cpu)))
100 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
101 trace_hw_branches_suspended = 1;
102 put_online_cpus();
103}
104
105static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
106 unsigned long action, void *hcpu)
107{
108 int cpu = (long)hcpu;
109
110 switch (action) {
111 case CPU_ONLINE:
112 case CPU_DOWN_FAILED:
113 /* The notification is sent with interrupts enabled. */
114 if (trace_hw_branches_enabled) {
115 bts_trace_init_cpu(cpu);
116
117 if (trace_hw_branches_suspended &&
118 likely(per_cpu(hwb_tracer, cpu)))
119 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
120 }
121 break;
122
123 case CPU_DOWN_PREPARE:
124 /* The notification is sent with interrupts enabled. */
125 if (likely(per_cpu(hwb_tracer, cpu))) {
126 ds_release_bts(per_cpu(hwb_tracer, cpu));
127 per_cpu(hwb_tracer, cpu) = NULL;
128 }
129 }
130
131 return NOTIFY_DONE;
132}
133
134static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
135 .notifier_call = bts_hotcpu_handler
136};
137
138static void bts_trace_print_header(struct seq_file *m)
139{
140 seq_puts(m, "# CPU# TO <- FROM\n");
141}
142
143static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
144{
145 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
146 struct trace_entry *entry = iter->ent;
147 struct trace_seq *seq = &iter->seq;
148 struct hw_branch_entry *it;
149
150 trace_assign_type(it, entry);
151
152 if (entry->type == TRACE_HW_BRANCHES) {
153 if (trace_seq_printf(seq, "%4d ", iter->cpu) &&
154 seq_print_ip_sym(seq, it->to, symflags) &&
155 trace_seq_printf(seq, "\t <- ") &&
156 seq_print_ip_sym(seq, it->from, symflags) &&
157 trace_seq_printf(seq, "\n"))
158 return TRACE_TYPE_HANDLED;
159 return TRACE_TYPE_PARTIAL_LINE;
160 }
161 return TRACE_TYPE_UNHANDLED;
162}
163
164void trace_hw_branch(u64 from, u64 to)
165{
166 struct ftrace_event_call *call = &event_hw_branch;
167 struct trace_array *tr = hw_branch_trace;
168 struct ring_buffer_event *event;
169 struct ring_buffer *buf;
170 struct hw_branch_entry *entry;
171 unsigned long irq1;
172 int cpu;
173
174 if (unlikely(!tr))
175 return;
176
177 if (unlikely(!trace_hw_branches_enabled))
178 return;
179
180 local_irq_save(irq1);
181 cpu = raw_smp_processor_id();
182 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
183 goto out;
184
185 buf = tr->buffer;
186 event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
187 sizeof(*entry), 0, 0);
188 if (!event)
189 goto out;
190 entry = ring_buffer_event_data(event);
191 tracing_generic_entry_update(&entry->ent, 0, from);
192 entry->ent.type = TRACE_HW_BRANCHES;
193 entry->from = from;
194 entry->to = to;
195 if (!filter_check_discard(call, entry, buf, event))
196 trace_buffer_unlock_commit(buf, event, 0, 0);
197
198 out:
199 atomic_dec(&tr->data[cpu]->disabled);
200 local_irq_restore(irq1);
201}
202
203static void trace_bts_at(const struct bts_trace *trace, void *at)
204{
205 struct bts_struct bts;
206 int err = 0;
207
208 WARN_ON_ONCE(!trace->read);
209 if (!trace->read)
210 return;
211
212 err = trace->read(this_tracer, at, &bts);
213 if (err < 0)
214 return;
215
216 switch (bts.qualifier) {
217 case BTS_BRANCH:
218 trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
219 break;
220 }
221}
222
223/*
224 * Collect the trace on the current cpu and write it into the ftrace buffer.
225 *
226 * pre: tracing must be suspended on the current cpu
227 */
228static void trace_bts_cpu(void *arg)
229{
230 struct trace_array *tr = (struct trace_array *)arg;
231 const struct bts_trace *trace;
232 unsigned char *at;
233
234 if (unlikely(!tr))
235 return;
236
237 if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
238 return;
239
240 if (unlikely(!this_tracer))
241 return;
242
243 trace = ds_read_bts(this_tracer);
244 if (!trace)
245 return;
246
247 for (at = trace->ds.top; (void *)at < trace->ds.end;
248 at += trace->ds.size)
249 trace_bts_at(trace, at);
250
251 for (at = trace->ds.begin; (void *)at < trace->ds.top;
252 at += trace->ds.size)
253 trace_bts_at(trace, at);
254}
255
256static void trace_bts_prepare(struct trace_iterator *iter)
257{
258 int cpu;
259
260 get_online_cpus();
261 for_each_online_cpu(cpu)
262 if (likely(per_cpu(hwb_tracer, cpu)))
263 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
264 /*
265 * We need to collect the trace on the respective cpu since ftrace
266 * implicitly adds the record for the current cpu.
267 * Once that is more flexible, we could collect the data from any cpu.
268 */
269 on_each_cpu(trace_bts_cpu, iter->tr, 1);
270
271 for_each_online_cpu(cpu)
272 if (likely(per_cpu(hwb_tracer, cpu)))
273 ds_resume_bts(per_cpu(hwb_tracer, cpu));
274 put_online_cpus();
275}
276
277static void trace_bts_close(struct trace_iterator *iter)
278{
279 tracing_reset_online_cpus(iter->tr);
280}
281
282void trace_hw_branch_oops(void)
283{
284 if (this_tracer) {
285 ds_suspend_bts_noirq(this_tracer);
286 trace_bts_cpu(hw_branch_trace);
287 ds_resume_bts_noirq(this_tracer);
288 }
289}
290
291struct tracer bts_tracer __read_mostly =
292{
293 .name = "hw-branch-tracer",
294 .init = bts_trace_init,
295 .reset = bts_trace_reset,
296 .print_header = bts_trace_print_header,
297 .print_line = bts_trace_print_line,
298 .start = bts_trace_start,
299 .stop = bts_trace_stop,
300 .open = trace_bts_prepare,
301 .close = trace_bts_close,
302#ifdef CONFIG_FTRACE_SELFTEST
303 .selftest = trace_selftest_startup_hw_branches,
304#endif /* CONFIG_FTRACE_SELFTEST */
305};
306
307__init static int init_bts_trace(void)
308{
309 register_hotcpu_notifier(&bts_hotcpu_notifier);
310 return register_tracer(&bts_tracer);
311}
312device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2974bc7538c7..6fd486e0cef4 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -34,6 +34,9 @@ static int trace_type __read_mostly;
34 34
35static int save_lat_flag; 35static int save_lat_flag;
36 36
37static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
38static int start_irqsoff_tracer(struct trace_array *tr, int graph);
39
37#ifdef CONFIG_PREEMPT_TRACER 40#ifdef CONFIG_PREEMPT_TRACER
38static inline int 41static inline int
39preempt_trace(void) 42preempt_trace(void)
@@ -55,6 +58,23 @@ irq_trace(void)
55# define irq_trace() (0) 58# define irq_trace() (0)
56#endif 59#endif
57 60
61#define TRACE_DISPLAY_GRAPH 1
62
63static struct tracer_opt trace_opts[] = {
64#ifdef CONFIG_FUNCTION_GRAPH_TRACER
65 /* display latency trace as call graph */
66 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
67#endif
68 { } /* Empty entry */
69};
70
71static struct tracer_flags tracer_flags = {
72 .val = 0,
73 .opts = trace_opts,
74};
75
76#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
77
58/* 78/*
59 * Sequence count - we record it when starting a measurement and 79 * Sequence count - we record it when starting a measurement and
60 * skip the latency if the sequence has changed - some other section 80 * skip the latency if the sequence has changed - some other section
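The display-graph option added above follows the standard tracer_opt pattern: a named bit in a flag word, tested through the is_graph() macro wherever a mode decision is needed. In miniature:

    #include <stddef.h>
    #include <stdio.h>

    #define TRACE_DISPLAY_GRAPH 1

    struct tracer_opt   { const char *name; unsigned int bit; };
    struct tracer_flags { unsigned int val; struct tracer_opt *opts; };

    static struct tracer_opt opts[] = {
        { "display-graph", TRACE_DISPLAY_GRAPH },
        { NULL, 0 } /* empty terminator, as in the hunk */
    };
    static struct tracer_flags flags = { 0, opts };

    #define is_graph() (flags.val & TRACE_DISPLAY_GRAPH)

    int main(void)
    {
        printf("graph mode: %d\n", is_graph()); /* 0 */
        flags.val |= TRACE_DISPLAY_GRAPH;       /* i.e. writing the option file */
        printf("graph mode: %d\n", is_graph()); /* 1 */
        return 0;
    }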
@@ -108,6 +128,202 @@ static struct ftrace_ops trace_ops __read_mostly =
108}; 128};
109#endif /* CONFIG_FUNCTION_TRACER */ 129#endif /* CONFIG_FUNCTION_TRACER */
110 130
131#ifdef CONFIG_FUNCTION_GRAPH_TRACER
132static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
133{
134 int cpu;
135
136 if (!(bit & TRACE_DISPLAY_GRAPH))
137 return -EINVAL;
138
139 if (!(is_graph() ^ set))
140 return 0;
141
142 stop_irqsoff_tracer(irqsoff_trace, !set);
143
144 for_each_possible_cpu(cpu)
145 per_cpu(tracing_cpu, cpu) = 0;
146
147 tracing_max_latency = 0;
148 tracing_reset_online_cpus(irqsoff_trace);
149
150 return start_irqsoff_tracer(irqsoff_trace, set);
151}
152
153static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
154{
155 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data;
157 unsigned long flags;
158 long disabled;
159 int ret;
160 int cpu;
161 int pc;
162
163 cpu = raw_smp_processor_id();
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0;
166
167 local_save_flags(flags);
168 /* slight chance to get a false positive on tracing_cpu */
169 if (!irqs_disabled_flags(flags))
170 return 0;
171
172 data = tr->data[cpu];
173 disabled = atomic_inc_return(&data->disabled);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled);
182 return ret;
183}
184
185static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
186{
187 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data;
189 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc;
193
194 cpu = raw_smp_processor_id();
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return;
197
198 local_save_flags(flags);
199 /* slight chance to get a false positive on tracing_cpu */
200 if (!irqs_disabled_flags(flags))
201 return;
202
203 data = tr->data[cpu];
204 disabled = atomic_inc_return(&data->disabled);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled);
212}
213
214static void irqsoff_trace_open(struct trace_iterator *iter)
215{
216 if (is_graph())
217 graph_trace_open(iter);
218
219}
220
221static void irqsoff_trace_close(struct trace_iterator *iter)
222{
223 if (iter->private)
224 graph_trace_close(iter);
225}
226
227#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
228 TRACE_GRAPH_PRINT_PROC)
229
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /*
240 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler
242 */
243 if (is_graph())
244 return print_graph_function_flags(iter, flags);
245
246 return TRACE_TYPE_UNHANDLED;
247}
248
249static void irqsoff_print_header(struct seq_file *s)
250{
251 if (is_graph()) {
252 struct trace_iterator *iter = s->private;
253 u32 flags = GRAPH_TRACER_FLAGS;
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s);
268}
269
270static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc)
294{
295 if (!is_graph())
296 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301}
302
303#else
304#define __trace_function trace_function
305
306static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
307{
308 return -EINVAL;
309}
310
311static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
312{
313 return -1;
314}
315
316static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
317{
318 return TRACE_TYPE_UNHANDLED;
319}
320
321static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
322static void irqsoff_print_header(struct seq_file *s) { }
323static void irqsoff_trace_open(struct trace_iterator *iter) { }
324static void irqsoff_trace_close(struct trace_iterator *iter) { }
325#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
326
111/* 327/*
112 * Should this new latency be reported/recorded? 328 * Should this new latency be reported/recorded?
113 */ 329 */
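The most interesting trick in the block above is trace_graph_function(): when graph mode is on, a plain function hit is represented as an entry/return pair sharing one timestamp, so it renders as a zero-duration leaf in the graph output. Modeled with simplified structs (not the kernel's):

    #include <stdio.h>

    struct graph_ent { unsigned long func; int depth; };
    struct graph_ret { unsigned long func; int depth;
                       unsigned long long calltime, rettime; };

    static void emit_entry(struct graph_ent *e)  { printf("-> %#lx\n", e->func); }
    static void emit_return(struct graph_ret *r)
    {
        printf("<- %#lx (%llu ns)\n", r->func, r->rettime - r->calltime);
    }

    static void trace_graph_function(unsigned long ip, unsigned long long now)
    {
        struct graph_ent ent = { ip, 0 };
        struct graph_ret ret = { ip, 0, now, now };

        emit_entry(&ent);
        emit_return(&ret); /* duration is exactly zero */
    }

    int main(void)
    {
        trace_graph_function(0xc0ffee, 123456789ULL);
        return 0;
    }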
@@ -150,7 +366,7 @@ check_critical_timing(struct trace_array *tr,
150 if (!report_latency(delta)) 366 if (!report_latency(delta))
151 goto out_unlock; 367 goto out_unlock;
152 368
153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 369 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
154 /* Skip 5 functions to get to the irq/preempt enable function */ 370 /* Skip 5 functions to get to the irq/preempt enable function */
155 __trace_stack(tr, flags, 5, pc); 371 __trace_stack(tr, flags, 5, pc);
156 372
@@ -172,7 +388,7 @@ out_unlock:
172out: 388out:
173 data->critical_sequence = max_sequence; 389 data->critical_sequence = max_sequence;
174 data->preempt_timestamp = ftrace_now(cpu); 390 data->preempt_timestamp = ftrace_now(cpu);
175 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 391 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
176} 392}
177 393
178static inline void 394static inline void
@@ -204,7 +420,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
204 420
205 local_save_flags(flags); 421 local_save_flags(flags);
206 422
207 trace_function(tr, ip, parent_ip, flags, preempt_count()); 423 __trace_function(tr, ip, parent_ip, flags, preempt_count());
208 424
209 per_cpu(tracing_cpu, cpu) = 1; 425 per_cpu(tracing_cpu, cpu) = 1;
210 426
@@ -238,7 +454,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
238 atomic_inc(&data->disabled); 454 atomic_inc(&data->disabled);
239 455
240 local_save_flags(flags); 456 local_save_flags(flags);
241 trace_function(tr, ip, parent_ip, flags, preempt_count()); 457 __trace_function(tr, ip, parent_ip, flags, preempt_count());
242 check_critical_timing(tr, data, parent_ip ? : ip, cpu); 458 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
243 data->critical_start = 0; 459 data->critical_start = 0;
244 atomic_dec(&data->disabled); 460 atomic_dec(&data->disabled);
@@ -347,19 +563,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
347} 563}
348#endif /* CONFIG_PREEMPT_TRACER */ 564#endif /* CONFIG_PREEMPT_TRACER */
349 565
350static void start_irqsoff_tracer(struct trace_array *tr) 566static int start_irqsoff_tracer(struct trace_array *tr, int graph)
351{ 567{
352 register_ftrace_function(&trace_ops); 568 int ret = 0;
353 if (tracing_is_enabled()) 569
570 if (!graph)
571 ret = register_ftrace_function(&trace_ops);
572 else
573 ret = register_ftrace_graph(&irqsoff_graph_return,
574 &irqsoff_graph_entry);
575
576 if (!ret && tracing_is_enabled())
354 tracer_enabled = 1; 577 tracer_enabled = 1;
355 else 578 else
356 tracer_enabled = 0; 579 tracer_enabled = 0;
580
581 return ret;
357} 582}
358 583
359static void stop_irqsoff_tracer(struct trace_array *tr) 584static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
360{ 585{
361 tracer_enabled = 0; 586 tracer_enabled = 0;
362 unregister_ftrace_function(&trace_ops); 587
588 if (!graph)
589 unregister_ftrace_function(&trace_ops);
590 else
591 unregister_ftrace_graph();
363} 592}
364 593
365static void __irqsoff_tracer_init(struct trace_array *tr) 594static void __irqsoff_tracer_init(struct trace_array *tr)
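Together with irqsoff_set_flag() earlier, start/stop now pick which callback set to register, and toggling the option tears the tracer down in its old mode before restarting it in the new one. The toggle sequence, reduced to its shape (stubs, not kernel code):

    #include <stdio.h>

    static int graph_mode;

    static void stop_tracer(int graph)  { printf("stop (graph=%d)\n", graph); }
    static int  start_tracer(int graph) { printf("start (graph=%d)\n", graph); return 0; }

    static int set_graph(int set)
    {
        if (!(graph_mode ^ set))
            return 0;             /* no change requested: nothing to do */
        stop_tracer(graph_mode);  /* unregister the old callbacks */
        /* ... reset max latency and per-cpu state here ... */
        graph_mode = set;
        return start_tracer(set); /* register the new callbacks */
    }

    int main(void)
    {
        set_graph(1);
        set_graph(1); /* idempotent */
        set_graph(0);
        return 0;
    }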
@@ -372,12 +601,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
372 /* make sure that the tracer is visible */ 601 /* make sure that the tracer is visible */
373 smp_wmb(); 602 smp_wmb();
374 tracing_reset_online_cpus(tr); 603 tracing_reset_online_cpus(tr);
375 start_irqsoff_tracer(tr); 604
605 if (start_irqsoff_tracer(tr, is_graph()))
606 printk(KERN_ERR "failed to start irqsoff tracer\n");
376} 607}
377 608
378static void irqsoff_tracer_reset(struct trace_array *tr) 609static void irqsoff_tracer_reset(struct trace_array *tr)
379{ 610{
380 stop_irqsoff_tracer(tr); 611 stop_irqsoff_tracer(tr, is_graph());
381 612
382 if (!save_lat_flag) 613 if (!save_lat_flag)
383 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 614 trace_flags &= ~TRACE_ITER_LATENCY_FMT;
@@ -409,9 +640,15 @@ static struct tracer irqsoff_tracer __read_mostly =
409 .start = irqsoff_tracer_start, 640 .start = irqsoff_tracer_start,
410 .stop = irqsoff_tracer_stop, 641 .stop = irqsoff_tracer_stop,
411 .print_max = 1, 642 .print_max = 1,
643 .print_header = irqsoff_print_header,
644 .print_line = irqsoff_print_line,
645 .flags = &tracer_flags,
646 .set_flag = irqsoff_set_flag,
412#ifdef CONFIG_FTRACE_SELFTEST 647#ifdef CONFIG_FTRACE_SELFTEST
413 .selftest = trace_selftest_startup_irqsoff, 648 .selftest = trace_selftest_startup_irqsoff,
414#endif 649#endif
650 .open = irqsoff_trace_open,
651 .close = irqsoff_trace_close,
415}; 652};
416# define register_irqsoff(trace) register_tracer(&trace) 653# define register_irqsoff(trace) register_tracer(&trace)
417#else 654#else
@@ -435,9 +672,15 @@ static struct tracer preemptoff_tracer __read_mostly =
435 .start = irqsoff_tracer_start, 672 .start = irqsoff_tracer_start,
436 .stop = irqsoff_tracer_stop, 673 .stop = irqsoff_tracer_stop,
437 .print_max = 1, 674 .print_max = 1,
675 .print_header = irqsoff_print_header,
676 .print_line = irqsoff_print_line,
677 .flags = &tracer_flags,
678 .set_flag = irqsoff_set_flag,
438#ifdef CONFIG_FTRACE_SELFTEST 679#ifdef CONFIG_FTRACE_SELFTEST
439 .selftest = trace_selftest_startup_preemptoff, 680 .selftest = trace_selftest_startup_preemptoff,
440#endif 681#endif
682 .open = irqsoff_trace_open,
683 .close = irqsoff_trace_close,
441}; 684};
442# define register_preemptoff(trace) register_tracer(&trace) 685# define register_preemptoff(trace) register_tracer(&trace)
443#else 686#else
@@ -463,9 +706,15 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
463 .start = irqsoff_tracer_start, 706 .start = irqsoff_tracer_start,
464 .stop = irqsoff_tracer_stop, 707 .stop = irqsoff_tracer_stop,
465 .print_max = 1, 708 .print_max = 1,
709 .print_header = irqsoff_print_header,
710 .print_line = irqsoff_print_line,
711 .flags = &tracer_flags,
712 .set_flag = irqsoff_set_flag,
466#ifdef CONFIG_FTRACE_SELFTEST 713#ifdef CONFIG_FTRACE_SELFTEST
467 .selftest = trace_selftest_startup_preemptirqsoff, 714 .selftest = trace_selftest_startup_preemptirqsoff,
468#endif 715#endif
716 .open = irqsoff_trace_open,
717 .close = irqsoff_trace_close,
469}; 718};
470 719
471# define register_preemptirqsoff(trace) register_tracer(&trace) 720# define register_preemptirqsoff(trace) register_tracer(&trace)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 505c92273b1a..a7514326052b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -29,6 +29,8 @@
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/ptrace.h> 30#include <linux/ptrace.h>
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h>
33#include <asm/bitsperlong.h>
32 34
33#include "trace.h" 35#include "trace.h"
34#include "trace_output.h" 36#include "trace_output.h"
@@ -40,7 +42,6 @@
40 42
41/* Reserved field names */ 43/* Reserved field names */
42#define FIELD_STRING_IP "__probe_ip" 44#define FIELD_STRING_IP "__probe_ip"
43#define FIELD_STRING_NARGS "__probe_nargs"
44#define FIELD_STRING_RETIP "__probe_ret_ip" 45#define FIELD_STRING_RETIP "__probe_ret_ip"
45#define FIELD_STRING_FUNC "__probe_func" 46#define FIELD_STRING_FUNC "__probe_func"
46 47
@@ -52,56 +53,102 @@ const char *reserved_field_names[] = {
52 "common_tgid", 53 "common_tgid",
53 "common_lock_depth", 54 "common_lock_depth",
54 FIELD_STRING_IP, 55 FIELD_STRING_IP,
55 FIELD_STRING_NARGS,
56 FIELD_STRING_RETIP, 56 FIELD_STRING_RETIP,
57 FIELD_STRING_FUNC, 57 FIELD_STRING_FUNC,
58}; 58};
59 59
60struct fetch_func { 60/* Printing function type */
61 unsigned long (*func)(struct pt_regs *, void *); 61typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *);
62#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
63#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
64
65/* Printing in basic type function template */
66#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
67static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
68 const char *name, void *data)\
69{ \
70 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
71} \
72static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
73
74DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
75DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
76DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
77DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
78DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
79DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
80DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
82
83/* Data fetch function type */
84typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
85
86struct fetch_param {
87 fetch_func_t fn;
62 void *data; 88 void *data;
63}; 89};
64 90
65static __kprobes unsigned long call_fetch(struct fetch_func *f, 91static __kprobes void call_fetch(struct fetch_param *fprm,
66 struct pt_regs *regs) 92 struct pt_regs *regs, void *dest)
67{ 93{
68 return f->func(regs, f->data); 94 return fprm->fn(regs, fprm->data, dest);
69} 95}
70 96
71/* fetch handlers */ 97#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type
72static __kprobes unsigned long fetch_register(struct pt_regs *regs, 98/*
73 void *offset) 99 * Define macro for basic types - we don't need to define s* types, because
74{ 100 * we have to care only about bitwidth at recording time.
75 return regs_get_register(regs, (unsigned int)((unsigned long)offset)); 101 */
102#define DEFINE_BASIC_FETCH_FUNCS(kind) \
103DEFINE_FETCH_##kind(u8) \
104DEFINE_FETCH_##kind(u16) \
105DEFINE_FETCH_##kind(u32) \
106DEFINE_FETCH_##kind(u64)
107
108#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \
109 ((FETCH_FUNC_NAME(kind, u8) == fn) || \
110 (FETCH_FUNC_NAME(kind, u16) == fn) || \
111 (FETCH_FUNC_NAME(kind, u32) == fn) || \
112 (FETCH_FUNC_NAME(kind, u64) == fn))
113
114/* Data fetch function templates */
115#define DEFINE_FETCH_reg(type) \
116static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
117 void *offset, void *dest) \
118{ \
119 *(type *)dest = (type)regs_get_register(regs, \
120 (unsigned int)((unsigned long)offset)); \
76} 121}
77 122DEFINE_BASIC_FETCH_FUNCS(reg)
78static __kprobes unsigned long fetch_stack(struct pt_regs *regs, 123
79 void *num) 124#define DEFINE_FETCH_stack(type) \
80{ 125static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
81 return regs_get_kernel_stack_nth(regs, 126 void *offset, void *dest) \
82 (unsigned int)((unsigned long)num)); 127{ \
128 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
129 (unsigned int)((unsigned long)offset)); \
83} 130}
131DEFINE_BASIC_FETCH_FUNCS(stack)
84 132
85static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) 133#define DEFINE_FETCH_retval(type) \
86{ 134static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
87 unsigned long retval; 135 void *dummy, void *dest) \
88 136{ \
89 if (probe_kernel_address(addr, retval)) 137 *(type *)dest = (type)regs_return_value(regs); \
90 return 0;
91 return retval;
92} 138}
93 139DEFINE_BASIC_FETCH_FUNCS(retval)
94static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, 140
95 void *dummy) 141#define DEFINE_FETCH_memory(type) \
96{ 142static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
97 return regs_return_value(regs); 143 void *addr, void *dest) \
98} 144{ \
99 145 type retval; \
100static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, 146 if (probe_kernel_address(addr, retval)) \
101 void *dummy) 147 *(type *)dest = 0; \
102{ 148 else \
103 return kernel_stack_pointer(regs); 149 *(type *)dest = retval; \
104} 150}
151DEFINE_BASIC_FETCH_FUNCS(memory)
105 152
106/* Memory fetching by symbol */ 153/* Memory fetching by symbol */
107struct symbol_cache { 154struct symbol_cache {
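All the fetch functions above are stamped out from one template per access kind, parameterized on the destination width; each variant writes through a void * so the core code can stay type-agnostic. The technique, standalone (read_reg is a made-up stand-in for regs_get_register()):

    #include <stdint.h>
    #include <stdio.h>

    static unsigned long read_reg(int no) { return 0xdeadbeefUL + no; }

    #define DEFINE_FETCH_reg(type) \
    static void fetch_reg_##type(int no, void *dest) \
    { \
        *(type *)dest = (type)read_reg(no); \
    }

    DEFINE_FETCH_reg(uint8_t)
    DEFINE_FETCH_reg(uint16_t)
    DEFINE_FETCH_reg(uint32_t)
    DEFINE_FETCH_reg(uint64_t)

    int main(void)
    {
        uint8_t b;
        uint64_t q;

        fetch_reg_uint8_t(0, &b);  /* truncates to the low byte */
        fetch_reg_uint64_t(0, &q);
        printf("u8=%#x u64=%#llx\n", b, (unsigned long long)q);
        return 0;
    }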
@@ -145,51 +192,126 @@ static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
145 return sc; 192 return sc;
146} 193}
147 194
148static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) 195#define DEFINE_FETCH_symbol(type) \
149{ 196static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
150 struct symbol_cache *sc = data; 197 void *data, void *dest) \
151 198{ \
152 if (sc->addr) 199 struct symbol_cache *sc = data; \
153 return fetch_memory(regs, (void *)sc->addr); 200 if (sc->addr) \
154 else 201 fetch_memory_##type(regs, (void *)sc->addr, dest); \
155 return 0; 202 else \
203 *(type *)dest = 0; \
156} 204}
205DEFINE_BASIC_FETCH_FUNCS(symbol)
157 206
158/* Special indirect memory access interface */ 207/* Dereference memory access function */
159struct indirect_fetch_data { 208struct deref_fetch_param {
160 struct fetch_func orig; 209 struct fetch_param orig;
161 long offset; 210 long offset;
162}; 211};
163 212
164static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) 213#define DEFINE_FETCH_deref(type) \
165{ 214static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
166 struct indirect_fetch_data *ind = data; 215 void *data, void *dest) \
167 unsigned long addr; 216{ \
168 217 struct deref_fetch_param *dprm = data; \
169 addr = call_fetch(&ind->orig, regs); 218 unsigned long addr; \
170 if (addr) { 219 call_fetch(&dprm->orig, regs, &addr); \
171 addr += ind->offset; 220 if (addr) { \
172 return fetch_memory(regs, (void *)addr); 221 addr += dprm->offset; \
173 } else 222 fetch_memory_##type(regs, (void *)addr, dest); \
174 return 0; 223 } else \
224 *(type *)dest = 0; \
175} 225}
226DEFINE_BASIC_FETCH_FUNCS(deref)
176 227
177static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) 228static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
178{ 229{
179 if (data->orig.func == fetch_indirect) 230 if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn))
180 free_indirect_fetch_data(data->orig.data); 231 free_deref_fetch_param(data->orig.data);
181 else if (data->orig.func == fetch_symbol) 232 else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn))
182 free_symbol_cache(data->orig.data); 233 free_symbol_cache(data->orig.data);
183 kfree(data); 234 kfree(data);
184} 235}
185 236
237/* Default (unsigned long) fetch type */
238#define __DEFAULT_FETCH_TYPE(t) u##t
239#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
240#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
241#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
242
243#define ASSIGN_FETCH_FUNC(kind, type) \
244 .kind = FETCH_FUNC_NAME(kind, type)
245
246#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
247 {.name = #ptype, \
248 .size = sizeof(ftype), \
249 .is_signed = sign, \
250 .print = PRINT_TYPE_FUNC_NAME(ptype), \
251 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
252ASSIGN_FETCH_FUNC(reg, ftype), \
253ASSIGN_FETCH_FUNC(stack, ftype), \
254ASSIGN_FETCH_FUNC(retval, ftype), \
255ASSIGN_FETCH_FUNC(memory, ftype), \
256ASSIGN_FETCH_FUNC(symbol, ftype), \
257ASSIGN_FETCH_FUNC(deref, ftype), \
258 }
259
260/* Fetch type information table */
261static const struct fetch_type {
262 const char *name; /* Name of type */
263 size_t size; /* Byte size of type */
264 int is_signed; /* Signed flag */
265 print_type_func_t print; /* Print functions */
266 const char *fmt; /* Format string */
267 /* Fetch functions */
268 fetch_func_t reg;
269 fetch_func_t stack;
270 fetch_func_t retval;
271 fetch_func_t memory;
272 fetch_func_t symbol;
273 fetch_func_t deref;
274} fetch_type_table[] = {
275 ASSIGN_FETCH_TYPE(u8, u8, 0),
276 ASSIGN_FETCH_TYPE(u16, u16, 0),
277 ASSIGN_FETCH_TYPE(u32, u32, 0),
278 ASSIGN_FETCH_TYPE(u64, u64, 0),
279 ASSIGN_FETCH_TYPE(s8, u8, 1),
280 ASSIGN_FETCH_TYPE(s16, u16, 1),
281 ASSIGN_FETCH_TYPE(s32, u32, 1),
282 ASSIGN_FETCH_TYPE(s64, u64, 1),
283};
284
285static const struct fetch_type *find_fetch_type(const char *type)
286{
287 int i;
288
289 if (!type)
290 type = DEFAULT_FETCH_TYPE_STR;
291
292 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
293 if (strcmp(type, fetch_type_table[i].name) == 0)
294 return &fetch_type_table[i];
295 return NULL;
296}
297
298/* Special function: only accepts unsigned long */
299static __kprobes void fetch_stack_address(struct pt_regs *regs,
300 void *dummy, void *dest)
301{
302 *(unsigned long *)dest = kernel_stack_pointer(regs);
303}
304
186/** 305/**
187 * Kprobe event core functions 306 * Kprobe event core functions
188 */ 307 */
189 308
190struct probe_arg { 309struct probe_arg {
191 struct fetch_func fetch; 310 struct fetch_param fetch;
192 const char *name; 311 unsigned int offset; /* Offset from argument entry */
312 const char *name; /* Name of this argument */
313 const char *comm; /* Command of this argument */
314 const struct fetch_type *type; /* Type of this argument */
193}; 315};
194 316
195/* Flags for trace_probe */ 317/* Flags for trace_probe */
@@ -204,6 +326,7 @@ struct trace_probe {
204 const char *symbol; /* symbol name */ 326 const char *symbol; /* symbol name */
205 struct ftrace_event_call call; 327 struct ftrace_event_call call;
206 struct trace_event event; 328 struct trace_event event;
329 ssize_t size; /* trace entry size */
207 unsigned int nr_args; 330 unsigned int nr_args;
208 struct probe_arg args[]; 331 struct probe_arg args[];
209}; 332};
@@ -212,6 +335,7 @@ struct trace_probe {
212 (offsetof(struct trace_probe, args) + \ 335 (offsetof(struct trace_probe, args) + \
213 (sizeof(struct probe_arg) * (n))) 336 (sizeof(struct probe_arg) * (n)))
214 337
338
215static __kprobes int probe_is_return(struct trace_probe *tp) 339static __kprobes int probe_is_return(struct trace_probe *tp)
216{ 340{
217 return tp->rp.handler != NULL; 341 return tp->rp.handler != NULL;
@@ -222,49 +346,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp)
222 return tp->symbol ? tp->symbol : "unknown"; 346 return tp->symbol ? tp->symbol : "unknown";
223} 347}
224 348
225static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
226{
227 int ret = -EINVAL;
228
229 if (ff->func == fetch_register) {
230 const char *name;
231 name = regs_query_register_name((unsigned int)((long)ff->data));
232 ret = snprintf(buf, n, "%%%s", name);
233 } else if (ff->func == fetch_stack)
234 ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
235 else if (ff->func == fetch_memory)
236 ret = snprintf(buf, n, "@0x%p", ff->data);
237 else if (ff->func == fetch_symbol) {
238 struct symbol_cache *sc = ff->data;
239 if (sc->offset)
240 ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
241 sc->offset);
242 else
243 ret = snprintf(buf, n, "@%s", sc->symbol);
244 } else if (ff->func == fetch_retvalue)
245 ret = snprintf(buf, n, "$retval");
246 else if (ff->func == fetch_stack_address)
247 ret = snprintf(buf, n, "$stack");
248 else if (ff->func == fetch_indirect) {
249 struct indirect_fetch_data *id = ff->data;
250 size_t l = 0;
251 ret = snprintf(buf, n, "%+ld(", id->offset);
252 if (ret >= n)
253 goto end;
254 l += ret;
255 ret = probe_arg_string(buf + l, n - l, &id->orig);
256 if (ret < 0)
257 goto end;
258 l += ret;
259 ret = snprintf(buf + l, n - l, ")");
260 ret += l;
261 }
262end:
263 if (ret >= n)
264 return -ENOSPC;
265 return ret;
266}
267
268static int register_probe_event(struct trace_probe *tp); 349static int register_probe_event(struct trace_probe *tp);
269static void unregister_probe_event(struct trace_probe *tp); 350static void unregister_probe_event(struct trace_probe *tp);
270 351
@@ -347,11 +428,12 @@ error:
347 428
348static void free_probe_arg(struct probe_arg *arg) 429static void free_probe_arg(struct probe_arg *arg)
349{ 430{
350 if (arg->fetch.func == fetch_symbol) 431 if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn))
432 free_deref_fetch_param(arg->fetch.data);
433 else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn))
351 free_symbol_cache(arg->fetch.data); 434 free_symbol_cache(arg->fetch.data);
352 else if (arg->fetch.func == fetch_indirect)
353 free_indirect_fetch_data(arg->fetch.data);
354 kfree(arg->name); 435 kfree(arg->name);
436 kfree(arg->comm);
355} 437}
356 438
357static void free_trace_probe(struct trace_probe *tp) 439static void free_trace_probe(struct trace_probe *tp)
@@ -457,28 +539,30 @@ static int split_symbol_offset(char *symbol, unsigned long *offset)
457#define PARAM_MAX_ARGS 16 539#define PARAM_MAX_ARGS 16
458#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) 540#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
459 541
460static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) 542static int parse_probe_vars(char *arg, const struct fetch_type *t,
543 struct fetch_param *f, int is_return)
461{ 544{
462 int ret = 0; 545 int ret = 0;
463 unsigned long param; 546 unsigned long param;
464 547
465 if (strcmp(arg, "retval") == 0) { 548 if (strcmp(arg, "retval") == 0) {
466 if (is_return) { 549 if (is_return)
467 ff->func = fetch_retvalue; 550 f->fn = t->retval;
468 ff->data = NULL; 551 else
469 } else
470 ret = -EINVAL; 552 ret = -EINVAL;
471 } else if (strncmp(arg, "stack", 5) == 0) { 553 } else if (strncmp(arg, "stack", 5) == 0) {
472 if (arg[5] == '\0') { 554 if (arg[5] == '\0') {
473 ff->func = fetch_stack_address; 555 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
474 ff->data = NULL; 556 f->fn = fetch_stack_address;
557 else
558 ret = -EINVAL;
475 } else if (isdigit(arg[5])) { 559 } else if (isdigit(arg[5])) {
476 ret = strict_strtoul(arg + 5, 10, &param); 560 ret = strict_strtoul(arg + 5, 10, &param);
477 if (ret || param > PARAM_MAX_STACK) 561 if (ret || param > PARAM_MAX_STACK)
478 ret = -EINVAL; 562 ret = -EINVAL;
479 else { 563 else {
480 ff->func = fetch_stack; 564 f->fn = t->stack;
481 ff->data = (void *)param; 565 f->data = (void *)param;
482 } 566 }
483 } else 567 } else
484 ret = -EINVAL; 568 ret = -EINVAL;
@@ -488,7 +572,8 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
488} 572}
489 573
490/* Recursive argument parser */ 574/* Recursive argument parser */
491static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) 575static int __parse_probe_arg(char *arg, const struct fetch_type *t,
576 struct fetch_param *f, int is_return)
492{ 577{
493 int ret = 0; 578 int ret = 0;
494 unsigned long param; 579 unsigned long param;
@@ -497,13 +582,13 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
497 582
498 switch (arg[0]) { 583 switch (arg[0]) {
499 case '$': 584 case '$':
500 ret = parse_probe_vars(arg + 1, ff, is_return); 585 ret = parse_probe_vars(arg + 1, t, f, is_return);
501 break; 586 break;
502 case '%': /* named register */ 587 case '%': /* named register */
503 ret = regs_query_register_offset(arg + 1); 588 ret = regs_query_register_offset(arg + 1);
504 if (ret >= 0) { 589 if (ret >= 0) {
505 ff->func = fetch_register; 590 f->fn = t->reg;
506 ff->data = (void *)(unsigned long)ret; 591 f->data = (void *)(unsigned long)ret;
507 ret = 0; 592 ret = 0;
508 } 593 }
509 break; 594 break;
@@ -512,26 +597,22 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
512 ret = strict_strtoul(arg + 1, 0, &param); 597 ret = strict_strtoul(arg + 1, 0, &param);
513 if (ret) 598 if (ret)
514 break; 599 break;
515 ff->func = fetch_memory; 600 f->fn = t->memory;
516 ff->data = (void *)param; 601 f->data = (void *)param;
517 } else { 602 } else {
518 ret = split_symbol_offset(arg + 1, &offset); 603 ret = split_symbol_offset(arg + 1, &offset);
519 if (ret) 604 if (ret)
520 break; 605 break;
521 ff->data = alloc_symbol_cache(arg + 1, offset); 606 f->data = alloc_symbol_cache(arg + 1, offset);
522 if (ff->data) 607 if (f->data)
523 ff->func = fetch_symbol; 608 f->fn = t->symbol;
524 else
525 ret = -EINVAL;
526 } 609 }
527 break; 610 break;
528 case '+': /* indirect memory */ 611 case '+': /* deref memory */
529 case '-': 612 case '-':
530 tmp = strchr(arg, '('); 613 tmp = strchr(arg, '(');
531 if (!tmp) { 614 if (!tmp)
532 ret = -EINVAL;
533 break; 615 break;
534 }
535 *tmp = '\0'; 616 *tmp = '\0';
536 ret = strict_strtol(arg + 1, 0, &offset); 617 ret = strict_strtol(arg + 1, 0, &offset);
537 if (ret) 618 if (ret)
@@ -541,38 +622,58 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
541 arg = tmp + 1; 622 arg = tmp + 1;
542 tmp = strrchr(arg, ')'); 623 tmp = strrchr(arg, ')');
543 if (tmp) { 624 if (tmp) {
544 struct indirect_fetch_data *id; 625 struct deref_fetch_param *dprm;
626 const struct fetch_type *t2 = find_fetch_type(NULL);
545 *tmp = '\0'; 627 *tmp = '\0';
546 id = kzalloc(sizeof(struct indirect_fetch_data), 628 dprm = kzalloc(sizeof(struct deref_fetch_param),
547 GFP_KERNEL); 629 GFP_KERNEL);
548 if (!id) 630 if (!dprm)
549 return -ENOMEM; 631 return -ENOMEM;
550 id->offset = offset; 632 dprm->offset = offset;
551 ret = __parse_probe_arg(arg, &id->orig, is_return); 633 ret = __parse_probe_arg(arg, t2, &dprm->orig,
634 is_return);
552 if (ret) 635 if (ret)
553 kfree(id); 636 kfree(dprm);
554 else { 637 else {
555 ff->func = fetch_indirect; 638 f->fn = t->deref;
556 ff->data = (void *)id; 639 f->data = (void *)dprm;
557 } 640 }
558 } else 641 }
559 ret = -EINVAL;
560 break; 642 break;
561 default:
562 /* TODO: support custom handler */
563 ret = -EINVAL;
564 } 643 }
644 if (!ret && !f->fn)
645 ret = -EINVAL;
565 return ret; 646 return ret;
566} 647}
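A worked sketch of the deref case, for a hypothetical x86 argument string "-8(%bp)": the '(' is cut off, strict_strtol() reads the displacement (the elided context lines negate it for the '-' form), and the inner "%bp" is parsed recursively with the default word-sized type t2, giving:

    /*   f->fn   = t->deref
     *   f->data = deref_fetch_param {
     *                   .offset = -8,
     *                   .orig   = { .fn = t2->reg, .data = %bp offset }
     *           }
     */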
567 648
568/* String length checking wrapper */ 649/* String length checking wrapper */
569static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) 650static int parse_probe_arg(char *arg, struct trace_probe *tp,
651 struct probe_arg *parg, int is_return)
570{ 652{
653 const char *t;
654
571 if (strlen(arg) > MAX_ARGSTR_LEN) { 655 if (strlen(arg) > MAX_ARGSTR_LEN) {
 572 pr_info("Argument is too long: %s\n", arg); 656
573 return -ENOSPC; 657 return -ENOSPC;
574 } 658 }
575 return __parse_probe_arg(arg, ff, is_return); 659 parg->comm = kstrdup(arg, GFP_KERNEL);
660 if (!parg->comm) {
661 pr_info("Failed to allocate memory for command '%s'.\n", arg);
662 return -ENOMEM;
663 }
664 t = strchr(parg->comm, ':');
665 if (t) {
666 arg[t - parg->comm] = '\0';
667 t++;
668 }
669 parg->type = find_fetch_type(t);
670 if (!parg->type) {
671 pr_info("Unsupported type: %s\n", t);
672 return -EINVAL;
673 }
674 parg->offset = tp->size;
675 tp->size += parg->type->size;
676 return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
576} 677}
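parse_probe_arg() now also peels off the optional ":TYPE" suffix and accounts for the packed payload layout. A worked sketch with two hypothetical x86 arguments:

    /*   arg 0: "%ax:u16" -> parg->type = u16, parg->offset = 0, tp->size = 2
     *   arg 1: "%bx:s32" -> parg->type = s32, parg->offset = 2, tp->size = 6
     */

Values are packed back to back with no padding; each fetch later stores type->size bytes at data + parg->offset, right behind the entry header.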
577 678
578/* Return 1 if name is reserved or already used by another argument */ 679/* Return 1 if name is reserved or already used by another argument */
@@ -602,15 +703,18 @@ static int create_trace_probe(int argc, char **argv)
602 * @ADDR : fetch memory at ADDR (ADDR should be in kernel) 703 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
603 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) 704 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
604 * %REG : fetch register REG 705 * %REG : fetch register REG
605 * Indirect memory fetch: 706 * Dereferencing memory fetch:
606 * +|-offs(ARG) : fetch memory at ARG +|- offs address. 707 * +|-offs(ARG) : fetch memory at ARG +|- offs address.
607 * Alias name of args: 708 * Alias name of args:
608 * NAME=FETCHARG : set NAME as alias of FETCHARG. 709 * NAME=FETCHARG : set NAME as alias of FETCHARG.
710 * Type of args:
711 * FETCHARG:TYPE : use TYPE instead of unsigned long.
609 */ 712 */
610 struct trace_probe *tp; 713 struct trace_probe *tp;
611 int i, ret = 0; 714 int i, ret = 0;
612 int is_return = 0, is_delete = 0; 715 int is_return = 0, is_delete = 0;
613 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; 716 char *symbol = NULL, *event = NULL, *group = NULL;
717 char *arg, *tmp;
614 unsigned long offset = 0; 718 unsigned long offset = 0;
615 void *addr = NULL; 719 void *addr = NULL;
616 char buf[MAX_EVENT_NAME_LEN]; 720 char buf[MAX_EVENT_NAME_LEN];
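Putting the syntax documented above together with the new type suffix, definitions written to the kprobe_events control file could look like the following (hypothetical symbol and x86-64 register names):

    p:myopen do_sys_open dfd=%di:s32 flags=%dx:u32
    r:myopen_ret do_sys_open ret=$retval:s64

The first line creates a kprobe event recording a signed 32-bit and an unsigned 32-bit argument; the second a kretprobe event recording the return value as s64.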
@@ -723,13 +827,6 @@ static int create_trace_probe(int argc, char **argv)
723 else 827 else
724 arg = argv[i]; 828 arg = argv[i];
725 829
726 if (conflict_field_name(argv[i], tp->args, i)) {
727 pr_info("Argument%d name '%s' conflicts with "
728 "another field.\n", i, argv[i]);
729 ret = -EINVAL;
730 goto error;
731 }
732
733 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); 830 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
734 if (!tp->args[i].name) { 831 if (!tp->args[i].name) {
735 pr_info("Failed to allocate argument%d name '%s'.\n", 832 pr_info("Failed to allocate argument%d name '%s'.\n",
@@ -737,9 +834,19 @@ static int create_trace_probe(int argc, char **argv)
737 ret = -ENOMEM; 834 ret = -ENOMEM;
738 goto error; 835 goto error;
739 } 836 }
837 tmp = strchr(tp->args[i].name, ':');
838 if (tmp)
839 *tmp = '_'; /* convert : to _ */
840
841 if (conflict_field_name(tp->args[i].name, tp->args, i)) {
842 pr_info("Argument%d name '%s' conflicts with "
843 "another field.\n", i, argv[i]);
844 ret = -EINVAL;
845 goto error;
846 }
740 847
741 /* Parse fetch argument */ 848 /* Parse fetch argument */
742 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); 849 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
743 if (ret) { 850 if (ret) {
744 pr_info("Parse error at argument%d. (%d)\n", i, ret); 851 pr_info("Parse error at argument%d. (%d)\n", i, ret);
745 kfree(tp->args[i].name); 852 kfree(tp->args[i].name);
@@ -794,8 +901,7 @@ static void probes_seq_stop(struct seq_file *m, void *v)
794static int probes_seq_show(struct seq_file *m, void *v) 901static int probes_seq_show(struct seq_file *m, void *v)
795{ 902{
796 struct trace_probe *tp = v; 903 struct trace_probe *tp = v;
797 int i, ret; 904 int i;
798 char buf[MAX_ARGSTR_LEN + 1];
799 905
800 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 906 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
801 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); 907 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
@@ -807,15 +913,10 @@ static int probes_seq_show(struct seq_file *m, void *v)
807 else 913 else
808 seq_printf(m, " %s", probe_symbol(tp)); 914 seq_printf(m, " %s", probe_symbol(tp));
809 915
810 for (i = 0; i < tp->nr_args; i++) { 916 for (i = 0; i < tp->nr_args; i++)
811 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); 917 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
812 if (ret < 0) {
813 pr_warning("Argument%d decoding error(%d).\n", i, ret);
814 return ret;
815 }
816 seq_printf(m, " %s=%s", tp->args[i].name, buf);
817 }
818 seq_printf(m, "\n"); 918 seq_printf(m, "\n");
919
819 return 0; 920 return 0;
820} 921}
821 922
@@ -945,9 +1046,10 @@ static const struct file_operations kprobe_profile_ops = {
945static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 1046static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
946{ 1047{
947 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1048 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
948 struct kprobe_trace_entry *entry; 1049 struct kprobe_trace_entry_head *entry;
949 struct ring_buffer_event *event; 1050 struct ring_buffer_event *event;
950 struct ring_buffer *buffer; 1051 struct ring_buffer *buffer;
1052 u8 *data;
951 int size, i, pc; 1053 int size, i, pc;
952 unsigned long irq_flags; 1054 unsigned long irq_flags;
953 struct ftrace_event_call *call = &tp->call; 1055 struct ftrace_event_call *call = &tp->call;
@@ -957,7 +1059,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
957 local_save_flags(irq_flags); 1059 local_save_flags(irq_flags);
958 pc = preempt_count(); 1060 pc = preempt_count();
959 1061
960 size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1062 size = sizeof(*entry) + tp->size;
961 1063
962 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1064 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
963 irq_flags, pc); 1065 irq_flags, pc);
@@ -965,10 +1067,10 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
965 return; 1067 return;
966 1068
967 entry = ring_buffer_event_data(event); 1069 entry = ring_buffer_event_data(event);
968 entry->nargs = tp->nr_args;
969 entry->ip = (unsigned long)kp->addr; 1070 entry->ip = (unsigned long)kp->addr;
1071 data = (u8 *)&entry[1];
970 for (i = 0; i < tp->nr_args; i++) 1072 for (i = 0; i < tp->nr_args; i++)
971 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1073 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
972 1074
973 if (!filter_current_check_discard(buffer, call, entry, event)) 1075 if (!filter_current_check_discard(buffer, call, entry, event))
974 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1076 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
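The record committed above now has the shape (sketch):

    /*   +--------------------------------+
     *   | struct kprobe_trace_entry_head |  ip plus the common trace fields
     *   +--------------------------------+
     *   | u8 data[tp->size]              |  packed argument payload
     *   +--------------------------------+
     *
     * call_fetch() stores argument i at data + tp->args[i].offset using
     * tp->args[i].type->size bytes, matching the offsets later registered
     * by kprobe_event_define_fields().
     */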
@@ -979,9 +1081,10 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
979 struct pt_regs *regs) 1081 struct pt_regs *regs)
980{ 1082{
981 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1083 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
982 struct kretprobe_trace_entry *entry; 1084 struct kretprobe_trace_entry_head *entry;
983 struct ring_buffer_event *event; 1085 struct ring_buffer_event *event;
984 struct ring_buffer *buffer; 1086 struct ring_buffer *buffer;
1087 u8 *data;
985 int size, i, pc; 1088 int size, i, pc;
986 unsigned long irq_flags; 1089 unsigned long irq_flags;
987 struct ftrace_event_call *call = &tp->call; 1090 struct ftrace_event_call *call = &tp->call;
@@ -989,7 +1092,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
989 local_save_flags(irq_flags); 1092 local_save_flags(irq_flags);
990 pc = preempt_count(); 1093 pc = preempt_count();
991 1094
992 size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1095 size = sizeof(*entry) + tp->size;
993 1096
994 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1097 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
995 irq_flags, pc); 1098 irq_flags, pc);
@@ -997,11 +1100,11 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
997 return; 1100 return;
998 1101
999 entry = ring_buffer_event_data(event); 1102 entry = ring_buffer_event_data(event);
1000 entry->nargs = tp->nr_args;
1001 entry->func = (unsigned long)tp->rp.kp.addr; 1103 entry->func = (unsigned long)tp->rp.kp.addr;
1002 entry->ret_ip = (unsigned long)ri->ret_addr; 1104 entry->ret_ip = (unsigned long)ri->ret_addr;
1105 data = (u8 *)&entry[1];
1003 for (i = 0; i < tp->nr_args; i++) 1106 for (i = 0; i < tp->nr_args; i++)
1004 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1107 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1005 1108
1006 if (!filter_current_check_discard(buffer, call, entry, event)) 1109 if (!filter_current_check_discard(buffer, call, entry, event))
1007 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1110 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1011,13 +1114,14 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1011enum print_line_t 1114enum print_line_t
1012print_kprobe_event(struct trace_iterator *iter, int flags) 1115print_kprobe_event(struct trace_iterator *iter, int flags)
1013{ 1116{
1014 struct kprobe_trace_entry *field; 1117 struct kprobe_trace_entry_head *field;
1015 struct trace_seq *s = &iter->seq; 1118 struct trace_seq *s = &iter->seq;
1016 struct trace_event *event; 1119 struct trace_event *event;
1017 struct trace_probe *tp; 1120 struct trace_probe *tp;
1121 u8 *data;
1018 int i; 1122 int i;
1019 1123
1020 field = (struct kprobe_trace_entry *)iter->ent; 1124 field = (struct kprobe_trace_entry_head *)iter->ent;
1021 event = ftrace_find_event(field->ent.type); 1125 event = ftrace_find_event(field->ent.type);
1022 tp = container_of(event, struct trace_probe, event); 1126 tp = container_of(event, struct trace_probe, event);
1023 1127
@@ -1030,9 +1134,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags)
1030 if (!trace_seq_puts(s, ")")) 1134 if (!trace_seq_puts(s, ")"))
1031 goto partial; 1135 goto partial;
1032 1136
1033 for (i = 0; i < field->nargs; i++) 1137 data = (u8 *)&field[1];
1034 if (!trace_seq_printf(s, " %s=%lx", 1138 for (i = 0; i < tp->nr_args; i++)
1035 tp->args[i].name, field->args[i])) 1139 if (!tp->args[i].type->print(s, tp->args[i].name,
1140 data + tp->args[i].offset))
1036 goto partial; 1141 goto partial;
1037 1142
1038 if (!trace_seq_puts(s, "\n")) 1143 if (!trace_seq_puts(s, "\n"))
@@ -1046,13 +1151,14 @@ partial:
1046enum print_line_t 1151enum print_line_t
1047print_kretprobe_event(struct trace_iterator *iter, int flags) 1152print_kretprobe_event(struct trace_iterator *iter, int flags)
1048{ 1153{
1049 struct kretprobe_trace_entry *field; 1154 struct kretprobe_trace_entry_head *field;
1050 struct trace_seq *s = &iter->seq; 1155 struct trace_seq *s = &iter->seq;
1051 struct trace_event *event; 1156 struct trace_event *event;
1052 struct trace_probe *tp; 1157 struct trace_probe *tp;
1158 u8 *data;
1053 int i; 1159 int i;
1054 1160
1055 field = (struct kretprobe_trace_entry *)iter->ent; 1161 field = (struct kretprobe_trace_entry_head *)iter->ent;
1056 event = ftrace_find_event(field->ent.type); 1162 event = ftrace_find_event(field->ent.type);
1057 tp = container_of(event, struct trace_probe, event); 1163 tp = container_of(event, struct trace_probe, event);
1058 1164
@@ -1071,9 +1177,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags)
1071 if (!trace_seq_puts(s, ")")) 1177 if (!trace_seq_puts(s, ")"))
1072 goto partial; 1178 goto partial;
1073 1179
1074 for (i = 0; i < field->nargs; i++) 1180 data = (u8 *)&field[1];
1075 if (!trace_seq_printf(s, " %s=%lx", 1181 for (i = 0; i < tp->nr_args; i++)
1076 tp->args[i].name, field->args[i])) 1182 if (!tp->args[i].type->print(s, tp->args[i].name,
1183 data + tp->args[i].offset))
1077 goto partial; 1184 goto partial;
1078 1185
1079 if (!trace_seq_puts(s, "\n")) 1186 if (!trace_seq_puts(s, "\n"))
@@ -1129,29 +1236,43 @@ static int probe_event_raw_init(struct ftrace_event_call *event_call)
1129static int kprobe_event_define_fields(struct ftrace_event_call *event_call) 1236static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1130{ 1237{
1131 int ret, i; 1238 int ret, i;
1132 struct kprobe_trace_entry field; 1239 struct kprobe_trace_entry_head field;
1133 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1240 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1134 1241
1135 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1242 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1136 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1137 /* Set argument names as fields */ 1243 /* Set argument names as fields */
1138 for (i = 0; i < tp->nr_args; i++) 1244 for (i = 0; i < tp->nr_args; i++) {
1139 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); 1245 ret = trace_define_field(event_call, tp->args[i].type->name,
1246 tp->args[i].name,
1247 sizeof(field) + tp->args[i].offset,
1248 tp->args[i].type->size,
1249 tp->args[i].type->is_signed,
1250 FILTER_OTHER);
1251 if (ret)
1252 return ret;
1253 }
1140 return 0; 1254 return 0;
1141} 1255}
1142 1256
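With fields registered through trace_define_field() like this, the event's format file shows each argument with its declared type instead of a generic unsigned long. A hypothetical s32 argument named dfd, first in the payload, might read (the exact offset depends on the size of the common-field header):

    field:s32 dfd;	offset:16;	size:4;	signed:1;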
1143static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) 1257static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1144{ 1258{
1145 int ret, i; 1259 int ret, i;
1146 struct kretprobe_trace_entry field; 1260 struct kretprobe_trace_entry_head field;
1147 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1261 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1148 1262
1149 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); 1263 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1150 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1264 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1151 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1152 /* Set argument names as fields */ 1265 /* Set argument names as fields */
1153 for (i = 0; i < tp->nr_args; i++) 1266 for (i = 0; i < tp->nr_args; i++) {
1154 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); 1267 ret = trace_define_field(event_call, tp->args[i].type->name,
1268 tp->args[i].name,
1269 sizeof(field) + tp->args[i].offset,
1270 tp->args[i].type->size,
1271 tp->args[i].type->is_signed,
1272 FILTER_OTHER);
1273 if (ret)
1274 return ret;
1275 }
1155 return 0; 1276 return 0;
1156} 1277}
1157 1278
@@ -1176,8 +1297,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1176 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); 1297 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1177 1298
1178 for (i = 0; i < tp->nr_args; i++) { 1299 for (i = 0; i < tp->nr_args; i++) {
1179 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", 1300 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
1180 tp->args[i].name); 1301 tp->args[i].name, tp->args[i].type->fmt);
1181 } 1302 }
1182 1303
1183 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); 1304 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
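For the hypothetical dfd example, and assuming the s32 entry's fmt string is "%d", the generated print fmt would come out roughly as:

    "(%lx) dfd=%d", REC->ip, REC->dfd

with each argument's per-type format substituted where the old code always printed %lx.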
@@ -1214,67 +1335,70 @@ static int set_print_fmt(struct trace_probe *tp)
1214#ifdef CONFIG_PERF_EVENTS 1335#ifdef CONFIG_PERF_EVENTS
1215 1336
1216/* Kprobe profile handler */ 1337/* Kprobe profile handler */
1217static __kprobes void kprobe_profile_func(struct kprobe *kp, 1338static __kprobes void kprobe_perf_func(struct kprobe *kp,
1218 struct pt_regs *regs) 1339 struct pt_regs *regs)
1219{ 1340{
1220 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1341 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1221 struct ftrace_event_call *call = &tp->call; 1342 struct ftrace_event_call *call = &tp->call;
1222 struct kprobe_trace_entry *entry; 1343 struct kprobe_trace_entry_head *entry;
1344 u8 *data;
1223 int size, __size, i; 1345 int size, __size, i;
1224 unsigned long irq_flags; 1346 unsigned long irq_flags;
1225 int rctx; 1347 int rctx;
1226 1348
1227 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1349 __size = sizeof(*entry) + tp->size;
1228 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1350 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1229 size -= sizeof(u32); 1351 size -= sizeof(u32);
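    /* Worked example: __size == 18 -> 18 + 4 = 22 -> ALIGN(22, 8) = 24
     * -> size = 20, so the payload plus perf's u32 size header fills
     * whole u64 slots.
     */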
1230 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1352 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1231 "profile buffer not large enough")) 1353 "profile buffer not large enough"))
1232 return; 1354 return;
1233 1355
1234 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); 1356 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
1235 if (!entry) 1357 if (!entry)
1236 return; 1358 return;
1237 1359
1238 entry->nargs = tp->nr_args;
1239 entry->ip = (unsigned long)kp->addr; 1360 entry->ip = (unsigned long)kp->addr;
1361 data = (u8 *)&entry[1];
1240 for (i = 0; i < tp->nr_args; i++) 1362 for (i = 0; i < tp->nr_args; i++)
1241 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1363 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1242 1364
1243 ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags); 1365 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
1244} 1366}
1245 1367
1246/* Kretprobe profile handler */ 1368/* Kretprobe profile handler */
1247static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, 1369static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1248 struct pt_regs *regs) 1370 struct pt_regs *regs)
1249{ 1371{
1250 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1372 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1251 struct ftrace_event_call *call = &tp->call; 1373 struct ftrace_event_call *call = &tp->call;
1252 struct kretprobe_trace_entry *entry; 1374 struct kretprobe_trace_entry_head *entry;
1375 u8 *data;
1253 int size, __size, i; 1376 int size, __size, i;
1254 unsigned long irq_flags; 1377 unsigned long irq_flags;
1255 int rctx; 1378 int rctx;
1256 1379
1257 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1380 __size = sizeof(*entry) + tp->size;
1258 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1381 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1259 size -= sizeof(u32); 1382 size -= sizeof(u32);
1260 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1383 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1261 "profile buffer not large enough")) 1384 "profile buffer not large enough"))
1262 return; 1385 return;
1263 1386
1264 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); 1387 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
1265 if (!entry) 1388 if (!entry)
1266 return; 1389 return;
1267 1390
1268 entry->nargs = tp->nr_args;
1269 entry->func = (unsigned long)tp->rp.kp.addr; 1391 entry->func = (unsigned long)tp->rp.kp.addr;
1270 entry->ret_ip = (unsigned long)ri->ret_addr; 1392 entry->ret_ip = (unsigned long)ri->ret_addr;
1393 data = (u8 *)&entry[1];
1271 for (i = 0; i < tp->nr_args; i++) 1394 for (i = 0; i < tp->nr_args; i++)
1272 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1395 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1273 1396
1274 ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags); 1397 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
1398 irq_flags, regs);
1275} 1399}
1276 1400
1277static int probe_profile_enable(struct ftrace_event_call *call) 1401static int probe_perf_enable(struct ftrace_event_call *call)
1278{ 1402{
1279 struct trace_probe *tp = (struct trace_probe *)call->data; 1403 struct trace_probe *tp = (struct trace_probe *)call->data;
1280 1404
@@ -1286,7 +1410,7 @@ static int probe_profile_enable(struct ftrace_event_call *call)
1286 return enable_kprobe(&tp->rp.kp); 1410 return enable_kprobe(&tp->rp.kp);
1287} 1411}
1288 1412
1289static void probe_profile_disable(struct ftrace_event_call *call) 1413static void probe_perf_disable(struct ftrace_event_call *call)
1290{ 1414{
1291 struct trace_probe *tp = (struct trace_probe *)call->data; 1415 struct trace_probe *tp = (struct trace_probe *)call->data;
1292 1416
@@ -1311,7 +1435,7 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1311 kprobe_trace_func(kp, regs); 1435 kprobe_trace_func(kp, regs);
1312#ifdef CONFIG_PERF_EVENTS 1436#ifdef CONFIG_PERF_EVENTS
1313 if (tp->flags & TP_FLAG_PROFILE) 1437 if (tp->flags & TP_FLAG_PROFILE)
1314 kprobe_profile_func(kp, regs); 1438 kprobe_perf_func(kp, regs);
1315#endif 1439#endif
 1316 return 0; /* We don't tweak the kernel, so just return 0 */ 1440
1317} 1441}
@@ -1325,7 +1449,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1325 kretprobe_trace_func(ri, regs); 1449 kretprobe_trace_func(ri, regs);
1326#ifdef CONFIG_PERF_EVENTS 1450#ifdef CONFIG_PERF_EVENTS
1327 if (tp->flags & TP_FLAG_PROFILE) 1451 if (tp->flags & TP_FLAG_PROFILE)
1328 kretprobe_profile_func(ri, regs); 1452 kretprobe_perf_func(ri, regs);
1329#endif 1453#endif
 1330 return 0; /* We don't tweak the kernel, so just return 0 */ 1454
1331} 1455}
@@ -1358,8 +1482,8 @@ static int register_probe_event(struct trace_probe *tp)
1358 call->unregfunc = probe_event_disable; 1482 call->unregfunc = probe_event_disable;
1359 1483
1360#ifdef CONFIG_PERF_EVENTS 1484#ifdef CONFIG_PERF_EVENTS
1361 call->profile_enable = probe_profile_enable; 1485 call->perf_event_enable = probe_perf_enable;
1362 call->profile_disable = probe_profile_disable; 1486 call->perf_event_disable = probe_perf_disable;
1363#endif 1487#endif
1364 call->data = tp; 1488 call->data = tp;
1365 ret = trace_add_event_call(call); 1489 ret = trace_add_event_call(call);
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 94103cdcf9d8..8eaf00749b65 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -23,6 +23,7 @@
23#include <linux/debugfs.h> 23#include <linux/debugfs.h>
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/slab.h>
26#include <linux/fs.h> 27#include <linux/fs.h>
27 28
28#include "trace_output.h" 29#include "trace_output.h"
@@ -33,12 +34,6 @@
33 34
34#include <asm/atomic.h> 35#include <asm/atomic.h>
35 36
36/*
37 * For now, let us restrict the no. of symbols traced simultaneously to number
38 * of available hardware breakpoint registers.
39 */
40#define KSYM_TRACER_MAX HBP_NUM
41
42#define KSYM_TRACER_OP_LEN 3 /* rw- */ 37#define KSYM_TRACER_OP_LEN 3 /* rw- */
43 38
44struct trace_ksym { 39struct trace_ksym {
@@ -52,7 +47,6 @@ struct trace_ksym {
52 47
53static struct trace_array *ksym_trace_array; 48static struct trace_array *ksym_trace_array;
54 49
55static unsigned int ksym_filter_entry_count;
56static unsigned int ksym_tracing_enabled; 50static unsigned int ksym_tracing_enabled;
57 51
58static HLIST_HEAD(ksym_filter_head); 52static HLIST_HEAD(ksym_filter_head);
@@ -180,13 +174,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
180 struct trace_ksym *entry; 174 struct trace_ksym *entry;
181 int ret = -ENOMEM; 175 int ret = -ENOMEM;
182 176
183 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
184 printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
185 " new requests for tracing can be accepted now.\n",
186 KSYM_TRACER_MAX);
187 return -ENOSPC;
188 }
189
190 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); 177 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
191 if (!entry) 178 if (!entry)
192 return -ENOMEM; 179 return -ENOMEM;
@@ -202,13 +189,17 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
202 189
203 if (IS_ERR(entry->ksym_hbp)) { 190 if (IS_ERR(entry->ksym_hbp)) {
204 ret = PTR_ERR(entry->ksym_hbp); 191 ret = PTR_ERR(entry->ksym_hbp);
205 printk(KERN_INFO "ksym_tracer request failed. Try again" 192 if (ret == -ENOSPC) {
206 " later!!\n"); 193 printk(KERN_ERR "ksym_tracer: Maximum limit reached."
194 " No new requests for tracing can be accepted now.\n");
195 } else {
196 printk(KERN_INFO "ksym_tracer request failed. Try again"
197 " later!!\n");
198 }
207 goto err; 199 goto err;
208 } 200 }
209 201
210 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); 202 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
211 ksym_filter_entry_count++;
212 203
213 return 0; 204 return 0;
214 205
@@ -264,7 +255,6 @@ static void __ksym_trace_reset(void)
264 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, 255 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
265 ksym_hlist) { 256 ksym_hlist) {
266 unregister_wide_hw_breakpoint(entry->ksym_hbp); 257 unregister_wide_hw_breakpoint(entry->ksym_hbp);
267 ksym_filter_entry_count--;
268 hlist_del_rcu(&(entry->ksym_hlist)); 258 hlist_del_rcu(&(entry->ksym_hlist));
269 synchronize_rcu(); 259 synchronize_rcu();
270 kfree(entry); 260 kfree(entry);
@@ -337,7 +327,6 @@ static ssize_t ksym_trace_filter_write(struct file *file,
337 goto out_unlock; 327 goto out_unlock;
338 } 328 }
339 /* Error or "symbol:---" case: drop it */ 329 /* Error or "symbol:---" case: drop it */
340 ksym_filter_entry_count--;
341 hlist_del_rcu(&(entry->ksym_hlist)); 330 hlist_del_rcu(&(entry->ksym_hlist));
342 synchronize_rcu(); 331 synchronize_rcu();
343 kfree(entry); 332 kfree(entry);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0acd834659ed..017fa376505d 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,7 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/mmiotrace.h> 10#include <linux/mmiotrace.h>
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/slab.h>
12#include <linux/time.h> 13#include <linux/time.h>
13 14
14#include <asm/atomic.h> 15#include <asm/atomic.h>
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8e46b3323cdc..2404c129a8c9 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -253,7 +253,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
253 void *ret; 253 void *ret;
254 254
255 if (s->full) 255 if (s->full)
256 return 0; 256 return NULL;
257 257
258 if (len > ((PAGE_SIZE - 1) - s->len)) { 258 if (len > ((PAGE_SIZE - 1) - s->len)) {
259 s->full = 1; 259 s->full = 1;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 5fca0f51fde4..a55fccfede5d 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -50,8 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
50} 50}
51 51
52static void 52static void
53probe_sched_switch(struct rq *__rq, struct task_struct *prev, 53probe_sched_switch(struct task_struct *prev, struct task_struct *next)
54 struct task_struct *next)
55{ 54{
56 struct trace_array_cpu *data; 55 struct trace_array_cpu *data;
57 unsigned long flags; 56 unsigned long flags;
@@ -109,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
109} 108}
110 109
111static void 110static void
112probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 111probe_sched_wakeup(struct task_struct *wakee, int success)
113{ 112{
114 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
115 unsigned long flags; 114 unsigned long flags;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0271742abb8d..8052446ceeaa 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -107,8 +107,7 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
107} 107}
108 108
109static void notrace 109static void notrace
110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 110probe_wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
111 struct task_struct *next)
112{ 111{
113 struct trace_array_cpu *data; 112 struct trace_array_cpu *data;
114 cycle_t T0, T1, delta; 113 cycle_t T0, T1, delta;
@@ -200,7 +199,7 @@ static void wakeup_reset(struct trace_array *tr)
200} 199}
201 200
202static void 201static void
203probe_wakeup(struct rq *rq, struct task_struct *p, int success) 202probe_wakeup(struct task_struct *p, int success)
204{ 203{
205 struct trace_array_cpu *data; 204 struct trace_array_cpu *data;
206 int cpu = smp_processor_id(); 205 int cpu = smp_processor_id();
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 280fea470d67..250e7f9bd2f0 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -3,6 +3,7 @@
3#include <linux/stringify.h> 3#include <linux/stringify.h>
4#include <linux/kthread.h> 4#include <linux/kthread.h>
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/slab.h>
6 7
7static inline int trace_valid_entry(struct trace_entry *entry) 8static inline int trace_valid_entry(struct trace_entry *entry)
8{ 9{
@@ -16,7 +17,6 @@ static inline int trace_valid_entry(struct trace_entry *entry)
16 case TRACE_BRANCH: 17 case TRACE_BRANCH:
17 case TRACE_GRAPH_ENT: 18 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 19 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES:
20 case TRACE_KSYM: 20 case TRACE_KSYM:
21 return 1; 21 return 1;
22 } 22 }
@@ -29,7 +29,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
29 struct trace_entry *entry; 29 struct trace_entry *entry;
30 unsigned int loops = 0; 30 unsigned int loops = 0;
31 31
32 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { 32 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
33 entry = ring_buffer_event_data(event); 33 entry = ring_buffer_event_data(event);
34 34
35 /* 35 /*
@@ -255,7 +255,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
255/* Maximum number of functions to trace before diagnosing a hang */ 255/* Maximum number of functions to trace before diagnosing a hang */
256#define GRAPH_MAX_FUNC_TEST 100000000 256#define GRAPH_MAX_FUNC_TEST 100000000
257 257
258static void __ftrace_dump(bool disable_tracing); 258static void
259__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
259static unsigned int graph_hang_thresh; 260static unsigned int graph_hang_thresh;
260 261
261/* Wrap the real function entry probe to avoid possible hanging */ 262/* Wrap the real function entry probe to avoid possible hanging */
@@ -266,7 +267,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
266 ftrace_graph_stop(); 267 ftrace_graph_stop();
267 printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); 268 printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
268 if (ftrace_dump_on_oops) 269 if (ftrace_dump_on_oops)
269 __ftrace_dump(false); 270 __ftrace_dump(false, DUMP_ALL);
270 return 0; 271 return 0;
271 } 272 }
272 273
@@ -754,62 +755,6 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
754} 755}
755#endif /* CONFIG_BRANCH_TRACER */ 756#endif /* CONFIG_BRANCH_TRACER */
756 757
757#ifdef CONFIG_HW_BRANCH_TRACER
758int
759trace_selftest_startup_hw_branches(struct tracer *trace,
760 struct trace_array *tr)
761{
762 struct trace_iterator *iter;
763 struct tracer tracer;
764 unsigned long count;
765 int ret;
766
767 if (!trace->open) {
768 printk(KERN_CONT "missing open function...");
769 return -1;
770 }
771
772 ret = tracer_init(trace, tr);
773 if (ret) {
774 warn_failed_init_tracer(trace, ret);
775 return ret;
776 }
777
778 /*
779 * The hw-branch tracer needs to collect the trace from the various
780 * cpu trace buffers - before tracing is stopped.
781 */
782 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
783 if (!iter)
784 return -ENOMEM;
785
786 memcpy(&tracer, trace, sizeof(tracer));
787
788 iter->trace = &tracer;
789 iter->tr = tr;
790 iter->pos = -1;
791 mutex_init(&iter->mutex);
792
793 trace->open(iter);
794
795 mutex_destroy(&iter->mutex);
796 kfree(iter);
797
798 tracing_stop();
799
800 ret = trace_test_buffer(tr, &count);
801 trace->reset(tr);
802 tracing_start();
803
804 if (!ret && !count) {
805 printk(KERN_CONT "no entries found..");
806 ret = -1;
807 }
808
809 return ret;
810}
811#endif /* CONFIG_HW_BRANCH_TRACER */
812
813#ifdef CONFIG_KSYM_TRACER 758#ifdef CONFIG_KSYM_TRACER
814static int ksym_selftest_dummy; 759static int ksym_selftest_dummy;
815 760
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index a4bb239eb987..96cffb269e73 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -10,6 +10,7 @@
10 10
11 11
12#include <linux/list.h> 12#include <linux/list.h>
13#include <linux/slab.h>
13#include <linux/rbtree.h> 14#include <linux/rbtree.h>
14#include <linux/debugfs.h> 15#include <linux/debugfs.h>
15#include "trace_stat.h" 16#include "trace_stat.h"
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index cba47d7935cc..4d6d711717f2 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,6 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/slab.h>
3#include <linux/kernel.h> 4#include <linux/kernel.h>
4#include <linux/ftrace.h> 5#include <linux/ftrace.h>
5#include <linux/perf_event.h> 6#include <linux/perf_event.h>
@@ -428,12 +429,12 @@ core_initcall(init_ftrace_syscalls);
428 429
429#ifdef CONFIG_PERF_EVENTS 430#ifdef CONFIG_PERF_EVENTS
430 431
431static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); 432static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
432static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); 433static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
433static int sys_prof_refcount_enter; 434static int sys_perf_refcount_enter;
434static int sys_prof_refcount_exit; 435static int sys_perf_refcount_exit;
435 436
436static void prof_syscall_enter(struct pt_regs *regs, long id) 437static void perf_syscall_enter(struct pt_regs *regs, long id)
437{ 438{
438 struct syscall_metadata *sys_data; 439 struct syscall_metadata *sys_data;
439 struct syscall_trace_enter *rec; 440 struct syscall_trace_enter *rec;
@@ -443,7 +444,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
443 int size; 444 int size;
444 445
445 syscall_nr = syscall_get_nr(current, regs); 446 syscall_nr = syscall_get_nr(current, regs);
446 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 447 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
447 return; 448 return;
448 449
449 sys_data = syscall_nr_to_meta(syscall_nr); 450 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -455,11 +456,11 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
455 size = ALIGN(size + sizeof(u32), sizeof(u64)); 456 size = ALIGN(size + sizeof(u32), sizeof(u64));
456 size -= sizeof(u32); 457 size -= sizeof(u32);
457 458
458 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 459 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
459 "profile buffer not large enough")) 460 "perf buffer not large enough"))
460 return; 461 return;
461 462
462 rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size, 463 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
463 sys_data->enter_event->id, &rctx, &flags); 464 sys_data->enter_event->id, &rctx, &flags);
464 if (!rec) 465 if (!rec)
465 return; 466 return;
@@ -467,10 +468,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
467 rec->nr = syscall_nr; 468 rec->nr = syscall_nr;
468 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 469 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
469 (unsigned long *)&rec->args); 470 (unsigned long *)&rec->args);
470 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); 471 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
471} 472}
472 473
473int prof_sysenter_enable(struct ftrace_event_call *call) 474int perf_sysenter_enable(struct ftrace_event_call *call)
474{ 475{
475 int ret = 0; 476 int ret = 0;
476 int num; 477 int num;
@@ -478,34 +479,34 @@ int prof_sysenter_enable(struct ftrace_event_call *call)
478 num = ((struct syscall_metadata *)call->data)->syscall_nr; 479 num = ((struct syscall_metadata *)call->data)->syscall_nr;
479 480
480 mutex_lock(&syscall_trace_lock); 481 mutex_lock(&syscall_trace_lock);
481 if (!sys_prof_refcount_enter) 482 if (!sys_perf_refcount_enter)
482 ret = register_trace_sys_enter(prof_syscall_enter); 483 ret = register_trace_sys_enter(perf_syscall_enter);
483 if (ret) { 484 if (ret) {
 484 pr_info("event trace: Could not activate " 485
 485 "syscall entry trace point"); 486
486 } else { 487 } else {
487 set_bit(num, enabled_prof_enter_syscalls); 488 set_bit(num, enabled_perf_enter_syscalls);
488 sys_prof_refcount_enter++; 489 sys_perf_refcount_enter++;
489 } 490 }
490 mutex_unlock(&syscall_trace_lock); 491 mutex_unlock(&syscall_trace_lock);
491 return ret; 492 return ret;
492} 493}
493 494
494void prof_sysenter_disable(struct ftrace_event_call *call) 495void perf_sysenter_disable(struct ftrace_event_call *call)
495{ 496{
496 int num; 497 int num;
497 498
498 num = ((struct syscall_metadata *)call->data)->syscall_nr; 499 num = ((struct syscall_metadata *)call->data)->syscall_nr;
499 500
500 mutex_lock(&syscall_trace_lock); 501 mutex_lock(&syscall_trace_lock);
501 sys_prof_refcount_enter--; 502 sys_perf_refcount_enter--;
502 clear_bit(num, enabled_prof_enter_syscalls); 503 clear_bit(num, enabled_perf_enter_syscalls);
503 if (!sys_prof_refcount_enter) 504 if (!sys_perf_refcount_enter)
504 unregister_trace_sys_enter(prof_syscall_enter); 505 unregister_trace_sys_enter(perf_syscall_enter);
505 mutex_unlock(&syscall_trace_lock); 506 mutex_unlock(&syscall_trace_lock);
506} 507}
507 508
508static void prof_syscall_exit(struct pt_regs *regs, long ret) 509static void perf_syscall_exit(struct pt_regs *regs, long ret)
509{ 510{
510 struct syscall_metadata *sys_data; 511 struct syscall_metadata *sys_data;
511 struct syscall_trace_exit *rec; 512 struct syscall_trace_exit *rec;
@@ -515,7 +516,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
515 int size; 516 int size;
516 517
517 syscall_nr = syscall_get_nr(current, regs); 518 syscall_nr = syscall_get_nr(current, regs);
518 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 519 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
519 return; 520 return;
520 521
521 sys_data = syscall_nr_to_meta(syscall_nr); 522 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -530,11 +531,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 530 * Impossible in principle, but stay paranoid about the future. 531
 531 * Could this check be moved out of the runtime path? 532
532 */ 533 */
533 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 534 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
534 "exit event has grown above profile buffer size")) 535 "exit event has grown above perf buffer size"))
535 return; 536 return;
536 537
537 rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size, 538 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
538 sys_data->exit_event->id, &rctx, &flags); 539 sys_data->exit_event->id, &rctx, &flags);
539 if (!rec) 540 if (!rec)
540 return; 541 return;
@@ -542,10 +543,10 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
542 rec->nr = syscall_nr; 543 rec->nr = syscall_nr;
543 rec->ret = syscall_get_return_value(current, regs); 544 rec->ret = syscall_get_return_value(current, regs);
544 545
545 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); 546 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
546} 547}
547 548
548int prof_sysexit_enable(struct ftrace_event_call *call) 549int perf_sysexit_enable(struct ftrace_event_call *call)
549{ 550{
550 int ret = 0; 551 int ret = 0;
551 int num; 552 int num;
@@ -553,30 +554,30 @@ int prof_sysexit_enable(struct ftrace_event_call *call)
553 num = ((struct syscall_metadata *)call->data)->syscall_nr; 554 num = ((struct syscall_metadata *)call->data)->syscall_nr;
554 555
555 mutex_lock(&syscall_trace_lock); 556 mutex_lock(&syscall_trace_lock);
556 if (!sys_prof_refcount_exit) 557 if (!sys_perf_refcount_exit)
557 ret = register_trace_sys_exit(prof_syscall_exit); 558 ret = register_trace_sys_exit(perf_syscall_exit);
558 if (ret) { 559 if (ret) {
 559 pr_info("event trace: Could not activate " 560
 560 "syscall exit trace point"); 561
561 } else { 562 } else {
562 set_bit(num, enabled_prof_exit_syscalls); 563 set_bit(num, enabled_perf_exit_syscalls);
563 sys_prof_refcount_exit++; 564 sys_perf_refcount_exit++;
564 } 565 }
565 mutex_unlock(&syscall_trace_lock); 566 mutex_unlock(&syscall_trace_lock);
566 return ret; 567 return ret;
567} 568}
568 569
569void prof_sysexit_disable(struct ftrace_event_call *call) 570void perf_sysexit_disable(struct ftrace_event_call *call)
570{ 571{
571 int num; 572 int num;
572 573
573 num = ((struct syscall_metadata *)call->data)->syscall_nr; 574 num = ((struct syscall_metadata *)call->data)->syscall_nr;
574 575
575 mutex_lock(&syscall_trace_lock); 576 mutex_lock(&syscall_trace_lock);
576 sys_prof_refcount_exit--; 577 sys_perf_refcount_exit--;
577 clear_bit(num, enabled_prof_exit_syscalls); 578 clear_bit(num, enabled_perf_exit_syscalls);
578 if (!sys_prof_refcount_exit) 579 if (!sys_perf_refcount_exit)
579 unregister_trace_sys_exit(prof_syscall_exit); 580 unregister_trace_sys_exit(perf_syscall_exit);
580 mutex_unlock(&syscall_trace_lock); 581 mutex_unlock(&syscall_trace_lock);
581} 582}
582 583
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 40cafb07dffd..cc2d2faa7d9e 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
9#include <trace/events/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/slab.h>
12#include <linux/kref.h> 13#include <linux/kref.h>
13#include "trace_stat.h" 14#include "trace_stat.h"
14#include "trace.h" 15#include "trace.h"
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 00d59d048edf..0a67e041edf8 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -21,6 +21,7 @@
21#include <linux/tsacct_kern.h> 21#include <linux/tsacct_kern.h>
22#include <linux/acct.h> 22#include <linux/acct.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/mm.h>
24 25
25/* 26/*
26 * fill in basic accounting fields 27 * fill in basic accounting fields
diff --git a/kernel/user.c b/kernel/user.c
index 766467b3bcb7..7e72614b736d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,7 +16,6 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include "cred-internals.h"
20 19
21struct user_namespace init_user_ns = { 20struct user_namespace init_user_ns = {
22 .kref = { 21 .kref = {
@@ -137,9 +136,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
137 struct hlist_head *hashent = uidhashentry(ns, uid); 136 struct hlist_head *hashent = uidhashentry(ns, uid);
138 struct user_struct *up, *new; 137 struct user_struct *up, *new;
139 138
140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
141 * atomic.
142 */
143 spin_lock_irq(&uidhash_lock); 139 spin_lock_irq(&uidhash_lock);
144 up = uid_hash_find(uid, hashent); 140 up = uid_hash_find(uid, hashent);
145 spin_unlock_irq(&uidhash_lock); 141 spin_unlock_irq(&uidhash_lock);
@@ -161,11 +157,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
161 spin_lock_irq(&uidhash_lock); 157 spin_lock_irq(&uidhash_lock);
162 up = uid_hash_find(uid, hashent); 158 up = uid_hash_find(uid, hashent);
163 if (up) { 159 if (up) {
164 /* This case is not possible when CONFIG_USER_SCHED
165 * is defined, since we serialize alloc_uid() using
166 * uids_mutex. Hence no need to call
167 * sched_destroy_user() or remove_user_sysfs_dir().
168 */
169 key_put(new->uid_keyring); 160 key_put(new->uid_keyring);
170 key_put(new->session_keyring); 161 key_put(new->session_keyring);
171 kmem_cache_free(uid_cachep, new); 162 kmem_cache_free(uid_cachep, new);
@@ -178,8 +169,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
178 169
179 return up; 170 return up;
180 171
181 put_user_ns(new->user_ns);
182 kmem_cache_free(uid_cachep, new);
183out_unlock: 172out_unlock:
184 return NULL; 173 return NULL;
185} 174}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dee48658805c..5bfb213984b2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -774,7 +774,7 @@ void flush_delayed_work(struct delayed_work *dwork)
774{ 774{
775 if (del_timer_sync(&dwork->timer)) { 775 if (del_timer_sync(&dwork->timer)) {
776 struct cpu_workqueue_struct *cwq; 776 struct cpu_workqueue_struct *cwq;
777 cwq = wq_per_cpu(keventd_wq, get_cpu()); 777 cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu());
778 __queue_work(cwq, &dwork->work); 778 __queue_work(cwq, &dwork->work);
779 put_cpu(); 779 put_cpu();
780 } 780 }