aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile6
-rw-r--r--kernel/audit_tree.c100
-rw-r--r--kernel/auditsc.c7
-rw-r--r--kernel/capability.c4
-rw-r--r--kernel/cgroup.c15
-rw-r--r--kernel/cpu.c2
-rw-r--r--kernel/early_res.c578
-rw-r--r--kernel/elfcore.c28
-rw-r--r--kernel/exit.c19
-rw-r--r--kernel/fork.c20
-rw-r--r--kernel/futex_compat.c6
-rw-r--r--kernel/irq/chip.c52
-rw-r--r--kernel/irq/handle.c58
-rw-r--r--kernel/irq/internals.h6
-rw-r--r--kernel/irq/numa_migrate.c4
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kprobes.c681
-rw-r--r--kernel/ksysfs.c8
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/lockdep.c18
-rw-r--r--kernel/module.c29
-rw-r--r--kernel/notifier.c6
-rw-r--r--kernel/padata.c8
-rw-r--r--kernel/panic.c46
-rw-r--r--kernel/params.c1
-rw-r--r--kernel/perf_event.c629
-rw-r--r--kernel/pid.c4
-rw-r--r--kernel/posix-cpu-timers.c36
-rw-r--r--kernel/posix-timers.c2
-rw-r--r--kernel/power/hibernate.c9
-rw-r--r--kernel/power/suspend.c3
-rw-r--r--kernel/printk.c55
-rw-r--r--kernel/ptrace.c88
-rw-r--r--kernel/range.c163
-rw-r--r--kernel/rcupdate.c29
-rw-r--r--kernel/rcutorture.c102
-rw-r--r--kernel/rcutree.c268
-rw-r--r--kernel/rcutree.h61
-rw-r--r--kernel/rcutree_plugin.h229
-rw-r--r--kernel/rcutree_trace.c14
-rw-r--r--kernel/relay.c5
-rw-r--r--kernel/resource.c22
-rw-r--r--kernel/sched.c2214
-rw-r--r--kernel/sched_cpupri.c4
-rw-r--r--kernel/sched_fair.c1699
-rw-r--r--kernel/sched_idletask.c23
-rw-r--r--kernel/sched_rt.c59
-rw-r--r--kernel/signal.c45
-rw-r--r--kernel/smp.c8
-rw-r--r--kernel/srcu.c52
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/sys.c8
-rw-r--r--kernel/sysctl.c14
-rw-r--r--kernel/sysctl_binary.c7
-rw-r--r--kernel/taskstats.c6
-rw-r--r--kernel/time/clocksource.c14
-rw-r--r--kernel/time/ntp.c10
-rw-r--r--kernel/time/timekeeping.c1
-rw-r--r--kernel/trace/Kconfig11
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/blktrace.c5
-rw-r--r--kernel/trace/ftrace.c105
-rw-r--r--kernel/trace/ring_buffer.c1
-rw-r--r--kernel/trace/ring_buffer_benchmark.c1
-rw-r--r--kernel/trace/trace.c150
-rw-r--r--kernel/trace/trace.h6
-rw-r--r--kernel/trace/trace_branch.c19
-rw-r--r--kernel/trace/trace_event_profile.c52
-rw-r--r--kernel/trace/trace_events.c81
-rw-r--r--kernel/trace/trace_events_filter.c4
-rw-r--r--kernel/trace/trace_export.c87
-rw-r--r--kernel/trace/trace_functions_graph.c82
-rw-r--r--kernel/trace/trace_kprobe.c304
-rw-r--r--kernel/trace/trace_syscalls.c189
-rw-r--r--kernel/tsacct.c1
-rw-r--r--kernel/user.c305
76 files changed, 5140 insertions, 3858 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6aebdeb2aa34..a987aa1676b5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o 13 async.o range.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
14obj-y += groups.o 15obj-y += groups.o
15 16
16ifdef CONFIG_FUNCTION_TRACER 17ifdef CONFIG_FUNCTION_TRACER
@@ -90,6 +91,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
90obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 91obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
91obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 92obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
92obj-$(CONFIG_LATENCYTOP) += latencytop.o 93obj-$(CONFIG_LATENCYTOP) += latencytop.o
94obj-$(CONFIG_BINFMT_ELF) += elfcore.o
95obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
96obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
93obj-$(CONFIG_FUNCTION_TRACER) += trace/ 97obj-$(CONFIG_FUNCTION_TRACER) += trace/
94obj-$(CONFIG_TRACING) += trace/ 98obj-$(CONFIG_TRACING) += trace/
95obj-$(CONFIG_X86_DS) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 4b05bd9479db..028e85663f27 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -548,6 +548,11 @@ int audit_remove_tree_rule(struct audit_krule *rule)
548 return 0; 548 return 0;
549} 549}
550 550
551static int compare_root(struct vfsmount *mnt, void *arg)
552{
553 return mnt->mnt_root->d_inode == arg;
554}
555
551void audit_trim_trees(void) 556void audit_trim_trees(void)
552{ 557{
553 struct list_head cursor; 558 struct list_head cursor;
@@ -559,7 +564,6 @@ void audit_trim_trees(void)
559 struct path path; 564 struct path path;
560 struct vfsmount *root_mnt; 565 struct vfsmount *root_mnt;
561 struct node *node; 566 struct node *node;
562 struct list_head list;
563 int err; 567 int err;
564 568
565 tree = container_of(cursor.next, struct audit_tree, list); 569 tree = container_of(cursor.next, struct audit_tree, list);
@@ -577,24 +581,16 @@ void audit_trim_trees(void)
577 if (!root_mnt) 581 if (!root_mnt)
578 goto skip_it; 582 goto skip_it;
579 583
580 list_add_tail(&list, &root_mnt->mnt_list);
581 spin_lock(&hash_lock); 584 spin_lock(&hash_lock);
582 list_for_each_entry(node, &tree->chunks, list) { 585 list_for_each_entry(node, &tree->chunks, list) {
583 struct audit_chunk *chunk = find_chunk(node); 586 struct inode *inode = find_chunk(node)->watch.inode;
584 struct inode *inode = chunk->watch.inode;
585 struct vfsmount *mnt;
586 node->index |= 1U<<31; 587 node->index |= 1U<<31;
587 list_for_each_entry(mnt, &list, mnt_list) { 588 if (iterate_mounts(compare_root, inode, root_mnt))
588 if (mnt->mnt_root->d_inode == inode) { 589 node->index &= ~(1U<<31);
589 node->index &= ~(1U<<31);
590 break;
591 }
592 }
593 } 590 }
594 spin_unlock(&hash_lock); 591 spin_unlock(&hash_lock);
595 trim_marked(tree); 592 trim_marked(tree);
596 put_tree(tree); 593 put_tree(tree);
597 list_del_init(&list);
598 drop_collected_mounts(root_mnt); 594 drop_collected_mounts(root_mnt);
599skip_it: 595skip_it:
600 mutex_lock(&audit_filter_mutex); 596 mutex_lock(&audit_filter_mutex);
@@ -603,22 +599,6 @@ skip_it:
603 mutex_unlock(&audit_filter_mutex); 599 mutex_unlock(&audit_filter_mutex);
604} 600}
605 601
606static int is_under(struct vfsmount *mnt, struct dentry *dentry,
607 struct path *path)
608{
609 if (mnt != path->mnt) {
610 for (;;) {
611 if (mnt->mnt_parent == mnt)
612 return 0;
613 if (mnt->mnt_parent == path->mnt)
614 break;
615 mnt = mnt->mnt_parent;
616 }
617 dentry = mnt->mnt_mountpoint;
618 }
619 return is_subdir(dentry, path->dentry);
620}
621
622int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) 602int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
623{ 603{
624 604
@@ -638,13 +618,17 @@ void audit_put_tree(struct audit_tree *tree)
638 put_tree(tree); 618 put_tree(tree);
639} 619}
640 620
621static int tag_mount(struct vfsmount *mnt, void *arg)
622{
623 return tag_chunk(mnt->mnt_root->d_inode, arg);
624}
625
641/* called with audit_filter_mutex */ 626/* called with audit_filter_mutex */
642int audit_add_tree_rule(struct audit_krule *rule) 627int audit_add_tree_rule(struct audit_krule *rule)
643{ 628{
644 struct audit_tree *seed = rule->tree, *tree; 629 struct audit_tree *seed = rule->tree, *tree;
645 struct path path; 630 struct path path;
646 struct vfsmount *mnt, *p; 631 struct vfsmount *mnt;
647 struct list_head list;
648 int err; 632 int err;
649 633
650 list_for_each_entry(tree, &tree_list, list) { 634 list_for_each_entry(tree, &tree_list, list) {
@@ -670,16 +654,9 @@ int audit_add_tree_rule(struct audit_krule *rule)
670 err = -ENOMEM; 654 err = -ENOMEM;
671 goto Err; 655 goto Err;
672 } 656 }
673 list_add_tail(&list, &mnt->mnt_list);
674 657
675 get_tree(tree); 658 get_tree(tree);
676 list_for_each_entry(p, &list, mnt_list) { 659 err = iterate_mounts(tag_mount, tree, mnt);
677 err = tag_chunk(p->mnt_root->d_inode, tree);
678 if (err)
679 break;
680 }
681
682 list_del(&list);
683 drop_collected_mounts(mnt); 660 drop_collected_mounts(mnt);
684 661
685 if (!err) { 662 if (!err) {
@@ -714,31 +691,23 @@ int audit_tag_tree(char *old, char *new)
714{ 691{
715 struct list_head cursor, barrier; 692 struct list_head cursor, barrier;
716 int failed = 0; 693 int failed = 0;
717 struct path path; 694 struct path path1, path2;
718 struct vfsmount *tagged; 695 struct vfsmount *tagged;
719 struct list_head list;
720 struct vfsmount *mnt;
721 struct dentry *dentry;
722 int err; 696 int err;
723 697
724 err = kern_path(new, 0, &path); 698 err = kern_path(new, 0, &path2);
725 if (err) 699 if (err)
726 return err; 700 return err;
727 tagged = collect_mounts(&path); 701 tagged = collect_mounts(&path2);
728 path_put(&path); 702 path_put(&path2);
729 if (!tagged) 703 if (!tagged)
730 return -ENOMEM; 704 return -ENOMEM;
731 705
732 err = kern_path(old, 0, &path); 706 err = kern_path(old, 0, &path1);
733 if (err) { 707 if (err) {
734 drop_collected_mounts(tagged); 708 drop_collected_mounts(tagged);
735 return err; 709 return err;
736 } 710 }
737 mnt = mntget(path.mnt);
738 dentry = dget(path.dentry);
739 path_put(&path);
740
741 list_add_tail(&list, &tagged->mnt_list);
742 711
743 mutex_lock(&audit_filter_mutex); 712 mutex_lock(&audit_filter_mutex);
744 list_add(&barrier, &tree_list); 713 list_add(&barrier, &tree_list);
@@ -746,7 +715,7 @@ int audit_tag_tree(char *old, char *new)
746 715
747 while (cursor.next != &tree_list) { 716 while (cursor.next != &tree_list) {
748 struct audit_tree *tree; 717 struct audit_tree *tree;
749 struct vfsmount *p; 718 int good_one = 0;
750 719
751 tree = container_of(cursor.next, struct audit_tree, list); 720 tree = container_of(cursor.next, struct audit_tree, list);
752 get_tree(tree); 721 get_tree(tree);
@@ -754,30 +723,19 @@ int audit_tag_tree(char *old, char *new)
754 list_add(&cursor, &tree->list); 723 list_add(&cursor, &tree->list);
755 mutex_unlock(&audit_filter_mutex); 724 mutex_unlock(&audit_filter_mutex);
756 725
757 err = kern_path(tree->pathname, 0, &path); 726 err = kern_path(tree->pathname, 0, &path2);
758 if (err) { 727 if (!err) {
759 put_tree(tree); 728 good_one = path_is_under(&path1, &path2);
760 mutex_lock(&audit_filter_mutex); 729 path_put(&path2);
761 continue;
762 } 730 }
763 731
764 spin_lock(&vfsmount_lock); 732 if (!good_one) {
765 if (!is_under(mnt, dentry, &path)) {
766 spin_unlock(&vfsmount_lock);
767 path_put(&path);
768 put_tree(tree); 733 put_tree(tree);
769 mutex_lock(&audit_filter_mutex); 734 mutex_lock(&audit_filter_mutex);
770 continue; 735 continue;
771 } 736 }
772 spin_unlock(&vfsmount_lock);
773 path_put(&path);
774
775 list_for_each_entry(p, &list, mnt_list) {
776 failed = tag_chunk(p->mnt_root->d_inode, tree);
777 if (failed)
778 break;
779 }
780 737
738 failed = iterate_mounts(tag_mount, tree, tagged);
781 if (failed) { 739 if (failed) {
782 put_tree(tree); 740 put_tree(tree);
783 mutex_lock(&audit_filter_mutex); 741 mutex_lock(&audit_filter_mutex);
@@ -818,10 +776,8 @@ int audit_tag_tree(char *old, char *new)
818 } 776 }
819 list_del(&barrier); 777 list_del(&barrier);
820 list_del(&cursor); 778 list_del(&cursor);
821 list_del(&list);
822 mutex_unlock(&audit_filter_mutex); 779 mutex_unlock(&audit_filter_mutex);
823 dput(dentry); 780 path_put(&path1);
824 mntput(mnt);
825 drop_collected_mounts(tagged); 781 drop_collected_mounts(tagged);
826 return failed; 782 return failed;
827} 783}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fc0f928167e7..f3a461c0970a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1988,7 +1988,6 @@ void __audit_inode(const char *name, const struct dentry *dentry)
1988 1988
1989/** 1989/**
1990 * audit_inode_child - collect inode info for created/removed objects 1990 * audit_inode_child - collect inode info for created/removed objects
1991 * @dname: inode's dentry name
1992 * @dentry: dentry being audited 1991 * @dentry: dentry being audited
1993 * @parent: inode of dentry parent 1992 * @parent: inode of dentry parent
1994 * 1993 *
@@ -2000,13 +1999,14 @@ void __audit_inode(const char *name, const struct dentry *dentry)
2000 * must be hooked prior, in order to capture the target inode during 1999 * must be hooked prior, in order to capture the target inode during
2001 * unsuccessful attempts. 2000 * unsuccessful attempts.
2002 */ 2001 */
2003void __audit_inode_child(const char *dname, const struct dentry *dentry, 2002void __audit_inode_child(const struct dentry *dentry,
2004 const struct inode *parent) 2003 const struct inode *parent)
2005{ 2004{
2006 int idx; 2005 int idx;
2007 struct audit_context *context = current->audit_context; 2006 struct audit_context *context = current->audit_context;
2008 const char *found_parent = NULL, *found_child = NULL; 2007 const char *found_parent = NULL, *found_child = NULL;
2009 const struct inode *inode = dentry->d_inode; 2008 const struct inode *inode = dentry->d_inode;
2009 const char *dname = dentry->d_name.name;
2010 int dirlen = 0; 2010 int dirlen = 0;
2011 2011
2012 if (!context->in_syscall) 2012 if (!context->in_syscall)
@@ -2014,9 +2014,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
2014 2014
2015 if (inode) 2015 if (inode)
2016 handle_one(inode); 2016 handle_one(inode);
2017 /* determine matching parent */
2018 if (!dname)
2019 goto add_names;
2020 2017
2021 /* parent is more likely, look for it first */ 2018 /* parent is more likely, look for it first */
2022 for (idx = 0; idx < context->name_count; idx++) { 2019 for (idx = 0; idx < context->name_count; idx++) {
diff --git a/kernel/capability.c b/kernel/capability.c
index 7f876e60521f..9e4697e9b276 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -135,7 +135,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
135 if (pid && (pid != task_pid_vnr(current))) { 135 if (pid && (pid != task_pid_vnr(current))) {
136 struct task_struct *target; 136 struct task_struct *target;
137 137
138 read_lock(&tasklist_lock); 138 rcu_read_lock();
139 139
140 target = find_task_by_vpid(pid); 140 target = find_task_by_vpid(pid);
141 if (!target) 141 if (!target)
@@ -143,7 +143,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
143 else 143 else
144 ret = security_capget(target, pEp, pIp, pPp); 144 ret = security_capget(target, pEp, pIp, pPp);
145 145
146 read_unlock(&tasklist_lock); 146 rcu_read_unlock();
147 } else 147 } else
148 ret = security_capget(current, pEp, pIp, pPp); 148 ret = security_capget(current, pEp, pIp, pPp);
149 149
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index aa3bee566446..4fd90e129772 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/cgroup.h> 25#include <linux/cgroup.h>
26#include <linux/module.h>
26#include <linux/ctype.h> 27#include <linux/ctype.h>
27#include <linux/errno.h> 28#include <linux/errno.h>
28#include <linux/fs.h> 29#include <linux/fs.h>
@@ -166,6 +167,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
166 */ 167 */
167static int need_forkexit_callback __read_mostly; 168static int need_forkexit_callback __read_mostly;
168 169
170#ifdef CONFIG_PROVE_LOCKING
171int cgroup_lock_is_held(void)
172{
173 return lockdep_is_held(&cgroup_mutex);
174}
175#else /* #ifdef CONFIG_PROVE_LOCKING */
176int cgroup_lock_is_held(void)
177{
178 return mutex_is_locked(&cgroup_mutex);
179}
180#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
181
182EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
183
169/* convenient tests for these bits */ 184/* convenient tests for these bits */
170inline int cgroup_is_removed(const struct cgroup *cgrp) 185inline int cgroup_is_removed(const struct cgroup *cgrp)
171{ 186{
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 677f25376a38..f8cced2692b3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -338,7 +338,7 @@ int __cpuinit cpu_up(unsigned int cpu)
338 if (!cpu_possible(cpu)) { 338 if (!cpu_possible(cpu)) {
339 printk(KERN_ERR "can't online cpu %d because it is not " 339 printk(KERN_ERR "can't online cpu %d because it is not "
340 "configured as may-hotadd at boot time\n", cpu); 340 "configured as may-hotadd at boot time\n", cpu);
341#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 341#if defined(CONFIG_IA64)
342 printk(KERN_ERR "please check additional_cpus= boot " 342 printk(KERN_ERR "please check additional_cpus= boot "
343 "parameter\n"); 343 "parameter\n");
344#endif 344#endif
diff --git a/kernel/early_res.c b/kernel/early_res.c
new file mode 100644
index 000000000000..3cb2c661bb78
--- /dev/null
+++ b/kernel/early_res.c
@@ -0,0 +1,578 @@
1/*
2 * early_res, could be used to replace bootmem
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/bootmem.h>
8#include <linux/mm.h>
9#include <linux/early_res.h>
10
11/*
12 * Early reserved memory areas.
13 */
14/*
15 * need to make sure this one is bigger enough before
16 * find_fw_memmap_area could be used
17 */
18#define MAX_EARLY_RES_X 32
19
20struct early_res {
21 u64 start, end;
22 char name[15];
23 char overlap_ok;
24};
25static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
26
27static int max_early_res __initdata = MAX_EARLY_RES_X;
28static struct early_res *early_res __initdata = &early_res_x[0];
29static int early_res_count __initdata;
30
31static int __init find_overlapped_early(u64 start, u64 end)
32{
33 int i;
34 struct early_res *r;
35
36 for (i = 0; i < max_early_res && early_res[i].end; i++) {
37 r = &early_res[i];
38 if (end > r->start && start < r->end)
39 break;
40 }
41
42 return i;
43}
44
45/*
46 * Drop the i-th range from the early reservation map,
47 * by copying any higher ranges down one over it, and
48 * clearing what had been the last slot.
49 */
50static void __init drop_range(int i)
51{
52 int j;
53
54 for (j = i + 1; j < max_early_res && early_res[j].end; j++)
55 ;
56
57 memmove(&early_res[i], &early_res[i + 1],
58 (j - 1 - i) * sizeof(struct early_res));
59
60 early_res[j - 1].end = 0;
61 early_res_count--;
62}
63
64static void __init drop_range_partial(int i, u64 start, u64 end)
65{
66 u64 common_start, common_end;
67 u64 old_start, old_end;
68
69 old_start = early_res[i].start;
70 old_end = early_res[i].end;
71 common_start = max(old_start, start);
72 common_end = min(old_end, end);
73
74 /* no overlap ? */
75 if (common_start >= common_end)
76 return;
77
78 if (old_start < common_start) {
79 /* make head segment */
80 early_res[i].end = common_start;
81 if (old_end > common_end) {
82 char name[15];
83
84 /*
85 * Save a local copy of the name, since the
86 * early_res array could get resized inside
87 * reserve_early_without_check() ->
88 * __check_and_double_early_res(), which would
89 * make the current name pointer invalid.
90 */
91 strncpy(name, early_res[i].name,
92 sizeof(early_res[i].name) - 1);
93 /* add another for left over on tail */
94 reserve_early_without_check(common_end, old_end, name);
95 }
96 return;
97 } else {
98 if (old_end > common_end) {
99 /* reuse the entry for tail left */
100 early_res[i].start = common_end;
101 return;
102 }
103 /* all covered */
104 drop_range(i);
105 }
106}
107
108/*
109 * Split any existing ranges that:
110 * 1) are marked 'overlap_ok', and
111 * 2) overlap with the stated range [start, end)
112 * into whatever portion (if any) of the existing range is entirely
113 * below or entirely above the stated range. Drop the portion
114 * of the existing range that overlaps with the stated range,
115 * which will allow the caller of this routine to then add that
116 * stated range without conflicting with any existing range.
117 */
118static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
119{
120 int i;
121 struct early_res *r;
122 u64 lower_start, lower_end;
123 u64 upper_start, upper_end;
124 char name[15];
125
126 for (i = 0; i < max_early_res && early_res[i].end; i++) {
127 r = &early_res[i];
128
129 /* Continue past non-overlapping ranges */
130 if (end <= r->start || start >= r->end)
131 continue;
132
133 /*
134 * Leave non-ok overlaps as is; let caller
135 * panic "Overlapping early reservations"
136 * when it hits this overlap.
137 */
138 if (!r->overlap_ok)
139 return;
140
141 /*
142 * We have an ok overlap. We will drop it from the early
143 * reservation map, and add back in any non-overlapping
144 * portions (lower or upper) as separate, overlap_ok,
145 * non-overlapping ranges.
146 */
147
148 /* 1. Note any non-overlapping (lower or upper) ranges. */
149 strncpy(name, r->name, sizeof(name) - 1);
150
151 lower_start = lower_end = 0;
152 upper_start = upper_end = 0;
153 if (r->start < start) {
154 lower_start = r->start;
155 lower_end = start;
156 }
157 if (r->end > end) {
158 upper_start = end;
159 upper_end = r->end;
160 }
161
162 /* 2. Drop the original ok overlapping range */
163 drop_range(i);
164
165 i--; /* resume for-loop on copied down entry */
166
167 /* 3. Add back in any non-overlapping ranges. */
168 if (lower_end)
169 reserve_early_overlap_ok(lower_start, lower_end, name);
170 if (upper_end)
171 reserve_early_overlap_ok(upper_start, upper_end, name);
172 }
173}
174
175static void __init __reserve_early(u64 start, u64 end, char *name,
176 int overlap_ok)
177{
178 int i;
179 struct early_res *r;
180
181 i = find_overlapped_early(start, end);
182 if (i >= max_early_res)
183 panic("Too many early reservations");
184 r = &early_res[i];
185 if (r->end)
186 panic("Overlapping early reservations "
187 "%llx-%llx %s to %llx-%llx %s\n",
188 start, end - 1, name ? name : "", r->start,
189 r->end - 1, r->name);
190 r->start = start;
191 r->end = end;
192 r->overlap_ok = overlap_ok;
193 if (name)
194 strncpy(r->name, name, sizeof(r->name) - 1);
195 early_res_count++;
196}
197
198/*
199 * A few early reservtations come here.
200 *
201 * The 'overlap_ok' in the name of this routine does -not- mean it
202 * is ok for these reservations to overlap an earlier reservation.
203 * Rather it means that it is ok for subsequent reservations to
204 * overlap this one.
205 *
206 * Use this entry point to reserve early ranges when you are doing
207 * so out of "Paranoia", reserving perhaps more memory than you need,
208 * just in case, and don't mind a subsequent overlapping reservation
209 * that is known to be needed.
210 *
211 * The drop_overlaps_that_are_ok() call here isn't really needed.
212 * It would be needed if we had two colliding 'overlap_ok'
213 * reservations, so that the second such would not panic on the
214 * overlap with the first. We don't have any such as of this
215 * writing, but might as well tolerate such if it happens in
216 * the future.
217 */
218void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
219{
220 drop_overlaps_that_are_ok(start, end);
221 __reserve_early(start, end, name, 1);
222}
223
224static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
225{
226 u64 start, end, size, mem;
227 struct early_res *new;
228
229 /* do we have enough slots left ? */
230 if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
231 return;
232
233 /* double it */
234 mem = -1ULL;
235 size = sizeof(struct early_res) * max_early_res * 2;
236 if (early_res == early_res_x)
237 start = 0;
238 else
239 start = early_res[0].end;
240 end = ex_start;
241 if (start + size < end)
242 mem = find_fw_memmap_area(start, end, size,
243 sizeof(struct early_res));
244 if (mem == -1ULL) {
245 start = ex_end;
246 end = get_max_mapped();
247 if (start + size < end)
248 mem = find_fw_memmap_area(start, end, size,
249 sizeof(struct early_res));
250 }
251 if (mem == -1ULL)
252 panic("can not find more space for early_res array");
253
254 new = __va(mem);
255 /* save the first one for own */
256 new[0].start = mem;
257 new[0].end = mem + size;
258 new[0].overlap_ok = 0;
259 /* copy old to new */
260 if (early_res == early_res_x) {
261 memcpy(&new[1], &early_res[0],
262 sizeof(struct early_res) * max_early_res);
263 memset(&new[max_early_res+1], 0,
264 sizeof(struct early_res) * (max_early_res - 1));
265 early_res_count++;
266 } else {
267 memcpy(&new[1], &early_res[1],
268 sizeof(struct early_res) * (max_early_res - 1));
269 memset(&new[max_early_res], 0,
270 sizeof(struct early_res) * max_early_res);
271 }
272 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
273 early_res = new;
274 max_early_res *= 2;
275 printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
276 max_early_res, mem, mem + size - 1);
277}
278
279/*
280 * Most early reservations come here.
281 *
282 * We first have drop_overlaps_that_are_ok() drop any pre-existing
283 * 'overlap_ok' ranges, so that we can then reserve this memory
284 * range without risk of panic'ing on an overlapping overlap_ok
285 * early reservation.
286 */
287void __init reserve_early(u64 start, u64 end, char *name)
288{
289 if (start >= end)
290 return;
291
292 __check_and_double_early_res(start, end);
293
294 drop_overlaps_that_are_ok(start, end);
295 __reserve_early(start, end, name, 0);
296}
297
298void __init reserve_early_without_check(u64 start, u64 end, char *name)
299{
300 struct early_res *r;
301
302 if (start >= end)
303 return;
304
305 __check_and_double_early_res(start, end);
306
307 r = &early_res[early_res_count];
308
309 r->start = start;
310 r->end = end;
311 r->overlap_ok = 0;
312 if (name)
313 strncpy(r->name, name, sizeof(r->name) - 1);
314 early_res_count++;
315}
316
317void __init free_early(u64 start, u64 end)
318{
319 struct early_res *r;
320 int i;
321
322 i = find_overlapped_early(start, end);
323 r = &early_res[i];
324 if (i >= max_early_res || r->end != end || r->start != start)
325 panic("free_early on not reserved area: %llx-%llx!",
326 start, end - 1);
327
328 drop_range(i);
329}
330
331void __init free_early_partial(u64 start, u64 end)
332{
333 struct early_res *r;
334 int i;
335
336try_next:
337 i = find_overlapped_early(start, end);
338 if (i >= max_early_res)
339 return;
340
341 r = &early_res[i];
342 /* hole ? */
343 if (r->end >= end && r->start <= start) {
344 drop_range_partial(i, start, end);
345 return;
346 }
347
348 drop_range_partial(i, start, end);
349 goto try_next;
350}
351
352#ifdef CONFIG_NO_BOOTMEM
353static void __init subtract_early_res(struct range *range, int az)
354{
355 int i, count;
356 u64 final_start, final_end;
357 int idx = 0;
358
359 count = 0;
360 for (i = 0; i < max_early_res && early_res[i].end; i++)
361 count++;
362
363 /* need to skip first one ?*/
364 if (early_res != early_res_x)
365 idx = 1;
366
367#define DEBUG_PRINT_EARLY_RES 1
368
369#if DEBUG_PRINT_EARLY_RES
370 printk(KERN_INFO "Subtract (%d early reservations)\n", count);
371#endif
372 for (i = idx; i < count; i++) {
373 struct early_res *r = &early_res[i];
374#if DEBUG_PRINT_EARLY_RES
375 printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
376 r->start, r->end, r->name);
377#endif
378 final_start = PFN_DOWN(r->start);
379 final_end = PFN_UP(r->end);
380 if (final_start >= final_end)
381 continue;
382 subtract_range(range, az, final_start, final_end);
383 }
384
385}
386
387int __init get_free_all_memory_range(struct range **rangep, int nodeid)
388{
389 int i, count;
390 u64 start = 0, end;
391 u64 size;
392 u64 mem;
393 struct range *range;
394 int nr_range;
395
396 count = 0;
397 for (i = 0; i < max_early_res && early_res[i].end; i++)
398 count++;
399
400 count *= 2;
401
402 size = sizeof(struct range) * count;
403 end = get_max_mapped();
404#ifdef MAX_DMA32_PFN
405 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
406 start = MAX_DMA32_PFN << PAGE_SHIFT;
407#endif
408 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
409 if (mem == -1ULL)
410 panic("can not find more space for range free");
411
412 range = __va(mem);
413 /* use early_node_map[] and early_res to get range array at first */
414 memset(range, 0, size);
415 nr_range = 0;
416
417 /* need to go over early_node_map to find out good range for node */
418 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
419#ifdef CONFIG_X86_32
420 subtract_range(range, count, max_low_pfn, -1ULL);
421#endif
422 subtract_early_res(range, count);
423 nr_range = clean_sort_range(range, count);
424
425 /* need to clear it ? */
426 if (nodeid == MAX_NUMNODES) {
427 memset(&early_res[0], 0,
428 sizeof(struct early_res) * max_early_res);
429 early_res = NULL;
430 max_early_res = 0;
431 }
432
433 *rangep = range;
434 return nr_range;
435}
436#else
437void __init early_res_to_bootmem(u64 start, u64 end)
438{
439 int i, count;
440 u64 final_start, final_end;
441 int idx = 0;
442
443 count = 0;
444 for (i = 0; i < max_early_res && early_res[i].end; i++)
445 count++;
446
447 /* need to skip first one ?*/
448 if (early_res != early_res_x)
449 idx = 1;
450
451 printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
452 count - idx, max_early_res, start, end);
453 for (i = idx; i < count; i++) {
454 struct early_res *r = &early_res[i];
455 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
456 r->start, r->end, r->name);
457 final_start = max(start, r->start);
458 final_end = min(end, r->end);
459 if (final_start >= final_end) {
460 printk(KERN_CONT "\n");
461 continue;
462 }
463 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
464 final_start, final_end);
465 reserve_bootmem_generic(final_start, final_end - final_start,
466 BOOTMEM_DEFAULT);
467 }
468 /* clear them */
469 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
470 early_res = NULL;
471 max_early_res = 0;
472 early_res_count = 0;
473}
474#endif
475
476/* Check for already reserved areas */
477static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
478{
479 int i;
480 u64 addr = *addrp;
481 int changed = 0;
482 struct early_res *r;
483again:
484 i = find_overlapped_early(addr, addr + size);
485 r = &early_res[i];
486 if (i < max_early_res && r->end) {
487 *addrp = addr = round_up(r->end, align);
488 changed = 1;
489 goto again;
490 }
491 return changed;
492}
493
494/* Check for already reserved areas */
495static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
496{
497 int i;
498 u64 addr = *addrp, last;
499 u64 size = *sizep;
500 int changed = 0;
501again:
502 last = addr + size;
503 for (i = 0; i < max_early_res && early_res[i].end; i++) {
504 struct early_res *r = &early_res[i];
505 if (last > r->start && addr < r->start) {
506 size = r->start - addr;
507 changed = 1;
508 goto again;
509 }
510 if (last > r->end && addr < r->end) {
511 addr = round_up(r->end, align);
512 size = last - addr;
513 changed = 1;
514 goto again;
515 }
516 if (last <= r->end && addr >= r->start) {
517 (*sizep)++;
518 return 0;
519 }
520 }
521 if (changed) {
522 *addrp = addr;
523 *sizep = size;
524 }
525 return changed;
526}
527
528/*
529 * Find a free area with specified alignment in a specific range.
530 * only with the area.between start to end is active range from early_node_map
531 * so they are good as RAM
532 */
533u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
534 u64 size, u64 align)
535{
536 u64 addr, last;
537
538 addr = round_up(ei_start, align);
539 if (addr < start)
540 addr = round_up(start, align);
541 if (addr >= ei_last)
542 goto out;
543 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
544 ;
545 last = addr + size;
546 if (last > ei_last)
547 goto out;
548 if (last > end)
549 goto out;
550
551 return addr;
552
553out:
554 return -1ULL;
555}
556
557u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
558 u64 *sizep, u64 align)
559{
560 u64 addr, last;
561
562 addr = round_up(ei_start, align);
563 if (addr < start)
564 addr = round_up(start, align);
565 if (addr >= ei_last)
566 goto out;
567 *sizep = ei_last - addr;
568 while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
569 ;
570 last = addr + *sizep;
571 if (last > ei_last)
572 goto out;
573
574 return addr;
575
576out:
577 return -1ULL;
578}
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
new file mode 100644
index 000000000000..ff915efef66d
--- /dev/null
+++ b/kernel/elfcore.c
@@ -0,0 +1,28 @@
1#include <linux/elf.h>
2#include <linux/fs.h>
3#include <linux/mm.h>
4
5#include <asm/elf.h>
6
7
8Elf_Half __weak elf_core_extra_phdrs(void)
9{
10 return 0;
11}
12
13int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
14 unsigned long limit)
15{
16 return 1;
17}
18
19int __weak elf_core_write_extra_data(struct file *file, size_t *size,
20 unsigned long limit)
21{
22 return 1;
23}
24
25size_t __weak elf_core_extra_data_size(void)
26{
27 return 0;
28}
diff --git a/kernel/exit.c b/kernel/exit.c
index 546774a31a66..ce1e48c2d93d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -85,7 +85,9 @@ static void __exit_signal(struct task_struct *tsk)
85 BUG_ON(!sig); 85 BUG_ON(!sig);
86 BUG_ON(!atomic_read(&sig->count)); 86 BUG_ON(!atomic_read(&sig->count));
87 87
88 sighand = rcu_dereference(tsk->sighand); 88 sighand = rcu_dereference_check(tsk->sighand,
89 rcu_read_lock_held() ||
90 lockdep_is_held(&tasklist_lock));
89 spin_lock(&sighand->siglock); 91 spin_lock(&sighand->siglock);
90 92
91 posix_cpu_timers_exit(tsk); 93 posix_cpu_timers_exit(tsk);
@@ -170,8 +172,10 @@ void release_task(struct task_struct * p)
170repeat: 172repeat:
171 tracehook_prepare_release_task(p); 173 tracehook_prepare_release_task(p);
172 /* don't need to get the RCU readlock here - the process is dead and 174 /* don't need to get the RCU readlock here - the process is dead and
173 * can't be modifying its own credentials */ 175 * can't be modifying its own credentials. But shut RCU-lockdep up */
176 rcu_read_lock();
174 atomic_dec(&__task_cred(p)->user->processes); 177 atomic_dec(&__task_cred(p)->user->processes);
178 rcu_read_unlock();
175 179
176 proc_flush_task(p); 180 proc_flush_task(p);
177 181
@@ -473,9 +477,11 @@ static void close_files(struct files_struct * files)
473 /* 477 /*
474 * It is safe to dereference the fd table without RCU or 478 * It is safe to dereference the fd table without RCU or
475 * ->file_lock because this is the last reference to the 479 * ->file_lock because this is the last reference to the
476 * files structure. 480 * files structure. But use RCU to shut RCU-lockdep up.
477 */ 481 */
482 rcu_read_lock();
478 fdt = files_fdtable(files); 483 fdt = files_fdtable(files);
484 rcu_read_unlock();
479 for (;;) { 485 for (;;) {
480 unsigned long set; 486 unsigned long set;
481 i = j * __NFDBITS; 487 i = j * __NFDBITS;
@@ -521,10 +527,12 @@ void put_files_struct(struct files_struct *files)
521 * at the end of the RCU grace period. Otherwise, 527 * at the end of the RCU grace period. Otherwise,
522 * you can free files immediately. 528 * you can free files immediately.
523 */ 529 */
530 rcu_read_lock();
524 fdt = files_fdtable(files); 531 fdt = files_fdtable(files);
525 if (fdt != &files->fdtab) 532 if (fdt != &files->fdtab)
526 kmem_cache_free(files_cachep, files); 533 kmem_cache_free(files_cachep, files);
527 free_fdtable(fdt); 534 free_fdtable(fdt);
535 rcu_read_unlock();
528 } 536 }
529} 537}
530 538
@@ -944,7 +952,8 @@ NORET_TYPE void do_exit(long code)
944 preempt_count()); 952 preempt_count());
945 953
946 acct_update_integrals(tsk); 954 acct_update_integrals(tsk);
947 955 /* sync mm's RSS info before statistics gathering */
956 sync_mm_rss(tsk, tsk->mm);
948 group_dead = atomic_dec_and_test(&tsk->signal->live); 957 group_dead = atomic_dec_and_test(&tsk->signal->live);
949 if (group_dead) { 958 if (group_dead) {
950 hrtimer_cancel(&tsk->signal->real_timer); 959 hrtimer_cancel(&tsk->signal->real_timer);
@@ -1180,7 +1189,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1180 1189
1181 if (unlikely(wo->wo_flags & WNOWAIT)) { 1190 if (unlikely(wo->wo_flags & WNOWAIT)) {
1182 int exit_code = p->exit_code; 1191 int exit_code = p->exit_code;
1183 int why, status; 1192 int why;
1184 1193
1185 get_task_struct(p); 1194 get_task_struct(p);
1186 read_unlock(&tasklist_lock); 1195 read_unlock(&tasklist_lock);
diff --git a/kernel/fork.c b/kernel/fork.c
index f88bd984df35..b0ec34abc0bb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -86,6 +86,7 @@ int max_threads; /* tunable limit on nr_threads */
86DEFINE_PER_CPU(unsigned long, process_counts) = 0; 86DEFINE_PER_CPU(unsigned long, process_counts) = 0;
87 87
88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
89EXPORT_SYMBOL_GPL(tasklist_lock);
89 90
90int nr_processes(void) 91int nr_processes(void)
91{ 92{
@@ -328,15 +329,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
328 if (!tmp) 329 if (!tmp)
329 goto fail_nomem; 330 goto fail_nomem;
330 *tmp = *mpnt; 331 *tmp = *mpnt;
332 INIT_LIST_HEAD(&tmp->anon_vma_chain);
331 pol = mpol_dup(vma_policy(mpnt)); 333 pol = mpol_dup(vma_policy(mpnt));
332 retval = PTR_ERR(pol); 334 retval = PTR_ERR(pol);
333 if (IS_ERR(pol)) 335 if (IS_ERR(pol))
334 goto fail_nomem_policy; 336 goto fail_nomem_policy;
335 vma_set_policy(tmp, pol); 337 vma_set_policy(tmp, pol);
338 if (anon_vma_fork(tmp, mpnt))
339 goto fail_nomem_anon_vma_fork;
336 tmp->vm_flags &= ~VM_LOCKED; 340 tmp->vm_flags &= ~VM_LOCKED;
337 tmp->vm_mm = mm; 341 tmp->vm_mm = mm;
338 tmp->vm_next = NULL; 342 tmp->vm_next = NULL;
339 anon_vma_link(tmp);
340 file = tmp->vm_file; 343 file = tmp->vm_file;
341 if (file) { 344 if (file) {
342 struct inode *inode = file->f_path.dentry->d_inode; 345 struct inode *inode = file->f_path.dentry->d_inode;
@@ -391,6 +394,8 @@ out:
391 flush_tlb_mm(oldmm); 394 flush_tlb_mm(oldmm);
392 up_write(&oldmm->mmap_sem); 395 up_write(&oldmm->mmap_sem);
393 return retval; 396 return retval;
397fail_nomem_anon_vma_fork:
398 mpol_put(pol);
394fail_nomem_policy: 399fail_nomem_policy:
395 kmem_cache_free(vm_area_cachep, tmp); 400 kmem_cache_free(vm_area_cachep, tmp);
396fail_nomem: 401fail_nomem:
@@ -454,8 +459,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
454 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; 459 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
455 mm->core_state = NULL; 460 mm->core_state = NULL;
456 mm->nr_ptes = 0; 461 mm->nr_ptes = 0;
457 set_mm_counter(mm, file_rss, 0); 462 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
458 set_mm_counter(mm, anon_rss, 0);
459 spin_lock_init(&mm->page_table_lock); 463 spin_lock_init(&mm->page_table_lock);
460 mm->free_area_cache = TASK_UNMAPPED_BASE; 464 mm->free_area_cache = TASK_UNMAPPED_BASE;
461 mm->cached_hole_size = ~0UL; 465 mm->cached_hole_size = ~0UL;
@@ -824,6 +828,8 @@ void __cleanup_sighand(struct sighand_struct *sighand)
824 */ 828 */
825static void posix_cpu_timers_init_group(struct signal_struct *sig) 829static void posix_cpu_timers_init_group(struct signal_struct *sig)
826{ 830{
831 unsigned long cpu_limit;
832
827 /* Thread group counters. */ 833 /* Thread group counters. */
828 thread_group_cputime_init(sig); 834 thread_group_cputime_init(sig);
829 835
@@ -838,9 +844,9 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
838 sig->cputime_expires.virt_exp = cputime_zero; 844 sig->cputime_expires.virt_exp = cputime_zero;
839 sig->cputime_expires.sched_exp = 0; 845 sig->cputime_expires.sched_exp = 0;
840 846
841 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 847 cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
842 sig->cputime_expires.prof_exp = 848 if (cpu_limit != RLIM_INFINITY) {
843 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 849 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
844 sig->cputimer.running = 1; 850 sig->cputimer.running = 1;
845 } 851 }
846 852
@@ -1033,7 +1039,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1033#endif 1039#endif
1034 retval = -EAGAIN; 1040 retval = -EAGAIN;
1035 if (atomic_read(&p->real_cred->user->processes) >= 1041 if (atomic_read(&p->real_cred->user->processes) >=
1036 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 1042 task_rlimit(p, RLIMIT_NPROC)) {
1037 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1043 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
1038 p->real_cred->user != INIT_USER) 1044 p->real_cred->user != INIT_USER)
1039 goto bad_fork_free; 1045 goto bad_fork_free;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 235716556bf1..d49afb2395e5 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -146,7 +146,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
146 struct task_struct *p; 146 struct task_struct *p;
147 147
148 ret = -ESRCH; 148 ret = -ESRCH;
149 read_lock(&tasklist_lock); 149 rcu_read_lock();
150 p = find_task_by_vpid(pid); 150 p = find_task_by_vpid(pid);
151 if (!p) 151 if (!p)
152 goto err_unlock; 152 goto err_unlock;
@@ -157,7 +157,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
157 !capable(CAP_SYS_PTRACE)) 157 !capable(CAP_SYS_PTRACE))
158 goto err_unlock; 158 goto err_unlock;
159 head = p->compat_robust_list; 159 head = p->compat_robust_list;
160 read_unlock(&tasklist_lock); 160 rcu_read_unlock();
161 } 161 }
162 162
163 if (put_user(sizeof(*head), len_ptr)) 163 if (put_user(sizeof(*head), len_ptr))
@@ -165,7 +165,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
165 return put_user(ptr_to_compat(head), head_ptr); 165 return put_user(ptr_to_compat(head), head_ptr);
166 166
167err_unlock: 167err_unlock:
168 read_unlock(&tasklist_lock); 168 rcu_read_unlock();
169 169
170 return ret; 170 return ret;
171} 171}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ecc3fa28f666..d70394f12ee9 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,11 +18,7 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22 * dynamic_irq_init - initialize a dynamically allocated irq
23 * @irq: irq number to initialize
24 */
25void dynamic_irq_init(unsigned int irq)
26{ 22{
27 struct irq_desc *desc; 23 struct irq_desc *desc;
28 unsigned long flags; 24 unsigned long flags;
@@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq)
41 desc->depth = 1; 37 desc->depth = 1;
42 desc->msi_desc = NULL; 38 desc->msi_desc = NULL;
43 desc->handler_data = NULL; 39 desc->handler_data = NULL;
44 desc->chip_data = NULL; 40 if (!keep_chip_data)
41 desc->chip_data = NULL;
45 desc->action = NULL; 42 desc->action = NULL;
46 desc->irq_count = 0; 43 desc->irq_count = 0;
47 desc->irqs_unhandled = 0; 44 desc->irqs_unhandled = 0;
@@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq)
55} 52}
56 53
57/** 54/**
58 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 55 * dynamic_irq_init - initialize a dynamically allocated irq
59 * @irq: irq number to initialize 56 * @irq: irq number to initialize
60 */ 57 */
61void dynamic_irq_cleanup(unsigned int irq) 58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
62{ 75{
63 struct irq_desc *desc = irq_to_desc(irq); 76 struct irq_desc *desc = irq_to_desc(irq);
64 unsigned long flags; 77 unsigned long flags;
@@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq)
77 } 90 }
78 desc->msi_desc = NULL; 91 desc->msi_desc = NULL;
79 desc->handler_data = NULL; 92 desc->handler_data = NULL;
80 desc->chip_data = NULL; 93 if (!keep_chip_data)
94 desc->chip_data = NULL;
81 desc->handle_irq = handle_bad_irq; 95 desc->handle_irq = handle_bad_irq;
82 desc->chip = &no_irq_chip; 96 desc->chip = &no_irq_chip;
83 desc->name = NULL; 97 desc->name = NULL;
@@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq)
85 raw_spin_unlock_irqrestore(&desc->lock, flags); 99 raw_spin_unlock_irqrestore(&desc->lock, flags);
86} 100}
87 101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to initialize
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to initialize
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
120}
121
88 122
89/** 123/**
90 * set_irq_chip - set the irq chip for an irq 124 * set_irq_chip - set the irq chip for an irq
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 814940e7f485..76d5a671bfe1 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -19,7 +19,7 @@
19#include <linux/kernel_stat.h> 19#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 20#include <linux/rculist.h>
21#include <linux/hash.h> 21#include <linux/hash.h>
22#include <linux/bootmem.h> 22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 23#include <trace/events/irq.h>
24 24
25#include "internals.h" 25#include "internals.h"
@@ -87,12 +87,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{ 87{
88 void *ptr; 88 void *ptr;
89 89
90 if (slab_is_available()) 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), 91 GFP_ATOMIC, node);
92 GFP_ATOMIC, node);
93 else
94 ptr = alloc_bootmem_node(NODE_DATA(node),
95 nr * sizeof(*desc->kstat_irqs));
96 92
97 /* 93 /*
98 * don't overwite if can not get new one 94 * don't overwite if can not get new one
@@ -132,7 +128,26 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
132 */ 128 */
133DEFINE_RAW_SPINLOCK(sparse_irq_lock); 129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
134 130
135struct irq_desc **irq_desc_ptrs __read_mostly; 131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
136 151
137static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { 152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
138 [0 ... NR_IRQS_LEGACY-1] = { 153 [0 ... NR_IRQS_LEGACY-1] = {
@@ -164,9 +179,6 @@ int __init early_irq_init(void)
164 legacy_count = ARRAY_SIZE(irq_desc_legacy); 179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
165 node = first_online_node; 180 node = first_online_node;
166 181
167 /* allocate irq_desc_ptrs array based on nr_irqs */
168 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
169
170 /* allocate based on nr_cpu_ids */ 182 /* allocate based on nr_cpu_ids */
171 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * 183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
172 sizeof(int), GFP_NOWAIT, node); 184 sizeof(int), GFP_NOWAIT, node);
@@ -180,23 +192,12 @@ int __init early_irq_init(void)
180 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
181 alloc_desc_masks(&desc[i], node, true); 193 alloc_desc_masks(&desc[i], node, true);
182 init_desc_masks(&desc[i]); 194 init_desc_masks(&desc[i]);
183 irq_desc_ptrs[i] = desc + i; 195 set_irq_desc(i, &desc[i]);
184 } 196 }
185 197
186 for (i = legacy_count; i < nr_irqs; i++)
187 irq_desc_ptrs[i] = NULL;
188
189 return arch_early_irq_init(); 198 return arch_early_irq_init();
190} 199}
191 200
192struct irq_desc *irq_to_desc(unsigned int irq)
193{
194 if (irq_desc_ptrs && irq < nr_irqs)
195 return irq_desc_ptrs[irq];
196
197 return NULL;
198}
199
200struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) 201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
201{ 202{
202 struct irq_desc *desc; 203 struct irq_desc *desc;
@@ -208,21 +209,18 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
208 return NULL; 209 return NULL;
209 } 210 }
210 211
211 desc = irq_desc_ptrs[irq]; 212 desc = irq_to_desc(irq);
212 if (desc) 213 if (desc)
213 return desc; 214 return desc;
214 215
215 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
216 217
217 /* We have to check it to avoid races with another CPU */ 218 /* We have to check it to avoid races with another CPU */
218 desc = irq_desc_ptrs[irq]; 219 desc = irq_to_desc(irq);
219 if (desc) 220 if (desc)
220 goto out_unlock; 221 goto out_unlock;
221 222
222 if (slab_is_available()) 223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224 else
225 desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
226 224
227 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); 225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
228 if (!desc) { 226 if (!desc) {
@@ -231,7 +229,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
231 } 229 }
232 init_one_irq_desc(irq, desc, node); 230 init_one_irq_desc(irq, desc, node);
233 231
234 irq_desc_ptrs[irq] = desc; 232 set_irq_desc(irq, desc);
235 233
236out_unlock: 234out_unlock:
237 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b2821f070a3d..c63f3bc88f0b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -21,11 +21,7 @@ extern void clear_kstat_irqs(struct irq_desc *desc);
21extern raw_spinlock_t sparse_irq_lock; 21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */ 24void replace_irq_desc(unsigned int irq, struct irq_desc *desc);
25extern struct irq_desc **irq_desc_ptrs;
26#else
27/* irq_desc_ptrs is a fixed size array */
28extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
29#endif 25#endif
30 26
31#ifdef CONFIG_PROC_FS 27#ifdef CONFIG_PROC_FS
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 26bac9d8f860..963559dbd858 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -70,7 +70,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
70 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 70 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
71 71
72 /* We have to check it to avoid races with another CPU */ 72 /* We have to check it to avoid races with another CPU */
73 desc = irq_desc_ptrs[irq]; 73 desc = irq_to_desc(irq);
74 74
75 if (desc && old_desc != desc) 75 if (desc && old_desc != desc)
76 goto out_unlock; 76 goto out_unlock;
@@ -90,7 +90,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
90 goto out_unlock; 90 goto out_unlock;
91 } 91 }
92 92
93 irq_desc_ptrs[irq] = desc; 93 replace_irq_desc(irq, desc);
94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
95 95
96 /* free the old one */ 96 /* free the old one */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ef077fb73155..87ebe8adc474 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -41,7 +41,7 @@
41#include <asm/sections.h> 41#include <asm/sections.h>
42 42
43/* Per cpu memory for storing cpu states in case of system crash. */ 43/* Per cpu memory for storing cpu states in case of system crash. */
44note_buf_t* crash_notes; 44note_buf_t __percpu *crash_notes;
45 45
46/* vmcoreinfo stuff */ 46/* vmcoreinfo stuff */
47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b7df302a0204..fa034d29cf73 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -42,8 +42,11 @@
42#include <linux/freezer.h> 42#include <linux/freezer.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/sysctl.h>
45#include <linux/kdebug.h> 46#include <linux/kdebug.h>
46#include <linux/memory.h> 47#include <linux/memory.h>
48#include <linux/ftrace.h>
49#include <linux/cpu.h>
47 50
48#include <asm-generic/sections.h> 51#include <asm-generic/sections.h>
49#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
@@ -93,6 +96,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
93 {"native_get_debugreg",}, 96 {"native_get_debugreg",},
94 {"irq_entries_start",}, 97 {"irq_entries_start",},
95 {"common_interrupt",}, 98 {"common_interrupt",},
99 {"mcount",}, /* mcount can be called from everywhere */
96 {NULL} /* Terminator */ 100 {NULL} /* Terminator */
97}; 101};
98 102
@@ -103,81 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
103 * stepping on the instruction on a vmalloced/kmalloced/data page 107 * stepping on the instruction on a vmalloced/kmalloced/data page
104 * is a recipe for disaster 108 * is a recipe for disaster
105 */ 109 */
106#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
107
108struct kprobe_insn_page { 110struct kprobe_insn_page {
109 struct list_head list; 111 struct list_head list;
110 kprobe_opcode_t *insns; /* Page of instruction slots */ 112 kprobe_opcode_t *insns; /* Page of instruction slots */
111 char slot_used[INSNS_PER_PAGE];
112 int nused; 113 int nused;
113 int ngarbage; 114 int ngarbage;
115 char slot_used[];
114}; 116};
115 117
118#define KPROBE_INSN_PAGE_SIZE(slots) \
119 (offsetof(struct kprobe_insn_page, slot_used) + \
120 (sizeof(char) * (slots)))
121
122struct kprobe_insn_cache {
123 struct list_head pages; /* list of kprobe_insn_page */
124 size_t insn_size; /* size of instruction slot */
125 int nr_garbage;
126};
127
128static int slots_per_page(struct kprobe_insn_cache *c)
129{
130 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
131}
132
116enum kprobe_slot_state { 133enum kprobe_slot_state {
117 SLOT_CLEAN = 0, 134 SLOT_CLEAN = 0,
118 SLOT_DIRTY = 1, 135 SLOT_DIRTY = 1,
119 SLOT_USED = 2, 136 SLOT_USED = 2,
120}; 137};
121 138
122static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 139static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */
123static LIST_HEAD(kprobe_insn_pages); 140static struct kprobe_insn_cache kprobe_insn_slots = {
124static int kprobe_garbage_slots; 141 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
125static int collect_garbage_slots(void); 142 .insn_size = MAX_INSN_SIZE,
126 143 .nr_garbage = 0,
127static int __kprobes check_safety(void) 144};
128{ 145static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
129 int ret = 0;
130#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER)
131 ret = freeze_processes();
132 if (ret == 0) {
133 struct task_struct *p, *q;
134 do_each_thread(p, q) {
135 if (p != current && p->state == TASK_RUNNING &&
136 p->pid != 0) {
137 printk("Check failed: %s is running\n",p->comm);
138 ret = -1;
139 goto loop_end;
140 }
141 } while_each_thread(p, q);
142 }
143loop_end:
144 thaw_processes();
145#else
146 synchronize_sched();
147#endif
148 return ret;
149}
150 146
151/** 147/**
152 * __get_insn_slot() - Find a slot on an executable page for an instruction. 148 * __get_insn_slot() - Find a slot on an executable page for an instruction.
153 * We allocate an executable page if there's no room on existing ones. 149 * We allocate an executable page if there's no room on existing ones.
154 */ 150 */
155static kprobe_opcode_t __kprobes *__get_insn_slot(void) 151static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
156{ 152{
157 struct kprobe_insn_page *kip; 153 struct kprobe_insn_page *kip;
158 154
159 retry: 155 retry:
160 list_for_each_entry(kip, &kprobe_insn_pages, list) { 156 list_for_each_entry(kip, &c->pages, list) {
161 if (kip->nused < INSNS_PER_PAGE) { 157 if (kip->nused < slots_per_page(c)) {
162 int i; 158 int i;
163 for (i = 0; i < INSNS_PER_PAGE; i++) { 159 for (i = 0; i < slots_per_page(c); i++) {
164 if (kip->slot_used[i] == SLOT_CLEAN) { 160 if (kip->slot_used[i] == SLOT_CLEAN) {
165 kip->slot_used[i] = SLOT_USED; 161 kip->slot_used[i] = SLOT_USED;
166 kip->nused++; 162 kip->nused++;
167 return kip->insns + (i * MAX_INSN_SIZE); 163 return kip->insns + (i * c->insn_size);
168 } 164 }
169 } 165 }
170 /* Surprise! No unused slots. Fix kip->nused. */ 166 /* kip->nused is broken. Fix it. */
171 kip->nused = INSNS_PER_PAGE; 167 kip->nused = slots_per_page(c);
168 WARN_ON(1);
172 } 169 }
173 } 170 }
174 171
175 /* If there are any garbage slots, collect it and try again. */ 172 /* If there are any garbage slots, collect it and try again. */
176 if (kprobe_garbage_slots && collect_garbage_slots() == 0) { 173 if (c->nr_garbage && collect_garbage_slots(c) == 0)
177 goto retry; 174 goto retry;
178 } 175
179 /* All out of space. Need to allocate a new page. Use slot 0. */ 176 /* All out of space. Need to allocate a new page. */
180 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 177 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
181 if (!kip) 178 if (!kip)
182 return NULL; 179 return NULL;
183 180
@@ -192,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
192 return NULL; 189 return NULL;
193 } 190 }
194 INIT_LIST_HEAD(&kip->list); 191 INIT_LIST_HEAD(&kip->list);
195 list_add(&kip->list, &kprobe_insn_pages); 192 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
196 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
197 kip->slot_used[0] = SLOT_USED; 193 kip->slot_used[0] = SLOT_USED;
198 kip->nused = 1; 194 kip->nused = 1;
199 kip->ngarbage = 0; 195 kip->ngarbage = 0;
196 list_add(&kip->list, &c->pages);
200 return kip->insns; 197 return kip->insns;
201} 198}
202 199
200
203kprobe_opcode_t __kprobes *get_insn_slot(void) 201kprobe_opcode_t __kprobes *get_insn_slot(void)
204{ 202{
205 kprobe_opcode_t *ret; 203 kprobe_opcode_t *ret = NULL;
204
206 mutex_lock(&kprobe_insn_mutex); 205 mutex_lock(&kprobe_insn_mutex);
207 ret = __get_insn_slot(); 206 ret = __get_insn_slot(&kprobe_insn_slots);
208 mutex_unlock(&kprobe_insn_mutex); 207 mutex_unlock(&kprobe_insn_mutex);
208
209 return ret; 209 return ret;
210} 210}
211 211
@@ -221,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
221 * so as not to have to set it up again the 221 * so as not to have to set it up again the
222 * next time somebody inserts a probe. 222 * next time somebody inserts a probe.
223 */ 223 */
224 if (!list_is_singular(&kprobe_insn_pages)) { 224 if (!list_is_singular(&kip->list)) {
225 list_del(&kip->list); 225 list_del(&kip->list);
226 module_free(NULL, kip->insns); 226 module_free(NULL, kip->insns);
227 kfree(kip); 227 kfree(kip);
@@ -231,52 +231,84 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
231 return 0; 231 return 0;
232} 232}
233 233
234static int __kprobes collect_garbage_slots(void) 234static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
235{ 235{
236 struct kprobe_insn_page *kip, *next; 236 struct kprobe_insn_page *kip, *next;
237 237
238 /* Ensure no-one is preepmted on the garbages */ 238 /* Ensure no-one is interrupted on the garbages */
239 if (check_safety()) 239 synchronize_sched();
240 return -EAGAIN;
241 240
242 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { 241 list_for_each_entry_safe(kip, next, &c->pages, list) {
243 int i; 242 int i;
244 if (kip->ngarbage == 0) 243 if (kip->ngarbage == 0)
245 continue; 244 continue;
246 kip->ngarbage = 0; /* we will collect all garbages */ 245 kip->ngarbage = 0; /* we will collect all garbages */
247 for (i = 0; i < INSNS_PER_PAGE; i++) { 246 for (i = 0; i < slots_per_page(c); i++) {
248 if (kip->slot_used[i] == SLOT_DIRTY && 247 if (kip->slot_used[i] == SLOT_DIRTY &&
249 collect_one_slot(kip, i)) 248 collect_one_slot(kip, i))
250 break; 249 break;
251 } 250 }
252 } 251 }
253 kprobe_garbage_slots = 0; 252 c->nr_garbage = 0;
254 return 0; 253 return 0;
255} 254}
256 255
257void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 256static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
257 kprobe_opcode_t *slot, int dirty)
258{ 258{
259 struct kprobe_insn_page *kip; 259 struct kprobe_insn_page *kip;
260 260
261 mutex_lock(&kprobe_insn_mutex); 261 list_for_each_entry(kip, &c->pages, list) {
262 list_for_each_entry(kip, &kprobe_insn_pages, list) { 262 long idx = ((long)slot - (long)kip->insns) / c->insn_size;
263 if (kip->insns <= slot && 263 if (idx >= 0 && idx < slots_per_page(c)) {
264 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 264 WARN_ON(kip->slot_used[idx] != SLOT_USED);
265 int i = (slot - kip->insns) / MAX_INSN_SIZE;
266 if (dirty) { 265 if (dirty) {
267 kip->slot_used[i] = SLOT_DIRTY; 266 kip->slot_used[idx] = SLOT_DIRTY;
268 kip->ngarbage++; 267 kip->ngarbage++;
268 if (++c->nr_garbage > slots_per_page(c))
269 collect_garbage_slots(c);
269 } else 270 } else
270 collect_one_slot(kip, i); 271 collect_one_slot(kip, idx);
271 break; 272 return;
272 } 273 }
273 } 274 }
275 /* Could not free this slot. */
276 WARN_ON(1);
277}
274 278
275 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) 279void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
276 collect_garbage_slots(); 280{
277 281 mutex_lock(&kprobe_insn_mutex);
282 __free_insn_slot(&kprobe_insn_slots, slot, dirty);
278 mutex_unlock(&kprobe_insn_mutex); 283 mutex_unlock(&kprobe_insn_mutex);
279} 284}
285#ifdef CONFIG_OPTPROBES
286/* For optimized_kprobe buffer */
287static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */
288static struct kprobe_insn_cache kprobe_optinsn_slots = {
289 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
290 /* .insn_size is initialized later */
291 .nr_garbage = 0,
292};
293/* Get a slot for optimized_kprobe buffer */
294kprobe_opcode_t __kprobes *get_optinsn_slot(void)
295{
296 kprobe_opcode_t *ret = NULL;
297
298 mutex_lock(&kprobe_optinsn_mutex);
299 ret = __get_insn_slot(&kprobe_optinsn_slots);
300 mutex_unlock(&kprobe_optinsn_mutex);
301
302 return ret;
303}
304
305void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
306{
307 mutex_lock(&kprobe_optinsn_mutex);
308 __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
309 mutex_unlock(&kprobe_optinsn_mutex);
310}
311#endif
280#endif 312#endif
281 313
282/* We have preemption disabled.. so it is safe to use __ versions */ 314/* We have preemption disabled.. so it is safe to use __ versions */
@@ -307,23 +339,401 @@ struct kprobe __kprobes *get_kprobe(void *addr)
307 if (p->addr == addr) 339 if (p->addr == addr)
308 return p; 340 return p;
309 } 341 }
342
310 return NULL; 343 return NULL;
311} 344}
312 345
346static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
347
348/* Return true if the kprobe is an aggregator */
349static inline int kprobe_aggrprobe(struct kprobe *p)
350{
351 return p->pre_handler == aggr_pre_handler;
352}
353
354/*
355 * Keep all fields in the kprobe consistent
356 */
357static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
358{
359 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
360 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
361}
362
363#ifdef CONFIG_OPTPROBES
364/* NOTE: change this value only with kprobe_mutex held */
365static bool kprobes_allow_optimization;
366
367/*
368 * Call all pre_handler on the list, but ignores its return value.
369 * This must be called from arch-dep optimized caller.
370 */
371void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
372{
373 struct kprobe *kp;
374
375 list_for_each_entry_rcu(kp, &p->list, list) {
376 if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
377 set_kprobe_instance(kp);
378 kp->pre_handler(kp, regs);
379 }
380 reset_kprobe_instance();
381 }
382}
383
384/* Return true(!0) if the kprobe is ready for optimization. */
385static inline int kprobe_optready(struct kprobe *p)
386{
387 struct optimized_kprobe *op;
388
389 if (kprobe_aggrprobe(p)) {
390 op = container_of(p, struct optimized_kprobe, kp);
391 return arch_prepared_optinsn(&op->optinsn);
392 }
393
394 return 0;
395}
396
397/*
398 * Return an optimized kprobe whose optimizing code replaces
399 * instructions including addr (exclude breakpoint).
400 */
401struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
402{
403 int i;
404 struct kprobe *p = NULL;
405 struct optimized_kprobe *op;
406
407 /* Don't check i == 0, since that is a breakpoint case. */
408 for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
409 p = get_kprobe((void *)(addr - i));
410
411 if (p && kprobe_optready(p)) {
412 op = container_of(p, struct optimized_kprobe, kp);
413 if (arch_within_optimized_kprobe(op, addr))
414 return p;
415 }
416
417 return NULL;
418}
419
420/* Optimization staging list, protected by kprobe_mutex */
421static LIST_HEAD(optimizing_list);
422
423static void kprobe_optimizer(struct work_struct *work);
424static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
425#define OPTIMIZE_DELAY 5
426
427/* Kprobe jump optimizer */
428static __kprobes void kprobe_optimizer(struct work_struct *work)
429{
430 struct optimized_kprobe *op, *tmp;
431
432 /* Lock modules while optimizing kprobes */
433 mutex_lock(&module_mutex);
434 mutex_lock(&kprobe_mutex);
435 if (kprobes_all_disarmed || !kprobes_allow_optimization)
436 goto end;
437
438 /*
439 * Wait for quiesence period to ensure all running interrupts
440 * are done. Because optprobe may modify multiple instructions
441 * there is a chance that Nth instruction is interrupted. In that
442 * case, running interrupt can return to 2nd-Nth byte of jump
443 * instruction. This wait is for avoiding it.
444 */
445 synchronize_sched();
446
447 /*
448 * The optimization/unoptimization refers online_cpus via
449 * stop_machine() and cpu-hotplug modifies online_cpus.
450 * And same time, text_mutex will be held in cpu-hotplug and here.
451 * This combination can cause a deadlock (cpu-hotplug try to lock
452 * text_mutex but stop_machine can not be done because online_cpus
453 * has been changed)
454 * To avoid this deadlock, we need to call get_online_cpus()
455 * for preventing cpu-hotplug outside of text_mutex locking.
456 */
457 get_online_cpus();
458 mutex_lock(&text_mutex);
459 list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
460 WARN_ON(kprobe_disabled(&op->kp));
461 if (arch_optimize_kprobe(op) < 0)
462 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
463 list_del_init(&op->list);
464 }
465 mutex_unlock(&text_mutex);
466 put_online_cpus();
467end:
468 mutex_unlock(&kprobe_mutex);
469 mutex_unlock(&module_mutex);
470}
471
472/* Optimize kprobe if p is ready to be optimized */
473static __kprobes void optimize_kprobe(struct kprobe *p)
474{
475 struct optimized_kprobe *op;
476
477 /* Check if the kprobe is disabled or not ready for optimization. */
478 if (!kprobe_optready(p) || !kprobes_allow_optimization ||
479 (kprobe_disabled(p) || kprobes_all_disarmed))
480 return;
481
482 /* Both of break_handler and post_handler are not supported. */
483 if (p->break_handler || p->post_handler)
484 return;
485
486 op = container_of(p, struct optimized_kprobe, kp);
487
488 /* Check there is no other kprobes at the optimized instructions */
489 if (arch_check_optimized_kprobe(op) < 0)
490 return;
491
492 /* Check if it is already optimized. */
493 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
494 return;
495
496 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
497 list_add(&op->list, &optimizing_list);
498 if (!delayed_work_pending(&optimizing_work))
499 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
500}
501
502/* Unoptimize a kprobe if p is optimized */
503static __kprobes void unoptimize_kprobe(struct kprobe *p)
504{
505 struct optimized_kprobe *op;
506
507 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
508 op = container_of(p, struct optimized_kprobe, kp);
509 if (!list_empty(&op->list))
510 /* Dequeue from the optimization queue */
511 list_del_init(&op->list);
512 else
513 /* Replace jump with break */
514 arch_unoptimize_kprobe(op);
515 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
516 }
517}
518
519/* Remove optimized instructions */
520static void __kprobes kill_optimized_kprobe(struct kprobe *p)
521{
522 struct optimized_kprobe *op;
523
524 op = container_of(p, struct optimized_kprobe, kp);
525 if (!list_empty(&op->list)) {
526 /* Dequeue from the optimization queue */
527 list_del_init(&op->list);
528 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
529 }
530 /* Don't unoptimize, because the target code will be freed. */
531 arch_remove_optimized_kprobe(op);
532}
533
534/* Try to prepare optimized instructions */
535static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
536{
537 struct optimized_kprobe *op;
538
539 op = container_of(p, struct optimized_kprobe, kp);
540 arch_prepare_optimized_kprobe(op);
541}
542
543/* Free optimized instructions and optimized_kprobe */
544static __kprobes void free_aggr_kprobe(struct kprobe *p)
545{
546 struct optimized_kprobe *op;
547
548 op = container_of(p, struct optimized_kprobe, kp);
549 arch_remove_optimized_kprobe(op);
550 kfree(op);
551}
552
553/* Allocate new optimized_kprobe and try to prepare optimized instructions */
554static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
555{
556 struct optimized_kprobe *op;
557
558 op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
559 if (!op)
560 return NULL;
561
562 INIT_LIST_HEAD(&op->list);
563 op->kp.addr = p->addr;
564 arch_prepare_optimized_kprobe(op);
565
566 return &op->kp;
567}
568
569static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
570
571/*
572 * Prepare an optimized_kprobe and optimize it
573 * NOTE: p must be a normal registered kprobe
574 */
575static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
576{
577 struct kprobe *ap;
578 struct optimized_kprobe *op;
579
580 ap = alloc_aggr_kprobe(p);
581 if (!ap)
582 return;
583
584 op = container_of(ap, struct optimized_kprobe, kp);
585 if (!arch_prepared_optinsn(&op->optinsn)) {
586 /* If failed to setup optimizing, fallback to kprobe */
587 free_aggr_kprobe(ap);
588 return;
589 }
590
591 init_aggr_kprobe(ap, p);
592 optimize_kprobe(ap);
593}
594
595#ifdef CONFIG_SYSCTL
596static void __kprobes optimize_all_kprobes(void)
597{
598 struct hlist_head *head;
599 struct hlist_node *node;
600 struct kprobe *p;
601 unsigned int i;
602
603 /* If optimization is already allowed, just return */
604 if (kprobes_allow_optimization)
605 return;
606
607 kprobes_allow_optimization = true;
608 mutex_lock(&text_mutex);
609 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
610 head = &kprobe_table[i];
611 hlist_for_each_entry_rcu(p, node, head, hlist)
612 if (!kprobe_disabled(p))
613 optimize_kprobe(p);
614 }
615 mutex_unlock(&text_mutex);
616 printk(KERN_INFO "Kprobes globally optimized\n");
617}
618
619static void __kprobes unoptimize_all_kprobes(void)
620{
621 struct hlist_head *head;
622 struct hlist_node *node;
623 struct kprobe *p;
624 unsigned int i;
625
626 /* If optimization is already prohibited, just return */
627 if (!kprobes_allow_optimization)
628 return;
629
630 kprobes_allow_optimization = false;
631 printk(KERN_INFO "Kprobes globally unoptimized\n");
632 get_online_cpus(); /* For avoiding text_mutex deadlock */
633 mutex_lock(&text_mutex);
634 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
635 head = &kprobe_table[i];
636 hlist_for_each_entry_rcu(p, node, head, hlist) {
637 if (!kprobe_disabled(p))
638 unoptimize_kprobe(p);
639 }
640 }
641
642 mutex_unlock(&text_mutex);
643 put_online_cpus();
644 /* Allow all currently running kprobes to complete */
645 synchronize_sched();
646}
647
648int sysctl_kprobes_optimization;
649int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
650 void __user *buffer, size_t *length,
651 loff_t *ppos)
652{
653 int ret;
654
655 mutex_lock(&kprobe_mutex);
656 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
657 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
658
659 if (sysctl_kprobes_optimization)
660 optimize_all_kprobes();
661 else
662 unoptimize_all_kprobes();
663 mutex_unlock(&kprobe_mutex);
664
665 return ret;
666}
667#endif /* CONFIG_SYSCTL */
668
669static void __kprobes __arm_kprobe(struct kprobe *p)
670{
671 struct kprobe *old_p;
672
673 /* Check collision with other optimized kprobes */
674 old_p = get_optimized_kprobe((unsigned long)p->addr);
675 if (unlikely(old_p))
676 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
677
678 arch_arm_kprobe(p);
679 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
680}
681
682static void __kprobes __disarm_kprobe(struct kprobe *p)
683{
684 struct kprobe *old_p;
685
686 unoptimize_kprobe(p); /* Try to unoptimize */
687 arch_disarm_kprobe(p);
688
689 /* If another kprobe was blocked, optimize it. */
690 old_p = get_optimized_kprobe((unsigned long)p->addr);
691 if (unlikely(old_p))
692 optimize_kprobe(old_p);
693}
694
695#else /* !CONFIG_OPTPROBES */
696
697#define optimize_kprobe(p) do {} while (0)
698#define unoptimize_kprobe(p) do {} while (0)
699#define kill_optimized_kprobe(p) do {} while (0)
700#define prepare_optimized_kprobe(p) do {} while (0)
701#define try_to_optimize_kprobe(p) do {} while (0)
702#define __arm_kprobe(p) arch_arm_kprobe(p)
703#define __disarm_kprobe(p) arch_disarm_kprobe(p)
704
705static __kprobes void free_aggr_kprobe(struct kprobe *p)
706{
707 kfree(p);
708}
709
710static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
711{
712 return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
713}
714#endif /* CONFIG_OPTPROBES */
715
313/* Arm a kprobe with text_mutex */ 716/* Arm a kprobe with text_mutex */
314static void __kprobes arm_kprobe(struct kprobe *kp) 717static void __kprobes arm_kprobe(struct kprobe *kp)
315{ 718{
719 /*
720 * Here, since __arm_kprobe() doesn't use stop_machine(),
721 * this doesn't cause deadlock on text_mutex. So, we don't
722 * need get_online_cpus().
723 */
316 mutex_lock(&text_mutex); 724 mutex_lock(&text_mutex);
317 arch_arm_kprobe(kp); 725 __arm_kprobe(kp);
318 mutex_unlock(&text_mutex); 726 mutex_unlock(&text_mutex);
319} 727}
320 728
321/* Disarm a kprobe with text_mutex */ 729/* Disarm a kprobe with text_mutex */
322static void __kprobes disarm_kprobe(struct kprobe *kp) 730static void __kprobes disarm_kprobe(struct kprobe *kp)
323{ 731{
732 get_online_cpus(); /* For avoiding text_mutex deadlock */
324 mutex_lock(&text_mutex); 733 mutex_lock(&text_mutex);
325 arch_disarm_kprobe(kp); 734 __disarm_kprobe(kp);
326 mutex_unlock(&text_mutex); 735 mutex_unlock(&text_mutex);
736 put_online_cpus();
327} 737}
328 738
329/* 739/*
@@ -392,7 +802,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
392void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) 802void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
393{ 803{
394 struct kprobe *kp; 804 struct kprobe *kp;
395 if (p->pre_handler != aggr_pre_handler) { 805 if (!kprobe_aggrprobe(p)) {
396 p->nmissed++; 806 p->nmissed++;
397 } else { 807 } else {
398 list_for_each_entry_rcu(kp, &p->list, list) 808 list_for_each_entry_rcu(kp, &p->list, list)
@@ -516,21 +926,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
516} 926}
517 927
518/* 928/*
519 * Keep all fields in the kprobe consistent
520 */
521static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
522{
523 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
524 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
525}
526
527/*
528* Add the new probe to ap->list. Fail if this is the 929* Add the new probe to ap->list. Fail if this is the
529* second jprobe at the address - two jprobes can't coexist 930* second jprobe at the address - two jprobes can't coexist
530*/ 931*/
531static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) 932static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
532{ 933{
533 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 934 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
935
936 if (p->break_handler || p->post_handler)
937 unoptimize_kprobe(ap); /* Fall back to normal kprobe */
938
534 if (p->break_handler) { 939 if (p->break_handler) {
535 if (ap->break_handler) 940 if (ap->break_handler)
536 return -EEXIST; 941 return -EEXIST;
@@ -545,7 +950,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
545 ap->flags &= ~KPROBE_FLAG_DISABLED; 950 ap->flags &= ~KPROBE_FLAG_DISABLED;
546 if (!kprobes_all_disarmed) 951 if (!kprobes_all_disarmed)
547 /* Arm the breakpoint again. */ 952 /* Arm the breakpoint again. */
548 arm_kprobe(ap); 953 __arm_kprobe(ap);
549 } 954 }
550 return 0; 955 return 0;
551} 956}
@@ -554,12 +959,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
554 * Fill in the required fields of the "manager kprobe". Replace the 959 * Fill in the required fields of the "manager kprobe". Replace the
555 * earlier kprobe in the hlist with the manager kprobe 960 * earlier kprobe in the hlist with the manager kprobe
556 */ 961 */
557static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 962static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
558{ 963{
964 /* Copy p's insn slot to ap */
559 copy_kprobe(p, ap); 965 copy_kprobe(p, ap);
560 flush_insn_slot(ap); 966 flush_insn_slot(ap);
561 ap->addr = p->addr; 967 ap->addr = p->addr;
562 ap->flags = p->flags; 968 ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
563 ap->pre_handler = aggr_pre_handler; 969 ap->pre_handler = aggr_pre_handler;
564 ap->fault_handler = aggr_fault_handler; 970 ap->fault_handler = aggr_fault_handler;
565 /* We don't care the kprobe which has gone. */ 971 /* We don't care the kprobe which has gone. */
@@ -569,8 +975,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
569 ap->break_handler = aggr_break_handler; 975 ap->break_handler = aggr_break_handler;
570 976
571 INIT_LIST_HEAD(&ap->list); 977 INIT_LIST_HEAD(&ap->list);
572 list_add_rcu(&p->list, &ap->list); 978 INIT_HLIST_NODE(&ap->hlist);
573 979
980 list_add_rcu(&p->list, &ap->list);
574 hlist_replace_rcu(&p->hlist, &ap->hlist); 981 hlist_replace_rcu(&p->hlist, &ap->hlist);
575} 982}
576 983
@@ -584,12 +991,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
584 int ret = 0; 991 int ret = 0;
585 struct kprobe *ap = old_p; 992 struct kprobe *ap = old_p;
586 993
587 if (old_p->pre_handler != aggr_pre_handler) { 994 if (!kprobe_aggrprobe(old_p)) {
588 /* If old_p is not an aggr_probe, create new aggr_kprobe. */ 995 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
589 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 996 ap = alloc_aggr_kprobe(old_p);
590 if (!ap) 997 if (!ap)
591 return -ENOMEM; 998 return -ENOMEM;
592 add_aggr_kprobe(ap, old_p); 999 init_aggr_kprobe(ap, old_p);
593 } 1000 }
594 1001
595 if (kprobe_gone(ap)) { 1002 if (kprobe_gone(ap)) {
@@ -608,6 +1015,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
608 */ 1015 */
609 return ret; 1016 return ret;
610 1017
1018 /* Prepare optimized instructions if possible. */
1019 prepare_optimized_kprobe(ap);
1020
611 /* 1021 /*
612 * Clear gone flag to prevent allocating new slot again, and 1022 * Clear gone flag to prevent allocating new slot again, and
613 * set disabled flag because it is not armed yet. 1023 * set disabled flag because it is not armed yet.
@@ -616,6 +1026,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
616 | KPROBE_FLAG_DISABLED; 1026 | KPROBE_FLAG_DISABLED;
617 } 1027 }
618 1028
1029 /* Copy ap's insn slot to p */
619 copy_kprobe(ap, p); 1030 copy_kprobe(ap, p);
620 return add_new_kprobe(ap, p); 1031 return add_new_kprobe(ap, p);
621} 1032}
@@ -728,7 +1139,8 @@ int __kprobes register_kprobe(struct kprobe *p)
728 1139
729 preempt_disable(); 1140 preempt_disable();
730 if (!kernel_text_address((unsigned long) p->addr) || 1141 if (!kernel_text_address((unsigned long) p->addr) ||
731 in_kprobes_functions((unsigned long) p->addr)) { 1142 in_kprobes_functions((unsigned long) p->addr) ||
1143 ftrace_text_reserved(p->addr, p->addr)) {
732 preempt_enable(); 1144 preempt_enable();
733 return -EINVAL; 1145 return -EINVAL;
734 } 1146 }
@@ -765,27 +1177,34 @@ int __kprobes register_kprobe(struct kprobe *p)
765 p->nmissed = 0; 1177 p->nmissed = 0;
766 INIT_LIST_HEAD(&p->list); 1178 INIT_LIST_HEAD(&p->list);
767 mutex_lock(&kprobe_mutex); 1179 mutex_lock(&kprobe_mutex);
1180
1181 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1182 mutex_lock(&text_mutex);
1183
768 old_p = get_kprobe(p->addr); 1184 old_p = get_kprobe(p->addr);
769 if (old_p) { 1185 if (old_p) {
1186 /* Since this may unoptimize old_p, locking text_mutex. */
770 ret = register_aggr_kprobe(old_p, p); 1187 ret = register_aggr_kprobe(old_p, p);
771 goto out; 1188 goto out;
772 } 1189 }
773 1190
774 mutex_lock(&text_mutex);
775 ret = arch_prepare_kprobe(p); 1191 ret = arch_prepare_kprobe(p);
776 if (ret) 1192 if (ret)
777 goto out_unlock_text; 1193 goto out;
778 1194
779 INIT_HLIST_NODE(&p->hlist); 1195 INIT_HLIST_NODE(&p->hlist);
780 hlist_add_head_rcu(&p->hlist, 1196 hlist_add_head_rcu(&p->hlist,
781 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 1197 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
782 1198
783 if (!kprobes_all_disarmed && !kprobe_disabled(p)) 1199 if (!kprobes_all_disarmed && !kprobe_disabled(p))
784 arch_arm_kprobe(p); 1200 __arm_kprobe(p);
1201
1202 /* Try to optimize kprobe */
1203 try_to_optimize_kprobe(p);
785 1204
786out_unlock_text:
787 mutex_unlock(&text_mutex);
788out: 1205out:
1206 mutex_unlock(&text_mutex);
1207 put_online_cpus();
789 mutex_unlock(&kprobe_mutex); 1208 mutex_unlock(&kprobe_mutex);
790 1209
791 if (probed_mod) 1210 if (probed_mod)
@@ -807,7 +1226,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
807 return -EINVAL; 1226 return -EINVAL;
808 1227
809 if (old_p == p || 1228 if (old_p == p ||
810 (old_p->pre_handler == aggr_pre_handler && 1229 (kprobe_aggrprobe(old_p) &&
811 list_is_singular(&old_p->list))) { 1230 list_is_singular(&old_p->list))) {
812 /* 1231 /*
813 * Only probe on the hash list. Disarm only if kprobes are 1232 * Only probe on the hash list. Disarm only if kprobes are
@@ -815,7 +1234,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
815 * already have been removed. We save on flushing icache. 1234 * already have been removed. We save on flushing icache.
816 */ 1235 */
817 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1236 if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
818 disarm_kprobe(p); 1237 disarm_kprobe(old_p);
819 hlist_del_rcu(&old_p->hlist); 1238 hlist_del_rcu(&old_p->hlist);
820 } else { 1239 } else {
821 if (p->break_handler && !kprobe_gone(p)) 1240 if (p->break_handler && !kprobe_gone(p))
@@ -831,8 +1250,13 @@ noclean:
831 list_del_rcu(&p->list); 1250 list_del_rcu(&p->list);
832 if (!kprobe_disabled(old_p)) { 1251 if (!kprobe_disabled(old_p)) {
833 try_to_disable_aggr_kprobe(old_p); 1252 try_to_disable_aggr_kprobe(old_p);
834 if (!kprobes_all_disarmed && kprobe_disabled(old_p)) 1253 if (!kprobes_all_disarmed) {
835 disarm_kprobe(old_p); 1254 if (kprobe_disabled(old_p))
1255 disarm_kprobe(old_p);
1256 else
1257 /* Try to optimize this probe again */
1258 optimize_kprobe(old_p);
1259 }
836 } 1260 }
837 } 1261 }
838 return 0; 1262 return 0;
@@ -849,7 +1273,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
849 old_p = list_entry(p->list.next, struct kprobe, list); 1273 old_p = list_entry(p->list.next, struct kprobe, list);
850 list_del(&p->list); 1274 list_del(&p->list);
851 arch_remove_kprobe(old_p); 1275 arch_remove_kprobe(old_p);
852 kfree(old_p); 1276 free_aggr_kprobe(old_p);
853 } 1277 }
854} 1278}
855 1279
@@ -1145,7 +1569,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1145 struct kprobe *kp; 1569 struct kprobe *kp;
1146 1570
1147 p->flags |= KPROBE_FLAG_GONE; 1571 p->flags |= KPROBE_FLAG_GONE;
1148 if (p->pre_handler == aggr_pre_handler) { 1572 if (kprobe_aggrprobe(p)) {
1149 /* 1573 /*
1150 * If this is an aggr_kprobe, we have to list all the 1574 * If this is an aggr_kprobe, we have to list all the
1151 * chained probes and mark them GONE. 1575 * chained probes and mark them GONE.
@@ -1154,6 +1578,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1154 kp->flags |= KPROBE_FLAG_GONE; 1578 kp->flags |= KPROBE_FLAG_GONE;
1155 p->post_handler = NULL; 1579 p->post_handler = NULL;
1156 p->break_handler = NULL; 1580 p->break_handler = NULL;
1581 kill_optimized_kprobe(p);
1157 } 1582 }
1158 /* 1583 /*
1159 * Here, we can remove insn_slot safely, because no thread calls 1584 * Here, we can remove insn_slot safely, because no thread calls
@@ -1263,6 +1688,15 @@ static int __init init_kprobes(void)
1263 } 1688 }
1264 } 1689 }
1265 1690
1691#if defined(CONFIG_OPTPROBES)
1692#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
1693 /* Init kprobe_optinsn_slots */
1694 kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
1695#endif
1696 /* By default, kprobes can be optimized */
1697 kprobes_allow_optimization = true;
1698#endif
1699
1266 /* By default, kprobes are armed */ 1700 /* By default, kprobes are armed */
1267 kprobes_all_disarmed = false; 1701 kprobes_all_disarmed = false;
1268 1702
@@ -1281,7 +1715,7 @@ static int __init init_kprobes(void)
1281 1715
1282#ifdef CONFIG_DEBUG_FS 1716#ifdef CONFIG_DEBUG_FS
1283static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, 1717static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1284 const char *sym, int offset,char *modname) 1718 const char *sym, int offset, char *modname, struct kprobe *pp)
1285{ 1719{
1286 char *kprobe_type; 1720 char *kprobe_type;
1287 1721
@@ -1291,19 +1725,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1291 kprobe_type = "j"; 1725 kprobe_type = "j";
1292 else 1726 else
1293 kprobe_type = "k"; 1727 kprobe_type = "k";
1728
1294 if (sym) 1729 if (sym)
1295 seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", 1730 seq_printf(pi, "%p %s %s+0x%x %s ",
1296 p->addr, kprobe_type, sym, offset, 1731 p->addr, kprobe_type, sym, offset,
1297 (modname ? modname : " "), 1732 (modname ? modname : " "));
1298 (kprobe_gone(p) ? "[GONE]" : ""),
1299 ((kprobe_disabled(p) && !kprobe_gone(p)) ?
1300 "[DISABLED]" : ""));
1301 else 1733 else
1302 seq_printf(pi, "%p %s %p %s%s\n", 1734 seq_printf(pi, "%p %s %p ",
1303 p->addr, kprobe_type, p->addr, 1735 p->addr, kprobe_type, p->addr);
1304 (kprobe_gone(p) ? "[GONE]" : ""), 1736
1305 ((kprobe_disabled(p) && !kprobe_gone(p)) ? 1737 if (!pp)
1306 "[DISABLED]" : "")); 1738 pp = p;
1739 seq_printf(pi, "%s%s%s\n",
1740 (kprobe_gone(p) ? "[GONE]" : ""),
1741 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
1742 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
1307} 1743}
1308 1744
1309static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1745static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1339,11 +1775,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
1339 hlist_for_each_entry_rcu(p, node, head, hlist) { 1775 hlist_for_each_entry_rcu(p, node, head, hlist) {
1340 sym = kallsyms_lookup((unsigned long)p->addr, NULL, 1776 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
1341 &offset, &modname, namebuf); 1777 &offset, &modname, namebuf);
1342 if (p->pre_handler == aggr_pre_handler) { 1778 if (kprobe_aggrprobe(p)) {
1343 list_for_each_entry_rcu(kp, &p->list, list) 1779 list_for_each_entry_rcu(kp, &p->list, list)
1344 report_probe(pi, kp, sym, offset, modname); 1780 report_probe(pi, kp, sym, offset, modname, p);
1345 } else 1781 } else
1346 report_probe(pi, p, sym, offset, modname); 1782 report_probe(pi, p, sym, offset, modname, NULL);
1347 } 1783 }
1348 preempt_enable(); 1784 preempt_enable();
1349 return 0; 1785 return 0;
@@ -1421,12 +1857,13 @@ int __kprobes enable_kprobe(struct kprobe *kp)
1421 goto out; 1857 goto out;
1422 } 1858 }
1423 1859
1424 if (!kprobes_all_disarmed && kprobe_disabled(p))
1425 arm_kprobe(p);
1426
1427 p->flags &= ~KPROBE_FLAG_DISABLED;
1428 if (p != kp) 1860 if (p != kp)
1429 kp->flags &= ~KPROBE_FLAG_DISABLED; 1861 kp->flags &= ~KPROBE_FLAG_DISABLED;
1862
1863 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1864 p->flags &= ~KPROBE_FLAG_DISABLED;
1865 arm_kprobe(p);
1866 }
1430out: 1867out:
1431 mutex_unlock(&kprobe_mutex); 1868 mutex_unlock(&kprobe_mutex);
1432 return ret; 1869 return ret;
@@ -1446,12 +1883,13 @@ static void __kprobes arm_all_kprobes(void)
1446 if (!kprobes_all_disarmed) 1883 if (!kprobes_all_disarmed)
1447 goto already_enabled; 1884 goto already_enabled;
1448 1885
1886 /* Arming kprobes doesn't optimize kprobe itself */
1449 mutex_lock(&text_mutex); 1887 mutex_lock(&text_mutex);
1450 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1888 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1451 head = &kprobe_table[i]; 1889 head = &kprobe_table[i];
1452 hlist_for_each_entry_rcu(p, node, head, hlist) 1890 hlist_for_each_entry_rcu(p, node, head, hlist)
1453 if (!kprobe_disabled(p)) 1891 if (!kprobe_disabled(p))
1454 arch_arm_kprobe(p); 1892 __arm_kprobe(p);
1455 } 1893 }
1456 mutex_unlock(&text_mutex); 1894 mutex_unlock(&text_mutex);
1457 1895
@@ -1478,16 +1916,23 @@ static void __kprobes disarm_all_kprobes(void)
1478 1916
1479 kprobes_all_disarmed = true; 1917 kprobes_all_disarmed = true;
1480 printk(KERN_INFO "Kprobes globally disabled\n"); 1918 printk(KERN_INFO "Kprobes globally disabled\n");
1919
1920 /*
1921 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1922 * because disarming may also unoptimize kprobes.
1923 */
1924 get_online_cpus();
1481 mutex_lock(&text_mutex); 1925 mutex_lock(&text_mutex);
1482 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1926 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1483 head = &kprobe_table[i]; 1927 head = &kprobe_table[i];
1484 hlist_for_each_entry_rcu(p, node, head, hlist) { 1928 hlist_for_each_entry_rcu(p, node, head, hlist) {
1485 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 1929 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1486 arch_disarm_kprobe(p); 1930 __disarm_kprobe(p);
1487 } 1931 }
1488 } 1932 }
1489 1933
1490 mutex_unlock(&text_mutex); 1934 mutex_unlock(&text_mutex);
1935 put_online_cpus();
1491 mutex_unlock(&kprobe_mutex); 1936 mutex_unlock(&kprobe_mutex);
1492 /* Allow all currently running kprobes to complete */ 1937 /* Allow all currently running kprobes to complete */
1493 synchronize_sched(); 1938 synchronize_sched();
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3feaf5a74514..6b1ccc3f0205 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -197,16 +197,8 @@ static int __init ksysfs_init(void)
197 goto group_exit; 197 goto group_exit;
198 } 198 }
199 199
200 /* create the /sys/kernel/uids/ directory */
201 error = uids_sysfs_init();
202 if (error)
203 goto notes_exit;
204
205 return 0; 200 return 0;
206 201
207notes_exit:
208 if (notes_size > 0)
209 sysfs_remove_bin_file(kernel_kobj, &notes_attr);
210group_exit: 202group_exit:
211 sysfs_remove_group(kernel_kobj, &kernel_attr_group); 203 sysfs_remove_group(kernel_kobj, &kernel_attr_group);
212kset_exit: 204kset_exit:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fbb6222fe7e0..82ed0ea15194 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create)
101 * 101 *
102 * Description: This helper function creates and names a kernel 102 * Description: This helper function creates and names a kernel
103 * thread. The thread will be stopped: use wake_up_process() to start 103 * thread. The thread will be stopped: use wake_up_process() to start
104 * it. See also kthread_run(), kthread_create_on_cpu(). 104 * it. See also kthread_run().
105 * 105 *
106 * When woken, the thread will run @threadfn() with @data as its 106 * When woken, the thread will run @threadfn() with @data as its
107 * argument. @threadfn() can either call do_exit() directly if it is a 107 * argument. @threadfn() can either call do_exit() directly if it is a
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c62ec14609b9..0c30d0455de1 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3809,3 +3809,21 @@ void lockdep_sys_exit(void)
3809 lockdep_print_held_locks(curr); 3809 lockdep_print_held_locks(curr);
3810 } 3810 }
3811} 3811}
3812
3813void lockdep_rcu_dereference(const char *file, const int line)
3814{
3815 struct task_struct *curr = current;
3816
3817 if (!debug_locks_off())
3818 return;
3819 printk("\n===================================================\n");
3820 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
3821 printk( "---------------------------------------------------\n");
3822 printk("%s:%d invoked rcu_dereference_check() without protection!\n",
3823 file, line);
3824 printk("\nother info that might help us debug this:\n\n");
3825 lockdep_print_held_locks(curr);
3826 printk("\nstack backtrace:\n");
3827 dump_stack();
3828}
3829EXPORT_SYMBOL_GPL(lockdep_rcu_dereference);
diff --git a/kernel/module.c b/kernel/module.c
index f82386bd9ee9..e5538d5f00ad 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -474,9 +474,10 @@ static void module_unload_init(struct module *mod)
474 474
475 INIT_LIST_HEAD(&mod->modules_which_use_me); 475 INIT_LIST_HEAD(&mod->modules_which_use_me);
476 for_each_possible_cpu(cpu) 476 for_each_possible_cpu(cpu)
477 local_set(__module_ref_addr(mod, cpu), 0); 477 per_cpu_ptr(mod->refptr, cpu)->count = 0;
478
478 /* Hold reference count during initialization. */ 479 /* Hold reference count during initialization. */
479 local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); 480 __this_cpu_write(mod->refptr->count, 1);
480 /* Backwards compatibility macros put refcount during init. */ 481 /* Backwards compatibility macros put refcount during init. */
481 mod->waiter = current; 482 mod->waiter = current;
482} 483}
@@ -619,7 +620,7 @@ unsigned int module_refcount(struct module *mod)
619 int cpu; 620 int cpu;
620 621
621 for_each_possible_cpu(cpu) 622 for_each_possible_cpu(cpu)
622 total += local_read(__module_ref_addr(mod, cpu)); 623 total += per_cpu_ptr(mod->refptr, cpu)->count;
623 return total; 624 return total;
624} 625}
625EXPORT_SYMBOL(module_refcount); 626EXPORT_SYMBOL(module_refcount);
@@ -796,14 +797,15 @@ static struct module_attribute refcnt = {
796void module_put(struct module *module) 797void module_put(struct module *module)
797{ 798{
798 if (module) { 799 if (module) {
799 unsigned int cpu = get_cpu(); 800 preempt_disable();
800 local_dec(__module_ref_addr(module, cpu)); 801 __this_cpu_dec(module->refptr->count);
802
801 trace_module_put(module, _RET_IP_, 803 trace_module_put(module, _RET_IP_,
802 local_read(__module_ref_addr(module, cpu))); 804 __this_cpu_read(module->refptr->count));
803 /* Maybe they're waiting for us to drop reference? */ 805 /* Maybe they're waiting for us to drop reference? */
804 if (unlikely(!module_is_live(module))) 806 if (unlikely(!module_is_live(module)))
805 wake_up_process(module->waiter); 807 wake_up_process(module->waiter);
806 put_cpu(); 808 preempt_enable();
807 } 809 }
808} 810}
809EXPORT_SYMBOL(module_put); 811EXPORT_SYMBOL(module_put);
@@ -1397,9 +1399,9 @@ static void free_module(struct module *mod)
1397 kfree(mod->args); 1399 kfree(mod->args);
1398 if (mod->percpu) 1400 if (mod->percpu)
1399 percpu_modfree(mod->percpu); 1401 percpu_modfree(mod->percpu);
1400#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 1402#if defined(CONFIG_MODULE_UNLOAD)
1401 if (mod->refptr) 1403 if (mod->refptr)
1402 percpu_modfree(mod->refptr); 1404 free_percpu(mod->refptr);
1403#endif 1405#endif
1404 /* Free lock-classes: */ 1406 /* Free lock-classes: */
1405 lockdep_free_key_range(mod->module_core, mod->core_size); 1407 lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -2162,9 +2164,8 @@ static noinline struct module *load_module(void __user *umod,
2162 mod = (void *)sechdrs[modindex].sh_addr; 2164 mod = (void *)sechdrs[modindex].sh_addr;
2163 kmemleak_load_module(mod, hdr, sechdrs, secstrings); 2165 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2164 2166
2165#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2167#if defined(CONFIG_MODULE_UNLOAD)
2166 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), 2168 mod->refptr = alloc_percpu(struct module_ref);
2167 mod->name);
2168 if (!mod->refptr) { 2169 if (!mod->refptr) {
2169 err = -ENOMEM; 2170 err = -ENOMEM;
2170 goto free_init; 2171 goto free_init;
@@ -2396,8 +2397,8 @@ static noinline struct module *load_module(void __user *umod,
2396 kobject_put(&mod->mkobj.kobj); 2397 kobject_put(&mod->mkobj.kobj);
2397 free_unload: 2398 free_unload:
2398 module_unload_free(mod); 2399 module_unload_free(mod);
2399#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2400#if defined(CONFIG_MODULE_UNLOAD)
2400 percpu_modfree(mod->refptr); 2401 free_percpu(mod->refptr);
2401 free_init: 2402 free_init:
2402#endif 2403#endif
2403 module_free(mod, mod->module_init); 2404 module_free(mod, mod->module_init);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index acd24e7643eb..2488ba7eb568 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -78,10 +78,10 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
78 int ret = NOTIFY_DONE; 78 int ret = NOTIFY_DONE;
79 struct notifier_block *nb, *next_nb; 79 struct notifier_block *nb, *next_nb;
80 80
81 nb = rcu_dereference(*nl); 81 nb = rcu_dereference_raw(*nl);
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference_raw(nb->next);
85 85
86#ifdef CONFIG_DEBUG_NOTIFIERS 86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { 87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
@@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
309 * racy then it does not matter what the result of the test 309 * racy then it does not matter what the result of the test
310 * is, we re-check the list after having taken the lock anyway: 310 * is, we re-check the list after having taken the lock anyway:
311 */ 311 */
312 if (rcu_dereference(nh->head)) { 312 if (rcu_dereference_raw(nh->head)) {
313 down_read(&nh->rwsem); 313 down_read(&nh->rwsem);
314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, 314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
315 nr_calls); 315 nr_calls);
diff --git a/kernel/padata.c b/kernel/padata.c
index 6f9bcb8313d6..93caf65ff57c 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -642,6 +642,9 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
642 if (!pd) 642 if (!pd)
643 goto err_free_inst; 643 goto err_free_inst;
644 644
645 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
646 goto err_free_pd;
647
645 rcu_assign_pointer(pinst->pd, pd); 648 rcu_assign_pointer(pinst->pd, pd);
646 649
647 pinst->wq = wq; 650 pinst->wq = wq;
@@ -654,12 +657,14 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
654 pinst->cpu_notifier.priority = 0; 657 pinst->cpu_notifier.priority = 0;
655 err = register_hotcpu_notifier(&pinst->cpu_notifier); 658 err = register_hotcpu_notifier(&pinst->cpu_notifier);
656 if (err) 659 if (err)
657 goto err_free_pd; 660 goto err_free_cpumask;
658 661
659 mutex_init(&pinst->lock); 662 mutex_init(&pinst->lock);
660 663
661 return pinst; 664 return pinst;
662 665
666err_free_cpumask:
667 free_cpumask_var(pinst->cpumask);
663err_free_pd: 668err_free_pd:
664 padata_free_pd(pd); 669 padata_free_pd(pd);
665err_free_inst: 670err_free_inst:
@@ -685,6 +690,7 @@ void padata_free(struct padata_instance *pinst)
685 690
686 unregister_hotcpu_notifier(&pinst->cpu_notifier); 691 unregister_hotcpu_notifier(&pinst->cpu_notifier);
687 padata_free_pd(pinst->pd); 692 padata_free_pd(pinst->pd);
693 free_cpumask_var(pinst->cpumask);
688 kfree(pinst); 694 kfree(pinst);
689} 695}
690EXPORT_SYMBOL(padata_free); 696EXPORT_SYMBOL(padata_free);
diff --git a/kernel/panic.c b/kernel/panic.c
index c787333282b8..13d966b4c14a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -36,15 +36,36 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
36 36
37EXPORT_SYMBOL(panic_notifier_list); 37EXPORT_SYMBOL(panic_notifier_list);
38 38
39static long no_blink(long time)
40{
41 return 0;
42}
43
44/* Returns how long it waited in ms */ 39/* Returns how long it waited in ms */
45long (*panic_blink)(long time); 40long (*panic_blink)(long time);
46EXPORT_SYMBOL(panic_blink); 41EXPORT_SYMBOL(panic_blink);
47 42
43static void panic_blink_one_second(void)
44{
45 static long i = 0, end;
46
47 if (panic_blink) {
48 end = i + MSEC_PER_SEC;
49
50 while (i < end) {
51 i += panic_blink(i);
52 mdelay(1);
53 i++;
54 }
55 } else {
56 /*
57 * When running under a hypervisor a small mdelay may get
58 * rounded up to the hypervisor timeslice. For example, with
59 * a 1ms in 10ms hypervisor timeslice we might inflate a
60 * mdelay(1) loop by 10x.
61 *
62 * If we have nothing to blink, spin on 1 second calls to
63 * mdelay to avoid this.
64 */
65 mdelay(MSEC_PER_SEC);
66 }
67}
68
48/** 69/**
49 * panic - halt the system 70 * panic - halt the system
50 * @fmt: The text string to print 71 * @fmt: The text string to print
@@ -95,9 +116,6 @@ NORET_TYPE void panic(const char * fmt, ...)
95 116
96 bust_spinlocks(0); 117 bust_spinlocks(0);
97 118
98 if (!panic_blink)
99 panic_blink = no_blink;
100
101 if (panic_timeout > 0) { 119 if (panic_timeout > 0) {
102 /* 120 /*
103 * Delay timeout seconds before rebooting the machine. 121 * Delay timeout seconds before rebooting the machine.
@@ -105,11 +123,9 @@ NORET_TYPE void panic(const char * fmt, ...)
105 */ 123 */
106 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); 124 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
107 125
108 for (i = 0; i < panic_timeout*1000; ) { 126 for (i = 0; i < panic_timeout; i++) {
109 touch_nmi_watchdog(); 127 touch_nmi_watchdog();
110 i += panic_blink(i); 128 panic_blink_one_second();
111 mdelay(1);
112 i++;
113 } 129 }
114 /* 130 /*
115 * This will not be a clean reboot, with everything 131 * This will not be a clean reboot, with everything
@@ -135,11 +151,9 @@ NORET_TYPE void panic(const char * fmt, ...)
135 } 151 }
136#endif 152#endif
137 local_irq_enable(); 153 local_irq_enable();
138 for (i = 0; ; ) { 154 while (1) {
139 touch_softlockup_watchdog(); 155 touch_softlockup_watchdog();
140 i += panic_blink(i); 156 panic_blink_one_second();
141 mdelay(1);
142 i++;
143 } 157 }
144} 158}
145 159
diff --git a/kernel/params.c b/kernel/params.c
index cf1b69183127..8d95f5451b22 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,7 +24,6 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27#include <linux/string.h>
28 27
29#if 0 28#if 0
30#define DEBUGP printk 29#define DEBUGP printk
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2ae7409bf38f..8e352c756ba7 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -98,11 +98,12 @@ void __weak hw_perf_enable(void) { barrier(); }
98 98
99void __weak hw_perf_event_setup(int cpu) { barrier(); } 99void __weak hw_perf_event_setup(int cpu) { barrier(); }
100void __weak hw_perf_event_setup_online(int cpu) { barrier(); } 100void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
101void __weak hw_perf_event_setup_offline(int cpu) { barrier(); }
101 102
102int __weak 103int __weak
103hw_perf_group_sched_in(struct perf_event *group_leader, 104hw_perf_group_sched_in(struct perf_event *group_leader,
104 struct perf_cpu_context *cpuctx, 105 struct perf_cpu_context *cpuctx,
105 struct perf_event_context *ctx, int cpu) 106 struct perf_event_context *ctx)
106{ 107{
107 return 0; 108 return 0;
108} 109}
@@ -248,7 +249,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
248 249
249static inline u64 perf_clock(void) 250static inline u64 perf_clock(void)
250{ 251{
251 return cpu_clock(smp_processor_id()); 252 return cpu_clock(raw_smp_processor_id());
252} 253}
253 254
254/* 255/*
@@ -289,6 +290,15 @@ static void update_event_times(struct perf_event *event)
289 event->total_time_running = run_end - event->tstamp_running; 290 event->total_time_running = run_end - event->tstamp_running;
290} 291}
291 292
293static struct list_head *
294ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
295{
296 if (event->attr.pinned)
297 return &ctx->pinned_groups;
298 else
299 return &ctx->flexible_groups;
300}
301
292/* 302/*
293 * Add a event from the lists for its context. 303 * Add a event from the lists for its context.
294 * Must be called with ctx->mutex and ctx->lock held. 304 * Must be called with ctx->mutex and ctx->lock held.
@@ -303,9 +313,19 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
303 * add it straight to the context's event list, or to the group 313 * add it straight to the context's event list, or to the group
304 * leader's sibling list: 314 * leader's sibling list:
305 */ 315 */
306 if (group_leader == event) 316 if (group_leader == event) {
307 list_add_tail(&event->group_entry, &ctx->group_list); 317 struct list_head *list;
308 else { 318
319 if (is_software_event(event))
320 event->group_flags |= PERF_GROUP_SOFTWARE;
321
322 list = ctx_group_list(event, ctx);
323 list_add_tail(&event->group_entry, list);
324 } else {
325 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
326 !is_software_event(event))
327 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
328
309 list_add_tail(&event->group_entry, &group_leader->sibling_list); 329 list_add_tail(&event->group_entry, &group_leader->sibling_list);
310 group_leader->nr_siblings++; 330 group_leader->nr_siblings++;
311 } 331 }
@@ -355,9 +375,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
355 * to the context list directly: 375 * to the context list directly:
356 */ 376 */
357 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 377 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
378 struct list_head *list;
358 379
359 list_move_tail(&sibling->group_entry, &ctx->group_list); 380 list = ctx_group_list(event, ctx);
381 list_move_tail(&sibling->group_entry, list);
360 sibling->group_leader = sibling; 382 sibling->group_leader = sibling;
383
384 /* Inherit group flags from the previous leader */
385 sibling->group_flags = event->group_flags;
361 } 386 }
362} 387}
363 388
@@ -608,14 +633,13 @@ void perf_event_disable(struct perf_event *event)
608static int 633static int
609event_sched_in(struct perf_event *event, 634event_sched_in(struct perf_event *event,
610 struct perf_cpu_context *cpuctx, 635 struct perf_cpu_context *cpuctx,
611 struct perf_event_context *ctx, 636 struct perf_event_context *ctx)
612 int cpu)
613{ 637{
614 if (event->state <= PERF_EVENT_STATE_OFF) 638 if (event->state <= PERF_EVENT_STATE_OFF)
615 return 0; 639 return 0;
616 640
617 event->state = PERF_EVENT_STATE_ACTIVE; 641 event->state = PERF_EVENT_STATE_ACTIVE;
618 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ 642 event->oncpu = smp_processor_id();
619 /* 643 /*
620 * The new state must be visible before we turn it on in the hardware: 644 * The new state must be visible before we turn it on in the hardware:
621 */ 645 */
@@ -642,8 +666,7 @@ event_sched_in(struct perf_event *event,
642static int 666static int
643group_sched_in(struct perf_event *group_event, 667group_sched_in(struct perf_event *group_event,
644 struct perf_cpu_context *cpuctx, 668 struct perf_cpu_context *cpuctx,
645 struct perf_event_context *ctx, 669 struct perf_event_context *ctx)
646 int cpu)
647{ 670{
648 struct perf_event *event, *partial_group; 671 struct perf_event *event, *partial_group;
649 int ret; 672 int ret;
@@ -651,18 +674,18 @@ group_sched_in(struct perf_event *group_event,
651 if (group_event->state == PERF_EVENT_STATE_OFF) 674 if (group_event->state == PERF_EVENT_STATE_OFF)
652 return 0; 675 return 0;
653 676
654 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); 677 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
655 if (ret) 678 if (ret)
656 return ret < 0 ? ret : 0; 679 return ret < 0 ? ret : 0;
657 680
658 if (event_sched_in(group_event, cpuctx, ctx, cpu)) 681 if (event_sched_in(group_event, cpuctx, ctx))
659 return -EAGAIN; 682 return -EAGAIN;
660 683
661 /* 684 /*
662 * Schedule in siblings as one group (if any): 685 * Schedule in siblings as one group (if any):
663 */ 686 */
664 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 687 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
665 if (event_sched_in(event, cpuctx, ctx, cpu)) { 688 if (event_sched_in(event, cpuctx, ctx)) {
666 partial_group = event; 689 partial_group = event;
667 goto group_error; 690 goto group_error;
668 } 691 }
@@ -686,24 +709,6 @@ group_error:
686} 709}
687 710
688/* 711/*
689 * Return 1 for a group consisting entirely of software events,
690 * 0 if the group contains any hardware events.
691 */
692static int is_software_only_group(struct perf_event *leader)
693{
694 struct perf_event *event;
695
696 if (!is_software_event(leader))
697 return 0;
698
699 list_for_each_entry(event, &leader->sibling_list, group_entry)
700 if (!is_software_event(event))
701 return 0;
702
703 return 1;
704}
705
706/*
707 * Work out whether we can put this event group on the CPU now. 712 * Work out whether we can put this event group on the CPU now.
708 */ 713 */
709static int group_can_go_on(struct perf_event *event, 714static int group_can_go_on(struct perf_event *event,
@@ -713,7 +718,7 @@ static int group_can_go_on(struct perf_event *event,
713 /* 718 /*
714 * Groups consisting entirely of software events can always go on. 719 * Groups consisting entirely of software events can always go on.
715 */ 720 */
716 if (is_software_only_group(event)) 721 if (event->group_flags & PERF_GROUP_SOFTWARE)
717 return 1; 722 return 1;
718 /* 723 /*
719 * If an exclusive group is already on, no other hardware 724 * If an exclusive group is already on, no other hardware
@@ -754,7 +759,6 @@ static void __perf_install_in_context(void *info)
754 struct perf_event *event = info; 759 struct perf_event *event = info;
755 struct perf_event_context *ctx = event->ctx; 760 struct perf_event_context *ctx = event->ctx;
756 struct perf_event *leader = event->group_leader; 761 struct perf_event *leader = event->group_leader;
757 int cpu = smp_processor_id();
758 int err; 762 int err;
759 763
760 /* 764 /*
@@ -801,7 +805,7 @@ static void __perf_install_in_context(void *info)
801 if (!group_can_go_on(event, cpuctx, 1)) 805 if (!group_can_go_on(event, cpuctx, 1))
802 err = -EEXIST; 806 err = -EEXIST;
803 else 807 else
804 err = event_sched_in(event, cpuctx, ctx, cpu); 808 err = event_sched_in(event, cpuctx, ctx);
805 809
806 if (err) { 810 if (err) {
807 /* 811 /*
@@ -943,11 +947,9 @@ static void __perf_event_enable(void *info)
943 } else { 947 } else {
944 perf_disable(); 948 perf_disable();
945 if (event == leader) 949 if (event == leader)
946 err = group_sched_in(event, cpuctx, ctx, 950 err = group_sched_in(event, cpuctx, ctx);
947 smp_processor_id());
948 else 951 else
949 err = event_sched_in(event, cpuctx, ctx, 952 err = event_sched_in(event, cpuctx, ctx);
950 smp_processor_id());
951 perf_enable(); 953 perf_enable();
952 } 954 }
953 955
@@ -1043,8 +1045,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1043 return 0; 1045 return 0;
1044} 1046}
1045 1047
1046void __perf_event_sched_out(struct perf_event_context *ctx, 1048enum event_type_t {
1047 struct perf_cpu_context *cpuctx) 1049 EVENT_FLEXIBLE = 0x1,
1050 EVENT_PINNED = 0x2,
1051 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1052};
1053
1054static void ctx_sched_out(struct perf_event_context *ctx,
1055 struct perf_cpu_context *cpuctx,
1056 enum event_type_t event_type)
1048{ 1057{
1049 struct perf_event *event; 1058 struct perf_event *event;
1050 1059
@@ -1055,10 +1064,18 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1055 update_context_time(ctx); 1064 update_context_time(ctx);
1056 1065
1057 perf_disable(); 1066 perf_disable();
1058 if (ctx->nr_active) { 1067 if (!ctx->nr_active)
1059 list_for_each_entry(event, &ctx->group_list, group_entry) 1068 goto out_enable;
1069
1070 if (event_type & EVENT_PINNED)
1071 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1060 group_sched_out(event, cpuctx, ctx); 1072 group_sched_out(event, cpuctx, ctx);
1061 } 1073
1074 if (event_type & EVENT_FLEXIBLE)
1075 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1076 group_sched_out(event, cpuctx, ctx);
1077
1078 out_enable:
1062 perf_enable(); 1079 perf_enable();
1063 out: 1080 out:
1064 raw_spin_unlock(&ctx->lock); 1081 raw_spin_unlock(&ctx->lock);
@@ -1170,9 +1187,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1170 * not restart the event. 1187 * not restart the event.
1171 */ 1188 */
1172void perf_event_task_sched_out(struct task_struct *task, 1189void perf_event_task_sched_out(struct task_struct *task,
1173 struct task_struct *next, int cpu) 1190 struct task_struct *next)
1174{ 1191{
1175 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1192 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1176 struct perf_event_context *ctx = task->perf_event_ctxp; 1193 struct perf_event_context *ctx = task->perf_event_ctxp;
1177 struct perf_event_context *next_ctx; 1194 struct perf_event_context *next_ctx;
1178 struct perf_event_context *parent; 1195 struct perf_event_context *parent;
@@ -1220,15 +1237,13 @@ void perf_event_task_sched_out(struct task_struct *task,
1220 rcu_read_unlock(); 1237 rcu_read_unlock();
1221 1238
1222 if (do_switch) { 1239 if (do_switch) {
1223 __perf_event_sched_out(ctx, cpuctx); 1240 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1224 cpuctx->task_ctx = NULL; 1241 cpuctx->task_ctx = NULL;
1225 } 1242 }
1226} 1243}
1227 1244
1228/* 1245static void task_ctx_sched_out(struct perf_event_context *ctx,
1229 * Called with IRQs disabled 1246 enum event_type_t event_type)
1230 */
1231static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1232{ 1247{
1233 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1248 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1234 1249
@@ -1238,47 +1253,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1238 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 1253 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1239 return; 1254 return;
1240 1255
1241 __perf_event_sched_out(ctx, cpuctx); 1256 ctx_sched_out(ctx, cpuctx, event_type);
1242 cpuctx->task_ctx = NULL; 1257 cpuctx->task_ctx = NULL;
1243} 1258}
1244 1259
1245/* 1260/*
1246 * Called with IRQs disabled 1261 * Called with IRQs disabled
1247 */ 1262 */
1248static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) 1263static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1264{
1265 task_ctx_sched_out(ctx, EVENT_ALL);
1266}
1267
1268/*
1269 * Called with IRQs disabled
1270 */
1271static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1272 enum event_type_t event_type)
1249{ 1273{
1250 __perf_event_sched_out(&cpuctx->ctx, cpuctx); 1274 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1251} 1275}
1252 1276
1253static void 1277static void
1254__perf_event_sched_in(struct perf_event_context *ctx, 1278ctx_pinned_sched_in(struct perf_event_context *ctx,
1255 struct perf_cpu_context *cpuctx, int cpu) 1279 struct perf_cpu_context *cpuctx)
1256{ 1280{
1257 struct perf_event *event; 1281 struct perf_event *event;
1258 int can_add_hw = 1;
1259
1260 raw_spin_lock(&ctx->lock);
1261 ctx->is_active = 1;
1262 if (likely(!ctx->nr_events))
1263 goto out;
1264 1282
1265 ctx->timestamp = perf_clock(); 1283 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1266 1284 if (event->state <= PERF_EVENT_STATE_OFF)
1267 perf_disable();
1268
1269 /*
1270 * First go through the list and put on any pinned groups
1271 * in order to give them the best chance of going on.
1272 */
1273 list_for_each_entry(event, &ctx->group_list, group_entry) {
1274 if (event->state <= PERF_EVENT_STATE_OFF ||
1275 !event->attr.pinned)
1276 continue; 1285 continue;
1277 if (event->cpu != -1 && event->cpu != cpu) 1286 if (event->cpu != -1 && event->cpu != smp_processor_id())
1278 continue; 1287 continue;
1279 1288
1280 if (group_can_go_on(event, cpuctx, 1)) 1289 if (group_can_go_on(event, cpuctx, 1))
1281 group_sched_in(event, cpuctx, ctx, cpu); 1290 group_sched_in(event, cpuctx, ctx);
1282 1291
1283 /* 1292 /*
1284 * If this pinned group hasn't been scheduled, 1293 * If this pinned group hasn't been scheduled,
@@ -1289,32 +1298,83 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1289 event->state = PERF_EVENT_STATE_ERROR; 1298 event->state = PERF_EVENT_STATE_ERROR;
1290 } 1299 }
1291 } 1300 }
1301}
1292 1302
1293 list_for_each_entry(event, &ctx->group_list, group_entry) { 1303static void
1294 /* 1304ctx_flexible_sched_in(struct perf_event_context *ctx,
1295 * Ignore events in OFF or ERROR state, and 1305 struct perf_cpu_context *cpuctx)
1296 * ignore pinned events since we did them already. 1306{
1297 */ 1307 struct perf_event *event;
1298 if (event->state <= PERF_EVENT_STATE_OFF || 1308 int can_add_hw = 1;
1299 event->attr.pinned)
1300 continue;
1301 1309
1310 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1311 /* Ignore events in OFF or ERROR state */
1312 if (event->state <= PERF_EVENT_STATE_OFF)
1313 continue;
1302 /* 1314 /*
1303 * Listen to the 'cpu' scheduling filter constraint 1315 * Listen to the 'cpu' scheduling filter constraint
1304 * of events: 1316 * of events:
1305 */ 1317 */
1306 if (event->cpu != -1 && event->cpu != cpu) 1318 if (event->cpu != -1 && event->cpu != smp_processor_id())
1307 continue; 1319 continue;
1308 1320
1309 if (group_can_go_on(event, cpuctx, can_add_hw)) 1321 if (group_can_go_on(event, cpuctx, can_add_hw))
1310 if (group_sched_in(event, cpuctx, ctx, cpu)) 1322 if (group_sched_in(event, cpuctx, ctx))
1311 can_add_hw = 0; 1323 can_add_hw = 0;
1312 } 1324 }
1325}
1326
1327static void
1328ctx_sched_in(struct perf_event_context *ctx,
1329 struct perf_cpu_context *cpuctx,
1330 enum event_type_t event_type)
1331{
1332 raw_spin_lock(&ctx->lock);
1333 ctx->is_active = 1;
1334 if (likely(!ctx->nr_events))
1335 goto out;
1336
1337 ctx->timestamp = perf_clock();
1338
1339 perf_disable();
1340
1341 /*
1342 * First go through the list and put on any pinned groups
1343 * in order to give them the best chance of going on.
1344 */
1345 if (event_type & EVENT_PINNED)
1346 ctx_pinned_sched_in(ctx, cpuctx);
1347
1348 /* Then walk through the lower prio flexible groups */
1349 if (event_type & EVENT_FLEXIBLE)
1350 ctx_flexible_sched_in(ctx, cpuctx);
1351
1313 perf_enable(); 1352 perf_enable();
1314 out: 1353 out:
1315 raw_spin_unlock(&ctx->lock); 1354 raw_spin_unlock(&ctx->lock);
1316} 1355}
1317 1356
1357static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1358 enum event_type_t event_type)
1359{
1360 struct perf_event_context *ctx = &cpuctx->ctx;
1361
1362 ctx_sched_in(ctx, cpuctx, event_type);
1363}
1364
1365static void task_ctx_sched_in(struct task_struct *task,
1366 enum event_type_t event_type)
1367{
1368 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1369 struct perf_event_context *ctx = task->perf_event_ctxp;
1370
1371 if (likely(!ctx))
1372 return;
1373 if (cpuctx->task_ctx == ctx)
1374 return;
1375 ctx_sched_in(ctx, cpuctx, event_type);
1376 cpuctx->task_ctx = ctx;
1377}
1318/* 1378/*
1319 * Called from scheduler to add the events of the current task 1379 * Called from scheduler to add the events of the current task
1320 * with interrupts disabled. 1380 * with interrupts disabled.
@@ -1326,38 +1386,128 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1326 * accessing the event control register. If a NMI hits, then it will 1386 * accessing the event control register. If a NMI hits, then it will
1327 * keep the event running. 1387 * keep the event running.
1328 */ 1388 */
1329void perf_event_task_sched_in(struct task_struct *task, int cpu) 1389void perf_event_task_sched_in(struct task_struct *task)
1330{ 1390{
1331 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1391 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1332 struct perf_event_context *ctx = task->perf_event_ctxp; 1392 struct perf_event_context *ctx = task->perf_event_ctxp;
1333 1393
1334 if (likely(!ctx)) 1394 if (likely(!ctx))
1335 return; 1395 return;
1396
1336 if (cpuctx->task_ctx == ctx) 1397 if (cpuctx->task_ctx == ctx)
1337 return; 1398 return;
1338 __perf_event_sched_in(ctx, cpuctx, cpu); 1399
1400 /*
1401 * We want to keep the following priority order:
1402 * cpu pinned (that don't need to move), task pinned,
1403 * cpu flexible, task flexible.
1404 */
1405 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1406
1407 ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
1408 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1409 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1410
1339 cpuctx->task_ctx = ctx; 1411 cpuctx->task_ctx = ctx;
1340} 1412}
1341 1413
1342static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) 1414#define MAX_INTERRUPTS (~0ULL)
1415
1416static void perf_log_throttle(struct perf_event *event, int enable);
1417
1418static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1343{ 1419{
1344 struct perf_event_context *ctx = &cpuctx->ctx; 1420 u64 frequency = event->attr.sample_freq;
1421 u64 sec = NSEC_PER_SEC;
1422 u64 divisor, dividend;
1423
1424 int count_fls, nsec_fls, frequency_fls, sec_fls;
1425
1426 count_fls = fls64(count);
1427 nsec_fls = fls64(nsec);
1428 frequency_fls = fls64(frequency);
1429 sec_fls = 30;
1345 1430
1346 __perf_event_sched_in(ctx, cpuctx, cpu); 1431 /*
1432 * We got @count in @nsec, with a target of sample_freq HZ
1433 * the target period becomes:
1434 *
1435 * @count * 10^9
1436 * period = -------------------
1437 * @nsec * sample_freq
1438 *
1439 */
1440
1441 /*
1442 * Reduce accuracy by one bit such that @a and @b converge
1443 * to a similar magnitude.
1444 */
1445#define REDUCE_FLS(a, b) \
1446do { \
1447 if (a##_fls > b##_fls) { \
1448 a >>= 1; \
1449 a##_fls--; \
1450 } else { \
1451 b >>= 1; \
1452 b##_fls--; \
1453 } \
1454} while (0)
1455
1456 /*
1457 * Reduce accuracy until either term fits in a u64, then proceed with
1458 * the other, so that finally we can do a u64/u64 division.
1459 */
1460 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1461 REDUCE_FLS(nsec, frequency);
1462 REDUCE_FLS(sec, count);
1463 }
1464
1465 if (count_fls + sec_fls > 64) {
1466 divisor = nsec * frequency;
1467
1468 while (count_fls + sec_fls > 64) {
1469 REDUCE_FLS(count, sec);
1470 divisor >>= 1;
1471 }
1472
1473 dividend = count * sec;
1474 } else {
1475 dividend = count * sec;
1476
1477 while (nsec_fls + frequency_fls > 64) {
1478 REDUCE_FLS(nsec, frequency);
1479 dividend >>= 1;
1480 }
1481
1482 divisor = nsec * frequency;
1483 }
1484
1485 return div64_u64(dividend, divisor);
1347} 1486}
1348 1487
1349#define MAX_INTERRUPTS (~0ULL) 1488static void perf_event_stop(struct perf_event *event)
1489{
1490 if (!event->pmu->stop)
1491 return event->pmu->disable(event);
1350 1492
1351static void perf_log_throttle(struct perf_event *event, int enable); 1493 return event->pmu->stop(event);
1494}
1495
1496static int perf_event_start(struct perf_event *event)
1497{
1498 if (!event->pmu->start)
1499 return event->pmu->enable(event);
1352 1500
1353static void perf_adjust_period(struct perf_event *event, u64 events) 1501 return event->pmu->start(event);
1502}
1503
1504static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1354{ 1505{
1355 struct hw_perf_event *hwc = &event->hw; 1506 struct hw_perf_event *hwc = &event->hw;
1356 u64 period, sample_period; 1507 u64 period, sample_period;
1357 s64 delta; 1508 s64 delta;
1358 1509
1359 events *= hwc->sample_period; 1510 period = perf_calculate_period(event, nsec, count);
1360 period = div64_u64(events, event->attr.sample_freq);
1361 1511
1362 delta = (s64)(period - hwc->sample_period); 1512 delta = (s64)(period - hwc->sample_period);
1363 delta = (delta + 7) / 8; /* low pass filter */ 1513 delta = (delta + 7) / 8; /* low pass filter */
@@ -1368,13 +1518,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
1368 sample_period = 1; 1518 sample_period = 1;
1369 1519
1370 hwc->sample_period = sample_period; 1520 hwc->sample_period = sample_period;
1521
1522 if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1523 perf_disable();
1524 perf_event_stop(event);
1525 atomic64_set(&hwc->period_left, 0);
1526 perf_event_start(event);
1527 perf_enable();
1528 }
1371} 1529}
1372 1530
1373static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1531static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1374{ 1532{
1375 struct perf_event *event; 1533 struct perf_event *event;
1376 struct hw_perf_event *hwc; 1534 struct hw_perf_event *hwc;
1377 u64 interrupts, freq; 1535 u64 interrupts, now;
1536 s64 delta;
1378 1537
1379 raw_spin_lock(&ctx->lock); 1538 raw_spin_lock(&ctx->lock);
1380 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 1539 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
@@ -1395,44 +1554,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1395 if (interrupts == MAX_INTERRUPTS) { 1554 if (interrupts == MAX_INTERRUPTS) {
1396 perf_log_throttle(event, 1); 1555 perf_log_throttle(event, 1);
1397 event->pmu->unthrottle(event); 1556 event->pmu->unthrottle(event);
1398 interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1399 } 1557 }
1400 1558
1401 if (!event->attr.freq || !event->attr.sample_freq) 1559 if (!event->attr.freq || !event->attr.sample_freq)
1402 continue; 1560 continue;
1403 1561
1404 /* 1562 event->pmu->read(event);
1405 * if the specified freq < HZ then we need to skip ticks 1563 now = atomic64_read(&event->count);
1406 */ 1564 delta = now - hwc->freq_count_stamp;
1407 if (event->attr.sample_freq < HZ) { 1565 hwc->freq_count_stamp = now;
1408 freq = event->attr.sample_freq;
1409
1410 hwc->freq_count += freq;
1411 hwc->freq_interrupts += interrupts;
1412
1413 if (hwc->freq_count < HZ)
1414 continue;
1415
1416 interrupts = hwc->freq_interrupts;
1417 hwc->freq_interrupts = 0;
1418 hwc->freq_count -= HZ;
1419 } else
1420 freq = HZ;
1421
1422 perf_adjust_period(event, freq * interrupts);
1423 1566
1424 /* 1567 if (delta > 0)
1425 * In order to avoid being stalled by an (accidental) huge 1568 perf_adjust_period(event, TICK_NSEC, delta);
1426 * sample period, force reset the sample period if we didn't
1427 * get any events in this freq period.
1428 */
1429 if (!interrupts) {
1430 perf_disable();
1431 event->pmu->disable(event);
1432 atomic64_set(&hwc->period_left, 0);
1433 event->pmu->enable(event);
1434 perf_enable();
1435 }
1436 } 1569 }
1437 raw_spin_unlock(&ctx->lock); 1570 raw_spin_unlock(&ctx->lock);
1438} 1571}
@@ -1442,26 +1575,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1442 */ 1575 */
1443static void rotate_ctx(struct perf_event_context *ctx) 1576static void rotate_ctx(struct perf_event_context *ctx)
1444{ 1577{
1445 struct perf_event *event;
1446
1447 if (!ctx->nr_events) 1578 if (!ctx->nr_events)
1448 return; 1579 return;
1449 1580
1450 raw_spin_lock(&ctx->lock); 1581 raw_spin_lock(&ctx->lock);
1451 /* 1582
1452 * Rotate the first entry last (works just fine for group events too): 1583 /* Rotate the first entry last of non-pinned groups */
1453 */ 1584 list_rotate_left(&ctx->flexible_groups);
1454 perf_disable();
1455 list_for_each_entry(event, &ctx->group_list, group_entry) {
1456 list_move_tail(&event->group_entry, &ctx->group_list);
1457 break;
1458 }
1459 perf_enable();
1460 1585
1461 raw_spin_unlock(&ctx->lock); 1586 raw_spin_unlock(&ctx->lock);
1462} 1587}
1463 1588
1464void perf_event_task_tick(struct task_struct *curr, int cpu) 1589void perf_event_task_tick(struct task_struct *curr)
1465{ 1590{
1466 struct perf_cpu_context *cpuctx; 1591 struct perf_cpu_context *cpuctx;
1467 struct perf_event_context *ctx; 1592 struct perf_event_context *ctx;
@@ -1469,24 +1594,43 @@ void perf_event_task_tick(struct task_struct *curr, int cpu)
1469 if (!atomic_read(&nr_events)) 1594 if (!atomic_read(&nr_events))
1470 return; 1595 return;
1471 1596
1472 cpuctx = &per_cpu(perf_cpu_context, cpu); 1597 cpuctx = &__get_cpu_var(perf_cpu_context);
1473 ctx = curr->perf_event_ctxp; 1598 ctx = curr->perf_event_ctxp;
1474 1599
1600 perf_disable();
1601
1475 perf_ctx_adjust_freq(&cpuctx->ctx); 1602 perf_ctx_adjust_freq(&cpuctx->ctx);
1476 if (ctx) 1603 if (ctx)
1477 perf_ctx_adjust_freq(ctx); 1604 perf_ctx_adjust_freq(ctx);
1478 1605
1479 perf_event_cpu_sched_out(cpuctx); 1606 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1480 if (ctx) 1607 if (ctx)
1481 __perf_event_task_sched_out(ctx); 1608 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
1482 1609
1483 rotate_ctx(&cpuctx->ctx); 1610 rotate_ctx(&cpuctx->ctx);
1484 if (ctx) 1611 if (ctx)
1485 rotate_ctx(ctx); 1612 rotate_ctx(ctx);
1486 1613
1487 perf_event_cpu_sched_in(cpuctx, cpu); 1614 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1488 if (ctx) 1615 if (ctx)
1489 perf_event_task_sched_in(curr, cpu); 1616 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1617
1618 perf_enable();
1619}
1620
1621static int event_enable_on_exec(struct perf_event *event,
1622 struct perf_event_context *ctx)
1623{
1624 if (!event->attr.enable_on_exec)
1625 return 0;
1626
1627 event->attr.enable_on_exec = 0;
1628 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1629 return 0;
1630
1631 __perf_event_mark_enabled(event, ctx);
1632
1633 return 1;
1490} 1634}
1491 1635
1492/* 1636/*
@@ -1499,6 +1643,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1499 struct perf_event *event; 1643 struct perf_event *event;
1500 unsigned long flags; 1644 unsigned long flags;
1501 int enabled = 0; 1645 int enabled = 0;
1646 int ret;
1502 1647
1503 local_irq_save(flags); 1648 local_irq_save(flags);
1504 ctx = task->perf_event_ctxp; 1649 ctx = task->perf_event_ctxp;
@@ -1509,14 +1654,16 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1509 1654
1510 raw_spin_lock(&ctx->lock); 1655 raw_spin_lock(&ctx->lock);
1511 1656
1512 list_for_each_entry(event, &ctx->group_list, group_entry) { 1657 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1513 if (!event->attr.enable_on_exec) 1658 ret = event_enable_on_exec(event, ctx);
1514 continue; 1659 if (ret)
1515 event->attr.enable_on_exec = 0; 1660 enabled = 1;
1516 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1661 }
1517 continue; 1662
1518 __perf_event_mark_enabled(event, ctx); 1663 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1519 enabled = 1; 1664 ret = event_enable_on_exec(event, ctx);
1665 if (ret)
1666 enabled = 1;
1520 } 1667 }
1521 1668
1522 /* 1669 /*
@@ -1527,7 +1674,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1527 1674
1528 raw_spin_unlock(&ctx->lock); 1675 raw_spin_unlock(&ctx->lock);
1529 1676
1530 perf_event_task_sched_in(task, smp_processor_id()); 1677 perf_event_task_sched_in(task);
1531 out: 1678 out:
1532 local_irq_restore(flags); 1679 local_irq_restore(flags);
1533} 1680}
@@ -1590,7 +1737,8 @@ __perf_event_init_context(struct perf_event_context *ctx,
1590{ 1737{
1591 raw_spin_lock_init(&ctx->lock); 1738 raw_spin_lock_init(&ctx->lock);
1592 mutex_init(&ctx->mutex); 1739 mutex_init(&ctx->mutex);
1593 INIT_LIST_HEAD(&ctx->group_list); 1740 INIT_LIST_HEAD(&ctx->pinned_groups);
1741 INIT_LIST_HEAD(&ctx->flexible_groups);
1594 INIT_LIST_HEAD(&ctx->event_list); 1742 INIT_LIST_HEAD(&ctx->event_list);
1595 atomic_set(&ctx->refcount, 1); 1743 atomic_set(&ctx->refcount, 1);
1596 ctx->task = task; 1744 ctx->task = task;
@@ -2462,7 +2610,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2462 if (user_locked > user_lock_limit) 2610 if (user_locked > user_lock_limit)
2463 extra = user_locked - user_lock_limit; 2611 extra = user_locked - user_lock_limit;
2464 2612
2465 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 2613 lock_limit = rlimit(RLIMIT_MEMLOCK);
2466 lock_limit >>= PAGE_SHIFT; 2614 lock_limit >>= PAGE_SHIFT;
2467 locked = vma->vm_mm->locked_vm + extra; 2615 locked = vma->vm_mm->locked_vm + extra;
2468 2616
@@ -3608,7 +3756,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3608 /* .tid */ 3756 /* .tid */
3609 .start = vma->vm_start, 3757 .start = vma->vm_start,
3610 .len = vma->vm_end - vma->vm_start, 3758 .len = vma->vm_end - vma->vm_start,
3611 .pgoff = vma->vm_pgoff, 3759 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
3612 }, 3760 },
3613 }; 3761 };
3614 3762
@@ -3688,12 +3836,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3688 3836
3689 if (event->attr.freq) { 3837 if (event->attr.freq) {
3690 u64 now = perf_clock(); 3838 u64 now = perf_clock();
3691 s64 delta = now - hwc->freq_stamp; 3839 s64 delta = now - hwc->freq_time_stamp;
3692 3840
3693 hwc->freq_stamp = now; 3841 hwc->freq_time_stamp = now;
3694 3842
3695 if (delta > 0 && delta < TICK_NSEC) 3843 if (delta > 0 && delta < 2*TICK_NSEC)
3696 perf_adjust_period(event, NSEC_PER_SEC / (int)delta); 3844 perf_adjust_period(event, delta, hwc->last_period);
3697 } 3845 }
3698 3846
3699 /* 3847 /*
@@ -4184,7 +4332,7 @@ static const struct pmu perf_ops_task_clock = {
4184 .read = task_clock_perf_event_read, 4332 .read = task_clock_perf_event_read,
4185}; 4333};
4186 4334
4187#ifdef CONFIG_EVENT_PROFILE 4335#ifdef CONFIG_EVENT_TRACING
4188 4336
4189void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4337void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4190 int entry_size) 4338 int entry_size)
@@ -4289,7 +4437,7 @@ static void perf_event_free_filter(struct perf_event *event)
4289{ 4437{
4290} 4438}
4291 4439
4292#endif /* CONFIG_EVENT_PROFILE */ 4440#endif /* CONFIG_EVENT_TRACING */
4293 4441
4294#ifdef CONFIG_HAVE_HW_BREAKPOINT 4442#ifdef CONFIG_HAVE_HW_BREAKPOINT
4295static void bp_perf_event_destroy(struct perf_event *event) 4443static void bp_perf_event_destroy(struct perf_event *event)
@@ -4870,8 +5018,15 @@ inherit_event(struct perf_event *parent_event,
4870 else 5018 else
4871 child_event->state = PERF_EVENT_STATE_OFF; 5019 child_event->state = PERF_EVENT_STATE_OFF;
4872 5020
4873 if (parent_event->attr.freq) 5021 if (parent_event->attr.freq) {
4874 child_event->hw.sample_period = parent_event->hw.sample_period; 5022 u64 sample_period = parent_event->hw.sample_period;
5023 struct hw_perf_event *hwc = &child_event->hw;
5024
5025 hwc->sample_period = sample_period;
5026 hwc->last_period = sample_period;
5027
5028 atomic64_set(&hwc->period_left, sample_period);
5029 }
4875 5030
4876 child_event->overflow_handler = parent_event->overflow_handler; 5031 child_event->overflow_handler = parent_event->overflow_handler;
4877 5032
@@ -5039,7 +5194,11 @@ void perf_event_exit_task(struct task_struct *child)
5039 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); 5194 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
5040 5195
5041again: 5196again:
5042 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, 5197 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
5198 group_entry)
5199 __perf_event_exit_task(child_event, child_ctx, child);
5200
5201 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
5043 group_entry) 5202 group_entry)
5044 __perf_event_exit_task(child_event, child_ctx, child); 5203 __perf_event_exit_task(child_event, child_ctx, child);
5045 5204
@@ -5048,7 +5207,8 @@ again:
5048 * its siblings to the list, but we obtained 'tmp' before that which 5207 * its siblings to the list, but we obtained 'tmp' before that which
5049 * will still point to the list head terminating the iteration. 5208 * will still point to the list head terminating the iteration.
5050 */ 5209 */
5051 if (!list_empty(&child_ctx->group_list)) 5210 if (!list_empty(&child_ctx->pinned_groups) ||
5211 !list_empty(&child_ctx->flexible_groups))
5052 goto again; 5212 goto again;
5053 5213
5054 mutex_unlock(&child_ctx->mutex); 5214 mutex_unlock(&child_ctx->mutex);
@@ -5056,6 +5216,24 @@ again:
5056 put_ctx(child_ctx); 5216 put_ctx(child_ctx);
5057} 5217}
5058 5218
5219static void perf_free_event(struct perf_event *event,
5220 struct perf_event_context *ctx)
5221{
5222 struct perf_event *parent = event->parent;
5223
5224 if (WARN_ON_ONCE(!parent))
5225 return;
5226
5227 mutex_lock(&parent->child_mutex);
5228 list_del_init(&event->child_list);
5229 mutex_unlock(&parent->child_mutex);
5230
5231 fput(parent->filp);
5232
5233 list_del_event(event, ctx);
5234 free_event(event);
5235}
5236
5059/* 5237/*
5060 * free an unexposed, unused context as created by inheritance by 5238 * free an unexposed, unused context as created by inheritance by
5061 * init_task below, used by fork() in case of fail. 5239 * init_task below, used by fork() in case of fail.
@@ -5070,36 +5248,70 @@ void perf_event_free_task(struct task_struct *task)
5070 5248
5071 mutex_lock(&ctx->mutex); 5249 mutex_lock(&ctx->mutex);
5072again: 5250again:
5073 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { 5251 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5074 struct perf_event *parent = event->parent; 5252 perf_free_event(event, ctx);
5075 5253
5076 if (WARN_ON_ONCE(!parent)) 5254 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5077 continue; 5255 group_entry)
5256 perf_free_event(event, ctx);
5078 5257
5079 mutex_lock(&parent->child_mutex); 5258 if (!list_empty(&ctx->pinned_groups) ||
5080 list_del_init(&event->child_list); 5259 !list_empty(&ctx->flexible_groups))
5081 mutex_unlock(&parent->child_mutex); 5260 goto again;
5082 5261
5083 fput(parent->filp); 5262 mutex_unlock(&ctx->mutex);
5084 5263
5085 list_del_event(event, ctx); 5264 put_ctx(ctx);
5086 free_event(event); 5265}
5266
5267static int
5268inherit_task_group(struct perf_event *event, struct task_struct *parent,
5269 struct perf_event_context *parent_ctx,
5270 struct task_struct *child,
5271 int *inherited_all)
5272{
5273 int ret;
5274 struct perf_event_context *child_ctx = child->perf_event_ctxp;
5275
5276 if (!event->attr.inherit) {
5277 *inherited_all = 0;
5278 return 0;
5087 } 5279 }
5088 5280
5089 if (!list_empty(&ctx->group_list)) 5281 if (!child_ctx) {
5090 goto again; 5282 /*
5283 * This is executed from the parent task context, so
5284 * inherit events that have been marked for cloning.
5285 * First allocate and initialize a context for the
5286 * child.
5287 */
5091 5288
5092 mutex_unlock(&ctx->mutex); 5289 child_ctx = kzalloc(sizeof(struct perf_event_context),
5290 GFP_KERNEL);
5291 if (!child_ctx)
5292 return -ENOMEM;
5093 5293
5094 put_ctx(ctx); 5294 __perf_event_init_context(child_ctx, child);
5295 child->perf_event_ctxp = child_ctx;
5296 get_task_struct(child);
5297 }
5298
5299 ret = inherit_group(event, parent, parent_ctx,
5300 child, child_ctx);
5301
5302 if (ret)
5303 *inherited_all = 0;
5304
5305 return ret;
5095} 5306}
5096 5307
5308
5097/* 5309/*
5098 * Initialize the perf_event context in task_struct 5310 * Initialize the perf_event context in task_struct
5099 */ 5311 */
5100int perf_event_init_task(struct task_struct *child) 5312int perf_event_init_task(struct task_struct *child)
5101{ 5313{
5102 struct perf_event_context *child_ctx = NULL, *parent_ctx; 5314 struct perf_event_context *child_ctx, *parent_ctx;
5103 struct perf_event_context *cloned_ctx; 5315 struct perf_event_context *cloned_ctx;
5104 struct perf_event *event; 5316 struct perf_event *event;
5105 struct task_struct *parent = current; 5317 struct task_struct *parent = current;
@@ -5137,41 +5349,22 @@ int perf_event_init_task(struct task_struct *child)
5137 * We dont have to disable NMIs - we are only looking at 5349 * We dont have to disable NMIs - we are only looking at
5138 * the list, not manipulating it: 5350 * the list, not manipulating it:
5139 */ 5351 */
5140 list_for_each_entry(event, &parent_ctx->group_list, group_entry) { 5352 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5141 5353 ret = inherit_task_group(event, parent, parent_ctx, child,
5142 if (!event->attr.inherit) { 5354 &inherited_all);
5143 inherited_all = 0; 5355 if (ret)
5144 continue; 5356 break;
5145 } 5357 }
5146
5147 if (!child->perf_event_ctxp) {
5148 /*
5149 * This is executed from the parent task context, so
5150 * inherit events that have been marked for cloning.
5151 * First allocate and initialize a context for the
5152 * child.
5153 */
5154
5155 child_ctx = kzalloc(sizeof(struct perf_event_context),
5156 GFP_KERNEL);
5157 if (!child_ctx) {
5158 ret = -ENOMEM;
5159 break;
5160 }
5161
5162 __perf_event_init_context(child_ctx, child);
5163 child->perf_event_ctxp = child_ctx;
5164 get_task_struct(child);
5165 }
5166 5358
5167 ret = inherit_group(event, parent, parent_ctx, 5359 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5168 child, child_ctx); 5360 ret = inherit_task_group(event, parent, parent_ctx, child,
5169 if (ret) { 5361 &inherited_all);
5170 inherited_all = 0; 5362 if (ret)
5171 break; 5363 break;
5172 }
5173 } 5364 }
5174 5365
5366 child_ctx = child->perf_event_ctxp;
5367
5175 if (child_ctx && inherited_all) { 5368 if (child_ctx && inherited_all) {
5176 /* 5369 /*
5177 * Mark the child context as a clone of the parent 5370 * Mark the child context as a clone of the parent
@@ -5220,7 +5413,9 @@ static void __perf_event_exit_cpu(void *info)
5220 struct perf_event_context *ctx = &cpuctx->ctx; 5413 struct perf_event_context *ctx = &cpuctx->ctx;
5221 struct perf_event *event, *tmp; 5414 struct perf_event *event, *tmp;
5222 5415
5223 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) 5416 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5417 __perf_event_remove_from_context(event);
5418 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5224 __perf_event_remove_from_context(event); 5419 __perf_event_remove_from_context(event);
5225} 5420}
5226static void perf_event_exit_cpu(int cpu) 5421static void perf_event_exit_cpu(int cpu)
@@ -5258,6 +5453,10 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5258 perf_event_exit_cpu(cpu); 5453 perf_event_exit_cpu(cpu);
5259 break; 5454 break;
5260 5455
5456 case CPU_DEAD:
5457 hw_perf_event_setup_offline(cpu);
5458 break;
5459
5261 default: 5460 default:
5262 break; 5461 break;
5263 } 5462 }
diff --git a/kernel/pid.c b/kernel/pid.c
index 2e17c9c92cbe..86b296943e5f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -367,7 +367,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
367 struct task_struct *result = NULL; 367 struct task_struct *result = NULL;
368 if (pid) { 368 if (pid) {
369 struct hlist_node *first; 369 struct hlist_node *first;
370 first = rcu_dereference(pid->tasks[type].first); 370 first = rcu_dereference_check(pid->tasks[type].first, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock));
371 if (first) 371 if (first)
372 result = hlist_entry(first, struct task_struct, pids[(type)].node); 372 result = hlist_entry(first, struct task_struct, pids[(type)].node);
373 } 373 }
@@ -376,7 +376,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
376EXPORT_SYMBOL(pid_task); 376EXPORT_SYMBOL(pid_task);
377 377
378/* 378/*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 379 * Must be called under rcu_read_lock().
380 */ 380 */
381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382{ 382{
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 438ff4523513..1a22dfd42df9 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -982,6 +982,7 @@ static void check_thread_timers(struct task_struct *tsk,
982 int maxfire; 982 int maxfire;
983 struct list_head *timers = tsk->cpu_timers; 983 struct list_head *timers = tsk->cpu_timers;
984 struct signal_struct *const sig = tsk->signal; 984 struct signal_struct *const sig = tsk->signal;
985 unsigned long soft;
985 986
986 maxfire = 20; 987 maxfire = 20;
987 tsk->cputime_expires.prof_exp = cputime_zero; 988 tsk->cputime_expires.prof_exp = cputime_zero;
@@ -1030,9 +1031,10 @@ static void check_thread_timers(struct task_struct *tsk,
1030 /* 1031 /*
1031 * Check for the special case thread timers. 1032 * Check for the special case thread timers.
1032 */ 1033 */
1033 if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { 1034 soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
1034 unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; 1035 if (soft != RLIM_INFINITY) {
1035 unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; 1036 unsigned long hard =
1037 ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
1036 1038
1037 if (hard != RLIM_INFINITY && 1039 if (hard != RLIM_INFINITY &&
1038 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { 1040 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -1043,14 +1045,13 @@ static void check_thread_timers(struct task_struct *tsk,
1043 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1045 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1044 return; 1046 return;
1045 } 1047 }
1046 if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { 1048 if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
1047 /* 1049 /*
1048 * At the soft limit, send a SIGXCPU every second. 1050 * At the soft limit, send a SIGXCPU every second.
1049 */ 1051 */
1050 if (sig->rlim[RLIMIT_RTTIME].rlim_cur 1052 if (soft < hard) {
1051 < sig->rlim[RLIMIT_RTTIME].rlim_max) { 1053 soft += USEC_PER_SEC;
1052 sig->rlim[RLIMIT_RTTIME].rlim_cur += 1054 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
1053 USEC_PER_SEC;
1054 } 1055 }
1055 printk(KERN_INFO 1056 printk(KERN_INFO
1056 "RT Watchdog Timeout: %s[%d]\n", 1057 "RT Watchdog Timeout: %s[%d]\n",
@@ -1121,6 +1122,7 @@ static void check_process_timers(struct task_struct *tsk,
1121 unsigned long long sum_sched_runtime, sched_expires; 1122 unsigned long long sum_sched_runtime, sched_expires;
1122 struct list_head *timers = sig->cpu_timers; 1123 struct list_head *timers = sig->cpu_timers;
1123 struct task_cputime cputime; 1124 struct task_cputime cputime;
1125 unsigned long soft;
1124 1126
1125 /* 1127 /*
1126 * Don't sample the current process CPU clocks if there are no timers. 1128 * Don't sample the current process CPU clocks if there are no timers.
@@ -1193,11 +1195,13 @@ static void check_process_timers(struct task_struct *tsk,
1193 SIGPROF); 1195 SIGPROF);
1194 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, 1196 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1195 SIGVTALRM); 1197 SIGVTALRM);
1196 1198 soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1197 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 1199 if (soft != RLIM_INFINITY) {
1198 unsigned long psecs = cputime_to_secs(ptime); 1200 unsigned long psecs = cputime_to_secs(ptime);
1201 unsigned long hard =
1202 ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
1199 cputime_t x; 1203 cputime_t x;
1200 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { 1204 if (psecs >= hard) {
1201 /* 1205 /*
1202 * At the hard limit, we just die. 1206 * At the hard limit, we just die.
1203 * No need to calculate anything else now. 1207 * No need to calculate anything else now.
@@ -1205,17 +1209,17 @@ static void check_process_timers(struct task_struct *tsk,
1205 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1209 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1206 return; 1210 return;
1207 } 1211 }
1208 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { 1212 if (psecs >= soft) {
1209 /* 1213 /*
1210 * At the soft limit, send a SIGXCPU every second. 1214 * At the soft limit, send a SIGXCPU every second.
1211 */ 1215 */
1212 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 1216 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1213 if (sig->rlim[RLIMIT_CPU].rlim_cur 1217 if (soft < hard) {
1214 < sig->rlim[RLIMIT_CPU].rlim_max) { 1218 soft++;
1215 sig->rlim[RLIMIT_CPU].rlim_cur++; 1219 sig->rlim[RLIMIT_CPU].rlim_cur = soft;
1216 } 1220 }
1217 } 1221 }
1218 x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 1222 x = secs_to_cputime(soft);
1219 if (cputime_eq(prof_expires, cputime_zero) || 1223 if (cputime_eq(prof_expires, cputime_zero) ||
1220 cputime_lt(x, prof_expires)) { 1224 cputime_lt(x, prof_expires)) {
1221 prof_expires = x; 1225 prof_expires = x;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 495440779ce3..00d1fda58ab6 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -256,7 +256,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
256 return 0; 256 return 0;
257} 257}
258 258
259int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) 259static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
260{ 260{
261 *tp = ktime_to_timespec(KTIME_LOW_RES); 261 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0; 262 return 0;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index bbfe472d7524..da5288ec2392 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -323,6 +323,7 @@ static int create_image(int platform_mode)
323int hibernation_snapshot(int platform_mode) 323int hibernation_snapshot(int platform_mode)
324{ 324{
325 int error; 325 int error;
326 gfp_t saved_mask;
326 327
327 error = platform_begin(platform_mode); 328 error = platform_begin(platform_mode);
328 if (error) 329 if (error)
@@ -334,6 +335,7 @@ int hibernation_snapshot(int platform_mode)
334 goto Close; 335 goto Close;
335 336
336 suspend_console(); 337 suspend_console();
338 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
337 error = dpm_suspend_start(PMSG_FREEZE); 339 error = dpm_suspend_start(PMSG_FREEZE);
338 if (error) 340 if (error)
339 goto Recover_platform; 341 goto Recover_platform;
@@ -351,6 +353,7 @@ int hibernation_snapshot(int platform_mode)
351 353
352 dpm_resume_end(in_suspend ? 354 dpm_resume_end(in_suspend ?
353 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 355 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
356 set_gfp_allowed_mask(saved_mask);
354 resume_console(); 357 resume_console();
355 Close: 358 Close:
356 platform_end(platform_mode); 359 platform_end(platform_mode);
@@ -445,14 +448,17 @@ static int resume_target_kernel(bool platform_mode)
445int hibernation_restore(int platform_mode) 448int hibernation_restore(int platform_mode)
446{ 449{
447 int error; 450 int error;
451 gfp_t saved_mask;
448 452
449 pm_prepare_console(); 453 pm_prepare_console();
450 suspend_console(); 454 suspend_console();
455 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
451 error = dpm_suspend_start(PMSG_QUIESCE); 456 error = dpm_suspend_start(PMSG_QUIESCE);
452 if (!error) { 457 if (!error) {
453 error = resume_target_kernel(platform_mode); 458 error = resume_target_kernel(platform_mode);
454 dpm_resume_end(PMSG_RECOVER); 459 dpm_resume_end(PMSG_RECOVER);
455 } 460 }
461 set_gfp_allowed_mask(saved_mask);
456 resume_console(); 462 resume_console();
457 pm_restore_console(); 463 pm_restore_console();
458 return error; 464 return error;
@@ -466,6 +472,7 @@ int hibernation_restore(int platform_mode)
466int hibernation_platform_enter(void) 472int hibernation_platform_enter(void)
467{ 473{
468 int error; 474 int error;
475 gfp_t saved_mask;
469 476
470 if (!hibernation_ops) 477 if (!hibernation_ops)
471 return -ENOSYS; 478 return -ENOSYS;
@@ -481,6 +488,7 @@ int hibernation_platform_enter(void)
481 488
482 entering_platform_hibernation = true; 489 entering_platform_hibernation = true;
483 suspend_console(); 490 suspend_console();
491 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
484 error = dpm_suspend_start(PMSG_HIBERNATE); 492 error = dpm_suspend_start(PMSG_HIBERNATE);
485 if (error) { 493 if (error) {
486 if (hibernation_ops->recover) 494 if (hibernation_ops->recover)
@@ -518,6 +526,7 @@ int hibernation_platform_enter(void)
518 Resume_devices: 526 Resume_devices:
519 entering_platform_hibernation = false; 527 entering_platform_hibernation = false;
520 dpm_resume_end(PMSG_RESTORE); 528 dpm_resume_end(PMSG_RESTORE);
529 set_gfp_allowed_mask(saved_mask);
521 resume_console(); 530 resume_console();
522 531
523 Close: 532 Close:
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6f10dfc2d3e9..44cce10b582d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -189,6 +189,7 @@ static int suspend_enter(suspend_state_t state)
189int suspend_devices_and_enter(suspend_state_t state) 189int suspend_devices_and_enter(suspend_state_t state)
190{ 190{
191 int error; 191 int error;
192 gfp_t saved_mask;
192 193
193 if (!suspend_ops) 194 if (!suspend_ops)
194 return -ENOSYS; 195 return -ENOSYS;
@@ -199,6 +200,7 @@ int suspend_devices_and_enter(suspend_state_t state)
199 goto Close; 200 goto Close;
200 } 201 }
201 suspend_console(); 202 suspend_console();
203 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
202 suspend_test_start(); 204 suspend_test_start();
203 error = dpm_suspend_start(PMSG_SUSPEND); 205 error = dpm_suspend_start(PMSG_SUSPEND);
204 if (error) { 206 if (error) {
@@ -215,6 +217,7 @@ int suspend_devices_and_enter(suspend_state_t state)
215 suspend_test_start(); 217 suspend_test_start();
216 dpm_resume_end(PMSG_RESUME); 218 dpm_resume_end(PMSG_RESUME);
217 suspend_test_finish("resume devices"); 219 suspend_test_finish("resume devices");
220 set_gfp_allowed_mask(saved_mask);
218 resume_console(); 221 resume_console();
219 Close: 222 Close:
220 if (suspend_ops->end) 223 if (suspend_ops->end)
diff --git a/kernel/printk.c b/kernel/printk.c
index 1751c456b71f..75077ad0b537 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -35,6 +35,7 @@
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h> 36#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h> 37#include <linux/kmsg_dump.h>
38#include <linux/syslog.h>
38 39
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40 41
@@ -69,8 +70,6 @@ int console_printk[4] = {
69 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 70 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
70}; 71};
71 72
72static int saved_console_loglevel = -1;
73
74/* 73/*
75 * Low level drivers may need that to know if they can schedule in 74 * Low level drivers may need that to know if they can schedule in
76 * their unblank() callback or not. So let's export it. 75 * their unblank() callback or not. So let's export it.
@@ -145,6 +144,7 @@ static char __log_buf[__LOG_BUF_LEN];
145static char *log_buf = __log_buf; 144static char *log_buf = __log_buf;
146static int log_buf_len = __LOG_BUF_LEN; 145static int log_buf_len = __LOG_BUF_LEN;
147static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 146static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
147static int saved_console_loglevel = -1;
148 148
149#ifdef CONFIG_KEXEC 149#ifdef CONFIG_KEXEC
150/* 150/*
@@ -258,38 +258,23 @@ static inline void boot_delay_msec(void)
258} 258}
259#endif 259#endif
260 260
261/* 261int do_syslog(int type, char __user *buf, int len, bool from_file)
262 * Commands to do_syslog:
263 *
264 * 0 -- Close the log. Currently a NOP.
265 * 1 -- Open the log. Currently a NOP.
266 * 2 -- Read from the log.
267 * 3 -- Read all messages remaining in the ring buffer.
268 * 4 -- Read and clear all messages remaining in the ring buffer
269 * 5 -- Clear ring buffer.
270 * 6 -- Disable printk's to console
271 * 7 -- Enable printk's to console
272 * 8 -- Set level of messages printed to console
273 * 9 -- Return number of unread characters in the log buffer
274 * 10 -- Return size of the log buffer
275 */
276int do_syslog(int type, char __user *buf, int len)
277{ 262{
278 unsigned i, j, limit, count; 263 unsigned i, j, limit, count;
279 int do_clear = 0; 264 int do_clear = 0;
280 char c; 265 char c;
281 int error = 0; 266 int error = 0;
282 267
283 error = security_syslog(type); 268 error = security_syslog(type, from_file);
284 if (error) 269 if (error)
285 return error; 270 return error;
286 271
287 switch (type) { 272 switch (type) {
288 case 0: /* Close log */ 273 case SYSLOG_ACTION_CLOSE: /* Close log */
289 break; 274 break;
290 case 1: /* Open log */ 275 case SYSLOG_ACTION_OPEN: /* Open log */
291 break; 276 break;
292 case 2: /* Read from log */ 277 case SYSLOG_ACTION_READ: /* Read from log */
293 error = -EINVAL; 278 error = -EINVAL;
294 if (!buf || len < 0) 279 if (!buf || len < 0)
295 goto out; 280 goto out;
@@ -320,10 +305,12 @@ int do_syslog(int type, char __user *buf, int len)
320 if (!error) 305 if (!error)
321 error = i; 306 error = i;
322 break; 307 break;
323 case 4: /* Read/clear last kernel messages */ 308 /* Read/clear last kernel messages */
309 case SYSLOG_ACTION_READ_CLEAR:
324 do_clear = 1; 310 do_clear = 1;
325 /* FALL THRU */ 311 /* FALL THRU */
326 case 3: /* Read last kernel messages */ 312 /* Read last kernel messages */
313 case SYSLOG_ACTION_READ_ALL:
327 error = -EINVAL; 314 error = -EINVAL;
328 if (!buf || len < 0) 315 if (!buf || len < 0)
329 goto out; 316 goto out;
@@ -376,21 +363,25 @@ int do_syslog(int type, char __user *buf, int len)
376 } 363 }
377 } 364 }
378 break; 365 break;
379 case 5: /* Clear ring buffer */ 366 /* Clear ring buffer */
367 case SYSLOG_ACTION_CLEAR:
380 logged_chars = 0; 368 logged_chars = 0;
381 break; 369 break;
382 case 6: /* Disable logging to console */ 370 /* Disable logging to console */
371 case SYSLOG_ACTION_CONSOLE_OFF:
383 if (saved_console_loglevel == -1) 372 if (saved_console_loglevel == -1)
384 saved_console_loglevel = console_loglevel; 373 saved_console_loglevel = console_loglevel;
385 console_loglevel = minimum_console_loglevel; 374 console_loglevel = minimum_console_loglevel;
386 break; 375 break;
387 case 7: /* Enable logging to console */ 376 /* Enable logging to console */
377 case SYSLOG_ACTION_CONSOLE_ON:
388 if (saved_console_loglevel != -1) { 378 if (saved_console_loglevel != -1) {
389 console_loglevel = saved_console_loglevel; 379 console_loglevel = saved_console_loglevel;
390 saved_console_loglevel = -1; 380 saved_console_loglevel = -1;
391 } 381 }
392 break; 382 break;
393 case 8: /* Set level of messages printed to console */ 383 /* Set level of messages printed to console */
384 case SYSLOG_ACTION_CONSOLE_LEVEL:
394 error = -EINVAL; 385 error = -EINVAL;
395 if (len < 1 || len > 8) 386 if (len < 1 || len > 8)
396 goto out; 387 goto out;
@@ -401,10 +392,12 @@ int do_syslog(int type, char __user *buf, int len)
401 saved_console_loglevel = -1; 392 saved_console_loglevel = -1;
402 error = 0; 393 error = 0;
403 break; 394 break;
404 case 9: /* Number of chars in the log buffer */ 395 /* Number of chars in the log buffer */
396 case SYSLOG_ACTION_SIZE_UNREAD:
405 error = log_end - log_start; 397 error = log_end - log_start;
406 break; 398 break;
407 case 10: /* Size of the log buffer */ 399 /* Size of the log buffer */
400 case SYSLOG_ACTION_SIZE_BUFFER:
408 error = log_buf_len; 401 error = log_buf_len;
409 break; 402 break;
410 default: 403 default:
@@ -417,7 +410,7 @@ out:
417 410
418SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 411SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
419{ 412{
420 return do_syslog(type, buf, len); 413 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
421} 414}
422 415
423/* 416/*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 23bd09cd042e..42ad8ae729a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/regset.h>
25 26
26 27
27/* 28/*
@@ -511,6 +512,47 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
511 return 0; 512 return 0;
512} 513}
513 514
515#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
516
517static const struct user_regset *
518find_regset(const struct user_regset_view *view, unsigned int type)
519{
520 const struct user_regset *regset;
521 int n;
522
523 for (n = 0; n < view->n; ++n) {
524 regset = view->regsets + n;
525 if (regset->core_note_type == type)
526 return regset;
527 }
528
529 return NULL;
530}
531
532static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
533 struct iovec *kiov)
534{
535 const struct user_regset_view *view = task_user_regset_view(task);
536 const struct user_regset *regset = find_regset(view, type);
537 int regset_no;
538
539 if (!regset || (kiov->iov_len % regset->size) != 0)
540 return -EINVAL;
541
542 regset_no = regset - view->regsets;
543 kiov->iov_len = min(kiov->iov_len,
544 (__kernel_size_t) (regset->n * regset->size));
545
546 if (req == PTRACE_GETREGSET)
547 return copy_regset_to_user(task, view, regset_no, 0,
548 kiov->iov_len, kiov->iov_base);
549 else
550 return copy_regset_from_user(task, view, regset_no, 0,
551 kiov->iov_len, kiov->iov_base);
552}
553
554#endif
555
514int ptrace_request(struct task_struct *child, long request, 556int ptrace_request(struct task_struct *child, long request,
515 long addr, long data) 557 long addr, long data)
516{ 558{
@@ -573,6 +615,26 @@ int ptrace_request(struct task_struct *child, long request,
573 return 0; 615 return 0;
574 return ptrace_resume(child, request, SIGKILL); 616 return ptrace_resume(child, request, SIGKILL);
575 617
618#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
619 case PTRACE_GETREGSET:
620 case PTRACE_SETREGSET:
621 {
622 struct iovec kiov;
623 struct iovec __user *uiov = (struct iovec __user *) data;
624
625 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
626 return -EFAULT;
627
628 if (__get_user(kiov.iov_base, &uiov->iov_base) ||
629 __get_user(kiov.iov_len, &uiov->iov_len))
630 return -EFAULT;
631
632 ret = ptrace_regset(child, request, addr, &kiov);
633 if (!ret)
634 ret = __put_user(kiov.iov_len, &uiov->iov_len);
635 break;
636 }
637#endif
576 default: 638 default:
577 break; 639 break;
578 } 640 }
@@ -711,6 +773,32 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
711 else 773 else
712 ret = ptrace_setsiginfo(child, &siginfo); 774 ret = ptrace_setsiginfo(child, &siginfo);
713 break; 775 break;
776#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
777 case PTRACE_GETREGSET:
778 case PTRACE_SETREGSET:
779 {
780 struct iovec kiov;
781 struct compat_iovec __user *uiov =
782 (struct compat_iovec __user *) datap;
783 compat_uptr_t ptr;
784 compat_size_t len;
785
786 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
787 return -EFAULT;
788
789 if (__get_user(ptr, &uiov->iov_base) ||
790 __get_user(len, &uiov->iov_len))
791 return -EFAULT;
792
793 kiov.iov_base = compat_ptr(ptr);
794 kiov.iov_len = len;
795
796 ret = ptrace_regset(child, request, addr, &kiov);
797 if (!ret)
798 ret = __put_user(kiov.iov_len, &uiov->iov_len);
799 break;
800 }
801#endif
714 802
715 default: 803 default:
716 ret = ptrace_request(child, request, addr, data); 804 ret = ptrace_request(child, request, addr, data);
diff --git a/kernel/range.c b/kernel/range.c
new file mode 100644
index 000000000000..74e2e6114927
--- /dev/null
+++ b/kernel/range.c
@@ -0,0 +1,163 @@
1/*
2 * Range add and subtract
3 */
4#include <linux/module.h>
5#include <linux/init.h>
6#include <linux/sort.h>
7
8#include <linux/range.h>
9
10#ifndef ARRAY_SIZE
11#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
12#endif
13
14int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
15{
16 if (start >= end)
17 return nr_range;
18
19 /* Out of slots: */
20 if (nr_range >= az)
21 return nr_range;
22
23 range[nr_range].start = start;
24 range[nr_range].end = end;
25
26 nr_range++;
27
28 return nr_range;
29}
30
31int add_range_with_merge(struct range *range, int az, int nr_range,
32 u64 start, u64 end)
33{
34 int i;
35
36 if (start >= end)
37 return nr_range;
38
39 /* Try to merge it with old one: */
40 for (i = 0; i < nr_range; i++) {
41 u64 final_start, final_end;
42 u64 common_start, common_end;
43
44 if (!range[i].end)
45 continue;
46
47 common_start = max(range[i].start, start);
48 common_end = min(range[i].end, end);
49 if (common_start > common_end)
50 continue;
51
52 final_start = min(range[i].start, start);
53 final_end = max(range[i].end, end);
54
55 range[i].start = final_start;
56 range[i].end = final_end;
57 return nr_range;
58 }
59
60 /* Need to add it: */
61 return add_range(range, az, nr_range, start, end);
62}
63
64void subtract_range(struct range *range, int az, u64 start, u64 end)
65{
66 int i, j;
67
68 if (start >= end)
69 return;
70
71 for (j = 0; j < az; j++) {
72 if (!range[j].end)
73 continue;
74
75 if (start <= range[j].start && end >= range[j].end) {
76 range[j].start = 0;
77 range[j].end = 0;
78 continue;
79 }
80
81 if (start <= range[j].start && end < range[j].end &&
82 range[j].start < end) {
83 range[j].start = end;
84 continue;
85 }
86
87
88 if (start > range[j].start && end >= range[j].end &&
89 range[j].end > start) {
90 range[j].end = start;
91 continue;
92 }
93
94 if (start > range[j].start && end < range[j].end) {
95 /* Find the new spare: */
96 for (i = 0; i < az; i++) {
97 if (range[i].end == 0)
98 break;
99 }
100 if (i < az) {
101 range[i].end = range[j].end;
102 range[i].start = end;
103 } else {
104 printk(KERN_ERR "run of slot in ranges\n");
105 }
106 range[j].end = start;
107 continue;
108 }
109 }
110}
111
112static int cmp_range(const void *x1, const void *x2)
113{
114 const struct range *r1 = x1;
115 const struct range *r2 = x2;
116 s64 start1, start2;
117
118 start1 = r1->start;
119 start2 = r2->start;
120
121 return start1 - start2;
122}
123
124int clean_sort_range(struct range *range, int az)
125{
126 int i, j, k = az - 1, nr_range = 0;
127
128 for (i = 0; i < k; i++) {
129 if (range[i].end)
130 continue;
131 for (j = k; j > i; j--) {
132 if (range[j].end) {
133 k = j;
134 break;
135 }
136 }
137 if (j == i)
138 break;
139 range[i].start = range[k].start;
140 range[i].end = range[k].end;
141 range[k].start = 0;
142 range[k].end = 0;
143 k--;
144 }
145 /* count it */
146 for (i = 0; i < az; i++) {
147 if (!range[i].end) {
148 nr_range = i;
149 break;
150 }
151 }
152
153 /* sort them */
154 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
155
156 return nr_range;
157}
158
159void sort_range(struct range *range, int nr_range)
160{
161 /* sort them */
162 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
163}
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 9b7fd4723878..f1125c1a6321 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,14 +44,43 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
47 48
48#ifdef CONFIG_DEBUG_LOCK_ALLOC 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
49static struct lock_class_key rcu_lock_key; 50static struct lock_class_key rcu_lock_key;
50struct lockdep_map rcu_lock_map = 51struct lockdep_map rcu_lock_map =
51 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); 52 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
52EXPORT_SYMBOL_GPL(rcu_lock_map); 53EXPORT_SYMBOL_GPL(rcu_lock_map);
54
55static struct lock_class_key rcu_bh_lock_key;
56struct lockdep_map rcu_bh_lock_map =
57 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key);
58EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
59
60static struct lock_class_key rcu_sched_lock_key;
61struct lockdep_map rcu_sched_lock_map =
62 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
63EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
53#endif 64#endif
54 65
66int rcu_scheduler_active __read_mostly;
67EXPORT_SYMBOL_GPL(rcu_scheduler_active);
68
69/*
70 * This function is invoked towards the end of the scheduler's initialization
71 * process. Before this is called, the idle task might contain
72 * RCU read-side critical sections (during which time, this idle
73 * task is booting the system). After this function is called, the
74 * idle tasks are prohibited from containing RCU read-side critical
75 * sections.
76 */
77void rcu_scheduler_starting(void)
78{
79 WARN_ON(num_online_cpus() != 1);
80 WARN_ON(nr_context_switches() > 0);
81 rcu_scheduler_active = 1;
82}
83
55/* 84/*
56 * Awaken the corresponding synchronize_rcu() instance now that a 85 * Awaken the corresponding synchronize_rcu() instance now that a
57 * grace period has elapsed. 86 * grace period has elapsed.
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9bb52177af02..58df55bf83ed 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -61,6 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */
64static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 67static char *torture_type = "rcu"; /* What RCU implementation to torture. */
65 68
66module_param(nreaders, int, 0444); 69module_param(nreaders, int, 0444);
@@ -79,6 +82,12 @@ module_param(stutter, int, 0444);
79MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); 82MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
80module_param(irqreader, int, 0444); 83module_param(irqreader, int, 0444);
81MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); 84MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
85module_param(fqs_duration, int, 0444);
86MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)");
87module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
82module_param(torture_type, charp, 0444); 91module_param(torture_type, charp, 0444);
83MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
84 93
@@ -99,6 +108,7 @@ static struct task_struct **reader_tasks;
99static struct task_struct *stats_task; 108static struct task_struct *stats_task;
100static struct task_struct *shuffler_task; 109static struct task_struct *shuffler_task;
101static struct task_struct *stutter_task; 110static struct task_struct *stutter_task;
111static struct task_struct *fqs_task;
102 112
103#define RCU_TORTURE_PIPE_LEN 10 113#define RCU_TORTURE_PIPE_LEN 10
104 114
@@ -263,6 +273,7 @@ struct rcu_torture_ops {
263 void (*deferred_free)(struct rcu_torture *p); 273 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 274 void (*sync)(void);
265 void (*cb_barrier)(void); 275 void (*cb_barrier)(void);
276 void (*fqs)(void);
266 int (*stats)(char *page); 277 int (*stats)(char *page);
267 int irq_capable; 278 int irq_capable;
268 char *name; 279 char *name;
@@ -347,6 +358,7 @@ static struct rcu_torture_ops rcu_ops = {
347 .deferred_free = rcu_torture_deferred_free, 358 .deferred_free = rcu_torture_deferred_free,
348 .sync = synchronize_rcu, 359 .sync = synchronize_rcu,
349 .cb_barrier = rcu_barrier, 360 .cb_barrier = rcu_barrier,
361 .fqs = rcu_force_quiescent_state,
350 .stats = NULL, 362 .stats = NULL,
351 .irq_capable = 1, 363 .irq_capable = 1,
352 .name = "rcu" 364 .name = "rcu"
@@ -388,6 +400,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
388 .deferred_free = rcu_sync_torture_deferred_free, 400 .deferred_free = rcu_sync_torture_deferred_free,
389 .sync = synchronize_rcu, 401 .sync = synchronize_rcu,
390 .cb_barrier = NULL, 402 .cb_barrier = NULL,
403 .fqs = rcu_force_quiescent_state,
391 .stats = NULL, 404 .stats = NULL,
392 .irq_capable = 1, 405 .irq_capable = 1,
393 .name = "rcu_sync" 406 .name = "rcu_sync"
@@ -403,6 +416,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
403 .deferred_free = rcu_sync_torture_deferred_free, 416 .deferred_free = rcu_sync_torture_deferred_free,
404 .sync = synchronize_rcu_expedited, 417 .sync = synchronize_rcu_expedited,
405 .cb_barrier = NULL, 418 .cb_barrier = NULL,
419 .fqs = rcu_force_quiescent_state,
406 .stats = NULL, 420 .stats = NULL,
407 .irq_capable = 1, 421 .irq_capable = 1,
408 .name = "rcu_expedited" 422 .name = "rcu_expedited"
@@ -465,6 +479,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
465 .deferred_free = rcu_bh_torture_deferred_free, 479 .deferred_free = rcu_bh_torture_deferred_free,
466 .sync = rcu_bh_torture_synchronize, 480 .sync = rcu_bh_torture_synchronize,
467 .cb_barrier = rcu_barrier_bh, 481 .cb_barrier = rcu_barrier_bh,
482 .fqs = rcu_bh_force_quiescent_state,
468 .stats = NULL, 483 .stats = NULL,
469 .irq_capable = 1, 484 .irq_capable = 1,
470 .name = "rcu_bh" 485 .name = "rcu_bh"
@@ -480,6 +495,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
480 .deferred_free = rcu_sync_torture_deferred_free, 495 .deferred_free = rcu_sync_torture_deferred_free,
481 .sync = rcu_bh_torture_synchronize, 496 .sync = rcu_bh_torture_synchronize,
482 .cb_barrier = NULL, 497 .cb_barrier = NULL,
498 .fqs = rcu_bh_force_quiescent_state,
483 .stats = NULL, 499 .stats = NULL,
484 .irq_capable = 1, 500 .irq_capable = 1,
485 .name = "rcu_bh_sync" 501 .name = "rcu_bh_sync"
@@ -621,6 +637,7 @@ static struct rcu_torture_ops sched_ops = {
621 .deferred_free = rcu_sched_torture_deferred_free, 637 .deferred_free = rcu_sched_torture_deferred_free,
622 .sync = sched_torture_synchronize, 638 .sync = sched_torture_synchronize,
623 .cb_barrier = rcu_barrier_sched, 639 .cb_barrier = rcu_barrier_sched,
640 .fqs = rcu_sched_force_quiescent_state,
624 .stats = NULL, 641 .stats = NULL,
625 .irq_capable = 1, 642 .irq_capable = 1,
626 .name = "sched" 643 .name = "sched"
@@ -636,6 +653,7 @@ static struct rcu_torture_ops sched_sync_ops = {
636 .deferred_free = rcu_sync_torture_deferred_free, 653 .deferred_free = rcu_sync_torture_deferred_free,
637 .sync = sched_torture_synchronize, 654 .sync = sched_torture_synchronize,
638 .cb_barrier = NULL, 655 .cb_barrier = NULL,
656 .fqs = rcu_sched_force_quiescent_state,
639 .stats = NULL, 657 .stats = NULL,
640 .name = "sched_sync" 658 .name = "sched_sync"
641}; 659};
@@ -650,12 +668,45 @@ static struct rcu_torture_ops sched_expedited_ops = {
650 .deferred_free = rcu_sync_torture_deferred_free, 668 .deferred_free = rcu_sync_torture_deferred_free,
651 .sync = synchronize_sched_expedited, 669 .sync = synchronize_sched_expedited,
652 .cb_barrier = NULL, 670 .cb_barrier = NULL,
671 .fqs = rcu_sched_force_quiescent_state,
653 .stats = rcu_expedited_torture_stats, 672 .stats = rcu_expedited_torture_stats,
654 .irq_capable = 1, 673 .irq_capable = 1,
655 .name = "sched_expedited" 674 .name = "sched_expedited"
656}; 675};
657 676
658/* 677/*
678 * RCU torture force-quiescent-state kthread. Repeatedly induces
679 * bursts of calls to force_quiescent_state(), increasing the probability
680 * of occurrence of some important types of race conditions.
681 */
682static int
683rcu_torture_fqs(void *arg)
684{
685 unsigned long fqs_resume_time;
686 int fqs_burst_remaining;
687
688 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
689 do {
690 fqs_resume_time = jiffies + fqs_stutter * HZ;
691 while (jiffies - fqs_resume_time > LONG_MAX) {
692 schedule_timeout_interruptible(1);
693 }
694 fqs_burst_remaining = fqs_duration;
695 while (fqs_burst_remaining > 0) {
696 cur_ops->fqs();
697 udelay(fqs_holdoff);
698 fqs_burst_remaining -= fqs_holdoff;
699 }
700 rcu_stutter_wait("rcu_torture_fqs");
701 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
702 VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping");
703 rcutorture_shutdown_absorb("rcu_torture_fqs");
704 while (!kthread_should_stop())
705 schedule_timeout_uninterruptible(1);
706 return 0;
707}
708
709/*
659 * RCU torture writer kthread. Repeatedly substitutes a new structure 710 * RCU torture writer kthread. Repeatedly substitutes a new structure
660 * for that pointed to by rcu_torture_current, freeing the old structure 711 * for that pointed to by rcu_torture_current, freeing the old structure
661 * after a series of grace periods (the "pipeline"). 712 * after a series of grace periods (the "pipeline").
@@ -745,7 +796,11 @@ static void rcu_torture_timer(unsigned long unused)
745 796
746 idx = cur_ops->readlock(); 797 idx = cur_ops->readlock();
747 completed = cur_ops->completed(); 798 completed = cur_ops->completed();
748 p = rcu_dereference(rcu_torture_current); 799 p = rcu_dereference_check(rcu_torture_current,
800 rcu_read_lock_held() ||
801 rcu_read_lock_bh_held() ||
802 rcu_read_lock_sched_held() ||
803 srcu_read_lock_held(&srcu_ctl));
749 if (p == NULL) { 804 if (p == NULL) {
750 /* Leave because rcu_torture_writer is not yet underway */ 805 /* Leave because rcu_torture_writer is not yet underway */
751 cur_ops->readunlock(idx); 806 cur_ops->readunlock(idx);
@@ -763,13 +818,13 @@ static void rcu_torture_timer(unsigned long unused)
763 /* Should not happen, but... */ 818 /* Should not happen, but... */
764 pipe_count = RCU_TORTURE_PIPE_LEN; 819 pipe_count = RCU_TORTURE_PIPE_LEN;
765 } 820 }
766 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 821 __this_cpu_inc(rcu_torture_count[pipe_count]);
767 completed = cur_ops->completed() - completed; 822 completed = cur_ops->completed() - completed;
768 if (completed > RCU_TORTURE_PIPE_LEN) { 823 if (completed > RCU_TORTURE_PIPE_LEN) {
769 /* Should not happen, but... */ 824 /* Should not happen, but... */
770 completed = RCU_TORTURE_PIPE_LEN; 825 completed = RCU_TORTURE_PIPE_LEN;
771 } 826 }
772 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 827 __this_cpu_inc(rcu_torture_batch[completed]);
773 preempt_enable(); 828 preempt_enable();
774 cur_ops->readunlock(idx); 829 cur_ops->readunlock(idx);
775} 830}
@@ -798,11 +853,15 @@ rcu_torture_reader(void *arg)
798 do { 853 do {
799 if (irqreader && cur_ops->irq_capable) { 854 if (irqreader && cur_ops->irq_capable) {
800 if (!timer_pending(&t)) 855 if (!timer_pending(&t))
801 mod_timer(&t, 1); 856 mod_timer(&t, jiffies + 1);
802 } 857 }
803 idx = cur_ops->readlock(); 858 idx = cur_ops->readlock();
804 completed = cur_ops->completed(); 859 completed = cur_ops->completed();
805 p = rcu_dereference(rcu_torture_current); 860 p = rcu_dereference_check(rcu_torture_current,
861 rcu_read_lock_held() ||
862 rcu_read_lock_bh_held() ||
863 rcu_read_lock_sched_held() ||
864 srcu_read_lock_held(&srcu_ctl));
806 if (p == NULL) { 865 if (p == NULL) {
807 /* Wait for rcu_torture_writer to get underway */ 866 /* Wait for rcu_torture_writer to get underway */
808 cur_ops->readunlock(idx); 867 cur_ops->readunlock(idx);
@@ -818,13 +877,13 @@ rcu_torture_reader(void *arg)
818 /* Should not happen, but... */ 877 /* Should not happen, but... */
819 pipe_count = RCU_TORTURE_PIPE_LEN; 878 pipe_count = RCU_TORTURE_PIPE_LEN;
820 } 879 }
821 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 880 __this_cpu_inc(rcu_torture_count[pipe_count]);
822 completed = cur_ops->completed() - completed; 881 completed = cur_ops->completed() - completed;
823 if (completed > RCU_TORTURE_PIPE_LEN) { 882 if (completed > RCU_TORTURE_PIPE_LEN) {
824 /* Should not happen, but... */ 883 /* Should not happen, but... */
825 completed = RCU_TORTURE_PIPE_LEN; 884 completed = RCU_TORTURE_PIPE_LEN;
826 } 885 }
827 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 886 __this_cpu_inc(rcu_torture_batch[completed]);
828 preempt_enable(); 887 preempt_enable();
829 cur_ops->readunlock(idx); 888 cur_ops->readunlock(idx);
830 schedule(); 889 schedule();
@@ -1030,10 +1089,11 @@ rcu_torture_print_module_parms(char *tag)
1030 printk(KERN_ALERT "%s" TORTURE_FLAG 1089 printk(KERN_ALERT "%s" TORTURE_FLAG
1031 "--- %s: nreaders=%d nfakewriters=%d " 1090 "--- %s: nreaders=%d nfakewriters=%d "
1032 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1091 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1033 "shuffle_interval=%d stutter=%d irqreader=%d\n", 1092 "shuffle_interval=%d stutter=%d irqreader=%d "
1093 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
1034 torture_type, tag, nrealreaders, nfakewriters, 1094 torture_type, tag, nrealreaders, nfakewriters,
1035 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1095 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1036 stutter, irqreader); 1096 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
1037} 1097}
1038 1098
1039static struct notifier_block rcutorture_nb = { 1099static struct notifier_block rcutorture_nb = {
@@ -1109,6 +1169,12 @@ rcu_torture_cleanup(void)
1109 } 1169 }
1110 stats_task = NULL; 1170 stats_task = NULL;
1111 1171
1172 if (fqs_task) {
1173 VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
1174 kthread_stop(fqs_task);
1175 }
1176 fqs_task = NULL;
1177
1112 /* Wait for all RCU callbacks to fire. */ 1178 /* Wait for all RCU callbacks to fire. */
1113 1179
1114 if (cur_ops->cb_barrier != NULL) 1180 if (cur_ops->cb_barrier != NULL)
@@ -1154,6 +1220,11 @@ rcu_torture_init(void)
1154 mutex_unlock(&fullstop_mutex); 1220 mutex_unlock(&fullstop_mutex);
1155 return -EINVAL; 1221 return -EINVAL;
1156 } 1222 }
1223 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1224 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
1225 "fqs_duration, fqs disabled.\n");
1226 fqs_duration = 0;
1227 }
1157 if (cur_ops->init) 1228 if (cur_ops->init)
1158 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1229 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
1159 1230
@@ -1282,6 +1353,19 @@ rcu_torture_init(void)
1282 goto unwind; 1353 goto unwind;
1283 } 1354 }
1284 } 1355 }
1356 if (fqs_duration < 0)
1357 fqs_duration = 0;
1358 if (fqs_duration) {
1359 /* Create the stutter thread */
1360 fqs_task = kthread_run(rcu_torture_fqs, NULL,
1361 "rcu_torture_fqs");
1362 if (IS_ERR(fqs_task)) {
1363 firsterr = PTR_ERR(fqs_task);
1364 VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
1365 fqs_task = NULL;
1366 goto unwind;
1367 }
1368 }
1285 register_reboot_notifier(&rcutorture_nb); 1369 register_reboot_notifier(&rcutorture_nb);
1286 mutex_unlock(&fullstop_mutex); 1370 mutex_unlock(&fullstop_mutex);
1287 return 0; 1371 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 53ae9598f798..3ec8160fc75f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,7 +46,6 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
50 49
51#include "rcutree.h" 50#include "rcutree.h"
52 51
@@ -66,11 +65,11 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
66 .signaled = RCU_GP_IDLE, \ 65 .signaled = RCU_GP_IDLE, \
67 .gpnum = -300, \ 66 .gpnum = -300, \
68 .completed = -300, \ 67 .completed = -300, \
69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ 68 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \
70 .orphan_cbs_list = NULL, \ 69 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &name.orphan_cbs_list, \ 70 .orphan_cbs_tail = &name.orphan_cbs_list, \
72 .orphan_qlen = 0, \ 71 .orphan_qlen = 0, \
73 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ 72 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \
74 .n_force_qs = 0, \ 73 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 74 .n_force_qs_ngp = 0, \
76} 75}
@@ -81,9 +80,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 80struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 81DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
83 82
84static int rcu_scheduler_active __read_mostly;
85
86
87/* 83/*
88 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 84 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
89 * permit this function to be invoked without holding the root rcu_node 85 * permit this function to be invoked without holding the root rcu_node
@@ -157,6 +153,24 @@ long rcu_batches_completed_bh(void)
157EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); 153EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
158 154
159/* 155/*
156 * Force a quiescent state for RCU BH.
157 */
158void rcu_bh_force_quiescent_state(void)
159{
160 force_quiescent_state(&rcu_bh_state, 0);
161}
162EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
163
164/*
165 * Force a quiescent state for RCU-sched.
166 */
167void rcu_sched_force_quiescent_state(void)
168{
169 force_quiescent_state(&rcu_sched_state, 0);
170}
171EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
172
173/*
160 * Does the CPU have callbacks ready to be invoked? 174 * Does the CPU have callbacks ready to be invoked?
161 */ 175 */
162static int 176static int
@@ -439,10 +453,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
439 453
440 /* Only let one CPU complain about others per time interval. */ 454 /* Only let one CPU complain about others per time interval. */
441 455
442 spin_lock_irqsave(&rnp->lock, flags); 456 raw_spin_lock_irqsave(&rnp->lock, flags);
443 delta = jiffies - rsp->jiffies_stall; 457 delta = jiffies - rsp->jiffies_stall;
444 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 458 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
445 spin_unlock_irqrestore(&rnp->lock, flags); 459 raw_spin_unlock_irqrestore(&rnp->lock, flags);
446 return; 460 return;
447 } 461 }
448 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 462 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
@@ -452,13 +466,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
452 * due to CPU offlining. 466 * due to CPU offlining.
453 */ 467 */
454 rcu_print_task_stall(rnp); 468 rcu_print_task_stall(rnp);
455 spin_unlock_irqrestore(&rnp->lock, flags); 469 raw_spin_unlock_irqrestore(&rnp->lock, flags);
456 470
457 /* OK, time to rat on our buddy... */ 471 /* OK, time to rat on our buddy... */
458 472
459 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 473 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
460 rcu_for_each_leaf_node(rsp, rnp) { 474 rcu_for_each_leaf_node(rsp, rnp) {
475 raw_spin_lock_irqsave(&rnp->lock, flags);
461 rcu_print_task_stall(rnp); 476 rcu_print_task_stall(rnp);
477 raw_spin_unlock_irqrestore(&rnp->lock, flags);
462 if (rnp->qsmask == 0) 478 if (rnp->qsmask == 0)
463 continue; 479 continue;
464 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 480 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
@@ -469,6 +485,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
469 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 485 smp_processor_id(), (long)(jiffies - rsp->gp_start));
470 trigger_all_cpu_backtrace(); 486 trigger_all_cpu_backtrace();
471 487
488 /* If so configured, complain about tasks blocking the grace period. */
489
490 rcu_print_detail_task_stall(rsp);
491
472 force_quiescent_state(rsp, 0); /* Kick them all. */ 492 force_quiescent_state(rsp, 0); /* Kick them all. */
473} 493}
474 494
@@ -481,11 +501,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
481 smp_processor_id(), jiffies - rsp->gp_start); 501 smp_processor_id(), jiffies - rsp->gp_start);
482 trigger_all_cpu_backtrace(); 502 trigger_all_cpu_backtrace();
483 503
484 spin_lock_irqsave(&rnp->lock, flags); 504 raw_spin_lock_irqsave(&rnp->lock, flags);
485 if ((long)(jiffies - rsp->jiffies_stall) >= 0) 505 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
486 rsp->jiffies_stall = 506 rsp->jiffies_stall =
487 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 507 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
488 spin_unlock_irqrestore(&rnp->lock, flags); 508 raw_spin_unlock_irqrestore(&rnp->lock, flags);
489 509
490 set_need_resched(); /* kick ourselves to get things going. */ 510 set_need_resched(); /* kick ourselves to get things going. */
491} 511}
@@ -545,12 +565,12 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
545 local_irq_save(flags); 565 local_irq_save(flags);
546 rnp = rdp->mynode; 566 rnp = rdp->mynode;
547 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ 567 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
548 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 568 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
549 local_irq_restore(flags); 569 local_irq_restore(flags);
550 return; 570 return;
551 } 571 }
552 __note_new_gpnum(rsp, rnp, rdp); 572 __note_new_gpnum(rsp, rnp, rdp);
553 spin_unlock_irqrestore(&rnp->lock, flags); 573 raw_spin_unlock_irqrestore(&rnp->lock, flags);
554} 574}
555 575
556/* 576/*
@@ -609,12 +629,12 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
609 local_irq_save(flags); 629 local_irq_save(flags);
610 rnp = rdp->mynode; 630 rnp = rdp->mynode;
611 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ 631 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
612 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 632 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
613 local_irq_restore(flags); 633 local_irq_restore(flags);
614 return; 634 return;
615 } 635 }
616 __rcu_process_gp_end(rsp, rnp, rdp); 636 __rcu_process_gp_end(rsp, rnp, rdp);
617 spin_unlock_irqrestore(&rnp->lock, flags); 637 raw_spin_unlock_irqrestore(&rnp->lock, flags);
618} 638}
619 639
620/* 640/*
@@ -659,12 +679,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
659 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 679 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
660 struct rcu_node *rnp = rcu_get_root(rsp); 680 struct rcu_node *rnp = rcu_get_root(rsp);
661 681
662 if (!cpu_needs_another_gp(rsp, rdp)) { 682 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
683 if (cpu_needs_another_gp(rsp, rdp))
684 rsp->fqs_need_gp = 1;
663 if (rnp->completed == rsp->completed) { 685 if (rnp->completed == rsp->completed) {
664 spin_unlock_irqrestore(&rnp->lock, flags); 686 raw_spin_unlock_irqrestore(&rnp->lock, flags);
665 return; 687 return;
666 } 688 }
667 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 689 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
668 690
669 /* 691 /*
670 * Propagate new ->completed value to rcu_node structures 692 * Propagate new ->completed value to rcu_node structures
@@ -672,9 +694,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
672 * of the next grace period to process their callbacks. 694 * of the next grace period to process their callbacks.
673 */ 695 */
674 rcu_for_each_node_breadth_first(rsp, rnp) { 696 rcu_for_each_node_breadth_first(rsp, rnp) {
675 spin_lock(&rnp->lock); /* irqs already disabled. */ 697 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
676 rnp->completed = rsp->completed; 698 rnp->completed = rsp->completed;
677 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 699 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
678 } 700 }
679 local_irq_restore(flags); 701 local_irq_restore(flags);
680 return; 702 return;
@@ -695,15 +717,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
695 rnp->completed = rsp->completed; 717 rnp->completed = rsp->completed;
696 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 718 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
697 rcu_start_gp_per_cpu(rsp, rnp, rdp); 719 rcu_start_gp_per_cpu(rsp, rnp, rdp);
698 spin_unlock_irqrestore(&rnp->lock, flags); 720 raw_spin_unlock_irqrestore(&rnp->lock, flags);
699 return; 721 return;
700 } 722 }
701 723
702 spin_unlock(&rnp->lock); /* leave irqs disabled. */ 724 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
703 725
704 726
705 /* Exclude any concurrent CPU-hotplug operations. */ 727 /* Exclude any concurrent CPU-hotplug operations. */
706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 728 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
707 729
708 /* 730 /*
709 * Set the quiescent-state-needed bits in all the rcu_node 731 * Set the quiescent-state-needed bits in all the rcu_node
@@ -723,21 +745,21 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
723 * irqs disabled. 745 * irqs disabled.
724 */ 746 */
725 rcu_for_each_node_breadth_first(rsp, rnp) { 747 rcu_for_each_node_breadth_first(rsp, rnp) {
726 spin_lock(&rnp->lock); /* irqs already disabled. */ 748 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
727 rcu_preempt_check_blocked_tasks(rnp); 749 rcu_preempt_check_blocked_tasks(rnp);
728 rnp->qsmask = rnp->qsmaskinit; 750 rnp->qsmask = rnp->qsmaskinit;
729 rnp->gpnum = rsp->gpnum; 751 rnp->gpnum = rsp->gpnum;
730 rnp->completed = rsp->completed; 752 rnp->completed = rsp->completed;
731 if (rnp == rdp->mynode) 753 if (rnp == rdp->mynode)
732 rcu_start_gp_per_cpu(rsp, rnp, rdp); 754 rcu_start_gp_per_cpu(rsp, rnp, rdp);
733 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 755 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
734 } 756 }
735 757
736 rnp = rcu_get_root(rsp); 758 rnp = rcu_get_root(rsp);
737 spin_lock(&rnp->lock); /* irqs already disabled. */ 759 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
738 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 760 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
739 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 761 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
740 spin_unlock_irqrestore(&rsp->onofflock, flags); 762 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
741} 763}
742 764
743/* 765/*
@@ -776,14 +798,14 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
776 if (!(rnp->qsmask & mask)) { 798 if (!(rnp->qsmask & mask)) {
777 799
778 /* Our bit has already been cleared, so done. */ 800 /* Our bit has already been cleared, so done. */
779 spin_unlock_irqrestore(&rnp->lock, flags); 801 raw_spin_unlock_irqrestore(&rnp->lock, flags);
780 return; 802 return;
781 } 803 }
782 rnp->qsmask &= ~mask; 804 rnp->qsmask &= ~mask;
783 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 805 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
784 806
785 /* Other bits still set at this level, so done. */ 807 /* Other bits still set at this level, so done. */
786 spin_unlock_irqrestore(&rnp->lock, flags); 808 raw_spin_unlock_irqrestore(&rnp->lock, flags);
787 return; 809 return;
788 } 810 }
789 mask = rnp->grpmask; 811 mask = rnp->grpmask;
@@ -793,10 +815,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
793 815
794 break; 816 break;
795 } 817 }
796 spin_unlock_irqrestore(&rnp->lock, flags); 818 raw_spin_unlock_irqrestore(&rnp->lock, flags);
797 rnp_c = rnp; 819 rnp_c = rnp;
798 rnp = rnp->parent; 820 rnp = rnp->parent;
799 spin_lock_irqsave(&rnp->lock, flags); 821 raw_spin_lock_irqsave(&rnp->lock, flags);
800 WARN_ON_ONCE(rnp_c->qsmask); 822 WARN_ON_ONCE(rnp_c->qsmask);
801 } 823 }
802 824
@@ -825,7 +847,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
825 struct rcu_node *rnp; 847 struct rcu_node *rnp;
826 848
827 rnp = rdp->mynode; 849 rnp = rdp->mynode;
828 spin_lock_irqsave(&rnp->lock, flags); 850 raw_spin_lock_irqsave(&rnp->lock, flags);
829 if (lastcomp != rnp->completed) { 851 if (lastcomp != rnp->completed) {
830 852
831 /* 853 /*
@@ -837,12 +859,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
837 * race occurred. 859 * race occurred.
838 */ 860 */
839 rdp->passed_quiesc = 0; /* try again later! */ 861 rdp->passed_quiesc = 0; /* try again later! */
840 spin_unlock_irqrestore(&rnp->lock, flags); 862 raw_spin_unlock_irqrestore(&rnp->lock, flags);
841 return; 863 return;
842 } 864 }
843 mask = rdp->grpmask; 865 mask = rdp->grpmask;
844 if ((rnp->qsmask & mask) == 0) { 866 if ((rnp->qsmask & mask) == 0) {
845 spin_unlock_irqrestore(&rnp->lock, flags); 867 raw_spin_unlock_irqrestore(&rnp->lock, flags);
846 } else { 868 } else {
847 rdp->qs_pending = 0; 869 rdp->qs_pending = 0;
848 870
@@ -906,7 +928,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
906 928
907 if (rdp->nxtlist == NULL) 929 if (rdp->nxtlist == NULL)
908 return; /* irqs disabled, so comparison is stable. */ 930 return; /* irqs disabled, so comparison is stable. */
909 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 931 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
910 *rsp->orphan_cbs_tail = rdp->nxtlist; 932 *rsp->orphan_cbs_tail = rdp->nxtlist;
911 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 933 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
912 rdp->nxtlist = NULL; 934 rdp->nxtlist = NULL;
@@ -914,7 +936,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
914 rdp->nxttail[i] = &rdp->nxtlist; 936 rdp->nxttail[i] = &rdp->nxtlist;
915 rsp->orphan_qlen += rdp->qlen; 937 rsp->orphan_qlen += rdp->qlen;
916 rdp->qlen = 0; 938 rdp->qlen = 0;
917 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 939 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
918} 940}
919 941
920/* 942/*
@@ -925,10 +947,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
925 unsigned long flags; 947 unsigned long flags;
926 struct rcu_data *rdp; 948 struct rcu_data *rdp;
927 949
928 spin_lock_irqsave(&rsp->onofflock, flags); 950 raw_spin_lock_irqsave(&rsp->onofflock, flags);
929 rdp = rsp->rda[smp_processor_id()]; 951 rdp = rsp->rda[smp_processor_id()];
930 if (rsp->orphan_cbs_list == NULL) { 952 if (rsp->orphan_cbs_list == NULL) {
931 spin_unlock_irqrestore(&rsp->onofflock, flags); 953 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
932 return; 954 return;
933 } 955 }
934 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; 956 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
@@ -937,7 +959,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
937 rsp->orphan_cbs_list = NULL; 959 rsp->orphan_cbs_list = NULL;
938 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; 960 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
939 rsp->orphan_qlen = 0; 961 rsp->orphan_qlen = 0;
940 spin_unlock_irqrestore(&rsp->onofflock, flags); 962 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
941} 963}
942 964
943/* 965/*
@@ -953,23 +975,23 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
953 struct rcu_node *rnp; 975 struct rcu_node *rnp;
954 976
955 /* Exclude any attempts to start a new grace period. */ 977 /* Exclude any attempts to start a new grace period. */
956 spin_lock_irqsave(&rsp->onofflock, flags); 978 raw_spin_lock_irqsave(&rsp->onofflock, flags);
957 979
958 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 980 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
959 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ 981 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
960 mask = rdp->grpmask; /* rnp->grplo is constant. */ 982 mask = rdp->grpmask; /* rnp->grplo is constant. */
961 do { 983 do {
962 spin_lock(&rnp->lock); /* irqs already disabled. */ 984 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
963 rnp->qsmaskinit &= ~mask; 985 rnp->qsmaskinit &= ~mask;
964 if (rnp->qsmaskinit != 0) { 986 if (rnp->qsmaskinit != 0) {
965 if (rnp != rdp->mynode) 987 if (rnp != rdp->mynode)
966 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 988 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
967 break; 989 break;
968 } 990 }
969 if (rnp == rdp->mynode) 991 if (rnp == rdp->mynode)
970 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 992 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
971 else 993 else
972 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 994 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
973 mask = rnp->grpmask; 995 mask = rnp->grpmask;
974 rnp = rnp->parent; 996 rnp = rnp->parent;
975 } while (rnp != NULL); 997 } while (rnp != NULL);
@@ -980,12 +1002,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
980 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock 1002 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
981 * held leads to deadlock. 1003 * held leads to deadlock.
982 */ 1004 */
983 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1005 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
984 rnp = rdp->mynode; 1006 rnp = rdp->mynode;
985 if (need_report & RCU_OFL_TASKS_NORM_GP) 1007 if (need_report & RCU_OFL_TASKS_NORM_GP)
986 rcu_report_unblock_qs_rnp(rnp, flags); 1008 rcu_report_unblock_qs_rnp(rnp, flags);
987 else 1009 else
988 spin_unlock_irqrestore(&rnp->lock, flags); 1010 raw_spin_unlock_irqrestore(&rnp->lock, flags);
989 if (need_report & RCU_OFL_TASKS_EXP_GP) 1011 if (need_report & RCU_OFL_TASKS_EXP_GP)
990 rcu_report_exp_rnp(rsp, rnp); 1012 rcu_report_exp_rnp(rsp, rnp);
991 1013
@@ -1144,11 +1166,9 @@ void rcu_check_callbacks(int cpu, int user)
1144/* 1166/*
1145 * Scan the leaf rcu_node structures, processing dyntick state for any that 1167 * Scan the leaf rcu_node structures, processing dyntick state for any that
1146 * have not yet encountered a quiescent state, using the function specified. 1168 * have not yet encountered a quiescent state, using the function specified.
1147 * Returns 1 if the current grace period ends while scanning (possibly 1169 * The caller must have suppressed start of new grace periods.
1148 * because we made it end).
1149 */ 1170 */
1150static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, 1171static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1151 int (*f)(struct rcu_data *))
1152{ 1172{
1153 unsigned long bit; 1173 unsigned long bit;
1154 int cpu; 1174 int cpu;
@@ -1158,13 +1178,13 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1158 1178
1159 rcu_for_each_leaf_node(rsp, rnp) { 1179 rcu_for_each_leaf_node(rsp, rnp) {
1160 mask = 0; 1180 mask = 0;
1161 spin_lock_irqsave(&rnp->lock, flags); 1181 raw_spin_lock_irqsave(&rnp->lock, flags);
1162 if (rnp->completed != lastcomp) { 1182 if (!rcu_gp_in_progress(rsp)) {
1163 spin_unlock_irqrestore(&rnp->lock, flags); 1183 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1164 return 1; 1184 return;
1165 } 1185 }
1166 if (rnp->qsmask == 0) { 1186 if (rnp->qsmask == 0) {
1167 spin_unlock_irqrestore(&rnp->lock, flags); 1187 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1168 continue; 1188 continue;
1169 } 1189 }
1170 cpu = rnp->grplo; 1190 cpu = rnp->grplo;
@@ -1173,15 +1193,14 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1173 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1193 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1174 mask |= bit; 1194 mask |= bit;
1175 } 1195 }
1176 if (mask != 0 && rnp->completed == lastcomp) { 1196 if (mask != 0) {
1177 1197
1178 /* rcu_report_qs_rnp() releases rnp->lock. */ 1198 /* rcu_report_qs_rnp() releases rnp->lock. */
1179 rcu_report_qs_rnp(mask, rsp, rnp, flags); 1199 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1180 continue; 1200 continue;
1181 } 1201 }
1182 spin_unlock_irqrestore(&rnp->lock, flags); 1202 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1183 } 1203 }
1184 return 0;
1185} 1204}
1186 1205
1187/* 1206/*
@@ -1191,32 +1210,26 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1191static void force_quiescent_state(struct rcu_state *rsp, int relaxed) 1210static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1192{ 1211{
1193 unsigned long flags; 1212 unsigned long flags;
1194 long lastcomp;
1195 struct rcu_node *rnp = rcu_get_root(rsp); 1213 struct rcu_node *rnp = rcu_get_root(rsp);
1196 u8 signaled;
1197 u8 forcenow;
1198 1214
1199 if (!rcu_gp_in_progress(rsp)) 1215 if (!rcu_gp_in_progress(rsp))
1200 return; /* No grace period in progress, nothing to force. */ 1216 return; /* No grace period in progress, nothing to force. */
1201 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { 1217 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1202 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1218 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1203 return; /* Someone else is already on the job. */ 1219 return; /* Someone else is already on the job. */
1204 } 1220 }
1205 if (relaxed && 1221 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
1206 (long)(rsp->jiffies_force_qs - jiffies) >= 0) 1222 goto unlock_fqs_ret; /* no emergency and done recently. */
1207 goto unlock_ret; /* no emergency and done recently. */
1208 rsp->n_force_qs++; 1223 rsp->n_force_qs++;
1209 spin_lock(&rnp->lock); 1224 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1210 lastcomp = rsp->gpnum - 1;
1211 signaled = rsp->signaled;
1212 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1225 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1213 if(!rcu_gp_in_progress(rsp)) { 1226 if(!rcu_gp_in_progress(rsp)) {
1214 rsp->n_force_qs_ngp++; 1227 rsp->n_force_qs_ngp++;
1215 spin_unlock(&rnp->lock); 1228 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1216 goto unlock_ret; /* no GP in progress, time updated. */ 1229 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1217 } 1230 }
1218 spin_unlock(&rnp->lock); 1231 rsp->fqs_active = 1;
1219 switch (signaled) { 1232 switch (rsp->signaled) {
1220 case RCU_GP_IDLE: 1233 case RCU_GP_IDLE:
1221 case RCU_GP_INIT: 1234 case RCU_GP_INIT:
1222 1235
@@ -1224,45 +1237,38 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1224 1237
1225 case RCU_SAVE_DYNTICK: 1238 case RCU_SAVE_DYNTICK:
1226 1239
1240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1227 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1241 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1228 break; /* So gcc recognizes the dead code. */ 1242 break; /* So gcc recognizes the dead code. */
1229 1243
1230 /* Record dyntick-idle state. */ 1244 /* Record dyntick-idle state. */
1231 if (rcu_process_dyntick(rsp, lastcomp, 1245 force_qs_rnp(rsp, dyntick_save_progress_counter);
1232 dyntick_save_progress_counter)) 1246 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1233 goto unlock_ret; 1247 if (rcu_gp_in_progress(rsp))
1234 /* fall into next case. */
1235
1236 case RCU_SAVE_COMPLETED:
1237
1238 /* Update state, record completion counter. */
1239 forcenow = 0;
1240 spin_lock(&rnp->lock);
1241 if (lastcomp + 1 == rsp->gpnum &&
1242 lastcomp == rsp->completed &&
1243 rsp->signaled == signaled) {
1244 rsp->signaled = RCU_FORCE_QS; 1248 rsp->signaled = RCU_FORCE_QS;
1245 rsp->completed_fqs = lastcomp; 1249 break;
1246 forcenow = signaled == RCU_SAVE_COMPLETED;
1247 }
1248 spin_unlock(&rnp->lock);
1249 if (!forcenow)
1250 break;
1251 /* fall into next case. */
1252 1250
1253 case RCU_FORCE_QS: 1251 case RCU_FORCE_QS:
1254 1252
1255 /* Check dyntick-idle state, send IPI to laggarts. */ 1253 /* Check dyntick-idle state, send IPI to laggarts. */
1256 if (rcu_process_dyntick(rsp, rsp->completed_fqs, 1254 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1257 rcu_implicit_dynticks_qs)) 1255 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1258 goto unlock_ret;
1259 1256
1260 /* Leave state in case more forcing is required. */ 1257 /* Leave state in case more forcing is required. */
1261 1258
1259 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1262 break; 1260 break;
1263 } 1261 }
1264unlock_ret: 1262 rsp->fqs_active = 0;
1265 spin_unlock_irqrestore(&rsp->fqslock, flags); 1263 if (rsp->fqs_need_gp) {
1264 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1265 rsp->fqs_need_gp = 0;
1266 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1267 return;
1268 }
1269 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1270unlock_fqs_ret:
1271 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1266} 1272}
1267 1273
1268#else /* #ifdef CONFIG_SMP */ 1274#else /* #ifdef CONFIG_SMP */
@@ -1290,7 +1296,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1290 * If an RCU GP has gone long enough, go check for dyntick 1296 * If an RCU GP has gone long enough, go check for dyntick
1291 * idle CPUs and, if needed, send resched IPIs. 1297 * idle CPUs and, if needed, send resched IPIs.
1292 */ 1298 */
1293 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1299 if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1294 force_quiescent_state(rsp, 1); 1300 force_quiescent_state(rsp, 1);
1295 1301
1296 /* 1302 /*
@@ -1304,7 +1310,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1304 1310
1305 /* Does this CPU require a not-yet-started grace period? */ 1311 /* Does this CPU require a not-yet-started grace period? */
1306 if (cpu_needs_another_gp(rsp, rdp)) { 1312 if (cpu_needs_another_gp(rsp, rdp)) {
1307 spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); 1313 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1308 rcu_start_gp(rsp, flags); /* releases above lock */ 1314 rcu_start_gp(rsp, flags); /* releases above lock */
1309 } 1315 }
1310 1316
@@ -1335,6 +1341,9 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1335 * grace-period manipulations above. 1341 * grace-period manipulations above.
1336 */ 1342 */
1337 smp_mb(); /* See above block comment. */ 1343 smp_mb(); /* See above block comment. */
1344
1345 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1346 rcu_needs_cpu_flush();
1338} 1347}
1339 1348
1340static void 1349static void
@@ -1369,7 +1378,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1369 unsigned long nestflag; 1378 unsigned long nestflag;
1370 struct rcu_node *rnp_root = rcu_get_root(rsp); 1379 struct rcu_node *rnp_root = rcu_get_root(rsp);
1371 1380
1372 spin_lock_irqsave(&rnp_root->lock, nestflag); 1381 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1373 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ 1382 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1374 } 1383 }
1375 1384
@@ -1387,7 +1396,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1387 force_quiescent_state(rsp, 0); 1396 force_quiescent_state(rsp, 0);
1388 rdp->n_force_qs_snap = rsp->n_force_qs; 1397 rdp->n_force_qs_snap = rsp->n_force_qs;
1389 rdp->qlen_last_fqs_check = rdp->qlen; 1398 rdp->qlen_last_fqs_check = rdp->qlen;
1390 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1399 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1391 force_quiescent_state(rsp, 1); 1400 force_quiescent_state(rsp, 1);
1392 local_irq_restore(flags); 1401 local_irq_restore(flags);
1393} 1402}
@@ -1520,7 +1529,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1520 1529
1521 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1530 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1522 if (rcu_gp_in_progress(rsp) && 1531 if (rcu_gp_in_progress(rsp) &&
1523 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { 1532 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
1524 rdp->n_rp_need_fqs++; 1533 rdp->n_rp_need_fqs++;
1525 return 1; 1534 return 1;
1526 } 1535 }
@@ -1545,10 +1554,9 @@ static int rcu_pending(int cpu)
1545/* 1554/*
1546 * Check to see if any future RCU-related work will need to be done 1555 * Check to see if any future RCU-related work will need to be done
1547 * by the current CPU, even if none need be done immediately, returning 1556 * by the current CPU, even if none need be done immediately, returning
1548 * 1 if so. This function is part of the RCU implementation; it is -not- 1557 * 1 if so.
1549 * an exported member of the RCU API.
1550 */ 1558 */
1551int rcu_needs_cpu(int cpu) 1559static int rcu_needs_cpu_quick_check(int cpu)
1552{ 1560{
1553 /* RCU callbacks either ready or pending? */ 1561 /* RCU callbacks either ready or pending? */
1554 return per_cpu(rcu_sched_data, cpu).nxtlist || 1562 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1556,21 +1564,6 @@ int rcu_needs_cpu(int cpu)
1556 rcu_preempt_needs_cpu(cpu); 1564 rcu_preempt_needs_cpu(cpu);
1557} 1565}
1558 1566
1559/*
1560 * This function is invoked towards the end of the scheduler's initialization
1561 * process. Before this is called, the idle task might contain
1562 * RCU read-side critical sections (during which time, this idle
1563 * task is booting the system). After this function is called, the
1564 * idle tasks are prohibited from containing RCU read-side critical
1565 * sections.
1566 */
1567void rcu_scheduler_starting(void)
1568{
1569 WARN_ON(num_online_cpus() != 1);
1570 WARN_ON(nr_context_switches() > 0);
1571 rcu_scheduler_active = 1;
1572}
1573
1574static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 1567static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1575static atomic_t rcu_barrier_cpu_count; 1568static atomic_t rcu_barrier_cpu_count;
1576static DEFINE_MUTEX(rcu_barrier_mutex); 1569static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -1659,7 +1652,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1659 struct rcu_node *rnp = rcu_get_root(rsp); 1652 struct rcu_node *rnp = rcu_get_root(rsp);
1660 1653
1661 /* Set up local state, ensuring consistent view of global state. */ 1654 /* Set up local state, ensuring consistent view of global state. */
1662 spin_lock_irqsave(&rnp->lock, flags); 1655 raw_spin_lock_irqsave(&rnp->lock, flags);
1663 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 1656 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1664 rdp->nxtlist = NULL; 1657 rdp->nxtlist = NULL;
1665 for (i = 0; i < RCU_NEXT_SIZE; i++) 1658 for (i = 0; i < RCU_NEXT_SIZE; i++)
@@ -1669,7 +1662,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1669 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 1662 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1670#endif /* #ifdef CONFIG_NO_HZ */ 1663#endif /* #ifdef CONFIG_NO_HZ */
1671 rdp->cpu = cpu; 1664 rdp->cpu = cpu;
1672 spin_unlock_irqrestore(&rnp->lock, flags); 1665 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1673} 1666}
1674 1667
1675/* 1668/*
@@ -1687,7 +1680,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1687 struct rcu_node *rnp = rcu_get_root(rsp); 1680 struct rcu_node *rnp = rcu_get_root(rsp);
1688 1681
1689 /* Set up local state, ensuring consistent view of global state. */ 1682 /* Set up local state, ensuring consistent view of global state. */
1690 spin_lock_irqsave(&rnp->lock, flags); 1683 raw_spin_lock_irqsave(&rnp->lock, flags);
1691 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1684 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1692 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1685 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1693 rdp->beenonline = 1; /* We have now been online. */ 1686 rdp->beenonline = 1; /* We have now been online. */
@@ -1695,7 +1688,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1695 rdp->qlen_last_fqs_check = 0; 1688 rdp->qlen_last_fqs_check = 0;
1696 rdp->n_force_qs_snap = rsp->n_force_qs; 1689 rdp->n_force_qs_snap = rsp->n_force_qs;
1697 rdp->blimit = blimit; 1690 rdp->blimit = blimit;
1698 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1691 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1699 1692
1700 /* 1693 /*
1701 * A new grace period might start here. If so, we won't be part 1694 * A new grace period might start here. If so, we won't be part
@@ -1703,14 +1696,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1703 */ 1696 */
1704 1697
1705 /* Exclude any attempts to start a new GP on large systems. */ 1698 /* Exclude any attempts to start a new GP on large systems. */
1706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1699 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1707 1700
1708 /* Add CPU to rcu_node bitmasks. */ 1701 /* Add CPU to rcu_node bitmasks. */
1709 rnp = rdp->mynode; 1702 rnp = rdp->mynode;
1710 mask = rdp->grpmask; 1703 mask = rdp->grpmask;
1711 do { 1704 do {
1712 /* Exclude any attempts to start a new GP on small systems. */ 1705 /* Exclude any attempts to start a new GP on small systems. */
1713 spin_lock(&rnp->lock); /* irqs already disabled. */ 1706 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1714 rnp->qsmaskinit |= mask; 1707 rnp->qsmaskinit |= mask;
1715 mask = rnp->grpmask; 1708 mask = rnp->grpmask;
1716 if (rnp == rdp->mynode) { 1709 if (rnp == rdp->mynode) {
@@ -1718,11 +1711,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1718 rdp->completed = rnp->completed; 1711 rdp->completed = rnp->completed;
1719 rdp->passed_quiesc_completed = rnp->completed - 1; 1712 rdp->passed_quiesc_completed = rnp->completed - 1;
1720 } 1713 }
1721 spin_unlock(&rnp->lock); /* irqs already disabled. */ 1714 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
1722 rnp = rnp->parent; 1715 rnp = rnp->parent;
1723 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1716 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1724 1717
1725 spin_unlock_irqrestore(&rsp->onofflock, flags); 1718 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1726} 1719}
1727 1720
1728static void __cpuinit rcu_online_cpu(int cpu) 1721static void __cpuinit rcu_online_cpu(int cpu)
@@ -1806,11 +1799,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1806 */ 1799 */
1807static void __init rcu_init_one(struct rcu_state *rsp) 1800static void __init rcu_init_one(struct rcu_state *rsp)
1808{ 1801{
1802 static char *buf[] = { "rcu_node_level_0",
1803 "rcu_node_level_1",
1804 "rcu_node_level_2",
1805 "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */
1809 int cpustride = 1; 1806 int cpustride = 1;
1810 int i; 1807 int i;
1811 int j; 1808 int j;
1812 struct rcu_node *rnp; 1809 struct rcu_node *rnp;
1813 1810
1811 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
1812
1814 /* Initialize the level-tracking arrays. */ 1813 /* Initialize the level-tracking arrays. */
1815 1814
1816 for (i = 1; i < NUM_RCU_LVLS; i++) 1815 for (i = 1; i < NUM_RCU_LVLS; i++)
@@ -1823,8 +1822,9 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1823 cpustride *= rsp->levelspread[i]; 1822 cpustride *= rsp->levelspread[i];
1824 rnp = rsp->level[i]; 1823 rnp = rsp->level[i];
1825 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1824 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1826 spin_lock_init(&rnp->lock); 1825 raw_spin_lock_init(&rnp->lock);
1827 lockdep_set_class(&rnp->lock, &rcu_node_class[i]); 1826 lockdep_set_class_and_name(&rnp->lock,
1827 &rcu_node_class[i], buf[i]);
1828 rnp->gpnum = 0; 1828 rnp->gpnum = 0;
1829 rnp->qsmask = 0; 1829 rnp->qsmask = 0;
1830 rnp->qsmaskinit = 0; 1830 rnp->qsmaskinit = 0;
@@ -1876,7 +1876,7 @@ do { \
1876 1876
1877void __init rcu_init(void) 1877void __init rcu_init(void)
1878{ 1878{
1879 int i; 1879 int cpu;
1880 1880
1881 rcu_bootup_announce(); 1881 rcu_bootup_announce();
1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
@@ -1896,8 +1896,8 @@ void __init rcu_init(void)
1896 * or the scheduler are operational. 1896 * or the scheduler are operational.
1897 */ 1897 */
1898 cpu_notifier(rcu_cpu_notify, 0); 1898 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(i) 1899 for_each_online_cpu(cpu)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); 1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1901} 1901}
1902 1902
1903#include "rcutree_plugin.h" 1903#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index d2a0046f63b2..1439eb504c22 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -90,12 +90,12 @@ struct rcu_dynticks {
90 * Definition for node within the RCU grace-period-detection hierarchy. 90 * Definition for node within the RCU grace-period-detection hierarchy.
91 */ 91 */
92struct rcu_node { 92struct rcu_node {
93 spinlock_t lock; /* Root rcu_node's lock protects some */ 93 raw_spinlock_t lock; /* Root rcu_node's lock protects some */
94 /* rcu_state fields as well as following. */ 94 /* rcu_state fields as well as following. */
95 long gpnum; /* Current grace period for this node. */ 95 unsigned long gpnum; /* Current grace period for this node. */
96 /* This will either be equal to or one */ 96 /* This will either be equal to or one */
97 /* behind the root rcu_node's gpnum. */ 97 /* behind the root rcu_node's gpnum. */
98 long completed; /* Last grace period completed for this node. */ 98 unsigned long completed; /* Last GP completed for this node. */
99 /* This will either be equal to or one */ 99 /* This will either be equal to or one */
100 /* behind the root rcu_node's gpnum. */ 100 /* behind the root rcu_node's gpnum. */
101 unsigned long qsmask; /* CPUs or groups that need to switch in */ 101 unsigned long qsmask; /* CPUs or groups that need to switch in */
@@ -161,11 +161,11 @@ struct rcu_node {
161/* Per-CPU data for read-copy update. */ 161/* Per-CPU data for read-copy update. */
162struct rcu_data { 162struct rcu_data {
163 /* 1) quiescent-state and grace-period handling : */ 163 /* 1) quiescent-state and grace-period handling : */
164 long completed; /* Track rsp->completed gp number */ 164 unsigned long completed; /* Track rsp->completed gp number */
165 /* in order to detect GP end. */ 165 /* in order to detect GP end. */
166 long gpnum; /* Highest gp number that this CPU */ 166 unsigned long gpnum; /* Highest gp number that this CPU */
167 /* is aware of having started. */ 167 /* is aware of having started. */
168 long passed_quiesc_completed; 168 unsigned long passed_quiesc_completed;
169 /* Value of completed at time of qs. */ 169 /* Value of completed at time of qs. */
170 bool passed_quiesc; /* User-mode/idle loop etc. */ 170 bool passed_quiesc; /* User-mode/idle loop etc. */
171 bool qs_pending; /* Core waits for quiesc state. */ 171 bool qs_pending; /* Core waits for quiesc state. */
@@ -221,14 +221,14 @@ struct rcu_data {
221 unsigned long resched_ipi; /* Sent a resched IPI. */ 221 unsigned long resched_ipi; /* Sent a resched IPI. */
222 222
223 /* 5) __rcu_pending() statistics. */ 223 /* 5) __rcu_pending() statistics. */
224 long n_rcu_pending; /* rcu_pending() calls since boot. */ 224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
225 long n_rp_qs_pending; 225 unsigned long n_rp_qs_pending;
226 long n_rp_cb_ready; 226 unsigned long n_rp_cb_ready;
227 long n_rp_cpu_needs_gp; 227 unsigned long n_rp_cpu_needs_gp;
228 long n_rp_gp_completed; 228 unsigned long n_rp_gp_completed;
229 long n_rp_gp_started; 229 unsigned long n_rp_gp_started;
230 long n_rp_need_fqs; 230 unsigned long n_rp_need_fqs;
231 long n_rp_need_nothing; 231 unsigned long n_rp_need_nothing;
232 232
233 int cpu; 233 int cpu;
234}; 234};
@@ -237,12 +237,11 @@ struct rcu_data {
237#define RCU_GP_IDLE 0 /* No grace period in progress. */ 237#define RCU_GP_IDLE 0 /* No grace period in progress. */
238#define RCU_GP_INIT 1 /* Grace period being initialized. */ 238#define RCU_GP_INIT 1 /* Grace period being initialized. */
239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
240#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */ 240#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
241#define RCU_FORCE_QS 4 /* Need to force quiescent state. */
242#ifdef CONFIG_NO_HZ 241#ifdef CONFIG_NO_HZ
243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 242#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
244#else /* #ifdef CONFIG_NO_HZ */ 243#else /* #ifdef CONFIG_NO_HZ */
245#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED 244#define RCU_SIGNAL_INIT RCU_FORCE_QS
246#endif /* #else #ifdef CONFIG_NO_HZ */ 245#endif /* #else #ifdef CONFIG_NO_HZ */
247 246
248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 247#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
@@ -256,6 +255,9 @@ struct rcu_data {
256 255
257#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 256#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
258 257
258#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
259#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
260
259/* 261/*
260 * RCU global state, including node hierarchy. This hierarchy is 262 * RCU global state, including node hierarchy. This hierarchy is
261 * represented in "heap" form in a dense array. The root (first level) 263 * represented in "heap" form in a dense array. The root (first level)
@@ -277,12 +279,19 @@ struct rcu_state {
277 279
278 u8 signaled ____cacheline_internodealigned_in_smp; 280 u8 signaled ____cacheline_internodealigned_in_smp;
279 /* Force QS state. */ 281 /* Force QS state. */
280 long gpnum; /* Current gp number. */ 282 u8 fqs_active; /* force_quiescent_state() */
281 long completed; /* # of last completed gp. */ 283 /* is running. */
284 u8 fqs_need_gp; /* A CPU was prevented from */
285 /* starting a new grace */
286 /* period because */
287 /* force_quiescent_state() */
288 /* was running. */
289 unsigned long gpnum; /* Current gp number. */
290 unsigned long completed; /* # of last completed gp. */
282 291
283 /* End of fields guarded by root rcu_node's lock. */ 292 /* End of fields guarded by root rcu_node's lock. */
284 293
285 spinlock_t onofflock; /* exclude on/offline and */ 294 raw_spinlock_t onofflock; /* exclude on/offline and */
286 /* starting new GP. Also */ 295 /* starting new GP. Also */
287 /* protects the following */ 296 /* protects the following */
288 /* orphan_cbs fields. */ 297 /* orphan_cbs fields. */
@@ -292,10 +301,8 @@ struct rcu_state {
292 /* going offline. */ 301 /* going offline. */
293 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ 302 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
294 long orphan_qlen; /* Number of orphaned cbs. */ 303 long orphan_qlen; /* Number of orphaned cbs. */
295 spinlock_t fqslock; /* Only one task forcing */ 304 raw_spinlock_t fqslock; /* Only one task forcing */
296 /* quiescent states. */ 305 /* quiescent states. */
297 long completed_fqs; /* Value of completed @ snap. */
298 /* Protected by fqslock. */
299 unsigned long jiffies_force_qs; /* Time at which to invoke */ 306 unsigned long jiffies_force_qs; /* Time at which to invoke */
300 /* force_quiescent_state(). */ 307 /* force_quiescent_state(). */
301 unsigned long n_force_qs; /* Number of calls to */ 308 unsigned long n_force_qs; /* Number of calls to */
@@ -319,8 +326,6 @@ struct rcu_state {
319#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ 326#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
320 /* GP were moved to root. */ 327 /* GP were moved to root. */
321 328
322#ifdef RCU_TREE_NONCORE
323
324/* 329/*
325 * RCU implementation internal declarations: 330 * RCU implementation internal declarations:
326 */ 331 */
@@ -335,7 +340,7 @@ extern struct rcu_state rcu_preempt_state;
335DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 340DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
336#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 341#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
337 342
338#else /* #ifdef RCU_TREE_NONCORE */ 343#ifndef RCU_TREE_NONCORE
339 344
340/* Forward declarations for rcutree_plugin.h */ 345/* Forward declarations for rcutree_plugin.h */
341static void rcu_bootup_announce(void); 346static void rcu_bootup_announce(void);
@@ -347,6 +352,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
347 unsigned long flags); 352 unsigned long flags);
348#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 353#endif /* #ifdef CONFIG_HOTPLUG_CPU */
349#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 354#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
355static void rcu_print_detail_task_stall(struct rcu_state *rsp);
350static void rcu_print_task_stall(struct rcu_node *rnp); 356static void rcu_print_task_stall(struct rcu_node *rnp);
351#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 357#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
352static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 358static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
@@ -367,5 +373,6 @@ static int rcu_preempt_needs_cpu(int cpu);
367static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 373static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
368static void rcu_preempt_send_cbs_to_orphanage(void); 374static void rcu_preempt_send_cbs_to_orphanage(void);
369static void __init __rcu_init_preempt(void); 375static void __init __rcu_init_preempt(void);
376static void rcu_needs_cpu_flush(void);
370 377
371#endif /* #else #ifdef RCU_TREE_NONCORE */ 378#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 37fbccdf41d5..464ad2cdee00 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -62,6 +62,15 @@ long rcu_batches_completed(void)
62EXPORT_SYMBOL_GPL(rcu_batches_completed); 62EXPORT_SYMBOL_GPL(rcu_batches_completed);
63 63
64/* 64/*
65 * Force a quiescent state for preemptible RCU.
66 */
67void rcu_force_quiescent_state(void)
68{
69 force_quiescent_state(&rcu_preempt_state, 0);
70}
71EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
72
73/*
65 * Record a preemptable-RCU quiescent state for the specified CPU. Note 74 * Record a preemptable-RCU quiescent state for the specified CPU. Note
66 * that this just means that the task currently running on the CPU is 75 * that this just means that the task currently running on the CPU is
67 * not in a quiescent state. There might be any number of tasks blocked 76 * not in a quiescent state. There might be any number of tasks blocked
@@ -102,7 +111,7 @@ static void rcu_preempt_note_context_switch(int cpu)
102 /* Possibly blocking in an RCU read-side critical section. */ 111 /* Possibly blocking in an RCU read-side critical section. */
103 rdp = rcu_preempt_state.rda[cpu]; 112 rdp = rcu_preempt_state.rda[cpu];
104 rnp = rdp->mynode; 113 rnp = rdp->mynode;
105 spin_lock_irqsave(&rnp->lock, flags); 114 raw_spin_lock_irqsave(&rnp->lock, flags);
106 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 115 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
107 t->rcu_blocked_node = rnp; 116 t->rcu_blocked_node = rnp;
108 117
@@ -123,7 +132,7 @@ static void rcu_preempt_note_context_switch(int cpu)
123 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 132 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
124 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 133 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
125 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 134 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
126 spin_unlock_irqrestore(&rnp->lock, flags); 135 raw_spin_unlock_irqrestore(&rnp->lock, flags);
127 } 136 }
128 137
129 /* 138 /*
@@ -180,7 +189,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
180 struct rcu_node *rnp_p; 189 struct rcu_node *rnp_p;
181 190
182 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 191 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
183 spin_unlock_irqrestore(&rnp->lock, flags); 192 raw_spin_unlock_irqrestore(&rnp->lock, flags);
184 return; /* Still need more quiescent states! */ 193 return; /* Still need more quiescent states! */
185 } 194 }
186 195
@@ -197,8 +206,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
197 206
198 /* Report up the rest of the hierarchy. */ 207 /* Report up the rest of the hierarchy. */
199 mask = rnp->grpmask; 208 mask = rnp->grpmask;
200 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 209 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
201 spin_lock(&rnp_p->lock); /* irqs already disabled. */ 210 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
202 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); 211 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
203} 212}
204 213
@@ -248,10 +257,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
248 */ 257 */
249 for (;;) { 258 for (;;) {
250 rnp = t->rcu_blocked_node; 259 rnp = t->rcu_blocked_node;
251 spin_lock(&rnp->lock); /* irqs already disabled. */ 260 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
252 if (rnp == t->rcu_blocked_node) 261 if (rnp == t->rcu_blocked_node)
253 break; 262 break;
254 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 263 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
255 } 264 }
256 empty = !rcu_preempted_readers(rnp); 265 empty = !rcu_preempted_readers(rnp);
257 empty_exp = !rcu_preempted_readers_exp(rnp); 266 empty_exp = !rcu_preempted_readers_exp(rnp);
@@ -265,7 +274,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
265 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 274 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
266 */ 275 */
267 if (empty) 276 if (empty)
268 spin_unlock_irqrestore(&rnp->lock, flags); 277 raw_spin_unlock_irqrestore(&rnp->lock, flags);
269 else 278 else
270 rcu_report_unblock_qs_rnp(rnp, flags); 279 rcu_report_unblock_qs_rnp(rnp, flags);
271 280
@@ -295,29 +304,73 @@ void __rcu_read_unlock(void)
295 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 304 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
296 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 305 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
297 rcu_read_unlock_special(t); 306 rcu_read_unlock_special(t);
307#ifdef CONFIG_PROVE_LOCKING
308 WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0);
309#endif /* #ifdef CONFIG_PROVE_LOCKING */
298} 310}
299EXPORT_SYMBOL_GPL(__rcu_read_unlock); 311EXPORT_SYMBOL_GPL(__rcu_read_unlock);
300 312
301#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 313#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
302 314
315#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
316
317/*
318 * Dump detailed information for all tasks blocking the current RCU
319 * grace period on the specified rcu_node structure.
320 */
321static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
322{
323 unsigned long flags;
324 struct list_head *lp;
325 int phase;
326 struct task_struct *t;
327
328 if (rcu_preempted_readers(rnp)) {
329 raw_spin_lock_irqsave(&rnp->lock, flags);
330 phase = rnp->gpnum & 0x1;
331 lp = &rnp->blocked_tasks[phase];
332 list_for_each_entry(t, lp, rcu_node_entry)
333 sched_show_task(t);
334 raw_spin_unlock_irqrestore(&rnp->lock, flags);
335 }
336}
337
338/*
339 * Dump detailed information for all tasks blocking the current RCU
340 * grace period.
341 */
342static void rcu_print_detail_task_stall(struct rcu_state *rsp)
343{
344 struct rcu_node *rnp = rcu_get_root(rsp);
345
346 rcu_print_detail_task_stall_rnp(rnp);
347 rcu_for_each_leaf_node(rsp, rnp)
348 rcu_print_detail_task_stall_rnp(rnp);
349}
350
351#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
352
353static void rcu_print_detail_task_stall(struct rcu_state *rsp)
354{
355}
356
357#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
358
303/* 359/*
304 * Scan the current list of tasks blocked within RCU read-side critical 360 * Scan the current list of tasks blocked within RCU read-side critical
305 * sections, printing out the tid of each. 361 * sections, printing out the tid of each.
306 */ 362 */
307static void rcu_print_task_stall(struct rcu_node *rnp) 363static void rcu_print_task_stall(struct rcu_node *rnp)
308{ 364{
309 unsigned long flags;
310 struct list_head *lp; 365 struct list_head *lp;
311 int phase; 366 int phase;
312 struct task_struct *t; 367 struct task_struct *t;
313 368
314 if (rcu_preempted_readers(rnp)) { 369 if (rcu_preempted_readers(rnp)) {
315 spin_lock_irqsave(&rnp->lock, flags);
316 phase = rnp->gpnum & 0x1; 370 phase = rnp->gpnum & 0x1;
317 lp = &rnp->blocked_tasks[phase]; 371 lp = &rnp->blocked_tasks[phase];
318 list_for_each_entry(t, lp, rcu_node_entry) 372 list_for_each_entry(t, lp, rcu_node_entry)
319 printk(" P%d", t->pid); 373 printk(" P%d", t->pid);
320 spin_unlock_irqrestore(&rnp->lock, flags);
321 } 374 }
322} 375}
323 376
@@ -388,11 +441,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
388 lp_root = &rnp_root->blocked_tasks[i]; 441 lp_root = &rnp_root->blocked_tasks[i];
389 while (!list_empty(lp)) { 442 while (!list_empty(lp)) {
390 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 443 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
391 spin_lock(&rnp_root->lock); /* irqs already disabled */ 444 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
392 list_del(&tp->rcu_node_entry); 445 list_del(&tp->rcu_node_entry);
393 tp->rcu_blocked_node = rnp_root; 446 tp->rcu_blocked_node = rnp_root;
394 list_add(&tp->rcu_node_entry, lp_root); 447 list_add(&tp->rcu_node_entry, lp_root);
395 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 448 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */
396 } 449 }
397 } 450 }
398 return retval; 451 return retval;
@@ -516,7 +569,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
516 unsigned long flags; 569 unsigned long flags;
517 unsigned long mask; 570 unsigned long mask;
518 571
519 spin_lock_irqsave(&rnp->lock, flags); 572 raw_spin_lock_irqsave(&rnp->lock, flags);
520 for (;;) { 573 for (;;) {
521 if (!sync_rcu_preempt_exp_done(rnp)) 574 if (!sync_rcu_preempt_exp_done(rnp))
522 break; 575 break;
@@ -525,12 +578,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
525 break; 578 break;
526 } 579 }
527 mask = rnp->grpmask; 580 mask = rnp->grpmask;
528 spin_unlock(&rnp->lock); /* irqs remain disabled */ 581 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
529 rnp = rnp->parent; 582 rnp = rnp->parent;
530 spin_lock(&rnp->lock); /* irqs already disabled */ 583 raw_spin_lock(&rnp->lock); /* irqs already disabled */
531 rnp->expmask &= ~mask; 584 rnp->expmask &= ~mask;
532 } 585 }
533 spin_unlock_irqrestore(&rnp->lock, flags); 586 raw_spin_unlock_irqrestore(&rnp->lock, flags);
534} 587}
535 588
536/* 589/*
@@ -545,11 +598,11 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
545{ 598{
546 int must_wait; 599 int must_wait;
547 600
548 spin_lock(&rnp->lock); /* irqs already disabled */ 601 raw_spin_lock(&rnp->lock); /* irqs already disabled */
549 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 602 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
550 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 603 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
551 must_wait = rcu_preempted_readers_exp(rnp); 604 must_wait = rcu_preempted_readers_exp(rnp);
552 spin_unlock(&rnp->lock); /* irqs remain disabled */ 605 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
553 if (!must_wait) 606 if (!must_wait)
554 rcu_report_exp_rnp(rsp, rnp); 607 rcu_report_exp_rnp(rsp, rnp);
555} 608}
@@ -594,13 +647,13 @@ void synchronize_rcu_expedited(void)
594 /* force all RCU readers onto blocked_tasks[]. */ 647 /* force all RCU readers onto blocked_tasks[]. */
595 synchronize_sched_expedited(); 648 synchronize_sched_expedited();
596 649
597 spin_lock_irqsave(&rsp->onofflock, flags); 650 raw_spin_lock_irqsave(&rsp->onofflock, flags);
598 651
599 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 652 /* Initialize ->expmask for all non-leaf rcu_node structures. */
600 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 653 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
601 spin_lock(&rnp->lock); /* irqs already disabled. */ 654 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
602 rnp->expmask = rnp->qsmaskinit; 655 rnp->expmask = rnp->qsmaskinit;
603 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 656 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
604 } 657 }
605 658
606 /* Snapshot current state of ->blocked_tasks[] lists. */ 659 /* Snapshot current state of ->blocked_tasks[] lists. */
@@ -609,7 +662,7 @@ void synchronize_rcu_expedited(void)
609 if (NUM_RCU_NODES > 1) 662 if (NUM_RCU_NODES > 1)
610 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); 663 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
611 664
612 spin_unlock_irqrestore(&rsp->onofflock, flags); 665 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
613 666
614 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 667 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
615 rnp = rcu_get_root(rsp); 668 rnp = rcu_get_root(rsp);
@@ -713,6 +766,16 @@ long rcu_batches_completed(void)
713EXPORT_SYMBOL_GPL(rcu_batches_completed); 766EXPORT_SYMBOL_GPL(rcu_batches_completed);
714 767
715/* 768/*
769 * Force a quiescent state for RCU, which, because there is no preemptible
770 * RCU, becomes the same as rcu-sched.
771 */
772void rcu_force_quiescent_state(void)
773{
774 rcu_sched_force_quiescent_state();
775}
776EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
777
778/*
716 * Because preemptable RCU does not exist, we never have to check for 779 * Because preemptable RCU does not exist, we never have to check for
717 * CPUs being in quiescent states. 780 * CPUs being in quiescent states.
718 */ 781 */
@@ -734,7 +797,7 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
734/* Because preemptible RCU does not exist, no quieting of tasks. */ 797/* Because preemptible RCU does not exist, no quieting of tasks. */
735static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 798static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
736{ 799{
737 spin_unlock_irqrestore(&rnp->lock, flags); 800 raw_spin_unlock_irqrestore(&rnp->lock, flags);
738} 801}
739 802
740#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 803#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -745,6 +808,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
745 * Because preemptable RCU does not exist, we never have to check for 808 * Because preemptable RCU does not exist, we never have to check for
746 * tasks blocked within RCU read-side critical sections. 809 * tasks blocked within RCU read-side critical sections.
747 */ 810 */
811static void rcu_print_detail_task_stall(struct rcu_state *rsp)
812{
813}
814
815/*
816 * Because preemptable RCU does not exist, we never have to check for
817 * tasks blocked within RCU read-side critical sections.
818 */
748static void rcu_print_task_stall(struct rcu_node *rnp) 819static void rcu_print_task_stall(struct rcu_node *rnp)
749{ 820{
750} 821}
@@ -884,3 +955,113 @@ static void __init __rcu_init_preempt(void)
884} 955}
885 956
886#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 957#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
958
959#if !defined(CONFIG_RCU_FAST_NO_HZ)
960
961/*
962 * Check to see if any future RCU-related work will need to be done
963 * by the current CPU, even if none need be done immediately, returning
964 * 1 if so. This function is part of the RCU implementation; it is -not-
965 * an exported member of the RCU API.
966 *
967 * Because we have preemptible RCU, just check whether this CPU needs
968 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption
969 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
970 */
971int rcu_needs_cpu(int cpu)
972{
973 return rcu_needs_cpu_quick_check(cpu);
974}
975
976/*
977 * Check to see if we need to continue a callback-flush operations to
978 * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle
979 * entry is not configured, so we never do need to.
980 */
981static void rcu_needs_cpu_flush(void)
982{
983}
984
985#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
986
987#define RCU_NEEDS_CPU_FLUSHES 5
988static DEFINE_PER_CPU(int, rcu_dyntick_drain);
989static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
990
991/*
992 * Check to see if any future RCU-related work will need to be done
993 * by the current CPU, even if none need be done immediately, returning
994 * 1 if so. This function is part of the RCU implementation; it is -not-
995 * an exported member of the RCU API.
996 *
997 * Because we are not supporting preemptible RCU, attempt to accelerate
998 * any current grace periods so that RCU no longer needs this CPU, but
999 * only if all other CPUs are already in dynticks-idle mode. This will
1000 * allow the CPU cores to be powered down immediately, as opposed to after
1001 * waiting many milliseconds for grace periods to elapse.
1002 *
1003 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1004 * disabled, we do one pass of force_quiescent_state(), then do a
1005 * raise_softirq() to cause rcu_process_callbacks() to be invoked later.
1006 * The per-cpu rcu_dyntick_drain variable controls the sequencing.
1007 */
1008int rcu_needs_cpu(int cpu)
1009{
1010 int c = 0;
1011 int thatcpu;
1012
1013 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1014 for_each_cpu_not(thatcpu, nohz_cpu_mask)
1015 if (thatcpu != cpu) {
1016 per_cpu(rcu_dyntick_drain, cpu) = 0;
1017 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1018 return rcu_needs_cpu_quick_check(cpu);
1019 }
1020
1021 /* Check and update the rcu_dyntick_drain sequencing. */
1022 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1023 /* First time through, initialize the counter. */
1024 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
1025 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1026 /* We have hit the limit, so time to give up. */
1027 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1028 return rcu_needs_cpu_quick_check(cpu);
1029 }
1030
1031 /* Do one step pushing remaining RCU callbacks through. */
1032 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1033 rcu_sched_qs(cpu);
1034 force_quiescent_state(&rcu_sched_state, 0);
1035 c = c || per_cpu(rcu_sched_data, cpu).nxtlist;
1036 }
1037 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1038 rcu_bh_qs(cpu);
1039 force_quiescent_state(&rcu_bh_state, 0);
1040 c = c || per_cpu(rcu_bh_data, cpu).nxtlist;
1041 }
1042
1043 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1044 if (c) {
1045 raise_softirq(RCU_SOFTIRQ);
1046 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1047 }
1048 return c;
1049}
1050
1051/*
1052 * Check to see if we need to continue a callback-flush operations to
1053 * allow the last CPU to enter dyntick-idle mode.
1054 */
1055static void rcu_needs_cpu_flush(void)
1056{
1057 int cpu = smp_processor_id();
1058 unsigned long flags;
1059
1060 if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
1061 return;
1062 local_irq_save(flags);
1063 (void)rcu_needs_cpu(cpu);
1064 local_irq_restore(flags);
1065}
1066
1067#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 9d2c88423b31..d45db2e35d27 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -50,7 +50,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 50{
51 if (!rdp->beenonline) 51 if (!rdp->beenonline)
52 return; 52 return;
53 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d", 53 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d",
54 rdp->cpu, 54 rdp->cpu,
55 cpu_is_offline(rdp->cpu) ? '!' : ' ', 55 cpu_is_offline(rdp->cpu) ? '!' : ' ',
56 rdp->completed, rdp->gpnum, 56 rdp->completed, rdp->gpnum,
@@ -105,7 +105,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
105{ 105{
106 if (!rdp->beenonline) 106 if (!rdp->beenonline)
107 return; 107 return;
108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d",
109 rdp->cpu, 109 rdp->cpu,
110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
111 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
@@ -155,13 +155,13 @@ static const struct file_operations rcudata_csv_fops = {
155 155
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 157{
158 long gpnum; 158 unsigned long gpnum;
159 int level = 0; 159 int level = 0;
160 int phase; 160 int phase;
161 struct rcu_node *rnp; 161 struct rcu_node *rnp;
162 162
163 gpnum = rsp->gpnum; 163 gpnum = rsp->gpnum;
164 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 164 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
166 rsp->completed, gpnum, rsp->signaled, 166 rsp->completed, gpnum, rsp->signaled,
167 (long)(rsp->jiffies_force_qs - jiffies), 167 (long)(rsp->jiffies_force_qs - jiffies),
@@ -215,12 +215,12 @@ static const struct file_operations rcuhier_fops = {
215static int show_rcugp(struct seq_file *m, void *unused) 215static int show_rcugp(struct seq_file *m, void *unused)
216{ 216{
217#ifdef CONFIG_TREE_PREEMPT_RCU 217#ifdef CONFIG_TREE_PREEMPT_RCU
218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n", 218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n",
219 rcu_preempt_state.completed, rcu_preempt_state.gpnum); 219 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", 221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n",
222 rcu_sched_state.completed, rcu_sched_state.gpnum); 222 rcu_sched_state.completed, rcu_sched_state.gpnum);
223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
224 rcu_bh_state.completed, rcu_bh_state.gpnum); 224 rcu_bh_state.completed, rcu_bh_state.gpnum);
225 return 0; 225 return 0;
226} 226}
diff --git a/kernel/relay.c b/kernel/relay.c
index c705a41b4ba3..3d97f2821611 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
1215/* 1215/*
1216 * subbuf_splice_actor - splice up to one subbuf's worth of data 1216 * subbuf_splice_actor - splice up to one subbuf's worth of data
1217 */ 1217 */
1218static int subbuf_splice_actor(struct file *in, 1218static ssize_t subbuf_splice_actor(struct file *in,
1219 loff_t *ppos, 1219 loff_t *ppos,
1220 struct pipe_inode_info *pipe, 1220 struct pipe_inode_info *pipe,
1221 size_t len, 1221 size_t len,
1222 unsigned int flags, 1222 unsigned int flags,
1223 int *nonpad_ret) 1223 int *nonpad_ret)
1224{ 1224{
1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; 1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
1226 struct rchan_buf *rbuf = in->private_data; 1226 struct rchan_buf *rbuf = in->private_data;
1227 unsigned int subbuf_size = rbuf->chan->subbuf_size; 1227 unsigned int subbuf_size = rbuf->chan->subbuf_size;
1228 uint64_t pos = (uint64_t) *ppos; 1228 uint64_t pos = (uint64_t) *ppos;
@@ -1241,6 +1241,7 @@ static int subbuf_splice_actor(struct file *in,
1241 .ops = &relay_pipe_buf_ops, 1241 .ops = &relay_pipe_buf_ops,
1242 .spd_release = relay_page_release, 1242 .spd_release = relay_page_release,
1243 }; 1243 };
1244 ssize_t ret;
1244 1245
1245 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1246 return 0; 1247 return 0;
diff --git a/kernel/resource.c b/kernel/resource.c
index 24e9e60c1459..2d5be5d9bf5f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -304,7 +304,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
304 void *arg, int (*func)(unsigned long, unsigned long, void *)) 304 void *arg, int (*func)(unsigned long, unsigned long, void *))
305{ 305{
306 struct resource res; 306 struct resource res;
307 unsigned long pfn, len; 307 unsigned long pfn, end_pfn;
308 u64 orig_end; 308 u64 orig_end;
309 int ret = -1; 309 int ret = -1;
310 310
@@ -314,9 +314,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
314 orig_end = res.end; 314 orig_end = res.end;
315 while ((res.start < res.end) && 315 while ((res.start < res.end) &&
316 (find_next_system_ram(&res, "System RAM") >= 0)) { 316 (find_next_system_ram(&res, "System RAM") >= 0)) {
317 pfn = (unsigned long)(res.start >> PAGE_SHIFT); 317 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
318 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); 318 end_pfn = (res.end + 1) >> PAGE_SHIFT;
319 ret = (*func)(pfn, len, arg); 319 if (end_pfn > pfn)
320 ret = (*func)(pfn, end_pfn - pfn, arg);
320 if (ret) 321 if (ret)
321 break; 322 break;
322 res.start = res.end + 1; 323 res.start = res.end + 1;
@@ -327,6 +328,19 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
327 328
328#endif 329#endif
329 330
331static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
332{
333 return 1;
334}
335/*
336 * This generic page_is_ram() returns true if specified address is
337 * registered as "System RAM" in iomem_resource list.
338 */
339int __weak page_is_ram(unsigned long pfn)
340{
341 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
342}
343
330/* 344/*
331 * Find empty slot in the resource tree given range and alignment. 345 * Find empty slot in the resource tree given range and alignment.
332 */ 346 */
diff --git a/kernel/sched.c b/kernel/sched.c
index 3a8fb30a91b1..b47ceeec1a91 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233 */ 233 */
234static DEFINE_MUTEX(sched_domains_mutex); 234static DEFINE_MUTEX(sched_domains_mutex);
235 235
236#ifdef CONFIG_GROUP_SCHED 236#ifdef CONFIG_CGROUP_SCHED
237 237
238#include <linux/cgroup.h> 238#include <linux/cgroup.h>
239 239
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
243 243
244/* task group related information */ 244/* task group related information */
245struct task_group { 245struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css; 246 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253 247
254#ifdef CONFIG_FAIR_GROUP_SCHED 248#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */ 249 /* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
274 struct list_head children; 268 struct list_head children;
275}; 269};
276 270
277#ifdef CONFIG_USER_SCHED
278
279/* Helper function to pass uid information to create_sched_user() */
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285/*
286 * Root task group.
287 * Every UID task group (including init_task_group aka UID-0) will
288 * be a child to this group.
289 */
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293/* Default task group's sched entity on each cpu */
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295/* Default task group's cfs_rq on each cpu */
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif /* CONFIG_FAIR_GROUP_SCHED */
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 271#define root_task_group init_task_group
305#endif /* CONFIG_USER_SCHED */
306 272
307/* task_group_lock serializes add/remove of task groups and also changes to 273/* task_group_lock serializes add/remove of task groups and also changes to
308 * a task group's cpu shares. 274 * a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
318} 284}
319#endif 285#endif
320 286
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else /* !CONFIG_USER_SCHED */
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 287# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif /* CONFIG_USER_SCHED */
326 288
327/* 289/*
328 * A weight of 0 or 1 can cause arithmetics problems. 290 * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
348{ 310{
349 struct task_group *tg; 311 struct task_group *tg;
350 312
351#ifdef CONFIG_USER_SCHED 313#ifdef CONFIG_CGROUP_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 314 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css); 315 struct task_group, css);
358#else 316#else
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
383 return NULL; 341 return NULL;
384} 342}
385 343
386#endif /* CONFIG_GROUP_SCHED */ 344#endif /* CONFIG_CGROUP_SCHED */
387 345
388/* CFS-related fields in a runqueue */ 346/* CFS-related fields in a runqueue */
389struct cfs_rq { 347struct cfs_rq {
@@ -478,7 +436,6 @@ struct rt_rq {
478 struct rq *rq; 436 struct rq *rq;
479 struct list_head leaf_rt_rq_list; 437 struct list_head leaf_rt_rq_list;
480 struct task_group *tg; 438 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif 439#endif
483}; 440};
484 441
@@ -645,6 +602,11 @@ static inline int cpu_of(struct rq *rq)
645#endif 602#endif
646} 603}
647 604
605#define rcu_dereference_check_sched_domain(p) \
606 rcu_dereference_check((p), \
607 rcu_read_lock_sched_held() || \
608 lockdep_is_held(&sched_domains_mutex))
609
648/* 610/*
649 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 611 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
650 * See detach_destroy_domains: synchronize_sched for details. 612 * See detach_destroy_domains: synchronize_sched for details.
@@ -653,7 +615,7 @@ static inline int cpu_of(struct rq *rq)
653 * preempt-disabled sections. 615 * preempt-disabled sections.
654 */ 616 */
655#define for_each_domain(cpu, __sd) \ 617#define for_each_domain(cpu, __sd) \
656 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 618 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
657 619
658#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 620#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
659#define this_rq() (&__get_cpu_var(runqueues)) 621#define this_rq() (&__get_cpu_var(runqueues))
@@ -941,16 +903,33 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
941#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 903#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
942 904
943/* 905/*
906 * Check whether the task is waking, we use this to synchronize against
907 * ttwu() so that task_cpu() reports a stable number.
908 *
909 * We need to make an exception for PF_STARTING tasks because the fork
910 * path might require task_rq_lock() to work, eg. it can call
911 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
912 */
913static inline int task_is_waking(struct task_struct *p)
914{
915 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
916}
917
918/*
944 * __task_rq_lock - lock the runqueue a given task resides on. 919 * __task_rq_lock - lock the runqueue a given task resides on.
945 * Must be called interrupts disabled. 920 * Must be called interrupts disabled.
946 */ 921 */
947static inline struct rq *__task_rq_lock(struct task_struct *p) 922static inline struct rq *__task_rq_lock(struct task_struct *p)
948 __acquires(rq->lock) 923 __acquires(rq->lock)
949{ 924{
925 struct rq *rq;
926
950 for (;;) { 927 for (;;) {
951 struct rq *rq = task_rq(p); 928 while (task_is_waking(p))
929 cpu_relax();
930 rq = task_rq(p);
952 raw_spin_lock(&rq->lock); 931 raw_spin_lock(&rq->lock);
953 if (likely(rq == task_rq(p))) 932 if (likely(rq == task_rq(p) && !task_is_waking(p)))
954 return rq; 933 return rq;
955 raw_spin_unlock(&rq->lock); 934 raw_spin_unlock(&rq->lock);
956 } 935 }
@@ -967,10 +946,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
967 struct rq *rq; 946 struct rq *rq;
968 947
969 for (;;) { 948 for (;;) {
949 while (task_is_waking(p))
950 cpu_relax();
970 local_irq_save(*flags); 951 local_irq_save(*flags);
971 rq = task_rq(p); 952 rq = task_rq(p);
972 raw_spin_lock(&rq->lock); 953 raw_spin_lock(&rq->lock);
973 if (likely(rq == task_rq(p))) 954 if (likely(rq == task_rq(p) && !task_is_waking(p)))
974 return rq; 955 return rq;
975 raw_spin_unlock_irqrestore(&rq->lock, *flags); 956 raw_spin_unlock_irqrestore(&rq->lock, *flags);
976 } 957 }
@@ -1390,32 +1371,6 @@ static const u32 prio_to_wmult[40] = {
1390 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1371 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1391}; 1372};
1392 1373
1393static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1394
1395/*
1396 * runqueue iterator, to support SMP load-balancing between different
1397 * scheduling classes, without having to expose their internal data
1398 * structures to the load-balancing proper:
1399 */
1400struct rq_iterator {
1401 void *arg;
1402 struct task_struct *(*start)(void *);
1403 struct task_struct *(*next)(void *);
1404};
1405
1406#ifdef CONFIG_SMP
1407static unsigned long
1408balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1409 unsigned long max_load_move, struct sched_domain *sd,
1410 enum cpu_idle_type idle, int *all_pinned,
1411 int *this_best_prio, struct rq_iterator *iterator);
1412
1413static int
1414iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1415 struct sched_domain *sd, enum cpu_idle_type idle,
1416 struct rq_iterator *iterator);
1417#endif
1418
1419/* Time spent by the tasks of the cpu accounting group executing in ... */ 1374/* Time spent by the tasks of the cpu accounting group executing in ... */
1420enum cpuacct_stat_index { 1375enum cpuacct_stat_index {
1421 CPUACCT_STAT_USER, /* ... user mode */ 1376 CPUACCT_STAT_USER, /* ... user mode */
@@ -1531,7 +1486,7 @@ static unsigned long target_load(int cpu, int type)
1531 1486
1532static struct sched_group *group_of(int cpu) 1487static struct sched_group *group_of(int cpu)
1533{ 1488{
1534 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); 1489 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1535 1490
1536 if (!sd) 1491 if (!sd)
1537 return NULL; 1492 return NULL;
@@ -1566,7 +1521,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1566 1521
1567#ifdef CONFIG_FAIR_GROUP_SCHED 1522#ifdef CONFIG_FAIR_GROUP_SCHED
1568 1523
1569static __read_mostly unsigned long *update_shares_data; 1524static __read_mostly unsigned long __percpu *update_shares_data;
1570 1525
1571static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1526static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1572 1527
@@ -1701,16 +1656,6 @@ static void update_shares(struct sched_domain *sd)
1701 } 1656 }
1702} 1657}
1703 1658
1704static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1705{
1706 if (root_task_group_empty())
1707 return;
1708
1709 raw_spin_unlock(&rq->lock);
1710 update_shares(sd);
1711 raw_spin_lock(&rq->lock);
1712}
1713
1714static void update_h_load(long cpu) 1659static void update_h_load(long cpu)
1715{ 1660{
1716 if (root_task_group_empty()) 1661 if (root_task_group_empty())
@@ -1725,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd)
1725{ 1670{
1726} 1671}
1727 1672
1728static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1729{
1730}
1731
1732#endif 1673#endif
1733 1674
1734#ifdef CONFIG_PREEMPT 1675#ifdef CONFIG_PREEMPT
@@ -1805,6 +1746,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 raw_spin_unlock(&busiest->lock); 1746 raw_spin_unlock(&busiest->lock);
1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1747 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1807} 1748}
1749
1750/*
1751 * double_rq_lock - safely lock two runqueues
1752 *
1753 * Note this does not disable interrupts like task_rq_lock,
1754 * you need to do so manually before calling.
1755 */
1756static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1757 __acquires(rq1->lock)
1758 __acquires(rq2->lock)
1759{
1760 BUG_ON(!irqs_disabled());
1761 if (rq1 == rq2) {
1762 raw_spin_lock(&rq1->lock);
1763 __acquire(rq2->lock); /* Fake it out ;) */
1764 } else {
1765 if (rq1 < rq2) {
1766 raw_spin_lock(&rq1->lock);
1767 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1768 } else {
1769 raw_spin_lock(&rq2->lock);
1770 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1771 }
1772 }
1773 update_rq_clock(rq1);
1774 update_rq_clock(rq2);
1775}
1776
1777/*
1778 * double_rq_unlock - safely unlock two runqueues
1779 *
1780 * Note this does not restore interrupts like task_rq_unlock,
1781 * you need to do so manually after calling.
1782 */
1783static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1784 __releases(rq1->lock)
1785 __releases(rq2->lock)
1786{
1787 raw_spin_unlock(&rq1->lock);
1788 if (rq1 != rq2)
1789 raw_spin_unlock(&rq2->lock);
1790 else
1791 __release(rq2->lock);
1792}
1793
1808#endif 1794#endif
1809 1795
1810#ifdef CONFIG_FAIR_GROUP_SCHED 1796#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1834,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1834#endif 1820#endif
1835} 1821}
1836 1822
1837#include "sched_stats.h" 1823static const struct sched_class rt_sched_class;
1838#include "sched_idletask.c"
1839#include "sched_fair.c"
1840#include "sched_rt.c"
1841#ifdef CONFIG_SCHED_DEBUG
1842# include "sched_debug.c"
1843#endif
1844 1824
1845#define sched_class_highest (&rt_sched_class) 1825#define sched_class_highest (&rt_sched_class)
1846#define for_each_class(class) \ 1826#define for_each_class(class) \
1847 for (class = sched_class_highest; class; class = class->next) 1827 for (class = sched_class_highest; class; class = class->next)
1848 1828
1829#include "sched_stats.h"
1830
1849static void inc_nr_running(struct rq *rq) 1831static void inc_nr_running(struct rq *rq)
1850{ 1832{
1851 rq->nr_running++; 1833 rq->nr_running++;
@@ -1883,13 +1865,14 @@ static void update_avg(u64 *avg, u64 sample)
1883 *avg += diff >> 3; 1865 *avg += diff >> 3;
1884} 1866}
1885 1867
1886static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1868static void
1869enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1887{ 1870{
1888 if (wakeup) 1871 if (wakeup)
1889 p->se.start_runtime = p->se.sum_exec_runtime; 1872 p->se.start_runtime = p->se.sum_exec_runtime;
1890 1873
1891 sched_info_queued(p); 1874 sched_info_queued(p);
1892 p->sched_class->enqueue_task(rq, p, wakeup); 1875 p->sched_class->enqueue_task(rq, p, wakeup, head);
1893 p->se.on_rq = 1; 1876 p->se.on_rq = 1;
1894} 1877}
1895 1878
@@ -1912,6 +1895,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1912} 1895}
1913 1896
1914/* 1897/*
1898 * activate_task - move a task to the runqueue.
1899 */
1900static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1901{
1902 if (task_contributes_to_load(p))
1903 rq->nr_uninterruptible--;
1904
1905 enqueue_task(rq, p, wakeup, false);
1906 inc_nr_running(rq);
1907}
1908
1909/*
1910 * deactivate_task - remove a task from the runqueue.
1911 */
1912static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1913{
1914 if (task_contributes_to_load(p))
1915 rq->nr_uninterruptible++;
1916
1917 dequeue_task(rq, p, sleep);
1918 dec_nr_running(rq);
1919}
1920
1921#include "sched_idletask.c"
1922#include "sched_fair.c"
1923#include "sched_rt.c"
1924#ifdef CONFIG_SCHED_DEBUG
1925# include "sched_debug.c"
1926#endif
1927
1928/*
1915 * __normal_prio - return the priority that is based on the static prio 1929 * __normal_prio - return the priority that is based on the static prio
1916 */ 1930 */
1917static inline int __normal_prio(struct task_struct *p) 1931static inline int __normal_prio(struct task_struct *p)
@@ -1957,30 +1971,6 @@ static int effective_prio(struct task_struct *p)
1957 return p->prio; 1971 return p->prio;
1958} 1972}
1959 1973
1960/*
1961 * activate_task - move a task to the runqueue.
1962 */
1963static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1964{
1965 if (task_contributes_to_load(p))
1966 rq->nr_uninterruptible--;
1967
1968 enqueue_task(rq, p, wakeup);
1969 inc_nr_running(rq);
1970}
1971
1972/*
1973 * deactivate_task - remove a task from the runqueue.
1974 */
1975static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1976{
1977 if (task_contributes_to_load(p))
1978 rq->nr_uninterruptible++;
1979
1980 dequeue_task(rq, p, sleep);
1981 dec_nr_running(rq);
1982}
1983
1984/** 1974/**
1985 * task_curr - is this task currently executing on a CPU? 1975 * task_curr - is this task currently executing on a CPU?
1986 * @p: the task in question. 1976 * @p: the task in question.
@@ -2408,14 +2398,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2408 __task_rq_unlock(rq); 2398 __task_rq_unlock(rq);
2409 2399
2410 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2400 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2411 if (cpu != orig_cpu) 2401 if (cpu != orig_cpu) {
2402 /*
2403 * Since we migrate the task without holding any rq->lock,
2404 * we need to be careful with task_rq_lock(), since that
2405 * might end up locking an invalid rq.
2406 */
2412 set_task_cpu(p, cpu); 2407 set_task_cpu(p, cpu);
2408 }
2413 2409
2414 rq = __task_rq_lock(p); 2410 rq = cpu_rq(cpu);
2411 raw_spin_lock(&rq->lock);
2415 update_rq_clock(rq); 2412 update_rq_clock(rq);
2416 2413
2414 /*
2415 * We migrated the task without holding either rq->lock, however
2416 * since the task is not on the task list itself, nobody else
2417 * will try and migrate the task, hence the rq should match the
2418 * cpu we just moved it to.
2419 */
2420 WARN_ON(task_cpu(p) != cpu);
2417 WARN_ON(p->state != TASK_WAKING); 2421 WARN_ON(p->state != TASK_WAKING);
2418 cpu = task_cpu(p);
2419 2422
2420#ifdef CONFIG_SCHEDSTATS 2423#ifdef CONFIG_SCHEDSTATS
2421 schedstat_inc(rq, ttwu_count); 2424 schedstat_inc(rq, ttwu_count);
@@ -2663,7 +2666,13 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2663 set_task_cpu(p, cpu); 2666 set_task_cpu(p, cpu);
2664#endif 2667#endif
2665 2668
2666 rq = task_rq_lock(p, &flags); 2669 /*
2670 * Since the task is not on the rq and we still have TASK_WAKING set
2671 * nobody else will migrate this task.
2672 */
2673 rq = cpu_rq(cpu);
2674 raw_spin_lock_irqsave(&rq->lock, flags);
2675
2667 BUG_ON(p->state != TASK_WAKING); 2676 BUG_ON(p->state != TASK_WAKING);
2668 p->state = TASK_RUNNING; 2677 p->state = TASK_RUNNING;
2669 update_rq_clock(rq); 2678 update_rq_clock(rq);
@@ -2794,7 +2803,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2794 */ 2803 */
2795 prev_state = prev->state; 2804 prev_state = prev->state;
2796 finish_arch_switch(prev); 2805 finish_arch_switch(prev);
2797 perf_event_task_sched_in(current, cpu_of(rq)); 2806#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2807 local_irq_disable();
2808#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2809 perf_event_task_sched_in(current);
2810#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2811 local_irq_enable();
2812#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2798 finish_lock_switch(rq, prev); 2813 finish_lock_switch(rq, prev);
2799 2814
2800 fire_sched_in_preempt_notifiers(current); 2815 fire_sched_in_preempt_notifiers(current);
@@ -3099,50 +3114,6 @@ static void update_cpu_load(struct rq *this_rq)
3099#ifdef CONFIG_SMP 3114#ifdef CONFIG_SMP
3100 3115
3101/* 3116/*
3102 * double_rq_lock - safely lock two runqueues
3103 *
3104 * Note this does not disable interrupts like task_rq_lock,
3105 * you need to do so manually before calling.
3106 */
3107static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3108 __acquires(rq1->lock)
3109 __acquires(rq2->lock)
3110{
3111 BUG_ON(!irqs_disabled());
3112 if (rq1 == rq2) {
3113 raw_spin_lock(&rq1->lock);
3114 __acquire(rq2->lock); /* Fake it out ;) */
3115 } else {
3116 if (rq1 < rq2) {
3117 raw_spin_lock(&rq1->lock);
3118 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3119 } else {
3120 raw_spin_lock(&rq2->lock);
3121 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3122 }
3123 }
3124 update_rq_clock(rq1);
3125 update_rq_clock(rq2);
3126}
3127
3128/*
3129 * double_rq_unlock - safely unlock two runqueues
3130 *
3131 * Note this does not restore interrupts like task_rq_unlock,
3132 * you need to do so manually after calling.
3133 */
3134static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3135 __releases(rq1->lock)
3136 __releases(rq2->lock)
3137{
3138 raw_spin_unlock(&rq1->lock);
3139 if (rq1 != rq2)
3140 raw_spin_unlock(&rq2->lock);
3141 else
3142 __release(rq2->lock);
3143}
3144
3145/*
3146 * sched_exec - execve() is a valuable balancing opportunity, because at 3117 * sched_exec - execve() is a valuable balancing opportunity, because at
3147 * this point the task has the smallest effective memory and cache footprint. 3118 * this point the task has the smallest effective memory and cache footprint.
3148 */ 3119 */
@@ -3190,1771 +3161,6 @@ again:
3190 task_rq_unlock(rq, &flags); 3161 task_rq_unlock(rq, &flags);
3191} 3162}
3192 3163
3193/*
3194 * pull_task - move a task from a remote runqueue to the local runqueue.
3195 * Both runqueues must be locked.
3196 */
3197static void pull_task(struct rq *src_rq, struct task_struct *p,
3198 struct rq *this_rq, int this_cpu)
3199{
3200 deactivate_task(src_rq, p, 0);
3201 set_task_cpu(p, this_cpu);
3202 activate_task(this_rq, p, 0);
3203 check_preempt_curr(this_rq, p, 0);
3204}
3205
3206/*
3207 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3208 */
3209static
3210int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3211 struct sched_domain *sd, enum cpu_idle_type idle,
3212 int *all_pinned)
3213{
3214 int tsk_cache_hot = 0;
3215 /*
3216 * We do not migrate tasks that are:
3217 * 1) running (obviously), or
3218 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3219 * 3) are cache-hot on their current CPU.
3220 */
3221 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3222 schedstat_inc(p, se.nr_failed_migrations_affine);
3223 return 0;
3224 }
3225 *all_pinned = 0;
3226
3227 if (task_running(rq, p)) {
3228 schedstat_inc(p, se.nr_failed_migrations_running);
3229 return 0;
3230 }
3231
3232 /*
3233 * Aggressive migration if:
3234 * 1) task is cache cold, or
3235 * 2) too many balance attempts have failed.
3236 */
3237
3238 tsk_cache_hot = task_hot(p, rq->clock, sd);
3239 if (!tsk_cache_hot ||
3240 sd->nr_balance_failed > sd->cache_nice_tries) {
3241#ifdef CONFIG_SCHEDSTATS
3242 if (tsk_cache_hot) {
3243 schedstat_inc(sd, lb_hot_gained[idle]);
3244 schedstat_inc(p, se.nr_forced_migrations);
3245 }
3246#endif
3247 return 1;
3248 }
3249
3250 if (tsk_cache_hot) {
3251 schedstat_inc(p, se.nr_failed_migrations_hot);
3252 return 0;
3253 }
3254 return 1;
3255}
3256
3257static unsigned long
3258balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3259 unsigned long max_load_move, struct sched_domain *sd,
3260 enum cpu_idle_type idle, int *all_pinned,
3261 int *this_best_prio, struct rq_iterator *iterator)
3262{
3263 int loops = 0, pulled = 0, pinned = 0;
3264 struct task_struct *p;
3265 long rem_load_move = max_load_move;
3266
3267 if (max_load_move == 0)
3268 goto out;
3269
3270 pinned = 1;
3271
3272 /*
3273 * Start the load-balancing iterator:
3274 */
3275 p = iterator->start(iterator->arg);
3276next:
3277 if (!p || loops++ > sysctl_sched_nr_migrate)
3278 goto out;
3279
3280 if ((p->se.load.weight >> 1) > rem_load_move ||
3281 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3282 p = iterator->next(iterator->arg);
3283 goto next;
3284 }
3285
3286 pull_task(busiest, p, this_rq, this_cpu);
3287 pulled++;
3288 rem_load_move -= p->se.load.weight;
3289
3290#ifdef CONFIG_PREEMPT
3291 /*
3292 * NEWIDLE balancing is a source of latency, so preemptible kernels
3293 * will stop after the first task is pulled to minimize the critical
3294 * section.
3295 */
3296 if (idle == CPU_NEWLY_IDLE)
3297 goto out;
3298#endif
3299
3300 /*
3301 * We only want to steal up to the prescribed amount of weighted load.
3302 */
3303 if (rem_load_move > 0) {
3304 if (p->prio < *this_best_prio)
3305 *this_best_prio = p->prio;
3306 p = iterator->next(iterator->arg);
3307 goto next;
3308 }
3309out:
3310 /*
3311 * Right now, this is one of only two places pull_task() is called,
3312 * so we can safely collect pull_task() stats here rather than
3313 * inside pull_task().
3314 */
3315 schedstat_add(sd, lb_gained[idle], pulled);
3316
3317 if (all_pinned)
3318 *all_pinned = pinned;
3319
3320 return max_load_move - rem_load_move;
3321}
3322
3323/*
3324 * move_tasks tries to move up to max_load_move weighted load from busiest to
3325 * this_rq, as part of a balancing operation within domain "sd".
3326 * Returns 1 if successful and 0 otherwise.
3327 *
3328 * Called with both runqueues locked.
3329 */
3330static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3331 unsigned long max_load_move,
3332 struct sched_domain *sd, enum cpu_idle_type idle,
3333 int *all_pinned)
3334{
3335 const struct sched_class *class = sched_class_highest;
3336 unsigned long total_load_moved = 0;
3337 int this_best_prio = this_rq->curr->prio;
3338
3339 do {
3340 total_load_moved +=
3341 class->load_balance(this_rq, this_cpu, busiest,
3342 max_load_move - total_load_moved,
3343 sd, idle, all_pinned, &this_best_prio);
3344 class = class->next;
3345
3346#ifdef CONFIG_PREEMPT
3347 /*
3348 * NEWIDLE balancing is a source of latency, so preemptible
3349 * kernels will stop after the first task is pulled to minimize
3350 * the critical section.
3351 */
3352 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3353 break;
3354#endif
3355 } while (class && max_load_move > total_load_moved);
3356
3357 return total_load_moved > 0;
3358}
3359
3360static int
3361iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3362 struct sched_domain *sd, enum cpu_idle_type idle,
3363 struct rq_iterator *iterator)
3364{
3365 struct task_struct *p = iterator->start(iterator->arg);
3366 int pinned = 0;
3367
3368 while (p) {
3369 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3370 pull_task(busiest, p, this_rq, this_cpu);
3371 /*
3372 * Right now, this is only the second place pull_task()
3373 * is called, so we can safely collect pull_task()
3374 * stats here rather than inside pull_task().
3375 */
3376 schedstat_inc(sd, lb_gained[idle]);
3377
3378 return 1;
3379 }
3380 p = iterator->next(iterator->arg);
3381 }
3382
3383 return 0;
3384}
3385
3386/*
3387 * move_one_task tries to move exactly one task from busiest to this_rq, as
3388 * part of active balancing operations within "domain".
3389 * Returns 1 if successful and 0 otherwise.
3390 *
3391 * Called with both runqueues locked.
3392 */
3393static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3394 struct sched_domain *sd, enum cpu_idle_type idle)
3395{
3396 const struct sched_class *class;
3397
3398 for_each_class(class) {
3399 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3400 return 1;
3401 }
3402
3403 return 0;
3404}
3405/********** Helpers for find_busiest_group ************************/
3406/*
3407 * sd_lb_stats - Structure to store the statistics of a sched_domain
3408 * during load balancing.
3409 */
3410struct sd_lb_stats {
3411 struct sched_group *busiest; /* Busiest group in this sd */
3412 struct sched_group *this; /* Local group in this sd */
3413 unsigned long total_load; /* Total load of all groups in sd */
3414 unsigned long total_pwr; /* Total power of all groups in sd */
3415 unsigned long avg_load; /* Average load across all groups in sd */
3416
3417 /** Statistics of this group */
3418 unsigned long this_load;
3419 unsigned long this_load_per_task;
3420 unsigned long this_nr_running;
3421
3422 /* Statistics of the busiest group */
3423 unsigned long max_load;
3424 unsigned long busiest_load_per_task;
3425 unsigned long busiest_nr_running;
3426
3427 int group_imb; /* Is there imbalance in this sd */
3428#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3429 int power_savings_balance; /* Is powersave balance needed for this sd */
3430 struct sched_group *group_min; /* Least loaded group in sd */
3431 struct sched_group *group_leader; /* Group which relieves group_min */
3432 unsigned long min_load_per_task; /* load_per_task in group_min */
3433 unsigned long leader_nr_running; /* Nr running of group_leader */
3434 unsigned long min_nr_running; /* Nr running of group_min */
3435#endif
3436};
3437
3438/*
3439 * sg_lb_stats - stats of a sched_group required for load_balancing
3440 */
3441struct sg_lb_stats {
3442 unsigned long avg_load; /*Avg load across the CPUs of the group */
3443 unsigned long group_load; /* Total load over the CPUs of the group */
3444 unsigned long sum_nr_running; /* Nr tasks running in the group */
3445 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3446 unsigned long group_capacity;
3447 int group_imb; /* Is there an imbalance in the group ? */
3448};
3449
3450/**
3451 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3452 * @group: The group whose first cpu is to be returned.
3453 */
3454static inline unsigned int group_first_cpu(struct sched_group *group)
3455{
3456 return cpumask_first(sched_group_cpus(group));
3457}
3458
3459/**
3460 * get_sd_load_idx - Obtain the load index for a given sched domain.
3461 * @sd: The sched_domain whose load_idx is to be obtained.
3462 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3463 */
3464static inline int get_sd_load_idx(struct sched_domain *sd,
3465 enum cpu_idle_type idle)
3466{
3467 int load_idx;
3468
3469 switch (idle) {
3470 case CPU_NOT_IDLE:
3471 load_idx = sd->busy_idx;
3472 break;
3473
3474 case CPU_NEWLY_IDLE:
3475 load_idx = sd->newidle_idx;
3476 break;
3477 default:
3478 load_idx = sd->idle_idx;
3479 break;
3480 }
3481
3482 return load_idx;
3483}
3484
3485
3486#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3487/**
3488 * init_sd_power_savings_stats - Initialize power savings statistics for
3489 * the given sched_domain, during load balancing.
3490 *
3491 * @sd: Sched domain whose power-savings statistics are to be initialized.
3492 * @sds: Variable containing the statistics for sd.
3493 * @idle: Idle status of the CPU at which we're performing load-balancing.
3494 */
3495static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3496 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3497{
3498 /*
3499 * Busy processors will not participate in power savings
3500 * balance.
3501 */
3502 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3503 sds->power_savings_balance = 0;
3504 else {
3505 sds->power_savings_balance = 1;
3506 sds->min_nr_running = ULONG_MAX;
3507 sds->leader_nr_running = 0;
3508 }
3509}
3510
3511/**
3512 * update_sd_power_savings_stats - Update the power saving stats for a
3513 * sched_domain while performing load balancing.
3514 *
3515 * @group: sched_group belonging to the sched_domain under consideration.
3516 * @sds: Variable containing the statistics of the sched_domain
3517 * @local_group: Does group contain the CPU for which we're performing
3518 * load balancing ?
3519 * @sgs: Variable containing the statistics of the group.
3520 */
3521static inline void update_sd_power_savings_stats(struct sched_group *group,
3522 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3523{
3524
3525 if (!sds->power_savings_balance)
3526 return;
3527
3528 /*
3529 * If the local group is idle or completely loaded
3530 * no need to do power savings balance at this domain
3531 */
3532 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3533 !sds->this_nr_running))
3534 sds->power_savings_balance = 0;
3535
3536 /*
3537 * If a group is already running at full capacity or idle,
3538 * don't include that group in power savings calculations
3539 */
3540 if (!sds->power_savings_balance ||
3541 sgs->sum_nr_running >= sgs->group_capacity ||
3542 !sgs->sum_nr_running)
3543 return;
3544
3545 /*
3546 * Calculate the group which has the least non-idle load.
3547 * This is the group from where we need to pick up the load
3548 * for saving power
3549 */
3550 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3551 (sgs->sum_nr_running == sds->min_nr_running &&
3552 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3553 sds->group_min = group;
3554 sds->min_nr_running = sgs->sum_nr_running;
3555 sds->min_load_per_task = sgs->sum_weighted_load /
3556 sgs->sum_nr_running;
3557 }
3558
3559 /*
3560 * Calculate the group which is almost near its
3561 * capacity but still has some space to pick up some load
3562 * from other group and save more power
3563 */
3564 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3565 return;
3566
3567 if (sgs->sum_nr_running > sds->leader_nr_running ||
3568 (sgs->sum_nr_running == sds->leader_nr_running &&
3569 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3570 sds->group_leader = group;
3571 sds->leader_nr_running = sgs->sum_nr_running;
3572 }
3573}
3574
3575/**
3576 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3577 * @sds: Variable containing the statistics of the sched_domain
3578 * under consideration.
3579 * @this_cpu: Cpu at which we're currently performing load-balancing.
3580 * @imbalance: Variable to store the imbalance.
3581 *
3582 * Description:
3583 * Check if we have potential to perform some power-savings balance.
3584 * If yes, set the busiest group to be the least loaded group in the
3585 * sched_domain, so that it's CPUs can be put to idle.
3586 *
3587 * Returns 1 if there is potential to perform power-savings balance.
3588 * Else returns 0.
3589 */
3590static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3591 int this_cpu, unsigned long *imbalance)
3592{
3593 if (!sds->power_savings_balance)
3594 return 0;
3595
3596 if (sds->this != sds->group_leader ||
3597 sds->group_leader == sds->group_min)
3598 return 0;
3599
3600 *imbalance = sds->min_load_per_task;
3601 sds->busiest = sds->group_min;
3602
3603 return 1;
3604
3605}
3606#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3607static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3608 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3609{
3610 return;
3611}
3612
3613static inline void update_sd_power_savings_stats(struct sched_group *group,
3614 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3615{
3616 return;
3617}
3618
3619static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3620 int this_cpu, unsigned long *imbalance)
3621{
3622 return 0;
3623}
3624#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3625
3626
3627unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3628{
3629 return SCHED_LOAD_SCALE;
3630}
3631
3632unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3633{
3634 return default_scale_freq_power(sd, cpu);
3635}
3636
3637unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3638{
3639 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3640 unsigned long smt_gain = sd->smt_gain;
3641
3642 smt_gain /= weight;
3643
3644 return smt_gain;
3645}
3646
3647unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3648{
3649 return default_scale_smt_power(sd, cpu);
3650}
3651
3652unsigned long scale_rt_power(int cpu)
3653{
3654 struct rq *rq = cpu_rq(cpu);
3655 u64 total, available;
3656
3657 sched_avg_update(rq);
3658
3659 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3660 available = total - rq->rt_avg;
3661
3662 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3663 total = SCHED_LOAD_SCALE;
3664
3665 total >>= SCHED_LOAD_SHIFT;
3666
3667 return div_u64(available, total);
3668}
3669
3670static void update_cpu_power(struct sched_domain *sd, int cpu)
3671{
3672 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3673 unsigned long power = SCHED_LOAD_SCALE;
3674 struct sched_group *sdg = sd->groups;
3675
3676 if (sched_feat(ARCH_POWER))
3677 power *= arch_scale_freq_power(sd, cpu);
3678 else
3679 power *= default_scale_freq_power(sd, cpu);
3680
3681 power >>= SCHED_LOAD_SHIFT;
3682
3683 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3684 if (sched_feat(ARCH_POWER))
3685 power *= arch_scale_smt_power(sd, cpu);
3686 else
3687 power *= default_scale_smt_power(sd, cpu);
3688
3689 power >>= SCHED_LOAD_SHIFT;
3690 }
3691
3692 power *= scale_rt_power(cpu);
3693 power >>= SCHED_LOAD_SHIFT;
3694
3695 if (!power)
3696 power = 1;
3697
3698 sdg->cpu_power = power;
3699}
3700
3701static void update_group_power(struct sched_domain *sd, int cpu)
3702{
3703 struct sched_domain *child = sd->child;
3704 struct sched_group *group, *sdg = sd->groups;
3705 unsigned long power;
3706
3707 if (!child) {
3708 update_cpu_power(sd, cpu);
3709 return;
3710 }
3711
3712 power = 0;
3713
3714 group = child->groups;
3715 do {
3716 power += group->cpu_power;
3717 group = group->next;
3718 } while (group != child->groups);
3719
3720 sdg->cpu_power = power;
3721}
3722
3723/**
3724 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3725 * @sd: The sched_domain whose statistics are to be updated.
3726 * @group: sched_group whose statistics are to be updated.
3727 * @this_cpu: Cpu for which load balance is currently performed.
3728 * @idle: Idle status of this_cpu
3729 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3730 * @sd_idle: Idle status of the sched_domain containing group.
3731 * @local_group: Does group contain this_cpu.
3732 * @cpus: Set of cpus considered for load balancing.
3733 * @balance: Should we balance.
3734 * @sgs: variable to hold the statistics for this group.
3735 */
3736static inline void update_sg_lb_stats(struct sched_domain *sd,
3737 struct sched_group *group, int this_cpu,
3738 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3739 int local_group, const struct cpumask *cpus,
3740 int *balance, struct sg_lb_stats *sgs)
3741{
3742 unsigned long load, max_cpu_load, min_cpu_load;
3743 int i;
3744 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3745 unsigned long sum_avg_load_per_task;
3746 unsigned long avg_load_per_task;
3747
3748 if (local_group) {
3749 balance_cpu = group_first_cpu(group);
3750 if (balance_cpu == this_cpu)
3751 update_group_power(sd, this_cpu);
3752 }
3753
3754 /* Tally up the load of all CPUs in the group */
3755 sum_avg_load_per_task = avg_load_per_task = 0;
3756 max_cpu_load = 0;
3757 min_cpu_load = ~0UL;
3758
3759 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3760 struct rq *rq = cpu_rq(i);
3761
3762 if (*sd_idle && rq->nr_running)
3763 *sd_idle = 0;
3764
3765 /* Bias balancing toward cpus of our domain */
3766 if (local_group) {
3767 if (idle_cpu(i) && !first_idle_cpu) {
3768 first_idle_cpu = 1;
3769 balance_cpu = i;
3770 }
3771
3772 load = target_load(i, load_idx);
3773 } else {
3774 load = source_load(i, load_idx);
3775 if (load > max_cpu_load)
3776 max_cpu_load = load;
3777 if (min_cpu_load > load)
3778 min_cpu_load = load;
3779 }
3780
3781 sgs->group_load += load;
3782 sgs->sum_nr_running += rq->nr_running;
3783 sgs->sum_weighted_load += weighted_cpuload(i);
3784
3785 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3786 }
3787
3788 /*
3789 * First idle cpu or the first cpu(busiest) in this sched group
3790 * is eligible for doing load balancing at this and above
3791 * domains. In the newly idle case, we will allow all the cpu's
3792 * to do the newly idle load balance.
3793 */
3794 if (idle != CPU_NEWLY_IDLE && local_group &&
3795 balance_cpu != this_cpu && balance) {
3796 *balance = 0;
3797 return;
3798 }
3799
3800 /* Adjust by relative CPU power of the group */
3801 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3802
3803
3804 /*
3805 * Consider the group unbalanced when the imbalance is larger
3806 * than the average weight of two tasks.
3807 *
3808 * APZ: with cgroup the avg task weight can vary wildly and
3809 * might not be a suitable number - should we keep a
3810 * normalized nr_running number somewhere that negates
3811 * the hierarchy?
3812 */
3813 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3814 group->cpu_power;
3815
3816 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3817 sgs->group_imb = 1;
3818
3819 sgs->group_capacity =
3820 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3821}
3822
3823/**
3824 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3825 * @sd: sched_domain whose statistics are to be updated.
3826 * @this_cpu: Cpu for which load balance is currently performed.
3827 * @idle: Idle status of this_cpu
3828 * @sd_idle: Idle status of the sched_domain containing group.
3829 * @cpus: Set of cpus considered for load balancing.
3830 * @balance: Should we balance.
3831 * @sds: variable to hold the statistics for this sched_domain.
3832 */
3833static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3834 enum cpu_idle_type idle, int *sd_idle,
3835 const struct cpumask *cpus, int *balance,
3836 struct sd_lb_stats *sds)
3837{
3838 struct sched_domain *child = sd->child;
3839 struct sched_group *group = sd->groups;
3840 struct sg_lb_stats sgs;
3841 int load_idx, prefer_sibling = 0;
3842
3843 if (child && child->flags & SD_PREFER_SIBLING)
3844 prefer_sibling = 1;
3845
3846 init_sd_power_savings_stats(sd, sds, idle);
3847 load_idx = get_sd_load_idx(sd, idle);
3848
3849 do {
3850 int local_group;
3851
3852 local_group = cpumask_test_cpu(this_cpu,
3853 sched_group_cpus(group));
3854 memset(&sgs, 0, sizeof(sgs));
3855 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3856 local_group, cpus, balance, &sgs);
3857
3858 if (local_group && balance && !(*balance))
3859 return;
3860
3861 sds->total_load += sgs.group_load;
3862 sds->total_pwr += group->cpu_power;
3863
3864 /*
3865 * In case the child domain prefers tasks go to siblings
3866 * first, lower the group capacity to one so that we'll try
3867 * and move all the excess tasks away.
3868 */
3869 if (prefer_sibling)
3870 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3871
3872 if (local_group) {
3873 sds->this_load = sgs.avg_load;
3874 sds->this = group;
3875 sds->this_nr_running = sgs.sum_nr_running;
3876 sds->this_load_per_task = sgs.sum_weighted_load;
3877 } else if (sgs.avg_load > sds->max_load &&
3878 (sgs.sum_nr_running > sgs.group_capacity ||
3879 sgs.group_imb)) {
3880 sds->max_load = sgs.avg_load;
3881 sds->busiest = group;
3882 sds->busiest_nr_running = sgs.sum_nr_running;
3883 sds->busiest_load_per_task = sgs.sum_weighted_load;
3884 sds->group_imb = sgs.group_imb;
3885 }
3886
3887 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3888 group = group->next;
3889 } while (group != sd->groups);
3890}
3891
3892/**
3893 * fix_small_imbalance - Calculate the minor imbalance that exists
3894 * amongst the groups of a sched_domain, during
3895 * load balancing.
3896 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3897 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3898 * @imbalance: Variable to store the imbalance.
3899 */
3900static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3901 int this_cpu, unsigned long *imbalance)
3902{
3903 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3904 unsigned int imbn = 2;
3905
3906 if (sds->this_nr_running) {
3907 sds->this_load_per_task /= sds->this_nr_running;
3908 if (sds->busiest_load_per_task >
3909 sds->this_load_per_task)
3910 imbn = 1;
3911 } else
3912 sds->this_load_per_task =
3913 cpu_avg_load_per_task(this_cpu);
3914
3915 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3916 sds->busiest_load_per_task * imbn) {
3917 *imbalance = sds->busiest_load_per_task;
3918 return;
3919 }
3920
3921 /*
3922 * OK, we don't have enough imbalance to justify moving tasks,
3923 * however we may be able to increase total CPU power used by
3924 * moving them.
3925 */
3926
3927 pwr_now += sds->busiest->cpu_power *
3928 min(sds->busiest_load_per_task, sds->max_load);
3929 pwr_now += sds->this->cpu_power *
3930 min(sds->this_load_per_task, sds->this_load);
3931 pwr_now /= SCHED_LOAD_SCALE;
3932
3933 /* Amount of load we'd subtract */
3934 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3935 sds->busiest->cpu_power;
3936 if (sds->max_load > tmp)
3937 pwr_move += sds->busiest->cpu_power *
3938 min(sds->busiest_load_per_task, sds->max_load - tmp);
3939
3940 /* Amount of load we'd add */
3941 if (sds->max_load * sds->busiest->cpu_power <
3942 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3943 tmp = (sds->max_load * sds->busiest->cpu_power) /
3944 sds->this->cpu_power;
3945 else
3946 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3947 sds->this->cpu_power;
3948 pwr_move += sds->this->cpu_power *
3949 min(sds->this_load_per_task, sds->this_load + tmp);
3950 pwr_move /= SCHED_LOAD_SCALE;
3951
3952 /* Move if we gain throughput */
3953 if (pwr_move > pwr_now)
3954 *imbalance = sds->busiest_load_per_task;
3955}
3956
3957/**
3958 * calculate_imbalance - Calculate the amount of imbalance present within the
3959 * groups of a given sched_domain during load balance.
3960 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3961 * @this_cpu: Cpu for which currently load balance is being performed.
3962 * @imbalance: The variable to store the imbalance.
3963 */
3964static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3965 unsigned long *imbalance)
3966{
3967 unsigned long max_pull;
3968 /*
3969 * In the presence of smp nice balancing, certain scenarios can have
3970 * max load less than avg load(as we skip the groups at or below
3971 * its cpu_power, while calculating max_load..)
3972 */
3973 if (sds->max_load < sds->avg_load) {
3974 *imbalance = 0;
3975 return fix_small_imbalance(sds, this_cpu, imbalance);
3976 }
3977
3978 /* Don't want to pull so many tasks that a group would go idle */
3979 max_pull = min(sds->max_load - sds->avg_load,
3980 sds->max_load - sds->busiest_load_per_task);
3981
3982 /* How much load to actually move to equalise the imbalance */
3983 *imbalance = min(max_pull * sds->busiest->cpu_power,
3984 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3985 / SCHED_LOAD_SCALE;
3986
3987 /*
3988 * if *imbalance is less than the average load per runnable task
3989 * there is no gaurantee that any tasks will be moved so we'll have
3990 * a think about bumping its value to force at least one task to be
3991 * moved
3992 */
3993 if (*imbalance < sds->busiest_load_per_task)
3994 return fix_small_imbalance(sds, this_cpu, imbalance);
3995
3996}
3997/******* find_busiest_group() helpers end here *********************/
3998
3999/**
4000 * find_busiest_group - Returns the busiest group within the sched_domain
4001 * if there is an imbalance. If there isn't an imbalance, and
4002 * the user has opted for power-savings, it returns a group whose
4003 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4004 * such a group exists.
4005 *
4006 * Also calculates the amount of weighted load which should be moved
4007 * to restore balance.
4008 *
4009 * @sd: The sched_domain whose busiest group is to be returned.
4010 * @this_cpu: The cpu for which load balancing is currently being performed.
4011 * @imbalance: Variable which stores amount of weighted load which should
4012 * be moved to restore balance/put a group to idle.
4013 * @idle: The idle status of this_cpu.
4014 * @sd_idle: The idleness of sd
4015 * @cpus: The set of CPUs under consideration for load-balancing.
4016 * @balance: Pointer to a variable indicating if this_cpu
4017 * is the appropriate cpu to perform load balancing at this_level.
4018 *
4019 * Returns: - the busiest group if imbalance exists.
4020 * - If no imbalance and user has opted for power-savings balance,
4021 * return the least loaded group whose CPUs can be
4022 * put to idle by rebalancing its tasks onto our group.
4023 */
4024static struct sched_group *
4025find_busiest_group(struct sched_domain *sd, int this_cpu,
4026 unsigned long *imbalance, enum cpu_idle_type idle,
4027 int *sd_idle, const struct cpumask *cpus, int *balance)
4028{
4029 struct sd_lb_stats sds;
4030
4031 memset(&sds, 0, sizeof(sds));
4032
4033 /*
4034 * Compute the various statistics relavent for load balancing at
4035 * this level.
4036 */
4037 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4038 balance, &sds);
4039
4040 /* Cases where imbalance does not exist from POV of this_cpu */
4041 /* 1) this_cpu is not the appropriate cpu to perform load balancing
4042 * at this level.
4043 * 2) There is no busy sibling group to pull from.
4044 * 3) This group is the busiest group.
4045 * 4) This group is more busy than the avg busieness at this
4046 * sched_domain.
4047 * 5) The imbalance is within the specified limit.
4048 * 6) Any rebalance would lead to ping-pong
4049 */
4050 if (balance && !(*balance))
4051 goto ret;
4052
4053 if (!sds.busiest || sds.busiest_nr_running == 0)
4054 goto out_balanced;
4055
4056 if (sds.this_load >= sds.max_load)
4057 goto out_balanced;
4058
4059 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4060
4061 if (sds.this_load >= sds.avg_load)
4062 goto out_balanced;
4063
4064 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4065 goto out_balanced;
4066
4067 sds.busiest_load_per_task /= sds.busiest_nr_running;
4068 if (sds.group_imb)
4069 sds.busiest_load_per_task =
4070 min(sds.busiest_load_per_task, sds.avg_load);
4071
4072 /*
4073 * We're trying to get all the cpus to the average_load, so we don't
4074 * want to push ourselves above the average load, nor do we wish to
4075 * reduce the max loaded cpu below the average load, as either of these
4076 * actions would just result in more rebalancing later, and ping-pong
4077 * tasks around. Thus we look for the minimum possible imbalance.
4078 * Negative imbalances (*we* are more loaded than anyone else) will
4079 * be counted as no imbalance for these purposes -- we can't fix that
4080 * by pulling tasks to us. Be careful of negative numbers as they'll
4081 * appear as very large values with unsigned longs.
4082 */
4083 if (sds.max_load <= sds.busiest_load_per_task)
4084 goto out_balanced;
4085
4086 /* Looks like there is an imbalance. Compute it */
4087 calculate_imbalance(&sds, this_cpu, imbalance);
4088 return sds.busiest;
4089
4090out_balanced:
4091 /*
4092 * There is no obvious imbalance. But check if we can do some balancing
4093 * to save power.
4094 */
4095 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4096 return sds.busiest;
4097ret:
4098 *imbalance = 0;
4099 return NULL;
4100}
4101
4102/*
4103 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4104 */
4105static struct rq *
4106find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4107 unsigned long imbalance, const struct cpumask *cpus)
4108{
4109 struct rq *busiest = NULL, *rq;
4110 unsigned long max_load = 0;
4111 int i;
4112
4113 for_each_cpu(i, sched_group_cpus(group)) {
4114 unsigned long power = power_of(i);
4115 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4116 unsigned long wl;
4117
4118 if (!cpumask_test_cpu(i, cpus))
4119 continue;
4120
4121 rq = cpu_rq(i);
4122 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4123 wl /= power;
4124
4125 if (capacity && rq->nr_running == 1 && wl > imbalance)
4126 continue;
4127
4128 if (wl > max_load) {
4129 max_load = wl;
4130 busiest = rq;
4131 }
4132 }
4133
4134 return busiest;
4135}
4136
4137/*
4138 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4139 * so long as it is large enough.
4140 */
4141#define MAX_PINNED_INTERVAL 512
4142
4143/* Working cpumask for load_balance and load_balance_newidle. */
4144static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4145
4146/*
4147 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4148 * tasks if there is an imbalance.
4149 */
4150static int load_balance(int this_cpu, struct rq *this_rq,
4151 struct sched_domain *sd, enum cpu_idle_type idle,
4152 int *balance)
4153{
4154 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4155 struct sched_group *group;
4156 unsigned long imbalance;
4157 struct rq *busiest;
4158 unsigned long flags;
4159 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4160
4161 cpumask_copy(cpus, cpu_active_mask);
4162
4163 /*
4164 * When power savings policy is enabled for the parent domain, idle
4165 * sibling can pick up load irrespective of busy siblings. In this case,
4166 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4167 * portraying it as CPU_NOT_IDLE.
4168 */
4169 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4170 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4171 sd_idle = 1;
4172
4173 schedstat_inc(sd, lb_count[idle]);
4174
4175redo:
4176 update_shares(sd);
4177 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4178 cpus, balance);
4179
4180 if (*balance == 0)
4181 goto out_balanced;
4182
4183 if (!group) {
4184 schedstat_inc(sd, lb_nobusyg[idle]);
4185 goto out_balanced;
4186 }
4187
4188 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4189 if (!busiest) {
4190 schedstat_inc(sd, lb_nobusyq[idle]);
4191 goto out_balanced;
4192 }
4193
4194 BUG_ON(busiest == this_rq);
4195
4196 schedstat_add(sd, lb_imbalance[idle], imbalance);
4197
4198 ld_moved = 0;
4199 if (busiest->nr_running > 1) {
4200 /*
4201 * Attempt to move tasks. If find_busiest_group has found
4202 * an imbalance but busiest->nr_running <= 1, the group is
4203 * still unbalanced. ld_moved simply stays zero, so it is
4204 * correctly treated as an imbalance.
4205 */
4206 local_irq_save(flags);
4207 double_rq_lock(this_rq, busiest);
4208 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4209 imbalance, sd, idle, &all_pinned);
4210 double_rq_unlock(this_rq, busiest);
4211 local_irq_restore(flags);
4212
4213 /*
4214 * some other cpu did the load balance for us.
4215 */
4216 if (ld_moved && this_cpu != smp_processor_id())
4217 resched_cpu(this_cpu);
4218
4219 /* All tasks on this runqueue were pinned by CPU affinity */
4220 if (unlikely(all_pinned)) {
4221 cpumask_clear_cpu(cpu_of(busiest), cpus);
4222 if (!cpumask_empty(cpus))
4223 goto redo;
4224 goto out_balanced;
4225 }
4226 }
4227
4228 if (!ld_moved) {
4229 schedstat_inc(sd, lb_failed[idle]);
4230 sd->nr_balance_failed++;
4231
4232 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4233
4234 raw_spin_lock_irqsave(&busiest->lock, flags);
4235
4236 /* don't kick the migration_thread, if the curr
4237 * task on busiest cpu can't be moved to this_cpu
4238 */
4239 if (!cpumask_test_cpu(this_cpu,
4240 &busiest->curr->cpus_allowed)) {
4241 raw_spin_unlock_irqrestore(&busiest->lock,
4242 flags);
4243 all_pinned = 1;
4244 goto out_one_pinned;
4245 }
4246
4247 if (!busiest->active_balance) {
4248 busiest->active_balance = 1;
4249 busiest->push_cpu = this_cpu;
4250 active_balance = 1;
4251 }
4252 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4253 if (active_balance)
4254 wake_up_process(busiest->migration_thread);
4255
4256 /*
4257 * We've kicked active balancing, reset the failure
4258 * counter.
4259 */
4260 sd->nr_balance_failed = sd->cache_nice_tries+1;
4261 }
4262 } else
4263 sd->nr_balance_failed = 0;
4264
4265 if (likely(!active_balance)) {
4266 /* We were unbalanced, so reset the balancing interval */
4267 sd->balance_interval = sd->min_interval;
4268 } else {
4269 /*
4270 * If we've begun active balancing, start to back off. This
4271 * case may not be covered by the all_pinned logic if there
4272 * is only 1 task on the busy runqueue (because we don't call
4273 * move_tasks).
4274 */
4275 if (sd->balance_interval < sd->max_interval)
4276 sd->balance_interval *= 2;
4277 }
4278
4279 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4280 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4281 ld_moved = -1;
4282
4283 goto out;
4284
4285out_balanced:
4286 schedstat_inc(sd, lb_balanced[idle]);
4287
4288 sd->nr_balance_failed = 0;
4289
4290out_one_pinned:
4291 /* tune up the balancing interval */
4292 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4293 (sd->balance_interval < sd->max_interval))
4294 sd->balance_interval *= 2;
4295
4296 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4297 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4298 ld_moved = -1;
4299 else
4300 ld_moved = 0;
4301out:
4302 if (ld_moved)
4303 update_shares(sd);
4304 return ld_moved;
4305}
4306
4307/*
4308 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4309 * tasks if there is an imbalance.
4310 *
4311 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4312 * this_rq is locked.
4313 */
4314static int
4315load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4316{
4317 struct sched_group *group;
4318 struct rq *busiest = NULL;
4319 unsigned long imbalance;
4320 int ld_moved = 0;
4321 int sd_idle = 0;
4322 int all_pinned = 0;
4323 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4324
4325 cpumask_copy(cpus, cpu_active_mask);
4326
4327 /*
4328 * When power savings policy is enabled for the parent domain, idle
4329 * sibling can pick up load irrespective of busy siblings. In this case,
4330 * let the state of idle sibling percolate up as IDLE, instead of
4331 * portraying it as CPU_NOT_IDLE.
4332 */
4333 if (sd->flags & SD_SHARE_CPUPOWER &&
4334 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4335 sd_idle = 1;
4336
4337 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4338redo:
4339 update_shares_locked(this_rq, sd);
4340 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4341 &sd_idle, cpus, NULL);
4342 if (!group) {
4343 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4344 goto out_balanced;
4345 }
4346
4347 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4348 if (!busiest) {
4349 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4350 goto out_balanced;
4351 }
4352
4353 BUG_ON(busiest == this_rq);
4354
4355 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4356
4357 ld_moved = 0;
4358 if (busiest->nr_running > 1) {
4359 /* Attempt to move tasks */
4360 double_lock_balance(this_rq, busiest);
4361 /* this_rq->clock is already updated */
4362 update_rq_clock(busiest);
4363 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4364 imbalance, sd, CPU_NEWLY_IDLE,
4365 &all_pinned);
4366 double_unlock_balance(this_rq, busiest);
4367
4368 if (unlikely(all_pinned)) {
4369 cpumask_clear_cpu(cpu_of(busiest), cpus);
4370 if (!cpumask_empty(cpus))
4371 goto redo;
4372 }
4373 }
4374
4375 if (!ld_moved) {
4376 int active_balance = 0;
4377
4378 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4379 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4380 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4381 return -1;
4382
4383 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4384 return -1;
4385
4386 if (sd->nr_balance_failed++ < 2)
4387 return -1;
4388
4389 /*
4390 * The only task running in a non-idle cpu can be moved to this
4391 * cpu in an attempt to completely freeup the other CPU
4392 * package. The same method used to move task in load_balance()
4393 * have been extended for load_balance_newidle() to speedup
4394 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
4395 *
4396 * The package power saving logic comes from
4397 * find_busiest_group(). If there are no imbalance, then
4398 * f_b_g() will return NULL. However when sched_mc={1,2} then
4399 * f_b_g() will select a group from which a running task may be
4400 * pulled to this cpu in order to make the other package idle.
4401 * If there is no opportunity to make a package idle and if
4402 * there are no imbalance, then f_b_g() will return NULL and no
4403 * action will be taken in load_balance_newidle().
4404 *
4405 * Under normal task pull operation due to imbalance, there
4406 * will be more than one task in the source run queue and
4407 * move_tasks() will succeed. ld_moved will be true and this
4408 * active balance code will not be triggered.
4409 */
4410
4411 /* Lock busiest in correct order while this_rq is held */
4412 double_lock_balance(this_rq, busiest);
4413
4414 /*
4415 * don't kick the migration_thread, if the curr
4416 * task on busiest cpu can't be moved to this_cpu
4417 */
4418 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4419 double_unlock_balance(this_rq, busiest);
4420 all_pinned = 1;
4421 return ld_moved;
4422 }
4423
4424 if (!busiest->active_balance) {
4425 busiest->active_balance = 1;
4426 busiest->push_cpu = this_cpu;
4427 active_balance = 1;
4428 }
4429
4430 double_unlock_balance(this_rq, busiest);
4431 /*
4432 * Should not call ttwu while holding a rq->lock
4433 */
4434 raw_spin_unlock(&this_rq->lock);
4435 if (active_balance)
4436 wake_up_process(busiest->migration_thread);
4437 raw_spin_lock(&this_rq->lock);
4438
4439 } else
4440 sd->nr_balance_failed = 0;
4441
4442 update_shares_locked(this_rq, sd);
4443 return ld_moved;
4444
4445out_balanced:
4446 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4447 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4448 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4449 return -1;
4450 sd->nr_balance_failed = 0;
4451
4452 return 0;
4453}
4454
4455/*
4456 * idle_balance is called by schedule() if this_cpu is about to become
4457 * idle. Attempts to pull tasks from other CPUs.
4458 */
4459static void idle_balance(int this_cpu, struct rq *this_rq)
4460{
4461 struct sched_domain *sd;
4462 int pulled_task = 0;
4463 unsigned long next_balance = jiffies + HZ;
4464
4465 this_rq->idle_stamp = this_rq->clock;
4466
4467 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4468 return;
4469
4470 for_each_domain(this_cpu, sd) {
4471 unsigned long interval;
4472
4473 if (!(sd->flags & SD_LOAD_BALANCE))
4474 continue;
4475
4476 if (sd->flags & SD_BALANCE_NEWIDLE)
4477 /* If we've pulled tasks over stop searching: */
4478 pulled_task = load_balance_newidle(this_cpu, this_rq,
4479 sd);
4480
4481 interval = msecs_to_jiffies(sd->balance_interval);
4482 if (time_after(next_balance, sd->last_balance + interval))
4483 next_balance = sd->last_balance + interval;
4484 if (pulled_task) {
4485 this_rq->idle_stamp = 0;
4486 break;
4487 }
4488 }
4489 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4490 /*
4491 * We are going idle. next_balance may be set based on
4492 * a busy processor. So reset next_balance.
4493 */
4494 this_rq->next_balance = next_balance;
4495 }
4496}
4497
4498/*
4499 * active_load_balance is run by migration threads. It pushes running tasks
4500 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4501 * running on each physical CPU where possible, and avoids physical /
4502 * logical imbalances.
4503 *
4504 * Called with busiest_rq locked.
4505 */
4506static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4507{
4508 int target_cpu = busiest_rq->push_cpu;
4509 struct sched_domain *sd;
4510 struct rq *target_rq;
4511
4512 /* Is there any task to move? */
4513 if (busiest_rq->nr_running <= 1)
4514 return;
4515
4516 target_rq = cpu_rq(target_cpu);
4517
4518 /*
4519 * This condition is "impossible", if it occurs
4520 * we need to fix it. Originally reported by
4521 * Bjorn Helgaas on a 128-cpu setup.
4522 */
4523 BUG_ON(busiest_rq == target_rq);
4524
4525 /* move a task from busiest_rq to target_rq */
4526 double_lock_balance(busiest_rq, target_rq);
4527 update_rq_clock(busiest_rq);
4528 update_rq_clock(target_rq);
4529
4530 /* Search for an sd spanning us and the target CPU. */
4531 for_each_domain(target_cpu, sd) {
4532 if ((sd->flags & SD_LOAD_BALANCE) &&
4533 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4534 break;
4535 }
4536
4537 if (likely(sd)) {
4538 schedstat_inc(sd, alb_count);
4539
4540 if (move_one_task(target_rq, target_cpu, busiest_rq,
4541 sd, CPU_IDLE))
4542 schedstat_inc(sd, alb_pushed);
4543 else
4544 schedstat_inc(sd, alb_failed);
4545 }
4546 double_unlock_balance(busiest_rq, target_rq);
4547}
4548
4549#ifdef CONFIG_NO_HZ
4550static struct {
4551 atomic_t load_balancer;
4552 cpumask_var_t cpu_mask;
4553 cpumask_var_t ilb_grp_nohz_mask;
4554} nohz ____cacheline_aligned = {
4555 .load_balancer = ATOMIC_INIT(-1),
4556};
4557
4558int get_nohz_load_balancer(void)
4559{
4560 return atomic_read(&nohz.load_balancer);
4561}
4562
4563#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4564/**
4565 * lowest_flag_domain - Return lowest sched_domain containing flag.
4566 * @cpu: The cpu whose lowest level of sched domain is to
4567 * be returned.
4568 * @flag: The flag to check for the lowest sched_domain
4569 * for the given cpu.
4570 *
4571 * Returns the lowest sched_domain of a cpu which contains the given flag.
4572 */
4573static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4574{
4575 struct sched_domain *sd;
4576
4577 for_each_domain(cpu, sd)
4578 if (sd && (sd->flags & flag))
4579 break;
4580
4581 return sd;
4582}
4583
4584/**
4585 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4586 * @cpu: The cpu whose domains we're iterating over.
4587 * @sd: variable holding the value of the power_savings_sd
4588 * for cpu.
4589 * @flag: The flag to filter the sched_domains to be iterated.
4590 *
4591 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4592 * set, starting from the lowest sched_domain to the highest.
4593 */
4594#define for_each_flag_domain(cpu, sd, flag) \
4595 for (sd = lowest_flag_domain(cpu, flag); \
4596 (sd && (sd->flags & flag)); sd = sd->parent)
4597
4598/**
4599 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4600 * @ilb_group: group to be checked for semi-idleness
4601 *
4602 * Returns: 1 if the group is semi-idle. 0 otherwise.
4603 *
4604 * We define a sched_group to be semi idle if it has atleast one idle-CPU
4605 * and atleast one non-idle CPU. This helper function checks if the given
4606 * sched_group is semi-idle or not.
4607 */
4608static inline int is_semi_idle_group(struct sched_group *ilb_group)
4609{
4610 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4611 sched_group_cpus(ilb_group));
4612
4613 /*
4614 * A sched_group is semi-idle when it has atleast one busy cpu
4615 * and atleast one idle cpu.
4616 */
4617 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4618 return 0;
4619
4620 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4621 return 0;
4622
4623 return 1;
4624}
4625/**
4626 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4627 * @cpu: The cpu which is nominating a new idle_load_balancer.
4628 *
4629 * Returns: Returns the id of the idle load balancer if it exists,
4630 * Else, returns >= nr_cpu_ids.
4631 *
4632 * This algorithm picks the idle load balancer such that it belongs to a
4633 * semi-idle powersavings sched_domain. The idea is to try and avoid
4634 * completely idle packages/cores just for the purpose of idle load balancing
4635 * when there are other idle cpu's which are better suited for that job.
4636 */
4637static int find_new_ilb(int cpu)
4638{
4639 struct sched_domain *sd;
4640 struct sched_group *ilb_group;
4641
4642 /*
4643 * Have idle load balancer selection from semi-idle packages only
4644 * when power-aware load balancing is enabled
4645 */
4646 if (!(sched_smt_power_savings || sched_mc_power_savings))
4647 goto out_done;
4648
4649 /*
4650 * Optimize for the case when we have no idle CPUs or only one
4651 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4652 */
4653 if (cpumask_weight(nohz.cpu_mask) < 2)
4654 goto out_done;
4655
4656 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4657 ilb_group = sd->groups;
4658
4659 do {
4660 if (is_semi_idle_group(ilb_group))
4661 return cpumask_first(nohz.ilb_grp_nohz_mask);
4662
4663 ilb_group = ilb_group->next;
4664
4665 } while (ilb_group != sd->groups);
4666 }
4667
4668out_done:
4669 return cpumask_first(nohz.cpu_mask);
4670}
4671#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4672static inline int find_new_ilb(int call_cpu)
4673{
4674 return cpumask_first(nohz.cpu_mask);
4675}
4676#endif
4677
4678/*
4679 * This routine will try to nominate the ilb (idle load balancing)
4680 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4681 * load balancing on behalf of all those cpus. If all the cpus in the system
4682 * go into this tickless mode, then there will be no ilb owner (as there is
4683 * no need for one) and all the cpus will sleep till the next wakeup event
4684 * arrives...
4685 *
4686 * For the ilb owner, tick is not stopped. And this tick will be used
4687 * for idle load balancing. ilb owner will still be part of
4688 * nohz.cpu_mask..
4689 *
4690 * While stopping the tick, this cpu will become the ilb owner if there
4691 * is no other owner. And will be the owner till that cpu becomes busy
4692 * or if all cpus in the system stop their ticks at which point
4693 * there is no need for ilb owner.
4694 *
4695 * When the ilb owner becomes busy, it nominates another owner, during the
4696 * next busy scheduler_tick()
4697 */
4698int select_nohz_load_balancer(int stop_tick)
4699{
4700 int cpu = smp_processor_id();
4701
4702 if (stop_tick) {
4703 cpu_rq(cpu)->in_nohz_recently = 1;
4704
4705 if (!cpu_active(cpu)) {
4706 if (atomic_read(&nohz.load_balancer) != cpu)
4707 return 0;
4708
4709 /*
4710 * If we are going offline and still the leader,
4711 * give up!
4712 */
4713 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4714 BUG();
4715
4716 return 0;
4717 }
4718
4719 cpumask_set_cpu(cpu, nohz.cpu_mask);
4720
4721 /* time for ilb owner also to sleep */
4722 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4723 if (atomic_read(&nohz.load_balancer) == cpu)
4724 atomic_set(&nohz.load_balancer, -1);
4725 return 0;
4726 }
4727
4728 if (atomic_read(&nohz.load_balancer) == -1) {
4729 /* make me the ilb owner */
4730 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4731 return 1;
4732 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4733 int new_ilb;
4734
4735 if (!(sched_smt_power_savings ||
4736 sched_mc_power_savings))
4737 return 1;
4738 /*
4739 * Check to see if there is a more power-efficient
4740 * ilb.
4741 */
4742 new_ilb = find_new_ilb(cpu);
4743 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4744 atomic_set(&nohz.load_balancer, -1);
4745 resched_cpu(new_ilb);
4746 return 0;
4747 }
4748 return 1;
4749 }
4750 } else {
4751 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4752 return 0;
4753
4754 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4755
4756 if (atomic_read(&nohz.load_balancer) == cpu)
4757 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4758 BUG();
4759 }
4760 return 0;
4761}
4762#endif
4763
4764static DEFINE_SPINLOCK(balancing);
4765
4766/*
4767 * It checks each scheduling domain to see if it is due to be balanced,
4768 * and initiates a balancing operation if so.
4769 *
4770 * Balancing parameters are set up in arch_init_sched_domains.
4771 */
4772static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4773{
4774 int balance = 1;
4775 struct rq *rq = cpu_rq(cpu);
4776 unsigned long interval;
4777 struct sched_domain *sd;
4778 /* Earliest time when we have to do rebalance again */
4779 unsigned long next_balance = jiffies + 60*HZ;
4780 int update_next_balance = 0;
4781 int need_serialize;
4782
4783 for_each_domain(cpu, sd) {
4784 if (!(sd->flags & SD_LOAD_BALANCE))
4785 continue;
4786
4787 interval = sd->balance_interval;
4788 if (idle != CPU_IDLE)
4789 interval *= sd->busy_factor;
4790
4791 /* scale ms to jiffies */
4792 interval = msecs_to_jiffies(interval);
4793 if (unlikely(!interval))
4794 interval = 1;
4795 if (interval > HZ*NR_CPUS/10)
4796 interval = HZ*NR_CPUS/10;
4797
4798 need_serialize = sd->flags & SD_SERIALIZE;
4799
4800 if (need_serialize) {
4801 if (!spin_trylock(&balancing))
4802 goto out;
4803 }
4804
4805 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4806 if (load_balance(cpu, rq, sd, idle, &balance)) {
4807 /*
4808 * We've pulled tasks over so either we're no
4809 * longer idle, or one of our SMT siblings is
4810 * not idle.
4811 */
4812 idle = CPU_NOT_IDLE;
4813 }
4814 sd->last_balance = jiffies;
4815 }
4816 if (need_serialize)
4817 spin_unlock(&balancing);
4818out:
4819 if (time_after(next_balance, sd->last_balance + interval)) {
4820 next_balance = sd->last_balance + interval;
4821 update_next_balance = 1;
4822 }
4823
4824 /*
4825 * Stop the load balance at this level. There is another
4826 * CPU in our sched group which is doing load balancing more
4827 * actively.
4828 */
4829 if (!balance)
4830 break;
4831 }
4832
4833 /*
4834 * next_balance will be updated only when there is a need.
4835 * When the cpu is attached to null domain for ex, it will not be
4836 * updated.
4837 */
4838 if (likely(update_next_balance))
4839 rq->next_balance = next_balance;
4840}
4841
4842/*
4843 * run_rebalance_domains is triggered when needed from the scheduler tick.
4844 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4845 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4846 */
4847static void run_rebalance_domains(struct softirq_action *h)
4848{
4849 int this_cpu = smp_processor_id();
4850 struct rq *this_rq = cpu_rq(this_cpu);
4851 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4852 CPU_IDLE : CPU_NOT_IDLE;
4853
4854 rebalance_domains(this_cpu, idle);
4855
4856#ifdef CONFIG_NO_HZ
4857 /*
4858 * If this cpu is the owner for idle load balancing, then do the
4859 * balancing on behalf of the other idle cpus whose ticks are
4860 * stopped.
4861 */
4862 if (this_rq->idle_at_tick &&
4863 atomic_read(&nohz.load_balancer) == this_cpu) {
4864 struct rq *rq;
4865 int balance_cpu;
4866
4867 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4868 if (balance_cpu == this_cpu)
4869 continue;
4870
4871 /*
4872 * If this cpu gets work to do, stop the load balancing
4873 * work being done for other cpus. Next load
4874 * balancing owner will pick it up.
4875 */
4876 if (need_resched())
4877 break;
4878
4879 rebalance_domains(balance_cpu, CPU_IDLE);
4880
4881 rq = cpu_rq(balance_cpu);
4882 if (time_after(this_rq->next_balance, rq->next_balance))
4883 this_rq->next_balance = rq->next_balance;
4884 }
4885 }
4886#endif
4887}
4888
4889static inline int on_null_domain(int cpu)
4890{
4891 return !rcu_dereference(cpu_rq(cpu)->sd);
4892}
4893
4894/*
4895 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4896 *
4897 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4898 * idle load balancing owner or decide to stop the periodic load balancing,
4899 * if the whole system is idle.
4900 */
4901static inline void trigger_load_balance(struct rq *rq, int cpu)
4902{
4903#ifdef CONFIG_NO_HZ
4904 /*
4905 * If we were in the nohz mode recently and busy at the current
4906 * scheduler tick, then check if we need to nominate new idle
4907 * load balancer.
4908 */
4909 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4910 rq->in_nohz_recently = 0;
4911
4912 if (atomic_read(&nohz.load_balancer) == cpu) {
4913 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4914 atomic_set(&nohz.load_balancer, -1);
4915 }
4916
4917 if (atomic_read(&nohz.load_balancer) == -1) {
4918 int ilb = find_new_ilb(cpu);
4919
4920 if (ilb < nr_cpu_ids)
4921 resched_cpu(ilb);
4922 }
4923 }
4924
4925 /*
4926 * If this cpu is idle and doing idle load balancing for all the
4927 * cpus with ticks stopped, is it time for that to stop?
4928 */
4929 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4930 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4931 resched_cpu(cpu);
4932 return;
4933 }
4934
4935 /*
4936 * If this cpu is idle and the idle load balancing is done by
4937 * someone else, then no need raise the SCHED_SOFTIRQ
4938 */
4939 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4940 cpumask_test_cpu(cpu, nohz.cpu_mask))
4941 return;
4942#endif
4943 /* Don't need to rebalance while attached to NULL domain */
4944 if (time_after_eq(jiffies, rq->next_balance) &&
4945 likely(!on_null_domain(cpu)))
4946 raise_softirq(SCHED_SOFTIRQ);
4947}
4948
4949#else /* CONFIG_SMP */
4950
4951/*
4952 * on UP we do not need to balance between CPUs:
4953 */
4954static inline void idle_balance(int cpu, struct rq *rq)
4955{
4956}
4957
4958#endif 3164#endif
4959 3165
4960DEFINE_PER_CPU(struct kernel_stat, kstat); 3166DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -5309,7 +3515,7 @@ void scheduler_tick(void)
5309 curr->sched_class->task_tick(rq, curr, 0); 3515 curr->sched_class->task_tick(rq, curr, 0);
5310 raw_spin_unlock(&rq->lock); 3516 raw_spin_unlock(&rq->lock);
5311 3517
5312 perf_event_task_tick(curr, cpu); 3518 perf_event_task_tick(curr);
5313 3519
5314#ifdef CONFIG_SMP 3520#ifdef CONFIG_SMP
5315 rq->idle_at_tick = idle_cpu(cpu); 3521 rq->idle_at_tick = idle_cpu(cpu);
@@ -5523,7 +3729,7 @@ need_resched_nonpreemptible:
5523 3729
5524 if (likely(prev != next)) { 3730 if (likely(prev != next)) {
5525 sched_info_switch(prev, next); 3731 sched_info_switch(prev, next);
5526 perf_event_task_sched_out(prev, next, cpu); 3732 perf_event_task_sched_out(prev, next);
5527 3733
5528 rq->nr_switches++; 3734 rq->nr_switches++;
5529 rq->curr = next; 3735 rq->curr = next;
@@ -6054,7 +4260,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6054 unsigned long flags; 4260 unsigned long flags;
6055 int oldprio, on_rq, running; 4261 int oldprio, on_rq, running;
6056 struct rq *rq; 4262 struct rq *rq;
6057 const struct sched_class *prev_class = p->sched_class; 4263 const struct sched_class *prev_class;
6058 4264
6059 BUG_ON(prio < 0 || prio > MAX_PRIO); 4265 BUG_ON(prio < 0 || prio > MAX_PRIO);
6060 4266
@@ -6062,6 +4268,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6062 update_rq_clock(rq); 4268 update_rq_clock(rq);
6063 4269
6064 oldprio = p->prio; 4270 oldprio = p->prio;
4271 prev_class = p->sched_class;
6065 on_rq = p->se.on_rq; 4272 on_rq = p->se.on_rq;
6066 running = task_current(rq, p); 4273 running = task_current(rq, p);
6067 if (on_rq) 4274 if (on_rq)
@@ -6079,7 +4286,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6079 if (running) 4286 if (running)
6080 p->sched_class->set_curr_task(rq); 4287 p->sched_class->set_curr_task(rq);
6081 if (on_rq) { 4288 if (on_rq) {
6082 enqueue_task(rq, p, 0); 4289 enqueue_task(rq, p, 0, oldprio < prio);
6083 4290
6084 check_class_changed(rq, p, prev_class, oldprio, running); 4291 check_class_changed(rq, p, prev_class, oldprio, running);
6085 } 4292 }
@@ -6123,7 +4330,7 @@ void set_user_nice(struct task_struct *p, long nice)
6123 delta = p->prio - old_prio; 4330 delta = p->prio - old_prio;
6124 4331
6125 if (on_rq) { 4332 if (on_rq) {
6126 enqueue_task(rq, p, 0); 4333 enqueue_task(rq, p, 0, false);
6127 /* 4334 /*
6128 * If the task increased its priority or is running and 4335 * If the task increased its priority or is running and
6129 * lowered its priority, then reschedule its CPU: 4336 * lowered its priority, then reschedule its CPU:
@@ -6146,7 +4353,7 @@ int can_nice(const struct task_struct *p, const int nice)
6146 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4353 /* convert nice value [19,-20] to rlimit style value [1,40] */
6147 int nice_rlim = 20 - nice; 4354 int nice_rlim = 20 - nice;
6148 4355
6149 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 4356 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
6150 capable(CAP_SYS_NICE)); 4357 capable(CAP_SYS_NICE));
6151} 4358}
6152 4359
@@ -6281,7 +4488,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6281{ 4488{
6282 int retval, oldprio, oldpolicy = -1, on_rq, running; 4489 int retval, oldprio, oldpolicy = -1, on_rq, running;
6283 unsigned long flags; 4490 unsigned long flags;
6284 const struct sched_class *prev_class = p->sched_class; 4491 const struct sched_class *prev_class;
6285 struct rq *rq; 4492 struct rq *rq;
6286 int reset_on_fork; 4493 int reset_on_fork;
6287 4494
@@ -6323,7 +4530,7 @@ recheck:
6323 4530
6324 if (!lock_task_sighand(p, &flags)) 4531 if (!lock_task_sighand(p, &flags))
6325 return -ESRCH; 4532 return -ESRCH;
6326 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4533 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
6327 unlock_task_sighand(p, &flags); 4534 unlock_task_sighand(p, &flags);
6328 4535
6329 /* can't set/change the rt policy */ 4536 /* can't set/change the rt policy */
@@ -6395,6 +4602,7 @@ recheck:
6395 p->sched_reset_on_fork = reset_on_fork; 4602 p->sched_reset_on_fork = reset_on_fork;
6396 4603
6397 oldprio = p->prio; 4604 oldprio = p->prio;
4605 prev_class = p->sched_class;
6398 __setscheduler(rq, p, policy, param->sched_priority); 4606 __setscheduler(rq, p, policy, param->sched_priority);
6399 4607
6400 if (running) 4608 if (running)
@@ -7145,27 +5353,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7145 struct rq *rq; 5353 struct rq *rq;
7146 int ret = 0; 5354 int ret = 0;
7147 5355
7148 /*
7149 * Since we rely on wake-ups to migrate sleeping tasks, don't change
7150 * the ->cpus_allowed mask from under waking tasks, which would be
7151 * possible when we change rq->lock in ttwu(), so synchronize against
7152 * TASK_WAKING to avoid that.
7153 *
7154 * Make an exception for freshly cloned tasks, since cpuset namespaces
7155 * might move the task about, we have to validate the target in
7156 * wake_up_new_task() anyway since the cpu might have gone away.
7157 */
7158again:
7159 while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
7160 cpu_relax();
7161
7162 rq = task_rq_lock(p, &flags); 5356 rq = task_rq_lock(p, &flags);
7163 5357
7164 if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
7165 task_rq_unlock(rq, &flags);
7166 goto again;
7167 }
7168
7169 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5358 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7170 ret = -EINVAL; 5359 ret = -EINVAL;
7171 goto out; 5360 goto out;
@@ -9452,7 +7641,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9452 tg->rt_rq[cpu] = rt_rq; 7641 tg->rt_rq[cpu] = rt_rq;
9453 init_rt_rq(rt_rq, rq); 7642 init_rt_rq(rt_rq, rq);
9454 rt_rq->tg = tg; 7643 rt_rq->tg = tg;
9455 rt_rq->rt_se = rt_se;
9456 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7644 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9457 if (add) 7645 if (add)
9458 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7646 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9483,9 +7671,6 @@ void __init sched_init(void)
9483#ifdef CONFIG_RT_GROUP_SCHED 7671#ifdef CONFIG_RT_GROUP_SCHED
9484 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7672 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9485#endif 7673#endif
9486#ifdef CONFIG_USER_SCHED
9487 alloc_size *= 2;
9488#endif
9489#ifdef CONFIG_CPUMASK_OFFSTACK 7674#ifdef CONFIG_CPUMASK_OFFSTACK
9490 alloc_size += num_possible_cpus() * cpumask_size(); 7675 alloc_size += num_possible_cpus() * cpumask_size();
9491#endif 7676#endif
@@ -9499,13 +7684,6 @@ void __init sched_init(void)
9499 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7684 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9500 ptr += nr_cpu_ids * sizeof(void **); 7685 ptr += nr_cpu_ids * sizeof(void **);
9501 7686
9502#ifdef CONFIG_USER_SCHED
9503 root_task_group.se = (struct sched_entity **)ptr;
9504 ptr += nr_cpu_ids * sizeof(void **);
9505
9506 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9507 ptr += nr_cpu_ids * sizeof(void **);
9508#endif /* CONFIG_USER_SCHED */
9509#endif /* CONFIG_FAIR_GROUP_SCHED */ 7687#endif /* CONFIG_FAIR_GROUP_SCHED */
9510#ifdef CONFIG_RT_GROUP_SCHED 7688#ifdef CONFIG_RT_GROUP_SCHED
9511 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7689 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9514,13 +7692,6 @@ void __init sched_init(void)
9514 init_task_group.rt_rq = (struct rt_rq **)ptr; 7692 init_task_group.rt_rq = (struct rt_rq **)ptr;
9515 ptr += nr_cpu_ids * sizeof(void **); 7693 ptr += nr_cpu_ids * sizeof(void **);
9516 7694
9517#ifdef CONFIG_USER_SCHED
9518 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9519 ptr += nr_cpu_ids * sizeof(void **);
9520
9521 root_task_group.rt_rq = (struct rt_rq **)ptr;
9522 ptr += nr_cpu_ids * sizeof(void **);
9523#endif /* CONFIG_USER_SCHED */
9524#endif /* CONFIG_RT_GROUP_SCHED */ 7695#endif /* CONFIG_RT_GROUP_SCHED */
9525#ifdef CONFIG_CPUMASK_OFFSTACK 7696#ifdef CONFIG_CPUMASK_OFFSTACK
9526 for_each_possible_cpu(i) { 7697 for_each_possible_cpu(i) {
@@ -9540,22 +7711,13 @@ void __init sched_init(void)
9540#ifdef CONFIG_RT_GROUP_SCHED 7711#ifdef CONFIG_RT_GROUP_SCHED
9541 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7712 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9542 global_rt_period(), global_rt_runtime()); 7713 global_rt_period(), global_rt_runtime());
9543#ifdef CONFIG_USER_SCHED
9544 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9545 global_rt_period(), RUNTIME_INF);
9546#endif /* CONFIG_USER_SCHED */
9547#endif /* CONFIG_RT_GROUP_SCHED */ 7714#endif /* CONFIG_RT_GROUP_SCHED */
9548 7715
9549#ifdef CONFIG_GROUP_SCHED 7716#ifdef CONFIG_CGROUP_SCHED
9550 list_add(&init_task_group.list, &task_groups); 7717 list_add(&init_task_group.list, &task_groups);
9551 INIT_LIST_HEAD(&init_task_group.children); 7718 INIT_LIST_HEAD(&init_task_group.children);
9552 7719
9553#ifdef CONFIG_USER_SCHED 7720#endif /* CONFIG_CGROUP_SCHED */
9554 INIT_LIST_HEAD(&root_task_group.children);
9555 init_task_group.parent = &root_task_group;
9556 list_add(&init_task_group.siblings, &root_task_group.children);
9557#endif /* CONFIG_USER_SCHED */
9558#endif /* CONFIG_GROUP_SCHED */
9559 7721
9560#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7722#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9561 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7723 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9595,25 +7757,6 @@ void __init sched_init(void)
9595 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7757 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9596 */ 7758 */
9597 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7759 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9598#elif defined CONFIG_USER_SCHED
9599 root_task_group.shares = NICE_0_LOAD;
9600 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9601 /*
9602 * In case of task-groups formed thr' the user id of tasks,
9603 * init_task_group represents tasks belonging to root user.
9604 * Hence it forms a sibling of all subsequent groups formed.
9605 * In this case, init_task_group gets only a fraction of overall
9606 * system cpu resource, based on the weight assigned to root
9607 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9608 * by letting tasks of init_task_group sit in a separate cfs_rq
9609 * (init_tg_cfs_rq) and having one entity represent this group of
9610 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9611 */
9612 init_tg_cfs_entry(&init_task_group,
9613 &per_cpu(init_tg_cfs_rq, i),
9614 &per_cpu(init_sched_entity, i), i, 1,
9615 root_task_group.se[i]);
9616
9617#endif 7760#endif
9618#endif /* CONFIG_FAIR_GROUP_SCHED */ 7761#endif /* CONFIG_FAIR_GROUP_SCHED */
9619 7762
@@ -9622,12 +7765,6 @@ void __init sched_init(void)
9622 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7765 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9623#ifdef CONFIG_CGROUP_SCHED 7766#ifdef CONFIG_CGROUP_SCHED
9624 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7767 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9625#elif defined CONFIG_USER_SCHED
9626 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9627 init_tg_rt_entry(&init_task_group,
9628 &per_cpu(init_rt_rq_var, i),
9629 &per_cpu(init_sched_rt_entity, i), i, 1,
9630 root_task_group.rt_se[i]);
9631#endif 7768#endif
9632#endif 7769#endif
9633 7770
@@ -9712,7 +7849,7 @@ static inline int preempt_count_equals(int preempt_offset)
9712 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7849 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9713} 7850}
9714 7851
9715void __might_sleep(char *file, int line, int preempt_offset) 7852void __might_sleep(const char *file, int line, int preempt_offset)
9716{ 7853{
9717#ifdef in_atomic 7854#ifdef in_atomic
9718 static unsigned long prev_jiffy; /* ratelimiting */ 7855 static unsigned long prev_jiffy; /* ratelimiting */
@@ -10023,7 +8160,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
10023} 8160}
10024#endif /* CONFIG_RT_GROUP_SCHED */ 8161#endif /* CONFIG_RT_GROUP_SCHED */
10025 8162
10026#ifdef CONFIG_GROUP_SCHED 8163#ifdef CONFIG_CGROUP_SCHED
10027static void free_sched_group(struct task_group *tg) 8164static void free_sched_group(struct task_group *tg)
10028{ 8165{
10029 free_fair_sched_group(tg); 8166 free_fair_sched_group(tg);
@@ -10128,11 +8265,11 @@ void sched_move_task(struct task_struct *tsk)
10128 if (unlikely(running)) 8265 if (unlikely(running))
10129 tsk->sched_class->set_curr_task(rq); 8266 tsk->sched_class->set_curr_task(rq);
10130 if (on_rq) 8267 if (on_rq)
10131 enqueue_task(rq, tsk, 0); 8268 enqueue_task(rq, tsk, 0, false);
10132 8269
10133 task_rq_unlock(rq, &flags); 8270 task_rq_unlock(rq, &flags);
10134} 8271}
10135#endif /* CONFIG_GROUP_SCHED */ 8272#endif /* CONFIG_CGROUP_SCHED */
10136 8273
10137#ifdef CONFIG_FAIR_GROUP_SCHED 8274#ifdef CONFIG_FAIR_GROUP_SCHED
10138static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8275static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10274,13 +8411,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10274 runtime = d->rt_runtime; 8411 runtime = d->rt_runtime;
10275 } 8412 }
10276 8413
10277#ifdef CONFIG_USER_SCHED
10278 if (tg == &root_task_group) {
10279 period = global_rt_period();
10280 runtime = global_rt_runtime();
10281 }
10282#endif
10283
10284 /* 8414 /*
10285 * Cannot have more runtime than the period. 8415 * Cannot have more runtime than the period.
10286 */ 8416 */
@@ -10683,7 +8813,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
10683struct cpuacct { 8813struct cpuacct {
10684 struct cgroup_subsys_state css; 8814 struct cgroup_subsys_state css;
10685 /* cpuusage holds pointer to a u64-type object on every cpu */ 8815 /* cpuusage holds pointer to a u64-type object on every cpu */
10686 u64 *cpuusage; 8816 u64 __percpu *cpuusage;
10687 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8817 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
10688 struct cpuacct *parent; 8818 struct cpuacct *parent;
10689}; 8819};
@@ -10900,12 +9030,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10900} 9030}
10901 9031
10902/* 9032/*
9033 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9034 * in cputime_t units. As a result, cpuacct_update_stats calls
9035 * percpu_counter_add with values large enough to always overflow the
9036 * per cpu batch limit causing bad SMP scalability.
9037 *
9038 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9039 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9040 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9041 */
9042#ifdef CONFIG_SMP
9043#define CPUACCT_BATCH \
9044 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9045#else
9046#define CPUACCT_BATCH 0
9047#endif
9048
9049/*
10903 * Charge the system/user time to the task's accounting group. 9050 * Charge the system/user time to the task's accounting group.
10904 */ 9051 */
10905static void cpuacct_update_stats(struct task_struct *tsk, 9052static void cpuacct_update_stats(struct task_struct *tsk,
10906 enum cpuacct_stat_index idx, cputime_t val) 9053 enum cpuacct_stat_index idx, cputime_t val)
10907{ 9054{
10908 struct cpuacct *ca; 9055 struct cpuacct *ca;
9056 int batch = CPUACCT_BATCH;
10909 9057
10910 if (unlikely(!cpuacct_subsys.active)) 9058 if (unlikely(!cpuacct_subsys.active))
10911 return; 9059 return;
@@ -10914,7 +9062,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10914 ca = task_ca(tsk); 9062 ca = task_ca(tsk);
10915 9063
10916 do { 9064 do {
10917 percpu_counter_add(&ca->cpustat[idx], val); 9065 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10918 ca = ca->parent; 9066 ca = ca->parent;
10919 } while (ca); 9067 } while (ca);
10920 rcu_read_unlock(); 9068 rcu_read_unlock();
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 597b33099dfa..82095bf2099f 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,7 @@ static int convert_prio(int prio)
47} 47}
48 48
49#define for_each_cpupri_active(array, idx) \ 49#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ 50 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53 51
54/** 52/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system 53 * cpupri_find - find the best (lowest-pri) CPU in the system
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8fe7ee81c552..3e1fd96c6cf9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1053,7 +1053,8 @@ static inline void hrtick_update(struct rq *rq)
1053 * increased. Here we update the fair scheduling stats and 1053 * increased. Here we update the fair scheduling stats and
1054 * then put the task into the rbtree: 1054 * then put the task into the rbtree:
1055 */ 1055 */
1056static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 1056static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1057{ 1058{
1058 struct cfs_rq *cfs_rq; 1059 struct cfs_rq *cfs_rq;
1059 struct sched_entity *se = &p->se; 1060 struct sched_entity *se = &p->se;
@@ -1815,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1815 */ 1816 */
1816 1817
1817/* 1818/*
1818 * Load-balancing iterator. Note: while the runqueue stays locked 1819 * pull_task - move a task from a remote runqueue to the local runqueue.
1819 * during the whole iteration, the current task might be 1820 * Both runqueues must be locked.
1820 * dequeued so the iterator has to be dequeue-safe. Here we
1821 * achieve that by always pre-iterating before returning
1822 * the current task:
1823 */ 1821 */
1824static struct task_struct * 1822static void pull_task(struct rq *src_rq, struct task_struct *p,
1825__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) 1823 struct rq *this_rq, int this_cpu)
1826{ 1824{
1827 struct task_struct *p = NULL; 1825 deactivate_task(src_rq, p, 0);
1828 struct sched_entity *se; 1826 set_task_cpu(p, this_cpu);
1827 activate_task(this_rq, p, 0);
1828 check_preempt_curr(this_rq, p, 0);
1829}
1829 1830
1830 if (next == &cfs_rq->tasks) 1831/*
1831 return NULL; 1832 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1833 */
1834static
1835int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1836 struct sched_domain *sd, enum cpu_idle_type idle,
1837 int *all_pinned)
1838{
1839 int tsk_cache_hot = 0;
1840 /*
1841 * We do not migrate tasks that are:
1842 * 1) running (obviously), or
1843 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1844 * 3) are cache-hot on their current CPU.
1845 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine);
1848 return 0;
1849 }
1850 *all_pinned = 0;
1832 1851
1833 se = list_entry(next, struct sched_entity, group_node); 1852 if (task_running(rq, p)) {
1834 p = task_of(se); 1853 schedstat_inc(p, se.nr_failed_migrations_running);
1835 cfs_rq->balance_iterator = next->next; 1854 return 0;
1855 }
1836 1856
1837 return p; 1857 /*
1838} 1858 * Aggressive migration if:
1859 * 1) task is cache cold, or
1860 * 2) too many balance attempts have failed.
1861 */
1839 1862
1840static struct task_struct *load_balance_start_fair(void *arg) 1863 tsk_cache_hot = task_hot(p, rq->clock, sd);
1841{ 1864 if (!tsk_cache_hot ||
1842 struct cfs_rq *cfs_rq = arg; 1865 sd->nr_balance_failed > sd->cache_nice_tries) {
1866#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations);
1870 }
1871#endif
1872 return 1;
1873 }
1843 1874
1844 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); 1875 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot);
1877 return 0;
1878 }
1879 return 1;
1845} 1880}
1846 1881
1847static struct task_struct *load_balance_next_fair(void *arg) 1882/*
1883 * move_one_task tries to move exactly one task from busiest to this_rq, as
1884 * part of active balancing operations within "domain".
1885 * Returns 1 if successful and 0 otherwise.
1886 *
1887 * Called with both runqueues locked.
1888 */
1889static int
1890move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1891 struct sched_domain *sd, enum cpu_idle_type idle)
1848{ 1892{
1849 struct cfs_rq *cfs_rq = arg; 1893 struct task_struct *p, *n;
1894 struct cfs_rq *cfs_rq;
1895 int pinned = 0;
1896
1897 for_each_leaf_cfs_rq(busiest, cfs_rq) {
1898 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
1899
1900 if (!can_migrate_task(p, busiest, this_cpu,
1901 sd, idle, &pinned))
1902 continue;
1850 1903
1851 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1904 pull_task(busiest, p, this_rq, this_cpu);
1905 /*
1906 * Right now, this is only the second place pull_task()
1907 * is called, so we can safely collect pull_task()
1908 * stats here rather than inside pull_task().
1909 */
1910 schedstat_inc(sd, lb_gained[idle]);
1911 return 1;
1912 }
1913 }
1914
1915 return 0;
1852} 1916}
1853 1917
1854static unsigned long 1918static unsigned long
1855__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1919balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1856 unsigned long max_load_move, struct sched_domain *sd, 1920 unsigned long max_load_move, struct sched_domain *sd,
1857 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, 1921 enum cpu_idle_type idle, int *all_pinned,
1858 struct cfs_rq *cfs_rq) 1922 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
1859{ 1923{
1860 struct rq_iterator cfs_rq_iterator; 1924 int loops = 0, pulled = 0, pinned = 0;
1925 long rem_load_move = max_load_move;
1926 struct task_struct *p, *n;
1861 1927
1862 cfs_rq_iterator.start = load_balance_start_fair; 1928 if (max_load_move == 0)
1863 cfs_rq_iterator.next = load_balance_next_fair; 1929 goto out;
1864 cfs_rq_iterator.arg = cfs_rq;
1865 1930
1866 return balance_tasks(this_rq, this_cpu, busiest, 1931 pinned = 1;
1867 max_load_move, sd, idle, all_pinned, 1932
1868 this_best_prio, &cfs_rq_iterator); 1933 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1934 if (loops++ > sysctl_sched_nr_migrate)
1935 break;
1936
1937 if ((p->se.load.weight >> 1) > rem_load_move ||
1938 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
1939 continue;
1940
1941 pull_task(busiest, p, this_rq, this_cpu);
1942 pulled++;
1943 rem_load_move -= p->se.load.weight;
1944
1945#ifdef CONFIG_PREEMPT
1946 /*
1947 * NEWIDLE balancing is a source of latency, so preemptible
1948 * kernels will stop after the first task is pulled to minimize
1949 * the critical section.
1950 */
1951 if (idle == CPU_NEWLY_IDLE)
1952 break;
1953#endif
1954
1955 /*
1956 * We only want to steal up to the prescribed amount of
1957 * weighted load.
1958 */
1959 if (rem_load_move <= 0)
1960 break;
1961
1962 if (p->prio < *this_best_prio)
1963 *this_best_prio = p->prio;
1964 }
1965out:
1966 /*
1967 * Right now, this is one of only two places pull_task() is called,
1968 * so we can safely collect pull_task() stats here rather than
1969 * inside pull_task().
1970 */
1971 schedstat_add(sd, lb_gained[idle], pulled);
1972
1973 if (all_pinned)
1974 *all_pinned = pinned;
1975
1976 return max_load_move - rem_load_move;
1869} 1977}
1870 1978
1871#ifdef CONFIG_FAIR_GROUP_SCHED 1979#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1897,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1897 rem_load = (u64)rem_load_move * busiest_weight; 2005 rem_load = (u64)rem_load_move * busiest_weight;
1898 rem_load = div_u64(rem_load, busiest_h_load + 1); 2006 rem_load = div_u64(rem_load, busiest_h_load + 1);
1899 2007
1900 moved_load = __load_balance_fair(this_rq, this_cpu, busiest, 2008 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1901 rem_load, sd, idle, all_pinned, this_best_prio, 2009 rem_load, sd, idle, all_pinned, this_best_prio,
1902 tg->cfs_rq[busiest_cpu]); 2010 busiest_cfs_rq);
1903 2011
1904 if (!moved_load) 2012 if (!moved_load)
1905 continue; 2013 continue;
@@ -1922,35 +2030,1509 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1922 struct sched_domain *sd, enum cpu_idle_type idle, 2030 struct sched_domain *sd, enum cpu_idle_type idle,
1923 int *all_pinned, int *this_best_prio) 2031 int *all_pinned, int *this_best_prio)
1924{ 2032{
1925 return __load_balance_fair(this_rq, this_cpu, busiest, 2033 return balance_tasks(this_rq, this_cpu, busiest,
1926 max_load_move, sd, idle, all_pinned, 2034 max_load_move, sd, idle, all_pinned,
1927 this_best_prio, &busiest->cfs); 2035 this_best_prio, &busiest->cfs);
1928} 2036}
1929#endif 2037#endif
1930 2038
1931static int 2039/*
1932move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2040 * move_tasks tries to move up to max_load_move weighted load from busiest to
1933 struct sched_domain *sd, enum cpu_idle_type idle) 2041 * this_rq, as part of a balancing operation within domain "sd".
2042 * Returns 1 if successful and 0 otherwise.
2043 *
2044 * Called with both runqueues locked.
2045 */
2046static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2047 unsigned long max_load_move,
2048 struct sched_domain *sd, enum cpu_idle_type idle,
2049 int *all_pinned)
1934{ 2050{
1935 struct cfs_rq *busy_cfs_rq; 2051 unsigned long total_load_moved = 0, load_moved;
1936 struct rq_iterator cfs_rq_iterator; 2052 int this_best_prio = this_rq->curr->prio;
1937 2053
1938 cfs_rq_iterator.start = load_balance_start_fair; 2054 do {
1939 cfs_rq_iterator.next = load_balance_next_fair; 2055 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2056 max_load_move - total_load_moved,
2057 sd, idle, all_pinned, &this_best_prio);
1940 2058
1941 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 2059 total_load_moved += load_moved;
2060
2061#ifdef CONFIG_PREEMPT
1942 /* 2062 /*
1943 * pass busy_cfs_rq argument into 2063 * NEWIDLE balancing is a source of latency, so preemptible
1944 * load_balance_[start|next]_fair iterators 2064 * kernels will stop after the first task is pulled to minimize
2065 * the critical section.
1945 */ 2066 */
1946 cfs_rq_iterator.arg = busy_cfs_rq; 2067 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
1947 if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 2068 break;
1948 &cfs_rq_iterator)) 2069
1949 return 1; 2070 if (raw_spin_is_contended(&this_rq->lock) ||
2071 raw_spin_is_contended(&busiest->lock))
2072 break;
2073#endif
2074 } while (load_moved && max_load_move > total_load_moved);
2075
2076 return total_load_moved > 0;
2077}
2078
2079/********** Helpers for find_busiest_group ************************/
2080/*
2081 * sd_lb_stats - Structure to store the statistics of a sched_domain
2082 * during load balancing.
2083 */
2084struct sd_lb_stats {
2085 struct sched_group *busiest; /* Busiest group in this sd */
2086 struct sched_group *this; /* Local group in this sd */
2087 unsigned long total_load; /* Total load of all groups in sd */
2088 unsigned long total_pwr; /* Total power of all groups in sd */
2089 unsigned long avg_load; /* Average load across all groups in sd */
2090
2091 /** Statistics of this group */
2092 unsigned long this_load;
2093 unsigned long this_load_per_task;
2094 unsigned long this_nr_running;
2095
2096 /* Statistics of the busiest group */
2097 unsigned long max_load;
2098 unsigned long busiest_load_per_task;
2099 unsigned long busiest_nr_running;
2100 unsigned long busiest_group_capacity;
2101
2102 int group_imb; /* Is there imbalance in this sd */
2103#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2104 int power_savings_balance; /* Is powersave balance needed for this sd */
2105 struct sched_group *group_min; /* Least loaded group in sd */
2106 struct sched_group *group_leader; /* Group which relieves group_min */
2107 unsigned long min_load_per_task; /* load_per_task in group_min */
2108 unsigned long leader_nr_running; /* Nr running of group_leader */
2109 unsigned long min_nr_running; /* Nr running of group_min */
2110#endif
2111};
2112
2113/*
2114 * sg_lb_stats - stats of a sched_group required for load_balancing
2115 */
2116struct sg_lb_stats {
2117 unsigned long avg_load; /*Avg load across the CPUs of the group */
2118 unsigned long group_load; /* Total load over the CPUs of the group */
2119 unsigned long sum_nr_running; /* Nr tasks running in the group */
2120 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2121 unsigned long group_capacity;
2122 int group_imb; /* Is there an imbalance in the group ? */
2123};
2124
2125/**
2126 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
2127 * @group: The group whose first cpu is to be returned.
2128 */
2129static inline unsigned int group_first_cpu(struct sched_group *group)
2130{
2131 return cpumask_first(sched_group_cpus(group));
2132}
2133
2134/**
2135 * get_sd_load_idx - Obtain the load index for a given sched domain.
2136 * @sd: The sched_domain whose load_idx is to be obtained.
2137 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
2138 */
2139static inline int get_sd_load_idx(struct sched_domain *sd,
2140 enum cpu_idle_type idle)
2141{
2142 int load_idx;
2143
2144 switch (idle) {
2145 case CPU_NOT_IDLE:
2146 load_idx = sd->busy_idx;
2147 break;
2148
2149 case CPU_NEWLY_IDLE:
2150 load_idx = sd->newidle_idx;
2151 break;
2152 default:
2153 load_idx = sd->idle_idx;
2154 break;
1950 } 2155 }
1951 2156
2157 return load_idx;
2158}
2159
2160
2161#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2162/**
2163 * init_sd_power_savings_stats - Initialize power savings statistics for
2164 * the given sched_domain, during load balancing.
2165 *
2166 * @sd: Sched domain whose power-savings statistics are to be initialized.
2167 * @sds: Variable containing the statistics for sd.
2168 * @idle: Idle status of the CPU at which we're performing load-balancing.
2169 */
2170static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2171 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2172{
2173 /*
2174 * Busy processors will not participate in power savings
2175 * balance.
2176 */
2177 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2178 sds->power_savings_balance = 0;
2179 else {
2180 sds->power_savings_balance = 1;
2181 sds->min_nr_running = ULONG_MAX;
2182 sds->leader_nr_running = 0;
2183 }
2184}
2185
2186/**
2187 * update_sd_power_savings_stats - Update the power saving stats for a
2188 * sched_domain while performing load balancing.
2189 *
2190 * @group: sched_group belonging to the sched_domain under consideration.
2191 * @sds: Variable containing the statistics of the sched_domain
2192 * @local_group: Does group contain the CPU for which we're performing
2193 * load balancing ?
2194 * @sgs: Variable containing the statistics of the group.
2195 */
2196static inline void update_sd_power_savings_stats(struct sched_group *group,
2197 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2198{
2199
2200 if (!sds->power_savings_balance)
2201 return;
2202
2203 /*
2204 * If the local group is idle or completely loaded
2205 * no need to do power savings balance at this domain
2206 */
2207 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
2208 !sds->this_nr_running))
2209 sds->power_savings_balance = 0;
2210
2211 /*
2212 * If a group is already running at full capacity or idle,
2213 * don't include that group in power savings calculations
2214 */
2215 if (!sds->power_savings_balance ||
2216 sgs->sum_nr_running >= sgs->group_capacity ||
2217 !sgs->sum_nr_running)
2218 return;
2219
2220 /*
2221 * Calculate the group which has the least non-idle load.
2222 * This is the group from where we need to pick up the load
2223 * for saving power
2224 */
2225 if ((sgs->sum_nr_running < sds->min_nr_running) ||
2226 (sgs->sum_nr_running == sds->min_nr_running &&
2227 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
2228 sds->group_min = group;
2229 sds->min_nr_running = sgs->sum_nr_running;
2230 sds->min_load_per_task = sgs->sum_weighted_load /
2231 sgs->sum_nr_running;
2232 }
2233
2234 /*
2235 * Calculate the group which is almost near its
2236 * capacity but still has some space to pick up some load
2237 * from other group and save more power
2238 */
2239 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
2240 return;
2241
2242 if (sgs->sum_nr_running > sds->leader_nr_running ||
2243 (sgs->sum_nr_running == sds->leader_nr_running &&
2244 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
2245 sds->group_leader = group;
2246 sds->leader_nr_running = sgs->sum_nr_running;
2247 }
2248}
2249
2250/**
2251 * check_power_save_busiest_group - see if there is potential for some power-savings balance
2252 * @sds: Variable containing the statistics of the sched_domain
2253 * under consideration.
2254 * @this_cpu: Cpu at which we're currently performing load-balancing.
2255 * @imbalance: Variable to store the imbalance.
2256 *
2257 * Description:
2258 * Check if we have potential to perform some power-savings balance.
2259 * If yes, set the busiest group to be the least loaded group in the
2260 * sched_domain, so that it's CPUs can be put to idle.
2261 *
2262 * Returns 1 if there is potential to perform power-savings balance.
2263 * Else returns 0.
2264 */
2265static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2266 int this_cpu, unsigned long *imbalance)
2267{
2268 if (!sds->power_savings_balance)
2269 return 0;
2270
2271 if (sds->this != sds->group_leader ||
2272 sds->group_leader == sds->group_min)
2273 return 0;
2274
2275 *imbalance = sds->min_load_per_task;
2276 sds->busiest = sds->group_min;
2277
2278 return 1;
2279
2280}
2281#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2282static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2283 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2284{
2285 return;
2286}
2287
2288static inline void update_sd_power_savings_stats(struct sched_group *group,
2289 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2290{
2291 return;
2292}
2293
2294static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2295 int this_cpu, unsigned long *imbalance)
2296{
1952 return 0; 2297 return 0;
1953} 2298}
2299#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2300
2301
2302unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2303{
2304 return SCHED_LOAD_SCALE;
2305}
2306
2307unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2308{
2309 return default_scale_freq_power(sd, cpu);
2310}
2311
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2315 unsigned long smt_gain = sd->smt_gain;
2316
2317 smt_gain /= weight;
2318
2319 return smt_gain;
2320}
2321
2322unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
2323{
2324 return default_scale_smt_power(sd, cpu);
2325}
2326
2327unsigned long scale_rt_power(int cpu)
2328{
2329 struct rq *rq = cpu_rq(cpu);
2330 u64 total, available;
2331
2332 sched_avg_update(rq);
2333
2334 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2335 available = total - rq->rt_avg;
2336
2337 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2338 total = SCHED_LOAD_SCALE;
2339
2340 total >>= SCHED_LOAD_SHIFT;
2341
2342 return div_u64(available, total);
2343}
2344
2345static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2348 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups;
2350
2351 if (sched_feat(ARCH_POWER))
2352 power *= arch_scale_freq_power(sd, cpu);
2353 else
2354 power *= default_scale_freq_power(sd, cpu);
2355
2356 power >>= SCHED_LOAD_SHIFT;
2357
2358 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2359 if (sched_feat(ARCH_POWER))
2360 power *= arch_scale_smt_power(sd, cpu);
2361 else
2362 power *= default_scale_smt_power(sd, cpu);
2363
2364 power >>= SCHED_LOAD_SHIFT;
2365 }
2366
2367 power *= scale_rt_power(cpu);
2368 power >>= SCHED_LOAD_SHIFT;
2369
2370 if (!power)
2371 power = 1;
2372
2373 sdg->cpu_power = power;
2374}
2375
2376static void update_group_power(struct sched_domain *sd, int cpu)
2377{
2378 struct sched_domain *child = sd->child;
2379 struct sched_group *group, *sdg = sd->groups;
2380 unsigned long power;
2381
2382 if (!child) {
2383 update_cpu_power(sd, cpu);
2384 return;
2385 }
2386
2387 power = 0;
2388
2389 group = child->groups;
2390 do {
2391 power += group->cpu_power;
2392 group = group->next;
2393 } while (group != child->groups);
2394
2395 sdg->cpu_power = power;
2396}
2397
2398/**
2399 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2400 * @sd: The sched_domain whose statistics are to be updated.
2401 * @group: sched_group whose statistics are to be updated.
2402 * @this_cpu: Cpu for which load balance is currently performed.
2403 * @idle: Idle status of this_cpu
2404 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2405 * @sd_idle: Idle status of the sched_domain containing group.
2406 * @local_group: Does group contain this_cpu.
2407 * @cpus: Set of cpus considered for load balancing.
2408 * @balance: Should we balance.
2409 * @sgs: variable to hold the statistics for this group.
2410 */
2411static inline void update_sg_lb_stats(struct sched_domain *sd,
2412 struct sched_group *group, int this_cpu,
2413 enum cpu_idle_type idle, int load_idx, int *sd_idle,
2414 int local_group, const struct cpumask *cpus,
2415 int *balance, struct sg_lb_stats *sgs)
2416{
2417 unsigned long load, max_cpu_load, min_cpu_load;
2418 int i;
2419 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2420 unsigned long avg_load_per_task = 0;
2421
2422 if (local_group)
2423 balance_cpu = group_first_cpu(group);
2424
2425 /* Tally up the load of all CPUs in the group */
2426 max_cpu_load = 0;
2427 min_cpu_load = ~0UL;
2428
2429 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2430 struct rq *rq = cpu_rq(i);
2431
2432 if (*sd_idle && rq->nr_running)
2433 *sd_idle = 0;
2434
2435 /* Bias balancing toward cpus of our domain */
2436 if (local_group) {
2437 if (idle_cpu(i) && !first_idle_cpu) {
2438 first_idle_cpu = 1;
2439 balance_cpu = i;
2440 }
2441
2442 load = target_load(i, load_idx);
2443 } else {
2444 load = source_load(i, load_idx);
2445 if (load > max_cpu_load)
2446 max_cpu_load = load;
2447 if (min_cpu_load > load)
2448 min_cpu_load = load;
2449 }
2450
2451 sgs->group_load += load;
2452 sgs->sum_nr_running += rq->nr_running;
2453 sgs->sum_weighted_load += weighted_cpuload(i);
2454
2455 }
2456
2457 /*
2458 * First idle cpu or the first cpu(busiest) in this sched group
2459 * is eligible for doing load balancing at this and above
2460 * domains. In the newly idle case, we will allow all the cpu's
2461 * to do the newly idle load balance.
2462 */
2463 if (idle != CPU_NEWLY_IDLE && local_group &&
2464 balance_cpu != this_cpu) {
2465 *balance = 0;
2466 return;
2467 }
2468
2469 update_group_power(sd, this_cpu);
2470
2471 /* Adjust by relative CPU power of the group */
2472 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2473
2474 /*
2475 * Consider the group unbalanced when the imbalance is larger
2476 * than the average weight of two tasks.
2477 *
2478 * APZ: with cgroup the avg task weight can vary wildly and
2479 * might not be a suitable number - should we keep a
2480 * normalized nr_running number somewhere that negates
2481 * the hierarchy?
2482 */
2483 if (sgs->sum_nr_running)
2484 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2485
2486 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
2487 sgs->group_imb = 1;
2488
2489 sgs->group_capacity =
2490 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2491}
2492
2493/**
2494 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
2495 * @sd: sched_domain whose statistics are to be updated.
2496 * @this_cpu: Cpu for which load balance is currently performed.
2497 * @idle: Idle status of this_cpu
2498 * @sd_idle: Idle status of the sched_domain containing group.
2499 * @cpus: Set of cpus considered for load balancing.
2500 * @balance: Should we balance.
2501 * @sds: variable to hold the statistics for this sched_domain.
2502 */
2503static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2504 enum cpu_idle_type idle, int *sd_idle,
2505 const struct cpumask *cpus, int *balance,
2506 struct sd_lb_stats *sds)
2507{
2508 struct sched_domain *child = sd->child;
2509 struct sched_group *group = sd->groups;
2510 struct sg_lb_stats sgs;
2511 int load_idx, prefer_sibling = 0;
2512
2513 if (child && child->flags & SD_PREFER_SIBLING)
2514 prefer_sibling = 1;
2515
2516 init_sd_power_savings_stats(sd, sds, idle);
2517 load_idx = get_sd_load_idx(sd, idle);
2518
2519 do {
2520 int local_group;
2521
2522 local_group = cpumask_test_cpu(this_cpu,
2523 sched_group_cpus(group));
2524 memset(&sgs, 0, sizeof(sgs));
2525 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
2526 local_group, cpus, balance, &sgs);
2527
2528 if (local_group && !(*balance))
2529 return;
2530
2531 sds->total_load += sgs.group_load;
2532 sds->total_pwr += group->cpu_power;
2533
2534 /*
2535 * In case the child domain prefers tasks go to siblings
2536 * first, lower the group capacity to one so that we'll try
2537 * and move all the excess tasks away.
2538 */
2539 if (prefer_sibling)
2540 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2541
2542 if (local_group) {
2543 sds->this_load = sgs.avg_load;
2544 sds->this = group;
2545 sds->this_nr_running = sgs.sum_nr_running;
2546 sds->this_load_per_task = sgs.sum_weighted_load;
2547 } else if (sgs.avg_load > sds->max_load &&
2548 (sgs.sum_nr_running > sgs.group_capacity ||
2549 sgs.group_imb)) {
2550 sds->max_load = sgs.avg_load;
2551 sds->busiest = group;
2552 sds->busiest_nr_running = sgs.sum_nr_running;
2553 sds->busiest_group_capacity = sgs.group_capacity;
2554 sds->busiest_load_per_task = sgs.sum_weighted_load;
2555 sds->group_imb = sgs.group_imb;
2556 }
2557
2558 update_sd_power_savings_stats(group, sds, local_group, &sgs);
2559 group = group->next;
2560 } while (group != sd->groups);
2561}
2562
2563/**
2564 * fix_small_imbalance - Calculate the minor imbalance that exists
2565 * amongst the groups of a sched_domain, during
2566 * load balancing.
2567 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
2568 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2569 * @imbalance: Variable to store the imbalance.
2570 */
2571static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2572 int this_cpu, unsigned long *imbalance)
2573{
2574 unsigned long tmp, pwr_now = 0, pwr_move = 0;
2575 unsigned int imbn = 2;
2576 unsigned long scaled_busy_load_per_task;
2577
2578 if (sds->this_nr_running) {
2579 sds->this_load_per_task /= sds->this_nr_running;
2580 if (sds->busiest_load_per_task >
2581 sds->this_load_per_task)
2582 imbn = 1;
2583 } else
2584 sds->this_load_per_task =
2585 cpu_avg_load_per_task(this_cpu);
2586
2587 scaled_busy_load_per_task = sds->busiest_load_per_task
2588 * SCHED_LOAD_SCALE;
2589 scaled_busy_load_per_task /= sds->busiest->cpu_power;
2590
2591 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2592 (scaled_busy_load_per_task * imbn)) {
2593 *imbalance = sds->busiest_load_per_task;
2594 return;
2595 }
2596
2597 /*
2598 * OK, we don't have enough imbalance to justify moving tasks,
2599 * however we may be able to increase total CPU power used by
2600 * moving them.
2601 */
2602
2603 pwr_now += sds->busiest->cpu_power *
2604 min(sds->busiest_load_per_task, sds->max_load);
2605 pwr_now += sds->this->cpu_power *
2606 min(sds->this_load_per_task, sds->this_load);
2607 pwr_now /= SCHED_LOAD_SCALE;
2608
2609 /* Amount of load we'd subtract */
2610 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2611 sds->busiest->cpu_power;
2612 if (sds->max_load > tmp)
2613 pwr_move += sds->busiest->cpu_power *
2614 min(sds->busiest_load_per_task, sds->max_load - tmp);
2615
2616 /* Amount of load we'd add */
2617 if (sds->max_load * sds->busiest->cpu_power <
2618 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
2619 tmp = (sds->max_load * sds->busiest->cpu_power) /
2620 sds->this->cpu_power;
2621 else
2622 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2623 sds->this->cpu_power;
2624 pwr_move += sds->this->cpu_power *
2625 min(sds->this_load_per_task, sds->this_load + tmp);
2626 pwr_move /= SCHED_LOAD_SCALE;
2627
2628 /* Move if we gain throughput */
2629 if (pwr_move > pwr_now)
2630 *imbalance = sds->busiest_load_per_task;
2631}
2632
2633/**
2634 * calculate_imbalance - Calculate the amount of imbalance present within the
2635 * groups of a given sched_domain during load balance.
2636 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
2637 * @this_cpu: Cpu for which currently load balance is being performed.
2638 * @imbalance: The variable to store the imbalance.
2639 */
2640static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2641 unsigned long *imbalance)
2642{
2643 unsigned long max_pull, load_above_capacity = ~0UL;
2644
2645 sds->busiest_load_per_task /= sds->busiest_nr_running;
2646 if (sds->group_imb) {
2647 sds->busiest_load_per_task =
2648 min(sds->busiest_load_per_task, sds->avg_load);
2649 }
2650
2651 /*
2652 * In the presence of smp nice balancing, certain scenarios can have
2653 * max load less than avg load(as we skip the groups at or below
2654 * its cpu_power, while calculating max_load..)
2655 */
2656 if (sds->max_load < sds->avg_load) {
2657 *imbalance = 0;
2658 return fix_small_imbalance(sds, this_cpu, imbalance);
2659 }
2660
2661 if (!sds->group_imb) {
2662 /*
2663 * Don't want to pull so many tasks that a group would go idle.
2664 */
2665 load_above_capacity = (sds->busiest_nr_running -
2666 sds->busiest_group_capacity);
2667
2668 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
2669
2670 load_above_capacity /= sds->busiest->cpu_power;
2671 }
2672
2673 /*
2674 * We're trying to get all the cpus to the average_load, so we don't
2675 * want to push ourselves above the average load, nor do we wish to
2676 * reduce the max loaded cpu below the average load. At the same time,
2677 * we also don't want to reduce the group load below the group capacity
2678 * (so that we can implement power-savings policies etc). Thus we look
2679 * for the minimum possible imbalance.
2680 * Be careful of negative numbers as they'll appear as very large values
2681 * with unsigned longs.
2682 */
2683 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
2684
2685 /* How much load to actually move to equalise the imbalance */
2686 *imbalance = min(max_pull * sds->busiest->cpu_power,
2687 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
2688 / SCHED_LOAD_SCALE;
2689
2690 /*
2691 * if *imbalance is less than the average load per runnable task
2692 * there is no gaurantee that any tasks will be moved so we'll have
2693 * a think about bumping its value to force at least one task to be
2694 * moved
2695 */
2696 if (*imbalance < sds->busiest_load_per_task)
2697 return fix_small_imbalance(sds, this_cpu, imbalance);
2698
2699}
2700/******* find_busiest_group() helpers end here *********************/
2701
2702/**
2703 * find_busiest_group - Returns the busiest group within the sched_domain
2704 * if there is an imbalance. If there isn't an imbalance, and
2705 * the user has opted for power-savings, it returns a group whose
2706 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
2707 * such a group exists.
2708 *
2709 * Also calculates the amount of weighted load which should be moved
2710 * to restore balance.
2711 *
2712 * @sd: The sched_domain whose busiest group is to be returned.
2713 * @this_cpu: The cpu for which load balancing is currently being performed.
2714 * @imbalance: Variable which stores amount of weighted load which should
2715 * be moved to restore balance/put a group to idle.
2716 * @idle: The idle status of this_cpu.
2717 * @sd_idle: The idleness of sd
2718 * @cpus: The set of CPUs under consideration for load-balancing.
2719 * @balance: Pointer to a variable indicating if this_cpu
2720 * is the appropriate cpu to perform load balancing at this_level.
2721 *
2722 * Returns: - the busiest group if imbalance exists.
2723 * - If no imbalance and user has opted for power-savings balance,
2724 * return the least loaded group whose CPUs can be
2725 * put to idle by rebalancing its tasks onto our group.
2726 */
2727static struct sched_group *
2728find_busiest_group(struct sched_domain *sd, int this_cpu,
2729 unsigned long *imbalance, enum cpu_idle_type idle,
2730 int *sd_idle, const struct cpumask *cpus, int *balance)
2731{
2732 struct sd_lb_stats sds;
2733
2734 memset(&sds, 0, sizeof(sds));
2735
2736 /*
2737 * Compute the various statistics relavent for load balancing at
2738 * this level.
2739 */
2740 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
2741 balance, &sds);
2742
2743 /* Cases where imbalance does not exist from POV of this_cpu */
2744 /* 1) this_cpu is not the appropriate cpu to perform load balancing
2745 * at this level.
2746 * 2) There is no busy sibling group to pull from.
2747 * 3) This group is the busiest group.
2748 * 4) This group is more busy than the avg busieness at this
2749 * sched_domain.
2750 * 5) The imbalance is within the specified limit.
2751 */
2752 if (!(*balance))
2753 goto ret;
2754
2755 if (!sds.busiest || sds.busiest_nr_running == 0)
2756 goto out_balanced;
2757
2758 if (sds.this_load >= sds.max_load)
2759 goto out_balanced;
2760
2761 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
2762
2763 if (sds.this_load >= sds.avg_load)
2764 goto out_balanced;
2765
2766 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2767 goto out_balanced;
2768
2769 /* Looks like there is an imbalance. Compute it */
2770 calculate_imbalance(&sds, this_cpu, imbalance);
2771 return sds.busiest;
2772
2773out_balanced:
2774 /*
2775 * There is no obvious imbalance. But check if we can do some balancing
2776 * to save power.
2777 */
2778 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
2779 return sds.busiest;
2780ret:
2781 *imbalance = 0;
2782 return NULL;
2783}
2784
2785/*
2786 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2787 */
2788static struct rq *
2789find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2790 unsigned long imbalance, const struct cpumask *cpus)
2791{
2792 struct rq *busiest = NULL, *rq;
2793 unsigned long max_load = 0;
2794 int i;
2795
2796 for_each_cpu(i, sched_group_cpus(group)) {
2797 unsigned long power = power_of(i);
2798 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2799 unsigned long wl;
2800
2801 if (!cpumask_test_cpu(i, cpus))
2802 continue;
2803
2804 rq = cpu_rq(i);
2805 wl = weighted_cpuload(i);
2806
2807 /*
2808 * When comparing with imbalance, use weighted_cpuload()
2809 * which is not scaled with the cpu power.
2810 */
2811 if (capacity && rq->nr_running == 1 && wl > imbalance)
2812 continue;
2813
2814 /*
2815 * For the load comparisons with the other cpu's, consider
2816 * the weighted_cpuload() scaled with the cpu power, so that
2817 * the load can be moved away from the cpu that is potentially
2818 * running at a lower capacity.
2819 */
2820 wl = (wl * SCHED_LOAD_SCALE) / power;
2821
2822 if (wl > max_load) {
2823 max_load = wl;
2824 busiest = rq;
2825 }
2826 }
2827
2828 return busiest;
2829}
2830
2831/*
2832 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2833 * so long as it is large enough.
2834 */
2835#define MAX_PINNED_INTERVAL 512
2836
2837/* Working cpumask for load_balance and load_balance_newidle. */
2838static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2839
2840static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2841{
2842 if (idle == CPU_NEWLY_IDLE) {
2843 /*
2844 * The only task running in a non-idle cpu can be moved to this
2845 * cpu in an attempt to completely freeup the other CPU
2846 * package.
2847 *
2848 * The package power saving logic comes from
2849 * find_busiest_group(). If there are no imbalance, then
2850 * f_b_g() will return NULL. However when sched_mc={1,2} then
2851 * f_b_g() will select a group from which a running task may be
2852 * pulled to this cpu in order to make the other package idle.
2853 * If there is no opportunity to make a package idle and if
2854 * there are no imbalance, then f_b_g() will return NULL and no
2855 * action will be taken in load_balance_newidle().
2856 *
2857 * Under normal task pull operation due to imbalance, there
2858 * will be more than one task in the source run queue and
2859 * move_tasks() will succeed. ld_moved will be true and this
2860 * active balance code will not be triggered.
2861 */
2862 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2863 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2864 return 0;
2865
2866 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
2867 return 0;
2868 }
2869
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871}
2872
2873/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance.
2876 */
2877static int load_balance(int this_cpu, struct rq *this_rq,
2878 struct sched_domain *sd, enum cpu_idle_type idle,
2879 int *balance)
2880{
2881 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2882 struct sched_group *group;
2883 unsigned long imbalance;
2884 struct rq *busiest;
2885 unsigned long flags;
2886 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
2887
2888 cpumask_copy(cpus, cpu_active_mask);
2889
2890 /*
2891 * When power savings policy is enabled for the parent domain, idle
2892 * sibling can pick up load irrespective of busy siblings. In this case,
2893 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2894 * portraying it as CPU_NOT_IDLE.
2895 */
2896 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2897 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2898 sd_idle = 1;
2899
2900 schedstat_inc(sd, lb_count[idle]);
2901
2902redo:
2903 update_shares(sd);
2904 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2905 cpus, balance);
2906
2907 if (*balance == 0)
2908 goto out_balanced;
2909
2910 if (!group) {
2911 schedstat_inc(sd, lb_nobusyg[idle]);
2912 goto out_balanced;
2913 }
2914
2915 busiest = find_busiest_queue(group, idle, imbalance, cpus);
2916 if (!busiest) {
2917 schedstat_inc(sd, lb_nobusyq[idle]);
2918 goto out_balanced;
2919 }
2920
2921 BUG_ON(busiest == this_rq);
2922
2923 schedstat_add(sd, lb_imbalance[idle], imbalance);
2924
2925 ld_moved = 0;
2926 if (busiest->nr_running > 1) {
2927 /*
2928 * Attempt to move tasks. If find_busiest_group has found
2929 * an imbalance but busiest->nr_running <= 1, the group is
2930 * still unbalanced. ld_moved simply stays zero, so it is
2931 * correctly treated as an imbalance.
2932 */
2933 local_irq_save(flags);
2934 double_rq_lock(this_rq, busiest);
2935 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2936 imbalance, sd, idle, &all_pinned);
2937 double_rq_unlock(this_rq, busiest);
2938 local_irq_restore(flags);
2939
2940 /*
2941 * some other cpu did the load balance for us.
2942 */
2943 if (ld_moved && this_cpu != smp_processor_id())
2944 resched_cpu(this_cpu);
2945
2946 /* All tasks on this runqueue were pinned by CPU affinity */
2947 if (unlikely(all_pinned)) {
2948 cpumask_clear_cpu(cpu_of(busiest), cpus);
2949 if (!cpumask_empty(cpus))
2950 goto redo;
2951 goto out_balanced;
2952 }
2953 }
2954
2955 if (!ld_moved) {
2956 schedstat_inc(sd, lb_failed[idle]);
2957 sd->nr_balance_failed++;
2958
2959 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags);
2961
2962 /* don't kick the migration_thread, if the curr
2963 * task on busiest cpu can't be moved to this_cpu
2964 */
2965 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) {
2967 raw_spin_unlock_irqrestore(&busiest->lock,
2968 flags);
2969 all_pinned = 1;
2970 goto out_one_pinned;
2971 }
2972
2973 if (!busiest->active_balance) {
2974 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu;
2976 active_balance = 1;
2977 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2979 if (active_balance)
2980 wake_up_process(busiest->migration_thread);
2981
2982 /*
2983 * We've kicked active balancing, reset the failure
2984 * counter.
2985 */
2986 sd->nr_balance_failed = sd->cache_nice_tries+1;
2987 }
2988 } else
2989 sd->nr_balance_failed = 0;
2990
2991 if (likely(!active_balance)) {
2992 /* We were unbalanced, so reset the balancing interval */
2993 sd->balance_interval = sd->min_interval;
2994 } else {
2995 /*
2996 * If we've begun active balancing, start to back off. This
2997 * case may not be covered by the all_pinned logic if there
2998 * is only 1 task on the busy runqueue (because we don't call
2999 * move_tasks).
3000 */
3001 if (sd->balance_interval < sd->max_interval)
3002 sd->balance_interval *= 2;
3003 }
3004
3005 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3006 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3007 ld_moved = -1;
3008
3009 goto out;
3010
3011out_balanced:
3012 schedstat_inc(sd, lb_balanced[idle]);
3013
3014 sd->nr_balance_failed = 0;
3015
3016out_one_pinned:
3017 /* tune up the balancing interval */
3018 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3019 (sd->balance_interval < sd->max_interval))
3020 sd->balance_interval *= 2;
3021
3022 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3023 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3024 ld_moved = -1;
3025 else
3026 ld_moved = 0;
3027out:
3028 if (ld_moved)
3029 update_shares(sd);
3030 return ld_moved;
3031}
3032
3033/*
3034 * idle_balance is called by schedule() if this_cpu is about to become
3035 * idle. Attempts to pull tasks from other CPUs.
3036 */
3037static void idle_balance(int this_cpu, struct rq *this_rq)
3038{
3039 struct sched_domain *sd;
3040 int pulled_task = 0;
3041 unsigned long next_balance = jiffies + HZ;
3042
3043 this_rq->idle_stamp = this_rq->clock;
3044
3045 if (this_rq->avg_idle < sysctl_sched_migration_cost)
3046 return;
3047
3048 /*
3049 * Drop the rq->lock, but keep IRQ/preempt disabled.
3050 */
3051 raw_spin_unlock(&this_rq->lock);
3052
3053 for_each_domain(this_cpu, sd) {
3054 unsigned long interval;
3055 int balance = 1;
3056
3057 if (!(sd->flags & SD_LOAD_BALANCE))
3058 continue;
3059
3060 if (sd->flags & SD_BALANCE_NEWIDLE) {
3061 /* If we've pulled tasks over stop searching: */
3062 pulled_task = load_balance(this_cpu, this_rq,
3063 sd, CPU_NEWLY_IDLE, &balance);
3064 }
3065
3066 interval = msecs_to_jiffies(sd->balance_interval);
3067 if (time_after(next_balance, sd->last_balance + interval))
3068 next_balance = sd->last_balance + interval;
3069 if (pulled_task) {
3070 this_rq->idle_stamp = 0;
3071 break;
3072 }
3073 }
3074
3075 raw_spin_lock(&this_rq->lock);
3076
3077 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3078 /*
3079 * We are going idle. next_balance may be set based on
3080 * a busy processor. So reset next_balance.
3081 */
3082 this_rq->next_balance = next_balance;
3083 }
3084}
3085
3086/*
3087 * active_load_balance is run by migration threads. It pushes running tasks
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
3089 * running on each physical CPU where possible, and avoids physical /
3090 * logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3095{
3096 int target_cpu = busiest_rq->push_cpu;
3097 struct sched_domain *sd;
3098 struct rq *target_rq;
3099
3100 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1)
3102 return;
3103
3104 target_rq = cpu_rq(target_cpu);
3105
3106 /*
3107 * This condition is "impossible", if it occurs
3108 * we need to fix it. Originally reported by
3109 * Bjorn Helgaas on a 128-cpu setup.
3110 */
3111 BUG_ON(busiest_rq == target_rq);
3112
3113 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117
3118 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) {
3120 if ((sd->flags & SD_LOAD_BALANCE) &&
3121 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3122 break;
3123 }
3124
3125 if (likely(sd)) {
3126 schedstat_inc(sd, alb_count);
3127
3128 if (move_one_task(target_rq, target_cpu, busiest_rq,
3129 sd, CPU_IDLE))
3130 schedstat_inc(sd, alb_pushed);
3131 else
3132 schedstat_inc(sd, alb_failed);
3133 }
3134 double_unlock_balance(busiest_rq, target_rq);
3135}
3136
3137#ifdef CONFIG_NO_HZ
3138static struct {
3139 atomic_t load_balancer;
3140 cpumask_var_t cpu_mask;
3141 cpumask_var_t ilb_grp_nohz_mask;
3142} nohz ____cacheline_aligned = {
3143 .load_balancer = ATOMIC_INIT(-1),
3144};
3145
3146int get_nohz_load_balancer(void)
3147{
3148 return atomic_read(&nohz.load_balancer);
3149}
3150
3151#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3152/**
3153 * lowest_flag_domain - Return lowest sched_domain containing flag.
3154 * @cpu: The cpu whose lowest level of sched domain is to
3155 * be returned.
3156 * @flag: The flag to check for the lowest sched_domain
3157 * for the given cpu.
3158 *
3159 * Returns the lowest sched_domain of a cpu which contains the given flag.
3160 */
3161static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3162{
3163 struct sched_domain *sd;
3164
3165 for_each_domain(cpu, sd)
3166 if (sd && (sd->flags & flag))
3167 break;
3168
3169 return sd;
3170}
3171
3172/**
3173 * for_each_flag_domain - Iterates over sched_domains containing the flag.
3174 * @cpu: The cpu whose domains we're iterating over.
3175 * @sd: variable holding the value of the power_savings_sd
3176 * for cpu.
3177 * @flag: The flag to filter the sched_domains to be iterated.
3178 *
3179 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
3180 * set, starting from the lowest sched_domain to the highest.
3181 */
3182#define for_each_flag_domain(cpu, sd, flag) \
3183 for (sd = lowest_flag_domain(cpu, flag); \
3184 (sd && (sd->flags & flag)); sd = sd->parent)
3185
3186/**
3187 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
3188 * @ilb_group: group to be checked for semi-idleness
3189 *
3190 * Returns: 1 if the group is semi-idle. 0 otherwise.
3191 *
3192 * We define a sched_group to be semi idle if it has atleast one idle-CPU
3193 * and atleast one non-idle CPU. This helper function checks if the given
3194 * sched_group is semi-idle or not.
3195 */
3196static inline int is_semi_idle_group(struct sched_group *ilb_group)
3197{
3198 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
3199 sched_group_cpus(ilb_group));
3200
3201 /*
3202 * A sched_group is semi-idle when it has atleast one busy cpu
3203 * and atleast one idle cpu.
3204 */
3205 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
3206 return 0;
3207
3208 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
3209 return 0;
3210
3211 return 1;
3212}
3213/**
3214 * find_new_ilb - Finds the optimum idle load balancer for nomination.
3215 * @cpu: The cpu which is nominating a new idle_load_balancer.
3216 *
3217 * Returns: Returns the id of the idle load balancer if it exists,
3218 * Else, returns >= nr_cpu_ids.
3219 *
3220 * This algorithm picks the idle load balancer such that it belongs to a
3221 * semi-idle powersavings sched_domain. The idea is to try and avoid
3222 * completely idle packages/cores just for the purpose of idle load balancing
3223 * when there are other idle cpu's which are better suited for that job.
3224 */
3225static int find_new_ilb(int cpu)
3226{
3227 struct sched_domain *sd;
3228 struct sched_group *ilb_group;
3229
3230 /*
3231 * Have idle load balancer selection from semi-idle packages only
3232 * when power-aware load balancing is enabled
3233 */
3234 if (!(sched_smt_power_savings || sched_mc_power_savings))
3235 goto out_done;
3236
3237 /*
3238 * Optimize for the case when we have no idle CPUs or only one
3239 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3240 */
3241 if (cpumask_weight(nohz.cpu_mask) < 2)
3242 goto out_done;
3243
3244 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3245 ilb_group = sd->groups;
3246
3247 do {
3248 if (is_semi_idle_group(ilb_group))
3249 return cpumask_first(nohz.ilb_grp_nohz_mask);
3250
3251 ilb_group = ilb_group->next;
3252
3253 } while (ilb_group != sd->groups);
3254 }
3255
3256out_done:
3257 return cpumask_first(nohz.cpu_mask);
3258}
3259#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3260static inline int find_new_ilb(int call_cpu)
3261{
3262 return cpumask_first(nohz.cpu_mask);
3263}
3264#endif
3265
3266/*
3267 * This routine will try to nominate the ilb (idle load balancing)
3268 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3269 * load balancing on behalf of all those cpus. If all the cpus in the system
3270 * go into this tickless mode, then there will be no ilb owner (as there is
3271 * no need for one) and all the cpus will sleep till the next wakeup event
3272 * arrives...
3273 *
3274 * For the ilb owner, tick is not stopped. And this tick will be used
3275 * for idle load balancing. ilb owner will still be part of
3276 * nohz.cpu_mask..
3277 *
3278 * While stopping the tick, this cpu will become the ilb owner if there
3279 * is no other owner. And will be the owner till that cpu becomes busy
3280 * or if all cpus in the system stop their ticks at which point
3281 * there is no need for ilb owner.
3282 *
3283 * When the ilb owner becomes busy, it nominates another owner, during the
3284 * next busy scheduler_tick()
3285 */
3286int select_nohz_load_balancer(int stop_tick)
3287{
3288 int cpu = smp_processor_id();
3289
3290 if (stop_tick) {
3291 cpu_rq(cpu)->in_nohz_recently = 1;
3292
3293 if (!cpu_active(cpu)) {
3294 if (atomic_read(&nohz.load_balancer) != cpu)
3295 return 0;
3296
3297 /*
3298 * If we are going offline and still the leader,
3299 * give up!
3300 */
3301 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3302 BUG();
3303
3304 return 0;
3305 }
3306
3307 cpumask_set_cpu(cpu, nohz.cpu_mask);
3308
3309 /* time for ilb owner also to sleep */
3310 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
3311 if (atomic_read(&nohz.load_balancer) == cpu)
3312 atomic_set(&nohz.load_balancer, -1);
3313 return 0;
3314 }
3315
3316 if (atomic_read(&nohz.load_balancer) == -1) {
3317 /* make me the ilb owner */
3318 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3319 return 1;
3320 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3321 int new_ilb;
3322
3323 if (!(sched_smt_power_savings ||
3324 sched_mc_power_savings))
3325 return 1;
3326 /*
3327 * Check to see if there is a more power-efficient
3328 * ilb.
3329 */
3330 new_ilb = find_new_ilb(cpu);
3331 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3332 atomic_set(&nohz.load_balancer, -1);
3333 resched_cpu(new_ilb);
3334 return 0;
3335 }
3336 return 1;
3337 }
3338 } else {
3339 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3340 return 0;
3341
3342 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3343
3344 if (atomic_read(&nohz.load_balancer) == cpu)
3345 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3346 BUG();
3347 }
3348 return 0;
3349}
3350#endif
3351
3352static DEFINE_SPINLOCK(balancing);
3353
3354/*
3355 * It checks each scheduling domain to see if it is due to be balanced,
3356 * and initiates a balancing operation if so.
3357 *
3358 * Balancing parameters are set up in arch_init_sched_domains.
3359 */
3360static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3361{
3362 int balance = 1;
3363 struct rq *rq = cpu_rq(cpu);
3364 unsigned long interval;
3365 struct sched_domain *sd;
3366 /* Earliest time when we have to do rebalance again */
3367 unsigned long next_balance = jiffies + 60*HZ;
3368 int update_next_balance = 0;
3369 int need_serialize;
3370
3371 for_each_domain(cpu, sd) {
3372 if (!(sd->flags & SD_LOAD_BALANCE))
3373 continue;
3374
3375 interval = sd->balance_interval;
3376 if (idle != CPU_IDLE)
3377 interval *= sd->busy_factor;
3378
3379 /* scale ms to jiffies */
3380 interval = msecs_to_jiffies(interval);
3381 if (unlikely(!interval))
3382 interval = 1;
3383 if (interval > HZ*NR_CPUS/10)
3384 interval = HZ*NR_CPUS/10;
3385
3386 need_serialize = sd->flags & SD_SERIALIZE;
3387
3388 if (need_serialize) {
3389 if (!spin_trylock(&balancing))
3390 goto out;
3391 }
3392
3393 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3394 if (load_balance(cpu, rq, sd, idle, &balance)) {
3395 /*
3396 * We've pulled tasks over so either we're no
3397 * longer idle, or one of our SMT siblings is
3398 * not idle.
3399 */
3400 idle = CPU_NOT_IDLE;
3401 }
3402 sd->last_balance = jiffies;
3403 }
3404 if (need_serialize)
3405 spin_unlock(&balancing);
3406out:
3407 if (time_after(next_balance, sd->last_balance + interval)) {
3408 next_balance = sd->last_balance + interval;
3409 update_next_balance = 1;
3410 }
3411
3412 /*
3413 * Stop the load balance at this level. There is another
3414 * CPU in our sched group which is doing load balancing more
3415 * actively.
3416 */
3417 if (!balance)
3418 break;
3419 }
3420
3421 /*
3422 * next_balance will be updated only when there is a need.
3423 * When the cpu is attached to null domain for ex, it will not be
3424 * updated.
3425 */
3426 if (likely(update_next_balance))
3427 rq->next_balance = next_balance;
3428}
3429
3430/*
3431 * run_rebalance_domains is triggered when needed from the scheduler tick.
3432 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3433 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3434 */
3435static void run_rebalance_domains(struct softirq_action *h)
3436{
3437 int this_cpu = smp_processor_id();
3438 struct rq *this_rq = cpu_rq(this_cpu);
3439 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3440 CPU_IDLE : CPU_NOT_IDLE;
3441
3442 rebalance_domains(this_cpu, idle);
3443
3444#ifdef CONFIG_NO_HZ
3445 /*
3446 * If this cpu is the owner for idle load balancing, then do the
3447 * balancing on behalf of the other idle cpus whose ticks are
3448 * stopped.
3449 */
3450 if (this_rq->idle_at_tick &&
3451 atomic_read(&nohz.load_balancer) == this_cpu) {
3452 struct rq *rq;
3453 int balance_cpu;
3454
3455 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3456 if (balance_cpu == this_cpu)
3457 continue;
3458
3459 /*
3460 * If this cpu gets work to do, stop the load balancing
3461 * work being done for other cpus. Next load
3462 * balancing owner will pick it up.
3463 */
3464 if (need_resched())
3465 break;
3466
3467 rebalance_domains(balance_cpu, CPU_IDLE);
3468
3469 rq = cpu_rq(balance_cpu);
3470 if (time_after(this_rq->next_balance, rq->next_balance))
3471 this_rq->next_balance = rq->next_balance;
3472 }
3473 }
3474#endif
3475}
3476
3477static inline int on_null_domain(int cpu)
3478{
3479 return !rcu_dereference(cpu_rq(cpu)->sd);
3480}
3481
3482/*
3483 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3484 *
3485 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3486 * idle load balancing owner or decide to stop the periodic load balancing,
3487 * if the whole system is idle.
3488 */
3489static inline void trigger_load_balance(struct rq *rq, int cpu)
3490{
3491#ifdef CONFIG_NO_HZ
3492 /*
3493 * If we were in the nohz mode recently and busy at the current
3494 * scheduler tick, then check if we need to nominate new idle
3495 * load balancer.
3496 */
3497 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3498 rq->in_nohz_recently = 0;
3499
3500 if (atomic_read(&nohz.load_balancer) == cpu) {
3501 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3502 atomic_set(&nohz.load_balancer, -1);
3503 }
3504
3505 if (atomic_read(&nohz.load_balancer) == -1) {
3506 int ilb = find_new_ilb(cpu);
3507
3508 if (ilb < nr_cpu_ids)
3509 resched_cpu(ilb);
3510 }
3511 }
3512
3513 /*
3514 * If this cpu is idle and doing idle load balancing for all the
3515 * cpus with ticks stopped, is it time for that to stop?
3516 */
3517 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3518 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3519 resched_cpu(cpu);
3520 return;
3521 }
3522
3523 /*
3524 * If this cpu is idle and the idle load balancing is done by
3525 * someone else, then no need raise the SCHED_SOFTIRQ
3526 */
3527 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3528 cpumask_test_cpu(cpu, nohz.cpu_mask))
3529 return;
3530#endif
3531 /* Don't need to rebalance while attached to NULL domain */
3532 if (time_after_eq(jiffies, rq->next_balance) &&
3533 likely(!on_null_domain(cpu)))
3534 raise_softirq(SCHED_SOFTIRQ);
3535}
1954 3536
1955static void rq_online_fair(struct rq *rq) 3537static void rq_online_fair(struct rq *rq)
1956{ 3538{
@@ -1962,6 +3544,15 @@ static void rq_offline_fair(struct rq *rq)
1962 update_sysctl(); 3544 update_sysctl();
1963} 3545}
1964 3546
3547#else /* CONFIG_SMP */
3548
3549/*
3550 * on UP we do not need to balance between CPUs:
3551 */
3552static inline void idle_balance(int cpu, struct rq *rq)
3553{
3554}
3555
1965#endif /* CONFIG_SMP */ 3556#endif /* CONFIG_SMP */
1966 3557
1967/* 3558/*
@@ -2076,7 +3667,7 @@ static void moved_group_fair(struct task_struct *p, int on_rq)
2076} 3667}
2077#endif 3668#endif
2078 3669
2079unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 3670static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
2080{ 3671{
2081 struct sched_entity *se = &task->se; 3672 struct sched_entity *se = &task->se;
2082 unsigned int rr_interval = 0; 3673 unsigned int rr_interval = 0;
@@ -2108,8 +3699,6 @@ static const struct sched_class fair_sched_class = {
2108#ifdef CONFIG_SMP 3699#ifdef CONFIG_SMP
2109 .select_task_rq = select_task_rq_fair, 3700 .select_task_rq = select_task_rq_fair,
2110 3701
2111 .load_balance = load_balance_fair,
2112 .move_one_task = move_one_task_fair,
2113 .rq_online = rq_online_fair, 3702 .rq_online = rq_online_fair,
2114 .rq_offline = rq_offline_fair, 3703 .rq_offline = rq_offline_fair,
2115 3704
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 5f93b570d383..a8a6d8a50947 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
44{ 44{
45} 45}
46 46
47#ifdef CONFIG_SMP
48static unsigned long
49load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
50 unsigned long max_load_move,
51 struct sched_domain *sd, enum cpu_idle_type idle,
52 int *all_pinned, int *this_best_prio)
53{
54 return 0;
55}
56
57static int
58move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
59 struct sched_domain *sd, enum cpu_idle_type idle)
60{
61 return 0;
62}
63#endif
64
65static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 47static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
66{ 48{
67} 49}
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 79 check_preempt_curr(rq, p, 0);
98} 80}
99 81
100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 83{
102 return 0; 84 return 0;
103} 85}
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = {
119 101
120#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
121 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
122
123 .load_balance = load_balance_idle,
124 .move_one_task = move_one_task_idle,
125#endif 104#endif
126 105
127 .set_curr_task = set_curr_task_idle, 106 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f48328ac216f..5a6ed1f0990a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
194 return rt_se->my_q; 194 return rt_se->my_q;
195} 195}
196 196
197static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 197static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
198static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 198static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
199 199
200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 int this_cpu = smp_processor_id();
202 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 203 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
203 struct sched_rt_entity *rt_se = rt_rq->rt_se; 204 struct sched_rt_entity *rt_se;
205
206 rt_se = rt_rq->tg->rt_se[this_cpu];
204 207
205 if (rt_rq->rt_nr_running) { 208 if (rt_rq->rt_nr_running) {
206 if (rt_se && !on_rt_rq(rt_se)) 209 if (rt_se && !on_rt_rq(rt_se))
207 enqueue_rt_entity(rt_se); 210 enqueue_rt_entity(rt_se, false);
208 if (rt_rq->highest_prio.curr < curr->prio) 211 if (rt_rq->highest_prio.curr < curr->prio)
209 resched_task(curr); 212 resched_task(curr);
210 } 213 }
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212 215
213static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 216static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
214{ 217{
215 struct sched_rt_entity *rt_se = rt_rq->rt_se; 218 int this_cpu = smp_processor_id();
219 struct sched_rt_entity *rt_se;
220
221 rt_se = rt_rq->tg->rt_se[this_cpu];
216 222
217 if (rt_se && on_rt_rq(rt_se)) 223 if (rt_se && on_rt_rq(rt_se))
218 dequeue_rt_entity(rt_se); 224 dequeue_rt_entity(rt_se);
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
803 dec_rt_group(rt_se, rt_rq); 809 dec_rt_group(rt_se, rt_rq);
804} 810}
805 811
806static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 812static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
807{ 813{
808 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 814 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
809 struct rt_prio_array *array = &rt_rq->active; 815 struct rt_prio_array *array = &rt_rq->active;
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
819 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
820 return; 826 return;
821 827
822 list_add_tail(&rt_se->run_list, queue); 828 if (head)
829 list_add(&rt_se->run_list, queue);
830 else
831 list_add_tail(&rt_se->run_list, queue);
823 __set_bit(rt_se_prio(rt_se), array->bitmap); 832 __set_bit(rt_se_prio(rt_se), array->bitmap);
824 833
825 inc_rt_tasks(rt_se, rt_rq); 834 inc_rt_tasks(rt_se, rt_rq);
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
856 } 865 }
857} 866}
858 867
859static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 868static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
860{ 869{
861 dequeue_rt_stack(rt_se); 870 dequeue_rt_stack(rt_se);
862 for_each_sched_rt_entity(rt_se) 871 for_each_sched_rt_entity(rt_se)
863 __enqueue_rt_entity(rt_se); 872 __enqueue_rt_entity(rt_se, head);
864} 873}
865 874
866static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 875static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
@@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
871 struct rt_rq *rt_rq = group_rt_rq(rt_se); 880 struct rt_rq *rt_rq = group_rt_rq(rt_se);
872 881
873 if (rt_rq && rt_rq->rt_nr_running) 882 if (rt_rq && rt_rq->rt_nr_running)
874 __enqueue_rt_entity(rt_se); 883 __enqueue_rt_entity(rt_se, false);
875 } 884 }
876} 885}
877 886
878/* 887/*
879 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
880 */ 889 */
881static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
882{ 892{
883 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
884 894
885 if (wakeup) 895 if (wakeup)
886 rt_se->timeout = 0; 896 rt_se->timeout = 0;
887 897
888 enqueue_rt_entity(rt_se); 898 enqueue_rt_entity(rt_se, head);
889 899
890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
891 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
@@ -1481,24 +1491,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1481 push_rt_tasks(rq); 1491 push_rt_tasks(rq);
1482} 1492}
1483 1493
1484static unsigned long
1485load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1486 unsigned long max_load_move,
1487 struct sched_domain *sd, enum cpu_idle_type idle,
1488 int *all_pinned, int *this_best_prio)
1489{
1490 /* don't touch RT tasks */
1491 return 0;
1492}
1493
1494static int
1495move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1496 struct sched_domain *sd, enum cpu_idle_type idle)
1497{
1498 /* don't touch RT tasks */
1499 return 0;
1500}
1501
1502static void set_cpus_allowed_rt(struct task_struct *p, 1494static void set_cpus_allowed_rt(struct task_struct *p,
1503 const struct cpumask *new_mask) 1495 const struct cpumask *new_mask)
1504{ 1496{
@@ -1670,8 +1662,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1670 if (!p->signal) 1662 if (!p->signal)
1671 return; 1663 return;
1672 1664
1673 soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; 1665 /* max may change after cur was read, this will be fixed next tick */
1674 hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; 1666 soft = task_rlimit(p, RLIMIT_RTTIME);
1667 hard = task_rlimit_max(p, RLIMIT_RTTIME);
1675 1668
1676 if (soft != RLIM_INFINITY) { 1669 if (soft != RLIM_INFINITY) {
1677 unsigned long next; 1670 unsigned long next;
@@ -1721,7 +1714,7 @@ static void set_curr_task_rt(struct rq *rq)
1721 dequeue_pushable_task(rq, p); 1714 dequeue_pushable_task(rq, p);
1722} 1715}
1723 1716
1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 1717static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1725{ 1718{
1726 /* 1719 /*
1727 * Time slice is 0 for SCHED_FIFO tasks 1720 * Time slice is 0 for SCHED_FIFO tasks
@@ -1746,8 +1739,6 @@ static const struct sched_class rt_sched_class = {
1746#ifdef CONFIG_SMP 1739#ifdef CONFIG_SMP
1747 .select_task_rq = select_task_rq_rt, 1740 .select_task_rq = select_task_rq_rt,
1748 1741
1749 .load_balance = load_balance_rt,
1750 .move_one_task = move_one_task_rt,
1751 .set_cpus_allowed = set_cpus_allowed_rt, 1742 .set_cpus_allowed = set_cpus_allowed_rt,
1752 .rq_online = rq_online_rt, 1743 .rq_online = rq_online_rt,
1753 .rq_offline = rq_offline_rt, 1744 .rq_offline = rq_offline_rt,
diff --git a/kernel/signal.c b/kernel/signal.c
index 934ae5e687b9..dbd7fe073c55 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -159,6 +159,10 @@ void recalc_sigpending(void)
159 159
160/* Given the mask, find the first available signal that should be serviced. */ 160/* Given the mask, find the first available signal that should be serviced. */
161 161
162#define SYNCHRONOUS_MASK \
163 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
164 sigmask(SIGTRAP) | sigmask(SIGFPE))
165
162int next_signal(struct sigpending *pending, sigset_t *mask) 166int next_signal(struct sigpending *pending, sigset_t *mask)
163{ 167{
164 unsigned long i, *s, *m, x; 168 unsigned long i, *s, *m, x;
@@ -166,26 +170,39 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
166 170
167 s = pending->signal.sig; 171 s = pending->signal.sig;
168 m = mask->sig; 172 m = mask->sig;
173
174 /*
175 * Handle the first word specially: it contains the
176 * synchronous signals that need to be dequeued first.
177 */
178 x = *s &~ *m;
179 if (x) {
180 if (x & SYNCHRONOUS_MASK)
181 x &= SYNCHRONOUS_MASK;
182 sig = ffz(~x) + 1;
183 return sig;
184 }
185
169 switch (_NSIG_WORDS) { 186 switch (_NSIG_WORDS) {
170 default: 187 default:
171 for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) 188 for (i = 1; i < _NSIG_WORDS; ++i) {
172 if ((x = *s &~ *m) != 0) { 189 x = *++s &~ *++m;
173 sig = ffz(~x) + i*_NSIG_BPW + 1; 190 if (!x)
174 break; 191 continue;
175 } 192 sig = ffz(~x) + i*_NSIG_BPW + 1;
193 break;
194 }
176 break; 195 break;
177 196
178 case 2: if ((x = s[0] &~ m[0]) != 0) 197 case 2:
179 sig = 1; 198 x = s[1] &~ m[1];
180 else if ((x = s[1] &~ m[1]) != 0) 199 if (!x)
181 sig = _NSIG_BPW + 1;
182 else
183 break; 200 break;
184 sig += ffz(~x); 201 sig = ffz(~x) + _NSIG_BPW + 1;
185 break; 202 break;
186 203
187 case 1: if ((x = *s &~ *m) != 0) 204 case 1:
188 sig = ffz(~x) + 1; 205 /* Nothing to do */
189 break; 206 break;
190 } 207 }
191 208
@@ -228,7 +245,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
228 245
229 if (override_rlimit || 246 if (override_rlimit ||
230 atomic_read(&user->sigpending) <= 247 atomic_read(&user->sigpending) <=
231 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { 248 task_rlimit(t, RLIMIT_SIGPENDING)) {
232 q = kmem_cache_alloc(sigqueue_cachep, flags); 249 q = kmem_cache_alloc(sigqueue_cachep, flags);
233 } else { 250 } else {
234 print_dropped_signal(sig); 251 print_dropped_signal(sig);
diff --git a/kernel/smp.c b/kernel/smp.c
index f10408422444..9867b6bfefce 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -12,8 +12,6 @@
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/cpu.h> 13#include <linux/cpu.h>
14 14
15static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16
17static struct { 15static struct {
18 struct list_head queue; 16 struct list_head queue;
19 raw_spinlock_t lock; 17 raw_spinlock_t lock;
@@ -33,12 +31,14 @@ struct call_function_data {
33 cpumask_var_t cpumask; 31 cpumask_var_t cpumask;
34}; 32};
35 33
34static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
35
36struct call_single_queue { 36struct call_single_queue {
37 struct list_head list; 37 struct list_head list;
38 raw_spinlock_t lock; 38 raw_spinlock_t lock;
39}; 39};
40 40
41static DEFINE_PER_CPU(struct call_function_data, cfd_data); 41static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue);
42 42
43static int 43static int
44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -256,7 +256,7 @@ void generic_smp_call_function_single_interrupt(void)
256 } 256 }
257} 257}
258 258
259static DEFINE_PER_CPU(struct call_single_data, csd_data); 259static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
260 260
261/* 261/*
262 * smp_call_function_single - Run a function on a specific CPU 262 * smp_call_function_single - Run a function on a specific CPU
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 818d7d9aa03c..bde4295774c8 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -34,6 +34,30 @@
34#include <linux/smp.h> 34#include <linux/smp.h>
35#include <linux/srcu.h> 35#include <linux/srcu.h>
36 36
37static int init_srcu_struct_fields(struct srcu_struct *sp)
38{
39 sp->completed = 0;
40 mutex_init(&sp->mutex);
41 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
42 return sp->per_cpu_ref ? 0 : -ENOMEM;
43}
44
45#ifdef CONFIG_DEBUG_LOCK_ALLOC
46
47int __init_srcu_struct(struct srcu_struct *sp, const char *name,
48 struct lock_class_key *key)
49{
50#ifdef CONFIG_DEBUG_LOCK_ALLOC
51 /* Don't re-initialize a lock while it is held. */
52 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
53 lockdep_init_map(&sp->dep_map, name, key, 0);
54#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
55 return init_srcu_struct_fields(sp);
56}
57EXPORT_SYMBOL_GPL(__init_srcu_struct);
58
59#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
60
37/** 61/**
38 * init_srcu_struct - initialize a sleep-RCU structure 62 * init_srcu_struct - initialize a sleep-RCU structure
39 * @sp: structure to initialize. 63 * @sp: structure to initialize.
@@ -44,13 +68,12 @@
44 */ 68 */
45int init_srcu_struct(struct srcu_struct *sp) 69int init_srcu_struct(struct srcu_struct *sp)
46{ 70{
47 sp->completed = 0; 71 return init_srcu_struct_fields(sp);
48 mutex_init(&sp->mutex);
49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
50 return (sp->per_cpu_ref ? 0 : -ENOMEM);
51} 72}
52EXPORT_SYMBOL_GPL(init_srcu_struct); 73EXPORT_SYMBOL_GPL(init_srcu_struct);
53 74
75#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
76
54/* 77/*
55 * srcu_readers_active_idx -- returns approximate number of readers 78 * srcu_readers_active_idx -- returns approximate number of readers
56 * active on the specified rank of per-CPU counters. 79 * active on the specified rank of per-CPU counters.
@@ -100,15 +123,12 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
100} 123}
101EXPORT_SYMBOL_GPL(cleanup_srcu_struct); 124EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
102 125
103/** 126/*
104 * srcu_read_lock - register a new reader for an SRCU-protected structure.
105 * @sp: srcu_struct in which to register the new reader.
106 *
107 * Counts the new reader in the appropriate per-CPU element of the 127 * Counts the new reader in the appropriate per-CPU element of the
108 * srcu_struct. Must be called from process context. 128 * srcu_struct. Must be called from process context.
109 * Returns an index that must be passed to the matching srcu_read_unlock(). 129 * Returns an index that must be passed to the matching srcu_read_unlock().
110 */ 130 */
111int srcu_read_lock(struct srcu_struct *sp) 131int __srcu_read_lock(struct srcu_struct *sp)
112{ 132{
113 int idx; 133 int idx;
114 134
@@ -120,31 +140,27 @@ int srcu_read_lock(struct srcu_struct *sp)
120 preempt_enable(); 140 preempt_enable();
121 return idx; 141 return idx;
122} 142}
123EXPORT_SYMBOL_GPL(srcu_read_lock); 143EXPORT_SYMBOL_GPL(__srcu_read_lock);
124 144
125/** 145/*
126 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
127 * @sp: srcu_struct in which to unregister the old reader.
128 * @idx: return value from corresponding srcu_read_lock().
129 *
130 * Removes the count for the old reader from the appropriate per-CPU 146 * Removes the count for the old reader from the appropriate per-CPU
131 * element of the srcu_struct. Note that this may well be a different 147 * element of the srcu_struct. Note that this may well be a different
132 * CPU than that which was incremented by the corresponding srcu_read_lock(). 148 * CPU than that which was incremented by the corresponding srcu_read_lock().
133 * Must be called from process context. 149 * Must be called from process context.
134 */ 150 */
135void srcu_read_unlock(struct srcu_struct *sp, int idx) 151void __srcu_read_unlock(struct srcu_struct *sp, int idx)
136{ 152{
137 preempt_disable(); 153 preempt_disable();
138 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 154 srcu_barrier(); /* ensure compiler won't misorder critical section. */
139 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 155 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
140 preempt_enable(); 156 preempt_enable();
141} 157}
142EXPORT_SYMBOL_GPL(srcu_read_unlock); 158EXPORT_SYMBOL_GPL(__srcu_read_unlock);
143 159
144/* 160/*
145 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 161 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
146 */ 162 */
147void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 163static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
148{ 164{
149 int idx; 165 int idx;
150 166
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 912823e2a11b..9bb9fb1bd79c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -45,7 +45,7 @@ static int refcount;
45static struct workqueue_struct *stop_machine_wq; 45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle; 46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus; 47static const struct cpumask *active_cpus;
48static void *stop_machine_work; 48static void __percpu *stop_machine_work;
49 49
50static void set_state(enum stopmachine_state newstate) 50static void set_state(enum stopmachine_state newstate)
51{ 51{
diff --git a/kernel/sys.c b/kernel/sys.c
index 18bde979f346..9814e43fb23b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -571,13 +571,7 @@ static int set_user(struct cred *new)
571 if (!new_user) 571 if (!new_user)
572 return -EAGAIN; 572 return -EAGAIN;
573 573
574 if (!task_can_switch_user(new_user, current)) { 574 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
575 free_uid(new_user);
576 return -EINVAL;
577 }
578
579 if (atomic_read(&new_user->processes) >=
580 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
581 new_user != INIT_USER) { 575 new_user != INIT_USER) {
582 free_uid(new_user); 576 free_uid(new_user);
583 return -EAGAIN; 577 return -EAGAIN;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8a68b2448468..0ef19c614f6d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -50,6 +50,7 @@
50#include <linux/ftrace.h> 50#include <linux/ftrace.h>
51#include <linux/slow-work.h> 51#include <linux/slow-work.h>
52#include <linux/perf_event.h> 52#include <linux/perf_event.h>
53#include <linux/kprobes.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/processor.h> 56#include <asm/processor.h>
@@ -1441,7 +1442,7 @@ static struct ctl_table fs_table[] = {
1441}; 1442};
1442 1443
1443static struct ctl_table debug_table[] = { 1444static struct ctl_table debug_table[] = {
1444#if defined(CONFIG_X86) || defined(CONFIG_PPC) 1445#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC)
1445 { 1446 {
1446 .procname = "exception-trace", 1447 .procname = "exception-trace",
1447 .data = &show_unhandled_signals, 1448 .data = &show_unhandled_signals,
@@ -1450,6 +1451,17 @@ static struct ctl_table debug_table[] = {
1450 .proc_handler = proc_dointvec 1451 .proc_handler = proc_dointvec
1451 }, 1452 },
1452#endif 1453#endif
1454#if defined(CONFIG_OPTPROBES)
1455 {
1456 .procname = "kprobes-optimization",
1457 .data = &sysctl_kprobes_optimization,
1458 .maxlen = sizeof(int),
1459 .mode = 0644,
1460 .proc_handler = proc_kprobes_optimization_handler,
1461 .extra1 = &zero,
1462 .extra2 = &one,
1463 },
1464#endif
1453 { } 1465 { }
1454}; 1466};
1455 1467
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 8f5d16e0707a..8cd50d8f9bde 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1331,7 +1331,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1331 ssize_t result; 1331 ssize_t result;
1332 char *pathname; 1332 char *pathname;
1333 int flags; 1333 int flags;
1334 int acc_mode, fmode; 1334 int acc_mode;
1335 1335
1336 pathname = sysctl_getname(name, nlen, &table); 1336 pathname = sysctl_getname(name, nlen, &table);
1337 result = PTR_ERR(pathname); 1337 result = PTR_ERR(pathname);
@@ -1342,15 +1342,12 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1342 if (oldval && oldlen && newval && newlen) { 1342 if (oldval && oldlen && newval && newlen) {
1343 flags = O_RDWR; 1343 flags = O_RDWR;
1344 acc_mode = MAY_READ | MAY_WRITE; 1344 acc_mode = MAY_READ | MAY_WRITE;
1345 fmode = FMODE_READ | FMODE_WRITE;
1346 } else if (newval && newlen) { 1345 } else if (newval && newlen) {
1347 flags = O_WRONLY; 1346 flags = O_WRONLY;
1348 acc_mode = MAY_WRITE; 1347 acc_mode = MAY_WRITE;
1349 fmode = FMODE_WRITE;
1350 } else if (oldval && oldlen) { 1348 } else if (oldval && oldlen) {
1351 flags = O_RDONLY; 1349 flags = O_RDONLY;
1352 acc_mode = MAY_READ; 1350 acc_mode = MAY_READ;
1353 fmode = FMODE_READ;
1354 } else { 1351 } else {
1355 result = 0; 1352 result = 0;
1356 goto out_putname; 1353 goto out_putname;
@@ -1361,7 +1358,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1361 if (result) 1358 if (result)
1362 goto out_putname; 1359 goto out_putname;
1363 1360
1364 result = may_open(&nd.path, acc_mode, fmode); 1361 result = may_open(&nd.path, acc_mode, flags);
1365 if (result) 1362 if (result)
1366 goto out_putpath; 1363 goto out_putpath;
1367 1364
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea8384d3caa7..899ca51be5e8 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -46,15 +46,13 @@ static struct genl_family family = {
46 .maxattr = TASKSTATS_CMD_ATTR_MAX, 46 .maxattr = TASKSTATS_CMD_ATTR_MAX,
47}; 47};
48 48
49static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] 49static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
50__read_mostly = {
51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 50 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 51 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 52 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 53 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
55 54
56static struct nla_policy 55static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
57cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 56 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
59}; 57};
60 58
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 13700833c181..1f663d23e85e 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -453,6 +453,18 @@ static inline int clocksource_watchdog_kthread(void *data) { return 0; }
453#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 453#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
454 454
455/** 455/**
456 * clocksource_suspend - suspend the clocksource(s)
457 */
458void clocksource_suspend(void)
459{
460 struct clocksource *cs;
461
462 list_for_each_entry_reverse(cs, &clocksource_list, list)
463 if (cs->suspend)
464 cs->suspend(cs);
465}
466
467/**
456 * clocksource_resume - resume the clocksource(s) 468 * clocksource_resume - resume the clocksource(s)
457 */ 469 */
458void clocksource_resume(void) 470void clocksource_resume(void)
@@ -461,7 +473,7 @@ void clocksource_resume(void)
461 473
462 list_for_each_entry(cs, &clocksource_list, list) 474 list_for_each_entry(cs, &clocksource_list, list)
463 if (cs->resume) 475 if (cs->resume)
464 cs->resume(); 476 cs->resume(cs);
465 477
466 clocksource_resume_watchdog(); 478 clocksource_resume_watchdog();
467} 479}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4800f933910e..7c0f180d6e9d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -58,10 +58,10 @@ static s64 time_offset;
58static long time_constant = 2; 58static long time_constant = 2;
59 59
60/* maximum error (usecs): */ 60/* maximum error (usecs): */
61long time_maxerror = NTP_PHASE_LIMIT; 61static long time_maxerror = NTP_PHASE_LIMIT;
62 62
63/* estimated error (usecs): */ 63/* estimated error (usecs): */
64long time_esterror = NTP_PHASE_LIMIT; 64static long time_esterror = NTP_PHASE_LIMIT;
65 65
66/* frequency offset (scaled nsecs/secs): */ 66/* frequency offset (scaled nsecs/secs): */
67static s64 time_freq; 67static s64 time_freq;
@@ -142,11 +142,11 @@ static void ntp_update_offset(long offset)
142 * Select how the frequency is to be controlled 142 * Select how the frequency is to be controlled
143 * and in which mode (PLL or FLL). 143 * and in which mode (PLL or FLL).
144 */ 144 */
145 secs = xtime.tv_sec - time_reftime; 145 secs = get_seconds() - time_reftime;
146 if (unlikely(time_status & STA_FREQHOLD)) 146 if (unlikely(time_status & STA_FREQHOLD))
147 secs = 0; 147 secs = 0;
148 148
149 time_reftime = xtime.tv_sec; 149 time_reftime = get_seconds();
150 150
151 offset64 = offset; 151 offset64 = offset;
152 freq_adj = (offset64 * secs) << 152 freq_adj = (offset64 * secs) <<
@@ -368,7 +368,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
368 * reference time to current time. 368 * reference time to current time.
369 */ 369 */
370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) 370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
371 time_reftime = xtime.tv_sec; 371 time_reftime = get_seconds();
372 372
373 /* only set allowed bits */ 373 /* only set allowed bits */
374 time_status &= STA_RONLY; 374 time_status &= STA_RONLY;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e2ab064c6d41..16736379a9ca 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -622,6 +622,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
622 write_sequnlock_irqrestore(&xtime_lock, flags); 622 write_sequnlock_irqrestore(&xtime_lock, flags);
623 623
624 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 624 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
625 clocksource_suspend();
625 626
626 return 0; 627 return 0;
627} 628}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 60e2ce0181ee..13e13d428cd3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -328,15 +328,6 @@ config BRANCH_TRACER
328 328
329 Say N if unsure. 329 Say N if unsure.
330 330
331config POWER_TRACER
332 bool "Trace power consumption behavior"
333 depends on X86
334 select GENERIC_TRACER
335 help
336 This tracer helps developers to analyze and optimize the kernel's
337 power management decisions, specifically the C-state and P-state
338 behavior.
339
340config KSYM_TRACER 331config KSYM_TRACER
341 bool "Trace read and write access on kernel memory locations" 332 bool "Trace read and write access on kernel memory locations"
342 depends on HAVE_HW_BREAKPOINT 333 depends on HAVE_HW_BREAKPOINT
@@ -449,7 +440,7 @@ config BLK_DEV_IO_TRACE
449 440
450config KPROBE_EVENT 441config KPROBE_EVENT
451 depends on KPROBES 442 depends on KPROBES
452 depends on X86 443 depends on HAVE_REGS_AND_STACK_ACCESS_API
453 bool "Enable kprobes-based dynamic events" 444 bool "Enable kprobes-based dynamic events"
454 select TRACING 445 select TRACING
455 default y 446 default y
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index cd9ecd89ec77..d00c6fe23f54 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -51,7 +51,9 @@ endif
51obj-$(CONFIG_EVENT_TRACING) += trace_events.o 51obj-$(CONFIG_EVENT_TRACING) += trace_events.o
52obj-$(CONFIG_EVENT_TRACING) += trace_export.o 52obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54ifeq ($(CONFIG_PERF_EVENTS),y)
55obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o
56endif
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 57obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 58obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o 59obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d9d6206e0b14..07f945a99430 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -540,9 +540,10 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
540 if (ret) 540 if (ret)
541 return ret; 541 return ret;
542 542
543 if (copy_to_user(arg, &buts, sizeof(buts))) 543 if (copy_to_user(arg, &buts, sizeof(buts))) {
544 blk_trace_remove(q);
544 return -EFAULT; 545 return -EFAULT;
545 546 }
546 return 0; 547 return 0;
547} 548}
548EXPORT_SYMBOL_GPL(blk_trace_setup); 549EXPORT_SYMBOL_GPL(blk_trace_setup);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1e6640f80454..83783579378f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,7 +22,6 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/kprobes.h>
26#include <linux/ftrace.h> 25#include <linux/ftrace.h>
27#include <linux/sysctl.h> 26#include <linux/sysctl.h>
28#include <linux/ctype.h> 27#include <linux/ctype.h>
@@ -898,36 +897,6 @@ static struct dyn_ftrace *ftrace_free_records;
898 } \ 897 } \
899 } 898 }
900 899
901#ifdef CONFIG_KPROBES
902
903static int frozen_record_count;
904
905static inline void freeze_record(struct dyn_ftrace *rec)
906{
907 if (!(rec->flags & FTRACE_FL_FROZEN)) {
908 rec->flags |= FTRACE_FL_FROZEN;
909 frozen_record_count++;
910 }
911}
912
913static inline void unfreeze_record(struct dyn_ftrace *rec)
914{
915 if (rec->flags & FTRACE_FL_FROZEN) {
916 rec->flags &= ~FTRACE_FL_FROZEN;
917 frozen_record_count--;
918 }
919}
920
921static inline int record_frozen(struct dyn_ftrace *rec)
922{
923 return rec->flags & FTRACE_FL_FROZEN;
924}
925#else
926# define freeze_record(rec) ({ 0; })
927# define unfreeze_record(rec) ({ 0; })
928# define record_frozen(rec) ({ 0; })
929#endif /* CONFIG_KPROBES */
930
931static void ftrace_free_rec(struct dyn_ftrace *rec) 900static void ftrace_free_rec(struct dyn_ftrace *rec)
932{ 901{
933 rec->freelist = ftrace_free_records; 902 rec->freelist = ftrace_free_records;
@@ -1025,6 +994,21 @@ static void ftrace_bug(int failed, unsigned long ip)
1025} 994}
1026 995
1027 996
997/* Return 1 if the address range is reserved for ftrace */
998int ftrace_text_reserved(void *start, void *end)
999{
1000 struct dyn_ftrace *rec;
1001 struct ftrace_page *pg;
1002
1003 do_for_each_ftrace_rec(pg, rec) {
1004 if (rec->ip <= (unsigned long)end &&
1005 rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
1006 return 1;
1007 } while_for_each_ftrace_rec();
1008 return 0;
1009}
1010
1011
1028static int 1012static int
1029__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1013__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1030{ 1014{
@@ -1076,14 +1060,6 @@ static void ftrace_replace_code(int enable)
1076 !(rec->flags & FTRACE_FL_CONVERTED)) 1060 !(rec->flags & FTRACE_FL_CONVERTED))
1077 continue; 1061 continue;
1078 1062
1079 /* ignore updates to this record's mcount site */
1080 if (get_kprobe((void *)rec->ip)) {
1081 freeze_record(rec);
1082 continue;
1083 } else {
1084 unfreeze_record(rec);
1085 }
1086
1087 failed = __ftrace_replace_code(rec, enable); 1063 failed = __ftrace_replace_code(rec, enable);
1088 if (failed) { 1064 if (failed) {
1089 rec->flags |= FTRACE_FL_FAILED; 1065 rec->flags |= FTRACE_FL_FAILED;
@@ -2426,6 +2402,7 @@ static const struct file_operations ftrace_notrace_fops = {
2426static DEFINE_MUTEX(graph_lock); 2402static DEFINE_MUTEX(graph_lock);
2427 2403
2428int ftrace_graph_count; 2404int ftrace_graph_count;
2405int ftrace_graph_filter_enabled;
2429unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2406unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2430 2407
2431static void * 2408static void *
@@ -2448,7 +2425,7 @@ static void *g_start(struct seq_file *m, loff_t *pos)
2448 mutex_lock(&graph_lock); 2425 mutex_lock(&graph_lock);
2449 2426
2450 /* Nothing, tell g_show to print all functions are enabled */ 2427 /* Nothing, tell g_show to print all functions are enabled */
2451 if (!ftrace_graph_count && !*pos) 2428 if (!ftrace_graph_filter_enabled && !*pos)
2452 return (void *)1; 2429 return (void *)1;
2453 2430
2454 return __g_next(m, pos); 2431 return __g_next(m, pos);
@@ -2494,6 +2471,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2494 mutex_lock(&graph_lock); 2471 mutex_lock(&graph_lock);
2495 if ((file->f_mode & FMODE_WRITE) && 2472 if ((file->f_mode & FMODE_WRITE) &&
2496 (file->f_flags & O_TRUNC)) { 2473 (file->f_flags & O_TRUNC)) {
2474 ftrace_graph_filter_enabled = 0;
2497 ftrace_graph_count = 0; 2475 ftrace_graph_count = 0;
2498 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2476 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2499 } 2477 }
@@ -2519,7 +2497,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2519 struct dyn_ftrace *rec; 2497 struct dyn_ftrace *rec;
2520 struct ftrace_page *pg; 2498 struct ftrace_page *pg;
2521 int search_len; 2499 int search_len;
2522 int found = 0; 2500 int fail = 1;
2523 int type, not; 2501 int type, not;
2524 char *search; 2502 char *search;
2525 bool exists; 2503 bool exists;
@@ -2530,37 +2508,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2530 2508
2531 /* decode regex */ 2509 /* decode regex */
2532 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 2510 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2533 if (not) 2511 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
2534 return -EINVAL; 2512 return -EBUSY;
2535 2513
2536 search_len = strlen(search); 2514 search_len = strlen(search);
2537 2515
2538 mutex_lock(&ftrace_lock); 2516 mutex_lock(&ftrace_lock);
2539 do_for_each_ftrace_rec(pg, rec) { 2517 do_for_each_ftrace_rec(pg, rec) {
2540 2518
2541 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2542 break;
2543
2544 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 2519 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
2545 continue; 2520 continue;
2546 2521
2547 if (ftrace_match_record(rec, search, search_len, type)) { 2522 if (ftrace_match_record(rec, search, search_len, type)) {
2548 /* ensure it is not already in the array */ 2523 /* if it is in the array */
2549 exists = false; 2524 exists = false;
2550 for (i = 0; i < *idx; i++) 2525 for (i = 0; i < *idx; i++) {
2551 if (array[i] == rec->ip) { 2526 if (array[i] == rec->ip) {
2552 exists = true; 2527 exists = true;
2553 break; 2528 break;
2554 } 2529 }
2555 if (!exists) 2530 }
2556 array[(*idx)++] = rec->ip; 2531
2557 found = 1; 2532 if (!not) {
2533 fail = 0;
2534 if (!exists) {
2535 array[(*idx)++] = rec->ip;
2536 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2537 goto out;
2538 }
2539 } else {
2540 if (exists) {
2541 array[i] = array[--(*idx)];
2542 array[*idx] = 0;
2543 fail = 0;
2544 }
2545 }
2558 } 2546 }
2559 } while_for_each_ftrace_rec(); 2547 } while_for_each_ftrace_rec();
2560 2548out:
2561 mutex_unlock(&ftrace_lock); 2549 mutex_unlock(&ftrace_lock);
2562 2550
2563 return found ? 0 : -EINVAL; 2551 if (fail)
2552 return -EINVAL;
2553
2554 ftrace_graph_filter_enabled = 1;
2555 return 0;
2564} 2556}
2565 2557
2566static ssize_t 2558static ssize_t
@@ -2570,16 +2562,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2570 struct trace_parser parser; 2562 struct trace_parser parser;
2571 ssize_t read, ret; 2563 ssize_t read, ret;
2572 2564
2573 if (!cnt || cnt < 0) 2565 if (!cnt)
2574 return 0; 2566 return 0;
2575 2567
2576 mutex_lock(&graph_lock); 2568 mutex_lock(&graph_lock);
2577 2569
2578 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
2579 ret = -EBUSY;
2580 goto out_unlock;
2581 }
2582
2583 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { 2570 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2584 ret = -ENOMEM; 2571 ret = -ENOMEM;
2585 goto out_unlock; 2572 goto out_unlock;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8c1b2d290718..0287f9f52f5a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -20,6 +20,7 @@
20#include <linux/cpu.h> 20#include <linux/cpu.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22 22
23#include <asm/local.h>
23#include "trace.h" 24#include "trace.h"
24 25
25/* 26/*
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index b2477caf09c2..df74c7982255 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -8,6 +8,7 @@
8#include <linux/kthread.h> 8#include <linux/kthread.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <asm/local.h>
11 12
12struct rb_page { 13struct rb_page {
13 u64 ts; 14 u64 ts;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index eac6875cb990..ed01fdba4a55 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -32,6 +32,7 @@
32#include <linux/splice.h> 32#include <linux/splice.h>
33#include <linux/kdebug.h> 33#include <linux/kdebug.h>
34#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/rwsem.h>
35#include <linux/ctype.h> 36#include <linux/ctype.h>
36#include <linux/init.h> 37#include <linux/init.h>
37#include <linux/poll.h> 38#include <linux/poll.h>
@@ -91,20 +92,17 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled);
91static inline void ftrace_disable_cpu(void) 92static inline void ftrace_disable_cpu(void)
92{ 93{
93 preempt_disable(); 94 preempt_disable();
94 __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); 95 __this_cpu_inc(ftrace_cpu_disabled);
95} 96}
96 97
97static inline void ftrace_enable_cpu(void) 98static inline void ftrace_enable_cpu(void)
98{ 99{
99 __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); 100 __this_cpu_dec(ftrace_cpu_disabled);
100 preempt_enable(); 101 preempt_enable();
101} 102}
102 103
103static cpumask_var_t __read_mostly tracing_buffer_mask; 104static cpumask_var_t __read_mostly tracing_buffer_mask;
104 105
105/* Define which cpu buffers are currently read in trace_pipe */
106static cpumask_var_t tracing_reader_cpumask;
107
108#define for_each_tracing_cpu(cpu) \ 106#define for_each_tracing_cpu(cpu) \
109 for_each_cpu(cpu, tracing_buffer_mask) 107 for_each_cpu(cpu, tracing_buffer_mask)
110 108
@@ -243,12 +241,91 @@ static struct tracer *current_trace __read_mostly;
243 241
244/* 242/*
245 * trace_types_lock is used to protect the trace_types list. 243 * trace_types_lock is used to protect the trace_types list.
246 * This lock is also used to keep user access serialized.
247 * Accesses from userspace will grab this lock while userspace
248 * activities happen inside the kernel.
249 */ 244 */
250static DEFINE_MUTEX(trace_types_lock); 245static DEFINE_MUTEX(trace_types_lock);
251 246
247/*
248 * serialize the access of the ring buffer
249 *
250 * ring buffer serializes readers, but it is low level protection.
251 * The validity of the events (which returns by ring_buffer_peek() ..etc)
252 * are not protected by ring buffer.
253 *
254 * The content of events may become garbage if we allow other process consumes
255 * these events concurrently:
256 * A) the page of the consumed events may become a normal page
257 * (not reader page) in ring buffer, and this page will be rewrited
258 * by events producer.
259 * B) The page of the consumed events may become a page for splice_read,
260 * and this page will be returned to system.
261 *
262 * These primitives allow multi process access to different cpu ring buffer
263 * concurrently.
264 *
265 * These primitives don't distinguish read-only and read-consume access.
266 * Multi read-only access are also serialized.
267 */
268
269#ifdef CONFIG_SMP
270static DECLARE_RWSEM(all_cpu_access_lock);
271static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
272
273static inline void trace_access_lock(int cpu)
274{
275 if (cpu == TRACE_PIPE_ALL_CPU) {
276 /* gain it for accessing the whole ring buffer. */
277 down_write(&all_cpu_access_lock);
278 } else {
279 /* gain it for accessing a cpu ring buffer. */
280
281 /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
282 down_read(&all_cpu_access_lock);
283
284 /* Secondly block other access to this @cpu ring buffer. */
285 mutex_lock(&per_cpu(cpu_access_lock, cpu));
286 }
287}
288
289static inline void trace_access_unlock(int cpu)
290{
291 if (cpu == TRACE_PIPE_ALL_CPU) {
292 up_write(&all_cpu_access_lock);
293 } else {
294 mutex_unlock(&per_cpu(cpu_access_lock, cpu));
295 up_read(&all_cpu_access_lock);
296 }
297}
298
299static inline void trace_access_lock_init(void)
300{
301 int cpu;
302
303 for_each_possible_cpu(cpu)
304 mutex_init(&per_cpu(cpu_access_lock, cpu));
305}
306
307#else
308
309static DEFINE_MUTEX(access_lock);
310
311static inline void trace_access_lock(int cpu)
312{
313 (void)cpu;
314 mutex_lock(&access_lock);
315}
316
317static inline void trace_access_unlock(int cpu)
318{
319 (void)cpu;
320 mutex_unlock(&access_lock);
321}
322
323static inline void trace_access_lock_init(void)
324{
325}
326
327#endif
328
252/* trace_wait is a waitqueue for tasks blocked on trace_poll */ 329/* trace_wait is a waitqueue for tasks blocked on trace_poll */
253static DECLARE_WAIT_QUEUE_HEAD(trace_wait); 330static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
254 331
@@ -1089,7 +1166,7 @@ trace_function(struct trace_array *tr,
1089 struct ftrace_entry *entry; 1166 struct ftrace_entry *entry;
1090 1167
1091 /* If we are reading the ring buffer, don't trace */ 1168 /* If we are reading the ring buffer, don't trace */
1092 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 1169 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
1093 return; 1170 return;
1094 1171
1095 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), 1172 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1320,8 +1397,10 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1320 entry->fmt = fmt; 1397 entry->fmt = fmt;
1321 1398
1322 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1399 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1323 if (!filter_check_discard(call, entry, buffer, event)) 1400 if (!filter_check_discard(call, entry, buffer, event)) {
1324 ring_buffer_unlock_commit(buffer, event); 1401 ring_buffer_unlock_commit(buffer, event);
1402 ftrace_trace_stack(buffer, flags, 6, pc);
1403 }
1325 1404
1326out_unlock: 1405out_unlock:
1327 arch_spin_unlock(&trace_buf_lock); 1406 arch_spin_unlock(&trace_buf_lock);
@@ -1394,8 +1473,10 @@ int trace_array_vprintk(struct trace_array *tr,
1394 1473
1395 memcpy(&entry->buf, trace_buf, len); 1474 memcpy(&entry->buf, trace_buf, len);
1396 entry->buf[len] = '\0'; 1475 entry->buf[len] = '\0';
1397 if (!filter_check_discard(call, entry, buffer, event)) 1476 if (!filter_check_discard(call, entry, buffer, event)) {
1398 ring_buffer_unlock_commit(buffer, event); 1477 ring_buffer_unlock_commit(buffer, event);
1478 ftrace_trace_stack(buffer, irq_flags, 6, pc);
1479 }
1399 1480
1400 out_unlock: 1481 out_unlock:
1401 arch_spin_unlock(&trace_buf_lock); 1482 arch_spin_unlock(&trace_buf_lock);
@@ -1585,12 +1666,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1585} 1666}
1586 1667
1587/* 1668/*
1588 * No necessary locking here. The worst thing which can
1589 * happen is loosing events consumed at the same time
1590 * by a trace_pipe reader.
1591 * Other than that, we don't risk to crash the ring buffer
1592 * because it serializes the readers.
1593 *
1594 * The current tracer is copied to avoid a global locking 1669 * The current tracer is copied to avoid a global locking
1595 * all around. 1670 * all around.
1596 */ 1671 */
@@ -1645,12 +1720,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1645 } 1720 }
1646 1721
1647 trace_event_read_lock(); 1722 trace_event_read_lock();
1723 trace_access_lock(cpu_file);
1648 return p; 1724 return p;
1649} 1725}
1650 1726
1651static void s_stop(struct seq_file *m, void *p) 1727static void s_stop(struct seq_file *m, void *p)
1652{ 1728{
1729 struct trace_iterator *iter = m->private;
1730
1653 atomic_dec(&trace_record_cmdline_disabled); 1731 atomic_dec(&trace_record_cmdline_disabled);
1732 trace_access_unlock(iter->cpu_file);
1654 trace_event_read_unlock(); 1733 trace_event_read_unlock();
1655} 1734}
1656 1735
@@ -2841,22 +2920,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2841 2920
2842 mutex_lock(&trace_types_lock); 2921 mutex_lock(&trace_types_lock);
2843 2922
2844 /* We only allow one reader per cpu */
2845 if (cpu_file == TRACE_PIPE_ALL_CPU) {
2846 if (!cpumask_empty(tracing_reader_cpumask)) {
2847 ret = -EBUSY;
2848 goto out;
2849 }
2850 cpumask_setall(tracing_reader_cpumask);
2851 } else {
2852 if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
2853 cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
2854 else {
2855 ret = -EBUSY;
2856 goto out;
2857 }
2858 }
2859
2860 /* create a buffer to store the information to pass to userspace */ 2923 /* create a buffer to store the information to pass to userspace */
2861 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2924 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2862 if (!iter) { 2925 if (!iter) {
@@ -2912,12 +2975,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2912 2975
2913 mutex_lock(&trace_types_lock); 2976 mutex_lock(&trace_types_lock);
2914 2977
2915 if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
2916 cpumask_clear(tracing_reader_cpumask);
2917 else
2918 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2919
2920
2921 if (iter->trace->pipe_close) 2978 if (iter->trace->pipe_close)
2922 iter->trace->pipe_close(iter); 2979 iter->trace->pipe_close(iter);
2923 2980
@@ -3079,6 +3136,7 @@ waitagain:
3079 iter->pos = -1; 3136 iter->pos = -1;
3080 3137
3081 trace_event_read_lock(); 3138 trace_event_read_lock();
3139 trace_access_lock(iter->cpu_file);
3082 while (find_next_entry_inc(iter) != NULL) { 3140 while (find_next_entry_inc(iter) != NULL) {
3083 enum print_line_t ret; 3141 enum print_line_t ret;
3084 int len = iter->seq.len; 3142 int len = iter->seq.len;
@@ -3095,6 +3153,7 @@ waitagain:
3095 if (iter->seq.len >= cnt) 3153 if (iter->seq.len >= cnt)
3096 break; 3154 break;
3097 } 3155 }
3156 trace_access_unlock(iter->cpu_file);
3098 trace_event_read_unlock(); 3157 trace_event_read_unlock();
3099 3158
3100 /* Now copy what we have to the user */ 3159 /* Now copy what we have to the user */
@@ -3220,6 +3279,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3220 } 3279 }
3221 3280
3222 trace_event_read_lock(); 3281 trace_event_read_lock();
3282 trace_access_lock(iter->cpu_file);
3223 3283
3224 /* Fill as many pages as possible. */ 3284 /* Fill as many pages as possible. */
3225 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3285 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
@@ -3243,6 +3303,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3243 trace_seq_init(&iter->seq); 3303 trace_seq_init(&iter->seq);
3244 } 3304 }
3245 3305
3306 trace_access_unlock(iter->cpu_file);
3246 trace_event_read_unlock(); 3307 trace_event_read_unlock();
3247 mutex_unlock(&iter->mutex); 3308 mutex_unlock(&iter->mutex);
3248 3309
@@ -3544,10 +3605,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3544 3605
3545 info->read = 0; 3606 info->read = 0;
3546 3607
3608 trace_access_lock(info->cpu);
3547 ret = ring_buffer_read_page(info->tr->buffer, 3609 ret = ring_buffer_read_page(info->tr->buffer,
3548 &info->spare, 3610 &info->spare,
3549 count, 3611 count,
3550 info->cpu, 0); 3612 info->cpu, 0);
3613 trace_access_unlock(info->cpu);
3551 if (ret < 0) 3614 if (ret < 0)
3552 return 0; 3615 return 0;
3553 3616
@@ -3675,6 +3738,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3675 len &= PAGE_MASK; 3738 len &= PAGE_MASK;
3676 } 3739 }
3677 3740
3741 trace_access_lock(info->cpu);
3678 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3742 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3679 3743
3680 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { 3744 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
@@ -3722,6 +3786,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3722 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3786 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3723 } 3787 }
3724 3788
3789 trace_access_unlock(info->cpu);
3725 spd.nr_pages = i; 3790 spd.nr_pages = i;
3726 3791
3727 /* did we read anything? */ 3792 /* did we read anything? */
@@ -4158,6 +4223,8 @@ static __init int tracer_init_debugfs(void)
4158 struct dentry *d_tracer; 4223 struct dentry *d_tracer;
4159 int cpu; 4224 int cpu;
4160 4225
4226 trace_access_lock_init();
4227
4161 d_tracer = tracing_init_dentry(); 4228 d_tracer = tracing_init_dentry();
4162 4229
4163 trace_create_file("tracing_enabled", 0644, d_tracer, 4230 trace_create_file("tracing_enabled", 0644, d_tracer,
@@ -4392,9 +4459,6 @@ __init static int tracer_alloc_buffers(void)
4392 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4459 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4393 goto out_free_buffer_mask; 4460 goto out_free_buffer_mask;
4394 4461
4395 if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4396 goto out_free_tracing_cpumask;
4397
4398 /* To save memory, keep the ring buffer size to its minimum */ 4462 /* To save memory, keep the ring buffer size to its minimum */
4399 if (ring_buffer_expanded) 4463 if (ring_buffer_expanded)
4400 ring_buf_size = trace_buf_size; 4464 ring_buf_size = trace_buf_size;
@@ -4452,8 +4516,6 @@ __init static int tracer_alloc_buffers(void)
4452 return 0; 4516 return 0;
4453 4517
4454out_free_cpumask: 4518out_free_cpumask:
4455 free_cpumask_var(tracing_reader_cpumask);
4456out_free_tracing_cpumask:
4457 free_cpumask_var(tracing_cpumask); 4519 free_cpumask_var(tracing_cpumask);
4458out_free_buffer_mask: 4520out_free_buffer_mask:
4459 free_cpumask_var(tracing_buffer_mask); 4521 free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4df6a77eb196..fd05bcaf91b0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -497,6 +497,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
497#ifdef CONFIG_DYNAMIC_FTRACE 497#ifdef CONFIG_DYNAMIC_FTRACE
498/* TODO: make this variable */ 498/* TODO: make this variable */
499#define FTRACE_GRAPH_MAX_FUNCS 32 499#define FTRACE_GRAPH_MAX_FUNCS 32
500extern int ftrace_graph_filter_enabled;
500extern int ftrace_graph_count; 501extern int ftrace_graph_count;
501extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; 502extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
502 503
@@ -504,7 +505,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
504{ 505{
505 int i; 506 int i;
506 507
507 if (!ftrace_graph_count || test_tsk_trace_graph(current)) 508 if (!ftrace_graph_filter_enabled)
508 return 1; 509 return 1;
509 510
510 for (i = 0; i < ftrace_graph_count; i++) { 511 for (i = 0; i < ftrace_graph_count; i++) {
@@ -791,7 +792,8 @@ extern const char *__stop___trace_bprintk_fmt[];
791 792
792#undef FTRACE_ENTRY 793#undef FTRACE_ENTRY
793#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ 794#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
794 extern struct ftrace_event_call event_##call; 795 extern struct ftrace_event_call \
796 __attribute__((__aligned__(4))) event_##call;
795#undef FTRACE_ENTRY_DUP 797#undef FTRACE_ENTRY_DUP
796#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ 798#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
797 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 799 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4a194f08f88c..b9bc4d470177 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -307,8 +307,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
307 return -1; 307 return -1;
308 if (percent_a > percent_b) 308 if (percent_a > percent_b)
309 return 1; 309 return 1;
310 else 310
311 return 0; 311 if (a->incorrect < b->incorrect)
312 return -1;
313 if (a->incorrect > b->incorrect)
314 return 1;
315
316 /*
317 * Since the above shows worse (incorrect) cases
318 * first, we continue that by showing best (correct)
319 * cases last.
320 */
321 if (a->correct > b->correct)
322 return -1;
323 if (a->correct < b->correct)
324 return 1;
325
326 return 0;
312} 327}
313 328
314static struct tracer_stat annotated_branch_stats = { 329static struct tracer_stat annotated_branch_stats = {
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 9e25573242cf..f0d693005075 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -6,14 +6,12 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/kprobes.h>
9#include "trace.h" 10#include "trace.h"
10 11
11 12
12char *perf_trace_buf; 13static char *perf_trace_buf;
13EXPORT_SYMBOL_GPL(perf_trace_buf); 14static char *perf_trace_buf_nmi;
14
15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
17 15
18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; 16typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
19 17
@@ -120,3 +118,47 @@ void ftrace_profile_disable(int event_id)
120 } 118 }
121 mutex_unlock(&event_mutex); 119 mutex_unlock(&event_mutex);
122} 120}
121
122__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
123 int *rctxp, unsigned long *irq_flags)
124{
125 struct trace_entry *entry;
126 char *trace_buf, *raw_data;
127 int pc, cpu;
128
129 pc = preempt_count();
130
131 /* Protect the per cpu buffer, begin the rcu read side */
132 local_irq_save(*irq_flags);
133
134 *rctxp = perf_swevent_get_recursion_context();
135 if (*rctxp < 0)
136 goto err_recursion;
137
138 cpu = smp_processor_id();
139
140 if (in_nmi())
141 trace_buf = rcu_dereference(perf_trace_buf_nmi);
142 else
143 trace_buf = rcu_dereference(perf_trace_buf);
144
145 if (!trace_buf)
146 goto err;
147
148 raw_data = per_cpu_ptr(trace_buf, cpu);
149
150 /* zero the dead bytes from align to not leak stack to user */
151 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
152
153 entry = (struct trace_entry *)raw_data;
154 tracing_generic_entry_update(entry, *irq_flags, pc);
155 entry->type = type;
156
157 return raw_data;
158err:
159 perf_swevent_put_recursion_context(*rctxp);
160err_recursion:
161 local_irq_restore(*irq_flags);
162 return NULL;
163}
164EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 189b09baf4fb..3f972ad98d04 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -60,10 +60,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
60 return 0; 60 return 0;
61 61
62err: 62err:
63 if (field) { 63 if (field)
64 kfree(field->name); 64 kfree(field->name);
65 kfree(field->type);
66 }
67 kfree(field); 65 kfree(field);
68 66
69 return -ENOMEM; 67 return -ENOMEM;
@@ -520,41 +518,16 @@ out:
520 return ret; 518 return ret;
521} 519}
522 520
523extern char *__bad_type_size(void);
524
525#undef FIELD
526#define FIELD(type, name) \
527 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
528 #type, "common_" #name, offsetof(typeof(field), name), \
529 sizeof(field.name), is_signed_type(type)
530
531static int trace_write_header(struct trace_seq *s)
532{
533 struct trace_entry field;
534
535 /* struct trace_entry */
536 return trace_seq_printf(s,
537 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
538 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
539 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
540 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
541 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
542 "\n",
543 FIELD(unsigned short, type),
544 FIELD(unsigned char, flags),
545 FIELD(unsigned char, preempt_count),
546 FIELD(int, pid),
547 FIELD(int, lock_depth));
548}
549
550static ssize_t 521static ssize_t
551event_format_read(struct file *filp, char __user *ubuf, size_t cnt, 522event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
552 loff_t *ppos) 523 loff_t *ppos)
553{ 524{
554 struct ftrace_event_call *call = filp->private_data; 525 struct ftrace_event_call *call = filp->private_data;
526 struct ftrace_event_field *field;
555 struct trace_seq *s; 527 struct trace_seq *s;
528 int common_field_count = 5;
556 char *buf; 529 char *buf;
557 int r; 530 int r = 0;
558 531
559 if (*ppos) 532 if (*ppos)
560 return 0; 533 return 0;
@@ -565,14 +538,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
565 538
566 trace_seq_init(s); 539 trace_seq_init(s);
567 540
568 /* If any of the first writes fail, so will the show_format. */
569
570 trace_seq_printf(s, "name: %s\n", call->name); 541 trace_seq_printf(s, "name: %s\n", call->name);
571 trace_seq_printf(s, "ID: %d\n", call->id); 542 trace_seq_printf(s, "ID: %d\n", call->id);
572 trace_seq_printf(s, "format:\n"); 543 trace_seq_printf(s, "format:\n");
573 trace_write_header(s);
574 544
575 r = call->show_format(call, s); 545 list_for_each_entry_reverse(field, &call->fields, link) {
546 /*
547 * Smartly shows the array type(except dynamic array).
548 * Normal:
549 * field:TYPE VAR
550 * If TYPE := TYPE[LEN], it is shown:
551 * field:TYPE VAR[LEN]
552 */
553 const char *array_descriptor = strchr(field->type, '[');
554
555 if (!strncmp(field->type, "__data_loc", 10))
556 array_descriptor = NULL;
557
558 if (!array_descriptor) {
559 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
560 "\tsize:%u;\tsigned:%d;\n",
561 field->type, field->name, field->offset,
562 field->size, !!field->is_signed);
563 } else {
564 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
565 "\tsize:%u;\tsigned:%d;\n",
566 (int)(array_descriptor - field->type),
567 field->type, field->name,
568 array_descriptor, field->offset,
569 field->size, !!field->is_signed);
570 }
571
572 if (--common_field_count == 0)
573 r = trace_seq_printf(s, "\n");
574
575 if (!r)
576 break;
577 }
578
579 if (r)
580 r = trace_seq_printf(s, "\nprint fmt: %s\n",
581 call->print_fmt);
582
576 if (!r) { 583 if (!r) {
577 /* 584 /*
578 * ug! The format output is bigger than a PAGE!! 585 * ug! The format output is bigger than a PAGE!!
@@ -948,10 +955,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
948 filter); 955 filter);
949 } 956 }
950 957
951 /* A trace may not want to export its format */
952 if (!call->show_format)
953 return 0;
954
955 trace_create_file("format", 0444, call->dir, call, 958 trace_create_file("format", 0444, call->dir, call,
956 format); 959 format);
957 960
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e42af9aad69f..4615f62a04f1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1371,7 +1371,7 @@ out_unlock:
1371 return err; 1371 return err;
1372} 1372}
1373 1373
1374#ifdef CONFIG_EVENT_PROFILE 1374#ifdef CONFIG_PERF_EVENTS
1375 1375
1376void ftrace_profile_free_filter(struct perf_event *event) 1376void ftrace_profile_free_filter(struct perf_event *event)
1377{ 1377{
@@ -1439,5 +1439,5 @@ out_unlock:
1439 return err; 1439 return err;
1440} 1440}
1441 1441
1442#endif /* CONFIG_EVENT_PROFILE */ 1442#endif /* CONFIG_PERF_EVENTS */
1443 1443
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d4fa5dc1ee4e..e091f64ba6ce 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \
62 62
63#include "trace_entries.h" 63#include "trace_entries.h"
64 64
65
66#undef __field
67#define __field(type, item) \
68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
70 offsetof(typeof(field), item), \
71 sizeof(field.item), is_signed_type(type)); \
72 if (!ret) \
73 return 0;
74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item), \
81 is_signed_type(type)); \
82 if (!ret) \
83 return 0;
84
85#undef __array
86#define __array(type, item, len) \
87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
89 offsetof(typeof(field), item), \
90 sizeof(field.item), is_signed_type(type)); \
91 if (!ret) \
92 return 0;
93
94#undef __array_desc
95#define __array_desc(type, container, item, len) \
96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
98 offsetof(typeof(field), container.item), \
99 sizeof(field.container.item), \
100 is_signed_type(type)); \
101 if (!ret) \
102 return 0;
103
104#undef __dynamic_array
105#define __dynamic_array(type, item) \
106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
110 if (!ret) \
111 return 0;
112
113#undef F_printk
114#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
115
116#undef __entry
117#define __entry REC
118
119#undef FTRACE_ENTRY
120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
121static int \
122ftrace_format_##name(struct ftrace_event_call *unused, \
123 struct trace_seq *s) \
124{ \
125 struct struct_name field __attribute__((unused)); \
126 int ret = 0; \
127 \
128 tstruct; \
129 \
130 trace_seq_printf(s, "\nprint fmt: " print); \
131 \
132 return ret; \
133}
134
135#include "trace_entries.h"
136
137#undef __field 65#undef __field
138#define __field(type, item) \ 66#define __field(type, item) \
139 ret = trace_define_field(event_call, #type, #item, \ 67 ret = trace_define_field(event_call, #type, #item, \
@@ -175,7 +103,12 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
175 return ret; 103 return ret;
176 104
177#undef __dynamic_array 105#undef __dynamic_array
178#define __dynamic_array(type, item) 106#define __dynamic_array(type, item) \
107 ret = trace_define_field(event_call, #type, #item, \
108 offsetof(typeof(field), item), \
109 0, is_signed_type(type), FILTER_OTHER);\
110 if (ret) \
111 return ret;
179 112
180#undef FTRACE_ENTRY 113#undef FTRACE_ENTRY
181#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 114#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
@@ -198,6 +131,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
198 return 0; 131 return 0;
199} 132}
200 133
134#undef __entry
135#define __entry REC
136
201#undef __field 137#undef __field
202#define __field(type, item) 138#define __field(type, item)
203 139
@@ -213,6 +149,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
213#undef __dynamic_array 149#undef __dynamic_array
214#define __dynamic_array(type, item) 150#define __dynamic_array(type, item)
215 151
152#undef F_printk
153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154
216#undef FTRACE_ENTRY 155#undef FTRACE_ENTRY
217#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 156#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
218 \ 157 \
@@ -223,7 +162,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
223 .id = type, \ 162 .id = type, \
224 .system = __stringify(TRACE_SYSTEM), \ 163 .system = __stringify(TRACE_SYSTEM), \
225 .raw_init = ftrace_raw_init_event, \ 164 .raw_init = ftrace_raw_init_event, \
226 .show_format = ftrace_format_##call, \ 165 .print_fmt = print, \
227 .define_fields = ftrace_define_fields_##call, \ 166 .define_fields = ftrace_define_fields_##call, \
228}; \ 167}; \
229 168
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b1342c5d37cf..3fc2a575664f 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -18,6 +18,7 @@ struct fgraph_cpu_data {
18 pid_t last_pid; 18 pid_t last_pid;
19 int depth; 19 int depth;
20 int ignore; 20 int ignore;
21 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
21}; 22};
22 23
23struct fgraph_data { 24struct fgraph_data {
@@ -187,7 +188,7 @@ static int __trace_graph_entry(struct trace_array *tr,
187 struct ring_buffer *buffer = tr->buffer; 188 struct ring_buffer *buffer = tr->buffer;
188 struct ftrace_graph_ent_entry *entry; 189 struct ftrace_graph_ent_entry *entry;
189 190
190 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 191 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
191 return 0; 192 return 0;
192 193
193 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, 194 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -212,13 +213,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
212 int cpu; 213 int cpu;
213 int pc; 214 int pc;
214 215
215 if (unlikely(!tr))
216 return 0;
217
218 if (!ftrace_trace_task(current)) 216 if (!ftrace_trace_task(current))
219 return 0; 217 return 0;
220 218
221 if (!ftrace_graph_addr(trace->func)) 219 /* trace it when it is-nested-in or is a function enabled. */
220 if (!(trace->depth || ftrace_graph_addr(trace->func)))
222 return 0; 221 return 0;
223 222
224 local_irq_save(flags); 223 local_irq_save(flags);
@@ -231,9 +230,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
231 } else { 230 } else {
232 ret = 0; 231 ret = 0;
233 } 232 }
234 /* Only do the atomic if it is not already set */
235 if (!test_tsk_trace_graph(current))
236 set_tsk_trace_graph(current);
237 233
238 atomic_dec(&data->disabled); 234 atomic_dec(&data->disabled);
239 local_irq_restore(flags); 235 local_irq_restore(flags);
@@ -251,7 +247,7 @@ static void __trace_graph_return(struct trace_array *tr,
251 struct ring_buffer *buffer = tr->buffer; 247 struct ring_buffer *buffer = tr->buffer;
252 struct ftrace_graph_ret_entry *entry; 248 struct ftrace_graph_ret_entry *entry;
253 249
254 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 250 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
255 return; 251 return;
256 252
257 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, 253 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -281,17 +277,24 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
281 pc = preempt_count(); 277 pc = preempt_count();
282 __trace_graph_return(tr, trace, flags, pc); 278 __trace_graph_return(tr, trace, flags, pc);
283 } 279 }
284 if (!trace->depth)
285 clear_tsk_trace_graph(current);
286 atomic_dec(&data->disabled); 280 atomic_dec(&data->disabled);
287 local_irq_restore(flags); 281 local_irq_restore(flags);
288} 282}
289 283
284void set_graph_array(struct trace_array *tr)
285{
286 graph_array = tr;
287
288 /* Make graph_array visible before we start tracing */
289
290 smp_mb();
291}
292
290static int graph_trace_init(struct trace_array *tr) 293static int graph_trace_init(struct trace_array *tr)
291{ 294{
292 int ret; 295 int ret;
293 296
294 graph_array = tr; 297 set_graph_array(tr);
295 ret = register_ftrace_graph(&trace_graph_return, 298 ret = register_ftrace_graph(&trace_graph_return,
296 &trace_graph_entry); 299 &trace_graph_entry);
297 if (ret) 300 if (ret)
@@ -301,11 +304,6 @@ static int graph_trace_init(struct trace_array *tr)
301 return 0; 304 return 0;
302} 305}
303 306
304void set_graph_array(struct trace_array *tr)
305{
306 graph_array = tr;
307}
308
309static void graph_trace_reset(struct trace_array *tr) 307static void graph_trace_reset(struct trace_array *tr)
310{ 308{
311 tracing_stop_cmdline_record(); 309 tracing_stop_cmdline_record();
@@ -673,15 +671,21 @@ print_graph_entry_leaf(struct trace_iterator *iter,
673 duration = graph_ret->rettime - graph_ret->calltime; 671 duration = graph_ret->rettime - graph_ret->calltime;
674 672
675 if (data) { 673 if (data) {
674 struct fgraph_cpu_data *cpu_data;
676 int cpu = iter->cpu; 675 int cpu = iter->cpu;
677 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 676
677 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
678 678
679 /* 679 /*
680 * Comments display at + 1 to depth. Since 680 * Comments display at + 1 to depth. Since
681 * this is a leaf function, keep the comments 681 * this is a leaf function, keep the comments
682 * equal to this depth. 682 * equal to this depth.
683 */ 683 */
684 *depth = call->depth - 1; 684 cpu_data->depth = call->depth - 1;
685
686 /* No need to keep this function around for this depth */
687 if (call->depth < FTRACE_RETFUNC_DEPTH)
688 cpu_data->enter_funcs[call->depth] = 0;
685 } 689 }
686 690
687 /* Overhead */ 691 /* Overhead */
@@ -721,10 +725,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
721 int i; 725 int i;
722 726
723 if (data) { 727 if (data) {
728 struct fgraph_cpu_data *cpu_data;
724 int cpu = iter->cpu; 729 int cpu = iter->cpu;
725 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
726 730
727 *depth = call->depth; 731 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
732 cpu_data->depth = call->depth;
733
734 /* Save this function pointer to see if the exit matches */
735 if (call->depth < FTRACE_RETFUNC_DEPTH)
736 cpu_data->enter_funcs[call->depth] = call->func;
728 } 737 }
729 738
730 /* No overhead */ 739 /* No overhead */
@@ -854,19 +863,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
854 struct fgraph_data *data = iter->private; 863 struct fgraph_data *data = iter->private;
855 pid_t pid = ent->pid; 864 pid_t pid = ent->pid;
856 int cpu = iter->cpu; 865 int cpu = iter->cpu;
866 int func_match = 1;
857 int ret; 867 int ret;
858 int i; 868 int i;
859 869
860 if (data) { 870 if (data) {
871 struct fgraph_cpu_data *cpu_data;
861 int cpu = iter->cpu; 872 int cpu = iter->cpu;
862 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 873
874 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
863 875
864 /* 876 /*
865 * Comments display at + 1 to depth. This is the 877 * Comments display at + 1 to depth. This is the
866 * return from a function, we now want the comments 878 * return from a function, we now want the comments
867 * to display at the same level of the bracket. 879 * to display at the same level of the bracket.
868 */ 880 */
869 *depth = trace->depth - 1; 881 cpu_data->depth = trace->depth - 1;
882
883 if (trace->depth < FTRACE_RETFUNC_DEPTH) {
884 if (cpu_data->enter_funcs[trace->depth] != trace->func)
885 func_match = 0;
886 cpu_data->enter_funcs[trace->depth] = 0;
887 }
870 } 888 }
871 889
872 if (print_graph_prologue(iter, s, 0, 0)) 890 if (print_graph_prologue(iter, s, 0, 0))
@@ -891,9 +909,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
891 return TRACE_TYPE_PARTIAL_LINE; 909 return TRACE_TYPE_PARTIAL_LINE;
892 } 910 }
893 911
894 ret = trace_seq_printf(s, "}\n"); 912 /*
895 if (!ret) 913 * If the return function does not have a matching entry,
896 return TRACE_TYPE_PARTIAL_LINE; 914 * then the entry was lost. Instead of just printing
915 * the '}' and letting the user guess what function this
916 * belongs to, write out the function name.
917 */
918 if (func_match) {
919 ret = trace_seq_printf(s, "}\n");
920 if (!ret)
921 return TRACE_TYPE_PARTIAL_LINE;
922 } else {
923 ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func);
924 if (!ret)
925 return TRACE_TYPE_PARTIAL_LINE;
926 }
897 927
898 /* Overrun */ 928 /* Overrun */
899 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 929 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 50b1b8239806..505c92273b1a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -91,11 +91,6 @@ static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
91 return retval; 91 return retval;
92} 92}
93 93
94static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
95{
96 return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
97}
98
99static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, 94static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
100 void *dummy) 95 void *dummy)
101{ 96{
@@ -231,9 +226,7 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
231{ 226{
232 int ret = -EINVAL; 227 int ret = -EINVAL;
233 228
234 if (ff->func == fetch_argument) 229 if (ff->func == fetch_register) {
235 ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
236 else if (ff->func == fetch_register) {
237 const char *name; 230 const char *name;
238 name = regs_query_register_name((unsigned int)((long)ff->data)); 231 name = regs_query_register_name((unsigned int)((long)ff->data));
239 ret = snprintf(buf, n, "%%%s", name); 232 ret = snprintf(buf, n, "%%%s", name);
@@ -489,14 +482,6 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
489 } 482 }
490 } else 483 } else
491 ret = -EINVAL; 484 ret = -EINVAL;
492 } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
493 ret = strict_strtoul(arg + 3, 10, &param);
494 if (ret || param > PARAM_MAX_ARGS)
495 ret = -EINVAL;
496 else {
497 ff->func = fetch_argument;
498 ff->data = (void *)param;
499 }
500 } else 485 } else
501 ret = -EINVAL; 486 ret = -EINVAL;
502 return ret; 487 return ret;
@@ -611,7 +596,6 @@ static int create_trace_probe(int argc, char **argv)
611 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] 596 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
612 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] 597 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
613 * Fetch args: 598 * Fetch args:
614 * $argN : fetch Nth of function argument. (N:0-)
615 * $retval : fetch return value 599 * $retval : fetch return value
616 * $stack : fetch stack address 600 * $stack : fetch stack address
617 * $stackN : fetch Nth of stack (N:0-) 601 * $stackN : fetch Nth of stack (N:0-)
@@ -651,12 +635,12 @@ static int create_trace_probe(int argc, char **argv)
651 event = strchr(group, '/') + 1; 635 event = strchr(group, '/') + 1;
652 event[-1] = '\0'; 636 event[-1] = '\0';
653 if (strlen(group) == 0) { 637 if (strlen(group) == 0) {
654 pr_info("Group name is not specifiled\n"); 638 pr_info("Group name is not specified\n");
655 return -EINVAL; 639 return -EINVAL;
656 } 640 }
657 } 641 }
658 if (strlen(event) == 0) { 642 if (strlen(event) == 0) {
659 pr_info("Event name is not specifiled\n"); 643 pr_info("Event name is not specified\n");
660 return -EINVAL; 644 return -EINVAL;
661 } 645 }
662 } 646 }
@@ -958,7 +942,7 @@ static const struct file_operations kprobe_profile_ops = {
958}; 942};
959 943
960/* Kprobe handler */ 944/* Kprobe handler */
961static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 945static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
962{ 946{
963 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 947 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
964 struct kprobe_trace_entry *entry; 948 struct kprobe_trace_entry *entry;
@@ -978,7 +962,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
978 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 962 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
979 irq_flags, pc); 963 irq_flags, pc);
980 if (!event) 964 if (!event)
981 return 0; 965 return;
982 966
983 entry = ring_buffer_event_data(event); 967 entry = ring_buffer_event_data(event);
984 entry->nargs = tp->nr_args; 968 entry->nargs = tp->nr_args;
@@ -988,11 +972,10 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
988 972
989 if (!filter_current_check_discard(buffer, call, entry, event)) 973 if (!filter_current_check_discard(buffer, call, entry, event))
990 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 974 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
991 return 0;
992} 975}
993 976
994/* Kretprobe handler */ 977/* Kretprobe handler */
995static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, 978static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
996 struct pt_regs *regs) 979 struct pt_regs *regs)
997{ 980{
998 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 981 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1011,7 +994,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
1011 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 994 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
1012 irq_flags, pc); 995 irq_flags, pc);
1013 if (!event) 996 if (!event)
1014 return 0; 997 return;
1015 998
1016 entry = ring_buffer_event_data(event); 999 entry = ring_buffer_event_data(event);
1017 entry->nargs = tp->nr_args; 1000 entry->nargs = tp->nr_args;
@@ -1022,8 +1005,6 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
1022 1005
1023 if (!filter_current_check_discard(buffer, call, entry, event)) 1006 if (!filter_current_check_discard(buffer, call, entry, event))
1024 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1007 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
1025
1026 return 0;
1027} 1008}
1028 1009
1029/* Event entry printers */ 1010/* Event entry printers */
@@ -1174,213 +1155,123 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1174 return 0; 1155 return 0;
1175} 1156}
1176 1157
1177static int __probe_event_show_format(struct trace_seq *s, 1158static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1178 struct trace_probe *tp, const char *fmt,
1179 const char *arg)
1180{ 1159{
1181 int i; 1160 int i;
1161 int pos = 0;
1182 1162
1183 /* Show format */ 1163 const char *fmt, *arg;
1184 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1185 return 0;
1186 1164
1187 for (i = 0; i < tp->nr_args; i++) 1165 if (!probe_is_return(tp)) {
1188 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) 1166 fmt = "(%lx)";
1189 return 0; 1167 arg = "REC->" FIELD_STRING_IP;
1168 } else {
1169 fmt = "(%lx <- %lx)";
1170 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
1171 }
1190 1172
1191 if (!trace_seq_printf(s, "\", %s", arg)) 1173 /* When len=0, we just calculate the needed length */
1192 return 0; 1174#define LEN_OR_ZERO (len ? len - pos : 0)
1193 1175
1194 for (i = 0; i < tp->nr_args; i++) 1176 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1195 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
1196 return 0;
1197
1198 return trace_seq_puts(s, "\n");
1199}
1200 1177
1201#undef SHOW_FIELD 1178 for (i = 0; i < tp->nr_args; i++) {
1202#define SHOW_FIELD(type, item, name) \ 1179 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx",
1203 do { \ 1180 tp->args[i].name);
1204 ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \ 1181 }
1205 "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\
1206 (unsigned int)offsetof(typeof(field), item),\
1207 (unsigned int)sizeof(type), \
1208 is_signed_type(type)); \
1209 if (!ret) \
1210 return 0; \
1211 } while (0)
1212 1182
1213static int kprobe_event_show_format(struct ftrace_event_call *call, 1183 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1214 struct trace_seq *s)
1215{
1216 struct kprobe_trace_entry field __attribute__((unused));
1217 int ret, i;
1218 struct trace_probe *tp = (struct trace_probe *)call->data;
1219 1184
1220 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); 1185 for (i = 0; i < tp->nr_args; i++) {
1221 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); 1186 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1187 tp->args[i].name);
1188 }
1222 1189
1223 /* Show fields */ 1190#undef LEN_OR_ZERO
1224 for (i = 0; i < tp->nr_args; i++)
1225 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1226 trace_seq_puts(s, "\n");
1227 1191
1228 return __probe_event_show_format(s, tp, "(%lx)", 1192 /* return the length of print_fmt */
1229 "REC->" FIELD_STRING_IP); 1193 return pos;
1230} 1194}
1231 1195
1232static int kretprobe_event_show_format(struct ftrace_event_call *call, 1196static int set_print_fmt(struct trace_probe *tp)
1233 struct trace_seq *s)
1234{ 1197{
1235 struct kretprobe_trace_entry field __attribute__((unused)); 1198 int len;
1236 int ret, i; 1199 char *print_fmt;
1237 struct trace_probe *tp = (struct trace_probe *)call->data;
1238 1200
1239 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); 1201 /* First: called with 0 length to calculate the needed length */
1240 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); 1202 len = __set_print_fmt(tp, NULL, 0);
1241 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); 1203 print_fmt = kmalloc(len + 1, GFP_KERNEL);
1204 if (!print_fmt)
1205 return -ENOMEM;
1242 1206
1243 /* Show fields */ 1207 /* Second: actually write the @print_fmt */
1244 for (i = 0; i < tp->nr_args; i++) 1208 __set_print_fmt(tp, print_fmt, len + 1);
1245 SHOW_FIELD(unsigned long, args[i], tp->args[i].name); 1209 tp->call.print_fmt = print_fmt;
1246 trace_seq_puts(s, "\n");
1247 1210
1248 return __probe_event_show_format(s, tp, "(%lx <- %lx)", 1211 return 0;
1249 "REC->" FIELD_STRING_FUNC
1250 ", REC->" FIELD_STRING_RETIP);
1251} 1212}
1252 1213
1253#ifdef CONFIG_EVENT_PROFILE 1214#ifdef CONFIG_PERF_EVENTS
1254 1215
1255/* Kprobe profile handler */ 1216/* Kprobe profile handler */
1256static __kprobes int kprobe_profile_func(struct kprobe *kp, 1217static __kprobes void kprobe_profile_func(struct kprobe *kp,
1257 struct pt_regs *regs) 1218 struct pt_regs *regs)
1258{ 1219{
1259 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1220 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1260 struct ftrace_event_call *call = &tp->call; 1221 struct ftrace_event_call *call = &tp->call;
1261 struct kprobe_trace_entry *entry; 1222 struct kprobe_trace_entry *entry;
1262 struct trace_entry *ent; 1223 int size, __size, i;
1263 int size, __size, i, pc, __cpu;
1264 unsigned long irq_flags; 1224 unsigned long irq_flags;
1265 char *trace_buf;
1266 char *raw_data;
1267 int rctx; 1225 int rctx;
1268 1226
1269 pc = preempt_count();
1270 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1227 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1271 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1228 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1272 size -= sizeof(u32); 1229 size -= sizeof(u32);
1273 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1230 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1274 "profile buffer not large enough")) 1231 "profile buffer not large enough"))
1275 return 0; 1232 return;
1276
1277 /*
1278 * Protect the non nmi buffer
1279 * This also protects the rcu read side
1280 */
1281 local_irq_save(irq_flags);
1282
1283 rctx = perf_swevent_get_recursion_context();
1284 if (rctx < 0)
1285 goto end_recursion;
1286
1287 __cpu = smp_processor_id();
1288
1289 if (in_nmi())
1290 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1291 else
1292 trace_buf = rcu_dereference(perf_trace_buf);
1293 1233
1294 if (!trace_buf) 1234 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
1295 goto end; 1235 if (!entry)
1296 1236 return;
1297 raw_data = per_cpu_ptr(trace_buf, __cpu);
1298
1299 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1300 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1301 entry = (struct kprobe_trace_entry *)raw_data;
1302 ent = &entry->ent;
1303 1237
1304 tracing_generic_entry_update(ent, irq_flags, pc);
1305 ent->type = call->id;
1306 entry->nargs = tp->nr_args; 1238 entry->nargs = tp->nr_args;
1307 entry->ip = (unsigned long)kp->addr; 1239 entry->ip = (unsigned long)kp->addr;
1308 for (i = 0; i < tp->nr_args; i++) 1240 for (i = 0; i < tp->nr_args; i++)
1309 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1241 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1310 perf_tp_event(call->id, entry->ip, 1, entry, size);
1311
1312end:
1313 perf_swevent_put_recursion_context(rctx);
1314end_recursion:
1315 local_irq_restore(irq_flags);
1316 1242
1317 return 0; 1243 ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags);
1318} 1244}
1319 1245
1320/* Kretprobe profile handler */ 1246/* Kretprobe profile handler */
1321static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, 1247static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
1322 struct pt_regs *regs) 1248 struct pt_regs *regs)
1323{ 1249{
1324 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1250 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1325 struct ftrace_event_call *call = &tp->call; 1251 struct ftrace_event_call *call = &tp->call;
1326 struct kretprobe_trace_entry *entry; 1252 struct kretprobe_trace_entry *entry;
1327 struct trace_entry *ent; 1253 int size, __size, i;
1328 int size, __size, i, pc, __cpu;
1329 unsigned long irq_flags; 1254 unsigned long irq_flags;
1330 char *trace_buf;
1331 char *raw_data;
1332 int rctx; 1255 int rctx;
1333 1256
1334 pc = preempt_count();
1335 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1257 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1336 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1258 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1337 size -= sizeof(u32); 1259 size -= sizeof(u32);
1338 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1260 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1339 "profile buffer not large enough")) 1261 "profile buffer not large enough"))
1340 return 0; 1262 return;
1341
1342 /*
1343 * Protect the non nmi buffer
1344 * This also protects the rcu read side
1345 */
1346 local_irq_save(irq_flags);
1347
1348 rctx = perf_swevent_get_recursion_context();
1349 if (rctx < 0)
1350 goto end_recursion;
1351
1352 __cpu = smp_processor_id();
1353 1263
1354 if (in_nmi()) 1264 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
1355 trace_buf = rcu_dereference(perf_trace_buf_nmi); 1265 if (!entry)
1356 else 1266 return;
1357 trace_buf = rcu_dereference(perf_trace_buf);
1358
1359 if (!trace_buf)
1360 goto end;
1361
1362 raw_data = per_cpu_ptr(trace_buf, __cpu);
1363
1364 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1365 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1366 entry = (struct kretprobe_trace_entry *)raw_data;
1367 ent = &entry->ent;
1368 1267
1369 tracing_generic_entry_update(ent, irq_flags, pc);
1370 ent->type = call->id;
1371 entry->nargs = tp->nr_args; 1268 entry->nargs = tp->nr_args;
1372 entry->func = (unsigned long)tp->rp.kp.addr; 1269 entry->func = (unsigned long)tp->rp.kp.addr;
1373 entry->ret_ip = (unsigned long)ri->ret_addr; 1270 entry->ret_ip = (unsigned long)ri->ret_addr;
1374 for (i = 0; i < tp->nr_args; i++) 1271 for (i = 0; i < tp->nr_args; i++)
1375 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1272 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1376 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1377
1378end:
1379 perf_swevent_put_recursion_context(rctx);
1380end_recursion:
1381 local_irq_restore(irq_flags);
1382 1273
1383 return 0; 1274 ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags);
1384} 1275}
1385 1276
1386static int probe_profile_enable(struct ftrace_event_call *call) 1277static int probe_profile_enable(struct ftrace_event_call *call)
@@ -1408,7 +1299,7 @@ static void probe_profile_disable(struct ftrace_event_call *call)
1408 disable_kprobe(&tp->rp.kp); 1299 disable_kprobe(&tp->rp.kp);
1409 } 1300 }
1410} 1301}
1411#endif /* CONFIG_EVENT_PROFILE */ 1302#endif /* CONFIG_PERF_EVENTS */
1412 1303
1413 1304
1414static __kprobes 1305static __kprobes
@@ -1418,10 +1309,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1418 1309
1419 if (tp->flags & TP_FLAG_TRACE) 1310 if (tp->flags & TP_FLAG_TRACE)
1420 kprobe_trace_func(kp, regs); 1311 kprobe_trace_func(kp, regs);
1421#ifdef CONFIG_EVENT_PROFILE 1312#ifdef CONFIG_PERF_EVENTS
1422 if (tp->flags & TP_FLAG_PROFILE) 1313 if (tp->flags & TP_FLAG_PROFILE)
1423 kprobe_profile_func(kp, regs); 1314 kprobe_profile_func(kp, regs);
1424#endif /* CONFIG_EVENT_PROFILE */ 1315#endif
1425 return 0; /* We don't tweek kernel, so just return 0 */ 1316 return 0; /* We don't tweek kernel, so just return 0 */
1426} 1317}
1427 1318
@@ -1432,10 +1323,10 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1432 1323
1433 if (tp->flags & TP_FLAG_TRACE) 1324 if (tp->flags & TP_FLAG_TRACE)
1434 kretprobe_trace_func(ri, regs); 1325 kretprobe_trace_func(ri, regs);
1435#ifdef CONFIG_EVENT_PROFILE 1326#ifdef CONFIG_PERF_EVENTS
1436 if (tp->flags & TP_FLAG_PROFILE) 1327 if (tp->flags & TP_FLAG_PROFILE)
1437 kretprobe_profile_func(ri, regs); 1328 kretprobe_profile_func(ri, regs);
1438#endif /* CONFIG_EVENT_PROFILE */ 1329#endif
1439 return 0; /* We don't tweek kernel, so just return 0 */ 1330 return 0; /* We don't tweek kernel, so just return 0 */
1440} 1331}
1441 1332
@@ -1448,23 +1339,25 @@ static int register_probe_event(struct trace_probe *tp)
1448 if (probe_is_return(tp)) { 1339 if (probe_is_return(tp)) {
1449 tp->event.trace = print_kretprobe_event; 1340 tp->event.trace = print_kretprobe_event;
1450 call->raw_init = probe_event_raw_init; 1341 call->raw_init = probe_event_raw_init;
1451 call->show_format = kretprobe_event_show_format;
1452 call->define_fields = kretprobe_event_define_fields; 1342 call->define_fields = kretprobe_event_define_fields;
1453 } else { 1343 } else {
1454 tp->event.trace = print_kprobe_event; 1344 tp->event.trace = print_kprobe_event;
1455 call->raw_init = probe_event_raw_init; 1345 call->raw_init = probe_event_raw_init;
1456 call->show_format = kprobe_event_show_format;
1457 call->define_fields = kprobe_event_define_fields; 1346 call->define_fields = kprobe_event_define_fields;
1458 } 1347 }
1348 if (set_print_fmt(tp) < 0)
1349 return -ENOMEM;
1459 call->event = &tp->event; 1350 call->event = &tp->event;
1460 call->id = register_ftrace_event(&tp->event); 1351 call->id = register_ftrace_event(&tp->event);
1461 if (!call->id) 1352 if (!call->id) {
1353 kfree(call->print_fmt);
1462 return -ENODEV; 1354 return -ENODEV;
1355 }
1463 call->enabled = 0; 1356 call->enabled = 0;
1464 call->regfunc = probe_event_enable; 1357 call->regfunc = probe_event_enable;
1465 call->unregfunc = probe_event_disable; 1358 call->unregfunc = probe_event_disable;
1466 1359
1467#ifdef CONFIG_EVENT_PROFILE 1360#ifdef CONFIG_PERF_EVENTS
1468 call->profile_enable = probe_profile_enable; 1361 call->profile_enable = probe_profile_enable;
1469 call->profile_disable = probe_profile_disable; 1362 call->profile_disable = probe_profile_disable;
1470#endif 1363#endif
@@ -1472,6 +1365,7 @@ static int register_probe_event(struct trace_probe *tp)
1472 ret = trace_add_event_call(call); 1365 ret = trace_add_event_call(call);
1473 if (ret) { 1366 if (ret) {
1474 pr_info("Failed to register kprobe event: %s\n", call->name); 1367 pr_info("Failed to register kprobe event: %s\n", call->name);
1368 kfree(call->print_fmt);
1475 unregister_ftrace_event(&tp->event); 1369 unregister_ftrace_event(&tp->event);
1476 } 1370 }
1477 return ret; 1371 return ret;
@@ -1481,6 +1375,7 @@ static void unregister_probe_event(struct trace_probe *tp)
1481{ 1375{
1482 /* tp->event is unregistered in trace_remove_event_call() */ 1376 /* tp->event is unregistered in trace_remove_event_call() */
1483 trace_remove_event_call(&tp->call); 1377 trace_remove_event_call(&tp->call);
1378 kfree(tp->call.print_fmt);
1484} 1379}
1485 1380
1486/* Make a debugfs interface for controling probe points */ 1381/* Make a debugfs interface for controling probe points */
@@ -1523,28 +1418,67 @@ static int kprobe_trace_selftest_target(int a1, int a2, int a3,
1523 1418
1524static __init int kprobe_trace_self_tests_init(void) 1419static __init int kprobe_trace_self_tests_init(void)
1525{ 1420{
1526 int ret; 1421 int ret, warn = 0;
1527 int (*target)(int, int, int, int, int, int); 1422 int (*target)(int, int, int, int, int, int);
1423 struct trace_probe *tp;
1528 1424
1529 target = kprobe_trace_selftest_target; 1425 target = kprobe_trace_selftest_target;
1530 1426
1531 pr_info("Testing kprobe tracing: "); 1427 pr_info("Testing kprobe tracing: ");
1532 1428
1533 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " 1429 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1534 "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); 1430 "$stack $stack0 +0($stack)");
1535 if (WARN_ON_ONCE(ret)) 1431 if (WARN_ON_ONCE(ret)) {
1536 pr_warning("error enabling function entry\n"); 1432 pr_warning("error on probing function entry.\n");
1433 warn++;
1434 } else {
1435 /* Enable trace point */
1436 tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM);
1437 if (WARN_ON_ONCE(tp == NULL)) {
1438 pr_warning("error on getting new probe.\n");
1439 warn++;
1440 } else
1441 probe_event_enable(&tp->call);
1442 }
1537 1443
1538 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 1444 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1539 "$retval"); 1445 "$retval");
1540 if (WARN_ON_ONCE(ret)) 1446 if (WARN_ON_ONCE(ret)) {
1541 pr_warning("error enabling function return\n"); 1447 pr_warning("error on probing function return.\n");
1448 warn++;
1449 } else {
1450 /* Enable trace point */
1451 tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM);
1452 if (WARN_ON_ONCE(tp == NULL)) {
1453 pr_warning("error on getting new probe.\n");
1454 warn++;
1455 } else
1456 probe_event_enable(&tp->call);
1457 }
1458
1459 if (warn)
1460 goto end;
1542 1461
1543 ret = target(1, 2, 3, 4, 5, 6); 1462 ret = target(1, 2, 3, 4, 5, 6);
1544 1463
1545 cleanup_all_probes(); 1464 ret = command_trace_probe("-:testprobe");
1465 if (WARN_ON_ONCE(ret)) {
1466 pr_warning("error on deleting a probe.\n");
1467 warn++;
1468 }
1546 1469
1547 pr_cont("OK\n"); 1470 ret = command_trace_probe("-:testprobe2");
1471 if (WARN_ON_ONCE(ret)) {
1472 pr_warning("error on deleting a probe.\n");
1473 warn++;
1474 }
1475
1476end:
1477 cleanup_all_probes();
1478 if (warn)
1479 pr_cont("NG: Some tests are failed. Please check them.\n");
1480 else
1481 pr_cont("OK\n");
1548 return 0; 1482 return 0;
1549} 1483}
1550 1484
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 75289f372dd2..cba47d7935cc 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -143,70 +143,65 @@ extern char *__bad_type_size(void);
143 #type, #name, offsetof(typeof(trace), name), \ 143 #type, #name, offsetof(typeof(trace), name), \
144 sizeof(trace.name), is_signed_type(type) 144 sizeof(trace.name), is_signed_type(type)
145 145
146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 146static
147int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
147{ 148{
148 int i; 149 int i;
149 int ret; 150 int pos = 0;
150 struct syscall_metadata *entry = call->data;
151 struct syscall_trace_enter trace;
152 int offset = offsetof(struct syscall_trace_enter, args);
153 151
154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 152 /* When len=0, we just calculate the needed length */
155 "\tsigned:%u;\n", 153#define LEN_OR_ZERO (len ? len - pos : 0)
156 SYSCALL_FIELD(int, nr));
157 if (!ret)
158 return 0;
159 154
155 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
160 for (i = 0; i < entry->nb_args; i++) { 156 for (i = 0; i < entry->nb_args; i++) {
161 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], 157 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
162 entry->args[i]); 158 entry->args[i], sizeof(unsigned long),
163 if (!ret) 159 i == entry->nb_args - 1 ? "" : ", ");
164 return 0;
165 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
166 "\tsigned:%u;\n", offset,
167 sizeof(unsigned long),
168 is_signed_type(unsigned long));
169 if (!ret)
170 return 0;
171 offset += sizeof(unsigned long);
172 } 160 }
161 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
173 162
174 trace_seq_puts(s, "\nprint fmt: \"");
175 for (i = 0; i < entry->nb_args; i++) { 163 for (i = 0; i < entry->nb_args; i++) {
176 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], 164 pos += snprintf(buf + pos, LEN_OR_ZERO,
177 sizeof(unsigned long), 165 ", ((unsigned long)(REC->%s))", entry->args[i]);
178 i == entry->nb_args - 1 ? "" : ", ");
179 if (!ret)
180 return 0;
181 } 166 }
182 trace_seq_putc(s, '"');
183 167
184 for (i = 0; i < entry->nb_args; i++) { 168#undef LEN_OR_ZERO
185 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
186 entry->args[i]);
187 if (!ret)
188 return 0;
189 }
190 169
191 return trace_seq_putc(s, '\n'); 170 /* return the length of print_fmt */
171 return pos;
192} 172}
193 173
194int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) 174static int set_syscall_print_fmt(struct ftrace_event_call *call)
195{ 175{
196 int ret; 176 char *print_fmt;
197 struct syscall_trace_exit trace; 177 int len;
178 struct syscall_metadata *entry = call->data;
198 179
199 ret = trace_seq_printf(s, 180 if (entry->enter_event != call) {
200 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 181 call->print_fmt = "\"0x%lx\", REC->ret";
201 "\tsigned:%u;\n"
202 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
203 "\tsigned:%u;\n",
204 SYSCALL_FIELD(int, nr),
205 SYSCALL_FIELD(long, ret));
206 if (!ret)
207 return 0; 182 return 0;
183 }
184
185 /* First: called with 0 length to calculate the needed length */
186 len = __set_enter_print_fmt(entry, NULL, 0);
187
188 print_fmt = kmalloc(len + 1, GFP_KERNEL);
189 if (!print_fmt)
190 return -ENOMEM;
191
192 /* Second: actually write the @print_fmt */
193 __set_enter_print_fmt(entry, print_fmt, len + 1);
194 call->print_fmt = print_fmt;
208 195
209 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); 196 return 0;
197}
198
199static void free_syscall_print_fmt(struct ftrace_event_call *call)
200{
201 struct syscall_metadata *entry = call->data;
202
203 if (entry->enter_event == call)
204 kfree(call->print_fmt);
210} 205}
211 206
212int syscall_enter_define_fields(struct ftrace_event_call *call) 207int syscall_enter_define_fields(struct ftrace_event_call *call)
@@ -386,12 +381,22 @@ int init_syscall_trace(struct ftrace_event_call *call)
386{ 381{
387 int id; 382 int id;
388 383
389 id = register_ftrace_event(call->event); 384 if (set_syscall_print_fmt(call) < 0)
390 if (!id) 385 return -ENOMEM;
391 return -ENODEV; 386
392 call->id = id; 387 id = trace_event_raw_init(call);
393 INIT_LIST_HEAD(&call->fields); 388
394 return 0; 389 if (id < 0) {
390 free_syscall_print_fmt(call);
391 return id;
392 }
393
394 return id;
395}
396
397unsigned long __init arch_syscall_addr(int nr)
398{
399 return (unsigned long)sys_call_table[nr];
395} 400}
396 401
397int __init init_ftrace_syscalls(void) 402int __init init_ftrace_syscalls(void)
@@ -421,7 +426,7 @@ int __init init_ftrace_syscalls(void)
421} 426}
422core_initcall(init_ftrace_syscalls); 427core_initcall(init_ftrace_syscalls);
423 428
424#ifdef CONFIG_EVENT_PROFILE 429#ifdef CONFIG_PERF_EVENTS
425 430
426static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); 431static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
427static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); 432static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
@@ -433,12 +438,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
433 struct syscall_metadata *sys_data; 438 struct syscall_metadata *sys_data;
434 struct syscall_trace_enter *rec; 439 struct syscall_trace_enter *rec;
435 unsigned long flags; 440 unsigned long flags;
436 char *trace_buf;
437 char *raw_data;
438 int syscall_nr; 441 int syscall_nr;
439 int rctx; 442 int rctx;
440 int size; 443 int size;
441 int cpu;
442 444
443 syscall_nr = syscall_get_nr(current, regs); 445 syscall_nr = syscall_get_nr(current, regs);
444 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 446 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -457,37 +459,15 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
457 "profile buffer not large enough")) 459 "profile buffer not large enough"))
458 return; 460 return;
459 461
460 /* Protect the per cpu buffer, begin the rcu read side */ 462 rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size,
461 local_irq_save(flags); 463 sys_data->enter_event->id, &rctx, &flags);
462 464 if (!rec)
463 rctx = perf_swevent_get_recursion_context(); 465 return;
464 if (rctx < 0)
465 goto end_recursion;
466
467 cpu = smp_processor_id();
468
469 trace_buf = rcu_dereference(perf_trace_buf);
470
471 if (!trace_buf)
472 goto end;
473
474 raw_data = per_cpu_ptr(trace_buf, cpu);
475
476 /* zero the dead bytes from align to not leak stack to user */
477 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
478 466
479 rec = (struct syscall_trace_enter *) raw_data;
480 tracing_generic_entry_update(&rec->ent, 0, 0);
481 rec->ent.type = sys_data->enter_event->id;
482 rec->nr = syscall_nr; 467 rec->nr = syscall_nr;
483 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 468 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
484 (unsigned long *)&rec->args); 469 (unsigned long *)&rec->args);
485 perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size); 470 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
486
487end:
488 perf_swevent_put_recursion_context(rctx);
489end_recursion:
490 local_irq_restore(flags);
491} 471}
492 472
493int prof_sysenter_enable(struct ftrace_event_call *call) 473int prof_sysenter_enable(struct ftrace_event_call *call)
@@ -531,11 +511,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
531 struct syscall_trace_exit *rec; 511 struct syscall_trace_exit *rec;
532 unsigned long flags; 512 unsigned long flags;
533 int syscall_nr; 513 int syscall_nr;
534 char *trace_buf;
535 char *raw_data;
536 int rctx; 514 int rctx;
537 int size; 515 int size;
538 int cpu;
539 516
540 syscall_nr = syscall_get_nr(current, regs); 517 syscall_nr = syscall_get_nr(current, regs);
541 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 518 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -557,38 +534,15 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
557 "exit event has grown above profile buffer size")) 534 "exit event has grown above profile buffer size"))
558 return; 535 return;
559 536
560 /* Protect the per cpu buffer, begin the rcu read side */ 537 rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size,
561 local_irq_save(flags); 538 sys_data->exit_event->id, &rctx, &flags);
562 539 if (!rec)
563 rctx = perf_swevent_get_recursion_context(); 540 return;
564 if (rctx < 0)
565 goto end_recursion;
566
567 cpu = smp_processor_id();
568
569 trace_buf = rcu_dereference(perf_trace_buf);
570
571 if (!trace_buf)
572 goto end;
573
574 raw_data = per_cpu_ptr(trace_buf, cpu);
575
576 /* zero the dead bytes from align to not leak stack to user */
577 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
578
579 rec = (struct syscall_trace_exit *)raw_data;
580 541
581 tracing_generic_entry_update(&rec->ent, 0, 0);
582 rec->ent.type = sys_data->exit_event->id;
583 rec->nr = syscall_nr; 542 rec->nr = syscall_nr;
584 rec->ret = syscall_get_return_value(current, regs); 543 rec->ret = syscall_get_return_value(current, regs);
585 544
586 perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); 545 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
587
588end:
589 perf_swevent_put_recursion_context(rctx);
590end_recursion:
591 local_irq_restore(flags);
592} 546}
593 547
594int prof_sysexit_enable(struct ftrace_event_call *call) 548int prof_sysexit_enable(struct ftrace_event_call *call)
@@ -603,7 +557,7 @@ int prof_sysexit_enable(struct ftrace_event_call *call)
603 ret = register_trace_sys_exit(prof_syscall_exit); 557 ret = register_trace_sys_exit(prof_syscall_exit);
604 if (ret) { 558 if (ret) {
605 pr_info("event trace: Could not activate" 559 pr_info("event trace: Could not activate"
606 "syscall entry trace point"); 560 "syscall exit trace point");
607 } else { 561 } else {
608 set_bit(num, enabled_prof_exit_syscalls); 562 set_bit(num, enabled_prof_exit_syscalls);
609 sys_prof_refcount_exit++; 563 sys_prof_refcount_exit++;
@@ -626,6 +580,5 @@ void prof_sysexit_disable(struct ftrace_event_call *call)
626 mutex_unlock(&syscall_trace_lock); 580 mutex_unlock(&syscall_trace_lock);
627} 581}
628 582
629#endif 583#endif /* CONFIG_PERF_EVENTS */
630
631 584
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 00d59d048edf..0a67e041edf8 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -21,6 +21,7 @@
21#include <linux/tsacct_kern.h> 21#include <linux/tsacct_kern.h>
22#include <linux/acct.h> 22#include <linux/acct.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/mm.h>
24 25
25/* 26/*
26 * fill in basic accounting fields 27 * fill in basic accounting fields
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165ca70c..766467b3bcb7 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -56,9 +56,6 @@ struct user_struct root_user = {
56 .sigpending = ATOMIC_INIT(0), 56 .sigpending = ATOMIC_INIT(0),
57 .locked_shm = 0, 57 .locked_shm = 0,
58 .user_ns = &init_user_ns, 58 .user_ns = &init_user_ns,
59#ifdef CONFIG_USER_SCHED
60 .tg = &init_task_group,
61#endif
62}; 59};
63 60
64/* 61/*
@@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 72 put_user_ns(up->user_ns);
76} 73}
77 74
78#ifdef CONFIG_USER_SCHED
79
80static void sched_destroy_user(struct user_struct *up)
81{
82 sched_destroy_group(up->tg);
83}
84
85static int sched_create_user(struct user_struct *up)
86{
87 int rc = 0;
88
89 up->tg = sched_create_group(&root_task_group);
90 if (IS_ERR(up->tg))
91 rc = -ENOMEM;
92
93 set_tg_uid(up);
94
95 return rc;
96}
97
98#else /* CONFIG_USER_SCHED */
99
100static void sched_destroy_user(struct user_struct *up) { }
101static int sched_create_user(struct user_struct *up) { return 0; }
102
103#endif /* CONFIG_USER_SCHED */
104
105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
125static DEFINE_MUTEX(uids_mutex);
126
127static inline void uids_mutex_lock(void)
128{
129 mutex_lock(&uids_mutex);
130}
131
132static inline void uids_mutex_unlock(void)
133{
134 mutex_unlock(&uids_mutex);
135}
136
137/* uid directory attributes */
138#ifdef CONFIG_FAIR_GROUP_SCHED
139static ssize_t cpu_shares_show(struct kobject *kobj,
140 struct kobj_attribute *attr,
141 char *buf)
142{
143 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144
145 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
146}
147
148static ssize_t cpu_shares_store(struct kobject *kobj,
149 struct kobj_attribute *attr,
150 const char *buf, size_t size)
151{
152 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
153 unsigned long shares;
154 int rc;
155
156 sscanf(buf, "%lu", &shares);
157
158 rc = sched_group_set_shares(up->tg, shares);
159
160 return (rc ? rc : size);
161}
162
163static struct kobj_attribute cpu_share_attr =
164 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
165#endif
166
167#ifdef CONFIG_RT_GROUP_SCHED
168static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169 struct kobj_attribute *attr,
170 char *buf)
171{
172 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
173
174 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
175}
176
177static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
178 struct kobj_attribute *attr,
179 const char *buf, size_t size)
180{
181 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
182 unsigned long rt_runtime;
183 int rc;
184
185 sscanf(buf, "%ld", &rt_runtime);
186
187 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
188
189 return (rc ? rc : size);
190}
191
192static struct kobj_attribute cpu_rt_runtime_attr =
193 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
194
195static ssize_t cpu_rt_period_show(struct kobject *kobj,
196 struct kobj_attribute *attr,
197 char *buf)
198{
199 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
200
201 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
202}
203
204static ssize_t cpu_rt_period_store(struct kobject *kobj,
205 struct kobj_attribute *attr,
206 const char *buf, size_t size)
207{
208 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
209 unsigned long rt_period;
210 int rc;
211
212 sscanf(buf, "%lu", &rt_period);
213
214 rc = sched_group_set_rt_period(up->tg, rt_period);
215
216 return (rc ? rc : size);
217}
218
219static struct kobj_attribute cpu_rt_period_attr =
220 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
221#endif
222
223/* default attributes per uid directory */
224static struct attribute *uids_attributes[] = {
225#ifdef CONFIG_FAIR_GROUP_SCHED
226 &cpu_share_attr.attr,
227#endif
228#ifdef CONFIG_RT_GROUP_SCHED
229 &cpu_rt_runtime_attr.attr,
230 &cpu_rt_period_attr.attr,
231#endif
232 NULL
233};
234
235/* the lifetime of user_struct is not managed by the core (now) */
236static void uids_release(struct kobject *kobj)
237{
238 return;
239}
240
241static struct kobj_type uids_ktype = {
242 .sysfs_ops = &kobj_sysfs_ops,
243 .default_attrs = uids_attributes,
244 .release = uids_release,
245};
246
247/*
248 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
249 * We do not create this file for users in a user namespace (until
250 * sysfs tagging is implemented).
251 *
252 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
253 */
254static int uids_user_create(struct user_struct *up)
255{
256 struct kobject *kobj = &up->kobj;
257 int error;
258
259 memset(kobj, 0, sizeof(struct kobject));
260 if (up->user_ns != &init_user_ns)
261 return 0;
262 kobj->kset = uids_kset;
263 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
264 if (error) {
265 kobject_put(kobj);
266 goto done;
267 }
268
269 kobject_uevent(kobj, KOBJ_ADD);
270done:
271 return error;
272}
273
274/* create these entries in sysfs:
275 * "/sys/kernel/uids" directory
276 * "/sys/kernel/uids/0" directory (for root user)
277 * "/sys/kernel/uids/0/cpu_share" file (for root user)
278 */
279int __init uids_sysfs_init(void)
280{
281 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
282 if (!uids_kset)
283 return -ENOMEM;
284
285 return uids_user_create(&root_user);
286}
287
288/* delayed work function to remove sysfs directory for a user and free up
289 * corresponding structures.
290 */
291static void cleanup_user_struct(struct work_struct *w)
292{
293 struct user_struct *up = container_of(w, struct user_struct, work.work);
294 unsigned long flags;
295 int remove_user = 0;
296
297 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
298 * atomic.
299 */
300 uids_mutex_lock();
301
302 spin_lock_irqsave(&uidhash_lock, flags);
303 if (atomic_read(&up->__count) == 0) {
304 uid_hash_remove(up);
305 remove_user = 1;
306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
308
309 if (!remove_user)
310 goto done;
311
312 if (up->user_ns == &init_user_ns) {
313 kobject_uevent(&up->kobj, KOBJ_REMOVE);
314 kobject_del(&up->kobj);
315 kobject_put(&up->kobj);
316 }
317
318 sched_destroy_user(up);
319 key_put(up->uid_keyring);
320 key_put(up->session_keyring);
321 kmem_cache_free(uid_cachep, up);
322
323done:
324 uids_mutex_unlock();
325}
326
327/* IRQs are disabled and uidhash_lock is held upon function entry.
328 * IRQ state (as stored in flags) is restored and uidhash_lock released
329 * upon function exit.
330 */
331static void free_user(struct user_struct *up, unsigned long flags)
332{
333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336}
337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 75static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{ 76{
342 struct user_struct *user; 77 struct user_struct *user;
@@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
352 return NULL; 87 return NULL;
353} 88}
354 89
355int uids_sysfs_init(void) { return 0; }
356static inline int uids_user_create(struct user_struct *up) { return 0; }
357static inline void uids_mutex_lock(void) { }
358static inline void uids_mutex_unlock(void) { }
359
360/* IRQs are disabled and uidhash_lock is held upon function entry. 90/* IRQs are disabled and uidhash_lock is held upon function entry.
361 * IRQ state (as stored in flags) is restored and uidhash_lock released 91 * IRQ state (as stored in flags) is restored and uidhash_lock released
362 * upon function exit. 92 * upon function exit.
@@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags)
365{ 95{
366 uid_hash_remove(up); 96 uid_hash_remove(up);
367 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
368 sched_destroy_user(up);
369 key_put(up->uid_keyring); 98 key_put(up->uid_keyring);
370 key_put(up->session_keyring); 99 key_put(up->session_keyring);
371 kmem_cache_free(uid_cachep, up); 100 kmem_cache_free(uid_cachep, up);
372} 101}
373 102
374#endif
375
376#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
377/*
378 * We need to check if a setuid can take place. This function should be called
379 * before successfully completing the setuid.
380 */
381int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
382{
383
384 return sched_rt_can_attach(up->tg, tsk);
385
386}
387#else
388int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
389{
390 return 1;
391}
392#endif
393
394/* 103/*
395 * Locate the user_struct for the passed UID. If found, take a ref on it. The 104 * Locate the user_struct for the passed UID. If found, take a ref on it. The
396 * caller must undo that ref with free_uid(). 105 * caller must undo that ref with free_uid().
@@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
431 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() 140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
432 * atomic. 141 * atomic.
433 */ 142 */
434 uids_mutex_lock();
435
436 spin_lock_irq(&uidhash_lock); 143 spin_lock_irq(&uidhash_lock);
437 up = uid_hash_find(uid, hashent); 144 up = uid_hash_find(uid, hashent);
438 spin_unlock_irq(&uidhash_lock); 145 spin_unlock_irq(&uidhash_lock);
@@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
445 new->uid = uid; 152 new->uid = uid;
446 atomic_set(&new->__count, 1); 153 atomic_set(&new->__count, 1);
447 154
448 if (sched_create_user(new) < 0)
449 goto out_free_user;
450
451 new->user_ns = get_user_ns(ns); 155 new->user_ns = get_user_ns(ns);
452 156
453 if (uids_user_create(new))
454 goto out_destoy_sched;
455
456 /* 157 /*
457 * Before adding this, check whether we raced 158 * Before adding this, check whether we raced
458 * on adding the same user already.. 159 * on adding the same user already..
@@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
475 spin_unlock_irq(&uidhash_lock); 176 spin_unlock_irq(&uidhash_lock);
476 } 177 }
477 178
478 uids_mutex_unlock();
479
480 return up; 179 return up;
481 180
482out_destoy_sched:
483 sched_destroy_user(new);
484 put_user_ns(new->user_ns); 181 put_user_ns(new->user_ns);
485out_free_user:
486 kmem_cache_free(uid_cachep, new); 182 kmem_cache_free(uid_cachep, new);
487out_unlock: 183out_unlock:
488 uids_mutex_unlock();
489 return NULL; 184 return NULL;
490} 185}
491 186