aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2010-03-09 11:11:53 -0500
committerIngo Molnar <mingo@elte.hu>2010-03-09 11:11:53 -0500
commit548b84166917d6f5e2296123b85ad24aecd3801d (patch)
tree0ab0300e23a02df0fe3c0579627e4998bb122c00 /kernel
parentcfb581bcd4f8c158c6f2b48bf5e232bb9e6855c0 (diff)
parent57d54889cd00db2752994b389ba714138652e60c (diff)
Merge commit 'v2.6.34-rc1' into perf/urgent
Conflicts: tools/perf/util/probe-event.c Merge reason: Pick up -rc1 and resolve the conflict as well. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile7
-rw-r--r--kernel/audit_tree.c100
-rw-r--r--kernel/auditsc.c7
-rw-r--r--kernel/capability.c4
-rw-r--r--kernel/cgroup.c15
-rw-r--r--kernel/cpu.c2
-rw-r--r--kernel/early_res.c578
-rw-r--r--kernel/elfcore.c28
-rw-r--r--kernel/exit.c19
-rw-r--r--kernel/fork.c20
-rw-r--r--kernel/futex_compat.c6
-rw-r--r--kernel/irq/chip.c52
-rw-r--r--kernel/irq/handle.c58
-rw-r--r--kernel/irq/internals.h6
-rw-r--r--kernel/irq/numa_migrate.c4
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kprobes.c647
-rw-r--r--kernel/ksysfs.c8
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/lockdep.c18
-rw-r--r--kernel/module.c32
-rw-r--r--kernel/notifier.c6
-rw-r--r--kernel/padata.c696
-rw-r--r--kernel/panic.c46
-rw-r--r--kernel/params.c6
-rw-r--r--kernel/perf_event.c15
-rw-r--r--kernel/pid.c4
-rw-r--r--kernel/posix-cpu-timers.c36
-rw-r--r--kernel/posix-timers.c2
-rw-r--r--kernel/power/Kconfig19
-rw-r--r--kernel/power/hibernate.c9
-rw-r--r--kernel/power/main.c31
-rw-r--r--kernel/power/snapshot.c4
-rw-r--r--kernel/power/suspend.c3
-rw-r--r--kernel/power/swap.c4
-rw-r--r--kernel/power/swsusp.c58
-rw-r--r--kernel/power/user.c23
-rw-r--r--kernel/printk.c55
-rw-r--r--kernel/ptrace.c88
-rw-r--r--kernel/range.c163
-rw-r--r--kernel/rcupdate.c29
-rw-r--r--kernel/rcutorture.c102
-rw-r--r--kernel/rcutree.c268
-rw-r--r--kernel/rcutree.h61
-rw-r--r--kernel/rcutree_plugin.h229
-rw-r--r--kernel/rcutree_trace.c14
-rw-r--r--kernel/relay.c5
-rw-r--r--kernel/resource.c66
-rw-r--r--kernel/sched.c2206
-rw-r--r--kernel/sched_cpupri.c4
-rw-r--r--kernel/sched_fair.c1699
-rw-r--r--kernel/sched_idletask.c23
-rw-r--r--kernel/sched_rt.c59
-rw-r--r--kernel/signal.c45
-rw-r--r--kernel/smp.c8
-rw-r--r--kernel/srcu.c52
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/sys.c8
-rw-r--r--kernel/sysctl.c14
-rw-r--r--kernel/sysctl_binary.c7
-rw-r--r--kernel/taskstats.c6
-rw-r--r--kernel/time/clocksource.c14
-rw-r--r--kernel/time/ntp.c10
-rw-r--r--kernel/time/timekeeping.c1
-rw-r--r--kernel/trace/Kconfig11
-rw-r--r--kernel/trace/blktrace.c5
-rw-r--r--kernel/trace/ftrace.c51
-rw-r--r--kernel/trace/ring_buffer.c1
-rw-r--r--kernel/trace/ring_buffer_benchmark.c1
-rw-r--r--kernel/trace/trace.c150
-rw-r--r--kernel/trace/trace.h6
-rw-r--r--kernel/trace/trace_branch.c19
-rw-r--r--kernel/trace/trace_events.c81
-rw-r--r--kernel/trace/trace_export.c87
-rw-r--r--kernel/trace/trace_functions_graph.c82
-rw-r--r--kernel/trace/trace_kprobe.c108
-rw-r--r--kernel/trace/trace_syscalls.c113
-rw-r--r--kernel/tsacct.c1
-rw-r--r--kernel/user.c305
79 files changed, 5379 insertions, 3457 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 864ff75d65f2..a987aa1676b5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o 13 async.o range.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
14obj-y += groups.o 15obj-y += groups.o
15 16
16ifdef CONFIG_FUNCTION_TRACER 17ifdef CONFIG_FUNCTION_TRACER
@@ -90,6 +91,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
90obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 91obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
91obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 92obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
92obj-$(CONFIG_LATENCYTOP) += latencytop.o 93obj-$(CONFIG_LATENCYTOP) += latencytop.o
94obj-$(CONFIG_BINFMT_ELF) += elfcore.o
95obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
96obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
93obj-$(CONFIG_FUNCTION_TRACER) += trace/ 97obj-$(CONFIG_FUNCTION_TRACER) += trace/
94obj-$(CONFIG_TRACING) += trace/ 98obj-$(CONFIG_TRACING) += trace/
95obj-$(CONFIG_X86_DS) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
@@ -100,6 +104,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
100obj-$(CONFIG_PERF_EVENTS) += perf_event.o 104obj-$(CONFIG_PERF_EVENTS) += perf_event.o
101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 105obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
102obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 106obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
107obj-$(CONFIG_PADATA) += padata.o
103 108
104ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 109ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
105# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 110# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 4b05bd9479db..028e85663f27 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -548,6 +548,11 @@ int audit_remove_tree_rule(struct audit_krule *rule)
548 return 0; 548 return 0;
549} 549}
550 550
551static int compare_root(struct vfsmount *mnt, void *arg)
552{
553 return mnt->mnt_root->d_inode == arg;
554}
555
551void audit_trim_trees(void) 556void audit_trim_trees(void)
552{ 557{
553 struct list_head cursor; 558 struct list_head cursor;
@@ -559,7 +564,6 @@ void audit_trim_trees(void)
559 struct path path; 564 struct path path;
560 struct vfsmount *root_mnt; 565 struct vfsmount *root_mnt;
561 struct node *node; 566 struct node *node;
562 struct list_head list;
563 int err; 567 int err;
564 568
565 tree = container_of(cursor.next, struct audit_tree, list); 569 tree = container_of(cursor.next, struct audit_tree, list);
@@ -577,24 +581,16 @@ void audit_trim_trees(void)
577 if (!root_mnt) 581 if (!root_mnt)
578 goto skip_it; 582 goto skip_it;
579 583
580 list_add_tail(&list, &root_mnt->mnt_list);
581 spin_lock(&hash_lock); 584 spin_lock(&hash_lock);
582 list_for_each_entry(node, &tree->chunks, list) { 585 list_for_each_entry(node, &tree->chunks, list) {
583 struct audit_chunk *chunk = find_chunk(node); 586 struct inode *inode = find_chunk(node)->watch.inode;
584 struct inode *inode = chunk->watch.inode;
585 struct vfsmount *mnt;
586 node->index |= 1U<<31; 587 node->index |= 1U<<31;
587 list_for_each_entry(mnt, &list, mnt_list) { 588 if (iterate_mounts(compare_root, inode, root_mnt))
588 if (mnt->mnt_root->d_inode == inode) { 589 node->index &= ~(1U<<31);
589 node->index &= ~(1U<<31);
590 break;
591 }
592 }
593 } 590 }
594 spin_unlock(&hash_lock); 591 spin_unlock(&hash_lock);
595 trim_marked(tree); 592 trim_marked(tree);
596 put_tree(tree); 593 put_tree(tree);
597 list_del_init(&list);
598 drop_collected_mounts(root_mnt); 594 drop_collected_mounts(root_mnt);
599skip_it: 595skip_it:
600 mutex_lock(&audit_filter_mutex); 596 mutex_lock(&audit_filter_mutex);
@@ -603,22 +599,6 @@ skip_it:
603 mutex_unlock(&audit_filter_mutex); 599 mutex_unlock(&audit_filter_mutex);
604} 600}
605 601
606static int is_under(struct vfsmount *mnt, struct dentry *dentry,
607 struct path *path)
608{
609 if (mnt != path->mnt) {
610 for (;;) {
611 if (mnt->mnt_parent == mnt)
612 return 0;
613 if (mnt->mnt_parent == path->mnt)
614 break;
615 mnt = mnt->mnt_parent;
616 }
617 dentry = mnt->mnt_mountpoint;
618 }
619 return is_subdir(dentry, path->dentry);
620}
621
622int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) 602int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
623{ 603{
624 604
@@ -638,13 +618,17 @@ void audit_put_tree(struct audit_tree *tree)
638 put_tree(tree); 618 put_tree(tree);
639} 619}
640 620
621static int tag_mount(struct vfsmount *mnt, void *arg)
622{
623 return tag_chunk(mnt->mnt_root->d_inode, arg);
624}
625
641/* called with audit_filter_mutex */ 626/* called with audit_filter_mutex */
642int audit_add_tree_rule(struct audit_krule *rule) 627int audit_add_tree_rule(struct audit_krule *rule)
643{ 628{
644 struct audit_tree *seed = rule->tree, *tree; 629 struct audit_tree *seed = rule->tree, *tree;
645 struct path path; 630 struct path path;
646 struct vfsmount *mnt, *p; 631 struct vfsmount *mnt;
647 struct list_head list;
648 int err; 632 int err;
649 633
650 list_for_each_entry(tree, &tree_list, list) { 634 list_for_each_entry(tree, &tree_list, list) {
@@ -670,16 +654,9 @@ int audit_add_tree_rule(struct audit_krule *rule)
670 err = -ENOMEM; 654 err = -ENOMEM;
671 goto Err; 655 goto Err;
672 } 656 }
673 list_add_tail(&list, &mnt->mnt_list);
674 657
675 get_tree(tree); 658 get_tree(tree);
676 list_for_each_entry(p, &list, mnt_list) { 659 err = iterate_mounts(tag_mount, tree, mnt);
677 err = tag_chunk(p->mnt_root->d_inode, tree);
678 if (err)
679 break;
680 }
681
682 list_del(&list);
683 drop_collected_mounts(mnt); 660 drop_collected_mounts(mnt);
684 661
685 if (!err) { 662 if (!err) {
@@ -714,31 +691,23 @@ int audit_tag_tree(char *old, char *new)
714{ 691{
715 struct list_head cursor, barrier; 692 struct list_head cursor, barrier;
716 int failed = 0; 693 int failed = 0;
717 struct path path; 694 struct path path1, path2;
718 struct vfsmount *tagged; 695 struct vfsmount *tagged;
719 struct list_head list;
720 struct vfsmount *mnt;
721 struct dentry *dentry;
722 int err; 696 int err;
723 697
724 err = kern_path(new, 0, &path); 698 err = kern_path(new, 0, &path2);
725 if (err) 699 if (err)
726 return err; 700 return err;
727 tagged = collect_mounts(&path); 701 tagged = collect_mounts(&path2);
728 path_put(&path); 702 path_put(&path2);
729 if (!tagged) 703 if (!tagged)
730 return -ENOMEM; 704 return -ENOMEM;
731 705
732 err = kern_path(old, 0, &path); 706 err = kern_path(old, 0, &path1);
733 if (err) { 707 if (err) {
734 drop_collected_mounts(tagged); 708 drop_collected_mounts(tagged);
735 return err; 709 return err;
736 } 710 }
737 mnt = mntget(path.mnt);
738 dentry = dget(path.dentry);
739 path_put(&path);
740
741 list_add_tail(&list, &tagged->mnt_list);
742 711
743 mutex_lock(&audit_filter_mutex); 712 mutex_lock(&audit_filter_mutex);
744 list_add(&barrier, &tree_list); 713 list_add(&barrier, &tree_list);
@@ -746,7 +715,7 @@ int audit_tag_tree(char *old, char *new)
746 715
747 while (cursor.next != &tree_list) { 716 while (cursor.next != &tree_list) {
748 struct audit_tree *tree; 717 struct audit_tree *tree;
749 struct vfsmount *p; 718 int good_one = 0;
750 719
751 tree = container_of(cursor.next, struct audit_tree, list); 720 tree = container_of(cursor.next, struct audit_tree, list);
752 get_tree(tree); 721 get_tree(tree);
@@ -754,30 +723,19 @@ int audit_tag_tree(char *old, char *new)
754 list_add(&cursor, &tree->list); 723 list_add(&cursor, &tree->list);
755 mutex_unlock(&audit_filter_mutex); 724 mutex_unlock(&audit_filter_mutex);
756 725
757 err = kern_path(tree->pathname, 0, &path); 726 err = kern_path(tree->pathname, 0, &path2);
758 if (err) { 727 if (!err) {
759 put_tree(tree); 728 good_one = path_is_under(&path1, &path2);
760 mutex_lock(&audit_filter_mutex); 729 path_put(&path2);
761 continue;
762 } 730 }
763 731
764 spin_lock(&vfsmount_lock); 732 if (!good_one) {
765 if (!is_under(mnt, dentry, &path)) {
766 spin_unlock(&vfsmount_lock);
767 path_put(&path);
768 put_tree(tree); 733 put_tree(tree);
769 mutex_lock(&audit_filter_mutex); 734 mutex_lock(&audit_filter_mutex);
770 continue; 735 continue;
771 } 736 }
772 spin_unlock(&vfsmount_lock);
773 path_put(&path);
774
775 list_for_each_entry(p, &list, mnt_list) {
776 failed = tag_chunk(p->mnt_root->d_inode, tree);
777 if (failed)
778 break;
779 }
780 737
738 failed = iterate_mounts(tag_mount, tree, tagged);
781 if (failed) { 739 if (failed) {
782 put_tree(tree); 740 put_tree(tree);
783 mutex_lock(&audit_filter_mutex); 741 mutex_lock(&audit_filter_mutex);
@@ -818,10 +776,8 @@ int audit_tag_tree(char *old, char *new)
818 } 776 }
819 list_del(&barrier); 777 list_del(&barrier);
820 list_del(&cursor); 778 list_del(&cursor);
821 list_del(&list);
822 mutex_unlock(&audit_filter_mutex); 779 mutex_unlock(&audit_filter_mutex);
823 dput(dentry); 780 path_put(&path1);
824 mntput(mnt);
825 drop_collected_mounts(tagged); 781 drop_collected_mounts(tagged);
826 return failed; 782 return failed;
827} 783}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fc0f928167e7..f3a461c0970a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1988,7 +1988,6 @@ void __audit_inode(const char *name, const struct dentry *dentry)
1988 1988
1989/** 1989/**
1990 * audit_inode_child - collect inode info for created/removed objects 1990 * audit_inode_child - collect inode info for created/removed objects
1991 * @dname: inode's dentry name
1992 * @dentry: dentry being audited 1991 * @dentry: dentry being audited
1993 * @parent: inode of dentry parent 1992 * @parent: inode of dentry parent
1994 * 1993 *
@@ -2000,13 +1999,14 @@ void __audit_inode(const char *name, const struct dentry *dentry)
2000 * must be hooked prior, in order to capture the target inode during 1999 * must be hooked prior, in order to capture the target inode during
2001 * unsuccessful attempts. 2000 * unsuccessful attempts.
2002 */ 2001 */
2003void __audit_inode_child(const char *dname, const struct dentry *dentry, 2002void __audit_inode_child(const struct dentry *dentry,
2004 const struct inode *parent) 2003 const struct inode *parent)
2005{ 2004{
2006 int idx; 2005 int idx;
2007 struct audit_context *context = current->audit_context; 2006 struct audit_context *context = current->audit_context;
2008 const char *found_parent = NULL, *found_child = NULL; 2007 const char *found_parent = NULL, *found_child = NULL;
2009 const struct inode *inode = dentry->d_inode; 2008 const struct inode *inode = dentry->d_inode;
2009 const char *dname = dentry->d_name.name;
2010 int dirlen = 0; 2010 int dirlen = 0;
2011 2011
2012 if (!context->in_syscall) 2012 if (!context->in_syscall)
@@ -2014,9 +2014,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
2014 2014
2015 if (inode) 2015 if (inode)
2016 handle_one(inode); 2016 handle_one(inode);
2017 /* determine matching parent */
2018 if (!dname)
2019 goto add_names;
2020 2017
2021 /* parent is more likely, look for it first */ 2018 /* parent is more likely, look for it first */
2022 for (idx = 0; idx < context->name_count; idx++) { 2019 for (idx = 0; idx < context->name_count; idx++) {
diff --git a/kernel/capability.c b/kernel/capability.c
index 7f876e60521f..9e4697e9b276 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -135,7 +135,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
135 if (pid && (pid != task_pid_vnr(current))) { 135 if (pid && (pid != task_pid_vnr(current))) {
136 struct task_struct *target; 136 struct task_struct *target;
137 137
138 read_lock(&tasklist_lock); 138 rcu_read_lock();
139 139
140 target = find_task_by_vpid(pid); 140 target = find_task_by_vpid(pid);
141 if (!target) 141 if (!target)
@@ -143,7 +143,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
143 else 143 else
144 ret = security_capget(target, pEp, pIp, pPp); 144 ret = security_capget(target, pEp, pIp, pPp);
145 145
146 read_unlock(&tasklist_lock); 146 rcu_read_unlock();
147 } else 147 } else
148 ret = security_capget(current, pEp, pIp, pPp); 148 ret = security_capget(current, pEp, pIp, pPp);
149 149
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index aa3bee566446..4fd90e129772 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/cgroup.h> 25#include <linux/cgroup.h>
26#include <linux/module.h>
26#include <linux/ctype.h> 27#include <linux/ctype.h>
27#include <linux/errno.h> 28#include <linux/errno.h>
28#include <linux/fs.h> 29#include <linux/fs.h>
@@ -166,6 +167,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
166 */ 167 */
167static int need_forkexit_callback __read_mostly; 168static int need_forkexit_callback __read_mostly;
168 169
170#ifdef CONFIG_PROVE_LOCKING
171int cgroup_lock_is_held(void)
172{
173 return lockdep_is_held(&cgroup_mutex);
174}
175#else /* #ifdef CONFIG_PROVE_LOCKING */
176int cgroup_lock_is_held(void)
177{
178 return mutex_is_locked(&cgroup_mutex);
179}
180#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
181
182EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
183
169/* convenient tests for these bits */ 184/* convenient tests for these bits */
170inline int cgroup_is_removed(const struct cgroup *cgrp) 185inline int cgroup_is_removed(const struct cgroup *cgrp)
171{ 186{
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 677f25376a38..f8cced2692b3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -338,7 +338,7 @@ int __cpuinit cpu_up(unsigned int cpu)
338 if (!cpu_possible(cpu)) { 338 if (!cpu_possible(cpu)) {
339 printk(KERN_ERR "can't online cpu %d because it is not " 339 printk(KERN_ERR "can't online cpu %d because it is not "
340 "configured as may-hotadd at boot time\n", cpu); 340 "configured as may-hotadd at boot time\n", cpu);
341#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 341#if defined(CONFIG_IA64)
342 printk(KERN_ERR "please check additional_cpus= boot " 342 printk(KERN_ERR "please check additional_cpus= boot "
343 "parameter\n"); 343 "parameter\n");
344#endif 344#endif
diff --git a/kernel/early_res.c b/kernel/early_res.c
new file mode 100644
index 000000000000..3cb2c661bb78
--- /dev/null
+++ b/kernel/early_res.c
@@ -0,0 +1,578 @@
1/*
2 * early_res, could be used to replace bootmem
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/bootmem.h>
8#include <linux/mm.h>
9#include <linux/early_res.h>
10
11/*
12 * Early reserved memory areas.
13 */
14/*
15 * need to make sure this one is bigger enough before
16 * find_fw_memmap_area could be used
17 */
18#define MAX_EARLY_RES_X 32
19
20struct early_res {
21 u64 start, end;
22 char name[15];
23 char overlap_ok;
24};
25static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
26
27static int max_early_res __initdata = MAX_EARLY_RES_X;
28static struct early_res *early_res __initdata = &early_res_x[0];
29static int early_res_count __initdata;
30
31static int __init find_overlapped_early(u64 start, u64 end)
32{
33 int i;
34 struct early_res *r;
35
36 for (i = 0; i < max_early_res && early_res[i].end; i++) {
37 r = &early_res[i];
38 if (end > r->start && start < r->end)
39 break;
40 }
41
42 return i;
43}
44
45/*
46 * Drop the i-th range from the early reservation map,
47 * by copying any higher ranges down one over it, and
48 * clearing what had been the last slot.
49 */
50static void __init drop_range(int i)
51{
52 int j;
53
54 for (j = i + 1; j < max_early_res && early_res[j].end; j++)
55 ;
56
57 memmove(&early_res[i], &early_res[i + 1],
58 (j - 1 - i) * sizeof(struct early_res));
59
60 early_res[j - 1].end = 0;
61 early_res_count--;
62}
63
64static void __init drop_range_partial(int i, u64 start, u64 end)
65{
66 u64 common_start, common_end;
67 u64 old_start, old_end;
68
69 old_start = early_res[i].start;
70 old_end = early_res[i].end;
71 common_start = max(old_start, start);
72 common_end = min(old_end, end);
73
74 /* no overlap ? */
75 if (common_start >= common_end)
76 return;
77
78 if (old_start < common_start) {
79 /* make head segment */
80 early_res[i].end = common_start;
81 if (old_end > common_end) {
82 char name[15];
83
84 /*
85 * Save a local copy of the name, since the
86 * early_res array could get resized inside
87 * reserve_early_without_check() ->
88 * __check_and_double_early_res(), which would
89 * make the current name pointer invalid.
90 */
91 strncpy(name, early_res[i].name,
92 sizeof(early_res[i].name) - 1);
93 /* add another for left over on tail */
94 reserve_early_without_check(common_end, old_end, name);
95 }
96 return;
97 } else {
98 if (old_end > common_end) {
99 /* reuse the entry for tail left */
100 early_res[i].start = common_end;
101 return;
102 }
103 /* all covered */
104 drop_range(i);
105 }
106}
107
108/*
109 * Split any existing ranges that:
110 * 1) are marked 'overlap_ok', and
111 * 2) overlap with the stated range [start, end)
112 * into whatever portion (if any) of the existing range is entirely
113 * below or entirely above the stated range. Drop the portion
114 * of the existing range that overlaps with the stated range,
115 * which will allow the caller of this routine to then add that
116 * stated range without conflicting with any existing range.
117 */
118static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
119{
120 int i;
121 struct early_res *r;
122 u64 lower_start, lower_end;
123 u64 upper_start, upper_end;
124 char name[15];
125
126 for (i = 0; i < max_early_res && early_res[i].end; i++) {
127 r = &early_res[i];
128
129 /* Continue past non-overlapping ranges */
130 if (end <= r->start || start >= r->end)
131 continue;
132
133 /*
134 * Leave non-ok overlaps as is; let caller
135 * panic "Overlapping early reservations"
136 * when it hits this overlap.
137 */
138 if (!r->overlap_ok)
139 return;
140
141 /*
142 * We have an ok overlap. We will drop it from the early
143 * reservation map, and add back in any non-overlapping
144 * portions (lower or upper) as separate, overlap_ok,
145 * non-overlapping ranges.
146 */
147
148 /* 1. Note any non-overlapping (lower or upper) ranges. */
149 strncpy(name, r->name, sizeof(name) - 1);
150
151 lower_start = lower_end = 0;
152 upper_start = upper_end = 0;
153 if (r->start < start) {
154 lower_start = r->start;
155 lower_end = start;
156 }
157 if (r->end > end) {
158 upper_start = end;
159 upper_end = r->end;
160 }
161
162 /* 2. Drop the original ok overlapping range */
163 drop_range(i);
164
165 i--; /* resume for-loop on copied down entry */
166
167 /* 3. Add back in any non-overlapping ranges. */
168 if (lower_end)
169 reserve_early_overlap_ok(lower_start, lower_end, name);
170 if (upper_end)
171 reserve_early_overlap_ok(upper_start, upper_end, name);
172 }
173}
174
175static void __init __reserve_early(u64 start, u64 end, char *name,
176 int overlap_ok)
177{
178 int i;
179 struct early_res *r;
180
181 i = find_overlapped_early(start, end);
182 if (i >= max_early_res)
183 panic("Too many early reservations");
184 r = &early_res[i];
185 if (r->end)
186 panic("Overlapping early reservations "
187 "%llx-%llx %s to %llx-%llx %s\n",
188 start, end - 1, name ? name : "", r->start,
189 r->end - 1, r->name);
190 r->start = start;
191 r->end = end;
192 r->overlap_ok = overlap_ok;
193 if (name)
194 strncpy(r->name, name, sizeof(r->name) - 1);
195 early_res_count++;
196}
197
198/*
199 * A few early reservtations come here.
200 *
201 * The 'overlap_ok' in the name of this routine does -not- mean it
202 * is ok for these reservations to overlap an earlier reservation.
203 * Rather it means that it is ok for subsequent reservations to
204 * overlap this one.
205 *
206 * Use this entry point to reserve early ranges when you are doing
207 * so out of "Paranoia", reserving perhaps more memory than you need,
208 * just in case, and don't mind a subsequent overlapping reservation
209 * that is known to be needed.
210 *
211 * The drop_overlaps_that_are_ok() call here isn't really needed.
212 * It would be needed if we had two colliding 'overlap_ok'
213 * reservations, so that the second such would not panic on the
214 * overlap with the first. We don't have any such as of this
215 * writing, but might as well tolerate such if it happens in
216 * the future.
217 */
218void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
219{
220 drop_overlaps_that_are_ok(start, end);
221 __reserve_early(start, end, name, 1);
222}
223
224static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
225{
226 u64 start, end, size, mem;
227 struct early_res *new;
228
229 /* do we have enough slots left ? */
230 if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
231 return;
232
233 /* double it */
234 mem = -1ULL;
235 size = sizeof(struct early_res) * max_early_res * 2;
236 if (early_res == early_res_x)
237 start = 0;
238 else
239 start = early_res[0].end;
240 end = ex_start;
241 if (start + size < end)
242 mem = find_fw_memmap_area(start, end, size,
243 sizeof(struct early_res));
244 if (mem == -1ULL) {
245 start = ex_end;
246 end = get_max_mapped();
247 if (start + size < end)
248 mem = find_fw_memmap_area(start, end, size,
249 sizeof(struct early_res));
250 }
251 if (mem == -1ULL)
252 panic("can not find more space for early_res array");
253
254 new = __va(mem);
255 /* save the first one for own */
256 new[0].start = mem;
257 new[0].end = mem + size;
258 new[0].overlap_ok = 0;
259 /* copy old to new */
260 if (early_res == early_res_x) {
261 memcpy(&new[1], &early_res[0],
262 sizeof(struct early_res) * max_early_res);
263 memset(&new[max_early_res+1], 0,
264 sizeof(struct early_res) * (max_early_res - 1));
265 early_res_count++;
266 } else {
267 memcpy(&new[1], &early_res[1],
268 sizeof(struct early_res) * (max_early_res - 1));
269 memset(&new[max_early_res], 0,
270 sizeof(struct early_res) * max_early_res);
271 }
272 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
273 early_res = new;
274 max_early_res *= 2;
275 printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
276 max_early_res, mem, mem + size - 1);
277}
278
279/*
280 * Most early reservations come here.
281 *
282 * We first have drop_overlaps_that_are_ok() drop any pre-existing
283 * 'overlap_ok' ranges, so that we can then reserve this memory
284 * range without risk of panic'ing on an overlapping overlap_ok
285 * early reservation.
286 */
287void __init reserve_early(u64 start, u64 end, char *name)
288{
289 if (start >= end)
290 return;
291
292 __check_and_double_early_res(start, end);
293
294 drop_overlaps_that_are_ok(start, end);
295 __reserve_early(start, end, name, 0);
296}
297
298void __init reserve_early_without_check(u64 start, u64 end, char *name)
299{
300 struct early_res *r;
301
302 if (start >= end)
303 return;
304
305 __check_and_double_early_res(start, end);
306
307 r = &early_res[early_res_count];
308
309 r->start = start;
310 r->end = end;
311 r->overlap_ok = 0;
312 if (name)
313 strncpy(r->name, name, sizeof(r->name) - 1);
314 early_res_count++;
315}
316
317void __init free_early(u64 start, u64 end)
318{
319 struct early_res *r;
320 int i;
321
322 i = find_overlapped_early(start, end);
323 r = &early_res[i];
324 if (i >= max_early_res || r->end != end || r->start != start)
325 panic("free_early on not reserved area: %llx-%llx!",
326 start, end - 1);
327
328 drop_range(i);
329}
330
331void __init free_early_partial(u64 start, u64 end)
332{
333 struct early_res *r;
334 int i;
335
336try_next:
337 i = find_overlapped_early(start, end);
338 if (i >= max_early_res)
339 return;
340
341 r = &early_res[i];
342 /* hole ? */
343 if (r->end >= end && r->start <= start) {
344 drop_range_partial(i, start, end);
345 return;
346 }
347
348 drop_range_partial(i, start, end);
349 goto try_next;
350}
351
352#ifdef CONFIG_NO_BOOTMEM
353static void __init subtract_early_res(struct range *range, int az)
354{
355 int i, count;
356 u64 final_start, final_end;
357 int idx = 0;
358
359 count = 0;
360 for (i = 0; i < max_early_res && early_res[i].end; i++)
361 count++;
362
363 /* need to skip first one ?*/
364 if (early_res != early_res_x)
365 idx = 1;
366
367#define DEBUG_PRINT_EARLY_RES 1
368
369#if DEBUG_PRINT_EARLY_RES
370 printk(KERN_INFO "Subtract (%d early reservations)\n", count);
371#endif
372 for (i = idx; i < count; i++) {
373 struct early_res *r = &early_res[i];
374#if DEBUG_PRINT_EARLY_RES
375 printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
376 r->start, r->end, r->name);
377#endif
378 final_start = PFN_DOWN(r->start);
379 final_end = PFN_UP(r->end);
380 if (final_start >= final_end)
381 continue;
382 subtract_range(range, az, final_start, final_end);
383 }
384
385}
386
387int __init get_free_all_memory_range(struct range **rangep, int nodeid)
388{
389 int i, count;
390 u64 start = 0, end;
391 u64 size;
392 u64 mem;
393 struct range *range;
394 int nr_range;
395
396 count = 0;
397 for (i = 0; i < max_early_res && early_res[i].end; i++)
398 count++;
399
400 count *= 2;
401
402 size = sizeof(struct range) * count;
403 end = get_max_mapped();
404#ifdef MAX_DMA32_PFN
405 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
406 start = MAX_DMA32_PFN << PAGE_SHIFT;
407#endif
408 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
409 if (mem == -1ULL)
410 panic("can not find more space for range free");
411
412 range = __va(mem);
413 /* use early_node_map[] and early_res to get range array at first */
414 memset(range, 0, size);
415 nr_range = 0;
416
417 /* need to go over early_node_map to find out good range for node */
418 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
419#ifdef CONFIG_X86_32
420 subtract_range(range, count, max_low_pfn, -1ULL);
421#endif
422 subtract_early_res(range, count);
423 nr_range = clean_sort_range(range, count);
424
425 /* need to clear it ? */
426 if (nodeid == MAX_NUMNODES) {
427 memset(&early_res[0], 0,
428 sizeof(struct early_res) * max_early_res);
429 early_res = NULL;
430 max_early_res = 0;
431 }
432
433 *rangep = range;
434 return nr_range;
435}
436#else
437void __init early_res_to_bootmem(u64 start, u64 end)
438{
439 int i, count;
440 u64 final_start, final_end;
441 int idx = 0;
442
443 count = 0;
444 for (i = 0; i < max_early_res && early_res[i].end; i++)
445 count++;
446
447 /* need to skip first one ?*/
448 if (early_res != early_res_x)
449 idx = 1;
450
451 printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
452 count - idx, max_early_res, start, end);
453 for (i = idx; i < count; i++) {
454 struct early_res *r = &early_res[i];
455 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
456 r->start, r->end, r->name);
457 final_start = max(start, r->start);
458 final_end = min(end, r->end);
459 if (final_start >= final_end) {
460 printk(KERN_CONT "\n");
461 continue;
462 }
463 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
464 final_start, final_end);
465 reserve_bootmem_generic(final_start, final_end - final_start,
466 BOOTMEM_DEFAULT);
467 }
468 /* clear them */
469 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
470 early_res = NULL;
471 max_early_res = 0;
472 early_res_count = 0;
473}
474#endif
475
476/* Check for already reserved areas */
477static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
478{
479 int i;
480 u64 addr = *addrp;
481 int changed = 0;
482 struct early_res *r;
483again:
484 i = find_overlapped_early(addr, addr + size);
485 r = &early_res[i];
486 if (i < max_early_res && r->end) {
487 *addrp = addr = round_up(r->end, align);
488 changed = 1;
489 goto again;
490 }
491 return changed;
492}
493
494/* Check for already reserved areas */
495static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
496{
497 int i;
498 u64 addr = *addrp, last;
499 u64 size = *sizep;
500 int changed = 0;
501again:
502 last = addr + size;
503 for (i = 0; i < max_early_res && early_res[i].end; i++) {
504 struct early_res *r = &early_res[i];
505 if (last > r->start && addr < r->start) {
506 size = r->start - addr;
507 changed = 1;
508 goto again;
509 }
510 if (last > r->end && addr < r->end) {
511 addr = round_up(r->end, align);
512 size = last - addr;
513 changed = 1;
514 goto again;
515 }
516 if (last <= r->end && addr >= r->start) {
517 (*sizep)++;
518 return 0;
519 }
520 }
521 if (changed) {
522 *addrp = addr;
523 *sizep = size;
524 }
525 return changed;
526}
527
528/*
529 * Find a free area with specified alignment in a specific range.
530 * only with the area.between start to end is active range from early_node_map
531 * so they are good as RAM
532 */
533u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
534 u64 size, u64 align)
535{
536 u64 addr, last;
537
538 addr = round_up(ei_start, align);
539 if (addr < start)
540 addr = round_up(start, align);
541 if (addr >= ei_last)
542 goto out;
543 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
544 ;
545 last = addr + size;
546 if (last > ei_last)
547 goto out;
548 if (last > end)
549 goto out;
550
551 return addr;
552
553out:
554 return -1ULL;
555}
556
557u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
558 u64 *sizep, u64 align)
559{
560 u64 addr, last;
561
562 addr = round_up(ei_start, align);
563 if (addr < start)
564 addr = round_up(start, align);
565 if (addr >= ei_last)
566 goto out;
567 *sizep = ei_last - addr;
568 while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
569 ;
570 last = addr + *sizep;
571 if (last > ei_last)
572 goto out;
573
574 return addr;
575
576out:
577 return -1ULL;
578}
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
new file mode 100644
index 000000000000..ff915efef66d
--- /dev/null
+++ b/kernel/elfcore.c
@@ -0,0 +1,28 @@
1#include <linux/elf.h>
2#include <linux/fs.h>
3#include <linux/mm.h>
4
5#include <asm/elf.h>
6
7
8Elf_Half __weak elf_core_extra_phdrs(void)
9{
10 return 0;
11}
12
13int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
14 unsigned long limit)
15{
16 return 1;
17}
18
19int __weak elf_core_write_extra_data(struct file *file, size_t *size,
20 unsigned long limit)
21{
22 return 1;
23}
24
25size_t __weak elf_core_extra_data_size(void)
26{
27 return 0;
28}
diff --git a/kernel/exit.c b/kernel/exit.c
index 546774a31a66..ce1e48c2d93d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -85,7 +85,9 @@ static void __exit_signal(struct task_struct *tsk)
85 BUG_ON(!sig); 85 BUG_ON(!sig);
86 BUG_ON(!atomic_read(&sig->count)); 86 BUG_ON(!atomic_read(&sig->count));
87 87
88 sighand = rcu_dereference(tsk->sighand); 88 sighand = rcu_dereference_check(tsk->sighand,
89 rcu_read_lock_held() ||
90 lockdep_is_held(&tasklist_lock));
89 spin_lock(&sighand->siglock); 91 spin_lock(&sighand->siglock);
90 92
91 posix_cpu_timers_exit(tsk); 93 posix_cpu_timers_exit(tsk);
@@ -170,8 +172,10 @@ void release_task(struct task_struct * p)
170repeat: 172repeat:
171 tracehook_prepare_release_task(p); 173 tracehook_prepare_release_task(p);
172 /* don't need to get the RCU readlock here - the process is dead and 174 /* don't need to get the RCU readlock here - the process is dead and
173 * can't be modifying its own credentials */ 175 * can't be modifying its own credentials. But shut RCU-lockdep up */
176 rcu_read_lock();
174 atomic_dec(&__task_cred(p)->user->processes); 177 atomic_dec(&__task_cred(p)->user->processes);
178 rcu_read_unlock();
175 179
176 proc_flush_task(p); 180 proc_flush_task(p);
177 181
@@ -473,9 +477,11 @@ static void close_files(struct files_struct * files)
473 /* 477 /*
474 * It is safe to dereference the fd table without RCU or 478 * It is safe to dereference the fd table without RCU or
475 * ->file_lock because this is the last reference to the 479 * ->file_lock because this is the last reference to the
476 * files structure. 480 * files structure. But use RCU to shut RCU-lockdep up.
477 */ 481 */
482 rcu_read_lock();
478 fdt = files_fdtable(files); 483 fdt = files_fdtable(files);
484 rcu_read_unlock();
479 for (;;) { 485 for (;;) {
480 unsigned long set; 486 unsigned long set;
481 i = j * __NFDBITS; 487 i = j * __NFDBITS;
@@ -521,10 +527,12 @@ void put_files_struct(struct files_struct *files)
521 * at the end of the RCU grace period. Otherwise, 527 * at the end of the RCU grace period. Otherwise,
522 * you can free files immediately. 528 * you can free files immediately.
523 */ 529 */
530 rcu_read_lock();
524 fdt = files_fdtable(files); 531 fdt = files_fdtable(files);
525 if (fdt != &files->fdtab) 532 if (fdt != &files->fdtab)
526 kmem_cache_free(files_cachep, files); 533 kmem_cache_free(files_cachep, files);
527 free_fdtable(fdt); 534 free_fdtable(fdt);
535 rcu_read_unlock();
528 } 536 }
529} 537}
530 538
@@ -944,7 +952,8 @@ NORET_TYPE void do_exit(long code)
944 preempt_count()); 952 preempt_count());
945 953
946 acct_update_integrals(tsk); 954 acct_update_integrals(tsk);
947 955 /* sync mm's RSS info before statistics gathering */
956 sync_mm_rss(tsk, tsk->mm);
948 group_dead = atomic_dec_and_test(&tsk->signal->live); 957 group_dead = atomic_dec_and_test(&tsk->signal->live);
949 if (group_dead) { 958 if (group_dead) {
950 hrtimer_cancel(&tsk->signal->real_timer); 959 hrtimer_cancel(&tsk->signal->real_timer);
@@ -1180,7 +1189,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1180 1189
1181 if (unlikely(wo->wo_flags & WNOWAIT)) { 1190 if (unlikely(wo->wo_flags & WNOWAIT)) {
1182 int exit_code = p->exit_code; 1191 int exit_code = p->exit_code;
1183 int why, status; 1192 int why;
1184 1193
1185 get_task_struct(p); 1194 get_task_struct(p);
1186 read_unlock(&tasklist_lock); 1195 read_unlock(&tasklist_lock);
diff --git a/kernel/fork.c b/kernel/fork.c
index f88bd984df35..b0ec34abc0bb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -86,6 +86,7 @@ int max_threads; /* tunable limit on nr_threads */
86DEFINE_PER_CPU(unsigned long, process_counts) = 0; 86DEFINE_PER_CPU(unsigned long, process_counts) = 0;
87 87
88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
89EXPORT_SYMBOL_GPL(tasklist_lock);
89 90
90int nr_processes(void) 91int nr_processes(void)
91{ 92{
@@ -328,15 +329,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
328 if (!tmp) 329 if (!tmp)
329 goto fail_nomem; 330 goto fail_nomem;
330 *tmp = *mpnt; 331 *tmp = *mpnt;
332 INIT_LIST_HEAD(&tmp->anon_vma_chain);
331 pol = mpol_dup(vma_policy(mpnt)); 333 pol = mpol_dup(vma_policy(mpnt));
332 retval = PTR_ERR(pol); 334 retval = PTR_ERR(pol);
333 if (IS_ERR(pol)) 335 if (IS_ERR(pol))
334 goto fail_nomem_policy; 336 goto fail_nomem_policy;
335 vma_set_policy(tmp, pol); 337 vma_set_policy(tmp, pol);
338 if (anon_vma_fork(tmp, mpnt))
339 goto fail_nomem_anon_vma_fork;
336 tmp->vm_flags &= ~VM_LOCKED; 340 tmp->vm_flags &= ~VM_LOCKED;
337 tmp->vm_mm = mm; 341 tmp->vm_mm = mm;
338 tmp->vm_next = NULL; 342 tmp->vm_next = NULL;
339 anon_vma_link(tmp);
340 file = tmp->vm_file; 343 file = tmp->vm_file;
341 if (file) { 344 if (file) {
342 struct inode *inode = file->f_path.dentry->d_inode; 345 struct inode *inode = file->f_path.dentry->d_inode;
@@ -391,6 +394,8 @@ out:
391 flush_tlb_mm(oldmm); 394 flush_tlb_mm(oldmm);
392 up_write(&oldmm->mmap_sem); 395 up_write(&oldmm->mmap_sem);
393 return retval; 396 return retval;
397fail_nomem_anon_vma_fork:
398 mpol_put(pol);
394fail_nomem_policy: 399fail_nomem_policy:
395 kmem_cache_free(vm_area_cachep, tmp); 400 kmem_cache_free(vm_area_cachep, tmp);
396fail_nomem: 401fail_nomem:
@@ -454,8 +459,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
454 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; 459 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
455 mm->core_state = NULL; 460 mm->core_state = NULL;
456 mm->nr_ptes = 0; 461 mm->nr_ptes = 0;
457 set_mm_counter(mm, file_rss, 0); 462 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
458 set_mm_counter(mm, anon_rss, 0);
459 spin_lock_init(&mm->page_table_lock); 463 spin_lock_init(&mm->page_table_lock);
460 mm->free_area_cache = TASK_UNMAPPED_BASE; 464 mm->free_area_cache = TASK_UNMAPPED_BASE;
461 mm->cached_hole_size = ~0UL; 465 mm->cached_hole_size = ~0UL;
@@ -824,6 +828,8 @@ void __cleanup_sighand(struct sighand_struct *sighand)
824 */ 828 */
825static void posix_cpu_timers_init_group(struct signal_struct *sig) 829static void posix_cpu_timers_init_group(struct signal_struct *sig)
826{ 830{
831 unsigned long cpu_limit;
832
827 /* Thread group counters. */ 833 /* Thread group counters. */
828 thread_group_cputime_init(sig); 834 thread_group_cputime_init(sig);
829 835
@@ -838,9 +844,9 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
838 sig->cputime_expires.virt_exp = cputime_zero; 844 sig->cputime_expires.virt_exp = cputime_zero;
839 sig->cputime_expires.sched_exp = 0; 845 sig->cputime_expires.sched_exp = 0;
840 846
841 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 847 cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
842 sig->cputime_expires.prof_exp = 848 if (cpu_limit != RLIM_INFINITY) {
843 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 849 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
844 sig->cputimer.running = 1; 850 sig->cputimer.running = 1;
845 } 851 }
846 852
@@ -1033,7 +1039,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1033#endif 1039#endif
1034 retval = -EAGAIN; 1040 retval = -EAGAIN;
1035 if (atomic_read(&p->real_cred->user->processes) >= 1041 if (atomic_read(&p->real_cred->user->processes) >=
1036 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 1042 task_rlimit(p, RLIMIT_NPROC)) {
1037 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1043 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
1038 p->real_cred->user != INIT_USER) 1044 p->real_cred->user != INIT_USER)
1039 goto bad_fork_free; 1045 goto bad_fork_free;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 235716556bf1..d49afb2395e5 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -146,7 +146,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
146 struct task_struct *p; 146 struct task_struct *p;
147 147
148 ret = -ESRCH; 148 ret = -ESRCH;
149 read_lock(&tasklist_lock); 149 rcu_read_lock();
150 p = find_task_by_vpid(pid); 150 p = find_task_by_vpid(pid);
151 if (!p) 151 if (!p)
152 goto err_unlock; 152 goto err_unlock;
@@ -157,7 +157,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
157 !capable(CAP_SYS_PTRACE)) 157 !capable(CAP_SYS_PTRACE))
158 goto err_unlock; 158 goto err_unlock;
159 head = p->compat_robust_list; 159 head = p->compat_robust_list;
160 read_unlock(&tasklist_lock); 160 rcu_read_unlock();
161 } 161 }
162 162
163 if (put_user(sizeof(*head), len_ptr)) 163 if (put_user(sizeof(*head), len_ptr))
@@ -165,7 +165,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
165 return put_user(ptr_to_compat(head), head_ptr); 165 return put_user(ptr_to_compat(head), head_ptr);
166 166
167err_unlock: 167err_unlock:
168 read_unlock(&tasklist_lock); 168 rcu_read_unlock();
169 169
170 return ret; 170 return ret;
171} 171}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ecc3fa28f666..d70394f12ee9 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,11 +18,7 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22 * dynamic_irq_init - initialize a dynamically allocated irq
23 * @irq: irq number to initialize
24 */
25void dynamic_irq_init(unsigned int irq)
26{ 22{
27 struct irq_desc *desc; 23 struct irq_desc *desc;
28 unsigned long flags; 24 unsigned long flags;
@@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq)
41 desc->depth = 1; 37 desc->depth = 1;
42 desc->msi_desc = NULL; 38 desc->msi_desc = NULL;
43 desc->handler_data = NULL; 39 desc->handler_data = NULL;
44 desc->chip_data = NULL; 40 if (!keep_chip_data)
41 desc->chip_data = NULL;
45 desc->action = NULL; 42 desc->action = NULL;
46 desc->irq_count = 0; 43 desc->irq_count = 0;
47 desc->irqs_unhandled = 0; 44 desc->irqs_unhandled = 0;
@@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq)
55} 52}
56 53
57/** 54/**
58 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 55 * dynamic_irq_init - initialize a dynamically allocated irq
59 * @irq: irq number to initialize 56 * @irq: irq number to initialize
60 */ 57 */
61void dynamic_irq_cleanup(unsigned int irq) 58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
62{ 75{
63 struct irq_desc *desc = irq_to_desc(irq); 76 struct irq_desc *desc = irq_to_desc(irq);
64 unsigned long flags; 77 unsigned long flags;
@@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq)
77 } 90 }
78 desc->msi_desc = NULL; 91 desc->msi_desc = NULL;
79 desc->handler_data = NULL; 92 desc->handler_data = NULL;
80 desc->chip_data = NULL; 93 if (!keep_chip_data)
94 desc->chip_data = NULL;
81 desc->handle_irq = handle_bad_irq; 95 desc->handle_irq = handle_bad_irq;
82 desc->chip = &no_irq_chip; 96 desc->chip = &no_irq_chip;
83 desc->name = NULL; 97 desc->name = NULL;
@@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq)
85 raw_spin_unlock_irqrestore(&desc->lock, flags); 99 raw_spin_unlock_irqrestore(&desc->lock, flags);
86} 100}
87 101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to initialize
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to initialize
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
120}
121
88 122
89/** 123/**
90 * set_irq_chip - set the irq chip for an irq 124 * set_irq_chip - set the irq chip for an irq
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 814940e7f485..76d5a671bfe1 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -19,7 +19,7 @@
19#include <linux/kernel_stat.h> 19#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 20#include <linux/rculist.h>
21#include <linux/hash.h> 21#include <linux/hash.h>
22#include <linux/bootmem.h> 22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 23#include <trace/events/irq.h>
24 24
25#include "internals.h" 25#include "internals.h"
@@ -87,12 +87,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{ 87{
88 void *ptr; 88 void *ptr;
89 89
90 if (slab_is_available()) 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), 91 GFP_ATOMIC, node);
92 GFP_ATOMIC, node);
93 else
94 ptr = alloc_bootmem_node(NODE_DATA(node),
95 nr * sizeof(*desc->kstat_irqs));
96 92
97 /* 93 /*
98 * don't overwite if can not get new one 94 * don't overwite if can not get new one
@@ -132,7 +128,26 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
132 */ 128 */
133DEFINE_RAW_SPINLOCK(sparse_irq_lock); 129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
134 130
135struct irq_desc **irq_desc_ptrs __read_mostly; 131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
136 151
137static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { 152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
138 [0 ... NR_IRQS_LEGACY-1] = { 153 [0 ... NR_IRQS_LEGACY-1] = {
@@ -164,9 +179,6 @@ int __init early_irq_init(void)
164 legacy_count = ARRAY_SIZE(irq_desc_legacy); 179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
165 node = first_online_node; 180 node = first_online_node;
166 181
167 /* allocate irq_desc_ptrs array based on nr_irqs */
168 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
169
170 /* allocate based on nr_cpu_ids */ 182 /* allocate based on nr_cpu_ids */
171 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * 183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
172 sizeof(int), GFP_NOWAIT, node); 184 sizeof(int), GFP_NOWAIT, node);
@@ -180,23 +192,12 @@ int __init early_irq_init(void)
180 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
181 alloc_desc_masks(&desc[i], node, true); 193 alloc_desc_masks(&desc[i], node, true);
182 init_desc_masks(&desc[i]); 194 init_desc_masks(&desc[i]);
183 irq_desc_ptrs[i] = desc + i; 195 set_irq_desc(i, &desc[i]);
184 } 196 }
185 197
186 for (i = legacy_count; i < nr_irqs; i++)
187 irq_desc_ptrs[i] = NULL;
188
189 return arch_early_irq_init(); 198 return arch_early_irq_init();
190} 199}
191 200
192struct irq_desc *irq_to_desc(unsigned int irq)
193{
194 if (irq_desc_ptrs && irq < nr_irqs)
195 return irq_desc_ptrs[irq];
196
197 return NULL;
198}
199
200struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) 201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
201{ 202{
202 struct irq_desc *desc; 203 struct irq_desc *desc;
@@ -208,21 +209,18 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
208 return NULL; 209 return NULL;
209 } 210 }
210 211
211 desc = irq_desc_ptrs[irq]; 212 desc = irq_to_desc(irq);
212 if (desc) 213 if (desc)
213 return desc; 214 return desc;
214 215
215 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
216 217
217 /* We have to check it to avoid races with another CPU */ 218 /* We have to check it to avoid races with another CPU */
218 desc = irq_desc_ptrs[irq]; 219 desc = irq_to_desc(irq);
219 if (desc) 220 if (desc)
220 goto out_unlock; 221 goto out_unlock;
221 222
222 if (slab_is_available()) 223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224 else
225 desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
226 224
227 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); 225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
228 if (!desc) { 226 if (!desc) {
@@ -231,7 +229,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
231 } 229 }
232 init_one_irq_desc(irq, desc, node); 230 init_one_irq_desc(irq, desc, node);
233 231
234 irq_desc_ptrs[irq] = desc; 232 set_irq_desc(irq, desc);
235 233
236out_unlock: 234out_unlock:
237 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b2821f070a3d..c63f3bc88f0b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -21,11 +21,7 @@ extern void clear_kstat_irqs(struct irq_desc *desc);
21extern raw_spinlock_t sparse_irq_lock; 21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */ 24void replace_irq_desc(unsigned int irq, struct irq_desc *desc);
25extern struct irq_desc **irq_desc_ptrs;
26#else
27/* irq_desc_ptrs is a fixed size array */
28extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
29#endif 25#endif
30 26
31#ifdef CONFIG_PROC_FS 27#ifdef CONFIG_PROC_FS
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 26bac9d8f860..963559dbd858 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -70,7 +70,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
70 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 70 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
71 71
72 /* We have to check it to avoid races with another CPU */ 72 /* We have to check it to avoid races with another CPU */
73 desc = irq_desc_ptrs[irq]; 73 desc = irq_to_desc(irq);
74 74
75 if (desc && old_desc != desc) 75 if (desc && old_desc != desc)
76 goto out_unlock; 76 goto out_unlock;
@@ -90,7 +90,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
90 goto out_unlock; 90 goto out_unlock;
91 } 91 }
92 92
93 irq_desc_ptrs[irq] = desc; 93 replace_irq_desc(irq, desc);
94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
95 95
96 /* free the old one */ 96 /* free the old one */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ef077fb73155..87ebe8adc474 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -41,7 +41,7 @@
41#include <asm/sections.h> 41#include <asm/sections.h>
42 42
43/* Per cpu memory for storing cpu states in case of system crash. */ 43/* Per cpu memory for storing cpu states in case of system crash. */
44note_buf_t* crash_notes; 44note_buf_t __percpu *crash_notes;
45 45
46/* vmcoreinfo stuff */ 46/* vmcoreinfo stuff */
47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ccec774c716d..fa034d29cf73 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -42,9 +42,11 @@
42#include <linux/freezer.h> 42#include <linux/freezer.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/sysctl.h>
45#include <linux/kdebug.h> 46#include <linux/kdebug.h>
46#include <linux/memory.h> 47#include <linux/memory.h>
47#include <linux/ftrace.h> 48#include <linux/ftrace.h>
49#include <linux/cpu.h>
48 50
49#include <asm-generic/sections.h> 51#include <asm-generic/sections.h>
50#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
@@ -105,57 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
105 * stepping on the instruction on a vmalloced/kmalloced/data page 107 * stepping on the instruction on a vmalloced/kmalloced/data page
106 * is a recipe for disaster 108 * is a recipe for disaster
107 */ 109 */
108#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
109
110struct kprobe_insn_page { 110struct kprobe_insn_page {
111 struct list_head list; 111 struct list_head list;
112 kprobe_opcode_t *insns; /* Page of instruction slots */ 112 kprobe_opcode_t *insns; /* Page of instruction slots */
113 char slot_used[INSNS_PER_PAGE];
114 int nused; 113 int nused;
115 int ngarbage; 114 int ngarbage;
115 char slot_used[];
116};
117
118#define KPROBE_INSN_PAGE_SIZE(slots) \
119 (offsetof(struct kprobe_insn_page, slot_used) + \
120 (sizeof(char) * (slots)))
121
122struct kprobe_insn_cache {
123 struct list_head pages; /* list of kprobe_insn_page */
124 size_t insn_size; /* size of instruction slot */
125 int nr_garbage;
116}; 126};
117 127
128static int slots_per_page(struct kprobe_insn_cache *c)
129{
130 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
131}
132
118enum kprobe_slot_state { 133enum kprobe_slot_state {
119 SLOT_CLEAN = 0, 134 SLOT_CLEAN = 0,
120 SLOT_DIRTY = 1, 135 SLOT_DIRTY = 1,
121 SLOT_USED = 2, 136 SLOT_USED = 2,
122}; 137};
123 138
124static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 139static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */
125static LIST_HEAD(kprobe_insn_pages); 140static struct kprobe_insn_cache kprobe_insn_slots = {
126static int kprobe_garbage_slots; 141 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
127static int collect_garbage_slots(void); 142 .insn_size = MAX_INSN_SIZE,
143 .nr_garbage = 0,
144};
145static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
128 146
129/** 147/**
130 * __get_insn_slot() - Find a slot on an executable page for an instruction. 148 * __get_insn_slot() - Find a slot on an executable page for an instruction.
131 * We allocate an executable page if there's no room on existing ones. 149 * We allocate an executable page if there's no room on existing ones.
132 */ 150 */
133static kprobe_opcode_t __kprobes *__get_insn_slot(void) 151static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
134{ 152{
135 struct kprobe_insn_page *kip; 153 struct kprobe_insn_page *kip;
136 154
137 retry: 155 retry:
138 list_for_each_entry(kip, &kprobe_insn_pages, list) { 156 list_for_each_entry(kip, &c->pages, list) {
139 if (kip->nused < INSNS_PER_PAGE) { 157 if (kip->nused < slots_per_page(c)) {
140 int i; 158 int i;
141 for (i = 0; i < INSNS_PER_PAGE; i++) { 159 for (i = 0; i < slots_per_page(c); i++) {
142 if (kip->slot_used[i] == SLOT_CLEAN) { 160 if (kip->slot_used[i] == SLOT_CLEAN) {
143 kip->slot_used[i] = SLOT_USED; 161 kip->slot_used[i] = SLOT_USED;
144 kip->nused++; 162 kip->nused++;
145 return kip->insns + (i * MAX_INSN_SIZE); 163 return kip->insns + (i * c->insn_size);
146 } 164 }
147 } 165 }
148 /* Surprise! No unused slots. Fix kip->nused. */ 166 /* kip->nused is broken. Fix it. */
149 kip->nused = INSNS_PER_PAGE; 167 kip->nused = slots_per_page(c);
168 WARN_ON(1);
150 } 169 }
151 } 170 }
152 171
153 /* If there are any garbage slots, collect it and try again. */ 172 /* If there are any garbage slots, collect it and try again. */
154 if (kprobe_garbage_slots && collect_garbage_slots() == 0) { 173 if (c->nr_garbage && collect_garbage_slots(c) == 0)
155 goto retry; 174 goto retry;
156 } 175
157 /* All out of space. Need to allocate a new page. Use slot 0. */ 176 /* All out of space. Need to allocate a new page. */
158 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 177 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
159 if (!kip) 178 if (!kip)
160 return NULL; 179 return NULL;
161 180
@@ -170,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
170 return NULL; 189 return NULL;
171 } 190 }
172 INIT_LIST_HEAD(&kip->list); 191 INIT_LIST_HEAD(&kip->list);
173 list_add(&kip->list, &kprobe_insn_pages); 192 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
174 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
175 kip->slot_used[0] = SLOT_USED; 193 kip->slot_used[0] = SLOT_USED;
176 kip->nused = 1; 194 kip->nused = 1;
177 kip->ngarbage = 0; 195 kip->ngarbage = 0;
196 list_add(&kip->list, &c->pages);
178 return kip->insns; 197 return kip->insns;
179} 198}
180 199
200
181kprobe_opcode_t __kprobes *get_insn_slot(void) 201kprobe_opcode_t __kprobes *get_insn_slot(void)
182{ 202{
183 kprobe_opcode_t *ret; 203 kprobe_opcode_t *ret = NULL;
204
184 mutex_lock(&kprobe_insn_mutex); 205 mutex_lock(&kprobe_insn_mutex);
185 ret = __get_insn_slot(); 206 ret = __get_insn_slot(&kprobe_insn_slots);
186 mutex_unlock(&kprobe_insn_mutex); 207 mutex_unlock(&kprobe_insn_mutex);
208
187 return ret; 209 return ret;
188} 210}
189 211
@@ -199,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
199 * so as not to have to set it up again the 221 * so as not to have to set it up again the
200 * next time somebody inserts a probe. 222 * next time somebody inserts a probe.
201 */ 223 */
202 if (!list_is_singular(&kprobe_insn_pages)) { 224 if (!list_is_singular(&kip->list)) {
203 list_del(&kip->list); 225 list_del(&kip->list);
204 module_free(NULL, kip->insns); 226 module_free(NULL, kip->insns);
205 kfree(kip); 227 kfree(kip);
@@ -209,51 +231,84 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
209 return 0; 231 return 0;
210} 232}
211 233
212static int __kprobes collect_garbage_slots(void) 234static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
213{ 235{
214 struct kprobe_insn_page *kip, *next; 236 struct kprobe_insn_page *kip, *next;
215 237
216 /* Ensure no-one is interrupted on the garbages */ 238 /* Ensure no-one is interrupted on the garbages */
217 synchronize_sched(); 239 synchronize_sched();
218 240
219 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { 241 list_for_each_entry_safe(kip, next, &c->pages, list) {
220 int i; 242 int i;
221 if (kip->ngarbage == 0) 243 if (kip->ngarbage == 0)
222 continue; 244 continue;
223 kip->ngarbage = 0; /* we will collect all garbages */ 245 kip->ngarbage = 0; /* we will collect all garbages */
224 for (i = 0; i < INSNS_PER_PAGE; i++) { 246 for (i = 0; i < slots_per_page(c); i++) {
225 if (kip->slot_used[i] == SLOT_DIRTY && 247 if (kip->slot_used[i] == SLOT_DIRTY &&
226 collect_one_slot(kip, i)) 248 collect_one_slot(kip, i))
227 break; 249 break;
228 } 250 }
229 } 251 }
230 kprobe_garbage_slots = 0; 252 c->nr_garbage = 0;
231 return 0; 253 return 0;
232} 254}
233 255
234void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 256static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
257 kprobe_opcode_t *slot, int dirty)
235{ 258{
236 struct kprobe_insn_page *kip; 259 struct kprobe_insn_page *kip;
237 260
238 mutex_lock(&kprobe_insn_mutex); 261 list_for_each_entry(kip, &c->pages, list) {
239 list_for_each_entry(kip, &kprobe_insn_pages, list) { 262 long idx = ((long)slot - (long)kip->insns) / c->insn_size;
240 if (kip->insns <= slot && 263 if (idx >= 0 && idx < slots_per_page(c)) {
241 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 264 WARN_ON(kip->slot_used[idx] != SLOT_USED);
242 int i = (slot - kip->insns) / MAX_INSN_SIZE;
243 if (dirty) { 265 if (dirty) {
244 kip->slot_used[i] = SLOT_DIRTY; 266 kip->slot_used[idx] = SLOT_DIRTY;
245 kip->ngarbage++; 267 kip->ngarbage++;
268 if (++c->nr_garbage > slots_per_page(c))
269 collect_garbage_slots(c);
246 } else 270 } else
247 collect_one_slot(kip, i); 271 collect_one_slot(kip, idx);
248 break; 272 return;
249 } 273 }
250 } 274 }
275 /* Could not free this slot. */
276 WARN_ON(1);
277}
251 278
252 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) 279void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
253 collect_garbage_slots(); 280{
254 281 mutex_lock(&kprobe_insn_mutex);
282 __free_insn_slot(&kprobe_insn_slots, slot, dirty);
255 mutex_unlock(&kprobe_insn_mutex); 283 mutex_unlock(&kprobe_insn_mutex);
256} 284}
285#ifdef CONFIG_OPTPROBES
286/* For optimized_kprobe buffer */
287static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */
288static struct kprobe_insn_cache kprobe_optinsn_slots = {
289 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
290 /* .insn_size is initialized later */
291 .nr_garbage = 0,
292};
293/* Get a slot for optimized_kprobe buffer */
294kprobe_opcode_t __kprobes *get_optinsn_slot(void)
295{
296 kprobe_opcode_t *ret = NULL;
297
298 mutex_lock(&kprobe_optinsn_mutex);
299 ret = __get_insn_slot(&kprobe_optinsn_slots);
300 mutex_unlock(&kprobe_optinsn_mutex);
301
302 return ret;
303}
304
305void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
306{
307 mutex_lock(&kprobe_optinsn_mutex);
308 __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
309 mutex_unlock(&kprobe_optinsn_mutex);
310}
311#endif
257#endif 312#endif
258 313
259/* We have preemption disabled.. so it is safe to use __ versions */ 314/* We have preemption disabled.. so it is safe to use __ versions */
@@ -284,23 +339,401 @@ struct kprobe __kprobes *get_kprobe(void *addr)
284 if (p->addr == addr) 339 if (p->addr == addr)
285 return p; 340 return p;
286 } 341 }
342
343 return NULL;
344}
345
346static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
347
348/* Return true if the kprobe is an aggregator */
349static inline int kprobe_aggrprobe(struct kprobe *p)
350{
351 return p->pre_handler == aggr_pre_handler;
352}
353
354/*
355 * Keep all fields in the kprobe consistent
356 */
357static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
358{
359 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
360 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
361}
362
363#ifdef CONFIG_OPTPROBES
364/* NOTE: change this value only with kprobe_mutex held */
365static bool kprobes_allow_optimization;
366
367/*
368 * Call all pre_handler on the list, but ignores its return value.
369 * This must be called from arch-dep optimized caller.
370 */
371void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
372{
373 struct kprobe *kp;
374
375 list_for_each_entry_rcu(kp, &p->list, list) {
376 if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
377 set_kprobe_instance(kp);
378 kp->pre_handler(kp, regs);
379 }
380 reset_kprobe_instance();
381 }
382}
383
384/* Return true(!0) if the kprobe is ready for optimization. */
385static inline int kprobe_optready(struct kprobe *p)
386{
387 struct optimized_kprobe *op;
388
389 if (kprobe_aggrprobe(p)) {
390 op = container_of(p, struct optimized_kprobe, kp);
391 return arch_prepared_optinsn(&op->optinsn);
392 }
393
394 return 0;
395}
396
397/*
398 * Return an optimized kprobe whose optimizing code replaces
399 * instructions including addr (exclude breakpoint).
400 */
401struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
402{
403 int i;
404 struct kprobe *p = NULL;
405 struct optimized_kprobe *op;
406
407 /* Don't check i == 0, since that is a breakpoint case. */
408 for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
409 p = get_kprobe((void *)(addr - i));
410
411 if (p && kprobe_optready(p)) {
412 op = container_of(p, struct optimized_kprobe, kp);
413 if (arch_within_optimized_kprobe(op, addr))
414 return p;
415 }
416
287 return NULL; 417 return NULL;
288} 418}
289 419
420/* Optimization staging list, protected by kprobe_mutex */
421static LIST_HEAD(optimizing_list);
422
423static void kprobe_optimizer(struct work_struct *work);
424static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
425#define OPTIMIZE_DELAY 5
426
427/* Kprobe jump optimizer */
428static __kprobes void kprobe_optimizer(struct work_struct *work)
429{
430 struct optimized_kprobe *op, *tmp;
431
432 /* Lock modules while optimizing kprobes */
433 mutex_lock(&module_mutex);
434 mutex_lock(&kprobe_mutex);
435 if (kprobes_all_disarmed || !kprobes_allow_optimization)
436 goto end;
437
438 /*
439 * Wait for quiesence period to ensure all running interrupts
440 * are done. Because optprobe may modify multiple instructions
441 * there is a chance that Nth instruction is interrupted. In that
442 * case, running interrupt can return to 2nd-Nth byte of jump
443 * instruction. This wait is for avoiding it.
444 */
445 synchronize_sched();
446
447 /*
448 * The optimization/unoptimization refers online_cpus via
449 * stop_machine() and cpu-hotplug modifies online_cpus.
450 * And same time, text_mutex will be held in cpu-hotplug and here.
451 * This combination can cause a deadlock (cpu-hotplug try to lock
452 * text_mutex but stop_machine can not be done because online_cpus
453 * has been changed)
454 * To avoid this deadlock, we need to call get_online_cpus()
455 * for preventing cpu-hotplug outside of text_mutex locking.
456 */
457 get_online_cpus();
458 mutex_lock(&text_mutex);
459 list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
460 WARN_ON(kprobe_disabled(&op->kp));
461 if (arch_optimize_kprobe(op) < 0)
462 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
463 list_del_init(&op->list);
464 }
465 mutex_unlock(&text_mutex);
466 put_online_cpus();
467end:
468 mutex_unlock(&kprobe_mutex);
469 mutex_unlock(&module_mutex);
470}
471
472/* Optimize kprobe if p is ready to be optimized */
473static __kprobes void optimize_kprobe(struct kprobe *p)
474{
475 struct optimized_kprobe *op;
476
477 /* Check if the kprobe is disabled or not ready for optimization. */
478 if (!kprobe_optready(p) || !kprobes_allow_optimization ||
479 (kprobe_disabled(p) || kprobes_all_disarmed))
480 return;
481
482 /* Both of break_handler and post_handler are not supported. */
483 if (p->break_handler || p->post_handler)
484 return;
485
486 op = container_of(p, struct optimized_kprobe, kp);
487
488 /* Check there is no other kprobes at the optimized instructions */
489 if (arch_check_optimized_kprobe(op) < 0)
490 return;
491
492 /* Check if it is already optimized. */
493 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
494 return;
495
496 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
497 list_add(&op->list, &optimizing_list);
498 if (!delayed_work_pending(&optimizing_work))
499 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
500}
501
502/* Unoptimize a kprobe if p is optimized */
503static __kprobes void unoptimize_kprobe(struct kprobe *p)
504{
505 struct optimized_kprobe *op;
506
507 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
508 op = container_of(p, struct optimized_kprobe, kp);
509 if (!list_empty(&op->list))
510 /* Dequeue from the optimization queue */
511 list_del_init(&op->list);
512 else
513 /* Replace jump with break */
514 arch_unoptimize_kprobe(op);
515 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
516 }
517}
518
519/* Remove optimized instructions */
520static void __kprobes kill_optimized_kprobe(struct kprobe *p)
521{
522 struct optimized_kprobe *op;
523
524 op = container_of(p, struct optimized_kprobe, kp);
525 if (!list_empty(&op->list)) {
526 /* Dequeue from the optimization queue */
527 list_del_init(&op->list);
528 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
529 }
530 /* Don't unoptimize, because the target code will be freed. */
531 arch_remove_optimized_kprobe(op);
532}
533
534/* Try to prepare optimized instructions */
535static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
536{
537 struct optimized_kprobe *op;
538
539 op = container_of(p, struct optimized_kprobe, kp);
540 arch_prepare_optimized_kprobe(op);
541}
542
543/* Free optimized instructions and optimized_kprobe */
544static __kprobes void free_aggr_kprobe(struct kprobe *p)
545{
546 struct optimized_kprobe *op;
547
548 op = container_of(p, struct optimized_kprobe, kp);
549 arch_remove_optimized_kprobe(op);
550 kfree(op);
551}
552
553/* Allocate new optimized_kprobe and try to prepare optimized instructions */
554static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
555{
556 struct optimized_kprobe *op;
557
558 op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
559 if (!op)
560 return NULL;
561
562 INIT_LIST_HEAD(&op->list);
563 op->kp.addr = p->addr;
564 arch_prepare_optimized_kprobe(op);
565
566 return &op->kp;
567}
568
569static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
570
571/*
572 * Prepare an optimized_kprobe and optimize it
573 * NOTE: p must be a normal registered kprobe
574 */
575static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
576{
577 struct kprobe *ap;
578 struct optimized_kprobe *op;
579
580 ap = alloc_aggr_kprobe(p);
581 if (!ap)
582 return;
583
584 op = container_of(ap, struct optimized_kprobe, kp);
585 if (!arch_prepared_optinsn(&op->optinsn)) {
586 /* If failed to setup optimizing, fallback to kprobe */
587 free_aggr_kprobe(ap);
588 return;
589 }
590
591 init_aggr_kprobe(ap, p);
592 optimize_kprobe(ap);
593}
594
595#ifdef CONFIG_SYSCTL
596static void __kprobes optimize_all_kprobes(void)
597{
598 struct hlist_head *head;
599 struct hlist_node *node;
600 struct kprobe *p;
601 unsigned int i;
602
603 /* If optimization is already allowed, just return */
604 if (kprobes_allow_optimization)
605 return;
606
607 kprobes_allow_optimization = true;
608 mutex_lock(&text_mutex);
609 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
610 head = &kprobe_table[i];
611 hlist_for_each_entry_rcu(p, node, head, hlist)
612 if (!kprobe_disabled(p))
613 optimize_kprobe(p);
614 }
615 mutex_unlock(&text_mutex);
616 printk(KERN_INFO "Kprobes globally optimized\n");
617}
618
619static void __kprobes unoptimize_all_kprobes(void)
620{
621 struct hlist_head *head;
622 struct hlist_node *node;
623 struct kprobe *p;
624 unsigned int i;
625
626 /* If optimization is already prohibited, just return */
627 if (!kprobes_allow_optimization)
628 return;
629
630 kprobes_allow_optimization = false;
631 printk(KERN_INFO "Kprobes globally unoptimized\n");
632 get_online_cpus(); /* For avoiding text_mutex deadlock */
633 mutex_lock(&text_mutex);
634 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
635 head = &kprobe_table[i];
636 hlist_for_each_entry_rcu(p, node, head, hlist) {
637 if (!kprobe_disabled(p))
638 unoptimize_kprobe(p);
639 }
640 }
641
642 mutex_unlock(&text_mutex);
643 put_online_cpus();
644 /* Allow all currently running kprobes to complete */
645 synchronize_sched();
646}
647
648int sysctl_kprobes_optimization;
649int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
650 void __user *buffer, size_t *length,
651 loff_t *ppos)
652{
653 int ret;
654
655 mutex_lock(&kprobe_mutex);
656 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
657 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
658
659 if (sysctl_kprobes_optimization)
660 optimize_all_kprobes();
661 else
662 unoptimize_all_kprobes();
663 mutex_unlock(&kprobe_mutex);
664
665 return ret;
666}
667#endif /* CONFIG_SYSCTL */
668
669static void __kprobes __arm_kprobe(struct kprobe *p)
670{
671 struct kprobe *old_p;
672
673 /* Check collision with other optimized kprobes */
674 old_p = get_optimized_kprobe((unsigned long)p->addr);
675 if (unlikely(old_p))
676 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
677
678 arch_arm_kprobe(p);
679 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
680}
681
682static void __kprobes __disarm_kprobe(struct kprobe *p)
683{
684 struct kprobe *old_p;
685
686 unoptimize_kprobe(p); /* Try to unoptimize */
687 arch_disarm_kprobe(p);
688
689 /* If another kprobe was blocked, optimize it. */
690 old_p = get_optimized_kprobe((unsigned long)p->addr);
691 if (unlikely(old_p))
692 optimize_kprobe(old_p);
693}
694
695#else /* !CONFIG_OPTPROBES */
696
697#define optimize_kprobe(p) do {} while (0)
698#define unoptimize_kprobe(p) do {} while (0)
699#define kill_optimized_kprobe(p) do {} while (0)
700#define prepare_optimized_kprobe(p) do {} while (0)
701#define try_to_optimize_kprobe(p) do {} while (0)
702#define __arm_kprobe(p) arch_arm_kprobe(p)
703#define __disarm_kprobe(p) arch_disarm_kprobe(p)
704
705static __kprobes void free_aggr_kprobe(struct kprobe *p)
706{
707 kfree(p);
708}
709
710static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
711{
712 return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
713}
714#endif /* CONFIG_OPTPROBES */
715
290/* Arm a kprobe with text_mutex */ 716/* Arm a kprobe with text_mutex */
291static void __kprobes arm_kprobe(struct kprobe *kp) 717static void __kprobes arm_kprobe(struct kprobe *kp)
292{ 718{
719 /*
720 * Here, since __arm_kprobe() doesn't use stop_machine(),
721 * this doesn't cause deadlock on text_mutex. So, we don't
722 * need get_online_cpus().
723 */
293 mutex_lock(&text_mutex); 724 mutex_lock(&text_mutex);
294 arch_arm_kprobe(kp); 725 __arm_kprobe(kp);
295 mutex_unlock(&text_mutex); 726 mutex_unlock(&text_mutex);
296} 727}
297 728
298/* Disarm a kprobe with text_mutex */ 729/* Disarm a kprobe with text_mutex */
299static void __kprobes disarm_kprobe(struct kprobe *kp) 730static void __kprobes disarm_kprobe(struct kprobe *kp)
300{ 731{
732 get_online_cpus(); /* For avoiding text_mutex deadlock */
301 mutex_lock(&text_mutex); 733 mutex_lock(&text_mutex);
302 arch_disarm_kprobe(kp); 734 __disarm_kprobe(kp);
303 mutex_unlock(&text_mutex); 735 mutex_unlock(&text_mutex);
736 put_online_cpus();
304} 737}
305 738
306/* 739/*
@@ -369,7 +802,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
369void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) 802void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
370{ 803{
371 struct kprobe *kp; 804 struct kprobe *kp;
372 if (p->pre_handler != aggr_pre_handler) { 805 if (!kprobe_aggrprobe(p)) {
373 p->nmissed++; 806 p->nmissed++;
374 } else { 807 } else {
375 list_for_each_entry_rcu(kp, &p->list, list) 808 list_for_each_entry_rcu(kp, &p->list, list)
@@ -493,21 +926,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
493} 926}
494 927
495/* 928/*
496 * Keep all fields in the kprobe consistent
497 */
498static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
499{
500 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
501 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
502}
503
504/*
505* Add the new probe to ap->list. Fail if this is the 929* Add the new probe to ap->list. Fail if this is the
506* second jprobe at the address - two jprobes can't coexist 930* second jprobe at the address - two jprobes can't coexist
507*/ 931*/
508static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) 932static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
509{ 933{
510 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 934 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
935
936 if (p->break_handler || p->post_handler)
937 unoptimize_kprobe(ap); /* Fall back to normal kprobe */
938
511 if (p->break_handler) { 939 if (p->break_handler) {
512 if (ap->break_handler) 940 if (ap->break_handler)
513 return -EEXIST; 941 return -EEXIST;
@@ -522,7 +950,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
522 ap->flags &= ~KPROBE_FLAG_DISABLED; 950 ap->flags &= ~KPROBE_FLAG_DISABLED;
523 if (!kprobes_all_disarmed) 951 if (!kprobes_all_disarmed)
524 /* Arm the breakpoint again. */ 952 /* Arm the breakpoint again. */
525 arm_kprobe(ap); 953 __arm_kprobe(ap);
526 } 954 }
527 return 0; 955 return 0;
528} 956}
@@ -531,12 +959,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
531 * Fill in the required fields of the "manager kprobe". Replace the 959 * Fill in the required fields of the "manager kprobe". Replace the
532 * earlier kprobe in the hlist with the manager kprobe 960 * earlier kprobe in the hlist with the manager kprobe
533 */ 961 */
534static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 962static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
535{ 963{
964 /* Copy p's insn slot to ap */
536 copy_kprobe(p, ap); 965 copy_kprobe(p, ap);
537 flush_insn_slot(ap); 966 flush_insn_slot(ap);
538 ap->addr = p->addr; 967 ap->addr = p->addr;
539 ap->flags = p->flags; 968 ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
540 ap->pre_handler = aggr_pre_handler; 969 ap->pre_handler = aggr_pre_handler;
541 ap->fault_handler = aggr_fault_handler; 970 ap->fault_handler = aggr_fault_handler;
542 /* We don't care the kprobe which has gone. */ 971 /* We don't care the kprobe which has gone. */
@@ -546,8 +975,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
546 ap->break_handler = aggr_break_handler; 975 ap->break_handler = aggr_break_handler;
547 976
548 INIT_LIST_HEAD(&ap->list); 977 INIT_LIST_HEAD(&ap->list);
549 list_add_rcu(&p->list, &ap->list); 978 INIT_HLIST_NODE(&ap->hlist);
550 979
980 list_add_rcu(&p->list, &ap->list);
551 hlist_replace_rcu(&p->hlist, &ap->hlist); 981 hlist_replace_rcu(&p->hlist, &ap->hlist);
552} 982}
553 983
@@ -561,12 +991,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
561 int ret = 0; 991 int ret = 0;
562 struct kprobe *ap = old_p; 992 struct kprobe *ap = old_p;
563 993
564 if (old_p->pre_handler != aggr_pre_handler) { 994 if (!kprobe_aggrprobe(old_p)) {
565 /* If old_p is not an aggr_probe, create new aggr_kprobe. */ 995 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
566 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 996 ap = alloc_aggr_kprobe(old_p);
567 if (!ap) 997 if (!ap)
568 return -ENOMEM; 998 return -ENOMEM;
569 add_aggr_kprobe(ap, old_p); 999 init_aggr_kprobe(ap, old_p);
570 } 1000 }
571 1001
572 if (kprobe_gone(ap)) { 1002 if (kprobe_gone(ap)) {
@@ -585,6 +1015,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
585 */ 1015 */
586 return ret; 1016 return ret;
587 1017
1018 /* Prepare optimized instructions if possible. */
1019 prepare_optimized_kprobe(ap);
1020
588 /* 1021 /*
589 * Clear gone flag to prevent allocating new slot again, and 1022 * Clear gone flag to prevent allocating new slot again, and
590 * set disabled flag because it is not armed yet. 1023 * set disabled flag because it is not armed yet.
@@ -593,6 +1026,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
593 | KPROBE_FLAG_DISABLED; 1026 | KPROBE_FLAG_DISABLED;
594 } 1027 }
595 1028
1029 /* Copy ap's insn slot to p */
596 copy_kprobe(ap, p); 1030 copy_kprobe(ap, p);
597 return add_new_kprobe(ap, p); 1031 return add_new_kprobe(ap, p);
598} 1032}
@@ -743,27 +1177,34 @@ int __kprobes register_kprobe(struct kprobe *p)
743 p->nmissed = 0; 1177 p->nmissed = 0;
744 INIT_LIST_HEAD(&p->list); 1178 INIT_LIST_HEAD(&p->list);
745 mutex_lock(&kprobe_mutex); 1179 mutex_lock(&kprobe_mutex);
1180
1181 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1182 mutex_lock(&text_mutex);
1183
746 old_p = get_kprobe(p->addr); 1184 old_p = get_kprobe(p->addr);
747 if (old_p) { 1185 if (old_p) {
1186 /* Since this may unoptimize old_p, locking text_mutex. */
748 ret = register_aggr_kprobe(old_p, p); 1187 ret = register_aggr_kprobe(old_p, p);
749 goto out; 1188 goto out;
750 } 1189 }
751 1190
752 mutex_lock(&text_mutex);
753 ret = arch_prepare_kprobe(p); 1191 ret = arch_prepare_kprobe(p);
754 if (ret) 1192 if (ret)
755 goto out_unlock_text; 1193 goto out;
756 1194
757 INIT_HLIST_NODE(&p->hlist); 1195 INIT_HLIST_NODE(&p->hlist);
758 hlist_add_head_rcu(&p->hlist, 1196 hlist_add_head_rcu(&p->hlist,
759 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 1197 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
760 1198
761 if (!kprobes_all_disarmed && !kprobe_disabled(p)) 1199 if (!kprobes_all_disarmed && !kprobe_disabled(p))
762 arch_arm_kprobe(p); 1200 __arm_kprobe(p);
1201
1202 /* Try to optimize kprobe */
1203 try_to_optimize_kprobe(p);
763 1204
764out_unlock_text:
765 mutex_unlock(&text_mutex);
766out: 1205out:
1206 mutex_unlock(&text_mutex);
1207 put_online_cpus();
767 mutex_unlock(&kprobe_mutex); 1208 mutex_unlock(&kprobe_mutex);
768 1209
769 if (probed_mod) 1210 if (probed_mod)
@@ -785,7 +1226,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
785 return -EINVAL; 1226 return -EINVAL;
786 1227
787 if (old_p == p || 1228 if (old_p == p ||
788 (old_p->pre_handler == aggr_pre_handler && 1229 (kprobe_aggrprobe(old_p) &&
789 list_is_singular(&old_p->list))) { 1230 list_is_singular(&old_p->list))) {
790 /* 1231 /*
791 * Only probe on the hash list. Disarm only if kprobes are 1232 * Only probe on the hash list. Disarm only if kprobes are
@@ -793,7 +1234,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
793 * already have been removed. We save on flushing icache. 1234 * already have been removed. We save on flushing icache.
794 */ 1235 */
795 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1236 if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
796 disarm_kprobe(p); 1237 disarm_kprobe(old_p);
797 hlist_del_rcu(&old_p->hlist); 1238 hlist_del_rcu(&old_p->hlist);
798 } else { 1239 } else {
799 if (p->break_handler && !kprobe_gone(p)) 1240 if (p->break_handler && !kprobe_gone(p))
@@ -809,8 +1250,13 @@ noclean:
809 list_del_rcu(&p->list); 1250 list_del_rcu(&p->list);
810 if (!kprobe_disabled(old_p)) { 1251 if (!kprobe_disabled(old_p)) {
811 try_to_disable_aggr_kprobe(old_p); 1252 try_to_disable_aggr_kprobe(old_p);
812 if (!kprobes_all_disarmed && kprobe_disabled(old_p)) 1253 if (!kprobes_all_disarmed) {
813 disarm_kprobe(old_p); 1254 if (kprobe_disabled(old_p))
1255 disarm_kprobe(old_p);
1256 else
1257 /* Try to optimize this probe again */
1258 optimize_kprobe(old_p);
1259 }
814 } 1260 }
815 } 1261 }
816 return 0; 1262 return 0;
@@ -827,7 +1273,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
827 old_p = list_entry(p->list.next, struct kprobe, list); 1273 old_p = list_entry(p->list.next, struct kprobe, list);
828 list_del(&p->list); 1274 list_del(&p->list);
829 arch_remove_kprobe(old_p); 1275 arch_remove_kprobe(old_p);
830 kfree(old_p); 1276 free_aggr_kprobe(old_p);
831 } 1277 }
832} 1278}
833 1279
@@ -1123,7 +1569,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1123 struct kprobe *kp; 1569 struct kprobe *kp;
1124 1570
1125 p->flags |= KPROBE_FLAG_GONE; 1571 p->flags |= KPROBE_FLAG_GONE;
1126 if (p->pre_handler == aggr_pre_handler) { 1572 if (kprobe_aggrprobe(p)) {
1127 /* 1573 /*
1128 * If this is an aggr_kprobe, we have to list all the 1574 * If this is an aggr_kprobe, we have to list all the
1129 * chained probes and mark them GONE. 1575 * chained probes and mark them GONE.
@@ -1132,6 +1578,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1132 kp->flags |= KPROBE_FLAG_GONE; 1578 kp->flags |= KPROBE_FLAG_GONE;
1133 p->post_handler = NULL; 1579 p->post_handler = NULL;
1134 p->break_handler = NULL; 1580 p->break_handler = NULL;
1581 kill_optimized_kprobe(p);
1135 } 1582 }
1136 /* 1583 /*
1137 * Here, we can remove insn_slot safely, because no thread calls 1584 * Here, we can remove insn_slot safely, because no thread calls
@@ -1241,6 +1688,15 @@ static int __init init_kprobes(void)
1241 } 1688 }
1242 } 1689 }
1243 1690
1691#if defined(CONFIG_OPTPROBES)
1692#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
1693 /* Init kprobe_optinsn_slots */
1694 kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
1695#endif
1696 /* By default, kprobes can be optimized */
1697 kprobes_allow_optimization = true;
1698#endif
1699
1244 /* By default, kprobes are armed */ 1700 /* By default, kprobes are armed */
1245 kprobes_all_disarmed = false; 1701 kprobes_all_disarmed = false;
1246 1702
@@ -1259,7 +1715,7 @@ static int __init init_kprobes(void)
1259 1715
1260#ifdef CONFIG_DEBUG_FS 1716#ifdef CONFIG_DEBUG_FS
1261static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, 1717static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1262 const char *sym, int offset,char *modname) 1718 const char *sym, int offset, char *modname, struct kprobe *pp)
1263{ 1719{
1264 char *kprobe_type; 1720 char *kprobe_type;
1265 1721
@@ -1269,19 +1725,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1269 kprobe_type = "j"; 1725 kprobe_type = "j";
1270 else 1726 else
1271 kprobe_type = "k"; 1727 kprobe_type = "k";
1728
1272 if (sym) 1729 if (sym)
1273 seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", 1730 seq_printf(pi, "%p %s %s+0x%x %s ",
1274 p->addr, kprobe_type, sym, offset, 1731 p->addr, kprobe_type, sym, offset,
1275 (modname ? modname : " "), 1732 (modname ? modname : " "));
1276 (kprobe_gone(p) ? "[GONE]" : ""),
1277 ((kprobe_disabled(p) && !kprobe_gone(p)) ?
1278 "[DISABLED]" : ""));
1279 else 1733 else
1280 seq_printf(pi, "%p %s %p %s%s\n", 1734 seq_printf(pi, "%p %s %p ",
1281 p->addr, kprobe_type, p->addr, 1735 p->addr, kprobe_type, p->addr);
1282 (kprobe_gone(p) ? "[GONE]" : ""), 1736
1283 ((kprobe_disabled(p) && !kprobe_gone(p)) ? 1737 if (!pp)
1284 "[DISABLED]" : "")); 1738 pp = p;
1739 seq_printf(pi, "%s%s%s\n",
1740 (kprobe_gone(p) ? "[GONE]" : ""),
1741 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
1742 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
1285} 1743}
1286 1744
1287static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1745static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1317,11 +1775,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
1317 hlist_for_each_entry_rcu(p, node, head, hlist) { 1775 hlist_for_each_entry_rcu(p, node, head, hlist) {
1318 sym = kallsyms_lookup((unsigned long)p->addr, NULL, 1776 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
1319 &offset, &modname, namebuf); 1777 &offset, &modname, namebuf);
1320 if (p->pre_handler == aggr_pre_handler) { 1778 if (kprobe_aggrprobe(p)) {
1321 list_for_each_entry_rcu(kp, &p->list, list) 1779 list_for_each_entry_rcu(kp, &p->list, list)
1322 report_probe(pi, kp, sym, offset, modname); 1780 report_probe(pi, kp, sym, offset, modname, p);
1323 } else 1781 } else
1324 report_probe(pi, p, sym, offset, modname); 1782 report_probe(pi, p, sym, offset, modname, NULL);
1325 } 1783 }
1326 preempt_enable(); 1784 preempt_enable();
1327 return 0; 1785 return 0;
@@ -1399,12 +1857,13 @@ int __kprobes enable_kprobe(struct kprobe *kp)
1399 goto out; 1857 goto out;
1400 } 1858 }
1401 1859
1402 if (!kprobes_all_disarmed && kprobe_disabled(p))
1403 arm_kprobe(p);
1404
1405 p->flags &= ~KPROBE_FLAG_DISABLED;
1406 if (p != kp) 1860 if (p != kp)
1407 kp->flags &= ~KPROBE_FLAG_DISABLED; 1861 kp->flags &= ~KPROBE_FLAG_DISABLED;
1862
1863 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1864 p->flags &= ~KPROBE_FLAG_DISABLED;
1865 arm_kprobe(p);
1866 }
1408out: 1867out:
1409 mutex_unlock(&kprobe_mutex); 1868 mutex_unlock(&kprobe_mutex);
1410 return ret; 1869 return ret;
@@ -1424,12 +1883,13 @@ static void __kprobes arm_all_kprobes(void)
1424 if (!kprobes_all_disarmed) 1883 if (!kprobes_all_disarmed)
1425 goto already_enabled; 1884 goto already_enabled;
1426 1885
1886 /* Arming kprobes doesn't optimize kprobe itself */
1427 mutex_lock(&text_mutex); 1887 mutex_lock(&text_mutex);
1428 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1888 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1429 head = &kprobe_table[i]; 1889 head = &kprobe_table[i];
1430 hlist_for_each_entry_rcu(p, node, head, hlist) 1890 hlist_for_each_entry_rcu(p, node, head, hlist)
1431 if (!kprobe_disabled(p)) 1891 if (!kprobe_disabled(p))
1432 arch_arm_kprobe(p); 1892 __arm_kprobe(p);
1433 } 1893 }
1434 mutex_unlock(&text_mutex); 1894 mutex_unlock(&text_mutex);
1435 1895
@@ -1456,16 +1916,23 @@ static void __kprobes disarm_all_kprobes(void)
1456 1916
1457 kprobes_all_disarmed = true; 1917 kprobes_all_disarmed = true;
1458 printk(KERN_INFO "Kprobes globally disabled\n"); 1918 printk(KERN_INFO "Kprobes globally disabled\n");
1919
1920 /*
1921 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1922 * because disarming may also unoptimize kprobes.
1923 */
1924 get_online_cpus();
1459 mutex_lock(&text_mutex); 1925 mutex_lock(&text_mutex);
1460 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1926 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1461 head = &kprobe_table[i]; 1927 head = &kprobe_table[i];
1462 hlist_for_each_entry_rcu(p, node, head, hlist) { 1928 hlist_for_each_entry_rcu(p, node, head, hlist) {
1463 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 1929 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1464 arch_disarm_kprobe(p); 1930 __disarm_kprobe(p);
1465 } 1931 }
1466 } 1932 }
1467 1933
1468 mutex_unlock(&text_mutex); 1934 mutex_unlock(&text_mutex);
1935 put_online_cpus();
1469 mutex_unlock(&kprobe_mutex); 1936 mutex_unlock(&kprobe_mutex);
1470 /* Allow all currently running kprobes to complete */ 1937 /* Allow all currently running kprobes to complete */
1471 synchronize_sched(); 1938 synchronize_sched();
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3feaf5a74514..6b1ccc3f0205 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -197,16 +197,8 @@ static int __init ksysfs_init(void)
197 goto group_exit; 197 goto group_exit;
198 } 198 }
199 199
200 /* create the /sys/kernel/uids/ directory */
201 error = uids_sysfs_init();
202 if (error)
203 goto notes_exit;
204
205 return 0; 200 return 0;
206 201
207notes_exit:
208 if (notes_size > 0)
209 sysfs_remove_bin_file(kernel_kobj, &notes_attr);
210group_exit: 202group_exit:
211 sysfs_remove_group(kernel_kobj, &kernel_attr_group); 203 sysfs_remove_group(kernel_kobj, &kernel_attr_group);
212kset_exit: 204kset_exit:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fbb6222fe7e0..82ed0ea15194 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create)
101 * 101 *
102 * Description: This helper function creates and names a kernel 102 * Description: This helper function creates and names a kernel
103 * thread. The thread will be stopped: use wake_up_process() to start 103 * thread. The thread will be stopped: use wake_up_process() to start
104 * it. See also kthread_run(), kthread_create_on_cpu(). 104 * it. See also kthread_run().
105 * 105 *
106 * When woken, the thread will run @threadfn() with @data as its 106 * When woken, the thread will run @threadfn() with @data as its
107 * argument. @threadfn() can either call do_exit() directly if it is a 107 * argument. @threadfn() can either call do_exit() directly if it is a
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c62ec14609b9..0c30d0455de1 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3809,3 +3809,21 @@ void lockdep_sys_exit(void)
3809 lockdep_print_held_locks(curr); 3809 lockdep_print_held_locks(curr);
3810 } 3810 }
3811} 3811}
3812
3813void lockdep_rcu_dereference(const char *file, const int line)
3814{
3815 struct task_struct *curr = current;
3816
3817 if (!debug_locks_off())
3818 return;
3819 printk("\n===================================================\n");
3820 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
3821 printk( "---------------------------------------------------\n");
3822 printk("%s:%d invoked rcu_dereference_check() without protection!\n",
3823 file, line);
3824 printk("\nother info that might help us debug this:\n\n");
3825 lockdep_print_held_locks(curr);
3826 printk("\nstack backtrace:\n");
3827 dump_stack();
3828}
3829EXPORT_SYMBOL_GPL(lockdep_rcu_dereference);
diff --git a/kernel/module.c b/kernel/module.c
index f82386bd9ee9..c968d3606dca 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -474,9 +474,10 @@ static void module_unload_init(struct module *mod)
474 474
475 INIT_LIST_HEAD(&mod->modules_which_use_me); 475 INIT_LIST_HEAD(&mod->modules_which_use_me);
476 for_each_possible_cpu(cpu) 476 for_each_possible_cpu(cpu)
477 local_set(__module_ref_addr(mod, cpu), 0); 477 per_cpu_ptr(mod->refptr, cpu)->count = 0;
478
478 /* Hold reference count during initialization. */ 479 /* Hold reference count during initialization. */
479 local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); 480 __this_cpu_write(mod->refptr->count, 1);
480 /* Backwards compatibility macros put refcount during init. */ 481 /* Backwards compatibility macros put refcount during init. */
481 mod->waiter = current; 482 mod->waiter = current;
482} 483}
@@ -619,7 +620,7 @@ unsigned int module_refcount(struct module *mod)
619 int cpu; 620 int cpu;
620 621
621 for_each_possible_cpu(cpu) 622 for_each_possible_cpu(cpu)
622 total += local_read(__module_ref_addr(mod, cpu)); 623 total += per_cpu_ptr(mod->refptr, cpu)->count;
623 return total; 624 return total;
624} 625}
625EXPORT_SYMBOL(module_refcount); 626EXPORT_SYMBOL(module_refcount);
@@ -796,14 +797,15 @@ static struct module_attribute refcnt = {
796void module_put(struct module *module) 797void module_put(struct module *module)
797{ 798{
798 if (module) { 799 if (module) {
799 unsigned int cpu = get_cpu(); 800 preempt_disable();
800 local_dec(__module_ref_addr(module, cpu)); 801 __this_cpu_dec(module->refptr->count);
802
801 trace_module_put(module, _RET_IP_, 803 trace_module_put(module, _RET_IP_,
802 local_read(__module_ref_addr(module, cpu))); 804 __this_cpu_read(module->refptr->count));
803 /* Maybe they're waiting for us to drop reference? */ 805 /* Maybe they're waiting for us to drop reference? */
804 if (unlikely(!module_is_live(module))) 806 if (unlikely(!module_is_live(module)))
805 wake_up_process(module->waiter); 807 wake_up_process(module->waiter);
806 put_cpu(); 808 preempt_enable();
807 } 809 }
808} 810}
809EXPORT_SYMBOL(module_put); 811EXPORT_SYMBOL(module_put);
@@ -1083,6 +1085,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1083 if (sattr->name == NULL) 1085 if (sattr->name == NULL)
1084 goto out; 1086 goto out;
1085 sect_attrs->nsections++; 1087 sect_attrs->nsections++;
1088 sysfs_attr_init(&sattr->mattr.attr);
1086 sattr->mattr.show = module_sect_show; 1089 sattr->mattr.show = module_sect_show;
1087 sattr->mattr.store = NULL; 1090 sattr->mattr.store = NULL;
1088 sattr->mattr.attr.name = sattr->name; 1091 sattr->mattr.attr.name = sattr->name;
@@ -1178,6 +1181,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1178 if (sect_empty(&sechdrs[i])) 1181 if (sect_empty(&sechdrs[i]))
1179 continue; 1182 continue;
1180 if (sechdrs[i].sh_type == SHT_NOTE) { 1183 if (sechdrs[i].sh_type == SHT_NOTE) {
1184 sysfs_bin_attr_init(nattr);
1181 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1185 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
1182 nattr->attr.mode = S_IRUGO; 1186 nattr->attr.mode = S_IRUGO;
1183 nattr->size = sechdrs[i].sh_size; 1187 nattr->size = sechdrs[i].sh_size;
@@ -1250,6 +1254,7 @@ int module_add_modinfo_attrs(struct module *mod)
1250 if (!attr->test || 1254 if (!attr->test ||
1251 (attr->test && attr->test(mod))) { 1255 (attr->test && attr->test(mod))) {
1252 memcpy(temp_attr, attr, sizeof(*temp_attr)); 1256 memcpy(temp_attr, attr, sizeof(*temp_attr));
1257 sysfs_attr_init(&temp_attr->attr);
1253 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); 1258 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
1254 ++temp_attr; 1259 ++temp_attr;
1255 } 1260 }
@@ -1397,9 +1402,9 @@ static void free_module(struct module *mod)
1397 kfree(mod->args); 1402 kfree(mod->args);
1398 if (mod->percpu) 1403 if (mod->percpu)
1399 percpu_modfree(mod->percpu); 1404 percpu_modfree(mod->percpu);
1400#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 1405#if defined(CONFIG_MODULE_UNLOAD)
1401 if (mod->refptr) 1406 if (mod->refptr)
1402 percpu_modfree(mod->refptr); 1407 free_percpu(mod->refptr);
1403#endif 1408#endif
1404 /* Free lock-classes: */ 1409 /* Free lock-classes: */
1405 lockdep_free_key_range(mod->module_core, mod->core_size); 1410 lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -2162,9 +2167,8 @@ static noinline struct module *load_module(void __user *umod,
2162 mod = (void *)sechdrs[modindex].sh_addr; 2167 mod = (void *)sechdrs[modindex].sh_addr;
2163 kmemleak_load_module(mod, hdr, sechdrs, secstrings); 2168 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2164 2169
2165#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2170#if defined(CONFIG_MODULE_UNLOAD)
2166 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), 2171 mod->refptr = alloc_percpu(struct module_ref);
2167 mod->name);
2168 if (!mod->refptr) { 2172 if (!mod->refptr) {
2169 err = -ENOMEM; 2173 err = -ENOMEM;
2170 goto free_init; 2174 goto free_init;
@@ -2396,8 +2400,8 @@ static noinline struct module *load_module(void __user *umod,
2396 kobject_put(&mod->mkobj.kobj); 2400 kobject_put(&mod->mkobj.kobj);
2397 free_unload: 2401 free_unload:
2398 module_unload_free(mod); 2402 module_unload_free(mod);
2399#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2403#if defined(CONFIG_MODULE_UNLOAD)
2400 percpu_modfree(mod->refptr); 2404 free_percpu(mod->refptr);
2401 free_init: 2405 free_init:
2402#endif 2406#endif
2403 module_free(mod, mod->module_init); 2407 module_free(mod, mod->module_init);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index acd24e7643eb..2488ba7eb568 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -78,10 +78,10 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
78 int ret = NOTIFY_DONE; 78 int ret = NOTIFY_DONE;
79 struct notifier_block *nb, *next_nb; 79 struct notifier_block *nb, *next_nb;
80 80
81 nb = rcu_dereference(*nl); 81 nb = rcu_dereference_raw(*nl);
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference_raw(nb->next);
85 85
86#ifdef CONFIG_DEBUG_NOTIFIERS 86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { 87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
@@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
309 * racy then it does not matter what the result of the test 309 * racy then it does not matter what the result of the test
310 * is, we re-check the list after having taken the lock anyway: 310 * is, we re-check the list after having taken the lock anyway:
311 */ 311 */
312 if (rcu_dereference(nh->head)) { 312 if (rcu_dereference_raw(nh->head)) {
313 down_read(&nh->rwsem); 313 down_read(&nh->rwsem);
314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, 314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
315 nr_calls); 315 nr_calls);
diff --git a/kernel/padata.c b/kernel/padata.c
new file mode 100644
index 000000000000..93caf65ff57c
--- /dev/null
+++ b/kernel/padata.c
@@ -0,0 +1,696 @@
1/*
2 * padata.c - generic interface to process data streams in parallel
3 *
4 * Copyright (C) 2008, 2009 secunet Security Networks AG
5 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 */
20
21#include <linux/module.h>
22#include <linux/cpumask.h>
23#include <linux/err.h>
24#include <linux/cpu.h>
25#include <linux/padata.h>
26#include <linux/mutex.h>
27#include <linux/sched.h>
28#include <linux/rcupdate.h>
29
/*
 * MAX_SEQ_NR is the raw upper bound from which padata_alloc_pd() derives
 * pd->max_seq_nr; MAX_OBJ_NUM is the limit pd->refcnt is compared against
 * in padata_do_parallel().
 *
 * Both expansions are parenthesized so that they stay a single operand
 * whatever operators surround them at the point of use.
 */
#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
#define MAX_OBJ_NUM (10000 * NR_CPUS)
32
33static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
34{
35 int cpu, target_cpu;
36
37 target_cpu = cpumask_first(pd->cpumask);
38 for (cpu = 0; cpu < cpu_index; cpu++)
39 target_cpu = cpumask_next(target_cpu, pd->cpumask);
40
41 return target_cpu;
42}
43
44static int padata_cpu_hash(struct padata_priv *padata)
45{
46 int cpu_index;
47 struct parallel_data *pd;
48
49 pd = padata->pd;
50
51 /*
52 * Hash the sequence numbers to the cpus by taking
53 * seq_nr mod. number of cpus in use.
54 */
55 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask);
56
57 return padata_index_to_cpu(pd, cpu_index);
58}
59
60static void padata_parallel_worker(struct work_struct *work)
61{
62 struct padata_queue *queue;
63 struct parallel_data *pd;
64 struct padata_instance *pinst;
65 LIST_HEAD(local_list);
66
67 local_bh_disable();
68 queue = container_of(work, struct padata_queue, pwork);
69 pd = queue->pd;
70 pinst = pd->pinst;
71
72 spin_lock(&queue->parallel.lock);
73 list_replace_init(&queue->parallel.list, &local_list);
74 spin_unlock(&queue->parallel.lock);
75
76 while (!list_empty(&local_list)) {
77 struct padata_priv *padata;
78
79 padata = list_entry(local_list.next,
80 struct padata_priv, list);
81
82 list_del_init(&padata->list);
83
84 padata->parallel(padata);
85 }
86
87 local_bh_enable();
88}
89
90/*
91 * padata_do_parallel - padata parallelization function
92 *
93 * @pinst: padata instance
94 * @padata: object to be parallelized
95 * @cb_cpu: cpu the serialization callback function will run on,
96 * must be in the cpumask of padata.
97 *
98 * The parallelization callback function will run with BHs off.
99 * Note: Every object which is parallelized by padata_do_parallel
100 * must be seen by padata_do_serial.
101 */
102int padata_do_parallel(struct padata_instance *pinst,
103 struct padata_priv *padata, int cb_cpu)
104{
105 int target_cpu, err;
106 struct padata_queue *queue;
107 struct parallel_data *pd;
108
109 rcu_read_lock_bh();
110
111 pd = rcu_dereference(pinst->pd);
112
113 err = 0;
114 if (!(pinst->flags & PADATA_INIT))
115 goto out;
116
117 err = -EBUSY;
118 if ((pinst->flags & PADATA_RESET))
119 goto out;
120
121 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
122 goto out;
123
124 err = -EINVAL;
125 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
126 goto out;
127
128 err = -EINPROGRESS;
129 atomic_inc(&pd->refcnt);
130 padata->pd = pd;
131 padata->cb_cpu = cb_cpu;
132
133 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))
134 atomic_set(&pd->seq_nr, -1);
135
136 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
137
138 target_cpu = padata_cpu_hash(padata);
139 queue = per_cpu_ptr(pd->queue, target_cpu);
140
141 spin_lock(&queue->parallel.lock);
142 list_add_tail(&padata->list, &queue->parallel.list);
143 spin_unlock(&queue->parallel.lock);
144
145 queue_work_on(target_cpu, pinst->wq, &queue->pwork);
146
147out:
148 rcu_read_unlock_bh();
149
150 return err;
151}
152EXPORT_SYMBOL(padata_do_parallel);
153
154static struct padata_priv *padata_get_next(struct parallel_data *pd)
155{
156 int cpu, num_cpus, empty, calc_seq_nr;
157 int seq_nr, next_nr, overrun, next_overrun;
158 struct padata_queue *queue, *next_queue;
159 struct padata_priv *padata;
160 struct padata_list *reorder;
161
162 empty = 0;
163 next_nr = -1;
164 next_overrun = 0;
165 next_queue = NULL;
166
167 num_cpus = cpumask_weight(pd->cpumask);
168
169 for_each_cpu(cpu, pd->cpumask) {
170 queue = per_cpu_ptr(pd->queue, cpu);
171 reorder = &queue->reorder;
172
173 /*
174 * Calculate the seq_nr of the object that should be
175 * next in this queue.
176 */
177 overrun = 0;
178 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
179 + queue->cpu_index;
180
181 if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
182 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
183 overrun = 1;
184 }
185
186 if (!list_empty(&reorder->list)) {
187 padata = list_entry(reorder->list.next,
188 struct padata_priv, list);
189
190 seq_nr = padata->seq_nr;
191 BUG_ON(calc_seq_nr != seq_nr);
192 } else {
193 seq_nr = calc_seq_nr;
194 empty++;
195 }
196
197 if (next_nr < 0 || seq_nr < next_nr
198 || (next_overrun && !overrun)) {
199 next_nr = seq_nr;
200 next_overrun = overrun;
201 next_queue = queue;
202 }
203 }
204
205 padata = NULL;
206
207 if (empty == num_cpus)
208 goto out;
209
210 reorder = &next_queue->reorder;
211
212 if (!list_empty(&reorder->list)) {
213 padata = list_entry(reorder->list.next,
214 struct padata_priv, list);
215
216 if (unlikely(next_overrun)) {
217 for_each_cpu(cpu, pd->cpumask) {
218 queue = per_cpu_ptr(pd->queue, cpu);
219 atomic_set(&queue->num_obj, 0);
220 }
221 }
222
223 spin_lock(&reorder->lock);
224 list_del_init(&padata->list);
225 atomic_dec(&pd->reorder_objects);
226 spin_unlock(&reorder->lock);
227
228 atomic_inc(&next_queue->num_obj);
229
230 goto out;
231 }
232
233 if (next_nr % num_cpus == next_queue->cpu_index) {
234 padata = ERR_PTR(-ENODATA);
235 goto out;
236 }
237
238 padata = ERR_PTR(-EINPROGRESS);
239out:
240 return padata;
241}
242
243static void padata_reorder(struct parallel_data *pd)
244{
245 struct padata_priv *padata;
246 struct padata_queue *queue;
247 struct padata_instance *pinst = pd->pinst;
248
249try_again:
250 if (!spin_trylock_bh(&pd->lock))
251 goto out;
252
253 while (1) {
254 padata = padata_get_next(pd);
255
256 if (!padata || PTR_ERR(padata) == -EINPROGRESS)
257 break;
258
259 if (PTR_ERR(padata) == -ENODATA) {
260 spin_unlock_bh(&pd->lock);
261 goto out;
262 }
263
264 queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
265
266 spin_lock(&queue->serial.lock);
267 list_add_tail(&padata->list, &queue->serial.list);
268 spin_unlock(&queue->serial.lock);
269
270 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork);
271 }
272
273 spin_unlock_bh(&pd->lock);
274
275 if (atomic_read(&pd->reorder_objects))
276 goto try_again;
277
278out:
279 return;
280}
281
282static void padata_serial_worker(struct work_struct *work)
283{
284 struct padata_queue *queue;
285 struct parallel_data *pd;
286 LIST_HEAD(local_list);
287
288 local_bh_disable();
289 queue = container_of(work, struct padata_queue, swork);
290 pd = queue->pd;
291
292 spin_lock(&queue->serial.lock);
293 list_replace_init(&queue->serial.list, &local_list);
294 spin_unlock(&queue->serial.lock);
295
296 while (!list_empty(&local_list)) {
297 struct padata_priv *padata;
298
299 padata = list_entry(local_list.next,
300 struct padata_priv, list);
301
302 list_del_init(&padata->list);
303
304 padata->serial(padata);
305 atomic_dec(&pd->refcnt);
306 }
307 local_bh_enable();
308}
309
310/*
311 * padata_do_serial - padata serialization function
312 *
313 * @padata: object to be serialized.
314 *
315 * padata_do_serial must be called for every parallelized object.
316 * The serialization callback function will run with BHs off.
317 */
318void padata_do_serial(struct padata_priv *padata)
319{
320 int cpu;
321 struct padata_queue *queue;
322 struct parallel_data *pd;
323
324 pd = padata->pd;
325
326 cpu = get_cpu();
327 queue = per_cpu_ptr(pd->queue, cpu);
328
329 spin_lock(&queue->reorder.lock);
330 atomic_inc(&pd->reorder_objects);
331 list_add_tail(&padata->list, &queue->reorder.list);
332 spin_unlock(&queue->reorder.lock);
333
334 put_cpu();
335
336 padata_reorder(pd);
337}
338EXPORT_SYMBOL(padata_do_serial);
339
340static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
341 const struct cpumask *cpumask)
342{
343 int cpu, cpu_index, num_cpus;
344 struct padata_queue *queue;
345 struct parallel_data *pd;
346
347 cpu_index = 0;
348
349 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
350 if (!pd)
351 goto err;
352
353 pd->queue = alloc_percpu(struct padata_queue);
354 if (!pd->queue)
355 goto err_free_pd;
356
357 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
358 goto err_free_queue;
359
360 for_each_possible_cpu(cpu) {
361 queue = per_cpu_ptr(pd->queue, cpu);
362
363 queue->pd = pd;
364
365 if (cpumask_test_cpu(cpu, cpumask)
366 && cpumask_test_cpu(cpu, cpu_active_mask)) {
367 queue->cpu_index = cpu_index;
368 cpu_index++;
369 } else
370 queue->cpu_index = -1;
371
372 INIT_LIST_HEAD(&queue->reorder.list);
373 INIT_LIST_HEAD(&queue->parallel.list);
374 INIT_LIST_HEAD(&queue->serial.list);
375 spin_lock_init(&queue->reorder.lock);
376 spin_lock_init(&queue->parallel.lock);
377 spin_lock_init(&queue->serial.lock);
378
379 INIT_WORK(&queue->pwork, padata_parallel_worker);
380 INIT_WORK(&queue->swork, padata_serial_worker);
381 atomic_set(&queue->num_obj, 0);
382 }
383
384 cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
385
386 num_cpus = cpumask_weight(pd->cpumask);
387 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
388
389 atomic_set(&pd->seq_nr, -1);
390 atomic_set(&pd->reorder_objects, 0);
391 atomic_set(&pd->refcnt, 0);
392 pd->pinst = pinst;
393 spin_lock_init(&pd->lock);
394
395 return pd;
396
397err_free_queue:
398 free_percpu(pd->queue);
399err_free_pd:
400 kfree(pd);
401err:
402 return NULL;
403}
404
405static void padata_free_pd(struct parallel_data *pd)
406{
407 free_cpumask_var(pd->cpumask);
408 free_percpu(pd->queue);
409 kfree(pd);
410}
411
412static void padata_replace(struct padata_instance *pinst,
413 struct parallel_data *pd_new)
414{
415 struct parallel_data *pd_old = pinst->pd;
416
417 pinst->flags |= PADATA_RESET;
418
419 rcu_assign_pointer(pinst->pd, pd_new);
420
421 synchronize_rcu();
422
423 while (atomic_read(&pd_old->refcnt) != 0)
424 yield();
425
426 flush_workqueue(pinst->wq);
427
428 padata_free_pd(pd_old);
429
430 pinst->flags &= ~PADATA_RESET;
431}
432
433/*
434 * padata_set_cpumask - set the cpumask that padata should use
435 *
436 * @pinst: padata instance
437 * @cpumask: the cpumask to use
438 */
439int padata_set_cpumask(struct padata_instance *pinst,
440 cpumask_var_t cpumask)
441{
442 struct parallel_data *pd;
443 int err = 0;
444
445 might_sleep();
446
447 mutex_lock(&pinst->lock);
448
449 pd = padata_alloc_pd(pinst, cpumask);
450 if (!pd) {
451 err = -ENOMEM;
452 goto out;
453 }
454
455 cpumask_copy(pinst->cpumask, cpumask);
456
457 padata_replace(pinst, pd);
458
459out:
460 mutex_unlock(&pinst->lock);
461
462 return err;
463}
464EXPORT_SYMBOL(padata_set_cpumask);
465
466static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
467{
468 struct parallel_data *pd;
469
470 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
471 pd = padata_alloc_pd(pinst, pinst->cpumask);
472 if (!pd)
473 return -ENOMEM;
474
475 padata_replace(pinst, pd);
476 }
477
478 return 0;
479}
480
481/*
482 * padata_add_cpu - add a cpu to the padata cpumask
483 *
484 * @pinst: padata instance
485 * @cpu: cpu to add
486 */
487int padata_add_cpu(struct padata_instance *pinst, int cpu)
488{
489 int err;
490
491 might_sleep();
492
493 mutex_lock(&pinst->lock);
494
495 cpumask_set_cpu(cpu, pinst->cpumask);
496 err = __padata_add_cpu(pinst, cpu);
497
498 mutex_unlock(&pinst->lock);
499
500 return err;
501}
502EXPORT_SYMBOL(padata_add_cpu);
503
504static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
505{
506 struct parallel_data *pd;
507
508 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
509 pd = padata_alloc_pd(pinst, pinst->cpumask);
510 if (!pd)
511 return -ENOMEM;
512
513 padata_replace(pinst, pd);
514 }
515
516 return 0;
517}
518
519/*
520 * padata_remove_cpu - remove a cpu from the padata cpumask
521 *
522 * @pinst: padata instance
523 * @cpu: cpu to remove
524 */
525int padata_remove_cpu(struct padata_instance *pinst, int cpu)
526{
527 int err;
528
529 might_sleep();
530
531 mutex_lock(&pinst->lock);
532
533 cpumask_clear_cpu(cpu, pinst->cpumask);
534 err = __padata_remove_cpu(pinst, cpu);
535
536 mutex_unlock(&pinst->lock);
537
538 return err;
539}
540EXPORT_SYMBOL(padata_remove_cpu);
541
542/*
543 * padata_start - start the parallel processing
544 *
545 * @pinst: padata instance to start
546 */
547void padata_start(struct padata_instance *pinst)
548{
549 might_sleep();
550
551 mutex_lock(&pinst->lock);
552 pinst->flags |= PADATA_INIT;
553 mutex_unlock(&pinst->lock);
554}
555EXPORT_SYMBOL(padata_start);
556
557/*
558 * padata_stop - stop the parallel processing
559 *
560 * @pinst: padata instance to stop
561 */
562void padata_stop(struct padata_instance *pinst)
563{
564 might_sleep();
565
566 mutex_lock(&pinst->lock);
567 pinst->flags &= ~PADATA_INIT;
568 mutex_unlock(&pinst->lock);
569}
570EXPORT_SYMBOL(padata_stop);
571
572static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
573 unsigned long action, void *hcpu)
574{
575 int err;
576 struct padata_instance *pinst;
577 int cpu = (unsigned long)hcpu;
578
579 pinst = container_of(nfb, struct padata_instance, cpu_notifier);
580
581 switch (action) {
582 case CPU_ONLINE:
583 case CPU_ONLINE_FROZEN:
584 if (!cpumask_test_cpu(cpu, pinst->cpumask))
585 break;
586 mutex_lock(&pinst->lock);
587 err = __padata_add_cpu(pinst, cpu);
588 mutex_unlock(&pinst->lock);
589 if (err)
590 return NOTIFY_BAD;
591 break;
592
593 case CPU_DOWN_PREPARE:
594 case CPU_DOWN_PREPARE_FROZEN:
595 if (!cpumask_test_cpu(cpu, pinst->cpumask))
596 break;
597 mutex_lock(&pinst->lock);
598 err = __padata_remove_cpu(pinst, cpu);
599 mutex_unlock(&pinst->lock);
600 if (err)
601 return NOTIFY_BAD;
602 break;
603
604 case CPU_UP_CANCELED:
605 case CPU_UP_CANCELED_FROZEN:
606 if (!cpumask_test_cpu(cpu, pinst->cpumask))
607 break;
608 mutex_lock(&pinst->lock);
609 __padata_remove_cpu(pinst, cpu);
610 mutex_unlock(&pinst->lock);
611
612 case CPU_DOWN_FAILED:
613 case CPU_DOWN_FAILED_FROZEN:
614 if (!cpumask_test_cpu(cpu, pinst->cpumask))
615 break;
616 mutex_lock(&pinst->lock);
617 __padata_add_cpu(pinst, cpu);
618 mutex_unlock(&pinst->lock);
619 }
620
621 return NOTIFY_OK;
622}
623
624/*
625 * padata_alloc - allocate and initialize a padata instance
626 *
627 * @cpumask: cpumask that padata uses for parallelization
628 * @wq: workqueue to use for the allocated padata instance
629 */
630struct padata_instance *padata_alloc(const struct cpumask *cpumask,
631 struct workqueue_struct *wq)
632{
633 int err;
634 struct padata_instance *pinst;
635 struct parallel_data *pd;
636
637 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
638 if (!pinst)
639 goto err;
640
641 pd = padata_alloc_pd(pinst, cpumask);
642 if (!pd)
643 goto err_free_inst;
644
645 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
646 goto err_free_pd;
647
648 rcu_assign_pointer(pinst->pd, pd);
649
650 pinst->wq = wq;
651
652 cpumask_copy(pinst->cpumask, cpumask);
653
654 pinst->flags = 0;
655
656 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
657 pinst->cpu_notifier.priority = 0;
658 err = register_hotcpu_notifier(&pinst->cpu_notifier);
659 if (err)
660 goto err_free_cpumask;
661
662 mutex_init(&pinst->lock);
663
664 return pinst;
665
666err_free_cpumask:
667 free_cpumask_var(pinst->cpumask);
668err_free_pd:
669 padata_free_pd(pd);
670err_free_inst:
671 kfree(pinst);
672err:
673 return NULL;
674}
675EXPORT_SYMBOL(padata_alloc);
676
677/*
678 * padata_free - free a padata instance
679 *
680 * @ padata_inst: padata instance to free
681 */
682void padata_free(struct padata_instance *pinst)
683{
684 padata_stop(pinst);
685
686 synchronize_rcu();
687
688 while (atomic_read(&pinst->pd->refcnt) != 0)
689 yield();
690
691 unregister_hotcpu_notifier(&pinst->cpu_notifier);
692 padata_free_pd(pinst->pd);
693 free_cpumask_var(pinst->cpumask);
694 kfree(pinst);
695}
696EXPORT_SYMBOL(padata_free);
diff --git a/kernel/panic.c b/kernel/panic.c
index c787333282b8..13d966b4c14a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -36,15 +36,36 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
36 36
37EXPORT_SYMBOL(panic_notifier_list); 37EXPORT_SYMBOL(panic_notifier_list);
38 38
39static long no_blink(long time)
40{
41 return 0;
42}
43
44/* Returns how long it waited in ms */ 39/* Returns how long it waited in ms */
45long (*panic_blink)(long time); 40long (*panic_blink)(long time);
46EXPORT_SYMBOL(panic_blink); 41EXPORT_SYMBOL(panic_blink);
47 42
43static void panic_blink_one_second(void)
44{
45 static long i = 0, end;
46
47 if (panic_blink) {
48 end = i + MSEC_PER_SEC;
49
50 while (i < end) {
51 i += panic_blink(i);
52 mdelay(1);
53 i++;
54 }
55 } else {
56 /*
57 * When running under a hypervisor a small mdelay may get
58 * rounded up to the hypervisor timeslice. For example, with
59 * a 1ms in 10ms hypervisor timeslice we might inflate a
60 * mdelay(1) loop by 10x.
61 *
62 * If we have nothing to blink, spin on 1 second calls to
63 * mdelay to avoid this.
64 */
65 mdelay(MSEC_PER_SEC);
66 }
67}
68
48/** 69/**
49 * panic - halt the system 70 * panic - halt the system
50 * @fmt: The text string to print 71 * @fmt: The text string to print
@@ -95,9 +116,6 @@ NORET_TYPE void panic(const char * fmt, ...)
95 116
96 bust_spinlocks(0); 117 bust_spinlocks(0);
97 118
98 if (!panic_blink)
99 panic_blink = no_blink;
100
101 if (panic_timeout > 0) { 119 if (panic_timeout > 0) {
102 /* 120 /*
103 * Delay timeout seconds before rebooting the machine. 121 * Delay timeout seconds before rebooting the machine.
@@ -105,11 +123,9 @@ NORET_TYPE void panic(const char * fmt, ...)
105 */ 123 */
106 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); 124 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
107 125
108 for (i = 0; i < panic_timeout*1000; ) { 126 for (i = 0; i < panic_timeout; i++) {
109 touch_nmi_watchdog(); 127 touch_nmi_watchdog();
110 i += panic_blink(i); 128 panic_blink_one_second();
111 mdelay(1);
112 i++;
113 } 129 }
114 /* 130 /*
115 * This will not be a clean reboot, with everything 131 * This will not be a clean reboot, with everything
@@ -135,11 +151,9 @@ NORET_TYPE void panic(const char * fmt, ...)
135 } 151 }
136#endif 152#endif
137 local_irq_enable(); 153 local_irq_enable();
138 for (i = 0; ; ) { 154 while (1) {
139 touch_softlockup_watchdog(); 155 touch_softlockup_watchdog();
140 i += panic_blink(i); 156 panic_blink_one_second();
141 mdelay(1);
142 i++;
143 } 157 }
144} 158}
145 159
diff --git a/kernel/params.c b/kernel/params.c
index cf1b69183127..d55a53ec9234 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,7 +24,6 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27#include <linux/string.h>
28 27
29#if 0 28#if 0
30#define DEBUGP printk 29#define DEBUGP printk
@@ -517,6 +516,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
517 new->grp.attrs = attrs; 516 new->grp.attrs = attrs;
518 517
519 /* Tack new one on the end. */ 518 /* Tack new one on the end. */
519 sysfs_attr_init(&new->attrs[num].mattr.attr);
520 new->attrs[num].param = kp; 520 new->attrs[num].param = kp;
521 new->attrs[num].mattr.show = param_attr_show; 521 new->attrs[num].mattr.show = param_attr_show;
522 new->attrs[num].mattr.store = param_attr_store; 522 new->attrs[num].mattr.store = param_attr_store;
@@ -723,7 +723,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
723 return ret; 723 return ret;
724} 724}
725 725
726static struct sysfs_ops module_sysfs_ops = { 726static const struct sysfs_ops module_sysfs_ops = {
727 .show = module_attr_show, 727 .show = module_attr_show,
728 .store = module_attr_store, 728 .store = module_attr_store,
729}; 729};
@@ -737,7 +737,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj)
737 return 0; 737 return 0;
738} 738}
739 739
740static struct kset_uevent_ops module_uevent_ops = { 740static const struct kset_uevent_ops module_uevent_ops = {
741 .filter = uevent_filter, 741 .filter = uevent_filter,
742}; 742};
743 743
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 482d5e1d3764..e68745053013 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2595,7 +2595,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2595 if (user_locked > user_lock_limit) 2595 if (user_locked > user_lock_limit)
2596 extra = user_locked - user_lock_limit; 2596 extra = user_locked - user_lock_limit;
2597 2597
2598 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 2598 lock_limit = rlimit(RLIMIT_MEMLOCK);
2599 lock_limit >>= PAGE_SHIFT; 2599 lock_limit >>= PAGE_SHIFT;
2600 locked = vma->vm_mm->locked_vm + extra; 2600 locked = vma->vm_mm->locked_vm + extra;
2601 2601
@@ -5466,13 +5466,16 @@ void __init perf_event_init(void)
5466 register_cpu_notifier(&perf_cpu_nb); 5466 register_cpu_notifier(&perf_cpu_nb);
5467} 5467}
5468 5468
5469static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) 5469static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5470 struct sysdev_class_attribute *attr,
5471 char *buf)
5470{ 5472{
5471 return sprintf(buf, "%d\n", perf_reserved_percpu); 5473 return sprintf(buf, "%d\n", perf_reserved_percpu);
5472} 5474}
5473 5475
5474static ssize_t 5476static ssize_t
5475perf_set_reserve_percpu(struct sysdev_class *class, 5477perf_set_reserve_percpu(struct sysdev_class *class,
5478 struct sysdev_class_attribute *attr,
5476 const char *buf, 5479 const char *buf,
5477 size_t count) 5480 size_t count)
5478{ 5481{
@@ -5501,13 +5504,17 @@ perf_set_reserve_percpu(struct sysdev_class *class,
5501 return count; 5504 return count;
5502} 5505}
5503 5506
5504static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) 5507static ssize_t perf_show_overcommit(struct sysdev_class *class,
5508 struct sysdev_class_attribute *attr,
5509 char *buf)
5505{ 5510{
5506 return sprintf(buf, "%d\n", perf_overcommit); 5511 return sprintf(buf, "%d\n", perf_overcommit);
5507} 5512}
5508 5513
5509static ssize_t 5514static ssize_t
5510perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) 5515perf_set_overcommit(struct sysdev_class *class,
5516 struct sysdev_class_attribute *attr,
5517 const char *buf, size_t count)
5511{ 5518{
5512 unsigned long val; 5519 unsigned long val;
5513 int err; 5520 int err;
diff --git a/kernel/pid.c b/kernel/pid.c
index 2e17c9c92cbe..86b296943e5f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -367,7 +367,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
367 struct task_struct *result = NULL; 367 struct task_struct *result = NULL;
368 if (pid) { 368 if (pid) {
369 struct hlist_node *first; 369 struct hlist_node *first;
370 first = rcu_dereference(pid->tasks[type].first); 370 first = rcu_dereference_check(pid->tasks[type].first, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock));
371 if (first) 371 if (first)
372 result = hlist_entry(first, struct task_struct, pids[(type)].node); 372 result = hlist_entry(first, struct task_struct, pids[(type)].node);
373 } 373 }
@@ -376,7 +376,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
376EXPORT_SYMBOL(pid_task); 376EXPORT_SYMBOL(pid_task);
377 377
378/* 378/*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 379 * Must be called under rcu_read_lock().
380 */ 380 */
381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382{ 382{
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 438ff4523513..1a22dfd42df9 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -982,6 +982,7 @@ static void check_thread_timers(struct task_struct *tsk,
982 int maxfire; 982 int maxfire;
983 struct list_head *timers = tsk->cpu_timers; 983 struct list_head *timers = tsk->cpu_timers;
984 struct signal_struct *const sig = tsk->signal; 984 struct signal_struct *const sig = tsk->signal;
985 unsigned long soft;
985 986
986 maxfire = 20; 987 maxfire = 20;
987 tsk->cputime_expires.prof_exp = cputime_zero; 988 tsk->cputime_expires.prof_exp = cputime_zero;
@@ -1030,9 +1031,10 @@ static void check_thread_timers(struct task_struct *tsk,
1030 /* 1031 /*
1031 * Check for the special case thread timers. 1032 * Check for the special case thread timers.
1032 */ 1033 */
1033 if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { 1034 soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
1034 unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; 1035 if (soft != RLIM_INFINITY) {
1035 unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; 1036 unsigned long hard =
1037 ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
1036 1038
1037 if (hard != RLIM_INFINITY && 1039 if (hard != RLIM_INFINITY &&
1038 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { 1040 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -1043,14 +1045,13 @@ static void check_thread_timers(struct task_struct *tsk,
1043 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1045 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1044 return; 1046 return;
1045 } 1047 }
1046 if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { 1048 if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
1047 /* 1049 /*
1048 * At the soft limit, send a SIGXCPU every second. 1050 * At the soft limit, send a SIGXCPU every second.
1049 */ 1051 */
1050 if (sig->rlim[RLIMIT_RTTIME].rlim_cur 1052 if (soft < hard) {
1051 < sig->rlim[RLIMIT_RTTIME].rlim_max) { 1053 soft += USEC_PER_SEC;
1052 sig->rlim[RLIMIT_RTTIME].rlim_cur += 1054 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
1053 USEC_PER_SEC;
1054 } 1055 }
1055 printk(KERN_INFO 1056 printk(KERN_INFO
1056 "RT Watchdog Timeout: %s[%d]\n", 1057 "RT Watchdog Timeout: %s[%d]\n",
@@ -1121,6 +1122,7 @@ static void check_process_timers(struct task_struct *tsk,
1121 unsigned long long sum_sched_runtime, sched_expires; 1122 unsigned long long sum_sched_runtime, sched_expires;
1122 struct list_head *timers = sig->cpu_timers; 1123 struct list_head *timers = sig->cpu_timers;
1123 struct task_cputime cputime; 1124 struct task_cputime cputime;
1125 unsigned long soft;
1124 1126
1125 /* 1127 /*
1126 * Don't sample the current process CPU clocks if there are no timers. 1128 * Don't sample the current process CPU clocks if there are no timers.
@@ -1193,11 +1195,13 @@ static void check_process_timers(struct task_struct *tsk,
1193 SIGPROF); 1195 SIGPROF);
1194 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, 1196 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1195 SIGVTALRM); 1197 SIGVTALRM);
1196 1198 soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1197 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 1199 if (soft != RLIM_INFINITY) {
1198 unsigned long psecs = cputime_to_secs(ptime); 1200 unsigned long psecs = cputime_to_secs(ptime);
1201 unsigned long hard =
1202 ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
1199 cputime_t x; 1203 cputime_t x;
1200 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { 1204 if (psecs >= hard) {
1201 /* 1205 /*
1202 * At the hard limit, we just die. 1206 * At the hard limit, we just die.
1203 * No need to calculate anything else now. 1207 * No need to calculate anything else now.
@@ -1205,17 +1209,17 @@ static void check_process_timers(struct task_struct *tsk,
1205 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1209 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1206 return; 1210 return;
1207 } 1211 }
1208 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { 1212 if (psecs >= soft) {
1209 /* 1213 /*
1210 * At the soft limit, send a SIGXCPU every second. 1214 * At the soft limit, send a SIGXCPU every second.
1211 */ 1215 */
1212 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 1216 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1213 if (sig->rlim[RLIMIT_CPU].rlim_cur 1217 if (soft < hard) {
1214 < sig->rlim[RLIMIT_CPU].rlim_max) { 1218 soft++;
1215 sig->rlim[RLIMIT_CPU].rlim_cur++; 1219 sig->rlim[RLIMIT_CPU].rlim_cur = soft;
1216 } 1220 }
1217 } 1221 }
1218 x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 1222 x = secs_to_cputime(soft);
1219 if (cputime_eq(prof_expires, cputime_zero) || 1223 if (cputime_eq(prof_expires, cputime_zero) ||
1220 cputime_lt(x, prof_expires)) { 1224 cputime_lt(x, prof_expires)) {
1221 prof_expires = x; 1225 prof_expires = x;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 495440779ce3..00d1fda58ab6 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -256,7 +256,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
256 return 0; 256 return 0;
257} 257}
258 258
259int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) 259static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
260{ 260{
261 *tp = ktime_to_timespec(KTIME_LOW_RES); 261 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0; 262 return 0;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 91e09d3b2eb2..5c36ea9d55d2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,15 @@ config PM_DEBUG
27 code. This is helpful when debugging and reporting PM bugs, like 27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support. 28 suspend support.
29 29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
30config PM_VERBOSE 39config PM_VERBOSE
31 bool "Verbose Power Management debugging" 40 bool "Verbose Power Management debugging"
32 depends on PM_DEBUG 41 depends on PM_DEBUG
@@ -85,6 +94,11 @@ config PM_SLEEP
85 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE 94 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
86 default y 95 default y
87 96
97config PM_SLEEP_ADVANCED_DEBUG
98 bool
99 depends on PM_ADVANCED_DEBUG
100 default n
101
88config SUSPEND 102config SUSPEND
89 bool "Suspend to RAM and standby" 103 bool "Suspend to RAM and standby"
90 depends on PM && ARCH_SUSPEND_POSSIBLE 104 depends on PM && ARCH_SUSPEND_POSSIBLE
@@ -222,3 +236,8 @@ config PM_RUNTIME
222 and the bus type drivers of the buses the devices are on are 236 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and 237 responsible for the actual handling of the autosuspend requests and
224 wake-up events. 238 wake-up events.
239
240config PM_OPS
241 bool
242 depends on PM_SLEEP || PM_RUNTIME
243 default y
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index bbfe472d7524..da5288ec2392 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -323,6 +323,7 @@ static int create_image(int platform_mode)
323int hibernation_snapshot(int platform_mode) 323int hibernation_snapshot(int platform_mode)
324{ 324{
325 int error; 325 int error;
326 gfp_t saved_mask;
326 327
327 error = platform_begin(platform_mode); 328 error = platform_begin(platform_mode);
328 if (error) 329 if (error)
@@ -334,6 +335,7 @@ int hibernation_snapshot(int platform_mode)
334 goto Close; 335 goto Close;
335 336
336 suspend_console(); 337 suspend_console();
338 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
337 error = dpm_suspend_start(PMSG_FREEZE); 339 error = dpm_suspend_start(PMSG_FREEZE);
338 if (error) 340 if (error)
339 goto Recover_platform; 341 goto Recover_platform;
@@ -351,6 +353,7 @@ int hibernation_snapshot(int platform_mode)
351 353
352 dpm_resume_end(in_suspend ? 354 dpm_resume_end(in_suspend ?
353 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 355 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
356 set_gfp_allowed_mask(saved_mask);
354 resume_console(); 357 resume_console();
355 Close: 358 Close:
356 platform_end(platform_mode); 359 platform_end(platform_mode);
@@ -445,14 +448,17 @@ static int resume_target_kernel(bool platform_mode)
445int hibernation_restore(int platform_mode) 448int hibernation_restore(int platform_mode)
446{ 449{
447 int error; 450 int error;
451 gfp_t saved_mask;
448 452
449 pm_prepare_console(); 453 pm_prepare_console();
450 suspend_console(); 454 suspend_console();
455 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
451 error = dpm_suspend_start(PMSG_QUIESCE); 456 error = dpm_suspend_start(PMSG_QUIESCE);
452 if (!error) { 457 if (!error) {
453 error = resume_target_kernel(platform_mode); 458 error = resume_target_kernel(platform_mode);
454 dpm_resume_end(PMSG_RECOVER); 459 dpm_resume_end(PMSG_RECOVER);
455 } 460 }
461 set_gfp_allowed_mask(saved_mask);
456 resume_console(); 462 resume_console();
457 pm_restore_console(); 463 pm_restore_console();
458 return error; 464 return error;
@@ -466,6 +472,7 @@ int hibernation_restore(int platform_mode)
466int hibernation_platform_enter(void) 472int hibernation_platform_enter(void)
467{ 473{
468 int error; 474 int error;
475 gfp_t saved_mask;
469 476
470 if (!hibernation_ops) 477 if (!hibernation_ops)
471 return -ENOSYS; 478 return -ENOSYS;
@@ -481,6 +488,7 @@ int hibernation_platform_enter(void)
481 488
482 entering_platform_hibernation = true; 489 entering_platform_hibernation = true;
483 suspend_console(); 490 suspend_console();
491 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
484 error = dpm_suspend_start(PMSG_HIBERNATE); 492 error = dpm_suspend_start(PMSG_HIBERNATE);
485 if (error) { 493 if (error) {
486 if (hibernation_ops->recover) 494 if (hibernation_ops->recover)
@@ -518,6 +526,7 @@ int hibernation_platform_enter(void)
518 Resume_devices: 526 Resume_devices:
519 entering_platform_hibernation = false; 527 entering_platform_hibernation = false;
520 dpm_resume_end(PMSG_RESTORE); 528 dpm_resume_end(PMSG_RESTORE);
529 set_gfp_allowed_mask(saved_mask);
521 resume_console(); 530 resume_console();
522 531
523 Close: 532 Close:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0998c7139053..b58800b21fc0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val)
44 == NOTIFY_BAD) ? -EINVAL : 0; 44 == NOTIFY_BAD) ? -EINVAL : 0;
45} 45}
46 46
47/* If set, devices may be suspended and resumed asynchronously. */
48int pm_async_enabled = 1;
49
50static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
51 char *buf)
52{
53 return sprintf(buf, "%d\n", pm_async_enabled);
54}
55
56static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
57 const char *buf, size_t n)
58{
59 unsigned long val;
60
61 if (strict_strtoul(buf, 10, &val))
62 return -EINVAL;
63
64 if (val > 1)
65 return -EINVAL;
66
67 pm_async_enabled = val;
68 return n;
69}
70
71power_attr(pm_async);
72
47#ifdef CONFIG_PM_DEBUG 73#ifdef CONFIG_PM_DEBUG
48int pm_test_level = TEST_NONE; 74int pm_test_level = TEST_NONE;
49 75
@@ -208,9 +234,12 @@ static struct attribute * g[] = {
208#ifdef CONFIG_PM_TRACE 234#ifdef CONFIG_PM_TRACE
209 &pm_trace_attr.attr, 235 &pm_trace_attr.attr,
210#endif 236#endif
211#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) 237#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr,
239#ifdef CONFIG_PM_DEBUG
212 &pm_test_attr.attr, 240 &pm_test_attr.attr,
213#endif 241#endif
242#endif
214 NULL, 243 NULL,
215}; 244};
216 245
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 36cb168e4330..830cadecbdfc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1181,7 +1181,7 @@ static void free_unnecessary_pages(void)
1181 1181
1182 memory_bm_position_reset(&copy_bm); 1182 memory_bm_position_reset(&copy_bm);
1183 1183
1184 while (to_free_normal > 0 && to_free_highmem > 0) { 1184 while (to_free_normal > 0 || to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn); 1186 struct page *page = pfn_to_page(pfn);
1187 1187
@@ -1500,7 +1500,7 @@ asmlinkage int swsusp_save(void)
1500{ 1500{
1501 unsigned int nr_pages, nr_highmem; 1501 unsigned int nr_pages, nr_highmem;
1502 1502
1503 printk(KERN_INFO "PM: Creating hibernation image: \n"); 1503 printk(KERN_INFO "PM: Creating hibernation image:\n");
1504 1504
1505 drain_local_pages(NULL); 1505 drain_local_pages(NULL);
1506 nr_pages = count_data_pages(); 1506 nr_pages = count_data_pages();
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6f10dfc2d3e9..44cce10b582d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -189,6 +189,7 @@ static int suspend_enter(suspend_state_t state)
189int suspend_devices_and_enter(suspend_state_t state) 189int suspend_devices_and_enter(suspend_state_t state)
190{ 190{
191 int error; 191 int error;
192 gfp_t saved_mask;
192 193
193 if (!suspend_ops) 194 if (!suspend_ops)
194 return -ENOSYS; 195 return -ENOSYS;
@@ -199,6 +200,7 @@ int suspend_devices_and_enter(suspend_state_t state)
199 goto Close; 200 goto Close;
200 } 201 }
201 suspend_console(); 202 suspend_console();
203 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
202 suspend_test_start(); 204 suspend_test_start();
203 error = dpm_suspend_start(PMSG_SUSPEND); 205 error = dpm_suspend_start(PMSG_SUSPEND);
204 if (error) { 206 if (error) {
@@ -215,6 +217,7 @@ int suspend_devices_and_enter(suspend_state_t state)
215 suspend_test_start(); 217 suspend_test_start();
216 dpm_resume_end(PMSG_RESUME); 218 dpm_resume_end(PMSG_RESUME);
217 suspend_test_finish("resume devices"); 219 suspend_test_finish("resume devices");
220 set_gfp_allowed_mask(saved_mask);
218 resume_console(); 221 resume_console();
219 Close: 222 Close:
220 if (suspend_ops->end) 223 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 09b2b0ae9e9d..1d575733d4e1 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -657,10 +657,6 @@ int swsusp_read(unsigned int *flags_p)
657 struct swsusp_info *header; 657 struct swsusp_info *header;
658 658
659 *flags_p = swsusp_header->flags; 659 *flags_p = swsusp_header->flags;
660 if (IS_ERR(resume_bdev)) {
661 pr_debug("PM: Image device not initialised\n");
662 return PTR_ERR(resume_bdev);
663 }
664 660
665 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 661 memset(&snapshot, 0, sizeof(struct snapshot_handle));
666 error = snapshot_write_next(&snapshot, PAGE_SIZE); 662 error = snapshot_write_next(&snapshot, PAGE_SIZE);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
deleted file mode 100644
index 5b3601bd1893..000000000000
--- a/kernel/power/swsusp.c
+++ /dev/null
@@ -1,58 +0,0 @@
1/*
2 * linux/kernel/power/swsusp.c
3 *
4 * This file provides code to write suspend image to swap and read it back.
5 *
6 * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8 *
9 * This file is released under the GPLv2.
10 *
11 * I'd like to thank the following people for their work:
12 *
13 * Pavel Machek <pavel@ucw.cz>:
14 * Modifications, defectiveness pointing, being with me at the very beginning,
15 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
16 *
17 * Steve Doddi <dirk@loth.demon.co.uk>:
18 * Support the possibility of hardware state restoring.
19 *
20 * Raph <grey.havens@earthling.net>:
21 * Support for preserving states of network devices and virtual console
22 * (including X and svgatextmode)
23 *
24 * Kurt Garloff <garloff@suse.de>:
25 * Straightened the critical function in order to prevent compilers from
26 * playing tricks with local variables.
27 *
28 * Andreas Mohr <a.mohr@mailto.de>
29 *
30 * Alex Badea <vampire@go.ro>:
31 * Fixed runaway init
32 *
33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Reworked the freeing of memory and the handling of swap
35 *
36 * More state savers are welcome. Especially for the scsi layer...
37 *
38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
39 */
40
41#include <linux/mm.h>
42#include <linux/suspend.h>
43#include <linux/spinlock.h>
44#include <linux/kernel.h>
45#include <linux/major.h>
46#include <linux/swap.h>
47#include <linux/pm.h>
48#include <linux/swapops.h>
49#include <linux/bootmem.h>
50#include <linux/syscalls.h>
51#include <linux/highmem.h>
52#include <linux/time.h>
53#include <linux/rbtree.h>
54#include <linux/io.h>
55
56#include "power.h"
57
58int in_suspend __nosavedata = 0;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bf0014d6a5f0..4d2289626a84 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -195,6 +195,15 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
195 return res; 195 return res;
196} 196}
197 197
198static void snapshot_deprecated_ioctl(unsigned int cmd)
199{
200 if (printk_ratelimit())
201 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
202 "be removed soon, update your suspend-to-disk "
203 "utilities\n",
204 __builtin_return_address(0), cmd);
205}
206
198static long snapshot_ioctl(struct file *filp, unsigned int cmd, 207static long snapshot_ioctl(struct file *filp, unsigned int cmd,
199 unsigned long arg) 208 unsigned long arg)
200{ 209{
@@ -246,8 +255,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
246 data->frozen = 0; 255 data->frozen = 0;
247 break; 256 break;
248 257
249 case SNAPSHOT_CREATE_IMAGE:
250 case SNAPSHOT_ATOMIC_SNAPSHOT: 258 case SNAPSHOT_ATOMIC_SNAPSHOT:
259 snapshot_deprecated_ioctl(cmd);
260 case SNAPSHOT_CREATE_IMAGE:
251 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 261 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
252 error = -EPERM; 262 error = -EPERM;
253 break; 263 break;
@@ -275,8 +285,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 data->ready = 0; 285 data->ready = 0;
276 break; 286 break;
277 287
278 case SNAPSHOT_PREF_IMAGE_SIZE:
279 case SNAPSHOT_SET_IMAGE_SIZE: 288 case SNAPSHOT_SET_IMAGE_SIZE:
289 snapshot_deprecated_ioctl(cmd);
290 case SNAPSHOT_PREF_IMAGE_SIZE:
280 image_size = arg; 291 image_size = arg;
281 break; 292 break;
282 293
@@ -290,15 +301,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
290 error = put_user(size, (loff_t __user *)arg); 301 error = put_user(size, (loff_t __user *)arg);
291 break; 302 break;
292 303
293 case SNAPSHOT_AVAIL_SWAP_SIZE:
294 case SNAPSHOT_AVAIL_SWAP: 304 case SNAPSHOT_AVAIL_SWAP:
305 snapshot_deprecated_ioctl(cmd);
306 case SNAPSHOT_AVAIL_SWAP_SIZE:
295 size = count_swap_pages(data->swap, 1); 307 size = count_swap_pages(data->swap, 1);
296 size <<= PAGE_SHIFT; 308 size <<= PAGE_SHIFT;
297 error = put_user(size, (loff_t __user *)arg); 309 error = put_user(size, (loff_t __user *)arg);
298 break; 310 break;
299 311
300 case SNAPSHOT_ALLOC_SWAP_PAGE:
301 case SNAPSHOT_GET_SWAP_PAGE: 312 case SNAPSHOT_GET_SWAP_PAGE:
313 snapshot_deprecated_ioctl(cmd);
314 case SNAPSHOT_ALLOC_SWAP_PAGE:
302 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 315 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
303 error = -ENODEV; 316 error = -ENODEV;
304 break; 317 break;
@@ -321,6 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
321 break; 334 break;
322 335
323 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ 336 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
337 snapshot_deprecated_ioctl(cmd);
324 if (!swsusp_swap_in_use()) { 338 if (!swsusp_swap_in_use()) {
325 /* 339 /*
326 * User space encodes device types as two-byte values, 340 * User space encodes device types as two-byte values,
@@ -362,6 +376,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
362 break; 376 break;
363 377
364 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ 378 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
379 snapshot_deprecated_ioctl(cmd);
365 error = -EINVAL; 380 error = -EINVAL;
366 381
367 switch (arg) { 382 switch (arg) {
diff --git a/kernel/printk.c b/kernel/printk.c
index 1751c456b71f..75077ad0b537 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -35,6 +35,7 @@
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h> 36#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h> 37#include <linux/kmsg_dump.h>
38#include <linux/syslog.h>
38 39
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40 41
@@ -69,8 +70,6 @@ int console_printk[4] = {
69 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 70 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
70}; 71};
71 72
72static int saved_console_loglevel = -1;
73
74/* 73/*
75 * Low level drivers may need that to know if they can schedule in 74 * Low level drivers may need that to know if they can schedule in
76 * their unblank() callback or not. So let's export it. 75 * their unblank() callback or not. So let's export it.
@@ -145,6 +144,7 @@ static char __log_buf[__LOG_BUF_LEN];
145static char *log_buf = __log_buf; 144static char *log_buf = __log_buf;
146static int log_buf_len = __LOG_BUF_LEN; 145static int log_buf_len = __LOG_BUF_LEN;
147static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 146static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
147static int saved_console_loglevel = -1;
148 148
149#ifdef CONFIG_KEXEC 149#ifdef CONFIG_KEXEC
150/* 150/*
@@ -258,38 +258,23 @@ static inline void boot_delay_msec(void)
258} 258}
259#endif 259#endif
260 260
261/* 261int do_syslog(int type, char __user *buf, int len, bool from_file)
262 * Commands to do_syslog:
263 *
264 * 0 -- Close the log. Currently a NOP.
265 * 1 -- Open the log. Currently a NOP.
266 * 2 -- Read from the log.
267 * 3 -- Read all messages remaining in the ring buffer.
268 * 4 -- Read and clear all messages remaining in the ring buffer
269 * 5 -- Clear ring buffer.
270 * 6 -- Disable printk's to console
271 * 7 -- Enable printk's to console
272 * 8 -- Set level of messages printed to console
273 * 9 -- Return number of unread characters in the log buffer
274 * 10 -- Return size of the log buffer
275 */
276int do_syslog(int type, char __user *buf, int len)
277{ 262{
278 unsigned i, j, limit, count; 263 unsigned i, j, limit, count;
279 int do_clear = 0; 264 int do_clear = 0;
280 char c; 265 char c;
281 int error = 0; 266 int error = 0;
282 267
283 error = security_syslog(type); 268 error = security_syslog(type, from_file);
284 if (error) 269 if (error)
285 return error; 270 return error;
286 271
287 switch (type) { 272 switch (type) {
288 case 0: /* Close log */ 273 case SYSLOG_ACTION_CLOSE: /* Close log */
289 break; 274 break;
290 case 1: /* Open log */ 275 case SYSLOG_ACTION_OPEN: /* Open log */
291 break; 276 break;
292 case 2: /* Read from log */ 277 case SYSLOG_ACTION_READ: /* Read from log */
293 error = -EINVAL; 278 error = -EINVAL;
294 if (!buf || len < 0) 279 if (!buf || len < 0)
295 goto out; 280 goto out;
@@ -320,10 +305,12 @@ int do_syslog(int type, char __user *buf, int len)
320 if (!error) 305 if (!error)
321 error = i; 306 error = i;
322 break; 307 break;
323 case 4: /* Read/clear last kernel messages */ 308 /* Read/clear last kernel messages */
309 case SYSLOG_ACTION_READ_CLEAR:
324 do_clear = 1; 310 do_clear = 1;
325 /* FALL THRU */ 311 /* FALL THRU */
326 case 3: /* Read last kernel messages */ 312 /* Read last kernel messages */
313 case SYSLOG_ACTION_READ_ALL:
327 error = -EINVAL; 314 error = -EINVAL;
328 if (!buf || len < 0) 315 if (!buf || len < 0)
329 goto out; 316 goto out;
@@ -376,21 +363,25 @@ int do_syslog(int type, char __user *buf, int len)
376 } 363 }
377 } 364 }
378 break; 365 break;
379 case 5: /* Clear ring buffer */ 366 /* Clear ring buffer */
367 case SYSLOG_ACTION_CLEAR:
380 logged_chars = 0; 368 logged_chars = 0;
381 break; 369 break;
382 case 6: /* Disable logging to console */ 370 /* Disable logging to console */
371 case SYSLOG_ACTION_CONSOLE_OFF:
383 if (saved_console_loglevel == -1) 372 if (saved_console_loglevel == -1)
384 saved_console_loglevel = console_loglevel; 373 saved_console_loglevel = console_loglevel;
385 console_loglevel = minimum_console_loglevel; 374 console_loglevel = minimum_console_loglevel;
386 break; 375 break;
387 case 7: /* Enable logging to console */ 376 /* Enable logging to console */
377 case SYSLOG_ACTION_CONSOLE_ON:
388 if (saved_console_loglevel != -1) { 378 if (saved_console_loglevel != -1) {
389 console_loglevel = saved_console_loglevel; 379 console_loglevel = saved_console_loglevel;
390 saved_console_loglevel = -1; 380 saved_console_loglevel = -1;
391 } 381 }
392 break; 382 break;
393 case 8: /* Set level of messages printed to console */ 383 /* Set level of messages printed to console */
384 case SYSLOG_ACTION_CONSOLE_LEVEL:
394 error = -EINVAL; 385 error = -EINVAL;
395 if (len < 1 || len > 8) 386 if (len < 1 || len > 8)
396 goto out; 387 goto out;
@@ -401,10 +392,12 @@ int do_syslog(int type, char __user *buf, int len)
401 saved_console_loglevel = -1; 392 saved_console_loglevel = -1;
402 error = 0; 393 error = 0;
403 break; 394 break;
404 case 9: /* Number of chars in the log buffer */ 395 /* Number of chars in the log buffer */
396 case SYSLOG_ACTION_SIZE_UNREAD:
405 error = log_end - log_start; 397 error = log_end - log_start;
406 break; 398 break;
407 case 10: /* Size of the log buffer */ 399 /* Size of the log buffer */
400 case SYSLOG_ACTION_SIZE_BUFFER:
408 error = log_buf_len; 401 error = log_buf_len;
409 break; 402 break;
410 default: 403 default:
@@ -417,7 +410,7 @@ out:
417 410
418SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 411SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
419{ 412{
420 return do_syslog(type, buf, len); 413 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
421} 414}
422 415
423/* 416/*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 23bd09cd042e..42ad8ae729a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/regset.h>
25 26
26 27
27/* 28/*
@@ -511,6 +512,47 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
511 return 0; 512 return 0;
512} 513}
513 514
515#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
516
517static const struct user_regset *
518find_regset(const struct user_regset_view *view, unsigned int type)
519{
520 const struct user_regset *regset;
521 int n;
522
523 for (n = 0; n < view->n; ++n) {
524 regset = view->regsets + n;
525 if (regset->core_note_type == type)
526 return regset;
527 }
528
529 return NULL;
530}
531
532static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
533 struct iovec *kiov)
534{
535 const struct user_regset_view *view = task_user_regset_view(task);
536 const struct user_regset *regset = find_regset(view, type);
537 int regset_no;
538
539 if (!regset || (kiov->iov_len % regset->size) != 0)
540 return -EINVAL;
541
542 regset_no = regset - view->regsets;
543 kiov->iov_len = min(kiov->iov_len,
544 (__kernel_size_t) (regset->n * regset->size));
545
546 if (req == PTRACE_GETREGSET)
547 return copy_regset_to_user(task, view, regset_no, 0,
548 kiov->iov_len, kiov->iov_base);
549 else
550 return copy_regset_from_user(task, view, regset_no, 0,
551 kiov->iov_len, kiov->iov_base);
552}
553
554#endif
555
514int ptrace_request(struct task_struct *child, long request, 556int ptrace_request(struct task_struct *child, long request,
515 long addr, long data) 557 long addr, long data)
516{ 558{
@@ -573,6 +615,26 @@ int ptrace_request(struct task_struct *child, long request,
573 return 0; 615 return 0;
574 return ptrace_resume(child, request, SIGKILL); 616 return ptrace_resume(child, request, SIGKILL);
575 617
618#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
619 case PTRACE_GETREGSET:
620 case PTRACE_SETREGSET:
621 {
622 struct iovec kiov;
623 struct iovec __user *uiov = (struct iovec __user *) data;
624
625 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
626 return -EFAULT;
627
628 if (__get_user(kiov.iov_base, &uiov->iov_base) ||
629 __get_user(kiov.iov_len, &uiov->iov_len))
630 return -EFAULT;
631
632 ret = ptrace_regset(child, request, addr, &kiov);
633 if (!ret)
634 ret = __put_user(kiov.iov_len, &uiov->iov_len);
635 break;
636 }
637#endif
576 default: 638 default:
577 break; 639 break;
578 } 640 }
@@ -711,6 +773,32 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
711 else 773 else
712 ret = ptrace_setsiginfo(child, &siginfo); 774 ret = ptrace_setsiginfo(child, &siginfo);
713 break; 775 break;
776#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
777 case PTRACE_GETREGSET:
778 case PTRACE_SETREGSET:
779 {
780 struct iovec kiov;
781 struct compat_iovec __user *uiov =
782 (struct compat_iovec __user *) datap;
783 compat_uptr_t ptr;
784 compat_size_t len;
785
786 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
787 return -EFAULT;
788
789 if (__get_user(ptr, &uiov->iov_base) ||
790 __get_user(len, &uiov->iov_len))
791 return -EFAULT;
792
793 kiov.iov_base = compat_ptr(ptr);
794 kiov.iov_len = len;
795
796 ret = ptrace_regset(child, request, addr, &kiov);
797 if (!ret)
798 ret = __put_user(kiov.iov_len, &uiov->iov_len);
799 break;
800 }
801#endif
714 802
715 default: 803 default:
716 ret = ptrace_request(child, request, addr, data); 804 ret = ptrace_request(child, request, addr, data);
diff --git a/kernel/range.c b/kernel/range.c
new file mode 100644
index 000000000000..74e2e6114927
--- /dev/null
+++ b/kernel/range.c
@@ -0,0 +1,163 @@
1/*
2 * Range add and subtract
3 */
4#include <linux/module.h>
5#include <linux/init.h>
6#include <linux/sort.h>
7
8#include <linux/range.h>
9
10#ifndef ARRAY_SIZE
11#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
12#endif
13
14int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
15{
16 if (start >= end)
17 return nr_range;
18
19 /* Out of slots: */
20 if (nr_range >= az)
21 return nr_range;
22
23 range[nr_range].start = start;
24 range[nr_range].end = end;
25
26 nr_range++;
27
28 return nr_range;
29}
30
31int add_range_with_merge(struct range *range, int az, int nr_range,
32 u64 start, u64 end)
33{
34 int i;
35
36 if (start >= end)
37 return nr_range;
38
39 /* Try to merge it with old one: */
40 for (i = 0; i < nr_range; i++) {
41 u64 final_start, final_end;
42 u64 common_start, common_end;
43
44 if (!range[i].end)
45 continue;
46
47 common_start = max(range[i].start, start);
48 common_end = min(range[i].end, end);
49 if (common_start > common_end)
50 continue;
51
52 final_start = min(range[i].start, start);
53 final_end = max(range[i].end, end);
54
55 range[i].start = final_start;
56 range[i].end = final_end;
57 return nr_range;
58 }
59
60 /* Need to add it: */
61 return add_range(range, az, nr_range, start, end);
62}
63
64void subtract_range(struct range *range, int az, u64 start, u64 end)
65{
66 int i, j;
67
68 if (start >= end)
69 return;
70
71 for (j = 0; j < az; j++) {
72 if (!range[j].end)
73 continue;
74
75 if (start <= range[j].start && end >= range[j].end) {
76 range[j].start = 0;
77 range[j].end = 0;
78 continue;
79 }
80
81 if (start <= range[j].start && end < range[j].end &&
82 range[j].start < end) {
83 range[j].start = end;
84 continue;
85 }
86
87
88 if (start > range[j].start && end >= range[j].end &&
89 range[j].end > start) {
90 range[j].end = start;
91 continue;
92 }
93
94 if (start > range[j].start && end < range[j].end) {
95 /* Find the new spare: */
96 for (i = 0; i < az; i++) {
97 if (range[i].end == 0)
98 break;
99 }
100 if (i < az) {
101 range[i].end = range[j].end;
102 range[i].start = end;
103 } else {
104 printk(KERN_ERR "run of slot in ranges\n");
105 }
106 range[j].end = start;
107 continue;
108 }
109 }
110}
111
112static int cmp_range(const void *x1, const void *x2)
113{
114 const struct range *r1 = x1;
115 const struct range *r2 = x2;
116 s64 start1, start2;
117
118 start1 = r1->start;
119 start2 = r2->start;
120
121 return start1 - start2;
122}
123
124int clean_sort_range(struct range *range, int az)
125{
126 int i, j, k = az - 1, nr_range = 0;
127
128 for (i = 0; i < k; i++) {
129 if (range[i].end)
130 continue;
131 for (j = k; j > i; j--) {
132 if (range[j].end) {
133 k = j;
134 break;
135 }
136 }
137 if (j == i)
138 break;
139 range[i].start = range[k].start;
140 range[i].end = range[k].end;
141 range[k].start = 0;
142 range[k].end = 0;
143 k--;
144 }
145 /* count it */
146 for (i = 0; i < az; i++) {
147 if (!range[i].end) {
148 nr_range = i;
149 break;
150 }
151 }
152
153 /* sort them */
154 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
155
156 return nr_range;
157}
158
159void sort_range(struct range *range, int nr_range)
160{
161 /* sort them */
162 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
163}
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 9b7fd4723878..f1125c1a6321 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,14 +44,43 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
47 48
48#ifdef CONFIG_DEBUG_LOCK_ALLOC 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
49static struct lock_class_key rcu_lock_key; 50static struct lock_class_key rcu_lock_key;
50struct lockdep_map rcu_lock_map = 51struct lockdep_map rcu_lock_map =
51 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); 52 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
52EXPORT_SYMBOL_GPL(rcu_lock_map); 53EXPORT_SYMBOL_GPL(rcu_lock_map);
54
55static struct lock_class_key rcu_bh_lock_key;
56struct lockdep_map rcu_bh_lock_map =
57 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key);
58EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
59
60static struct lock_class_key rcu_sched_lock_key;
61struct lockdep_map rcu_sched_lock_map =
62 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
63EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
53#endif 64#endif
54 65
66int rcu_scheduler_active __read_mostly;
67EXPORT_SYMBOL_GPL(rcu_scheduler_active);
68
69/*
70 * This function is invoked towards the end of the scheduler's initialization
71 * process. Before this is called, the idle task might contain
72 * RCU read-side critical sections (during which time, this idle
73 * task is booting the system). After this function is called, the
74 * idle tasks are prohibited from containing RCU read-side critical
75 * sections.
76 */
77void rcu_scheduler_starting(void)
78{
79 WARN_ON(num_online_cpus() != 1);
80 WARN_ON(nr_context_switches() > 0);
81 rcu_scheduler_active = 1;
82}
83
55/* 84/*
56 * Awaken the corresponding synchronize_rcu() instance now that a 85 * Awaken the corresponding synchronize_rcu() instance now that a
57 * grace period has elapsed. 86 * grace period has elapsed.
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9bb52177af02..58df55bf83ed 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -61,6 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */
64static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 67static char *torture_type = "rcu"; /* What RCU implementation to torture. */
65 68
66module_param(nreaders, int, 0444); 69module_param(nreaders, int, 0444);
@@ -79,6 +82,12 @@ module_param(stutter, int, 0444);
79MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); 82MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
80module_param(irqreader, int, 0444); 83module_param(irqreader, int, 0444);
81MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); 84MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
85module_param(fqs_duration, int, 0444);
86MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)");
87module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
82module_param(torture_type, charp, 0444); 91module_param(torture_type, charp, 0444);
83MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
84 93
@@ -99,6 +108,7 @@ static struct task_struct **reader_tasks;
99static struct task_struct *stats_task; 108static struct task_struct *stats_task;
100static struct task_struct *shuffler_task; 109static struct task_struct *shuffler_task;
101static struct task_struct *stutter_task; 110static struct task_struct *stutter_task;
111static struct task_struct *fqs_task;
102 112
103#define RCU_TORTURE_PIPE_LEN 10 113#define RCU_TORTURE_PIPE_LEN 10
104 114
@@ -263,6 +273,7 @@ struct rcu_torture_ops {
263 void (*deferred_free)(struct rcu_torture *p); 273 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 274 void (*sync)(void);
265 void (*cb_barrier)(void); 275 void (*cb_barrier)(void);
276 void (*fqs)(void);
266 int (*stats)(char *page); 277 int (*stats)(char *page);
267 int irq_capable; 278 int irq_capable;
268 char *name; 279 char *name;
@@ -347,6 +358,7 @@ static struct rcu_torture_ops rcu_ops = {
347 .deferred_free = rcu_torture_deferred_free, 358 .deferred_free = rcu_torture_deferred_free,
348 .sync = synchronize_rcu, 359 .sync = synchronize_rcu,
349 .cb_barrier = rcu_barrier, 360 .cb_barrier = rcu_barrier,
361 .fqs = rcu_force_quiescent_state,
350 .stats = NULL, 362 .stats = NULL,
351 .irq_capable = 1, 363 .irq_capable = 1,
352 .name = "rcu" 364 .name = "rcu"
@@ -388,6 +400,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
388 .deferred_free = rcu_sync_torture_deferred_free, 400 .deferred_free = rcu_sync_torture_deferred_free,
389 .sync = synchronize_rcu, 401 .sync = synchronize_rcu,
390 .cb_barrier = NULL, 402 .cb_barrier = NULL,
403 .fqs = rcu_force_quiescent_state,
391 .stats = NULL, 404 .stats = NULL,
392 .irq_capable = 1, 405 .irq_capable = 1,
393 .name = "rcu_sync" 406 .name = "rcu_sync"
@@ -403,6 +416,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
403 .deferred_free = rcu_sync_torture_deferred_free, 416 .deferred_free = rcu_sync_torture_deferred_free,
404 .sync = synchronize_rcu_expedited, 417 .sync = synchronize_rcu_expedited,
405 .cb_barrier = NULL, 418 .cb_barrier = NULL,
419 .fqs = rcu_force_quiescent_state,
406 .stats = NULL, 420 .stats = NULL,
407 .irq_capable = 1, 421 .irq_capable = 1,
408 .name = "rcu_expedited" 422 .name = "rcu_expedited"
@@ -465,6 +479,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
465 .deferred_free = rcu_bh_torture_deferred_free, 479 .deferred_free = rcu_bh_torture_deferred_free,
466 .sync = rcu_bh_torture_synchronize, 480 .sync = rcu_bh_torture_synchronize,
467 .cb_barrier = rcu_barrier_bh, 481 .cb_barrier = rcu_barrier_bh,
482 .fqs = rcu_bh_force_quiescent_state,
468 .stats = NULL, 483 .stats = NULL,
469 .irq_capable = 1, 484 .irq_capable = 1,
470 .name = "rcu_bh" 485 .name = "rcu_bh"
@@ -480,6 +495,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
480 .deferred_free = rcu_sync_torture_deferred_free, 495 .deferred_free = rcu_sync_torture_deferred_free,
481 .sync = rcu_bh_torture_synchronize, 496 .sync = rcu_bh_torture_synchronize,
482 .cb_barrier = NULL, 497 .cb_barrier = NULL,
498 .fqs = rcu_bh_force_quiescent_state,
483 .stats = NULL, 499 .stats = NULL,
484 .irq_capable = 1, 500 .irq_capable = 1,
485 .name = "rcu_bh_sync" 501 .name = "rcu_bh_sync"
@@ -621,6 +637,7 @@ static struct rcu_torture_ops sched_ops = {
621 .deferred_free = rcu_sched_torture_deferred_free, 637 .deferred_free = rcu_sched_torture_deferred_free,
622 .sync = sched_torture_synchronize, 638 .sync = sched_torture_synchronize,
623 .cb_barrier = rcu_barrier_sched, 639 .cb_barrier = rcu_barrier_sched,
640 .fqs = rcu_sched_force_quiescent_state,
624 .stats = NULL, 641 .stats = NULL,
625 .irq_capable = 1, 642 .irq_capable = 1,
626 .name = "sched" 643 .name = "sched"
@@ -636,6 +653,7 @@ static struct rcu_torture_ops sched_sync_ops = {
636 .deferred_free = rcu_sync_torture_deferred_free, 653 .deferred_free = rcu_sync_torture_deferred_free,
637 .sync = sched_torture_synchronize, 654 .sync = sched_torture_synchronize,
638 .cb_barrier = NULL, 655 .cb_barrier = NULL,
656 .fqs = rcu_sched_force_quiescent_state,
639 .stats = NULL, 657 .stats = NULL,
640 .name = "sched_sync" 658 .name = "sched_sync"
641}; 659};
@@ -650,12 +668,45 @@ static struct rcu_torture_ops sched_expedited_ops = {
650 .deferred_free = rcu_sync_torture_deferred_free, 668 .deferred_free = rcu_sync_torture_deferred_free,
651 .sync = synchronize_sched_expedited, 669 .sync = synchronize_sched_expedited,
652 .cb_barrier = NULL, 670 .cb_barrier = NULL,
671 .fqs = rcu_sched_force_quiescent_state,
653 .stats = rcu_expedited_torture_stats, 672 .stats = rcu_expedited_torture_stats,
654 .irq_capable = 1, 673 .irq_capable = 1,
655 .name = "sched_expedited" 674 .name = "sched_expedited"
656}; 675};
657 676
658/* 677/*
678 * RCU torture force-quiescent-state kthread. Repeatedly induces
679 * bursts of calls to force_quiescent_state(), increasing the probability
680 * of occurrence of some important types of race conditions.
681 */
682static int
683rcu_torture_fqs(void *arg)
684{
685 unsigned long fqs_resume_time;
686 int fqs_burst_remaining;
687
688 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
689 do {
690 fqs_resume_time = jiffies + fqs_stutter * HZ;
691 while (jiffies - fqs_resume_time > LONG_MAX) {
692 schedule_timeout_interruptible(1);
693 }
694 fqs_burst_remaining = fqs_duration;
695 while (fqs_burst_remaining > 0) {
696 cur_ops->fqs();
697 udelay(fqs_holdoff);
698 fqs_burst_remaining -= fqs_holdoff;
699 }
700 rcu_stutter_wait("rcu_torture_fqs");
701 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
702 VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping");
703 rcutorture_shutdown_absorb("rcu_torture_fqs");
704 while (!kthread_should_stop())
705 schedule_timeout_uninterruptible(1);
706 return 0;
707}
708
709/*
659 * RCU torture writer kthread. Repeatedly substitutes a new structure 710 * RCU torture writer kthread. Repeatedly substitutes a new structure
660 * for that pointed to by rcu_torture_current, freeing the old structure 711 * for that pointed to by rcu_torture_current, freeing the old structure
661 * after a series of grace periods (the "pipeline"). 712 * after a series of grace periods (the "pipeline").
@@ -745,7 +796,11 @@ static void rcu_torture_timer(unsigned long unused)
745 796
746 idx = cur_ops->readlock(); 797 idx = cur_ops->readlock();
747 completed = cur_ops->completed(); 798 completed = cur_ops->completed();
748 p = rcu_dereference(rcu_torture_current); 799 p = rcu_dereference_check(rcu_torture_current,
800 rcu_read_lock_held() ||
801 rcu_read_lock_bh_held() ||
802 rcu_read_lock_sched_held() ||
803 srcu_read_lock_held(&srcu_ctl));
749 if (p == NULL) { 804 if (p == NULL) {
750 /* Leave because rcu_torture_writer is not yet underway */ 805 /* Leave because rcu_torture_writer is not yet underway */
751 cur_ops->readunlock(idx); 806 cur_ops->readunlock(idx);
@@ -763,13 +818,13 @@ static void rcu_torture_timer(unsigned long unused)
763 /* Should not happen, but... */ 818 /* Should not happen, but... */
764 pipe_count = RCU_TORTURE_PIPE_LEN; 819 pipe_count = RCU_TORTURE_PIPE_LEN;
765 } 820 }
766 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 821 __this_cpu_inc(rcu_torture_count[pipe_count]);
767 completed = cur_ops->completed() - completed; 822 completed = cur_ops->completed() - completed;
768 if (completed > RCU_TORTURE_PIPE_LEN) { 823 if (completed > RCU_TORTURE_PIPE_LEN) {
769 /* Should not happen, but... */ 824 /* Should not happen, but... */
770 completed = RCU_TORTURE_PIPE_LEN; 825 completed = RCU_TORTURE_PIPE_LEN;
771 } 826 }
772 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 827 __this_cpu_inc(rcu_torture_batch[completed]);
773 preempt_enable(); 828 preempt_enable();
774 cur_ops->readunlock(idx); 829 cur_ops->readunlock(idx);
775} 830}
@@ -798,11 +853,15 @@ rcu_torture_reader(void *arg)
798 do { 853 do {
799 if (irqreader && cur_ops->irq_capable) { 854 if (irqreader && cur_ops->irq_capable) {
800 if (!timer_pending(&t)) 855 if (!timer_pending(&t))
801 mod_timer(&t, 1); 856 mod_timer(&t, jiffies + 1);
802 } 857 }
803 idx = cur_ops->readlock(); 858 idx = cur_ops->readlock();
804 completed = cur_ops->completed(); 859 completed = cur_ops->completed();
805 p = rcu_dereference(rcu_torture_current); 860 p = rcu_dereference_check(rcu_torture_current,
861 rcu_read_lock_held() ||
862 rcu_read_lock_bh_held() ||
863 rcu_read_lock_sched_held() ||
864 srcu_read_lock_held(&srcu_ctl));
806 if (p == NULL) { 865 if (p == NULL) {
807 /* Wait for rcu_torture_writer to get underway */ 866 /* Wait for rcu_torture_writer to get underway */
808 cur_ops->readunlock(idx); 867 cur_ops->readunlock(idx);
@@ -818,13 +877,13 @@ rcu_torture_reader(void *arg)
818 /* Should not happen, but... */ 877 /* Should not happen, but... */
819 pipe_count = RCU_TORTURE_PIPE_LEN; 878 pipe_count = RCU_TORTURE_PIPE_LEN;
820 } 879 }
821 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 880 __this_cpu_inc(rcu_torture_count[pipe_count]);
822 completed = cur_ops->completed() - completed; 881 completed = cur_ops->completed() - completed;
823 if (completed > RCU_TORTURE_PIPE_LEN) { 882 if (completed > RCU_TORTURE_PIPE_LEN) {
824 /* Should not happen, but... */ 883 /* Should not happen, but... */
825 completed = RCU_TORTURE_PIPE_LEN; 884 completed = RCU_TORTURE_PIPE_LEN;
826 } 885 }
827 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 886 __this_cpu_inc(rcu_torture_batch[completed]);
828 preempt_enable(); 887 preempt_enable();
829 cur_ops->readunlock(idx); 888 cur_ops->readunlock(idx);
830 schedule(); 889 schedule();
@@ -1030,10 +1089,11 @@ rcu_torture_print_module_parms(char *tag)
1030 printk(KERN_ALERT "%s" TORTURE_FLAG 1089 printk(KERN_ALERT "%s" TORTURE_FLAG
1031 "--- %s: nreaders=%d nfakewriters=%d " 1090 "--- %s: nreaders=%d nfakewriters=%d "
1032 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1091 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1033 "shuffle_interval=%d stutter=%d irqreader=%d\n", 1092 "shuffle_interval=%d stutter=%d irqreader=%d "
1093 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
1034 torture_type, tag, nrealreaders, nfakewriters, 1094 torture_type, tag, nrealreaders, nfakewriters,
1035 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1095 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1036 stutter, irqreader); 1096 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
1037} 1097}
1038 1098
1039static struct notifier_block rcutorture_nb = { 1099static struct notifier_block rcutorture_nb = {
@@ -1109,6 +1169,12 @@ rcu_torture_cleanup(void)
1109 } 1169 }
1110 stats_task = NULL; 1170 stats_task = NULL;
1111 1171
1172 if (fqs_task) {
1173 VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
1174 kthread_stop(fqs_task);
1175 }
1176 fqs_task = NULL;
1177
1112 /* Wait for all RCU callbacks to fire. */ 1178 /* Wait for all RCU callbacks to fire. */
1113 1179
1114 if (cur_ops->cb_barrier != NULL) 1180 if (cur_ops->cb_barrier != NULL)
@@ -1154,6 +1220,11 @@ rcu_torture_init(void)
1154 mutex_unlock(&fullstop_mutex); 1220 mutex_unlock(&fullstop_mutex);
1155 return -EINVAL; 1221 return -EINVAL;
1156 } 1222 }
1223 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1224 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
1225 "fqs_duration, fqs disabled.\n");
1226 fqs_duration = 0;
1227 }
1157 if (cur_ops->init) 1228 if (cur_ops->init)
1158 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1229 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
1159 1230
@@ -1282,6 +1353,19 @@ rcu_torture_init(void)
1282 goto unwind; 1353 goto unwind;
1283 } 1354 }
1284 } 1355 }
1356 if (fqs_duration < 0)
1357 fqs_duration = 0;
1358 if (fqs_duration) {
1359 /* Create the fqs thread */
1360 fqs_task = kthread_run(rcu_torture_fqs, NULL,
1361 "rcu_torture_fqs");
1362 if (IS_ERR(fqs_task)) {
1363 firsterr = PTR_ERR(fqs_task);
1364 VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
1365 fqs_task = NULL;
1366 goto unwind;
1367 }
1368 }
1285 register_reboot_notifier(&rcutorture_nb); 1369 register_reboot_notifier(&rcutorture_nb);
1286 mutex_unlock(&fullstop_mutex); 1370 mutex_unlock(&fullstop_mutex);
1287 return 0; 1371 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 53ae9598f798..3ec8160fc75f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,7 +46,6 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
50 49
51#include "rcutree.h" 50#include "rcutree.h"
52 51
@@ -66,11 +65,11 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
66 .signaled = RCU_GP_IDLE, \ 65 .signaled = RCU_GP_IDLE, \
67 .gpnum = -300, \ 66 .gpnum = -300, \
68 .completed = -300, \ 67 .completed = -300, \
69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ 68 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \
70 .orphan_cbs_list = NULL, \ 69 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &name.orphan_cbs_list, \ 70 .orphan_cbs_tail = &name.orphan_cbs_list, \
72 .orphan_qlen = 0, \ 71 .orphan_qlen = 0, \
73 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ 72 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \
74 .n_force_qs = 0, \ 73 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 74 .n_force_qs_ngp = 0, \
76} 75}
@@ -81,9 +80,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 80struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 81DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
83 82
84static int rcu_scheduler_active __read_mostly;
85
86
87/* 83/*
88 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 84 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
89 * permit this function to be invoked without holding the root rcu_node 85 * permit this function to be invoked without holding the root rcu_node
@@ -157,6 +153,24 @@ long rcu_batches_completed_bh(void)
157EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); 153EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
158 154
159/* 155/*
156 * Force a quiescent state for RCU BH.
157 */
158void rcu_bh_force_quiescent_state(void)
159{
160 force_quiescent_state(&rcu_bh_state, 0);
161}
162EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
163
164/*
165 * Force a quiescent state for RCU-sched.
166 */
167void rcu_sched_force_quiescent_state(void)
168{
169 force_quiescent_state(&rcu_sched_state, 0);
170}
171EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
172
173/*
160 * Does the CPU have callbacks ready to be invoked? 174 * Does the CPU have callbacks ready to be invoked?
161 */ 175 */
162static int 176static int
@@ -439,10 +453,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
439 453
440 /* Only let one CPU complain about others per time interval. */ 454 /* Only let one CPU complain about others per time interval. */
441 455
442 spin_lock_irqsave(&rnp->lock, flags); 456 raw_spin_lock_irqsave(&rnp->lock, flags);
443 delta = jiffies - rsp->jiffies_stall; 457 delta = jiffies - rsp->jiffies_stall;
444 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 458 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
445 spin_unlock_irqrestore(&rnp->lock, flags); 459 raw_spin_unlock_irqrestore(&rnp->lock, flags);
446 return; 460 return;
447 } 461 }
448 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 462 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
@@ -452,13 +466,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
452 * due to CPU offlining. 466 * due to CPU offlining.
453 */ 467 */
454 rcu_print_task_stall(rnp); 468 rcu_print_task_stall(rnp);
455 spin_unlock_irqrestore(&rnp->lock, flags); 469 raw_spin_unlock_irqrestore(&rnp->lock, flags);
456 470
457 /* OK, time to rat on our buddy... */ 471 /* OK, time to rat on our buddy... */
458 472
459 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 473 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
460 rcu_for_each_leaf_node(rsp, rnp) { 474 rcu_for_each_leaf_node(rsp, rnp) {
475 raw_spin_lock_irqsave(&rnp->lock, flags);
461 rcu_print_task_stall(rnp); 476 rcu_print_task_stall(rnp);
477 raw_spin_unlock_irqrestore(&rnp->lock, flags);
462 if (rnp->qsmask == 0) 478 if (rnp->qsmask == 0)
463 continue; 479 continue;
464 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 480 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
@@ -469,6 +485,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
469 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 485 smp_processor_id(), (long)(jiffies - rsp->gp_start));
470 trigger_all_cpu_backtrace(); 486 trigger_all_cpu_backtrace();
471 487
488 /* If so configured, complain about tasks blocking the grace period. */
489
490 rcu_print_detail_task_stall(rsp);
491
472 force_quiescent_state(rsp, 0); /* Kick them all. */ 492 force_quiescent_state(rsp, 0); /* Kick them all. */
473} 493}
474 494
@@ -481,11 +501,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
481 smp_processor_id(), jiffies - rsp->gp_start); 501 smp_processor_id(), jiffies - rsp->gp_start);
482 trigger_all_cpu_backtrace(); 502 trigger_all_cpu_backtrace();
483 503
484 spin_lock_irqsave(&rnp->lock, flags); 504 raw_spin_lock_irqsave(&rnp->lock, flags);
485 if ((long)(jiffies - rsp->jiffies_stall) >= 0) 505 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
486 rsp->jiffies_stall = 506 rsp->jiffies_stall =
487 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 507 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
488 spin_unlock_irqrestore(&rnp->lock, flags); 508 raw_spin_unlock_irqrestore(&rnp->lock, flags);
489 509
490 set_need_resched(); /* kick ourselves to get things going. */ 510 set_need_resched(); /* kick ourselves to get things going. */
491} 511}
@@ -545,12 +565,12 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
545 local_irq_save(flags); 565 local_irq_save(flags);
546 rnp = rdp->mynode; 566 rnp = rdp->mynode;
547 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ 567 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
548 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 568 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
549 local_irq_restore(flags); 569 local_irq_restore(flags);
550 return; 570 return;
551 } 571 }
552 __note_new_gpnum(rsp, rnp, rdp); 572 __note_new_gpnum(rsp, rnp, rdp);
553 spin_unlock_irqrestore(&rnp->lock, flags); 573 raw_spin_unlock_irqrestore(&rnp->lock, flags);
554} 574}
555 575
556/* 576/*
@@ -609,12 +629,12 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
609 local_irq_save(flags); 629 local_irq_save(flags);
610 rnp = rdp->mynode; 630 rnp = rdp->mynode;
611 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ 631 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
612 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 632 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
613 local_irq_restore(flags); 633 local_irq_restore(flags);
614 return; 634 return;
615 } 635 }
616 __rcu_process_gp_end(rsp, rnp, rdp); 636 __rcu_process_gp_end(rsp, rnp, rdp);
617 spin_unlock_irqrestore(&rnp->lock, flags); 637 raw_spin_unlock_irqrestore(&rnp->lock, flags);
618} 638}
619 639
620/* 640/*
@@ -659,12 +679,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
659 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 679 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
660 struct rcu_node *rnp = rcu_get_root(rsp); 680 struct rcu_node *rnp = rcu_get_root(rsp);
661 681
662 if (!cpu_needs_another_gp(rsp, rdp)) { 682 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
683 if (cpu_needs_another_gp(rsp, rdp))
684 rsp->fqs_need_gp = 1;
663 if (rnp->completed == rsp->completed) { 685 if (rnp->completed == rsp->completed) {
664 spin_unlock_irqrestore(&rnp->lock, flags); 686 raw_spin_unlock_irqrestore(&rnp->lock, flags);
665 return; 687 return;
666 } 688 }
667 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 689 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
668 690
669 /* 691 /*
670 * Propagate new ->completed value to rcu_node structures 692 * Propagate new ->completed value to rcu_node structures
@@ -672,9 +694,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
672 * of the next grace period to process their callbacks. 694 * of the next grace period to process their callbacks.
673 */ 695 */
674 rcu_for_each_node_breadth_first(rsp, rnp) { 696 rcu_for_each_node_breadth_first(rsp, rnp) {
675 spin_lock(&rnp->lock); /* irqs already disabled. */ 697 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
676 rnp->completed = rsp->completed; 698 rnp->completed = rsp->completed;
677 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 699 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
678 } 700 }
679 local_irq_restore(flags); 701 local_irq_restore(flags);
680 return; 702 return;
@@ -695,15 +717,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
695 rnp->completed = rsp->completed; 717 rnp->completed = rsp->completed;
696 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 718 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
697 rcu_start_gp_per_cpu(rsp, rnp, rdp); 719 rcu_start_gp_per_cpu(rsp, rnp, rdp);
698 spin_unlock_irqrestore(&rnp->lock, flags); 720 raw_spin_unlock_irqrestore(&rnp->lock, flags);
699 return; 721 return;
700 } 722 }
701 723
702 spin_unlock(&rnp->lock); /* leave irqs disabled. */ 724 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
703 725
704 726
705 /* Exclude any concurrent CPU-hotplug operations. */ 727 /* Exclude any concurrent CPU-hotplug operations. */
706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 728 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
707 729
708 /* 730 /*
709 * Set the quiescent-state-needed bits in all the rcu_node 731 * Set the quiescent-state-needed bits in all the rcu_node
@@ -723,21 +745,21 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
723 * irqs disabled. 745 * irqs disabled.
724 */ 746 */
725 rcu_for_each_node_breadth_first(rsp, rnp) { 747 rcu_for_each_node_breadth_first(rsp, rnp) {
726 spin_lock(&rnp->lock); /* irqs already disabled. */ 748 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
727 rcu_preempt_check_blocked_tasks(rnp); 749 rcu_preempt_check_blocked_tasks(rnp);
728 rnp->qsmask = rnp->qsmaskinit; 750 rnp->qsmask = rnp->qsmaskinit;
729 rnp->gpnum = rsp->gpnum; 751 rnp->gpnum = rsp->gpnum;
730 rnp->completed = rsp->completed; 752 rnp->completed = rsp->completed;
731 if (rnp == rdp->mynode) 753 if (rnp == rdp->mynode)
732 rcu_start_gp_per_cpu(rsp, rnp, rdp); 754 rcu_start_gp_per_cpu(rsp, rnp, rdp);
733 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 755 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
734 } 756 }
735 757
736 rnp = rcu_get_root(rsp); 758 rnp = rcu_get_root(rsp);
737 spin_lock(&rnp->lock); /* irqs already disabled. */ 759 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
738 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 760 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
739 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 761 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
740 spin_unlock_irqrestore(&rsp->onofflock, flags); 762 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
741} 763}
742 764
743/* 765/*
@@ -776,14 +798,14 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
776 if (!(rnp->qsmask & mask)) { 798 if (!(rnp->qsmask & mask)) {
777 799
778 /* Our bit has already been cleared, so done. */ 800 /* Our bit has already been cleared, so done. */
779 spin_unlock_irqrestore(&rnp->lock, flags); 801 raw_spin_unlock_irqrestore(&rnp->lock, flags);
780 return; 802 return;
781 } 803 }
782 rnp->qsmask &= ~mask; 804 rnp->qsmask &= ~mask;
783 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 805 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
784 806
785 /* Other bits still set at this level, so done. */ 807 /* Other bits still set at this level, so done. */
786 spin_unlock_irqrestore(&rnp->lock, flags); 808 raw_spin_unlock_irqrestore(&rnp->lock, flags);
787 return; 809 return;
788 } 810 }
789 mask = rnp->grpmask; 811 mask = rnp->grpmask;
@@ -793,10 +815,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
793 815
794 break; 816 break;
795 } 817 }
796 spin_unlock_irqrestore(&rnp->lock, flags); 818 raw_spin_unlock_irqrestore(&rnp->lock, flags);
797 rnp_c = rnp; 819 rnp_c = rnp;
798 rnp = rnp->parent; 820 rnp = rnp->parent;
799 spin_lock_irqsave(&rnp->lock, flags); 821 raw_spin_lock_irqsave(&rnp->lock, flags);
800 WARN_ON_ONCE(rnp_c->qsmask); 822 WARN_ON_ONCE(rnp_c->qsmask);
801 } 823 }
802 824
@@ -825,7 +847,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
825 struct rcu_node *rnp; 847 struct rcu_node *rnp;
826 848
827 rnp = rdp->mynode; 849 rnp = rdp->mynode;
828 spin_lock_irqsave(&rnp->lock, flags); 850 raw_spin_lock_irqsave(&rnp->lock, flags);
829 if (lastcomp != rnp->completed) { 851 if (lastcomp != rnp->completed) {
830 852
831 /* 853 /*
@@ -837,12 +859,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
837 * race occurred. 859 * race occurred.
838 */ 860 */
839 rdp->passed_quiesc = 0; /* try again later! */ 861 rdp->passed_quiesc = 0; /* try again later! */
840 spin_unlock_irqrestore(&rnp->lock, flags); 862 raw_spin_unlock_irqrestore(&rnp->lock, flags);
841 return; 863 return;
842 } 864 }
843 mask = rdp->grpmask; 865 mask = rdp->grpmask;
844 if ((rnp->qsmask & mask) == 0) { 866 if ((rnp->qsmask & mask) == 0) {
845 spin_unlock_irqrestore(&rnp->lock, flags); 867 raw_spin_unlock_irqrestore(&rnp->lock, flags);
846 } else { 868 } else {
847 rdp->qs_pending = 0; 869 rdp->qs_pending = 0;
848 870
@@ -906,7 +928,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
906 928
907 if (rdp->nxtlist == NULL) 929 if (rdp->nxtlist == NULL)
908 return; /* irqs disabled, so comparison is stable. */ 930 return; /* irqs disabled, so comparison is stable. */
909 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 931 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
910 *rsp->orphan_cbs_tail = rdp->nxtlist; 932 *rsp->orphan_cbs_tail = rdp->nxtlist;
911 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 933 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
912 rdp->nxtlist = NULL; 934 rdp->nxtlist = NULL;
@@ -914,7 +936,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
914 rdp->nxttail[i] = &rdp->nxtlist; 936 rdp->nxttail[i] = &rdp->nxtlist;
915 rsp->orphan_qlen += rdp->qlen; 937 rsp->orphan_qlen += rdp->qlen;
916 rdp->qlen = 0; 938 rdp->qlen = 0;
917 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 939 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
918} 940}
919 941
920/* 942/*
@@ -925,10 +947,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
925 unsigned long flags; 947 unsigned long flags;
926 struct rcu_data *rdp; 948 struct rcu_data *rdp;
927 949
928 spin_lock_irqsave(&rsp->onofflock, flags); 950 raw_spin_lock_irqsave(&rsp->onofflock, flags);
929 rdp = rsp->rda[smp_processor_id()]; 951 rdp = rsp->rda[smp_processor_id()];
930 if (rsp->orphan_cbs_list == NULL) { 952 if (rsp->orphan_cbs_list == NULL) {
931 spin_unlock_irqrestore(&rsp->onofflock, flags); 953 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
932 return; 954 return;
933 } 955 }
934 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; 956 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
@@ -937,7 +959,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
937 rsp->orphan_cbs_list = NULL; 959 rsp->orphan_cbs_list = NULL;
938 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; 960 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
939 rsp->orphan_qlen = 0; 961 rsp->orphan_qlen = 0;
940 spin_unlock_irqrestore(&rsp->onofflock, flags); 962 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
941} 963}
942 964
943/* 965/*
@@ -953,23 +975,23 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
953 struct rcu_node *rnp; 975 struct rcu_node *rnp;
954 976
955 /* Exclude any attempts to start a new grace period. */ 977 /* Exclude any attempts to start a new grace period. */
956 spin_lock_irqsave(&rsp->onofflock, flags); 978 raw_spin_lock_irqsave(&rsp->onofflock, flags);
957 979
958 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 980 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
959 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ 981 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
960 mask = rdp->grpmask; /* rnp->grplo is constant. */ 982 mask = rdp->grpmask; /* rnp->grplo is constant. */
961 do { 983 do {
962 spin_lock(&rnp->lock); /* irqs already disabled. */ 984 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
963 rnp->qsmaskinit &= ~mask; 985 rnp->qsmaskinit &= ~mask;
964 if (rnp->qsmaskinit != 0) { 986 if (rnp->qsmaskinit != 0) {
965 if (rnp != rdp->mynode) 987 if (rnp != rdp->mynode)
966 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 988 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
967 break; 989 break;
968 } 990 }
969 if (rnp == rdp->mynode) 991 if (rnp == rdp->mynode)
970 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 992 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
971 else 993 else
972 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 994 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
973 mask = rnp->grpmask; 995 mask = rnp->grpmask;
974 rnp = rnp->parent; 996 rnp = rnp->parent;
975 } while (rnp != NULL); 997 } while (rnp != NULL);
@@ -980,12 +1002,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
980 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock 1002 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
981 * held leads to deadlock. 1003 * held leads to deadlock.
982 */ 1004 */
983 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1005 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
984 rnp = rdp->mynode; 1006 rnp = rdp->mynode;
985 if (need_report & RCU_OFL_TASKS_NORM_GP) 1007 if (need_report & RCU_OFL_TASKS_NORM_GP)
986 rcu_report_unblock_qs_rnp(rnp, flags); 1008 rcu_report_unblock_qs_rnp(rnp, flags);
987 else 1009 else
988 spin_unlock_irqrestore(&rnp->lock, flags); 1010 raw_spin_unlock_irqrestore(&rnp->lock, flags);
989 if (need_report & RCU_OFL_TASKS_EXP_GP) 1011 if (need_report & RCU_OFL_TASKS_EXP_GP)
990 rcu_report_exp_rnp(rsp, rnp); 1012 rcu_report_exp_rnp(rsp, rnp);
991 1013
@@ -1144,11 +1166,9 @@ void rcu_check_callbacks(int cpu, int user)
1144/* 1166/*
1145 * Scan the leaf rcu_node structures, processing dyntick state for any that 1167 * Scan the leaf rcu_node structures, processing dyntick state for any that
1146 * have not yet encountered a quiescent state, using the function specified. 1168 * have not yet encountered a quiescent state, using the function specified.
1147 * Returns 1 if the current grace period ends while scanning (possibly 1169 * The caller must have suppressed start of new grace periods.
1148 * because we made it end).
1149 */ 1170 */
1150static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, 1171static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1151 int (*f)(struct rcu_data *))
1152{ 1172{
1153 unsigned long bit; 1173 unsigned long bit;
1154 int cpu; 1174 int cpu;
@@ -1158,13 +1178,13 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1158 1178
1159 rcu_for_each_leaf_node(rsp, rnp) { 1179 rcu_for_each_leaf_node(rsp, rnp) {
1160 mask = 0; 1180 mask = 0;
1161 spin_lock_irqsave(&rnp->lock, flags); 1181 raw_spin_lock_irqsave(&rnp->lock, flags);
1162 if (rnp->completed != lastcomp) { 1182 if (!rcu_gp_in_progress(rsp)) {
1163 spin_unlock_irqrestore(&rnp->lock, flags); 1183 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1164 return 1; 1184 return;
1165 } 1185 }
1166 if (rnp->qsmask == 0) { 1186 if (rnp->qsmask == 0) {
1167 spin_unlock_irqrestore(&rnp->lock, flags); 1187 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1168 continue; 1188 continue;
1169 } 1189 }
1170 cpu = rnp->grplo; 1190 cpu = rnp->grplo;
@@ -1173,15 +1193,14 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1173 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1193 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1174 mask |= bit; 1194 mask |= bit;
1175 } 1195 }
1176 if (mask != 0 && rnp->completed == lastcomp) { 1196 if (mask != 0) {
1177 1197
1178 /* rcu_report_qs_rnp() releases rnp->lock. */ 1198 /* rcu_report_qs_rnp() releases rnp->lock. */
1179 rcu_report_qs_rnp(mask, rsp, rnp, flags); 1199 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1180 continue; 1200 continue;
1181 } 1201 }
1182 spin_unlock_irqrestore(&rnp->lock, flags); 1202 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1183 } 1203 }
1184 return 0;
1185} 1204}
1186 1205
1187/* 1206/*
@@ -1191,32 +1210,26 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1191static void force_quiescent_state(struct rcu_state *rsp, int relaxed) 1210static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1192{ 1211{
1193 unsigned long flags; 1212 unsigned long flags;
1194 long lastcomp;
1195 struct rcu_node *rnp = rcu_get_root(rsp); 1213 struct rcu_node *rnp = rcu_get_root(rsp);
1196 u8 signaled;
1197 u8 forcenow;
1198 1214
1199 if (!rcu_gp_in_progress(rsp)) 1215 if (!rcu_gp_in_progress(rsp))
1200 return; /* No grace period in progress, nothing to force. */ 1216 return; /* No grace period in progress, nothing to force. */
1201 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { 1217 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1202 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1218 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1203 return; /* Someone else is already on the job. */ 1219 return; /* Someone else is already on the job. */
1204 } 1220 }
1205 if (relaxed && 1221 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
1206 (long)(rsp->jiffies_force_qs - jiffies) >= 0) 1222 goto unlock_fqs_ret; /* no emergency and done recently. */
1207 goto unlock_ret; /* no emergency and done recently. */
1208 rsp->n_force_qs++; 1223 rsp->n_force_qs++;
1209 spin_lock(&rnp->lock); 1224 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1210 lastcomp = rsp->gpnum - 1;
1211 signaled = rsp->signaled;
1212 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1225 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1213 if(!rcu_gp_in_progress(rsp)) { 1226 if(!rcu_gp_in_progress(rsp)) {
1214 rsp->n_force_qs_ngp++; 1227 rsp->n_force_qs_ngp++;
1215 spin_unlock(&rnp->lock); 1228 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1216 goto unlock_ret; /* no GP in progress, time updated. */ 1229 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1217 } 1230 }
1218 spin_unlock(&rnp->lock); 1231 rsp->fqs_active = 1;
1219 switch (signaled) { 1232 switch (rsp->signaled) {
1220 case RCU_GP_IDLE: 1233 case RCU_GP_IDLE:
1221 case RCU_GP_INIT: 1234 case RCU_GP_INIT:
1222 1235
@@ -1224,45 +1237,38 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1224 1237
1225 case RCU_SAVE_DYNTICK: 1238 case RCU_SAVE_DYNTICK:
1226 1239
1240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1227 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1241 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1228 break; /* So gcc recognizes the dead code. */ 1242 break; /* So gcc recognizes the dead code. */
1229 1243
1230 /* Record dyntick-idle state. */ 1244 /* Record dyntick-idle state. */
1231 if (rcu_process_dyntick(rsp, lastcomp, 1245 force_qs_rnp(rsp, dyntick_save_progress_counter);
1232 dyntick_save_progress_counter)) 1246 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1233 goto unlock_ret; 1247 if (rcu_gp_in_progress(rsp))
1234 /* fall into next case. */
1235
1236 case RCU_SAVE_COMPLETED:
1237
1238 /* Update state, record completion counter. */
1239 forcenow = 0;
1240 spin_lock(&rnp->lock);
1241 if (lastcomp + 1 == rsp->gpnum &&
1242 lastcomp == rsp->completed &&
1243 rsp->signaled == signaled) {
1244 rsp->signaled = RCU_FORCE_QS; 1248 rsp->signaled = RCU_FORCE_QS;
1245 rsp->completed_fqs = lastcomp; 1249 break;
1246 forcenow = signaled == RCU_SAVE_COMPLETED;
1247 }
1248 spin_unlock(&rnp->lock);
1249 if (!forcenow)
1250 break;
1251 /* fall into next case. */
1252 1250
1253 case RCU_FORCE_QS: 1251 case RCU_FORCE_QS:
1254 1252
1255 /* Check dyntick-idle state, send IPI to laggarts. */ 1253 /* Check dyntick-idle state, send IPI to laggarts. */
1256 if (rcu_process_dyntick(rsp, rsp->completed_fqs, 1254 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1257 rcu_implicit_dynticks_qs)) 1255 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1258 goto unlock_ret;
1259 1256
1260 /* Leave state in case more forcing is required. */ 1257 /* Leave state in case more forcing is required. */
1261 1258
1259 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1262 break; 1260 break;
1263 } 1261 }
1264unlock_ret: 1262 rsp->fqs_active = 0;
1265 spin_unlock_irqrestore(&rsp->fqslock, flags); 1263 if (rsp->fqs_need_gp) {
1264 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1265 rsp->fqs_need_gp = 0;
1266 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1267 return;
1268 }
1269 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1270unlock_fqs_ret:
1271 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1266} 1272}
1267 1273
1268#else /* #ifdef CONFIG_SMP */ 1274#else /* #ifdef CONFIG_SMP */
@@ -1290,7 +1296,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1290 * If an RCU GP has gone long enough, go check for dyntick 1296 * If an RCU GP has gone long enough, go check for dyntick
1291 * idle CPUs and, if needed, send resched IPIs. 1297 * idle CPUs and, if needed, send resched IPIs.
1292 */ 1298 */
1293 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1299 if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1294 force_quiescent_state(rsp, 1); 1300 force_quiescent_state(rsp, 1);
1295 1301
1296 /* 1302 /*
@@ -1304,7 +1310,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1304 1310
1305 /* Does this CPU require a not-yet-started grace period? */ 1311 /* Does this CPU require a not-yet-started grace period? */
1306 if (cpu_needs_another_gp(rsp, rdp)) { 1312 if (cpu_needs_another_gp(rsp, rdp)) {
1307 spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); 1313 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1308 rcu_start_gp(rsp, flags); /* releases above lock */ 1314 rcu_start_gp(rsp, flags); /* releases above lock */
1309 } 1315 }
1310 1316
@@ -1335,6 +1341,9 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1335 * grace-period manipulations above. 1341 * grace-period manipulations above.
1336 */ 1342 */
1337 smp_mb(); /* See above block comment. */ 1343 smp_mb(); /* See above block comment. */
1344
1345 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1346 rcu_needs_cpu_flush();
1338} 1347}
1339 1348
1340static void 1349static void
@@ -1369,7 +1378,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1369 unsigned long nestflag; 1378 unsigned long nestflag;
1370 struct rcu_node *rnp_root = rcu_get_root(rsp); 1379 struct rcu_node *rnp_root = rcu_get_root(rsp);
1371 1380
1372 spin_lock_irqsave(&rnp_root->lock, nestflag); 1381 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1373 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ 1382 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1374 } 1383 }
1375 1384
@@ -1387,7 +1396,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1387 force_quiescent_state(rsp, 0); 1396 force_quiescent_state(rsp, 0);
1388 rdp->n_force_qs_snap = rsp->n_force_qs; 1397 rdp->n_force_qs_snap = rsp->n_force_qs;
1389 rdp->qlen_last_fqs_check = rdp->qlen; 1398 rdp->qlen_last_fqs_check = rdp->qlen;
1390 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1399 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1391 force_quiescent_state(rsp, 1); 1400 force_quiescent_state(rsp, 1);
1392 local_irq_restore(flags); 1401 local_irq_restore(flags);
1393} 1402}
@@ -1520,7 +1529,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1520 1529
1521 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1530 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1522 if (rcu_gp_in_progress(rsp) && 1531 if (rcu_gp_in_progress(rsp) &&
1523 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { 1532 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
1524 rdp->n_rp_need_fqs++; 1533 rdp->n_rp_need_fqs++;
1525 return 1; 1534 return 1;
1526 } 1535 }
@@ -1545,10 +1554,9 @@ static int rcu_pending(int cpu)
1545/* 1554/*
1546 * Check to see if any future RCU-related work will need to be done 1555 * Check to see if any future RCU-related work will need to be done
1547 * by the current CPU, even if none need be done immediately, returning 1556 * by the current CPU, even if none need be done immediately, returning
1548 * 1 if so. This function is part of the RCU implementation; it is -not- 1557 * 1 if so.
1549 * an exported member of the RCU API.
1550 */ 1558 */
1551int rcu_needs_cpu(int cpu) 1559static int rcu_needs_cpu_quick_check(int cpu)
1552{ 1560{
1553 /* RCU callbacks either ready or pending? */ 1561 /* RCU callbacks either ready or pending? */
1554 return per_cpu(rcu_sched_data, cpu).nxtlist || 1562 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1556,21 +1564,6 @@ int rcu_needs_cpu(int cpu)
1556 rcu_preempt_needs_cpu(cpu); 1564 rcu_preempt_needs_cpu(cpu);
1557} 1565}
1558 1566
1559/*
1560 * This function is invoked towards the end of the scheduler's initialization
1561 * process. Before this is called, the idle task might contain
1562 * RCU read-side critical sections (during which time, this idle
1563 * task is booting the system). After this function is called, the
1564 * idle tasks are prohibited from containing RCU read-side critical
1565 * sections.
1566 */
1567void rcu_scheduler_starting(void)
1568{
1569 WARN_ON(num_online_cpus() != 1);
1570 WARN_ON(nr_context_switches() > 0);
1571 rcu_scheduler_active = 1;
1572}
1573
1574static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 1567static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1575static atomic_t rcu_barrier_cpu_count; 1568static atomic_t rcu_barrier_cpu_count;
1576static DEFINE_MUTEX(rcu_barrier_mutex); 1569static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -1659,7 +1652,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1659 struct rcu_node *rnp = rcu_get_root(rsp); 1652 struct rcu_node *rnp = rcu_get_root(rsp);
1660 1653
1661 /* Set up local state, ensuring consistent view of global state. */ 1654 /* Set up local state, ensuring consistent view of global state. */
1662 spin_lock_irqsave(&rnp->lock, flags); 1655 raw_spin_lock_irqsave(&rnp->lock, flags);
1663 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 1656 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1664 rdp->nxtlist = NULL; 1657 rdp->nxtlist = NULL;
1665 for (i = 0; i < RCU_NEXT_SIZE; i++) 1658 for (i = 0; i < RCU_NEXT_SIZE; i++)
@@ -1669,7 +1662,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1669 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 1662 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1670#endif /* #ifdef CONFIG_NO_HZ */ 1663#endif /* #ifdef CONFIG_NO_HZ */
1671 rdp->cpu = cpu; 1664 rdp->cpu = cpu;
1672 spin_unlock_irqrestore(&rnp->lock, flags); 1665 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1673} 1666}
1674 1667
1675/* 1668/*
@@ -1687,7 +1680,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1687 struct rcu_node *rnp = rcu_get_root(rsp); 1680 struct rcu_node *rnp = rcu_get_root(rsp);
1688 1681
1689 /* Set up local state, ensuring consistent view of global state. */ 1682 /* Set up local state, ensuring consistent view of global state. */
1690 spin_lock_irqsave(&rnp->lock, flags); 1683 raw_spin_lock_irqsave(&rnp->lock, flags);
1691 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1684 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1692 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1685 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1693 rdp->beenonline = 1; /* We have now been online. */ 1686 rdp->beenonline = 1; /* We have now been online. */
@@ -1695,7 +1688,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1695 rdp->qlen_last_fqs_check = 0; 1688 rdp->qlen_last_fqs_check = 0;
1696 rdp->n_force_qs_snap = rsp->n_force_qs; 1689 rdp->n_force_qs_snap = rsp->n_force_qs;
1697 rdp->blimit = blimit; 1690 rdp->blimit = blimit;
1698 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1691 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1699 1692
1700 /* 1693 /*
1701 * A new grace period might start here. If so, we won't be part 1694 * A new grace period might start here. If so, we won't be part
@@ -1703,14 +1696,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1703 */ 1696 */
1704 1697
1705 /* Exclude any attempts to start a new GP on large systems. */ 1698 /* Exclude any attempts to start a new GP on large systems. */
1706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1699 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1707 1700
1708 /* Add CPU to rcu_node bitmasks. */ 1701 /* Add CPU to rcu_node bitmasks. */
1709 rnp = rdp->mynode; 1702 rnp = rdp->mynode;
1710 mask = rdp->grpmask; 1703 mask = rdp->grpmask;
1711 do { 1704 do {
1712 /* Exclude any attempts to start a new GP on small systems. */ 1705 /* Exclude any attempts to start a new GP on small systems. */
1713 spin_lock(&rnp->lock); /* irqs already disabled. */ 1706 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1714 rnp->qsmaskinit |= mask; 1707 rnp->qsmaskinit |= mask;
1715 mask = rnp->grpmask; 1708 mask = rnp->grpmask;
1716 if (rnp == rdp->mynode) { 1709 if (rnp == rdp->mynode) {
@@ -1718,11 +1711,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1718 rdp->completed = rnp->completed; 1711 rdp->completed = rnp->completed;
1719 rdp->passed_quiesc_completed = rnp->completed - 1; 1712 rdp->passed_quiesc_completed = rnp->completed - 1;
1720 } 1713 }
1721 spin_unlock(&rnp->lock); /* irqs already disabled. */ 1714 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
1722 rnp = rnp->parent; 1715 rnp = rnp->parent;
1723 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1716 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1724 1717
1725 spin_unlock_irqrestore(&rsp->onofflock, flags); 1718 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1726} 1719}
1727 1720
1728static void __cpuinit rcu_online_cpu(int cpu) 1721static void __cpuinit rcu_online_cpu(int cpu)
@@ -1806,11 +1799,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1806 */ 1799 */
1807static void __init rcu_init_one(struct rcu_state *rsp) 1800static void __init rcu_init_one(struct rcu_state *rsp)
1808{ 1801{
1802 static char *buf[] = { "rcu_node_level_0",
1803 "rcu_node_level_1",
1804 "rcu_node_level_2",
1805 "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */
1809 int cpustride = 1; 1806 int cpustride = 1;
1810 int i; 1807 int i;
1811 int j; 1808 int j;
1812 struct rcu_node *rnp; 1809 struct rcu_node *rnp;
1813 1810
1811 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
1812
1814 /* Initialize the level-tracking arrays. */ 1813 /* Initialize the level-tracking arrays. */
1815 1814
1816 for (i = 1; i < NUM_RCU_LVLS; i++) 1815 for (i = 1; i < NUM_RCU_LVLS; i++)
@@ -1823,8 +1822,9 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1823 cpustride *= rsp->levelspread[i]; 1822 cpustride *= rsp->levelspread[i];
1824 rnp = rsp->level[i]; 1823 rnp = rsp->level[i];
1825 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1824 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1826 spin_lock_init(&rnp->lock); 1825 raw_spin_lock_init(&rnp->lock);
1827 lockdep_set_class(&rnp->lock, &rcu_node_class[i]); 1826 lockdep_set_class_and_name(&rnp->lock,
1827 &rcu_node_class[i], buf[i]);
1828 rnp->gpnum = 0; 1828 rnp->gpnum = 0;
1829 rnp->qsmask = 0; 1829 rnp->qsmask = 0;
1830 rnp->qsmaskinit = 0; 1830 rnp->qsmaskinit = 0;
@@ -1876,7 +1876,7 @@ do { \
1876 1876
1877void __init rcu_init(void) 1877void __init rcu_init(void)
1878{ 1878{
1879 int i; 1879 int cpu;
1880 1880
1881 rcu_bootup_announce(); 1881 rcu_bootup_announce();
1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
@@ -1896,8 +1896,8 @@ void __init rcu_init(void)
1896 * or the scheduler are operational. 1896 * or the scheduler are operational.
1897 */ 1897 */
1898 cpu_notifier(rcu_cpu_notify, 0); 1898 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(i) 1899 for_each_online_cpu(cpu)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); 1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1901} 1901}
1902 1902
1903#include "rcutree_plugin.h" 1903#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index d2a0046f63b2..1439eb504c22 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -90,12 +90,12 @@ struct rcu_dynticks {
90 * Definition for node within the RCU grace-period-detection hierarchy. 90 * Definition for node within the RCU grace-period-detection hierarchy.
91 */ 91 */
92struct rcu_node { 92struct rcu_node {
93 spinlock_t lock; /* Root rcu_node's lock protects some */ 93 raw_spinlock_t lock; /* Root rcu_node's lock protects some */
94 /* rcu_state fields as well as following. */ 94 /* rcu_state fields as well as following. */
95 long gpnum; /* Current grace period for this node. */ 95 unsigned long gpnum; /* Current grace period for this node. */
96 /* This will either be equal to or one */ 96 /* This will either be equal to or one */
97 /* behind the root rcu_node's gpnum. */ 97 /* behind the root rcu_node's gpnum. */
98 long completed; /* Last grace period completed for this node. */ 98 unsigned long completed; /* Last GP completed for this node. */
99 /* This will either be equal to or one */ 99 /* This will either be equal to or one */
100 /* behind the root rcu_node's gpnum. */ 100 /* behind the root rcu_node's gpnum. */
101 unsigned long qsmask; /* CPUs or groups that need to switch in */ 101 unsigned long qsmask; /* CPUs or groups that need to switch in */
@@ -161,11 +161,11 @@ struct rcu_node {
161/* Per-CPU data for read-copy update. */ 161/* Per-CPU data for read-copy update. */
162struct rcu_data { 162struct rcu_data {
163 /* 1) quiescent-state and grace-period handling : */ 163 /* 1) quiescent-state and grace-period handling : */
164 long completed; /* Track rsp->completed gp number */ 164 unsigned long completed; /* Track rsp->completed gp number */
165 /* in order to detect GP end. */ 165 /* in order to detect GP end. */
166 long gpnum; /* Highest gp number that this CPU */ 166 unsigned long gpnum; /* Highest gp number that this CPU */
167 /* is aware of having started. */ 167 /* is aware of having started. */
168 long passed_quiesc_completed; 168 unsigned long passed_quiesc_completed;
169 /* Value of completed at time of qs. */ 169 /* Value of completed at time of qs. */
170 bool passed_quiesc; /* User-mode/idle loop etc. */ 170 bool passed_quiesc; /* User-mode/idle loop etc. */
171 bool qs_pending; /* Core waits for quiesc state. */ 171 bool qs_pending; /* Core waits for quiesc state. */
@@ -221,14 +221,14 @@ struct rcu_data {
221 unsigned long resched_ipi; /* Sent a resched IPI. */ 221 unsigned long resched_ipi; /* Sent a resched IPI. */
222 222
223 /* 5) __rcu_pending() statistics. */ 223 /* 5) __rcu_pending() statistics. */
224 long n_rcu_pending; /* rcu_pending() calls since boot. */ 224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
225 long n_rp_qs_pending; 225 unsigned long n_rp_qs_pending;
226 long n_rp_cb_ready; 226 unsigned long n_rp_cb_ready;
227 long n_rp_cpu_needs_gp; 227 unsigned long n_rp_cpu_needs_gp;
228 long n_rp_gp_completed; 228 unsigned long n_rp_gp_completed;
229 long n_rp_gp_started; 229 unsigned long n_rp_gp_started;
230 long n_rp_need_fqs; 230 unsigned long n_rp_need_fqs;
231 long n_rp_need_nothing; 231 unsigned long n_rp_need_nothing;
232 232
233 int cpu; 233 int cpu;
234}; 234};
@@ -237,12 +237,11 @@ struct rcu_data {
237#define RCU_GP_IDLE 0 /* No grace period in progress. */ 237#define RCU_GP_IDLE 0 /* No grace period in progress. */
238#define RCU_GP_INIT 1 /* Grace period being initialized. */ 238#define RCU_GP_INIT 1 /* Grace period being initialized. */
239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
240#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */ 240#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
241#define RCU_FORCE_QS 4 /* Need to force quiescent state. */
242#ifdef CONFIG_NO_HZ 241#ifdef CONFIG_NO_HZ
243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 242#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
244#else /* #ifdef CONFIG_NO_HZ */ 243#else /* #ifdef CONFIG_NO_HZ */
245#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED 244#define RCU_SIGNAL_INIT RCU_FORCE_QS
246#endif /* #else #ifdef CONFIG_NO_HZ */ 245#endif /* #else #ifdef CONFIG_NO_HZ */
247 246
248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 247#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
@@ -256,6 +255,9 @@ struct rcu_data {
256 255
257#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 256#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
258 257
258#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
259#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
260
259/* 261/*
260 * RCU global state, including node hierarchy. This hierarchy is 262 * RCU global state, including node hierarchy. This hierarchy is
261 * represented in "heap" form in a dense array. The root (first level) 263 * represented in "heap" form in a dense array. The root (first level)
@@ -277,12 +279,19 @@ struct rcu_state {
277 279
278 u8 signaled ____cacheline_internodealigned_in_smp; 280 u8 signaled ____cacheline_internodealigned_in_smp;
279 /* Force QS state. */ 281 /* Force QS state. */
280 long gpnum; /* Current gp number. */ 282 u8 fqs_active; /* force_quiescent_state() */
281 long completed; /* # of last completed gp. */ 283 /* is running. */
284 u8 fqs_need_gp; /* A CPU was prevented from */
285 /* starting a new grace */
286 /* period because */
287 /* force_quiescent_state() */
288 /* was running. */
289 unsigned long gpnum; /* Current gp number. */
290 unsigned long completed; /* # of last completed gp. */
282 291
283 /* End of fields guarded by root rcu_node's lock. */ 292 /* End of fields guarded by root rcu_node's lock. */
284 293
285 spinlock_t onofflock; /* exclude on/offline and */ 294 raw_spinlock_t onofflock; /* exclude on/offline and */
286 /* starting new GP. Also */ 295 /* starting new GP. Also */
287 /* protects the following */ 296 /* protects the following */
288 /* orphan_cbs fields. */ 297 /* orphan_cbs fields. */
@@ -292,10 +301,8 @@ struct rcu_state {
292 /* going offline. */ 301 /* going offline. */
293 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ 302 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
294 long orphan_qlen; /* Number of orphaned cbs. */ 303 long orphan_qlen; /* Number of orphaned cbs. */
295 spinlock_t fqslock; /* Only one task forcing */ 304 raw_spinlock_t fqslock; /* Only one task forcing */
296 /* quiescent states. */ 305 /* quiescent states. */
297 long completed_fqs; /* Value of completed @ snap. */
298 /* Protected by fqslock. */
299 unsigned long jiffies_force_qs; /* Time at which to invoke */ 306 unsigned long jiffies_force_qs; /* Time at which to invoke */
300 /* force_quiescent_state(). */ 307 /* force_quiescent_state(). */
301 unsigned long n_force_qs; /* Number of calls to */ 308 unsigned long n_force_qs; /* Number of calls to */
@@ -319,8 +326,6 @@ struct rcu_state {
319#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ 326#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
320 /* GP were moved to root. */ 327 /* GP were moved to root. */
321 328
322#ifdef RCU_TREE_NONCORE
323
324/* 329/*
325 * RCU implementation internal declarations: 330 * RCU implementation internal declarations:
326 */ 331 */
@@ -335,7 +340,7 @@ extern struct rcu_state rcu_preempt_state;
335DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 340DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
336#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 341#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
337 342
338#else /* #ifdef RCU_TREE_NONCORE */ 343#ifndef RCU_TREE_NONCORE
339 344
340/* Forward declarations for rcutree_plugin.h */ 345/* Forward declarations for rcutree_plugin.h */
341static void rcu_bootup_announce(void); 346static void rcu_bootup_announce(void);
@@ -347,6 +352,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
347 unsigned long flags); 352 unsigned long flags);
348#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 353#endif /* #ifdef CONFIG_HOTPLUG_CPU */
349#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 354#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
355static void rcu_print_detail_task_stall(struct rcu_state *rsp);
350static void rcu_print_task_stall(struct rcu_node *rnp); 356static void rcu_print_task_stall(struct rcu_node *rnp);
351#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 357#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
352static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 358static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
@@ -367,5 +373,6 @@ static int rcu_preempt_needs_cpu(int cpu);
367static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 373static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
368static void rcu_preempt_send_cbs_to_orphanage(void); 374static void rcu_preempt_send_cbs_to_orphanage(void);
369static void __init __rcu_init_preempt(void); 375static void __init __rcu_init_preempt(void);
376static void rcu_needs_cpu_flush(void);
370 377
371#endif /* #else #ifdef RCU_TREE_NONCORE */ 378#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 37fbccdf41d5..464ad2cdee00 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -62,6 +62,15 @@ long rcu_batches_completed(void)
62EXPORT_SYMBOL_GPL(rcu_batches_completed); 62EXPORT_SYMBOL_GPL(rcu_batches_completed);
63 63
64/* 64/*
65 * Force a quiescent state for preemptible RCU.
66 */
67void rcu_force_quiescent_state(void)
68{
69 force_quiescent_state(&rcu_preempt_state, 0);
70}
71EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
72
73/*
65 * Record a preemptable-RCU quiescent state for the specified CPU. Note 74 * Record a preemptable-RCU quiescent state for the specified CPU. Note
66 * that this just means that the task currently running on the CPU is 75 * that this just means that the task currently running on the CPU is
67 * not in a quiescent state. There might be any number of tasks blocked 76 * not in a quiescent state. There might be any number of tasks blocked
@@ -102,7 +111,7 @@ static void rcu_preempt_note_context_switch(int cpu)
102 /* Possibly blocking in an RCU read-side critical section. */ 111 /* Possibly blocking in an RCU read-side critical section. */
103 rdp = rcu_preempt_state.rda[cpu]; 112 rdp = rcu_preempt_state.rda[cpu];
104 rnp = rdp->mynode; 113 rnp = rdp->mynode;
105 spin_lock_irqsave(&rnp->lock, flags); 114 raw_spin_lock_irqsave(&rnp->lock, flags);
106 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 115 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
107 t->rcu_blocked_node = rnp; 116 t->rcu_blocked_node = rnp;
108 117
@@ -123,7 +132,7 @@ static void rcu_preempt_note_context_switch(int cpu)
123 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 132 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
124 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 133 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
125 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 134 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
126 spin_unlock_irqrestore(&rnp->lock, flags); 135 raw_spin_unlock_irqrestore(&rnp->lock, flags);
127 } 136 }
128 137
129 /* 138 /*
@@ -180,7 +189,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
180 struct rcu_node *rnp_p; 189 struct rcu_node *rnp_p;
181 190
182 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 191 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
183 spin_unlock_irqrestore(&rnp->lock, flags); 192 raw_spin_unlock_irqrestore(&rnp->lock, flags);
184 return; /* Still need more quiescent states! */ 193 return; /* Still need more quiescent states! */
185 } 194 }
186 195
@@ -197,8 +206,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
197 206
198 /* Report up the rest of the hierarchy. */ 207 /* Report up the rest of the hierarchy. */
199 mask = rnp->grpmask; 208 mask = rnp->grpmask;
200 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 209 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
201 spin_lock(&rnp_p->lock); /* irqs already disabled. */ 210 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
202 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); 211 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
203} 212}
204 213
@@ -248,10 +257,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
248 */ 257 */
249 for (;;) { 258 for (;;) {
250 rnp = t->rcu_blocked_node; 259 rnp = t->rcu_blocked_node;
251 spin_lock(&rnp->lock); /* irqs already disabled. */ 260 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
252 if (rnp == t->rcu_blocked_node) 261 if (rnp == t->rcu_blocked_node)
253 break; 262 break;
254 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 263 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
255 } 264 }
256 empty = !rcu_preempted_readers(rnp); 265 empty = !rcu_preempted_readers(rnp);
257 empty_exp = !rcu_preempted_readers_exp(rnp); 266 empty_exp = !rcu_preempted_readers_exp(rnp);
@@ -265,7 +274,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
265 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 274 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
266 */ 275 */
267 if (empty) 276 if (empty)
268 spin_unlock_irqrestore(&rnp->lock, flags); 277 raw_spin_unlock_irqrestore(&rnp->lock, flags);
269 else 278 else
270 rcu_report_unblock_qs_rnp(rnp, flags); 279 rcu_report_unblock_qs_rnp(rnp, flags);
271 280
@@ -295,29 +304,73 @@ void __rcu_read_unlock(void)
295 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 304 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
296 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 305 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
297 rcu_read_unlock_special(t); 306 rcu_read_unlock_special(t);
307#ifdef CONFIG_PROVE_LOCKING
308 WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0);
309#endif /* #ifdef CONFIG_PROVE_LOCKING */
298} 310}
299EXPORT_SYMBOL_GPL(__rcu_read_unlock); 311EXPORT_SYMBOL_GPL(__rcu_read_unlock);
300 312
301#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 313#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
302 314
315#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
316
317/*
318 * Dump detailed information for all tasks blocking the current RCU
319 * grace period on the specified rcu_node structure.
320 */
321static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
322{
323 unsigned long flags;
324 struct list_head *lp;
325 int phase;
326 struct task_struct *t;
327
328 if (rcu_preempted_readers(rnp)) {
329 raw_spin_lock_irqsave(&rnp->lock, flags);
330 phase = rnp->gpnum & 0x1;
331 lp = &rnp->blocked_tasks[phase];
332 list_for_each_entry(t, lp, rcu_node_entry)
333 sched_show_task(t);
334 raw_spin_unlock_irqrestore(&rnp->lock, flags);
335 }
336}
337
338/*
339 * Dump detailed information for all tasks blocking the current RCU
340 * grace period.
341 */
342static void rcu_print_detail_task_stall(struct rcu_state *rsp)
343{
344 struct rcu_node *rnp = rcu_get_root(rsp);
345
346 rcu_print_detail_task_stall_rnp(rnp);
347 rcu_for_each_leaf_node(rsp, rnp)
348 rcu_print_detail_task_stall_rnp(rnp);
349}
350
351#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
352
353static void rcu_print_detail_task_stall(struct rcu_state *rsp)
354{
355}
356
357#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
358
303/* 359/*
304 * Scan the current list of tasks blocked within RCU read-side critical 360 * Scan the current list of tasks blocked within RCU read-side critical
305 * sections, printing out the tid of each. 361 * sections, printing out the tid of each.
306 */ 362 */
307static void rcu_print_task_stall(struct rcu_node *rnp) 363static void rcu_print_task_stall(struct rcu_node *rnp)
308{ 364{
309 unsigned long flags;
310 struct list_head *lp; 365 struct list_head *lp;
311 int phase; 366 int phase;
312 struct task_struct *t; 367 struct task_struct *t;
313 368
314 if (rcu_preempted_readers(rnp)) { 369 if (rcu_preempted_readers(rnp)) {
315 spin_lock_irqsave(&rnp->lock, flags);
316 phase = rnp->gpnum & 0x1; 370 phase = rnp->gpnum & 0x1;
317 lp = &rnp->blocked_tasks[phase]; 371 lp = &rnp->blocked_tasks[phase];
318 list_for_each_entry(t, lp, rcu_node_entry) 372 list_for_each_entry(t, lp, rcu_node_entry)
319 printk(" P%d", t->pid); 373 printk(" P%d", t->pid);
320 spin_unlock_irqrestore(&rnp->lock, flags);
321 } 374 }
322} 375}
323 376
@@ -388,11 +441,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
388 lp_root = &rnp_root->blocked_tasks[i]; 441 lp_root = &rnp_root->blocked_tasks[i];
389 while (!list_empty(lp)) { 442 while (!list_empty(lp)) {
390 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 443 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
391 spin_lock(&rnp_root->lock); /* irqs already disabled */ 444 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
392 list_del(&tp->rcu_node_entry); 445 list_del(&tp->rcu_node_entry);
393 tp->rcu_blocked_node = rnp_root; 446 tp->rcu_blocked_node = rnp_root;
394 list_add(&tp->rcu_node_entry, lp_root); 447 list_add(&tp->rcu_node_entry, lp_root);
395 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 448 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */
396 } 449 }
397 } 450 }
398 return retval; 451 return retval;
@@ -516,7 +569,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
516 unsigned long flags; 569 unsigned long flags;
517 unsigned long mask; 570 unsigned long mask;
518 571
519 spin_lock_irqsave(&rnp->lock, flags); 572 raw_spin_lock_irqsave(&rnp->lock, flags);
520 for (;;) { 573 for (;;) {
521 if (!sync_rcu_preempt_exp_done(rnp)) 574 if (!sync_rcu_preempt_exp_done(rnp))
522 break; 575 break;
@@ -525,12 +578,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
525 break; 578 break;
526 } 579 }
527 mask = rnp->grpmask; 580 mask = rnp->grpmask;
528 spin_unlock(&rnp->lock); /* irqs remain disabled */ 581 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
529 rnp = rnp->parent; 582 rnp = rnp->parent;
530 spin_lock(&rnp->lock); /* irqs already disabled */ 583 raw_spin_lock(&rnp->lock); /* irqs already disabled */
531 rnp->expmask &= ~mask; 584 rnp->expmask &= ~mask;
532 } 585 }
533 spin_unlock_irqrestore(&rnp->lock, flags); 586 raw_spin_unlock_irqrestore(&rnp->lock, flags);
534} 587}
535 588
536/* 589/*
@@ -545,11 +598,11 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
545{ 598{
546 int must_wait; 599 int must_wait;
547 600
548 spin_lock(&rnp->lock); /* irqs already disabled */ 601 raw_spin_lock(&rnp->lock); /* irqs already disabled */
549 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 602 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
550 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 603 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
551 must_wait = rcu_preempted_readers_exp(rnp); 604 must_wait = rcu_preempted_readers_exp(rnp);
552 spin_unlock(&rnp->lock); /* irqs remain disabled */ 605 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
553 if (!must_wait) 606 if (!must_wait)
554 rcu_report_exp_rnp(rsp, rnp); 607 rcu_report_exp_rnp(rsp, rnp);
555} 608}
@@ -594,13 +647,13 @@ void synchronize_rcu_expedited(void)
594 /* force all RCU readers onto blocked_tasks[]. */ 647 /* force all RCU readers onto blocked_tasks[]. */
595 synchronize_sched_expedited(); 648 synchronize_sched_expedited();
596 649
597 spin_lock_irqsave(&rsp->onofflock, flags); 650 raw_spin_lock_irqsave(&rsp->onofflock, flags);
598 651
599 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 652 /* Initialize ->expmask for all non-leaf rcu_node structures. */
600 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 653 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
601 spin_lock(&rnp->lock); /* irqs already disabled. */ 654 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
602 rnp->expmask = rnp->qsmaskinit; 655 rnp->expmask = rnp->qsmaskinit;
603 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 656 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
604 } 657 }
605 658
606 /* Snapshot current state of ->blocked_tasks[] lists. */ 659 /* Snapshot current state of ->blocked_tasks[] lists. */
@@ -609,7 +662,7 @@ void synchronize_rcu_expedited(void)
609 if (NUM_RCU_NODES > 1) 662 if (NUM_RCU_NODES > 1)
610 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); 663 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
611 664
612 spin_unlock_irqrestore(&rsp->onofflock, flags); 665 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
613 666
614 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 667 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
615 rnp = rcu_get_root(rsp); 668 rnp = rcu_get_root(rsp);
@@ -713,6 +766,16 @@ long rcu_batches_completed(void)
713EXPORT_SYMBOL_GPL(rcu_batches_completed); 766EXPORT_SYMBOL_GPL(rcu_batches_completed);
714 767
715/* 768/*
769 * Force a quiescent state for RCU, which, because there is no preemptible
770 * RCU, becomes the same as rcu-sched.
771 */
772void rcu_force_quiescent_state(void)
773{
774 rcu_sched_force_quiescent_state();
775}
776EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
777
778/*
716 * Because preemptable RCU does not exist, we never have to check for 779 * Because preemptable RCU does not exist, we never have to check for
717 * CPUs being in quiescent states. 780 * CPUs being in quiescent states.
718 */ 781 */
@@ -734,7 +797,7 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
734/* Because preemptible RCU does not exist, no quieting of tasks. */ 797/* Because preemptible RCU does not exist, no quieting of tasks. */
735static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 798static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
736{ 799{
737 spin_unlock_irqrestore(&rnp->lock, flags); 800 raw_spin_unlock_irqrestore(&rnp->lock, flags);
738} 801}
739 802
740#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 803#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -745,6 +808,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
745 * Because preemptable RCU does not exist, we never have to check for 808 * Because preemptable RCU does not exist, we never have to check for
746 * tasks blocked within RCU read-side critical sections. 809 * tasks blocked within RCU read-side critical sections.
747 */ 810 */
811static void rcu_print_detail_task_stall(struct rcu_state *rsp)
812{
813}
814
815/*
816 * Because preemptable RCU does not exist, we never have to check for
817 * tasks blocked within RCU read-side critical sections.
818 */
748static void rcu_print_task_stall(struct rcu_node *rnp) 819static void rcu_print_task_stall(struct rcu_node *rnp)
749{ 820{
750} 821}
@@ -884,3 +955,113 @@ static void __init __rcu_init_preempt(void)
884} 955}
885 956
886#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 957#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
958
959#if !defined(CONFIG_RCU_FAST_NO_HZ)
960
961/*
962 * Check to see if any future RCU-related work will need to be done
963 * by the current CPU, even if none need be done immediately, returning
964 * 1 if so. This function is part of the RCU implementation; it is -not-
965 * an exported member of the RCU API.
966 *
967 * Because we have preemptible RCU, just check whether this CPU needs
968 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption
969 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
970 */
971int rcu_needs_cpu(int cpu)
972{
973 return rcu_needs_cpu_quick_check(cpu);
974}
975
976/*
977 * Check to see if we need to continue a callback-flush operation to
978 * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle
979 * entry is not configured, so we never do need to.
980 */
981static void rcu_needs_cpu_flush(void)
982{
983}
984
985#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
986
987#define RCU_NEEDS_CPU_FLUSHES 5
988static DEFINE_PER_CPU(int, rcu_dyntick_drain);
989static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
990
991/*
992 * Check to see if any future RCU-related work will need to be done
993 * by the current CPU, even if none need be done immediately, returning
994 * 1 if so. This function is part of the RCU implementation; it is -not-
995 * an exported member of the RCU API.
996 *
997 * Because we are not supporting preemptible RCU, attempt to accelerate
998 * any current grace periods so that RCU no longer needs this CPU, but
999 * only if all other CPUs are already in dynticks-idle mode. This will
1000 * allow the CPU cores to be powered down immediately, as opposed to after
1001 * waiting many milliseconds for grace periods to elapse.
1002 *
1003 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1004 * disabled, we do one pass of force_quiescent_state(), then do a
1005 * raise_softirq() to cause rcu_process_callbacks() to be invoked later.
1006 * The per-cpu rcu_dyntick_drain variable controls the sequencing.
1007 */
1008int rcu_needs_cpu(int cpu)
1009{
1010 int c = 0;
1011 int thatcpu;
1012
1013 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1014 for_each_cpu_not(thatcpu, nohz_cpu_mask)
1015 if (thatcpu != cpu) {
1016 per_cpu(rcu_dyntick_drain, cpu) = 0;
1017 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1018 return rcu_needs_cpu_quick_check(cpu);
1019 }
1020
1021 /* Check and update the rcu_dyntick_drain sequencing. */
1022 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1023 /* First time through, initialize the counter. */
1024 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
1025 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1026 /* We have hit the limit, so time to give up. */
1027 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1028 return rcu_needs_cpu_quick_check(cpu);
1029 }
1030
1031 /* Do one step pushing remaining RCU callbacks through. */
1032 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1033 rcu_sched_qs(cpu);
1034 force_quiescent_state(&rcu_sched_state, 0);
1035 c = c || per_cpu(rcu_sched_data, cpu).nxtlist;
1036 }
1037 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1038 rcu_bh_qs(cpu);
1039 force_quiescent_state(&rcu_bh_state, 0);
1040 c = c || per_cpu(rcu_bh_data, cpu).nxtlist;
1041 }
1042
1043 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1044 if (c) {
1045 raise_softirq(RCU_SOFTIRQ);
1046 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1047 }
1048 return c;
1049}
1050
1051/*
1052 * Check to see if we need to continue a callback-flush operation to
1053 * allow the last CPU to enter dyntick-idle mode.
1054 */
1055static void rcu_needs_cpu_flush(void)
1056{
1057 int cpu = smp_processor_id();
1058 unsigned long flags;
1059
1060 if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
1061 return;
1062 local_irq_save(flags);
1063 (void)rcu_needs_cpu(cpu);
1064 local_irq_restore(flags);
1065}
1066
1067#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 9d2c88423b31..d45db2e35d27 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -50,7 +50,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 50{
51 if (!rdp->beenonline) 51 if (!rdp->beenonline)
52 return; 52 return;
53 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d", 53 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d",
54 rdp->cpu, 54 rdp->cpu,
55 cpu_is_offline(rdp->cpu) ? '!' : ' ', 55 cpu_is_offline(rdp->cpu) ? '!' : ' ',
56 rdp->completed, rdp->gpnum, 56 rdp->completed, rdp->gpnum,
@@ -105,7 +105,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
105{ 105{
106 if (!rdp->beenonline) 106 if (!rdp->beenonline)
107 return; 107 return;
108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d",
109 rdp->cpu, 109 rdp->cpu,
110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
111 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
@@ -155,13 +155,13 @@ static const struct file_operations rcudata_csv_fops = {
155 155
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 157{
158 long gpnum; 158 unsigned long gpnum;
159 int level = 0; 159 int level = 0;
160 int phase; 160 int phase;
161 struct rcu_node *rnp; 161 struct rcu_node *rnp;
162 162
163 gpnum = rsp->gpnum; 163 gpnum = rsp->gpnum;
164 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 164 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
166 rsp->completed, gpnum, rsp->signaled, 166 rsp->completed, gpnum, rsp->signaled,
167 (long)(rsp->jiffies_force_qs - jiffies), 167 (long)(rsp->jiffies_force_qs - jiffies),
@@ -215,12 +215,12 @@ static const struct file_operations rcuhier_fops = {
215static int show_rcugp(struct seq_file *m, void *unused) 215static int show_rcugp(struct seq_file *m, void *unused)
216{ 216{
217#ifdef CONFIG_TREE_PREEMPT_RCU 217#ifdef CONFIG_TREE_PREEMPT_RCU
218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n", 218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n",
219 rcu_preempt_state.completed, rcu_preempt_state.gpnum); 219 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", 221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n",
222 rcu_sched_state.completed, rcu_sched_state.gpnum); 222 rcu_sched_state.completed, rcu_sched_state.gpnum);
223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
224 rcu_bh_state.completed, rcu_bh_state.gpnum); 224 rcu_bh_state.completed, rcu_bh_state.gpnum);
225 return 0; 225 return 0;
226} 226}
diff --git a/kernel/relay.c b/kernel/relay.c
index c705a41b4ba3..3d97f2821611 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
1215/* 1215/*
1216 * subbuf_splice_actor - splice up to one subbuf's worth of data 1216 * subbuf_splice_actor - splice up to one subbuf's worth of data
1217 */ 1217 */
1218static int subbuf_splice_actor(struct file *in, 1218static ssize_t subbuf_splice_actor(struct file *in,
1219 loff_t *ppos, 1219 loff_t *ppos,
1220 struct pipe_inode_info *pipe, 1220 struct pipe_inode_info *pipe,
1221 size_t len, 1221 size_t len,
1222 unsigned int flags, 1222 unsigned int flags,
1223 int *nonpad_ret) 1223 int *nonpad_ret)
1224{ 1224{
1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; 1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
1226 struct rchan_buf *rbuf = in->private_data; 1226 struct rchan_buf *rbuf = in->private_data;
1227 unsigned int subbuf_size = rbuf->chan->subbuf_size; 1227 unsigned int subbuf_size = rbuf->chan->subbuf_size;
1228 uint64_t pos = (uint64_t) *ppos; 1228 uint64_t pos = (uint64_t) *ppos;
@@ -1241,6 +1241,7 @@ static int subbuf_splice_actor(struct file *in,
1241 .ops = &relay_pipe_buf_ops, 1241 .ops = &relay_pipe_buf_ops,
1242 .spd_release = relay_page_release, 1242 .spd_release = relay_page_release,
1243 }; 1243 };
1244 ssize_t ret;
1244 1245
1245 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1246 return 0; 1247 return 0;
diff --git a/kernel/resource.c b/kernel/resource.c
index af96c1e4b54b..2d5be5d9bf5f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -188,6 +188,36 @@ static int __release_resource(struct resource *old)
188 return -EINVAL; 188 return -EINVAL;
189} 189}
190 190
191static void __release_child_resources(struct resource *r)
192{
193 struct resource *tmp, *p;
194 resource_size_t size;
195
196 p = r->child;
197 r->child = NULL;
198 while (p) {
199 tmp = p;
200 p = p->sibling;
201
202 tmp->parent = NULL;
203 tmp->sibling = NULL;
204 __release_child_resources(tmp);
205
206 printk(KERN_DEBUG "release child resource %pR\n", tmp);
207 /* need to restore size, and keep flags */
208 size = resource_size(tmp);
209 tmp->start = 0;
210 tmp->end = size - 1;
211 }
212}
213
214void release_child_resources(struct resource *r)
215{
216 write_lock(&resource_lock);
217 __release_child_resources(r);
218 write_unlock(&resource_lock);
219}
220
191/** 221/**
192 * request_resource - request and reserve an I/O or memory resource 222 * request_resource - request and reserve an I/O or memory resource
193 * @root: root resource descriptor 223 * @root: root resource descriptor
@@ -274,7 +304,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
274 void *arg, int (*func)(unsigned long, unsigned long, void *)) 304 void *arg, int (*func)(unsigned long, unsigned long, void *))
275{ 305{
276 struct resource res; 306 struct resource res;
277 unsigned long pfn, len; 307 unsigned long pfn, end_pfn;
278 u64 orig_end; 308 u64 orig_end;
279 int ret = -1; 309 int ret = -1;
280 310
@@ -284,9 +314,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
284 orig_end = res.end; 314 orig_end = res.end;
285 while ((res.start < res.end) && 315 while ((res.start < res.end) &&
286 (find_next_system_ram(&res, "System RAM") >= 0)) { 316 (find_next_system_ram(&res, "System RAM") >= 0)) {
287 pfn = (unsigned long)(res.start >> PAGE_SHIFT); 317 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
288 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); 318 end_pfn = (res.end + 1) >> PAGE_SHIFT;
289 ret = (*func)(pfn, len, arg); 319 if (end_pfn > pfn)
320 ret = (*func)(pfn, end_pfn - pfn, arg);
290 if (ret) 321 if (ret)
291 break; 322 break;
292 res.start = res.end + 1; 323 res.start = res.end + 1;
@@ -297,14 +328,29 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
297 328
298#endif 329#endif
299 330
331static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
332{
333 return 1;
334}
335/*
336 * This generic page_is_ram() returns true if the specified address is
337 * registered as "System RAM" in the iomem_resource list.
338 */
339int __weak page_is_ram(unsigned long pfn)
340{
341 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
342}
343
300/* 344/*
301 * Find empty slot in the resource tree given range and alignment. 345 * Find empty slot in the resource tree given range and alignment.
302 */ 346 */
303static int find_resource(struct resource *root, struct resource *new, 347static int find_resource(struct resource *root, struct resource *new,
304 resource_size_t size, resource_size_t min, 348 resource_size_t size, resource_size_t min,
305 resource_size_t max, resource_size_t align, 349 resource_size_t max, resource_size_t align,
306 void (*alignf)(void *, struct resource *, 350 resource_size_t (*alignf)(void *,
307 resource_size_t, resource_size_t), 351 const struct resource *,
352 resource_size_t,
353 resource_size_t),
308 void *alignf_data) 354 void *alignf_data)
309{ 355{
310 struct resource *this = root->child; 356 struct resource *this = root->child;
@@ -330,7 +376,7 @@ static int find_resource(struct resource *root, struct resource *new,
330 tmp.end = max; 376 tmp.end = max;
331 tmp.start = ALIGN(tmp.start, align); 377 tmp.start = ALIGN(tmp.start, align);
332 if (alignf) 378 if (alignf)
333 alignf(alignf_data, &tmp, size, align); 379 tmp.start = alignf(alignf_data, &tmp, size, align);
334 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { 380 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
335 new->start = tmp.start; 381 new->start = tmp.start;
336 new->end = tmp.start + size - 1; 382 new->end = tmp.start + size - 1;
@@ -358,8 +404,10 @@ static int find_resource(struct resource *root, struct resource *new,
358int allocate_resource(struct resource *root, struct resource *new, 404int allocate_resource(struct resource *root, struct resource *new,
359 resource_size_t size, resource_size_t min, 405 resource_size_t size, resource_size_t min,
360 resource_size_t max, resource_size_t align, 406 resource_size_t max, resource_size_t align,
361 void (*alignf)(void *, struct resource *, 407 resource_size_t (*alignf)(void *,
362 resource_size_t, resource_size_t), 408 const struct resource *,
409 resource_size_t,
410 resource_size_t),
363 void *alignf_data) 411 void *alignf_data)
364{ 412{
365 int err; 413 int err;
diff --git a/kernel/sched.c b/kernel/sched.c
index 3e71ebb101c2..150b6988de49 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233 */ 233 */
234static DEFINE_MUTEX(sched_domains_mutex); 234static DEFINE_MUTEX(sched_domains_mutex);
235 235
236#ifdef CONFIG_GROUP_SCHED 236#ifdef CONFIG_CGROUP_SCHED
237 237
238#include <linux/cgroup.h> 238#include <linux/cgroup.h>
239 239
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
243 243
244/* task group related information */ 244/* task group related information */
245struct task_group { 245struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css; 246 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253 247
254#ifdef CONFIG_FAIR_GROUP_SCHED 248#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */ 249 /* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
274 struct list_head children; 268 struct list_head children;
275}; 269};
276 270
277#ifdef CONFIG_USER_SCHED
278
279/* Helper function to pass uid information to create_sched_user() */
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285/*
286 * Root task group.
287 * Every UID task group (including init_task_group aka UID-0) will
288 * be a child to this group.
289 */
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293/* Default task group's sched entity on each cpu */
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295/* Default task group's cfs_rq on each cpu */
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif /* CONFIG_FAIR_GROUP_SCHED */
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 271#define root_task_group init_task_group
305#endif /* CONFIG_USER_SCHED */
306 272
307/* task_group_lock serializes add/remove of task groups and also changes to 273/* task_group_lock serializes add/remove of task groups and also changes to
308 * a task group's cpu shares. 274 * a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
318} 284}
319#endif 285#endif
320 286
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else /* !CONFIG_USER_SCHED */
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 287# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif /* CONFIG_USER_SCHED */
326 288
327/* 289/*
328 * A weight of 0 or 1 can cause arithmetics problems. 290 * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
348{ 310{
349 struct task_group *tg; 311 struct task_group *tg;
350 312
351#ifdef CONFIG_USER_SCHED 313#ifdef CONFIG_CGROUP_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 314 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css); 315 struct task_group, css);
358#else 316#else
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
383 return NULL; 341 return NULL;
384} 342}
385 343
386#endif /* CONFIG_GROUP_SCHED */ 344#endif /* CONFIG_CGROUP_SCHED */
387 345
388/* CFS-related fields in a runqueue */ 346/* CFS-related fields in a runqueue */
389struct cfs_rq { 347struct cfs_rq {
@@ -478,7 +436,6 @@ struct rt_rq {
478 struct rq *rq; 436 struct rq *rq;
479 struct list_head leaf_rt_rq_list; 437 struct list_head leaf_rt_rq_list;
480 struct task_group *tg; 438 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif 439#endif
483}; 440};
484 441
@@ -645,6 +602,11 @@ static inline int cpu_of(struct rq *rq)
645#endif 602#endif
646} 603}
647 604
605#define rcu_dereference_check_sched_domain(p) \
606 rcu_dereference_check((p), \
607 rcu_read_lock_sched_held() || \
608 lockdep_is_held(&sched_domains_mutex))
609
648/* 610/*
649 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 611 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
650 * See detach_destroy_domains: synchronize_sched for details. 612 * See detach_destroy_domains: synchronize_sched for details.
@@ -653,7 +615,7 @@ static inline int cpu_of(struct rq *rq)
653 * preempt-disabled sections. 615 * preempt-disabled sections.
654 */ 616 */
655#define for_each_domain(cpu, __sd) \ 617#define for_each_domain(cpu, __sd) \
656 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 618 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
657 619
658#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 620#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
659#define this_rq() (&__get_cpu_var(runqueues)) 621#define this_rq() (&__get_cpu_var(runqueues))
@@ -941,16 +903,33 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
941#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 903#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
942 904
943/* 905/*
906 * Check whether the task is waking, we use this to synchronize against
907 * ttwu() so that task_cpu() reports a stable number.
908 *
909 * We need to make an exception for PF_STARTING tasks because the fork
910 * path might require task_rq_lock() to work, eg. it can call
911 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
912 */
913static inline int task_is_waking(struct task_struct *p)
914{
915 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
916}
917
918/*
944 * __task_rq_lock - lock the runqueue a given task resides on. 919 * __task_rq_lock - lock the runqueue a given task resides on.
945 * Must be called interrupts disabled. 920 * Must be called interrupts disabled.
946 */ 921 */
947static inline struct rq *__task_rq_lock(struct task_struct *p) 922static inline struct rq *__task_rq_lock(struct task_struct *p)
948 __acquires(rq->lock) 923 __acquires(rq->lock)
949{ 924{
925 struct rq *rq;
926
950 for (;;) { 927 for (;;) {
951 struct rq *rq = task_rq(p); 928 while (task_is_waking(p))
929 cpu_relax();
930 rq = task_rq(p);
952 raw_spin_lock(&rq->lock); 931 raw_spin_lock(&rq->lock);
953 if (likely(rq == task_rq(p))) 932 if (likely(rq == task_rq(p) && !task_is_waking(p)))
954 return rq; 933 return rq;
955 raw_spin_unlock(&rq->lock); 934 raw_spin_unlock(&rq->lock);
956 } 935 }
@@ -967,10 +946,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
967 struct rq *rq; 946 struct rq *rq;
968 947
969 for (;;) { 948 for (;;) {
949 while (task_is_waking(p))
950 cpu_relax();
970 local_irq_save(*flags); 951 local_irq_save(*flags);
971 rq = task_rq(p); 952 rq = task_rq(p);
972 raw_spin_lock(&rq->lock); 953 raw_spin_lock(&rq->lock);
973 if (likely(rq == task_rq(p))) 954 if (likely(rq == task_rq(p) && !task_is_waking(p)))
974 return rq; 955 return rq;
975 raw_spin_unlock_irqrestore(&rq->lock, *flags); 956 raw_spin_unlock_irqrestore(&rq->lock, *flags);
976 } 957 }
@@ -1390,32 +1371,6 @@ static const u32 prio_to_wmult[40] = {
1390 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1371 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1391}; 1372};
1392 1373
1393static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1394
1395/*
1396 * runqueue iterator, to support SMP load-balancing between different
1397 * scheduling classes, without having to expose their internal data
1398 * structures to the load-balancing proper:
1399 */
1400struct rq_iterator {
1401 void *arg;
1402 struct task_struct *(*start)(void *);
1403 struct task_struct *(*next)(void *);
1404};
1405
1406#ifdef CONFIG_SMP
1407static unsigned long
1408balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1409 unsigned long max_load_move, struct sched_domain *sd,
1410 enum cpu_idle_type idle, int *all_pinned,
1411 int *this_best_prio, struct rq_iterator *iterator);
1412
1413static int
1414iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1415 struct sched_domain *sd, enum cpu_idle_type idle,
1416 struct rq_iterator *iterator);
1417#endif
1418
1419/* Time spent by the tasks of the cpu accounting group executing in ... */ 1374/* Time spent by the tasks of the cpu accounting group executing in ... */
1420enum cpuacct_stat_index { 1375enum cpuacct_stat_index {
1421 CPUACCT_STAT_USER, /* ... user mode */ 1376 CPUACCT_STAT_USER, /* ... user mode */
@@ -1531,7 +1486,7 @@ static unsigned long target_load(int cpu, int type)
1531 1486
1532static struct sched_group *group_of(int cpu) 1487static struct sched_group *group_of(int cpu)
1533{ 1488{
1534 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); 1489 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1535 1490
1536 if (!sd) 1491 if (!sd)
1537 return NULL; 1492 return NULL;
@@ -1566,7 +1521,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1566 1521
1567#ifdef CONFIG_FAIR_GROUP_SCHED 1522#ifdef CONFIG_FAIR_GROUP_SCHED
1568 1523
1569static __read_mostly unsigned long *update_shares_data; 1524static __read_mostly unsigned long __percpu *update_shares_data;
1570 1525
1571static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1526static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1572 1527
@@ -1701,16 +1656,6 @@ static void update_shares(struct sched_domain *sd)
1701 } 1656 }
1702} 1657}
1703 1658
1704static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1705{
1706 if (root_task_group_empty())
1707 return;
1708
1709 raw_spin_unlock(&rq->lock);
1710 update_shares(sd);
1711 raw_spin_lock(&rq->lock);
1712}
1713
1714static void update_h_load(long cpu) 1659static void update_h_load(long cpu)
1715{ 1660{
1716 if (root_task_group_empty()) 1661 if (root_task_group_empty())
@@ -1725,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd)
1725{ 1670{
1726} 1671}
1727 1672
1728static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1729{
1730}
1731
1732#endif 1673#endif
1733 1674
1734#ifdef CONFIG_PREEMPT 1675#ifdef CONFIG_PREEMPT
@@ -1805,6 +1746,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 raw_spin_unlock(&busiest->lock); 1746 raw_spin_unlock(&busiest->lock);
1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1747 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1807} 1748}
1749
1750/*
1751 * double_rq_lock - safely lock two runqueues
1752 *
1753 * Note this does not disable interrupts like task_rq_lock,
1754 * you need to do so manually before calling.
1755 */
1756static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1757 __acquires(rq1->lock)
1758 __acquires(rq2->lock)
1759{
1760 BUG_ON(!irqs_disabled());
1761 if (rq1 == rq2) {
1762 raw_spin_lock(&rq1->lock);
1763 __acquire(rq2->lock); /* Fake it out ;) */
1764 } else {
1765 if (rq1 < rq2) {
1766 raw_spin_lock(&rq1->lock);
1767 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1768 } else {
1769 raw_spin_lock(&rq2->lock);
1770 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1771 }
1772 }
1773 update_rq_clock(rq1);
1774 update_rq_clock(rq2);
1775}
1776
1777/*
1778 * double_rq_unlock - safely unlock two runqueues
1779 *
1780 * Note this does not restore interrupts like task_rq_unlock,
1781 * you need to do so manually after calling.
1782 */
1783static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1784 __releases(rq1->lock)
1785 __releases(rq2->lock)
1786{
1787 raw_spin_unlock(&rq1->lock);
1788 if (rq1 != rq2)
1789 raw_spin_unlock(&rq2->lock);
1790 else
1791 __release(rq2->lock);
1792}
1793
1808#endif 1794#endif
1809 1795
1810#ifdef CONFIG_FAIR_GROUP_SCHED 1796#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1834,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1834#endif 1820#endif
1835} 1821}
1836 1822
1837#include "sched_stats.h" 1823static const struct sched_class rt_sched_class;
1838#include "sched_idletask.c"
1839#include "sched_fair.c"
1840#include "sched_rt.c"
1841#ifdef CONFIG_SCHED_DEBUG
1842# include "sched_debug.c"
1843#endif
1844 1824
1845#define sched_class_highest (&rt_sched_class) 1825#define sched_class_highest (&rt_sched_class)
1846#define for_each_class(class) \ 1826#define for_each_class(class) \
1847 for (class = sched_class_highest; class; class = class->next) 1827 for (class = sched_class_highest; class; class = class->next)
1848 1828
1829#include "sched_stats.h"
1830
1849static void inc_nr_running(struct rq *rq) 1831static void inc_nr_running(struct rq *rq)
1850{ 1832{
1851 rq->nr_running++; 1833 rq->nr_running++;
@@ -1883,13 +1865,14 @@ static void update_avg(u64 *avg, u64 sample)
1883 *avg += diff >> 3; 1865 *avg += diff >> 3;
1884} 1866}
1885 1867
1886static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1868static void
1869enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1887{ 1870{
1888 if (wakeup) 1871 if (wakeup)
1889 p->se.start_runtime = p->se.sum_exec_runtime; 1872 p->se.start_runtime = p->se.sum_exec_runtime;
1890 1873
1891 sched_info_queued(p); 1874 sched_info_queued(p);
1892 p->sched_class->enqueue_task(rq, p, wakeup); 1875 p->sched_class->enqueue_task(rq, p, wakeup, head);
1893 p->se.on_rq = 1; 1876 p->se.on_rq = 1;
1894} 1877}
1895 1878
@@ -1912,6 +1895,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1912} 1895}
1913 1896
1914/* 1897/*
1898 * activate_task - move a task to the runqueue.
1899 */
1900static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1901{
1902 if (task_contributes_to_load(p))
1903 rq->nr_uninterruptible--;
1904
1905 enqueue_task(rq, p, wakeup, false);
1906 inc_nr_running(rq);
1907}
1908
1909/*
1910 * deactivate_task - remove a task from the runqueue.
1911 */
1912static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1913{
1914 if (task_contributes_to_load(p))
1915 rq->nr_uninterruptible++;
1916
1917 dequeue_task(rq, p, sleep);
1918 dec_nr_running(rq);
1919}
1920
1921#include "sched_idletask.c"
1922#include "sched_fair.c"
1923#include "sched_rt.c"
1924#ifdef CONFIG_SCHED_DEBUG
1925# include "sched_debug.c"
1926#endif
1927
1928/*
1915 * __normal_prio - return the priority that is based on the static prio 1929 * __normal_prio - return the priority that is based on the static prio
1916 */ 1930 */
1917static inline int __normal_prio(struct task_struct *p) 1931static inline int __normal_prio(struct task_struct *p)
@@ -1957,30 +1971,6 @@ static int effective_prio(struct task_struct *p)
1957 return p->prio; 1971 return p->prio;
1958} 1972}
1959 1973
1960/*
1961 * activate_task - move a task to the runqueue.
1962 */
1963static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1964{
1965 if (task_contributes_to_load(p))
1966 rq->nr_uninterruptible--;
1967
1968 enqueue_task(rq, p, wakeup);
1969 inc_nr_running(rq);
1970}
1971
1972/*
1973 * deactivate_task - remove a task from the runqueue.
1974 */
1975static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1976{
1977 if (task_contributes_to_load(p))
1978 rq->nr_uninterruptible++;
1979
1980 dequeue_task(rq, p, sleep);
1981 dec_nr_running(rq);
1982}
1983
1984/** 1974/**
1985 * task_curr - is this task currently executing on a CPU? 1975 * task_curr - is this task currently executing on a CPU?
1986 * @p: the task in question. 1976 * @p: the task in question.
@@ -2408,14 +2398,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2408 __task_rq_unlock(rq); 2398 __task_rq_unlock(rq);
2409 2399
2410 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2400 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2411 if (cpu != orig_cpu) 2401 if (cpu != orig_cpu) {
2402 /*
2403 * Since we migrate the task without holding any rq->lock,
2404 * we need to be careful with task_rq_lock(), since that
2405 * might end up locking an invalid rq.
2406 */
2412 set_task_cpu(p, cpu); 2407 set_task_cpu(p, cpu);
2408 }
2413 2409
2414 rq = __task_rq_lock(p); 2410 rq = cpu_rq(cpu);
2411 raw_spin_lock(&rq->lock);
2415 update_rq_clock(rq); 2412 update_rq_clock(rq);
2416 2413
2414 /*
2415 * We migrated the task without holding either rq->lock, however
2416 * since the task is not on the task list itself, nobody else
2417 * will try and migrate the task, hence the rq should match the
2418 * cpu we just moved it to.
2419 */
2420 WARN_ON(task_cpu(p) != cpu);
2417 WARN_ON(p->state != TASK_WAKING); 2421 WARN_ON(p->state != TASK_WAKING);
2418 cpu = task_cpu(p);
2419 2422
2420#ifdef CONFIG_SCHEDSTATS 2423#ifdef CONFIG_SCHEDSTATS
2421 schedstat_inc(rq, ttwu_count); 2424 schedstat_inc(rq, ttwu_count);
@@ -2663,7 +2666,13 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2663 set_task_cpu(p, cpu); 2666 set_task_cpu(p, cpu);
2664#endif 2667#endif
2665 2668
2666 rq = task_rq_lock(p, &flags); 2669 /*
2670 * Since the task is not on the rq and we still have TASK_WAKING set
2671 * nobody else will migrate this task.
2672 */
2673 rq = cpu_rq(cpu);
2674 raw_spin_lock_irqsave(&rq->lock, flags);
2675
2667 BUG_ON(p->state != TASK_WAKING); 2676 BUG_ON(p->state != TASK_WAKING);
2668 p->state = TASK_RUNNING; 2677 p->state = TASK_RUNNING;
2669 update_rq_clock(rq); 2678 update_rq_clock(rq);
@@ -3105,50 +3114,6 @@ static void update_cpu_load(struct rq *this_rq)
3105#ifdef CONFIG_SMP 3114#ifdef CONFIG_SMP
3106 3115
3107/* 3116/*
3108 * double_rq_lock - safely lock two runqueues
3109 *
3110 * Note this does not disable interrupts like task_rq_lock,
3111 * you need to do so manually before calling.
3112 */
3113static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3114 __acquires(rq1->lock)
3115 __acquires(rq2->lock)
3116{
3117 BUG_ON(!irqs_disabled());
3118 if (rq1 == rq2) {
3119 raw_spin_lock(&rq1->lock);
3120 __acquire(rq2->lock); /* Fake it out ;) */
3121 } else {
3122 if (rq1 < rq2) {
3123 raw_spin_lock(&rq1->lock);
3124 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3125 } else {
3126 raw_spin_lock(&rq2->lock);
3127 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3128 }
3129 }
3130 update_rq_clock(rq1);
3131 update_rq_clock(rq2);
3132}
3133
3134/*
3135 * double_rq_unlock - safely unlock two runqueues
3136 *
3137 * Note this does not restore interrupts like task_rq_unlock,
3138 * you need to do so manually after calling.
3139 */
3140static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3141 __releases(rq1->lock)
3142 __releases(rq2->lock)
3143{
3144 raw_spin_unlock(&rq1->lock);
3145 if (rq1 != rq2)
3146 raw_spin_unlock(&rq2->lock);
3147 else
3148 __release(rq2->lock);
3149}
3150
3151/*
3152 * sched_exec - execve() is a valuable balancing opportunity, because at 3117 * sched_exec - execve() is a valuable balancing opportunity, because at
3153 * this point the task has the smallest effective memory and cache footprint. 3118 * this point the task has the smallest effective memory and cache footprint.
3154 */ 3119 */
@@ -3196,1771 +3161,6 @@ again:
3196 task_rq_unlock(rq, &flags); 3161 task_rq_unlock(rq, &flags);
3197} 3162}
3198 3163
3199/*
3200 * pull_task - move a task from a remote runqueue to the local runqueue.
3201 * Both runqueues must be locked.
3202 */
3203static void pull_task(struct rq *src_rq, struct task_struct *p,
3204 struct rq *this_rq, int this_cpu)
3205{
3206 deactivate_task(src_rq, p, 0);
3207 set_task_cpu(p, this_cpu);
3208 activate_task(this_rq, p, 0);
3209 check_preempt_curr(this_rq, p, 0);
3210}
3211
3212/*
3213 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3214 */
3215static
3216int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3217 struct sched_domain *sd, enum cpu_idle_type idle,
3218 int *all_pinned)
3219{
3220 int tsk_cache_hot = 0;
3221 /*
3222 * We do not migrate tasks that are:
3223 * 1) running (obviously), or
3224 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3225 * 3) are cache-hot on their current CPU.
3226 */
3227 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3228 schedstat_inc(p, se.nr_failed_migrations_affine);
3229 return 0;
3230 }
3231 *all_pinned = 0;
3232
3233 if (task_running(rq, p)) {
3234 schedstat_inc(p, se.nr_failed_migrations_running);
3235 return 0;
3236 }
3237
3238 /*
3239 * Aggressive migration if:
3240 * 1) task is cache cold, or
3241 * 2) too many balance attempts have failed.
3242 */
3243
3244 tsk_cache_hot = task_hot(p, rq->clock, sd);
3245 if (!tsk_cache_hot ||
3246 sd->nr_balance_failed > sd->cache_nice_tries) {
3247#ifdef CONFIG_SCHEDSTATS
3248 if (tsk_cache_hot) {
3249 schedstat_inc(sd, lb_hot_gained[idle]);
3250 schedstat_inc(p, se.nr_forced_migrations);
3251 }
3252#endif
3253 return 1;
3254 }
3255
3256 if (tsk_cache_hot) {
3257 schedstat_inc(p, se.nr_failed_migrations_hot);
3258 return 0;
3259 }
3260 return 1;
3261}
3262
3263static unsigned long
3264balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3265 unsigned long max_load_move, struct sched_domain *sd,
3266 enum cpu_idle_type idle, int *all_pinned,
3267 int *this_best_prio, struct rq_iterator *iterator)
3268{
3269 int loops = 0, pulled = 0, pinned = 0;
3270 struct task_struct *p;
3271 long rem_load_move = max_load_move;
3272
3273 if (max_load_move == 0)
3274 goto out;
3275
3276 pinned = 1;
3277
3278 /*
3279 * Start the load-balancing iterator:
3280 */
3281 p = iterator->start(iterator->arg);
3282next:
3283 if (!p || loops++ > sysctl_sched_nr_migrate)
3284 goto out;
3285
3286 if ((p->se.load.weight >> 1) > rem_load_move ||
3287 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3288 p = iterator->next(iterator->arg);
3289 goto next;
3290 }
3291
3292 pull_task(busiest, p, this_rq, this_cpu);
3293 pulled++;
3294 rem_load_move -= p->se.load.weight;
3295
3296#ifdef CONFIG_PREEMPT
3297 /*
3298 * NEWIDLE balancing is a source of latency, so preemptible kernels
3299 * will stop after the first task is pulled to minimize the critical
3300 * section.
3301 */
3302 if (idle == CPU_NEWLY_IDLE)
3303 goto out;
3304#endif
3305
3306 /*
3307 * We only want to steal up to the prescribed amount of weighted load.
3308 */
3309 if (rem_load_move > 0) {
3310 if (p->prio < *this_best_prio)
3311 *this_best_prio = p->prio;
3312 p = iterator->next(iterator->arg);
3313 goto next;
3314 }
3315out:
3316 /*
3317 * Right now, this is one of only two places pull_task() is called,
3318 * so we can safely collect pull_task() stats here rather than
3319 * inside pull_task().
3320 */
3321 schedstat_add(sd, lb_gained[idle], pulled);
3322
3323 if (all_pinned)
3324 *all_pinned = pinned;
3325
3326 return max_load_move - rem_load_move;
3327}
3328
3329/*
3330 * move_tasks tries to move up to max_load_move weighted load from busiest to
3331 * this_rq, as part of a balancing operation within domain "sd".
3332 * Returns 1 if successful and 0 otherwise.
3333 *
3334 * Called with both runqueues locked.
3335 */
3336static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3337 unsigned long max_load_move,
3338 struct sched_domain *sd, enum cpu_idle_type idle,
3339 int *all_pinned)
3340{
3341 const struct sched_class *class = sched_class_highest;
3342 unsigned long total_load_moved = 0;
3343 int this_best_prio = this_rq->curr->prio;
3344
3345 do {
3346 total_load_moved +=
3347 class->load_balance(this_rq, this_cpu, busiest,
3348 max_load_move - total_load_moved,
3349 sd, idle, all_pinned, &this_best_prio);
3350 class = class->next;
3351
3352#ifdef CONFIG_PREEMPT
3353 /*
3354 * NEWIDLE balancing is a source of latency, so preemptible
3355 * kernels will stop after the first task is pulled to minimize
3356 * the critical section.
3357 */
3358 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3359 break;
3360#endif
3361 } while (class && max_load_move > total_load_moved);
3362
3363 return total_load_moved > 0;
3364}
3365
3366static int
3367iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3368 struct sched_domain *sd, enum cpu_idle_type idle,
3369 struct rq_iterator *iterator)
3370{
3371 struct task_struct *p = iterator->start(iterator->arg);
3372 int pinned = 0;
3373
3374 while (p) {
3375 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3376 pull_task(busiest, p, this_rq, this_cpu);
3377 /*
3378 * Right now, this is only the second place pull_task()
3379 * is called, so we can safely collect pull_task()
3380 * stats here rather than inside pull_task().
3381 */
3382 schedstat_inc(sd, lb_gained[idle]);
3383
3384 return 1;
3385 }
3386 p = iterator->next(iterator->arg);
3387 }
3388
3389 return 0;
3390}
3391
3392/*
3393 * move_one_task tries to move exactly one task from busiest to this_rq, as
3394 * part of active balancing operations within "domain".
3395 * Returns 1 if successful and 0 otherwise.
3396 *
3397 * Called with both runqueues locked.
3398 */
3399static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3400 struct sched_domain *sd, enum cpu_idle_type idle)
3401{
3402 const struct sched_class *class;
3403
3404 for_each_class(class) {
3405 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3406 return 1;
3407 }
3408
3409 return 0;
3410}
3411/********** Helpers for find_busiest_group ************************/
3412/*
3413 * sd_lb_stats - Structure to store the statistics of a sched_domain
3414 * during load balancing.
3415 */
3416struct sd_lb_stats {
3417 struct sched_group *busiest; /* Busiest group in this sd */
3418 struct sched_group *this; /* Local group in this sd */
3419 unsigned long total_load; /* Total load of all groups in sd */
3420 unsigned long total_pwr; /* Total power of all groups in sd */
3421 unsigned long avg_load; /* Average load across all groups in sd */
3422
3423 /** Statistics of this group */
3424 unsigned long this_load;
3425 unsigned long this_load_per_task;
3426 unsigned long this_nr_running;
3427
3428 /* Statistics of the busiest group */
3429 unsigned long max_load;
3430 unsigned long busiest_load_per_task;
3431 unsigned long busiest_nr_running;
3432
3433 int group_imb; /* Is there imbalance in this sd */
3434#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3435 int power_savings_balance; /* Is powersave balance needed for this sd */
3436 struct sched_group *group_min; /* Least loaded group in sd */
3437 struct sched_group *group_leader; /* Group which relieves group_min */
3438 unsigned long min_load_per_task; /* load_per_task in group_min */
3439 unsigned long leader_nr_running; /* Nr running of group_leader */
3440 unsigned long min_nr_running; /* Nr running of group_min */
3441#endif
3442};
3443
3444/*
3445 * sg_lb_stats - stats of a sched_group required for load_balancing
3446 */
3447struct sg_lb_stats {
3448 unsigned long avg_load; /*Avg load across the CPUs of the group */
3449 unsigned long group_load; /* Total load over the CPUs of the group */
3450 unsigned long sum_nr_running; /* Nr tasks running in the group */
3451 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3452 unsigned long group_capacity;
3453 int group_imb; /* Is there an imbalance in the group ? */
3454};
3455
3456/**
3457 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3458 * @group: The group whose first cpu is to be returned.
3459 */
3460static inline unsigned int group_first_cpu(struct sched_group *group)
3461{
3462 return cpumask_first(sched_group_cpus(group));
3463}
3464
3465/**
3466 * get_sd_load_idx - Obtain the load index for a given sched domain.
3467 * @sd: The sched_domain whose load_idx is to be obtained.
3468 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3469 */
3470static inline int get_sd_load_idx(struct sched_domain *sd,
3471 enum cpu_idle_type idle)
3472{
3473 int load_idx;
3474
3475 switch (idle) {
3476 case CPU_NOT_IDLE:
3477 load_idx = sd->busy_idx;
3478 break;
3479
3480 case CPU_NEWLY_IDLE:
3481 load_idx = sd->newidle_idx;
3482 break;
3483 default:
3484 load_idx = sd->idle_idx;
3485 break;
3486 }
3487
3488 return load_idx;
3489}
3490
3491
3492#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3493/**
3494 * init_sd_power_savings_stats - Initialize power savings statistics for
3495 * the given sched_domain, during load balancing.
3496 *
3497 * @sd: Sched domain whose power-savings statistics are to be initialized.
3498 * @sds: Variable containing the statistics for sd.
3499 * @idle: Idle status of the CPU at which we're performing load-balancing.
3500 */
3501static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3502 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3503{
3504 /*
3505 * Busy processors will not participate in power savings
3506 * balance.
3507 */
3508 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3509 sds->power_savings_balance = 0;
3510 else {
3511 sds->power_savings_balance = 1;
3512 sds->min_nr_running = ULONG_MAX;
3513 sds->leader_nr_running = 0;
3514 }
3515}
3516
3517/**
3518 * update_sd_power_savings_stats - Update the power saving stats for a
3519 * sched_domain while performing load balancing.
3520 *
3521 * @group: sched_group belonging to the sched_domain under consideration.
3522 * @sds: Variable containing the statistics of the sched_domain
3523 * @local_group: Does group contain the CPU for which we're performing
3524 * load balancing ?
3525 * @sgs: Variable containing the statistics of the group.
3526 */
3527static inline void update_sd_power_savings_stats(struct sched_group *group,
3528 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3529{
3530
3531 if (!sds->power_savings_balance)
3532 return;
3533
3534 /*
3535 * If the local group is idle or completely loaded
3536 * no need to do power savings balance at this domain
3537 */
3538 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3539 !sds->this_nr_running))
3540 sds->power_savings_balance = 0;
3541
3542 /*
3543 * If a group is already running at full capacity or idle,
3544 * don't include that group in power savings calculations
3545 */
3546 if (!sds->power_savings_balance ||
3547 sgs->sum_nr_running >= sgs->group_capacity ||
3548 !sgs->sum_nr_running)
3549 return;
3550
3551 /*
3552 * Calculate the group which has the least non-idle load.
3553 * This is the group from where we need to pick up the load
3554 * for saving power
3555 */
3556 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3557 (sgs->sum_nr_running == sds->min_nr_running &&
3558 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3559 sds->group_min = group;
3560 sds->min_nr_running = sgs->sum_nr_running;
3561 sds->min_load_per_task = sgs->sum_weighted_load /
3562 sgs->sum_nr_running;
3563 }
3564
3565 /*
3566 * Calculate the group which is almost near its
3567 * capacity but still has some space to pick up some load
3568 * from other group and save more power
3569 */
3570 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3571 return;
3572
3573 if (sgs->sum_nr_running > sds->leader_nr_running ||
3574 (sgs->sum_nr_running == sds->leader_nr_running &&
3575 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3576 sds->group_leader = group;
3577 sds->leader_nr_running = sgs->sum_nr_running;
3578 }
3579}
3580
3581/**
3582 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3583 * @sds: Variable containing the statistics of the sched_domain
3584 * under consideration.
3585 * @this_cpu: Cpu at which we're currently performing load-balancing.
3586 * @imbalance: Variable to store the imbalance.
3587 *
3588 * Description:
3589 * Check if we have potential to perform some power-savings balance.
3590 * If yes, set the busiest group to be the least loaded group in the
3591 * sched_domain, so that it's CPUs can be put to idle.
3592 *
3593 * Returns 1 if there is potential to perform power-savings balance.
3594 * Else returns 0.
3595 */
3596static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3597 int this_cpu, unsigned long *imbalance)
3598{
3599 if (!sds->power_savings_balance)
3600 return 0;
3601
3602 if (sds->this != sds->group_leader ||
3603 sds->group_leader == sds->group_min)
3604 return 0;
3605
3606 *imbalance = sds->min_load_per_task;
3607 sds->busiest = sds->group_min;
3608
3609 return 1;
3610
3611}
3612#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3613static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3614 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3615{
3616 return;
3617}
3618
3619static inline void update_sd_power_savings_stats(struct sched_group *group,
3620 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3621{
3622 return;
3623}
3624
3625static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3626 int this_cpu, unsigned long *imbalance)
3627{
3628 return 0;
3629}
3630#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3631
3632
3633unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3634{
3635 return SCHED_LOAD_SCALE;
3636}
3637
3638unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3639{
3640 return default_scale_freq_power(sd, cpu);
3641}
3642
3643unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3644{
3645 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3646 unsigned long smt_gain = sd->smt_gain;
3647
3648 smt_gain /= weight;
3649
3650 return smt_gain;
3651}
3652
3653unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3654{
3655 return default_scale_smt_power(sd, cpu);
3656}
3657
3658unsigned long scale_rt_power(int cpu)
3659{
3660 struct rq *rq = cpu_rq(cpu);
3661 u64 total, available;
3662
3663 sched_avg_update(rq);
3664
3665 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3666 available = total - rq->rt_avg;
3667
3668 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3669 total = SCHED_LOAD_SCALE;
3670
3671 total >>= SCHED_LOAD_SHIFT;
3672
3673 return div_u64(available, total);
3674}
3675
3676static void update_cpu_power(struct sched_domain *sd, int cpu)
3677{
3678 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3679 unsigned long power = SCHED_LOAD_SCALE;
3680 struct sched_group *sdg = sd->groups;
3681
3682 if (sched_feat(ARCH_POWER))
3683 power *= arch_scale_freq_power(sd, cpu);
3684 else
3685 power *= default_scale_freq_power(sd, cpu);
3686
3687 power >>= SCHED_LOAD_SHIFT;
3688
3689 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3690 if (sched_feat(ARCH_POWER))
3691 power *= arch_scale_smt_power(sd, cpu);
3692 else
3693 power *= default_scale_smt_power(sd, cpu);
3694
3695 power >>= SCHED_LOAD_SHIFT;
3696 }
3697
3698 power *= scale_rt_power(cpu);
3699 power >>= SCHED_LOAD_SHIFT;
3700
3701 if (!power)
3702 power = 1;
3703
3704 sdg->cpu_power = power;
3705}
3706
3707static void update_group_power(struct sched_domain *sd, int cpu)
3708{
3709 struct sched_domain *child = sd->child;
3710 struct sched_group *group, *sdg = sd->groups;
3711 unsigned long power;
3712
3713 if (!child) {
3714 update_cpu_power(sd, cpu);
3715 return;
3716 }
3717
3718 power = 0;
3719
3720 group = child->groups;
3721 do {
3722 power += group->cpu_power;
3723 group = group->next;
3724 } while (group != child->groups);
3725
3726 sdg->cpu_power = power;
3727}
3728
3729/**
3730 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3731 * @sd: The sched_domain whose statistics are to be updated.
3732 * @group: sched_group whose statistics are to be updated.
3733 * @this_cpu: Cpu for which load balance is currently performed.
3734 * @idle: Idle status of this_cpu
3735 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3736 * @sd_idle: Idle status of the sched_domain containing group.
3737 * @local_group: Does group contain this_cpu.
3738 * @cpus: Set of cpus considered for load balancing.
3739 * @balance: Should we balance.
3740 * @sgs: variable to hold the statistics for this group.
3741 */
3742static inline void update_sg_lb_stats(struct sched_domain *sd,
3743 struct sched_group *group, int this_cpu,
3744 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3745 int local_group, const struct cpumask *cpus,
3746 int *balance, struct sg_lb_stats *sgs)
3747{
3748 unsigned long load, max_cpu_load, min_cpu_load;
3749 int i;
3750 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3751 unsigned long sum_avg_load_per_task;
3752 unsigned long avg_load_per_task;
3753
3754 if (local_group) {
3755 balance_cpu = group_first_cpu(group);
3756 if (balance_cpu == this_cpu)
3757 update_group_power(sd, this_cpu);
3758 }
3759
3760 /* Tally up the load of all CPUs in the group */
3761 sum_avg_load_per_task = avg_load_per_task = 0;
3762 max_cpu_load = 0;
3763 min_cpu_load = ~0UL;
3764
3765 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3766 struct rq *rq = cpu_rq(i);
3767
3768 if (*sd_idle && rq->nr_running)
3769 *sd_idle = 0;
3770
3771 /* Bias balancing toward cpus of our domain */
3772 if (local_group) {
3773 if (idle_cpu(i) && !first_idle_cpu) {
3774 first_idle_cpu = 1;
3775 balance_cpu = i;
3776 }
3777
3778 load = target_load(i, load_idx);
3779 } else {
3780 load = source_load(i, load_idx);
3781 if (load > max_cpu_load)
3782 max_cpu_load = load;
3783 if (min_cpu_load > load)
3784 min_cpu_load = load;
3785 }
3786
3787 sgs->group_load += load;
3788 sgs->sum_nr_running += rq->nr_running;
3789 sgs->sum_weighted_load += weighted_cpuload(i);
3790
3791 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3792 }
3793
3794 /*
3795 * First idle cpu or the first cpu(busiest) in this sched group
3796 * is eligible for doing load balancing at this and above
3797 * domains. In the newly idle case, we will allow all the cpu's
3798 * to do the newly idle load balance.
3799 */
3800 if (idle != CPU_NEWLY_IDLE && local_group &&
3801 balance_cpu != this_cpu && balance) {
3802 *balance = 0;
3803 return;
3804 }
3805
3806 /* Adjust by relative CPU power of the group */
3807 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3808
3809
3810 /*
3811 * Consider the group unbalanced when the imbalance is larger
3812 * than the average weight of two tasks.
3813 *
3814 * APZ: with cgroup the avg task weight can vary wildly and
3815 * might not be a suitable number - should we keep a
3816 * normalized nr_running number somewhere that negates
3817 * the hierarchy?
3818 */
3819 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3820 group->cpu_power;
3821
3822 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3823 sgs->group_imb = 1;
3824
3825 sgs->group_capacity =
3826 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3827}
3828
3829/**
3830 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3831 * @sd: sched_domain whose statistics are to be updated.
3832 * @this_cpu: Cpu for which load balance is currently performed.
3833 * @idle: Idle status of this_cpu
3834 * @sd_idle: Idle status of the sched_domain containing group.
3835 * @cpus: Set of cpus considered for load balancing.
3836 * @balance: Should we balance.
3837 * @sds: variable to hold the statistics for this sched_domain.
3838 */
3839static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3840 enum cpu_idle_type idle, int *sd_idle,
3841 const struct cpumask *cpus, int *balance,
3842 struct sd_lb_stats *sds)
3843{
3844 struct sched_domain *child = sd->child;
3845 struct sched_group *group = sd->groups;
3846 struct sg_lb_stats sgs;
3847 int load_idx, prefer_sibling = 0;
3848
3849 if (child && child->flags & SD_PREFER_SIBLING)
3850 prefer_sibling = 1;
3851
3852 init_sd_power_savings_stats(sd, sds, idle);
3853 load_idx = get_sd_load_idx(sd, idle);
3854
3855 do {
3856 int local_group;
3857
3858 local_group = cpumask_test_cpu(this_cpu,
3859 sched_group_cpus(group));
3860 memset(&sgs, 0, sizeof(sgs));
3861 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3862 local_group, cpus, balance, &sgs);
3863
3864 if (local_group && balance && !(*balance))
3865 return;
3866
3867 sds->total_load += sgs.group_load;
3868 sds->total_pwr += group->cpu_power;
3869
3870 /*
3871 * In case the child domain prefers tasks go to siblings
3872 * first, lower the group capacity to one so that we'll try
3873 * and move all the excess tasks away.
3874 */
3875 if (prefer_sibling)
3876 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3877
3878 if (local_group) {
3879 sds->this_load = sgs.avg_load;
3880 sds->this = group;
3881 sds->this_nr_running = sgs.sum_nr_running;
3882 sds->this_load_per_task = sgs.sum_weighted_load;
3883 } else if (sgs.avg_load > sds->max_load &&
3884 (sgs.sum_nr_running > sgs.group_capacity ||
3885 sgs.group_imb)) {
3886 sds->max_load = sgs.avg_load;
3887 sds->busiest = group;
3888 sds->busiest_nr_running = sgs.sum_nr_running;
3889 sds->busiest_load_per_task = sgs.sum_weighted_load;
3890 sds->group_imb = sgs.group_imb;
3891 }
3892
3893 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3894 group = group->next;
3895 } while (group != sd->groups);
3896}
3897
3898/**
3899 * fix_small_imbalance - Calculate the minor imbalance that exists
3900 * amongst the groups of a sched_domain, during
3901 * load balancing.
3902 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3903 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3904 * @imbalance: Variable to store the imbalance.
3905 */
3906static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3907 int this_cpu, unsigned long *imbalance)
3908{
3909 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3910 unsigned int imbn = 2;
3911
3912 if (sds->this_nr_running) {
3913 sds->this_load_per_task /= sds->this_nr_running;
3914 if (sds->busiest_load_per_task >
3915 sds->this_load_per_task)
3916 imbn = 1;
3917 } else
3918 sds->this_load_per_task =
3919 cpu_avg_load_per_task(this_cpu);
3920
3921 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3922 sds->busiest_load_per_task * imbn) {
3923 *imbalance = sds->busiest_load_per_task;
3924 return;
3925 }
3926
3927 /*
3928 * OK, we don't have enough imbalance to justify moving tasks,
3929 * however we may be able to increase total CPU power used by
3930 * moving them.
3931 */
3932
3933 pwr_now += sds->busiest->cpu_power *
3934 min(sds->busiest_load_per_task, sds->max_load);
3935 pwr_now += sds->this->cpu_power *
3936 min(sds->this_load_per_task, sds->this_load);
3937 pwr_now /= SCHED_LOAD_SCALE;
3938
3939 /* Amount of load we'd subtract */
3940 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3941 sds->busiest->cpu_power;
3942 if (sds->max_load > tmp)
3943 pwr_move += sds->busiest->cpu_power *
3944 min(sds->busiest_load_per_task, sds->max_load - tmp);
3945
3946 /* Amount of load we'd add */
3947 if (sds->max_load * sds->busiest->cpu_power <
3948 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3949 tmp = (sds->max_load * sds->busiest->cpu_power) /
3950 sds->this->cpu_power;
3951 else
3952 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3953 sds->this->cpu_power;
3954 pwr_move += sds->this->cpu_power *
3955 min(sds->this_load_per_task, sds->this_load + tmp);
3956 pwr_move /= SCHED_LOAD_SCALE;
3957
3958 /* Move if we gain throughput */
3959 if (pwr_move > pwr_now)
3960 *imbalance = sds->busiest_load_per_task;
3961}
3962
3963/**
3964 * calculate_imbalance - Calculate the amount of imbalance present within the
3965 * groups of a given sched_domain during load balance.
3966 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3967 * @this_cpu: Cpu for which currently load balance is being performed.
3968 * @imbalance: The variable to store the imbalance.
3969 */
3970static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3971 unsigned long *imbalance)
3972{
3973 unsigned long max_pull;
3974 /*
3975 * In the presence of smp nice balancing, certain scenarios can have
3976 * max load less than avg load(as we skip the groups at or below
3977 * its cpu_power, while calculating max_load..)
3978 */
3979 if (sds->max_load < sds->avg_load) {
3980 *imbalance = 0;
3981 return fix_small_imbalance(sds, this_cpu, imbalance);
3982 }
3983
3984 /* Don't want to pull so many tasks that a group would go idle */
3985 max_pull = min(sds->max_load - sds->avg_load,
3986 sds->max_load - sds->busiest_load_per_task);
3987
3988 /* How much load to actually move to equalise the imbalance */
3989 *imbalance = min(max_pull * sds->busiest->cpu_power,
3990 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3991 / SCHED_LOAD_SCALE;
3992
3993 /*
3994 * if *imbalance is less than the average load per runnable task
3995 * there is no gaurantee that any tasks will be moved so we'll have
3996 * a think about bumping its value to force at least one task to be
3997 * moved
3998 */
3999 if (*imbalance < sds->busiest_load_per_task)
4000 return fix_small_imbalance(sds, this_cpu, imbalance);
4001
4002}
4003/******* find_busiest_group() helpers end here *********************/
4004
4005/**
4006 * find_busiest_group - Returns the busiest group within the sched_domain
4007 * if there is an imbalance. If there isn't an imbalance, and
4008 * the user has opted for power-savings, it returns a group whose
4009 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4010 * such a group exists.
4011 *
4012 * Also calculates the amount of weighted load which should be moved
4013 * to restore balance.
4014 *
4015 * @sd: The sched_domain whose busiest group is to be returned.
4016 * @this_cpu: The cpu for which load balancing is currently being performed.
4017 * @imbalance: Variable which stores amount of weighted load which should
4018 * be moved to restore balance/put a group to idle.
4019 * @idle: The idle status of this_cpu.
4020 * @sd_idle: The idleness of sd
4021 * @cpus: The set of CPUs under consideration for load-balancing.
4022 * @balance: Pointer to a variable indicating if this_cpu
4023 * is the appropriate cpu to perform load balancing at this_level.
4024 *
4025 * Returns: - the busiest group if imbalance exists.
4026 * - If no imbalance and user has opted for power-savings balance,
4027 * return the least loaded group whose CPUs can be
4028 * put to idle by rebalancing its tasks onto our group.
4029 */
4030static struct sched_group *
4031find_busiest_group(struct sched_domain *sd, int this_cpu,
4032 unsigned long *imbalance, enum cpu_idle_type idle,
4033 int *sd_idle, const struct cpumask *cpus, int *balance)
4034{
4035 struct sd_lb_stats sds;
4036
4037 memset(&sds, 0, sizeof(sds));
4038
4039 /*
4040 * Compute the various statistics relavent for load balancing at
4041 * this level.
4042 */
4043 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4044 balance, &sds);
4045
4046 /* Cases where imbalance does not exist from POV of this_cpu */
4047 /* 1) this_cpu is not the appropriate cpu to perform load balancing
4048 * at this level.
4049 * 2) There is no busy sibling group to pull from.
4050 * 3) This group is the busiest group.
4051 * 4) This group is more busy than the avg busieness at this
4052 * sched_domain.
4053 * 5) The imbalance is within the specified limit.
4054 * 6) Any rebalance would lead to ping-pong
4055 */
4056 if (balance && !(*balance))
4057 goto ret;
4058
4059 if (!sds.busiest || sds.busiest_nr_running == 0)
4060 goto out_balanced;
4061
4062 if (sds.this_load >= sds.max_load)
4063 goto out_balanced;
4064
4065 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4066
4067 if (sds.this_load >= sds.avg_load)
4068 goto out_balanced;
4069
4070 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4071 goto out_balanced;
4072
4073 sds.busiest_load_per_task /= sds.busiest_nr_running;
4074 if (sds.group_imb)
4075 sds.busiest_load_per_task =
4076 min(sds.busiest_load_per_task, sds.avg_load);
4077
4078 /*
4079 * We're trying to get all the cpus to the average_load, so we don't
4080 * want to push ourselves above the average load, nor do we wish to
4081 * reduce the max loaded cpu below the average load, as either of these
4082 * actions would just result in more rebalancing later, and ping-pong
4083 * tasks around. Thus we look for the minimum possible imbalance.
4084 * Negative imbalances (*we* are more loaded than anyone else) will
4085 * be counted as no imbalance for these purposes -- we can't fix that
4086 * by pulling tasks to us. Be careful of negative numbers as they'll
4087 * appear as very large values with unsigned longs.
4088 */
4089 if (sds.max_load <= sds.busiest_load_per_task)
4090 goto out_balanced;
4091
4092 /* Looks like there is an imbalance. Compute it */
4093 calculate_imbalance(&sds, this_cpu, imbalance);
4094 return sds.busiest;
4095
4096out_balanced:
4097 /*
4098 * There is no obvious imbalance. But check if we can do some balancing
4099 * to save power.
4100 */
4101 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4102 return sds.busiest;
4103ret:
4104 *imbalance = 0;
4105 return NULL;
4106}
4107
4108/*
4109 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4110 */
4111static struct rq *
4112find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4113 unsigned long imbalance, const struct cpumask *cpus)
4114{
4115 struct rq *busiest = NULL, *rq;
4116 unsigned long max_load = 0;
4117 int i;
4118
4119 for_each_cpu(i, sched_group_cpus(group)) {
4120 unsigned long power = power_of(i);
4121 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4122 unsigned long wl;
4123
4124 if (!cpumask_test_cpu(i, cpus))
4125 continue;
4126
4127 rq = cpu_rq(i);
4128 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4129 wl /= power;
4130
4131 if (capacity && rq->nr_running == 1 && wl > imbalance)
4132 continue;
4133
4134 if (wl > max_load) {
4135 max_load = wl;
4136 busiest = rq;
4137 }
4138 }
4139
4140 return busiest;
4141}
4142
4143/*
4144 * Max backoff if we encounter pinned tasks. The value is fairly arbitrary;
4145 * it only needs to be large enough. load_balance() compares
 * sd->balance_interval against it before doubling the interval when every
 * task it looked at was pinned.
4146 */
4147#define MAX_PINNED_INTERVAL 512
4148
4149/*
 * Working cpumask for load_balance and load_balance_newidle. Both copy
 * cpu_active_mask into it and clear the cpus whose tasks were all pinned.
 */
4150static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4151
4152/*
4153 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4154 * tasks if there is an imbalance.
4155 */
4156static int load_balance(int this_cpu, struct rq *this_rq,
4157 struct sched_domain *sd, enum cpu_idle_type idle,
4158 int *balance)
4159{
4160 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4161 struct sched_group *group;
4162 unsigned long imbalance;
4163 struct rq *busiest;
4164 unsigned long flags;
4165 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4166
4167 cpumask_copy(cpus, cpu_active_mask);
4168
4169 /*
4170 * When power savings policy is enabled for the parent domain, idle
4171 * sibling can pick up load irrespective of busy siblings. In this case,
4172 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4173 * portraying it as CPU_NOT_IDLE.
4174 */
4175 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4176 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4177 sd_idle = 1;
4178
4179 schedstat_inc(sd, lb_count[idle]);
4180
4181redo:
4182 update_shares(sd);
4183 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4184 cpus, balance);
4185
4186 if (*balance == 0)
4187 goto out_balanced;
4188
4189 if (!group) {
4190 schedstat_inc(sd, lb_nobusyg[idle]);
4191 goto out_balanced;
4192 }
4193
4194 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4195 if (!busiest) {
4196 schedstat_inc(sd, lb_nobusyq[idle]);
4197 goto out_balanced;
4198 }
4199
4200 BUG_ON(busiest == this_rq);
4201
4202 schedstat_add(sd, lb_imbalance[idle], imbalance);
4203
4204 ld_moved = 0;
4205 if (busiest->nr_running > 1) {
4206 /*
4207 * Attempt to move tasks. If find_busiest_group has found
4208 * an imbalance but busiest->nr_running <= 1, the group is
4209 * still unbalanced. ld_moved simply stays zero, so it is
4210 * correctly treated as an imbalance.
4211 */
4212 local_irq_save(flags);
4213 double_rq_lock(this_rq, busiest);
4214 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4215 imbalance, sd, idle, &all_pinned);
4216 double_rq_unlock(this_rq, busiest);
4217 local_irq_restore(flags);
4218
4219 /*
4220 * some other cpu did the load balance for us.
4221 */
4222 if (ld_moved && this_cpu != smp_processor_id())
4223 resched_cpu(this_cpu);
4224
4225 /* All tasks on this runqueue were pinned by CPU affinity */
4226 if (unlikely(all_pinned)) {
4227 cpumask_clear_cpu(cpu_of(busiest), cpus);
4228 if (!cpumask_empty(cpus))
4229 goto redo;
4230 goto out_balanced;
4231 }
4232 }
4233
4234 if (!ld_moved) {
4235 schedstat_inc(sd, lb_failed[idle]);
4236 sd->nr_balance_failed++;
4237
4238 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4239
4240 raw_spin_lock_irqsave(&busiest->lock, flags);
4241
4242 /* don't kick the migration_thread, if the curr
4243 * task on busiest cpu can't be moved to this_cpu
4244 */
4245 if (!cpumask_test_cpu(this_cpu,
4246 &busiest->curr->cpus_allowed)) {
4247 raw_spin_unlock_irqrestore(&busiest->lock,
4248 flags);
4249 all_pinned = 1;
4250 goto out_one_pinned;
4251 }
4252
4253 if (!busiest->active_balance) {
4254 busiest->active_balance = 1;
4255 busiest->push_cpu = this_cpu;
4256 active_balance = 1;
4257 }
4258 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4259 if (active_balance)
4260 wake_up_process(busiest->migration_thread);
4261
4262 /*
4263 * We've kicked active balancing, reset the failure
4264 * counter.
4265 */
4266 sd->nr_balance_failed = sd->cache_nice_tries+1;
4267 }
4268 } else
4269 sd->nr_balance_failed = 0;
4270
4271 if (likely(!active_balance)) {
4272 /* We were unbalanced, so reset the balancing interval */
4273 sd->balance_interval = sd->min_interval;
4274 } else {
4275 /*
4276 * If we've begun active balancing, start to back off. This
4277 * case may not be covered by the all_pinned logic if there
4278 * is only 1 task on the busy runqueue (because we don't call
4279 * move_tasks).
4280 */
4281 if (sd->balance_interval < sd->max_interval)
4282 sd->balance_interval *= 2;
4283 }
4284
4285 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4286 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4287 ld_moved = -1;
4288
4289 goto out;
4290
4291out_balanced:
4292 schedstat_inc(sd, lb_balanced[idle]);
4293
4294 sd->nr_balance_failed = 0;
4295
4296out_one_pinned:
4297 /* tune up the balancing interval */
4298 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4299 (sd->balance_interval < sd->max_interval))
4300 sd->balance_interval *= 2;
4301
4302 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4303 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4304 ld_moved = -1;
4305 else
4306 ld_moved = 0;
4307out:
4308 if (ld_moved)
4309 update_shares(sd);
4310 return ld_moved;
4311}
4312
4313/*
4314 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4315 * tasks if there is an imbalance.
4316 *
4317 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4318 * this_rq is locked.
4319 */
4320static int
4321load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4322{
4323 struct sched_group *group;
4324 struct rq *busiest = NULL;
4325 unsigned long imbalance;
4326 int ld_moved = 0;
4327 int sd_idle = 0;
4328 int all_pinned = 0;
4329 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4330
4331 cpumask_copy(cpus, cpu_active_mask);
4332
4333 /*
4334 * When power savings policy is enabled for the parent domain, idle
4335 * sibling can pick up load irrespective of busy siblings. In this case,
4336 * let the state of idle sibling percolate up as IDLE, instead of
4337 * portraying it as CPU_NOT_IDLE.
4338 */
4339 if (sd->flags & SD_SHARE_CPUPOWER &&
4340 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4341 sd_idle = 1;
4342
4343 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4344redo:
4345 update_shares_locked(this_rq, sd);
4346 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4347 &sd_idle, cpus, NULL);
4348 if (!group) {
4349 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4350 goto out_balanced;
4351 }
4352
4353 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4354 if (!busiest) {
4355 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4356 goto out_balanced;
4357 }
4358
4359 BUG_ON(busiest == this_rq);
4360
4361 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4362
4363 ld_moved = 0;
4364 if (busiest->nr_running > 1) {
4365 /* Attempt to move tasks */
4366 double_lock_balance(this_rq, busiest);
4367 /* this_rq->clock is already updated */
4368 update_rq_clock(busiest);
4369 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4370 imbalance, sd, CPU_NEWLY_IDLE,
4371 &all_pinned);
4372 double_unlock_balance(this_rq, busiest);
4373
4374 if (unlikely(all_pinned)) {
4375 cpumask_clear_cpu(cpu_of(busiest), cpus);
4376 if (!cpumask_empty(cpus))
4377 goto redo;
4378 }
4379 }
4380
4381 if (!ld_moved) {
4382 int active_balance = 0;
4383
4384 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4385 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4386 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4387 return -1;
4388
4389 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4390 return -1;
4391
4392 if (sd->nr_balance_failed++ < 2)
4393 return -1;
4394
4395 /*
4396 * The only task running in a non-idle cpu can be moved to this
4397 * cpu in an attempt to completely freeup the other CPU
4398 * package. The same method used to move task in load_balance()
4399 * have been extended for load_balance_newidle() to speedup
4400 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
4401 *
4402 * The package power saving logic comes from
4403 * find_busiest_group(). If there are no imbalance, then
4404 * f_b_g() will return NULL. However when sched_mc={1,2} then
4405 * f_b_g() will select a group from which a running task may be
4406 * pulled to this cpu in order to make the other package idle.
4407 * If there is no opportunity to make a package idle and if
4408 * there are no imbalance, then f_b_g() will return NULL and no
4409 * action will be taken in load_balance_newidle().
4410 *
4411 * Under normal task pull operation due to imbalance, there
4412 * will be more than one task in the source run queue and
4413 * move_tasks() will succeed. ld_moved will be true and this
4414 * active balance code will not be triggered.
4415 */
4416
4417 /* Lock busiest in correct order while this_rq is held */
4418 double_lock_balance(this_rq, busiest);
4419
4420 /*
4421 * don't kick the migration_thread, if the curr
4422 * task on busiest cpu can't be moved to this_cpu
4423 */
4424 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4425 double_unlock_balance(this_rq, busiest);
4426 all_pinned = 1;
4427 return ld_moved;
4428 }
4429
4430 if (!busiest->active_balance) {
4431 busiest->active_balance = 1;
4432 busiest->push_cpu = this_cpu;
4433 active_balance = 1;
4434 }
4435
4436 double_unlock_balance(this_rq, busiest);
4437 /*
4438 * Should not call ttwu while holding a rq->lock
4439 */
4440 raw_spin_unlock(&this_rq->lock);
4441 if (active_balance)
4442 wake_up_process(busiest->migration_thread);
4443 raw_spin_lock(&this_rq->lock);
4444
4445 } else
4446 sd->nr_balance_failed = 0;
4447
4448 update_shares_locked(this_rq, sd);
4449 return ld_moved;
4450
4451out_balanced:
4452 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4453 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4454 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4455 return -1;
4456 sd->nr_balance_failed = 0;
4457
4458 return 0;
4459}
4460
4461/*
4462 * idle_balance is called by schedule() if this_cpu is about to become
4463 * idle. Attempts to pull tasks from other CPUs.
4464 */
4465static void idle_balance(int this_cpu, struct rq *this_rq)
4466{
4467 struct sched_domain *sd;
4468 int pulled_task = 0;
4469 unsigned long next_balance = jiffies + HZ;
4470
4471 this_rq->idle_stamp = this_rq->clock;
4472
4473 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4474 return;
4475
4476 for_each_domain(this_cpu, sd) {
4477 unsigned long interval;
4478
4479 if (!(sd->flags & SD_LOAD_BALANCE))
4480 continue;
4481
4482 if (sd->flags & SD_BALANCE_NEWIDLE)
4483 /* If we've pulled tasks over stop searching: */
4484 pulled_task = load_balance_newidle(this_cpu, this_rq,
4485 sd);
4486
4487 interval = msecs_to_jiffies(sd->balance_interval);
4488 if (time_after(next_balance, sd->last_balance + interval))
4489 next_balance = sd->last_balance + interval;
4490 if (pulled_task) {
4491 this_rq->idle_stamp = 0;
4492 break;
4493 }
4494 }
4495 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4496 /*
4497 * We are going idle. next_balance may be set based on
4498 * a busy processor. So reset next_balance.
4499 */
4500 this_rq->next_balance = next_balance;
4501 }
4502}
4503
4504/*
4505 * active_load_balance is run by migration threads. It pushes running tasks
4506 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4507 * running on each physical CPU where possible, and avoids physical /
4508 * logical imbalances.
4509 *
4510 * Called with busiest_rq locked.
4511 */
4512static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4513{
4514 int target_cpu = busiest_rq->push_cpu;
4515 struct sched_domain *sd;
4516 struct rq *target_rq;
4517
4518 /* Is there any task to move? */
4519 if (busiest_rq->nr_running <= 1)
4520 return;
4521
4522 target_rq = cpu_rq(target_cpu);
4523
4524 /*
4525 * This condition is "impossible", if it occurs
4526 * we need to fix it. Originally reported by
4527 * Bjorn Helgaas on a 128-cpu setup.
4528 */
4529 BUG_ON(busiest_rq == target_rq);
4530
4531 /* move a task from busiest_rq to target_rq */
4532 double_lock_balance(busiest_rq, target_rq);
4533 update_rq_clock(busiest_rq);
4534 update_rq_clock(target_rq);
4535
4536 /* Search for an sd spanning us and the target CPU. */
4537 for_each_domain(target_cpu, sd) {
4538 if ((sd->flags & SD_LOAD_BALANCE) &&
4539 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4540 break;
4541 }
4542
4543 if (likely(sd)) {
4544 schedstat_inc(sd, alb_count);
4545
4546 if (move_one_task(target_rq, target_cpu, busiest_rq,
4547 sd, CPU_IDLE))
4548 schedstat_inc(sd, alb_pushed);
4549 else
4550 schedstat_inc(sd, alb_failed);
4551 }
4552 double_unlock_balance(busiest_rq, target_rq);
4553}
4554
4555#ifdef CONFIG_NO_HZ
/*
 * State shared by the idle load balancer (ilb) code.
 */
4556static struct {
4557 atomic_t load_balancer; /* cpu id of the ilb owner, -1 when there is none */
4558 cpumask_var_t cpu_mask; /* cpus added when their tick stops, removed on restart */
4559 cpumask_var_t ilb_grp_nohz_mask; /* scratch mask filled in by is_semi_idle_group() */
4560} nohz ____cacheline_aligned = {
4561 .load_balancer = ATOMIC_INIT(-1),
4562};
4563
4564int get_nohz_load_balancer(void)
4565{
4566 return atomic_read(&nohz.load_balancer);
4567}
4568
4569#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4570/**
4571 * lowest_flag_domain - Return lowest sched_domain containing flag.
4572 * @cpu: The cpu whose lowest level of sched domain is to
4573 * be returned.
4574 * @flag: The flag to check for the lowest sched_domain
4575 * for the given cpu.
4576 *
4577 * Returns the lowest sched_domain of a cpu which contains the given flag.
4578 */
4579static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4580{
4581 struct sched_domain *sd;
4582
4583 for_each_domain(cpu, sd)
4584 if (sd && (sd->flags & flag))
4585 break;
4586
4587 return sd;
4588}
4589
/**
 * for_each_flag_domain - Iterates over sched_domains containing the flag.
 * @cpu:	The cpu whose domains we're iterating over.
 * @sd:		Iteration variable; holds the current sched_domain.
 * @flag:	The flag to filter the sched_domains to be iterated.
 *
 * Iterates over the scheduler domains of a given cpu that have 'flag' set,
 * starting at the lowest such sched_domain and following ->parent.  The
 * walk stops at the first parent that lacks the flag.
 *
 * The @sd and @flag arguments are parenthesised in the expansion so that
 * an expression such as "A | B" is tested as a whole.
 */
#define for_each_flag_domain(cpu, sd, flag) \
	for ((sd) = lowest_flag_domain(cpu, flag); \
		((sd) && ((sd)->flags & (flag))); (sd) = (sd)->parent)
4603
4604/**
4605 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4606 * @ilb_group: group to be checked for semi-idleness
4607 *
4608 * Returns: 1 if the group is semi-idle. 0 otherwise.
4609 *
4610 * We define a sched_group to be semi idle if it has atleast one idle-CPU
4611 * and atleast one non-idle CPU. This helper function checks if the given
4612 * sched_group is semi-idle or not.
4613 */
4614static inline int is_semi_idle_group(struct sched_group *ilb_group)
4615{
4616 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4617 sched_group_cpus(ilb_group));
4618
4619 /*
4620 * A sched_group is semi-idle when it has atleast one busy cpu
4621 * and atleast one idle cpu.
4622 */
4623 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4624 return 0;
4625
4626 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4627 return 0;
4628
4629 return 1;
4630}
4631/**
4632 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4633 * @cpu: The cpu which is nominating a new idle_load_balancer.
4634 *
4635 * Returns: Returns the id of the idle load balancer if it exists,
4636 * Else, returns >= nr_cpu_ids.
4637 *
4638 * This algorithm picks the idle load balancer such that it belongs to a
4639 * semi-idle powersavings sched_domain. The idea is to try and avoid
4640 * completely idle packages/cores just for the purpose of idle load balancing
4641 * when there are other idle cpu's which are better suited for that job.
4642 */
4643static int find_new_ilb(int cpu)
4644{
4645 struct sched_domain *sd;
4646 struct sched_group *ilb_group;
4647
4648 /*
4649 * Have idle load balancer selection from semi-idle packages only
4650 * when power-aware load balancing is enabled
4651 */
4652 if (!(sched_smt_power_savings || sched_mc_power_savings))
4653 goto out_done;
4654
4655 /*
4656 * Optimize for the case when we have no idle CPUs or only one
4657 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4658 */
4659 if (cpumask_weight(nohz.cpu_mask) < 2)
4660 goto out_done;
4661
4662 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4663 ilb_group = sd->groups;
4664
4665 do {
4666 if (is_semi_idle_group(ilb_group))
4667 return cpumask_first(nohz.ilb_grp_nohz_mask);
4668
4669 ilb_group = ilb_group->next;
4670
4671 } while (ilb_group != sd->groups);
4672 }
4673
4674out_done:
4675 return cpumask_first(nohz.cpu_mask);
4676}
4677#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4678static inline int find_new_ilb(int call_cpu)
4679{
4680 return cpumask_first(nohz.cpu_mask);
4681}
4682#endif
4683
4684/*
4685 * This routine will try to nominate the ilb (idle load balancing)
4686 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4687 * load balancing on behalf of all those cpus. If all the cpus in the system
4688 * go into this tickless mode, then there will be no ilb owner (as there is
4689 * no need for one) and all the cpus will sleep till the next wakeup event
4690 * arrives...
4691 *
4692 * For the ilb owner, tick is not stopped. And this tick will be used
4693 * for idle load balancing. ilb owner will still be part of
4694 * nohz.cpu_mask..
4695 *
4696 * While stopping the tick, this cpu will become the ilb owner if there
4697 * is no other owner. And will be the owner till that cpu becomes busy
4698 * or if all cpus in the system stop their ticks at which point
4699 * there is no need for ilb owner.
4700 *
4701 * When the ilb owner becomes busy, it nominates another owner, during the
4702 * next busy scheduler_tick()
4703 */
4704int select_nohz_load_balancer(int stop_tick)
4705{
4706 int cpu = smp_processor_id();
4707
4708 if (stop_tick) {
4709 cpu_rq(cpu)->in_nohz_recently = 1;
4710
4711 if (!cpu_active(cpu)) {
4712 if (atomic_read(&nohz.load_balancer) != cpu)
4713 return 0;
4714
4715 /*
4716 * If we are going offline and still the leader,
4717 * give up!
4718 */
4719 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4720 BUG();
4721
4722 return 0;
4723 }
4724
4725 cpumask_set_cpu(cpu, nohz.cpu_mask);
4726
4727 /* time for ilb owner also to sleep */
4728 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4729 if (atomic_read(&nohz.load_balancer) == cpu)
4730 atomic_set(&nohz.load_balancer, -1);
4731 return 0;
4732 }
4733
4734 if (atomic_read(&nohz.load_balancer) == -1) {
4735 /* make me the ilb owner */
4736 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4737 return 1;
4738 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4739 int new_ilb;
4740
4741 if (!(sched_smt_power_savings ||
4742 sched_mc_power_savings))
4743 return 1;
4744 /*
4745 * Check to see if there is a more power-efficient
4746 * ilb.
4747 */
4748 new_ilb = find_new_ilb(cpu);
4749 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4750 atomic_set(&nohz.load_balancer, -1);
4751 resched_cpu(new_ilb);
4752 return 0;
4753 }
4754 return 1;
4755 }
4756 } else {
4757 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4758 return 0;
4759
4760 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4761
4762 if (atomic_read(&nohz.load_balancer) == cpu)
4763 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4764 BUG();
4765 }
4766 return 0;
4767}
4768#endif
4769
4770static DEFINE_SPINLOCK(balancing);
4771
4772/*
4773 * It checks each scheduling domain to see if it is due to be balanced,
4774 * and initiates a balancing operation if so.
4775 *
4776 * Balancing parameters are set up in arch_init_sched_domains.
4777 */
4778static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4779{
4780 int balance = 1;
4781 struct rq *rq = cpu_rq(cpu);
4782 unsigned long interval;
4783 struct sched_domain *sd;
4784 /* Earliest time when we have to do rebalance again */
4785 unsigned long next_balance = jiffies + 60*HZ;
4786 int update_next_balance = 0;
4787 int need_serialize;
4788
4789 for_each_domain(cpu, sd) {
4790 if (!(sd->flags & SD_LOAD_BALANCE))
4791 continue;
4792
4793 interval = sd->balance_interval;
4794 if (idle != CPU_IDLE)
4795 interval *= sd->busy_factor;
4796
4797 /* scale ms to jiffies */
4798 interval = msecs_to_jiffies(interval);
4799 if (unlikely(!interval))
4800 interval = 1;
4801 if (interval > HZ*NR_CPUS/10)
4802 interval = HZ*NR_CPUS/10;
4803
4804 need_serialize = sd->flags & SD_SERIALIZE;
4805
4806 if (need_serialize) {
4807 if (!spin_trylock(&balancing))
4808 goto out;
4809 }
4810
4811 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4812 if (load_balance(cpu, rq, sd, idle, &balance)) {
4813 /*
4814 * We've pulled tasks over so either we're no
4815 * longer idle, or one of our SMT siblings is
4816 * not idle.
4817 */
4818 idle = CPU_NOT_IDLE;
4819 }
4820 sd->last_balance = jiffies;
4821 }
4822 if (need_serialize)
4823 spin_unlock(&balancing);
4824out:
4825 if (time_after(next_balance, sd->last_balance + interval)) {
4826 next_balance = sd->last_balance + interval;
4827 update_next_balance = 1;
4828 }
4829
4830 /*
4831 * Stop the load balance at this level. There is another
4832 * CPU in our sched group which is doing load balancing more
4833 * actively.
4834 */
4835 if (!balance)
4836 break;
4837 }
4838
4839 /*
4840 * next_balance will be updated only when there is a need.
4841 * When the cpu is attached to null domain for ex, it will not be
4842 * updated.
4843 */
4844 if (likely(update_next_balance))
4845 rq->next_balance = next_balance;
4846}
4847
4848/*
4849 * run_rebalance_domains is triggered when needed from the scheduler tick.
4850 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4851 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4852 */
4853static void run_rebalance_domains(struct softirq_action *h)
4854{
4855 int this_cpu = smp_processor_id();
4856 struct rq *this_rq = cpu_rq(this_cpu);
4857 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4858 CPU_IDLE : CPU_NOT_IDLE;
4859
4860 rebalance_domains(this_cpu, idle);
4861
4862#ifdef CONFIG_NO_HZ
4863 /*
4864 * If this cpu is the owner for idle load balancing, then do the
4865 * balancing on behalf of the other idle cpus whose ticks are
4866 * stopped.
4867 */
4868 if (this_rq->idle_at_tick &&
4869 atomic_read(&nohz.load_balancer) == this_cpu) {
4870 struct rq *rq;
4871 int balance_cpu;
4872
4873 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4874 if (balance_cpu == this_cpu)
4875 continue;
4876
4877 /*
4878 * If this cpu gets work to do, stop the load balancing
4879 * work being done for other cpus. Next load
4880 * balancing owner will pick it up.
4881 */
4882 if (need_resched())
4883 break;
4884
4885 rebalance_domains(balance_cpu, CPU_IDLE);
4886
4887 rq = cpu_rq(balance_cpu);
4888 if (time_after(this_rq->next_balance, rq->next_balance))
4889 this_rq->next_balance = rq->next_balance;
4890 }
4891 }
4892#endif
4893}
4894
4895static inline int on_null_domain(int cpu)
4896{
4897 return !rcu_dereference(cpu_rq(cpu)->sd);
4898}
4899
4900/*
4901 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4902 *
4903 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4904 * idle load balancing owner or decide to stop the periodic load balancing,
4905 * if the whole system is idle.
4906 */
4907static inline void trigger_load_balance(struct rq *rq, int cpu)
4908{
4909#ifdef CONFIG_NO_HZ
4910 /*
4911 * If we were in the nohz mode recently and busy at the current
4912 * scheduler tick, then check if we need to nominate new idle
4913 * load balancer.
4914 */
4915 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4916 rq->in_nohz_recently = 0;
4917
4918 if (atomic_read(&nohz.load_balancer) == cpu) {
4919 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4920 atomic_set(&nohz.load_balancer, -1);
4921 }
4922
4923 if (atomic_read(&nohz.load_balancer) == -1) {
4924 int ilb = find_new_ilb(cpu);
4925
4926 if (ilb < nr_cpu_ids)
4927 resched_cpu(ilb);
4928 }
4929 }
4930
4931 /*
4932 * If this cpu is idle and doing idle load balancing for all the
4933 * cpus with ticks stopped, is it time for that to stop?
4934 */
4935 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4936 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4937 resched_cpu(cpu);
4938 return;
4939 }
4940
4941 /*
4942 * If this cpu is idle and the idle load balancing is done by
4943 * someone else, then no need raise the SCHED_SOFTIRQ
4944 */
4945 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4946 cpumask_test_cpu(cpu, nohz.cpu_mask))
4947 return;
4948#endif
4949 /* Don't need to rebalance while attached to NULL domain */
4950 if (time_after_eq(jiffies, rq->next_balance) &&
4951 likely(!on_null_domain(cpu)))
4952 raise_softirq(SCHED_SOFTIRQ);
4953}
4954
4955#else /* CONFIG_SMP */
4956
4957/*
4958 * on UP we do not need to balance between CPUs:
4959 */
4960static inline void idle_balance(int cpu, struct rq *rq)
4961{
4962}
4963
4964#endif 3164#endif
4965 3165
4966DEFINE_PER_CPU(struct kernel_stat, kstat); 3166DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -6060,7 +4260,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6060 unsigned long flags; 4260 unsigned long flags;
6061 int oldprio, on_rq, running; 4261 int oldprio, on_rq, running;
6062 struct rq *rq; 4262 struct rq *rq;
6063 const struct sched_class *prev_class = p->sched_class; 4263 const struct sched_class *prev_class;
6064 4264
6065 BUG_ON(prio < 0 || prio > MAX_PRIO); 4265 BUG_ON(prio < 0 || prio > MAX_PRIO);
6066 4266
@@ -6068,6 +4268,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6068 update_rq_clock(rq); 4268 update_rq_clock(rq);
6069 4269
6070 oldprio = p->prio; 4270 oldprio = p->prio;
4271 prev_class = p->sched_class;
6071 on_rq = p->se.on_rq; 4272 on_rq = p->se.on_rq;
6072 running = task_current(rq, p); 4273 running = task_current(rq, p);
6073 if (on_rq) 4274 if (on_rq)
@@ -6085,7 +4286,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6085 if (running) 4286 if (running)
6086 p->sched_class->set_curr_task(rq); 4287 p->sched_class->set_curr_task(rq);
6087 if (on_rq) { 4288 if (on_rq) {
6088 enqueue_task(rq, p, 0); 4289 enqueue_task(rq, p, 0, oldprio < prio);
6089 4290
6090 check_class_changed(rq, p, prev_class, oldprio, running); 4291 check_class_changed(rq, p, prev_class, oldprio, running);
6091 } 4292 }
@@ -6129,7 +4330,7 @@ void set_user_nice(struct task_struct *p, long nice)
6129 delta = p->prio - old_prio; 4330 delta = p->prio - old_prio;
6130 4331
6131 if (on_rq) { 4332 if (on_rq) {
6132 enqueue_task(rq, p, 0); 4333 enqueue_task(rq, p, 0, false);
6133 /* 4334 /*
6134 * If the task increased its priority or is running and 4335 * If the task increased its priority or is running and
6135 * lowered its priority, then reschedule its CPU: 4336 * lowered its priority, then reschedule its CPU:
@@ -6152,7 +4353,7 @@ int can_nice(const struct task_struct *p, const int nice)
6152 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4353 /* convert nice value [19,-20] to rlimit style value [1,40] */
6153 int nice_rlim = 20 - nice; 4354 int nice_rlim = 20 - nice;
6154 4355
6155 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 4356 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
6156 capable(CAP_SYS_NICE)); 4357 capable(CAP_SYS_NICE));
6157} 4358}
6158 4359
@@ -6287,7 +4488,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6287{ 4488{
6288 int retval, oldprio, oldpolicy = -1, on_rq, running; 4489 int retval, oldprio, oldpolicy = -1, on_rq, running;
6289 unsigned long flags; 4490 unsigned long flags;
6290 const struct sched_class *prev_class = p->sched_class; 4491 const struct sched_class *prev_class;
6291 struct rq *rq; 4492 struct rq *rq;
6292 int reset_on_fork; 4493 int reset_on_fork;
6293 4494
@@ -6329,7 +4530,7 @@ recheck:
6329 4530
6330 if (!lock_task_sighand(p, &flags)) 4531 if (!lock_task_sighand(p, &flags))
6331 return -ESRCH; 4532 return -ESRCH;
6332 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4533 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
6333 unlock_task_sighand(p, &flags); 4534 unlock_task_sighand(p, &flags);
6334 4535
6335 /* can't set/change the rt policy */ 4536 /* can't set/change the rt policy */
@@ -6401,6 +4602,7 @@ recheck:
6401 p->sched_reset_on_fork = reset_on_fork; 4602 p->sched_reset_on_fork = reset_on_fork;
6402 4603
6403 oldprio = p->prio; 4604 oldprio = p->prio;
4605 prev_class = p->sched_class;
6404 __setscheduler(rq, p, policy, param->sched_priority); 4606 __setscheduler(rq, p, policy, param->sched_priority);
6405 4607
6406 if (running) 4608 if (running)
@@ -7151,27 +5353,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7151 struct rq *rq; 5353 struct rq *rq;
7152 int ret = 0; 5354 int ret = 0;
7153 5355
7154 /*
7155 * Since we rely on wake-ups to migrate sleeping tasks, don't change
7156 * the ->cpus_allowed mask from under waking tasks, which would be
7157 * possible when we change rq->lock in ttwu(), so synchronize against
7158 * TASK_WAKING to avoid that.
7159 *
7160 * Make an exception for freshly cloned tasks, since cpuset namespaces
7161 * might move the task about, we have to validate the target in
7162 * wake_up_new_task() anyway since the cpu might have gone away.
7163 */
7164again:
7165 while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
7166 cpu_relax();
7167
7168 rq = task_rq_lock(p, &flags); 5356 rq = task_rq_lock(p, &flags);
7169 5357
7170 if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
7171 task_rq_unlock(rq, &flags);
7172 goto again;
7173 }
7174
7175 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5358 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7176 ret = -EINVAL; 5359 ret = -EINVAL;
7177 goto out; 5360 goto out;
@@ -9223,11 +7406,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
9223 7406
9224#ifdef CONFIG_SCHED_MC 7407#ifdef CONFIG_SCHED_MC
9225static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7408static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7409 struct sysdev_class_attribute *attr,
9226 char *page) 7410 char *page)
9227{ 7411{
9228 return sprintf(page, "%u\n", sched_mc_power_savings); 7412 return sprintf(page, "%u\n", sched_mc_power_savings);
9229} 7413}
9230static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7414static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7415 struct sysdev_class_attribute *attr,
9231 const char *buf, size_t count) 7416 const char *buf, size_t count)
9232{ 7417{
9233 return sched_power_savings_store(buf, count, 0); 7418 return sched_power_savings_store(buf, count, 0);
@@ -9239,11 +7424,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
9239 7424
9240#ifdef CONFIG_SCHED_SMT 7425#ifdef CONFIG_SCHED_SMT
9241static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7426static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7427 struct sysdev_class_attribute *attr,
9242 char *page) 7428 char *page)
9243{ 7429{
9244 return sprintf(page, "%u\n", sched_smt_power_savings); 7430 return sprintf(page, "%u\n", sched_smt_power_savings);
9245} 7431}
9246static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7432static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7433 struct sysdev_class_attribute *attr,
9247 const char *buf, size_t count) 7434 const char *buf, size_t count)
9248{ 7435{
9249 return sched_power_savings_store(buf, count, 1); 7436 return sched_power_savings_store(buf, count, 1);
@@ -9458,7 +7645,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9458 tg->rt_rq[cpu] = rt_rq; 7645 tg->rt_rq[cpu] = rt_rq;
9459 init_rt_rq(rt_rq, rq); 7646 init_rt_rq(rt_rq, rq);
9460 rt_rq->tg = tg; 7647 rt_rq->tg = tg;
9461 rt_rq->rt_se = rt_se;
9462 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7648 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9463 if (add) 7649 if (add)
9464 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7650 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9489,9 +7675,6 @@ void __init sched_init(void)
9489#ifdef CONFIG_RT_GROUP_SCHED 7675#ifdef CONFIG_RT_GROUP_SCHED
9490 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7676 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9491#endif 7677#endif
9492#ifdef CONFIG_USER_SCHED
9493 alloc_size *= 2;
9494#endif
9495#ifdef CONFIG_CPUMASK_OFFSTACK 7678#ifdef CONFIG_CPUMASK_OFFSTACK
9496 alloc_size += num_possible_cpus() * cpumask_size(); 7679 alloc_size += num_possible_cpus() * cpumask_size();
9497#endif 7680#endif
@@ -9505,13 +7688,6 @@ void __init sched_init(void)
9505 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7688 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9506 ptr += nr_cpu_ids * sizeof(void **); 7689 ptr += nr_cpu_ids * sizeof(void **);
9507 7690
9508#ifdef CONFIG_USER_SCHED
9509 root_task_group.se = (struct sched_entity **)ptr;
9510 ptr += nr_cpu_ids * sizeof(void **);
9511
9512 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9513 ptr += nr_cpu_ids * sizeof(void **);
9514#endif /* CONFIG_USER_SCHED */
9515#endif /* CONFIG_FAIR_GROUP_SCHED */ 7691#endif /* CONFIG_FAIR_GROUP_SCHED */
9516#ifdef CONFIG_RT_GROUP_SCHED 7692#ifdef CONFIG_RT_GROUP_SCHED
9517 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7693 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9520,13 +7696,6 @@ void __init sched_init(void)
9520 init_task_group.rt_rq = (struct rt_rq **)ptr; 7696 init_task_group.rt_rq = (struct rt_rq **)ptr;
9521 ptr += nr_cpu_ids * sizeof(void **); 7697 ptr += nr_cpu_ids * sizeof(void **);
9522 7698
9523#ifdef CONFIG_USER_SCHED
9524 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9525 ptr += nr_cpu_ids * sizeof(void **);
9526
9527 root_task_group.rt_rq = (struct rt_rq **)ptr;
9528 ptr += nr_cpu_ids * sizeof(void **);
9529#endif /* CONFIG_USER_SCHED */
9530#endif /* CONFIG_RT_GROUP_SCHED */ 7699#endif /* CONFIG_RT_GROUP_SCHED */
9531#ifdef CONFIG_CPUMASK_OFFSTACK 7700#ifdef CONFIG_CPUMASK_OFFSTACK
9532 for_each_possible_cpu(i) { 7701 for_each_possible_cpu(i) {
@@ -9546,22 +7715,13 @@ void __init sched_init(void)
9546#ifdef CONFIG_RT_GROUP_SCHED 7715#ifdef CONFIG_RT_GROUP_SCHED
9547 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7716 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9548 global_rt_period(), global_rt_runtime()); 7717 global_rt_period(), global_rt_runtime());
9549#ifdef CONFIG_USER_SCHED
9550 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9551 global_rt_period(), RUNTIME_INF);
9552#endif /* CONFIG_USER_SCHED */
9553#endif /* CONFIG_RT_GROUP_SCHED */ 7718#endif /* CONFIG_RT_GROUP_SCHED */
9554 7719
9555#ifdef CONFIG_GROUP_SCHED 7720#ifdef CONFIG_CGROUP_SCHED
9556 list_add(&init_task_group.list, &task_groups); 7721 list_add(&init_task_group.list, &task_groups);
9557 INIT_LIST_HEAD(&init_task_group.children); 7722 INIT_LIST_HEAD(&init_task_group.children);
9558 7723
9559#ifdef CONFIG_USER_SCHED 7724#endif /* CONFIG_CGROUP_SCHED */
9560 INIT_LIST_HEAD(&root_task_group.children);
9561 init_task_group.parent = &root_task_group;
9562 list_add(&init_task_group.siblings, &root_task_group.children);
9563#endif /* CONFIG_USER_SCHED */
9564#endif /* CONFIG_GROUP_SCHED */
9565 7725
9566#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7726#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9567 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7727 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9601,25 +7761,6 @@ void __init sched_init(void)
9601 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7761 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9602 */ 7762 */
9603 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7763 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9604#elif defined CONFIG_USER_SCHED
9605 root_task_group.shares = NICE_0_LOAD;
9606 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9607 /*
9608 * In case of task-groups formed thr' the user id of tasks,
9609 * init_task_group represents tasks belonging to root user.
9610 * Hence it forms a sibling of all subsequent groups formed.
9611 * In this case, init_task_group gets only a fraction of overall
9612 * system cpu resource, based on the weight assigned to root
9613 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9614 * by letting tasks of init_task_group sit in a separate cfs_rq
9615 * (init_tg_cfs_rq) and having one entity represent this group of
9616 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9617 */
9618 init_tg_cfs_entry(&init_task_group,
9619 &per_cpu(init_tg_cfs_rq, i),
9620 &per_cpu(init_sched_entity, i), i, 1,
9621 root_task_group.se[i]);
9622
9623#endif 7764#endif
9624#endif /* CONFIG_FAIR_GROUP_SCHED */ 7765#endif /* CONFIG_FAIR_GROUP_SCHED */
9625 7766
@@ -9628,12 +7769,6 @@ void __init sched_init(void)
9628 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7769 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9629#ifdef CONFIG_CGROUP_SCHED 7770#ifdef CONFIG_CGROUP_SCHED
9630 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7771 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9631#elif defined CONFIG_USER_SCHED
9632 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9633 init_tg_rt_entry(&init_task_group,
9634 &per_cpu(init_rt_rq_var, i),
9635 &per_cpu(init_sched_rt_entity, i), i, 1,
9636 root_task_group.rt_se[i]);
9637#endif 7772#endif
9638#endif 7773#endif
9639 7774
@@ -9718,7 +7853,7 @@ static inline int preempt_count_equals(int preempt_offset)
9718 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7853 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9719} 7854}
9720 7855
9721void __might_sleep(char *file, int line, int preempt_offset) 7856void __might_sleep(const char *file, int line, int preempt_offset)
9722{ 7857{
9723#ifdef in_atomic 7858#ifdef in_atomic
9724 static unsigned long prev_jiffy; /* ratelimiting */ 7859 static unsigned long prev_jiffy; /* ratelimiting */
@@ -10029,7 +8164,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
10029} 8164}
10030#endif /* CONFIG_RT_GROUP_SCHED */ 8165#endif /* CONFIG_RT_GROUP_SCHED */
10031 8166
10032#ifdef CONFIG_GROUP_SCHED 8167#ifdef CONFIG_CGROUP_SCHED
10033static void free_sched_group(struct task_group *tg) 8168static void free_sched_group(struct task_group *tg)
10034{ 8169{
10035 free_fair_sched_group(tg); 8170 free_fair_sched_group(tg);
@@ -10134,11 +8269,11 @@ void sched_move_task(struct task_struct *tsk)
10134 if (unlikely(running)) 8269 if (unlikely(running))
10135 tsk->sched_class->set_curr_task(rq); 8270 tsk->sched_class->set_curr_task(rq);
10136 if (on_rq) 8271 if (on_rq)
10137 enqueue_task(rq, tsk, 0); 8272 enqueue_task(rq, tsk, 0, false);
10138 8273
10139 task_rq_unlock(rq, &flags); 8274 task_rq_unlock(rq, &flags);
10140} 8275}
10141#endif /* CONFIG_GROUP_SCHED */ 8276#endif /* CONFIG_CGROUP_SCHED */
10142 8277
10143#ifdef CONFIG_FAIR_GROUP_SCHED 8278#ifdef CONFIG_FAIR_GROUP_SCHED
10144static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8279static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10280,13 +8415,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10280 runtime = d->rt_runtime; 8415 runtime = d->rt_runtime;
10281 } 8416 }
10282 8417
10283#ifdef CONFIG_USER_SCHED
10284 if (tg == &root_task_group) {
10285 period = global_rt_period();
10286 runtime = global_rt_runtime();
10287 }
10288#endif
10289
10290 /* 8418 /*
10291 * Cannot have more runtime than the period. 8419 * Cannot have more runtime than the period.
10292 */ 8420 */
@@ -10689,7 +8817,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
10689struct cpuacct { 8817struct cpuacct {
10690 struct cgroup_subsys_state css; 8818 struct cgroup_subsys_state css;
10691 /* cpuusage holds pointer to a u64-type object on every cpu */ 8819 /* cpuusage holds pointer to a u64-type object on every cpu */
10692 u64 *cpuusage; 8820 u64 __percpu *cpuusage;
10693 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8821 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
10694 struct cpuacct *parent; 8822 struct cpuacct *parent;
10695}; 8823};
@@ -10906,12 +9034,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10906} 9034}
10907 9035
10908/* 9036/*
9037 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9038 * in cputime_t units. As a result, cpuacct_update_stats calls
9039 * percpu_counter_add with values large enough to always overflow the
9040 * per cpu batch limit causing bad SMP scalability.
9041 *
9042 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9043 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9044 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9045 */
9046#ifdef CONFIG_SMP
9047#define CPUACCT_BATCH \
9048 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9049#else
9050#define CPUACCT_BATCH 0
9051#endif
9052
9053/*
10909 * Charge the system/user time to the task's accounting group. 9054 * Charge the system/user time to the task's accounting group.
10910 */ 9055 */
10911static void cpuacct_update_stats(struct task_struct *tsk, 9056static void cpuacct_update_stats(struct task_struct *tsk,
10912 enum cpuacct_stat_index idx, cputime_t val) 9057 enum cpuacct_stat_index idx, cputime_t val)
10913{ 9058{
10914 struct cpuacct *ca; 9059 struct cpuacct *ca;
9060 int batch = CPUACCT_BATCH;
10915 9061
10916 if (unlikely(!cpuacct_subsys.active)) 9062 if (unlikely(!cpuacct_subsys.active))
10917 return; 9063 return;
@@ -10920,7 +9066,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10920 ca = task_ca(tsk); 9066 ca = task_ca(tsk);
10921 9067
10922 do { 9068 do {
10923 percpu_counter_add(&ca->cpustat[idx], val); 9069 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10924 ca = ca->parent; 9070 ca = ca->parent;
10925 } while (ca); 9071 } while (ca);
10926 rcu_read_unlock(); 9072 rcu_read_unlock();
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 597b33099dfa..82095bf2099f 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,7 @@ static int convert_prio(int prio)
47} 47}
48 48
49#define for_each_cpupri_active(array, idx) \ 49#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ 50 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53 51
54/** 52/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system 53 * cpupri_find - find the best (lowest-pri) CPU in the system
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8fe7ee81c552..3e1fd96c6cf9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1053,7 +1053,8 @@ static inline void hrtick_update(struct rq *rq)
1053 * increased. Here we update the fair scheduling stats and 1053 * increased. Here we update the fair scheduling stats and
1054 * then put the task into the rbtree: 1054 * then put the task into the rbtree:
1055 */ 1055 */
1056static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 1056static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1057{ 1058{
1058 struct cfs_rq *cfs_rq; 1059 struct cfs_rq *cfs_rq;
1059 struct sched_entity *se = &p->se; 1060 struct sched_entity *se = &p->se;
@@ -1815,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1815 */ 1816 */
1816 1817
1817/* 1818/*
1818 * Load-balancing iterator. Note: while the runqueue stays locked 1819 * pull_task - move a task from a remote runqueue to the local runqueue.
1819 * during the whole iteration, the current task might be 1820 * Both runqueues must be locked.
1820 * dequeued so the iterator has to be dequeue-safe. Here we
1821 * achieve that by always pre-iterating before returning
1822 * the current task:
1823 */ 1821 */
1824static struct task_struct * 1822static void pull_task(struct rq *src_rq, struct task_struct *p,
1825__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) 1823 struct rq *this_rq, int this_cpu)
1826{ 1824{
1827 struct task_struct *p = NULL; 1825 deactivate_task(src_rq, p, 0);
1828 struct sched_entity *se; 1826 set_task_cpu(p, this_cpu);
1827 activate_task(this_rq, p, 0);
1828 check_preempt_curr(this_rq, p, 0);
1829}
1829 1830
1830 if (next == &cfs_rq->tasks) 1831/*
1831 return NULL; 1832 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1833 */
1834static
1835int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1836 struct sched_domain *sd, enum cpu_idle_type idle,
1837 int *all_pinned)
1838{
1839 int tsk_cache_hot = 0;
1840 /*
1841 * We do not migrate tasks that are:
1842 * 1) running (obviously), or
1843 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1844 * 3) are cache-hot on their current CPU.
1845 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine);
1848 return 0;
1849 }
1850 *all_pinned = 0;
1832 1851
1833 se = list_entry(next, struct sched_entity, group_node); 1852 if (task_running(rq, p)) {
1834 p = task_of(se); 1853 schedstat_inc(p, se.nr_failed_migrations_running);
1835 cfs_rq->balance_iterator = next->next; 1854 return 0;
1855 }
1836 1856
1837 return p; 1857 /*
1838} 1858 * Aggressive migration if:
1859 * 1) task is cache cold, or
1860 * 2) too many balance attempts have failed.
1861 */
1839 1862
1840static struct task_struct *load_balance_start_fair(void *arg) 1863 tsk_cache_hot = task_hot(p, rq->clock, sd);
1841{ 1864 if (!tsk_cache_hot ||
1842 struct cfs_rq *cfs_rq = arg; 1865 sd->nr_balance_failed > sd->cache_nice_tries) {
1866#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations);
1870 }
1871#endif
1872 return 1;
1873 }
1843 1874
1844 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); 1875 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot);
1877 return 0;
1878 }
1879 return 1;
1845} 1880}
1846 1881
1847static struct task_struct *load_balance_next_fair(void *arg) 1882/*
1883 * move_one_task tries to move exactly one task from busiest to this_rq, as
1884 * part of active balancing operations within "domain".
1885 * Returns 1 if successful and 0 otherwise.
1886 *
1887 * Called with both runqueues locked.
1888 */
1889static int
1890move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1891 struct sched_domain *sd, enum cpu_idle_type idle)
1848{ 1892{
1849 struct cfs_rq *cfs_rq = arg; 1893 struct task_struct *p, *n;
1894 struct cfs_rq *cfs_rq;
1895 int pinned = 0;
1896
1897 for_each_leaf_cfs_rq(busiest, cfs_rq) {
1898 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
1899
1900 if (!can_migrate_task(p, busiest, this_cpu,
1901 sd, idle, &pinned))
1902 continue;
1850 1903
1851 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1904 pull_task(busiest, p, this_rq, this_cpu);
1905 /*
1906 * Right now, this is only the second place pull_task()
1907 * is called, so we can safely collect pull_task()
1908 * stats here rather than inside pull_task().
1909 */
1910 schedstat_inc(sd, lb_gained[idle]);
1911 return 1;
1912 }
1913 }
1914
1915 return 0;
1852} 1916}
1853 1917
1854static unsigned long 1918static unsigned long
1855__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1919balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1856 unsigned long max_load_move, struct sched_domain *sd, 1920 unsigned long max_load_move, struct sched_domain *sd,
1857 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, 1921 enum cpu_idle_type idle, int *all_pinned,
1858 struct cfs_rq *cfs_rq) 1922 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
1859{ 1923{
1860 struct rq_iterator cfs_rq_iterator; 1924 int loops = 0, pulled = 0, pinned = 0;
1925 long rem_load_move = max_load_move;
1926 struct task_struct *p, *n;
1861 1927
1862 cfs_rq_iterator.start = load_balance_start_fair; 1928 if (max_load_move == 0)
1863 cfs_rq_iterator.next = load_balance_next_fair; 1929 goto out;
1864 cfs_rq_iterator.arg = cfs_rq;
1865 1930
1866 return balance_tasks(this_rq, this_cpu, busiest, 1931 pinned = 1;
1867 max_load_move, sd, idle, all_pinned, 1932
1868 this_best_prio, &cfs_rq_iterator); 1933 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1934 if (loops++ > sysctl_sched_nr_migrate)
1935 break;
1936
1937 if ((p->se.load.weight >> 1) > rem_load_move ||
1938 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
1939 continue;
1940
1941 pull_task(busiest, p, this_rq, this_cpu);
1942 pulled++;
1943 rem_load_move -= p->se.load.weight;
1944
1945#ifdef CONFIG_PREEMPT
1946 /*
1947 * NEWIDLE balancing is a source of latency, so preemptible
1948 * kernels will stop after the first task is pulled to minimize
1949 * the critical section.
1950 */
1951 if (idle == CPU_NEWLY_IDLE)
1952 break;
1953#endif
1954
1955 /*
1956 * We only want to steal up to the prescribed amount of
1957 * weighted load.
1958 */
1959 if (rem_load_move <= 0)
1960 break;
1961
1962 if (p->prio < *this_best_prio)
1963 *this_best_prio = p->prio;
1964 }
1965out:
1966 /*
1967 * Right now, this is one of only two places pull_task() is called,
1968 * so we can safely collect pull_task() stats here rather than
1969 * inside pull_task().
1970 */
1971 schedstat_add(sd, lb_gained[idle], pulled);
1972
1973 if (all_pinned)
1974 *all_pinned = pinned;
1975
1976 return max_load_move - rem_load_move;
1869} 1977}
1870 1978
1871#ifdef CONFIG_FAIR_GROUP_SCHED 1979#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1897,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1897 rem_load = (u64)rem_load_move * busiest_weight; 2005 rem_load = (u64)rem_load_move * busiest_weight;
1898 rem_load = div_u64(rem_load, busiest_h_load + 1); 2006 rem_load = div_u64(rem_load, busiest_h_load + 1);
1899 2007
1900 moved_load = __load_balance_fair(this_rq, this_cpu, busiest, 2008 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1901 rem_load, sd, idle, all_pinned, this_best_prio, 2009 rem_load, sd, idle, all_pinned, this_best_prio,
1902 tg->cfs_rq[busiest_cpu]); 2010 busiest_cfs_rq);
1903 2011
1904 if (!moved_load) 2012 if (!moved_load)
1905 continue; 2013 continue;
@@ -1922,35 +2030,1509 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1922 struct sched_domain *sd, enum cpu_idle_type idle, 2030 struct sched_domain *sd, enum cpu_idle_type idle,
1923 int *all_pinned, int *this_best_prio) 2031 int *all_pinned, int *this_best_prio)
1924{ 2032{
1925 return __load_balance_fair(this_rq, this_cpu, busiest, 2033 return balance_tasks(this_rq, this_cpu, busiest,
1926 max_load_move, sd, idle, all_pinned, 2034 max_load_move, sd, idle, all_pinned,
1927 this_best_prio, &busiest->cfs); 2035 this_best_prio, &busiest->cfs);
1928} 2036}
1929#endif 2037#endif
1930 2038
1931static int 2039/*
1932move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2040 * move_tasks tries to move up to max_load_move weighted load from busiest to
1933 struct sched_domain *sd, enum cpu_idle_type idle) 2041 * this_rq, as part of a balancing operation within domain "sd".
2042 * Returns 1 if successful and 0 otherwise.
2043 *
2044 * Called with both runqueues locked.
2045 */
2046static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2047 unsigned long max_load_move,
2048 struct sched_domain *sd, enum cpu_idle_type idle,
2049 int *all_pinned)
1934{ 2050{
1935 struct cfs_rq *busy_cfs_rq; 2051 unsigned long total_load_moved = 0, load_moved;
1936 struct rq_iterator cfs_rq_iterator; 2052 int this_best_prio = this_rq->curr->prio;
1937 2053
1938 cfs_rq_iterator.start = load_balance_start_fair; 2054 do {
1939 cfs_rq_iterator.next = load_balance_next_fair; 2055 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2056 max_load_move - total_load_moved,
2057 sd, idle, all_pinned, &this_best_prio);
1940 2058
1941 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 2059 total_load_moved += load_moved;
2060
2061#ifdef CONFIG_PREEMPT
1942 /* 2062 /*
1943 * pass busy_cfs_rq argument into 2063 * NEWIDLE balancing is a source of latency, so preemptible
1944 * load_balance_[start|next]_fair iterators 2064 * kernels will stop after the first task is pulled to minimize
2065 * the critical section.
1945 */ 2066 */
1946 cfs_rq_iterator.arg = busy_cfs_rq; 2067 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
1947 if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 2068 break;
1948 &cfs_rq_iterator)) 2069
1949 return 1; 2070 if (raw_spin_is_contended(&this_rq->lock) ||
2071 raw_spin_is_contended(&busiest->lock))
2072 break;
2073#endif
2074 } while (load_moved && max_load_move > total_load_moved);
2075
2076 return total_load_moved > 0;
2077}
2078
2079/********** Helpers for find_busiest_group ************************/
2080/*
2081 * sd_lb_stats - Structure to store the statistics of a sched_domain
2082 * during load balancing.
2083 */
2084struct sd_lb_stats {
2085 struct sched_group *busiest; /* Busiest group in this sd */
2086 struct sched_group *this; /* Local group in this sd */
2087 unsigned long total_load; /* Total load of all groups in sd */
2088 unsigned long total_pwr; /* Total power of all groups in sd */
2089 unsigned long avg_load; /* Average load across all groups in sd */
2090
2091 /** Statistics of this group */
2092 unsigned long this_load;
2093 unsigned long this_load_per_task;
2094 unsigned long this_nr_running;
2095
2096 /* Statistics of the busiest group */
2097 unsigned long max_load;
2098 unsigned long busiest_load_per_task;
2099 unsigned long busiest_nr_running;
2100 unsigned long busiest_group_capacity;
2101
2102 int group_imb; /* Is there imbalance in this sd */
2103#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2104 int power_savings_balance; /* Is powersave balance needed for this sd */
2105 struct sched_group *group_min; /* Least loaded group in sd */
2106 struct sched_group *group_leader; /* Group which relieves group_min */
2107 unsigned long min_load_per_task; /* load_per_task in group_min */
2108 unsigned long leader_nr_running; /* Nr running of group_leader */
2109 unsigned long min_nr_running; /* Nr running of group_min */
2110#endif
2111};
2112
2113/*
2114 * sg_lb_stats - stats of a sched_group required for load_balancing
2115 */
2116struct sg_lb_stats {
2117 unsigned long avg_load; /*Avg load across the CPUs of the group */
2118 unsigned long group_load; /* Total load over the CPUs of the group */
2119 unsigned long sum_nr_running; /* Nr tasks running in the group */
2120 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2121 unsigned long group_capacity;
2122 int group_imb; /* Is there an imbalance in the group ? */
2123};
2124
2125/**
2126 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
2127 * @group: The group whose first cpu is to be returned.
2128 */
2129static inline unsigned int group_first_cpu(struct sched_group *group)
2130{
2131 return cpumask_first(sched_group_cpus(group));
2132}
2133
2134/**
2135 * get_sd_load_idx - Obtain the load index for a given sched domain.
2136 * @sd: The sched_domain whose load_idx is to be obtained.
2137 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
2138 */
2139static inline int get_sd_load_idx(struct sched_domain *sd,
2140 enum cpu_idle_type idle)
2141{
2142 int load_idx;
2143
2144 switch (idle) {
2145 case CPU_NOT_IDLE:
2146 load_idx = sd->busy_idx;
2147 break;
2148
2149 case CPU_NEWLY_IDLE:
2150 load_idx = sd->newidle_idx;
2151 break;
2152 default:
2153 load_idx = sd->idle_idx;
2154 break;
1950 } 2155 }
1951 2156
2157 return load_idx;
2158}
2159
2160
2161#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2162/**
2163 * init_sd_power_savings_stats - Initialize power savings statistics for
2164 * the given sched_domain, during load balancing.
2165 *
2166 * @sd: Sched domain whose power-savings statistics are to be initialized.
2167 * @sds: Variable containing the statistics for sd.
2168 * @idle: Idle status of the CPU at which we're performing load-balancing.
2169 */
2170static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2171 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2172{
2173 /*
2174 * Busy processors will not participate in power savings
2175 * balance.
2176 */
2177 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2178 sds->power_savings_balance = 0;
2179 else {
2180 sds->power_savings_balance = 1;
2181 sds->min_nr_running = ULONG_MAX;
2182 sds->leader_nr_running = 0;
2183 }
2184}
2185
2186/**
2187 * update_sd_power_savings_stats - Update the power saving stats for a
2188 * sched_domain while performing load balancing.
2189 *
2190 * @group: sched_group belonging to the sched_domain under consideration.
2191 * @sds: Variable containing the statistics of the sched_domain
2192 * @local_group: Does group contain the CPU for which we're performing
2193 * load balancing ?
2194 * @sgs: Variable containing the statistics of the group.
2195 */
2196static inline void update_sd_power_savings_stats(struct sched_group *group,
2197 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2198{
2199
2200 if (!sds->power_savings_balance)
2201 return;
2202
2203 /*
2204 * If the local group is idle or completely loaded
2205 * no need to do power savings balance at this domain
2206 */
2207 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
2208 !sds->this_nr_running))
2209 sds->power_savings_balance = 0;
2210
2211 /*
2212 * If a group is already running at full capacity or idle,
2213 * don't include that group in power savings calculations
2214 */
2215 if (!sds->power_savings_balance ||
2216 sgs->sum_nr_running >= sgs->group_capacity ||
2217 !sgs->sum_nr_running)
2218 return;
2219
2220 /*
2221 * Calculate the group which has the least non-idle load.
2222 * This is the group from where we need to pick up the load
2223 * for saving power
2224 */
2225 if ((sgs->sum_nr_running < sds->min_nr_running) ||
2226 (sgs->sum_nr_running == sds->min_nr_running &&
2227 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
2228 sds->group_min = group;
2229 sds->min_nr_running = sgs->sum_nr_running;
2230 sds->min_load_per_task = sgs->sum_weighted_load /
2231 sgs->sum_nr_running;
2232 }
2233
2234 /*
2235 * Calculate the group which is almost near its
2236 * capacity but still has some space to pick up some load
2237 * from other group and save more power
2238 */
2239 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
2240 return;
2241
2242 if (sgs->sum_nr_running > sds->leader_nr_running ||
2243 (sgs->sum_nr_running == sds->leader_nr_running &&
2244 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
2245 sds->group_leader = group;
2246 sds->leader_nr_running = sgs->sum_nr_running;
2247 }
2248}
2249
2250/**
2251 * check_power_save_busiest_group - see if there is potential for some power-savings balance
2252 * @sds: Variable containing the statistics of the sched_domain
2253 * under consideration.
2254 * @this_cpu: Cpu at which we're currently performing load-balancing.
2255 * @imbalance: Variable to store the imbalance.
2256 *
2257 * Description:
2258 * Check if we have potential to perform some power-savings balance.
2259 * If yes, set the busiest group to be the least loaded group in the
2260 * sched_domain, so that it's CPUs can be put to idle.
2261 *
2262 * Returns 1 if there is potential to perform power-savings balance.
2263 * Else returns 0.
2264 */
2265static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2266 int this_cpu, unsigned long *imbalance)
2267{
2268 if (!sds->power_savings_balance)
2269 return 0;
2270
2271 if (sds->this != sds->group_leader ||
2272 sds->group_leader == sds->group_min)
2273 return 0;
2274
2275 *imbalance = sds->min_load_per_task;
2276 sds->busiest = sds->group_min;
2277
2278 return 1;
2279
2280}
2281#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2282static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2283 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2284{
2285 return;
2286}
2287
/* Stub used when neither CONFIG_SCHED_MC nor CONFIG_SCHED_SMT is set. */
static inline void update_sd_power_savings_stats(struct sched_group *group,
	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
}
2293
/*
 * Stub used when neither CONFIG_SCHED_MC nor CONFIG_SCHED_SMT is set:
 * always reports that no power-savings balance is possible.
 */
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
	int this_cpu, unsigned long *imbalance)
{
	return 0;
}
2299#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2300
2301
2302unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2303{
2304 return SCHED_LOAD_SCALE;
2305}
2306
2307unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2308{
2309 return default_scale_freq_power(sd, cpu);
2310}
2311
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2315 unsigned long smt_gain = sd->smt_gain;
2316
2317 smt_gain /= weight;
2318
2319 return smt_gain;
2320}
2321
2322unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
2323{
2324 return default_scale_smt_power(sd, cpu);
2325}
2326
2327unsigned long scale_rt_power(int cpu)
2328{
2329 struct rq *rq = cpu_rq(cpu);
2330 u64 total, available;
2331
2332 sched_avg_update(rq);
2333
2334 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2335 available = total - rq->rt_avg;
2336
2337 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2338 total = SCHED_LOAD_SCALE;
2339
2340 total >>= SCHED_LOAD_SHIFT;
2341
2342 return div_u64(available, total);
2343}
2344
2345static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2348 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups;
2350
2351 if (sched_feat(ARCH_POWER))
2352 power *= arch_scale_freq_power(sd, cpu);
2353 else
2354 power *= default_scale_freq_power(sd, cpu);
2355
2356 power >>= SCHED_LOAD_SHIFT;
2357
2358 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2359 if (sched_feat(ARCH_POWER))
2360 power *= arch_scale_smt_power(sd, cpu);
2361 else
2362 power *= default_scale_smt_power(sd, cpu);
2363
2364 power >>= SCHED_LOAD_SHIFT;
2365 }
2366
2367 power *= scale_rt_power(cpu);
2368 power >>= SCHED_LOAD_SHIFT;
2369
2370 if (!power)
2371 power = 1;
2372
2373 sdg->cpu_power = power;
2374}
2375
2376static void update_group_power(struct sched_domain *sd, int cpu)
2377{
2378 struct sched_domain *child = sd->child;
2379 struct sched_group *group, *sdg = sd->groups;
2380 unsigned long power;
2381
2382 if (!child) {
2383 update_cpu_power(sd, cpu);
2384 return;
2385 }
2386
2387 power = 0;
2388
2389 group = child->groups;
2390 do {
2391 power += group->cpu_power;
2392 group = group->next;
2393 } while (group != child->groups);
2394
2395 sdg->cpu_power = power;
2396}
2397
2398/**
2399 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2400 * @sd: The sched_domain whose statistics are to be updated.
2401 * @group: sched_group whose statistics are to be updated.
2402 * @this_cpu: Cpu for which load balance is currently performed.
2403 * @idle: Idle status of this_cpu
2404 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2405 * @sd_idle: Idle status of the sched_domain containing group.
2406 * @local_group: Does group contain this_cpu.
2407 * @cpus: Set of cpus considered for load balancing.
2408 * @balance: Should we balance.
2409 * @sgs: variable to hold the statistics for this group.
2410 */
2411static inline void update_sg_lb_stats(struct sched_domain *sd,
2412 struct sched_group *group, int this_cpu,
2413 enum cpu_idle_type idle, int load_idx, int *sd_idle,
2414 int local_group, const struct cpumask *cpus,
2415 int *balance, struct sg_lb_stats *sgs)
2416{
2417 unsigned long load, max_cpu_load, min_cpu_load;
2418 int i;
2419 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2420 unsigned long avg_load_per_task = 0;
2421
2422 if (local_group)
2423 balance_cpu = group_first_cpu(group);
2424
2425 /* Tally up the load of all CPUs in the group */
2426 max_cpu_load = 0;
2427 min_cpu_load = ~0UL;
2428
2429 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2430 struct rq *rq = cpu_rq(i);
2431
2432 if (*sd_idle && rq->nr_running)
2433 *sd_idle = 0;
2434
2435 /* Bias balancing toward cpus of our domain */
2436 if (local_group) {
2437 if (idle_cpu(i) && !first_idle_cpu) {
2438 first_idle_cpu = 1;
2439 balance_cpu = i;
2440 }
2441
2442 load = target_load(i, load_idx);
2443 } else {
2444 load = source_load(i, load_idx);
2445 if (load > max_cpu_load)
2446 max_cpu_load = load;
2447 if (min_cpu_load > load)
2448 min_cpu_load = load;
2449 }
2450
2451 sgs->group_load += load;
2452 sgs->sum_nr_running += rq->nr_running;
2453 sgs->sum_weighted_load += weighted_cpuload(i);
2454
2455 }
2456
2457 /*
2458 * First idle cpu or the first cpu(busiest) in this sched group
2459 * is eligible for doing load balancing at this and above
2460 * domains. In the newly idle case, we will allow all the cpu's
2461 * to do the newly idle load balance.
2462 */
2463 if (idle != CPU_NEWLY_IDLE && local_group &&
2464 balance_cpu != this_cpu) {
2465 *balance = 0;
2466 return;
2467 }
2468
2469 update_group_power(sd, this_cpu);
2470
2471 /* Adjust by relative CPU power of the group */
2472 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2473
2474 /*
2475 * Consider the group unbalanced when the imbalance is larger
2476 * than the average weight of two tasks.
2477 *
2478 * APZ: with cgroup the avg task weight can vary wildly and
2479 * might not be a suitable number - should we keep a
2480 * normalized nr_running number somewhere that negates
2481 * the hierarchy?
2482 */
2483 if (sgs->sum_nr_running)
2484 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2485
2486 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
2487 sgs->group_imb = 1;
2488
2489 sgs->group_capacity =
2490 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2491}
2492
2493/**
2494 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
2495 * @sd: sched_domain whose statistics are to be updated.
2496 * @this_cpu: Cpu for which load balance is currently performed.
2497 * @idle: Idle status of this_cpu
2498 * @sd_idle: Idle status of the sched_domain containing group.
2499 * @cpus: Set of cpus considered for load balancing.
2500 * @balance: Should we balance.
2501 * @sds: variable to hold the statistics for this sched_domain.
2502 */
2503static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2504 enum cpu_idle_type idle, int *sd_idle,
2505 const struct cpumask *cpus, int *balance,
2506 struct sd_lb_stats *sds)
2507{
2508 struct sched_domain *child = sd->child;
2509 struct sched_group *group = sd->groups;
2510 struct sg_lb_stats sgs;
2511 int load_idx, prefer_sibling = 0;
2512
2513 if (child && child->flags & SD_PREFER_SIBLING)
2514 prefer_sibling = 1;
2515
2516 init_sd_power_savings_stats(sd, sds, idle);
2517 load_idx = get_sd_load_idx(sd, idle);
2518
2519 do {
2520 int local_group;
2521
2522 local_group = cpumask_test_cpu(this_cpu,
2523 sched_group_cpus(group));
2524 memset(&sgs, 0, sizeof(sgs));
2525 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
2526 local_group, cpus, balance, &sgs);
2527
2528 if (local_group && !(*balance))
2529 return;
2530
2531 sds->total_load += sgs.group_load;
2532 sds->total_pwr += group->cpu_power;
2533
2534 /*
2535 * In case the child domain prefers tasks go to siblings
2536 * first, lower the group capacity to one so that we'll try
2537 * and move all the excess tasks away.
2538 */
2539 if (prefer_sibling)
2540 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2541
2542 if (local_group) {
2543 sds->this_load = sgs.avg_load;
2544 sds->this = group;
2545 sds->this_nr_running = sgs.sum_nr_running;
2546 sds->this_load_per_task = sgs.sum_weighted_load;
2547 } else if (sgs.avg_load > sds->max_load &&
2548 (sgs.sum_nr_running > sgs.group_capacity ||
2549 sgs.group_imb)) {
2550 sds->max_load = sgs.avg_load;
2551 sds->busiest = group;
2552 sds->busiest_nr_running = sgs.sum_nr_running;
2553 sds->busiest_group_capacity = sgs.group_capacity;
2554 sds->busiest_load_per_task = sgs.sum_weighted_load;
2555 sds->group_imb = sgs.group_imb;
2556 }
2557
2558 update_sd_power_savings_stats(group, sds, local_group, &sgs);
2559 group = group->next;
2560 } while (group != sd->groups);
2561}
2562
2563/**
2564 * fix_small_imbalance - Calculate the minor imbalance that exists
2565 * amongst the groups of a sched_domain, during
2566 * load balancing.
2567 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
2568 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2569 * @imbalance: Variable to store the imbalance.
2570 */
2571static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2572 int this_cpu, unsigned long *imbalance)
2573{
2574 unsigned long tmp, pwr_now = 0, pwr_move = 0;
2575 unsigned int imbn = 2;
2576 unsigned long scaled_busy_load_per_task;
2577
2578 if (sds->this_nr_running) {
2579 sds->this_load_per_task /= sds->this_nr_running;
2580 if (sds->busiest_load_per_task >
2581 sds->this_load_per_task)
2582 imbn = 1;
2583 } else
2584 sds->this_load_per_task =
2585 cpu_avg_load_per_task(this_cpu);
2586
2587 scaled_busy_load_per_task = sds->busiest_load_per_task
2588 * SCHED_LOAD_SCALE;
2589 scaled_busy_load_per_task /= sds->busiest->cpu_power;
2590
2591 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2592 (scaled_busy_load_per_task * imbn)) {
2593 *imbalance = sds->busiest_load_per_task;
2594 return;
2595 }
2596
2597 /*
2598 * OK, we don't have enough imbalance to justify moving tasks,
2599 * however we may be able to increase total CPU power used by
2600 * moving them.
2601 */
2602
2603 pwr_now += sds->busiest->cpu_power *
2604 min(sds->busiest_load_per_task, sds->max_load);
2605 pwr_now += sds->this->cpu_power *
2606 min(sds->this_load_per_task, sds->this_load);
2607 pwr_now /= SCHED_LOAD_SCALE;
2608
2609 /* Amount of load we'd subtract */
2610 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2611 sds->busiest->cpu_power;
2612 if (sds->max_load > tmp)
2613 pwr_move += sds->busiest->cpu_power *
2614 min(sds->busiest_load_per_task, sds->max_load - tmp);
2615
2616 /* Amount of load we'd add */
2617 if (sds->max_load * sds->busiest->cpu_power <
2618 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
2619 tmp = (sds->max_load * sds->busiest->cpu_power) /
2620 sds->this->cpu_power;
2621 else
2622 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2623 sds->this->cpu_power;
2624 pwr_move += sds->this->cpu_power *
2625 min(sds->this_load_per_task, sds->this_load + tmp);
2626 pwr_move /= SCHED_LOAD_SCALE;
2627
2628 /* Move if we gain throughput */
2629 if (pwr_move > pwr_now)
2630 *imbalance = sds->busiest_load_per_task;
2631}
2632
2633/**
2634 * calculate_imbalance - Calculate the amount of imbalance present within the
2635 * groups of a given sched_domain during load balance.
2636 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
2637 * @this_cpu: Cpu for which currently load balance is being performed.
2638 * @imbalance: The variable to store the imbalance.
2639 */
2640static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2641 unsigned long *imbalance)
2642{
2643 unsigned long max_pull, load_above_capacity = ~0UL;
2644
2645 sds->busiest_load_per_task /= sds->busiest_nr_running;
2646 if (sds->group_imb) {
2647 sds->busiest_load_per_task =
2648 min(sds->busiest_load_per_task, sds->avg_load);
2649 }
2650
2651 /*
2652 * In the presence of smp nice balancing, certain scenarios can have
2653 * max load less than avg load(as we skip the groups at or below
2654 * its cpu_power, while calculating max_load..)
2655 */
2656 if (sds->max_load < sds->avg_load) {
2657 *imbalance = 0;
2658 return fix_small_imbalance(sds, this_cpu, imbalance);
2659 }
2660
2661 if (!sds->group_imb) {
2662 /*
2663 * Don't want to pull so many tasks that a group would go idle.
2664 */
2665 load_above_capacity = (sds->busiest_nr_running -
2666 sds->busiest_group_capacity);
2667
2668 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
2669
2670 load_above_capacity /= sds->busiest->cpu_power;
2671 }
2672
2673 /*
2674 * We're trying to get all the cpus to the average_load, so we don't
2675 * want to push ourselves above the average load, nor do we wish to
2676 * reduce the max loaded cpu below the average load. At the same time,
2677 * we also don't want to reduce the group load below the group capacity
2678 * (so that we can implement power-savings policies etc). Thus we look
2679 * for the minimum possible imbalance.
2680 * Be careful of negative numbers as they'll appear as very large values
2681 * with unsigned longs.
2682 */
2683 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
2684
2685 /* How much load to actually move to equalise the imbalance */
2686 *imbalance = min(max_pull * sds->busiest->cpu_power,
2687 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
2688 / SCHED_LOAD_SCALE;
2689
2690 /*
2691 * if *imbalance is less than the average load per runnable task
2692 * there is no gaurantee that any tasks will be moved so we'll have
2693 * a think about bumping its value to force at least one task to be
2694 * moved
2695 */
2696 if (*imbalance < sds->busiest_load_per_task)
2697 return fix_small_imbalance(sds, this_cpu, imbalance);
2698
2699}
2700/******* find_busiest_group() helpers end here *********************/
2701
2702/**
2703 * find_busiest_group - Returns the busiest group within the sched_domain
2704 * if there is an imbalance. If there isn't an imbalance, and
2705 * the user has opted for power-savings, it returns a group whose
2706 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
2707 * such a group exists.
2708 *
2709 * Also calculates the amount of weighted load which should be moved
2710 * to restore balance.
2711 *
2712 * @sd: The sched_domain whose busiest group is to be returned.
2713 * @this_cpu: The cpu for which load balancing is currently being performed.
2714 * @imbalance: Variable which stores amount of weighted load which should
2715 * be moved to restore balance/put a group to idle.
2716 * @idle: The idle status of this_cpu.
2717 * @sd_idle: The idleness of sd
2718 * @cpus: The set of CPUs under consideration for load-balancing.
2719 * @balance: Pointer to a variable indicating if this_cpu
2720 * is the appropriate cpu to perform load balancing at this_level.
2721 *
2722 * Returns: - the busiest group if imbalance exists.
2723 * - If no imbalance and user has opted for power-savings balance,
2724 * return the least loaded group whose CPUs can be
2725 * put to idle by rebalancing its tasks onto our group.
2726 */
2727static struct sched_group *
2728find_busiest_group(struct sched_domain *sd, int this_cpu,
2729 unsigned long *imbalance, enum cpu_idle_type idle,
2730 int *sd_idle, const struct cpumask *cpus, int *balance)
2731{
2732 struct sd_lb_stats sds;
2733
2734 memset(&sds, 0, sizeof(sds));
2735
2736 /*
2737 * Compute the various statistics relavent for load balancing at
2738 * this level.
2739 */
2740 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
2741 balance, &sds);
2742
2743 /* Cases where imbalance does not exist from POV of this_cpu */
2744 /* 1) this_cpu is not the appropriate cpu to perform load balancing
2745 * at this level.
2746 * 2) There is no busy sibling group to pull from.
2747 * 3) This group is the busiest group.
2748 * 4) This group is more busy than the avg busieness at this
2749 * sched_domain.
2750 * 5) The imbalance is within the specified limit.
2751 */
2752 if (!(*balance))
2753 goto ret;
2754
2755 if (!sds.busiest || sds.busiest_nr_running == 0)
2756 goto out_balanced;
2757
2758 if (sds.this_load >= sds.max_load)
2759 goto out_balanced;
2760
2761 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
2762
2763 if (sds.this_load >= sds.avg_load)
2764 goto out_balanced;
2765
2766 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2767 goto out_balanced;
2768
2769 /* Looks like there is an imbalance. Compute it */
2770 calculate_imbalance(&sds, this_cpu, imbalance);
2771 return sds.busiest;
2772
2773out_balanced:
2774 /*
2775 * There is no obvious imbalance. But check if we can do some balancing
2776 * to save power.
2777 */
2778 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
2779 return sds.busiest;
2780ret:
2781 *imbalance = 0;
2782 return NULL;
2783}
2784
2785/*
2786 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2787 */
2788static struct rq *
2789find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2790 unsigned long imbalance, const struct cpumask *cpus)
2791{
2792 struct rq *busiest = NULL, *rq;
2793 unsigned long max_load = 0;
2794 int i;
2795
2796 for_each_cpu(i, sched_group_cpus(group)) {
2797 unsigned long power = power_of(i);
2798 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2799 unsigned long wl;
2800
2801 if (!cpumask_test_cpu(i, cpus))
2802 continue;
2803
2804 rq = cpu_rq(i);
2805 wl = weighted_cpuload(i);
2806
2807 /*
2808 * When comparing with imbalance, use weighted_cpuload()
2809 * which is not scaled with the cpu power.
2810 */
2811 if (capacity && rq->nr_running == 1 && wl > imbalance)
2812 continue;
2813
2814 /*
2815 * For the load comparisons with the other cpu's, consider
2816 * the weighted_cpuload() scaled with the cpu power, so that
2817 * the load can be moved away from the cpu that is potentially
2818 * running at a lower capacity.
2819 */
2820 wl = (wl * SCHED_LOAD_SCALE) / power;
2821
2822 if (wl > max_load) {
2823 max_load = wl;
2824 busiest = rq;
2825 }
2826 }
2827
2828 return busiest;
2829}
2830
2831/*
2832 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2833 * so long as it is large enough.
2834 */
2835#define MAX_PINNED_INTERVAL 512
2836
2837/* Working cpumask for load_balance and load_balance_newidle. */
2838static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2839
2840static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2841{
2842 if (idle == CPU_NEWLY_IDLE) {
2843 /*
2844 * The only task running in a non-idle cpu can be moved to this
2845 * cpu in an attempt to completely freeup the other CPU
2846 * package.
2847 *
2848 * The package power saving logic comes from
2849 * find_busiest_group(). If there are no imbalance, then
2850 * f_b_g() will return NULL. However when sched_mc={1,2} then
2851 * f_b_g() will select a group from which a running task may be
2852 * pulled to this cpu in order to make the other package idle.
2853 * If there is no opportunity to make a package idle and if
2854 * there are no imbalance, then f_b_g() will return NULL and no
2855 * action will be taken in load_balance_newidle().
2856 *
2857 * Under normal task pull operation due to imbalance, there
2858 * will be more than one task in the source run queue and
2859 * move_tasks() will succeed. ld_moved will be true and this
2860 * active balance code will not be triggered.
2861 */
2862 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2863 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2864 return 0;
2865
2866 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
2867 return 0;
2868 }
2869
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871}
2872
2873/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance.
2876 */
2877static int load_balance(int this_cpu, struct rq *this_rq,
2878 struct sched_domain *sd, enum cpu_idle_type idle,
2879 int *balance)
2880{
2881 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2882 struct sched_group *group;
2883 unsigned long imbalance;
2884 struct rq *busiest;
2885 unsigned long flags;
2886 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
2887
2888 cpumask_copy(cpus, cpu_active_mask);
2889
2890 /*
2891 * When power savings policy is enabled for the parent domain, idle
2892 * sibling can pick up load irrespective of busy siblings. In this case,
2893 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2894 * portraying it as CPU_NOT_IDLE.
2895 */
2896 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2897 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2898 sd_idle = 1;
2899
2900 schedstat_inc(sd, lb_count[idle]);
2901
2902redo:
2903 update_shares(sd);
2904 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2905 cpus, balance);
2906
2907 if (*balance == 0)
2908 goto out_balanced;
2909
2910 if (!group) {
2911 schedstat_inc(sd, lb_nobusyg[idle]);
2912 goto out_balanced;
2913 }
2914
2915 busiest = find_busiest_queue(group, idle, imbalance, cpus);
2916 if (!busiest) {
2917 schedstat_inc(sd, lb_nobusyq[idle]);
2918 goto out_balanced;
2919 }
2920
2921 BUG_ON(busiest == this_rq);
2922
2923 schedstat_add(sd, lb_imbalance[idle], imbalance);
2924
2925 ld_moved = 0;
2926 if (busiest->nr_running > 1) {
2927 /*
2928 * Attempt to move tasks. If find_busiest_group has found
2929 * an imbalance but busiest->nr_running <= 1, the group is
2930 * still unbalanced. ld_moved simply stays zero, so it is
2931 * correctly treated as an imbalance.
2932 */
2933 local_irq_save(flags);
2934 double_rq_lock(this_rq, busiest);
2935 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2936 imbalance, sd, idle, &all_pinned);
2937 double_rq_unlock(this_rq, busiest);
2938 local_irq_restore(flags);
2939
2940 /*
2941 * some other cpu did the load balance for us.
2942 */
2943 if (ld_moved && this_cpu != smp_processor_id())
2944 resched_cpu(this_cpu);
2945
2946 /* All tasks on this runqueue were pinned by CPU affinity */
2947 if (unlikely(all_pinned)) {
2948 cpumask_clear_cpu(cpu_of(busiest), cpus);
2949 if (!cpumask_empty(cpus))
2950 goto redo;
2951 goto out_balanced;
2952 }
2953 }
2954
2955 if (!ld_moved) {
2956 schedstat_inc(sd, lb_failed[idle]);
2957 sd->nr_balance_failed++;
2958
2959 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags);
2961
2962 /* don't kick the migration_thread, if the curr
2963 * task on busiest cpu can't be moved to this_cpu
2964 */
2965 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) {
2967 raw_spin_unlock_irqrestore(&busiest->lock,
2968 flags);
2969 all_pinned = 1;
2970 goto out_one_pinned;
2971 }
2972
2973 if (!busiest->active_balance) {
2974 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu;
2976 active_balance = 1;
2977 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2979 if (active_balance)
2980 wake_up_process(busiest->migration_thread);
2981
2982 /*
2983 * We've kicked active balancing, reset the failure
2984 * counter.
2985 */
2986 sd->nr_balance_failed = sd->cache_nice_tries+1;
2987 }
2988 } else
2989 sd->nr_balance_failed = 0;
2990
2991 if (likely(!active_balance)) {
2992 /* We were unbalanced, so reset the balancing interval */
2993 sd->balance_interval = sd->min_interval;
2994 } else {
2995 /*
2996 * If we've begun active balancing, start to back off. This
2997 * case may not be covered by the all_pinned logic if there
2998 * is only 1 task on the busy runqueue (because we don't call
2999 * move_tasks).
3000 */
3001 if (sd->balance_interval < sd->max_interval)
3002 sd->balance_interval *= 2;
3003 }
3004
3005 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3006 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3007 ld_moved = -1;
3008
3009 goto out;
3010
3011out_balanced:
3012 schedstat_inc(sd, lb_balanced[idle]);
3013
3014 sd->nr_balance_failed = 0;
3015
3016out_one_pinned:
3017 /* tune up the balancing interval */
3018 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3019 (sd->balance_interval < sd->max_interval))
3020 sd->balance_interval *= 2;
3021
3022 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3023 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3024 ld_moved = -1;
3025 else
3026 ld_moved = 0;
3027out:
3028 if (ld_moved)
3029 update_shares(sd);
3030 return ld_moved;
3031}
3032
3033/*
3034 * idle_balance is called by schedule() if this_cpu is about to become
3035 * idle. Attempts to pull tasks from other CPUs.
3036 */
3037static void idle_balance(int this_cpu, struct rq *this_rq)
3038{
3039 struct sched_domain *sd;
3040 int pulled_task = 0;
3041 unsigned long next_balance = jiffies + HZ;
3042
3043 this_rq->idle_stamp = this_rq->clock;
3044
3045 if (this_rq->avg_idle < sysctl_sched_migration_cost)
3046 return;
3047
3048 /*
3049 * Drop the rq->lock, but keep IRQ/preempt disabled.
3050 */
3051 raw_spin_unlock(&this_rq->lock);
3052
3053 for_each_domain(this_cpu, sd) {
3054 unsigned long interval;
3055 int balance = 1;
3056
3057 if (!(sd->flags & SD_LOAD_BALANCE))
3058 continue;
3059
3060 if (sd->flags & SD_BALANCE_NEWIDLE) {
3061 /* If we've pulled tasks over stop searching: */
3062 pulled_task = load_balance(this_cpu, this_rq,
3063 sd, CPU_NEWLY_IDLE, &balance);
3064 }
3065
3066 interval = msecs_to_jiffies(sd->balance_interval);
3067 if (time_after(next_balance, sd->last_balance + interval))
3068 next_balance = sd->last_balance + interval;
3069 if (pulled_task) {
3070 this_rq->idle_stamp = 0;
3071 break;
3072 }
3073 }
3074
3075 raw_spin_lock(&this_rq->lock);
3076
3077 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3078 /*
3079 * We are going idle. next_balance may be set based on
3080 * a busy processor. So reset next_balance.
3081 */
3082 this_rq->next_balance = next_balance;
3083 }
3084}
3085
3086/*
3087 * active_load_balance is run by migration threads. It pushes running tasks
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
3089 * running on each physical CPU where possible, and avoids physical /
3090 * logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3095{
3096 int target_cpu = busiest_rq->push_cpu;
3097 struct sched_domain *sd;
3098 struct rq *target_rq;
3099
3100 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1)
3102 return;
3103
3104 target_rq = cpu_rq(target_cpu);
3105
3106 /*
3107 * This condition is "impossible", if it occurs
3108 * we need to fix it. Originally reported by
3109 * Bjorn Helgaas on a 128-cpu setup.
3110 */
3111 BUG_ON(busiest_rq == target_rq);
3112
3113 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117
3118 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) {
3120 if ((sd->flags & SD_LOAD_BALANCE) &&
3121 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3122 break;
3123 }
3124
3125 if (likely(sd)) {
3126 schedstat_inc(sd, alb_count);
3127
3128 if (move_one_task(target_rq, target_cpu, busiest_rq,
3129 sd, CPU_IDLE))
3130 schedstat_inc(sd, alb_pushed);
3131 else
3132 schedstat_inc(sd, alb_failed);
3133 }
3134 double_unlock_balance(busiest_rq, target_rq);
3135}
3136
3137#ifdef CONFIG_NO_HZ
3138static struct {
3139 atomic_t load_balancer;
3140 cpumask_var_t cpu_mask;
3141 cpumask_var_t ilb_grp_nohz_mask;
3142} nohz ____cacheline_aligned = {
3143 .load_balancer = ATOMIC_INIT(-1),
3144};
3145
3146int get_nohz_load_balancer(void)
3147{
3148 return atomic_read(&nohz.load_balancer);
3149}
3150
3151#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3152/**
3153 * lowest_flag_domain - Return lowest sched_domain containing flag.
3154 * @cpu: The cpu whose lowest level of sched domain is to
3155 * be returned.
3156 * @flag: The flag to check for the lowest sched_domain
3157 * for the given cpu.
3158 *
3159 * Returns the lowest sched_domain of a cpu which contains the given flag.
3160 */
3161static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3162{
3163 struct sched_domain *sd;
3164
3165 for_each_domain(cpu, sd)
3166 if (sd && (sd->flags & flag))
3167 break;
3168
3169 return sd;
3170}
3171
3172/**
3173 * for_each_flag_domain - Iterates over sched_domains containing the flag.
3174 * @cpu: The cpu whose domains we're iterating over.
3175 * @sd: variable holding the value of the power_savings_sd
3176 * for cpu.
3177 * @flag: The flag to filter the sched_domains to be iterated.
3178 *
3179 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
3180 * set, starting from the lowest sched_domain to the highest.
3181 */
3182#define for_each_flag_domain(cpu, sd, flag) \
3183 for (sd = lowest_flag_domain(cpu, flag); \
3184 (sd && (sd->flags & flag)); sd = sd->parent)
3185
3186/**
3187 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
3188 * @ilb_group: group to be checked for semi-idleness
3189 *
3190 * Returns: 1 if the group is semi-idle. 0 otherwise.
3191 *
3192 * We define a sched_group to be semi idle if it has atleast one idle-CPU
3193 * and atleast one non-idle CPU. This helper function checks if the given
3194 * sched_group is semi-idle or not.
3195 */
3196static inline int is_semi_idle_group(struct sched_group *ilb_group)
3197{
3198 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
3199 sched_group_cpus(ilb_group));
3200
3201 /*
3202 * A sched_group is semi-idle when it has atleast one busy cpu
3203 * and atleast one idle cpu.
3204 */
3205 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
3206 return 0;
3207
3208 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
3209 return 0;
3210
3211 return 1;
3212}
3213/**
3214 * find_new_ilb - Finds the optimum idle load balancer for nomination.
3215 * @cpu: The cpu which is nominating a new idle_load_balancer.
3216 *
3217 * Returns: Returns the id of the idle load balancer if it exists,
3218 * Else, returns >= nr_cpu_ids.
3219 *
3220 * This algorithm picks the idle load balancer such that it belongs to a
3221 * semi-idle powersavings sched_domain. The idea is to try and avoid
3222 * completely idle packages/cores just for the purpose of idle load balancing
3223 * when there are other idle cpu's which are better suited for that job.
3224 */
3225static int find_new_ilb(int cpu)
3226{
3227 struct sched_domain *sd;
3228 struct sched_group *ilb_group;
3229
3230 /*
3231 * Have idle load balancer selection from semi-idle packages only
3232 * when power-aware load balancing is enabled
3233 */
3234 if (!(sched_smt_power_savings || sched_mc_power_savings))
3235 goto out_done;
3236
3237 /*
3238 * Optimize for the case when we have no idle CPUs or only one
3239 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3240 */
3241 if (cpumask_weight(nohz.cpu_mask) < 2)
3242 goto out_done;
3243
3244 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3245 ilb_group = sd->groups;
3246
3247 do {
3248 if (is_semi_idle_group(ilb_group))
3249 return cpumask_first(nohz.ilb_grp_nohz_mask);
3250
3251 ilb_group = ilb_group->next;
3252
3253 } while (ilb_group != sd->groups);
3254 }
3255
3256out_done:
3257 return cpumask_first(nohz.cpu_mask);
3258}
3259#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3260static inline int find_new_ilb(int call_cpu)
3261{
3262 return cpumask_first(nohz.cpu_mask);
3263}
3264#endif
3265
3266/*
3267 * This routine will try to nominate the ilb (idle load balancing)
3268 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3269 * load balancing on behalf of all those cpus. If all the cpus in the system
3270 * go into this tickless mode, then there will be no ilb owner (as there is
3271 * no need for one) and all the cpus will sleep till the next wakeup event
3272 * arrives...
3273 *
3274 * For the ilb owner, tick is not stopped. And this tick will be used
3275 * for idle load balancing. ilb owner will still be part of
3276 * nohz.cpu_mask..
3277 *
3278 * While stopping the tick, this cpu will become the ilb owner if there
3279 * is no other owner. And will be the owner till that cpu becomes busy
3280 * or if all cpus in the system stop their ticks at which point
3281 * there is no need for ilb owner.
3282 *
3283 * When the ilb owner becomes busy, it nominates another owner, during the
3284 * next busy scheduler_tick()
3285 */
3286int select_nohz_load_balancer(int stop_tick)
3287{
3288 int cpu = smp_processor_id();
3289
3290 if (stop_tick) {
3291 cpu_rq(cpu)->in_nohz_recently = 1;
3292
3293 if (!cpu_active(cpu)) {
3294 if (atomic_read(&nohz.load_balancer) != cpu)
3295 return 0;
3296
3297 /*
3298 * If we are going offline and still the leader,
3299 * give up!
3300 */
3301 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3302 BUG();
3303
3304 return 0;
3305 }
3306
3307 cpumask_set_cpu(cpu, nohz.cpu_mask);
3308
3309 /* time for ilb owner also to sleep */
3310 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
3311 if (atomic_read(&nohz.load_balancer) == cpu)
3312 atomic_set(&nohz.load_balancer, -1);
3313 return 0;
3314 }
3315
3316 if (atomic_read(&nohz.load_balancer) == -1) {
3317 /* make me the ilb owner */
3318 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3319 return 1;
3320 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3321 int new_ilb;
3322
3323 if (!(sched_smt_power_savings ||
3324 sched_mc_power_savings))
3325 return 1;
3326 /*
3327 * Check to see if there is a more power-efficient
3328 * ilb.
3329 */
3330 new_ilb = find_new_ilb(cpu);
3331 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3332 atomic_set(&nohz.load_balancer, -1);
3333 resched_cpu(new_ilb);
3334 return 0;
3335 }
3336 return 1;
3337 }
3338 } else {
3339 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3340 return 0;
3341
3342 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3343
3344 if (atomic_read(&nohz.load_balancer) == cpu)
3345 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3346 BUG();
3347 }
3348 return 0;
3349}
3350#endif
3351
3352static DEFINE_SPINLOCK(balancing);
3353
3354/*
3355 * It checks each scheduling domain to see if it is due to be balanced,
3356 * and initiates a balancing operation if so.
3357 *
3358 * Balancing parameters are set up in arch_init_sched_domains.
3359 */
3360static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3361{
3362 int balance = 1;
3363 struct rq *rq = cpu_rq(cpu);
3364 unsigned long interval;
3365 struct sched_domain *sd;
3366 /* Earliest time when we have to do rebalance again */
3367 unsigned long next_balance = jiffies + 60*HZ;
3368 int update_next_balance = 0;
3369 int need_serialize;
3370
3371 for_each_domain(cpu, sd) {
3372 if (!(sd->flags & SD_LOAD_BALANCE))
3373 continue;
3374
3375 interval = sd->balance_interval;
3376 if (idle != CPU_IDLE)
3377 interval *= sd->busy_factor;
3378
3379 /* scale ms to jiffies */
3380 interval = msecs_to_jiffies(interval);
3381 if (unlikely(!interval))
3382 interval = 1;
3383 if (interval > HZ*NR_CPUS/10)
3384 interval = HZ*NR_CPUS/10;
3385
3386 need_serialize = sd->flags & SD_SERIALIZE;
3387
3388 if (need_serialize) {
3389 if (!spin_trylock(&balancing))
3390 goto out;
3391 }
3392
3393 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3394 if (load_balance(cpu, rq, sd, idle, &balance)) {
3395 /*
3396 * We've pulled tasks over so either we're no
3397 * longer idle, or one of our SMT siblings is
3398 * not idle.
3399 */
3400 idle = CPU_NOT_IDLE;
3401 }
3402 sd->last_balance = jiffies;
3403 }
3404 if (need_serialize)
3405 spin_unlock(&balancing);
3406out:
3407 if (time_after(next_balance, sd->last_balance + interval)) {
3408 next_balance = sd->last_balance + interval;
3409 update_next_balance = 1;
3410 }
3411
3412 /*
3413 * Stop the load balance at this level. There is another
3414 * CPU in our sched group which is doing load balancing more
3415 * actively.
3416 */
3417 if (!balance)
3418 break;
3419 }
3420
3421 /*
3422 * next_balance will be updated only when there is a need.
3423 * When the cpu is attached to null domain for ex, it will not be
3424 * updated.
3425 */
3426 if (likely(update_next_balance))
3427 rq->next_balance = next_balance;
3428}
3429
3430/*
3431 * run_rebalance_domains is triggered when needed from the scheduler tick.
3432 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3433 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3434 */
3435static void run_rebalance_domains(struct softirq_action *h)
3436{
3437 int this_cpu = smp_processor_id();
3438 struct rq *this_rq = cpu_rq(this_cpu);
3439 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3440 CPU_IDLE : CPU_NOT_IDLE;
3441
3442 rebalance_domains(this_cpu, idle);
3443
3444#ifdef CONFIG_NO_HZ
3445 /*
3446 * If this cpu is the owner for idle load balancing, then do the
3447 * balancing on behalf of the other idle cpus whose ticks are
3448 * stopped.
3449 */
3450 if (this_rq->idle_at_tick &&
3451 atomic_read(&nohz.load_balancer) == this_cpu) {
3452 struct rq *rq;
3453 int balance_cpu;
3454
3455 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3456 if (balance_cpu == this_cpu)
3457 continue;
3458
3459 /*
3460 * If this cpu gets work to do, stop the load balancing
3461 * work being done for other cpus. Next load
3462 * balancing owner will pick it up.
3463 */
3464 if (need_resched())
3465 break;
3466
3467 rebalance_domains(balance_cpu, CPU_IDLE);
3468
3469 rq = cpu_rq(balance_cpu);
3470 if (time_after(this_rq->next_balance, rq->next_balance))
3471 this_rq->next_balance = rq->next_balance;
3472 }
3473 }
3474#endif
3475}
3476
3477static inline int on_null_domain(int cpu)
3478{
3479 return !rcu_dereference(cpu_rq(cpu)->sd);
3480}
3481
3482/*
3483 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3484 *
3485 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3486 * idle load balancing owner or decide to stop the periodic load balancing,
3487 * if the whole system is idle.
3488 */
3489static inline void trigger_load_balance(struct rq *rq, int cpu)
3490{
3491#ifdef CONFIG_NO_HZ
3492 /*
3493 * If we were in the nohz mode recently and busy at the current
3494 * scheduler tick, then check if we need to nominate new idle
3495 * load balancer.
3496 */
3497 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3498 rq->in_nohz_recently = 0;
3499
3500 if (atomic_read(&nohz.load_balancer) == cpu) {
3501 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3502 atomic_set(&nohz.load_balancer, -1);
3503 }
3504
3505 if (atomic_read(&nohz.load_balancer) == -1) {
3506 int ilb = find_new_ilb(cpu);
3507
3508 if (ilb < nr_cpu_ids)
3509 resched_cpu(ilb);
3510 }
3511 }
3512
3513 /*
3514 * If this cpu is idle and doing idle load balancing for all the
3515 * cpus with ticks stopped, is it time for that to stop?
3516 */
3517 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3518 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3519 resched_cpu(cpu);
3520 return;
3521 }
3522
3523 /*
3524 * If this cpu is idle and the idle load balancing is done by
3525 * someone else, then no need raise the SCHED_SOFTIRQ
3526 */
3527 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3528 cpumask_test_cpu(cpu, nohz.cpu_mask))
3529 return;
3530#endif
3531 /* Don't need to rebalance while attached to NULL domain */
3532 if (time_after_eq(jiffies, rq->next_balance) &&
3533 likely(!on_null_domain(cpu)))
3534 raise_softirq(SCHED_SOFTIRQ);
3535}
1954 3536
1955static void rq_online_fair(struct rq *rq) 3537static void rq_online_fair(struct rq *rq)
1956{ 3538{
@@ -1962,6 +3544,15 @@ static void rq_offline_fair(struct rq *rq)
1962 update_sysctl(); 3544 update_sysctl();
1963} 3545}
1964 3546
3547#else /* CONFIG_SMP */
3548
3549/*
3550 * on UP we do not need to balance between CPUs:
3551 */
3552static inline void idle_balance(int cpu, struct rq *rq)
3553{
3554}
3555
1965#endif /* CONFIG_SMP */ 3556#endif /* CONFIG_SMP */
1966 3557
1967/* 3558/*
@@ -2076,7 +3667,7 @@ static void moved_group_fair(struct task_struct *p, int on_rq)
2076} 3667}
2077#endif 3668#endif
2078 3669
2079unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 3670static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
2080{ 3671{
2081 struct sched_entity *se = &task->se; 3672 struct sched_entity *se = &task->se;
2082 unsigned int rr_interval = 0; 3673 unsigned int rr_interval = 0;
@@ -2108,8 +3699,6 @@ static const struct sched_class fair_sched_class = {
2108#ifdef CONFIG_SMP 3699#ifdef CONFIG_SMP
2109 .select_task_rq = select_task_rq_fair, 3700 .select_task_rq = select_task_rq_fair,
2110 3701
2111 .load_balance = load_balance_fair,
2112 .move_one_task = move_one_task_fair,
2113 .rq_online = rq_online_fair, 3702 .rq_online = rq_online_fair,
2114 .rq_offline = rq_offline_fair, 3703 .rq_offline = rq_offline_fair,
2115 3704
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 5f93b570d383..a8a6d8a50947 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
44{ 44{
45} 45}
46 46
47#ifdef CONFIG_SMP
48static unsigned long
49load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
50 unsigned long max_load_move,
51 struct sched_domain *sd, enum cpu_idle_type idle,
52 int *all_pinned, int *this_best_prio)
53{
54 return 0;
55}
56
57static int
58move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
59 struct sched_domain *sd, enum cpu_idle_type idle)
60{
61 return 0;
62}
63#endif
64
65static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 47static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
66{ 48{
67} 49}
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 79 check_preempt_curr(rq, p, 0);
98} 80}
99 81
100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 83{
102 return 0; 84 return 0;
103} 85}
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = {
119 101
120#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
121 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
122
123 .load_balance = load_balance_idle,
124 .move_one_task = move_one_task_idle,
125#endif 104#endif
126 105
127 .set_curr_task = set_curr_task_idle, 106 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f48328ac216f..5a6ed1f0990a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
194 return rt_se->my_q; 194 return rt_se->my_q;
195} 195}
196 196
197static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 197static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
198static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 198static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
199 199
200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 int this_cpu = smp_processor_id();
202 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 203 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
203 struct sched_rt_entity *rt_se = rt_rq->rt_se; 204 struct sched_rt_entity *rt_se;
205
206 rt_se = rt_rq->tg->rt_se[this_cpu];
204 207
205 if (rt_rq->rt_nr_running) { 208 if (rt_rq->rt_nr_running) {
206 if (rt_se && !on_rt_rq(rt_se)) 209 if (rt_se && !on_rt_rq(rt_se))
207 enqueue_rt_entity(rt_se); 210 enqueue_rt_entity(rt_se, false);
208 if (rt_rq->highest_prio.curr < curr->prio) 211 if (rt_rq->highest_prio.curr < curr->prio)
209 resched_task(curr); 212 resched_task(curr);
210 } 213 }
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212 215
213static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 216static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
214{ 217{
215 struct sched_rt_entity *rt_se = rt_rq->rt_se; 218 int this_cpu = smp_processor_id();
219 struct sched_rt_entity *rt_se;
220
221 rt_se = rt_rq->tg->rt_se[this_cpu];
216 222
217 if (rt_se && on_rt_rq(rt_se)) 223 if (rt_se && on_rt_rq(rt_se))
218 dequeue_rt_entity(rt_se); 224 dequeue_rt_entity(rt_se);
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
803 dec_rt_group(rt_se, rt_rq); 809 dec_rt_group(rt_se, rt_rq);
804} 810}
805 811
806static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 812static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
807{ 813{
808 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 814 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
809 struct rt_prio_array *array = &rt_rq->active; 815 struct rt_prio_array *array = &rt_rq->active;
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
819 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
820 return; 826 return;
821 827
822 list_add_tail(&rt_se->run_list, queue); 828 if (head)
829 list_add(&rt_se->run_list, queue);
830 else
831 list_add_tail(&rt_se->run_list, queue);
823 __set_bit(rt_se_prio(rt_se), array->bitmap); 832 __set_bit(rt_se_prio(rt_se), array->bitmap);
824 833
825 inc_rt_tasks(rt_se, rt_rq); 834 inc_rt_tasks(rt_se, rt_rq);
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
856 } 865 }
857} 866}
858 867
859static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 868static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
860{ 869{
861 dequeue_rt_stack(rt_se); 870 dequeue_rt_stack(rt_se);
862 for_each_sched_rt_entity(rt_se) 871 for_each_sched_rt_entity(rt_se)
863 __enqueue_rt_entity(rt_se); 872 __enqueue_rt_entity(rt_se, head);
864} 873}
865 874
866static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 875static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
@@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
871 struct rt_rq *rt_rq = group_rt_rq(rt_se); 880 struct rt_rq *rt_rq = group_rt_rq(rt_se);
872 881
873 if (rt_rq && rt_rq->rt_nr_running) 882 if (rt_rq && rt_rq->rt_nr_running)
874 __enqueue_rt_entity(rt_se); 883 __enqueue_rt_entity(rt_se, false);
875 } 884 }
876} 885}
877 886
878/* 887/*
879 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
880 */ 889 */
881static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
882{ 892{
883 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
884 894
885 if (wakeup) 895 if (wakeup)
886 rt_se->timeout = 0; 896 rt_se->timeout = 0;
887 897
888 enqueue_rt_entity(rt_se); 898 enqueue_rt_entity(rt_se, head);
889 899
890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
891 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
@@ -1481,24 +1491,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1481 push_rt_tasks(rq); 1491 push_rt_tasks(rq);
1482} 1492}
1483 1493
1484static unsigned long
1485load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1486 unsigned long max_load_move,
1487 struct sched_domain *sd, enum cpu_idle_type idle,
1488 int *all_pinned, int *this_best_prio)
1489{
1490 /* don't touch RT tasks */
1491 return 0;
1492}
1493
1494static int
1495move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1496 struct sched_domain *sd, enum cpu_idle_type idle)
1497{
1498 /* don't touch RT tasks */
1499 return 0;
1500}
1501
1502static void set_cpus_allowed_rt(struct task_struct *p, 1494static void set_cpus_allowed_rt(struct task_struct *p,
1503 const struct cpumask *new_mask) 1495 const struct cpumask *new_mask)
1504{ 1496{
@@ -1670,8 +1662,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1670 if (!p->signal) 1662 if (!p->signal)
1671 return; 1663 return;
1672 1664
1673 soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; 1665 /* max may change after cur was read, this will be fixed next tick */
1674 hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; 1666 soft = task_rlimit(p, RLIMIT_RTTIME);
1667 hard = task_rlimit_max(p, RLIMIT_RTTIME);
1675 1668
1676 if (soft != RLIM_INFINITY) { 1669 if (soft != RLIM_INFINITY) {
1677 unsigned long next; 1670 unsigned long next;
@@ -1721,7 +1714,7 @@ static void set_curr_task_rt(struct rq *rq)
1721 dequeue_pushable_task(rq, p); 1714 dequeue_pushable_task(rq, p);
1722} 1715}
1723 1716
1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 1717static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1725{ 1718{
1726 /* 1719 /*
1727 * Time slice is 0 for SCHED_FIFO tasks 1720 * Time slice is 0 for SCHED_FIFO tasks
@@ -1746,8 +1739,6 @@ static const struct sched_class rt_sched_class = {
1746#ifdef CONFIG_SMP 1739#ifdef CONFIG_SMP
1747 .select_task_rq = select_task_rq_rt, 1740 .select_task_rq = select_task_rq_rt,
1748 1741
1749 .load_balance = load_balance_rt,
1750 .move_one_task = move_one_task_rt,
1751 .set_cpus_allowed = set_cpus_allowed_rt, 1742 .set_cpus_allowed = set_cpus_allowed_rt,
1752 .rq_online = rq_online_rt, 1743 .rq_online = rq_online_rt,
1753 .rq_offline = rq_offline_rt, 1744 .rq_offline = rq_offline_rt,
diff --git a/kernel/signal.c b/kernel/signal.c
index 934ae5e687b9..dbd7fe073c55 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -159,6 +159,10 @@ void recalc_sigpending(void)
159 159
160/* Given the mask, find the first available signal that should be serviced. */ 160/* Given the mask, find the first available signal that should be serviced. */
161 161
162#define SYNCHRONOUS_MASK \
163 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
164 sigmask(SIGTRAP) | sigmask(SIGFPE))
165
162int next_signal(struct sigpending *pending, sigset_t *mask) 166int next_signal(struct sigpending *pending, sigset_t *mask)
163{ 167{
164 unsigned long i, *s, *m, x; 168 unsigned long i, *s, *m, x;
@@ -166,26 +170,39 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
166 170
167 s = pending->signal.sig; 171 s = pending->signal.sig;
168 m = mask->sig; 172 m = mask->sig;
173
174 /*
175 * Handle the first word specially: it contains the
176 * synchronous signals that need to be dequeued first.
177 */
178 x = *s &~ *m;
179 if (x) {
180 if (x & SYNCHRONOUS_MASK)
181 x &= SYNCHRONOUS_MASK;
182 sig = ffz(~x) + 1;
183 return sig;
184 }
185
169 switch (_NSIG_WORDS) { 186 switch (_NSIG_WORDS) {
170 default: 187 default:
171 for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) 188 for (i = 1; i < _NSIG_WORDS; ++i) {
172 if ((x = *s &~ *m) != 0) { 189 x = *++s &~ *++m;
173 sig = ffz(~x) + i*_NSIG_BPW + 1; 190 if (!x)
174 break; 191 continue;
175 } 192 sig = ffz(~x) + i*_NSIG_BPW + 1;
193 break;
194 }
176 break; 195 break;
177 196
178 case 2: if ((x = s[0] &~ m[0]) != 0) 197 case 2:
179 sig = 1; 198 x = s[1] &~ m[1];
180 else if ((x = s[1] &~ m[1]) != 0) 199 if (!x)
181 sig = _NSIG_BPW + 1;
182 else
183 break; 200 break;
184 sig += ffz(~x); 201 sig = ffz(~x) + _NSIG_BPW + 1;
185 break; 202 break;
186 203
187 case 1: if ((x = *s &~ *m) != 0) 204 case 1:
188 sig = ffz(~x) + 1; 205 /* Nothing to do */
189 break; 206 break;
190 } 207 }
191 208
@@ -228,7 +245,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
228 245
229 if (override_rlimit || 246 if (override_rlimit ||
230 atomic_read(&user->sigpending) <= 247 atomic_read(&user->sigpending) <=
231 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { 248 task_rlimit(t, RLIMIT_SIGPENDING)) {
232 q = kmem_cache_alloc(sigqueue_cachep, flags); 249 q = kmem_cache_alloc(sigqueue_cachep, flags);
233 } else { 250 } else {
234 print_dropped_signal(sig); 251 print_dropped_signal(sig);
diff --git a/kernel/smp.c b/kernel/smp.c
index f10408422444..9867b6bfefce 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -12,8 +12,6 @@
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/cpu.h> 13#include <linux/cpu.h>
14 14
15static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16
17static struct { 15static struct {
18 struct list_head queue; 16 struct list_head queue;
19 raw_spinlock_t lock; 17 raw_spinlock_t lock;
@@ -33,12 +31,14 @@ struct call_function_data {
33 cpumask_var_t cpumask; 31 cpumask_var_t cpumask;
34}; 32};
35 33
34static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
35
36struct call_single_queue { 36struct call_single_queue {
37 struct list_head list; 37 struct list_head list;
38 raw_spinlock_t lock; 38 raw_spinlock_t lock;
39}; 39};
40 40
41static DEFINE_PER_CPU(struct call_function_data, cfd_data); 41static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue);
42 42
43static int 43static int
44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -256,7 +256,7 @@ void generic_smp_call_function_single_interrupt(void)
256 } 256 }
257} 257}
258 258
259static DEFINE_PER_CPU(struct call_single_data, csd_data); 259static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
260 260
261/* 261/*
262 * smp_call_function_single - Run a function on a specific CPU 262 * smp_call_function_single - Run a function on a specific CPU
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 818d7d9aa03c..bde4295774c8 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -34,6 +34,30 @@
34#include <linux/smp.h> 34#include <linux/smp.h>
35#include <linux/srcu.h> 35#include <linux/srcu.h>
36 36
37static int init_srcu_struct_fields(struct srcu_struct *sp)
38{
39 sp->completed = 0;
40 mutex_init(&sp->mutex);
41 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
42 return sp->per_cpu_ref ? 0 : -ENOMEM;
43}
44
45#ifdef CONFIG_DEBUG_LOCK_ALLOC
46
47int __init_srcu_struct(struct srcu_struct *sp, const char *name,
48 struct lock_class_key *key)
49{
50#ifdef CONFIG_DEBUG_LOCK_ALLOC
51 /* Don't re-initialize a lock while it is held. */
52 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
53 lockdep_init_map(&sp->dep_map, name, key, 0);
54#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
55 return init_srcu_struct_fields(sp);
56}
57EXPORT_SYMBOL_GPL(__init_srcu_struct);
58
59#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
60
37/** 61/**
38 * init_srcu_struct - initialize a sleep-RCU structure 62 * init_srcu_struct - initialize a sleep-RCU structure
39 * @sp: structure to initialize. 63 * @sp: structure to initialize.
@@ -44,13 +68,12 @@
44 */ 68 */
45int init_srcu_struct(struct srcu_struct *sp) 69int init_srcu_struct(struct srcu_struct *sp)
46{ 70{
47 sp->completed = 0; 71 return init_srcu_struct_fields(sp);
48 mutex_init(&sp->mutex);
49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
50 return (sp->per_cpu_ref ? 0 : -ENOMEM);
51} 72}
52EXPORT_SYMBOL_GPL(init_srcu_struct); 73EXPORT_SYMBOL_GPL(init_srcu_struct);
53 74
75#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
76
54/* 77/*
55 * srcu_readers_active_idx -- returns approximate number of readers 78 * srcu_readers_active_idx -- returns approximate number of readers
56 * active on the specified rank of per-CPU counters. 79 * active on the specified rank of per-CPU counters.
@@ -100,15 +123,12 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
100} 123}
101EXPORT_SYMBOL_GPL(cleanup_srcu_struct); 124EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
102 125
103/** 126/*
104 * srcu_read_lock - register a new reader for an SRCU-protected structure.
105 * @sp: srcu_struct in which to register the new reader.
106 *
107 * Counts the new reader in the appropriate per-CPU element of the 127 * Counts the new reader in the appropriate per-CPU element of the
108 * srcu_struct. Must be called from process context. 128 * srcu_struct. Must be called from process context.
109 * Returns an index that must be passed to the matching srcu_read_unlock(). 129 * Returns an index that must be passed to the matching srcu_read_unlock().
110 */ 130 */
111int srcu_read_lock(struct srcu_struct *sp) 131int __srcu_read_lock(struct srcu_struct *sp)
112{ 132{
113 int idx; 133 int idx;
114 134
@@ -120,31 +140,27 @@ int srcu_read_lock(struct srcu_struct *sp)
120 preempt_enable(); 140 preempt_enable();
121 return idx; 141 return idx;
122} 142}
123EXPORT_SYMBOL_GPL(srcu_read_lock); 143EXPORT_SYMBOL_GPL(__srcu_read_lock);
124 144
125/** 145/*
126 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
127 * @sp: srcu_struct in which to unregister the old reader.
128 * @idx: return value from corresponding srcu_read_lock().
129 *
130 * Removes the count for the old reader from the appropriate per-CPU 146 * Removes the count for the old reader from the appropriate per-CPU
131 * element of the srcu_struct. Note that this may well be a different 147 * element of the srcu_struct. Note that this may well be a different
132 * CPU than that which was incremented by the corresponding srcu_read_lock(). 148 * CPU than that which was incremented by the corresponding srcu_read_lock().
133 * Must be called from process context. 149 * Must be called from process context.
134 */ 150 */
135void srcu_read_unlock(struct srcu_struct *sp, int idx) 151void __srcu_read_unlock(struct srcu_struct *sp, int idx)
136{ 152{
137 preempt_disable(); 153 preempt_disable();
138 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 154 srcu_barrier(); /* ensure compiler won't misorder critical section. */
139 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 155 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
140 preempt_enable(); 156 preempt_enable();
141} 157}
142EXPORT_SYMBOL_GPL(srcu_read_unlock); 158EXPORT_SYMBOL_GPL(__srcu_read_unlock);
143 159
144/* 160/*
145 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 161 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
146 */ 162 */
147void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 163static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
148{ 164{
149 int idx; 165 int idx;
150 166
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 912823e2a11b..9bb9fb1bd79c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -45,7 +45,7 @@ static int refcount;
45static struct workqueue_struct *stop_machine_wq; 45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle; 46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus; 47static const struct cpumask *active_cpus;
48static void *stop_machine_work; 48static void __percpu *stop_machine_work;
49 49
50static void set_state(enum stopmachine_state newstate) 50static void set_state(enum stopmachine_state newstate)
51{ 51{
diff --git a/kernel/sys.c b/kernel/sys.c
index 18bde979f346..9814e43fb23b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -571,13 +571,7 @@ static int set_user(struct cred *new)
571 if (!new_user) 571 if (!new_user)
572 return -EAGAIN; 572 return -EAGAIN;
573 573
574 if (!task_can_switch_user(new_user, current)) { 574 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
575 free_uid(new_user);
576 return -EINVAL;
577 }
578
579 if (atomic_read(&new_user->processes) >=
580 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
581 new_user != INIT_USER) { 575 new_user != INIT_USER) {
582 free_uid(new_user); 576 free_uid(new_user);
583 return -EAGAIN; 577 return -EAGAIN;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8a68b2448468..0ef19c614f6d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -50,6 +50,7 @@
50#include <linux/ftrace.h> 50#include <linux/ftrace.h>
51#include <linux/slow-work.h> 51#include <linux/slow-work.h>
52#include <linux/perf_event.h> 52#include <linux/perf_event.h>
53#include <linux/kprobes.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/processor.h> 56#include <asm/processor.h>
@@ -1441,7 +1442,7 @@ static struct ctl_table fs_table[] = {
1441}; 1442};
1442 1443
1443static struct ctl_table debug_table[] = { 1444static struct ctl_table debug_table[] = {
1444#if defined(CONFIG_X86) || defined(CONFIG_PPC) 1445#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC)
1445 { 1446 {
1446 .procname = "exception-trace", 1447 .procname = "exception-trace",
1447 .data = &show_unhandled_signals, 1448 .data = &show_unhandled_signals,
@@ -1450,6 +1451,17 @@ static struct ctl_table debug_table[] = {
1450 .proc_handler = proc_dointvec 1451 .proc_handler = proc_dointvec
1451 }, 1452 },
1452#endif 1453#endif
1454#if defined(CONFIG_OPTPROBES)
1455 {
1456 .procname = "kprobes-optimization",
1457 .data = &sysctl_kprobes_optimization,
1458 .maxlen = sizeof(int),
1459 .mode = 0644,
1460 .proc_handler = proc_kprobes_optimization_handler,
1461 .extra1 = &zero,
1462 .extra2 = &one,
1463 },
1464#endif
1453 { } 1465 { }
1454}; 1466};
1455 1467
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 8f5d16e0707a..8cd50d8f9bde 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1331,7 +1331,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1331 ssize_t result; 1331 ssize_t result;
1332 char *pathname; 1332 char *pathname;
1333 int flags; 1333 int flags;
1334 int acc_mode, fmode; 1334 int acc_mode;
1335 1335
1336 pathname = sysctl_getname(name, nlen, &table); 1336 pathname = sysctl_getname(name, nlen, &table);
1337 result = PTR_ERR(pathname); 1337 result = PTR_ERR(pathname);
@@ -1342,15 +1342,12 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1342 if (oldval && oldlen && newval && newlen) { 1342 if (oldval && oldlen && newval && newlen) {
1343 flags = O_RDWR; 1343 flags = O_RDWR;
1344 acc_mode = MAY_READ | MAY_WRITE; 1344 acc_mode = MAY_READ | MAY_WRITE;
1345 fmode = FMODE_READ | FMODE_WRITE;
1346 } else if (newval && newlen) { 1345 } else if (newval && newlen) {
1347 flags = O_WRONLY; 1346 flags = O_WRONLY;
1348 acc_mode = MAY_WRITE; 1347 acc_mode = MAY_WRITE;
1349 fmode = FMODE_WRITE;
1350 } else if (oldval && oldlen) { 1348 } else if (oldval && oldlen) {
1351 flags = O_RDONLY; 1349 flags = O_RDONLY;
1352 acc_mode = MAY_READ; 1350 acc_mode = MAY_READ;
1353 fmode = FMODE_READ;
1354 } else { 1351 } else {
1355 result = 0; 1352 result = 0;
1356 goto out_putname; 1353 goto out_putname;
@@ -1361,7 +1358,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1361 if (result) 1358 if (result)
1362 goto out_putname; 1359 goto out_putname;
1363 1360
1364 result = may_open(&nd.path, acc_mode, fmode); 1361 result = may_open(&nd.path, acc_mode, flags);
1365 if (result) 1362 if (result)
1366 goto out_putpath; 1363 goto out_putpath;
1367 1364
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea8384d3caa7..899ca51be5e8 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -46,15 +46,13 @@ static struct genl_family family = {
46 .maxattr = TASKSTATS_CMD_ATTR_MAX, 46 .maxattr = TASKSTATS_CMD_ATTR_MAX,
47}; 47};
48 48
49static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] 49static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
50__read_mostly = {
51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 50 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 51 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 52 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 53 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
55 54
56static struct nla_policy 55static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
57cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 56 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
59}; 57};
60 58
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 13700833c181..1f663d23e85e 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -453,6 +453,18 @@ static inline int clocksource_watchdog_kthread(void *data) { return 0; }
453#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 453#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
454 454
455/** 455/**
456 * clocksource_suspend - suspend the clocksource(s)
457 */
458void clocksource_suspend(void)
459{
460 struct clocksource *cs;
461
462 list_for_each_entry_reverse(cs, &clocksource_list, list)
463 if (cs->suspend)
464 cs->suspend(cs);
465}
466
467/**
456 * clocksource_resume - resume the clocksource(s) 468 * clocksource_resume - resume the clocksource(s)
457 */ 469 */
458void clocksource_resume(void) 470void clocksource_resume(void)
@@ -461,7 +473,7 @@ void clocksource_resume(void)
461 473
462 list_for_each_entry(cs, &clocksource_list, list) 474 list_for_each_entry(cs, &clocksource_list, list)
463 if (cs->resume) 475 if (cs->resume)
464 cs->resume(); 476 cs->resume(cs);
465 477
466 clocksource_resume_watchdog(); 478 clocksource_resume_watchdog();
467} 479}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4800f933910e..7c0f180d6e9d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -58,10 +58,10 @@ static s64 time_offset;
58static long time_constant = 2; 58static long time_constant = 2;
59 59
60/* maximum error (usecs): */ 60/* maximum error (usecs): */
61long time_maxerror = NTP_PHASE_LIMIT; 61static long time_maxerror = NTP_PHASE_LIMIT;
62 62
63/* estimated error (usecs): */ 63/* estimated error (usecs): */
64long time_esterror = NTP_PHASE_LIMIT; 64static long time_esterror = NTP_PHASE_LIMIT;
65 65
66/* frequency offset (scaled nsecs/secs): */ 66/* frequency offset (scaled nsecs/secs): */
67static s64 time_freq; 67static s64 time_freq;
@@ -142,11 +142,11 @@ static void ntp_update_offset(long offset)
142 * Select how the frequency is to be controlled 142 * Select how the frequency is to be controlled
143 * and in which mode (PLL or FLL). 143 * and in which mode (PLL or FLL).
144 */ 144 */
145 secs = xtime.tv_sec - time_reftime; 145 secs = get_seconds() - time_reftime;
146 if (unlikely(time_status & STA_FREQHOLD)) 146 if (unlikely(time_status & STA_FREQHOLD))
147 secs = 0; 147 secs = 0;
148 148
149 time_reftime = xtime.tv_sec; 149 time_reftime = get_seconds();
150 150
151 offset64 = offset; 151 offset64 = offset;
152 freq_adj = (offset64 * secs) << 152 freq_adj = (offset64 * secs) <<
@@ -368,7 +368,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
368 * reference time to current time. 368 * reference time to current time.
369 */ 369 */
370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) 370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
371 time_reftime = xtime.tv_sec; 371 time_reftime = get_seconds();
372 372
373 /* only set allowed bits */ 373 /* only set allowed bits */
374 time_status &= STA_RONLY; 374 time_status &= STA_RONLY;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e2ab064c6d41..16736379a9ca 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -622,6 +622,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
622 write_sequnlock_irqrestore(&xtime_lock, flags); 622 write_sequnlock_irqrestore(&xtime_lock, flags);
623 623
624 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 624 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
625 clocksource_suspend();
625 626
626 return 0; 627 return 0;
627} 628}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 60e2ce0181ee..13e13d428cd3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -328,15 +328,6 @@ config BRANCH_TRACER
328 328
329 Say N if unsure. 329 Say N if unsure.
330 330
331config POWER_TRACER
332 bool "Trace power consumption behavior"
333 depends on X86
334 select GENERIC_TRACER
335 help
336 This tracer helps developers to analyze and optimize the kernel's
337 power management decisions, specifically the C-state and P-state
338 behavior.
339
340config KSYM_TRACER 331config KSYM_TRACER
341 bool "Trace read and write access on kernel memory locations" 332 bool "Trace read and write access on kernel memory locations"
342 depends on HAVE_HW_BREAKPOINT 333 depends on HAVE_HW_BREAKPOINT
@@ -449,7 +440,7 @@ config BLK_DEV_IO_TRACE
449 440
450config KPROBE_EVENT 441config KPROBE_EVENT
451 depends on KPROBES 442 depends on KPROBES
452 depends on X86 443 depends on HAVE_REGS_AND_STACK_ACCESS_API
453 bool "Enable kprobes-based dynamic events" 444 bool "Enable kprobes-based dynamic events"
454 select TRACING 445 select TRACING
455 default y 446 default y
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d9d6206e0b14..07f945a99430 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -540,9 +540,10 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
540 if (ret) 540 if (ret)
541 return ret; 541 return ret;
542 542
543 if (copy_to_user(arg, &buts, sizeof(buts))) 543 if (copy_to_user(arg, &buts, sizeof(buts))) {
544 blk_trace_remove(q);
544 return -EFAULT; 545 return -EFAULT;
545 546 }
546 return 0; 547 return 0;
547} 548}
548EXPORT_SYMBOL_GPL(blk_trace_setup); 549EXPORT_SYMBOL_GPL(blk_trace_setup);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1904797f4a8a..83783579378f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2402,6 +2402,7 @@ static const struct file_operations ftrace_notrace_fops = {
2402static DEFINE_MUTEX(graph_lock); 2402static DEFINE_MUTEX(graph_lock);
2403 2403
2404int ftrace_graph_count; 2404int ftrace_graph_count;
2405int ftrace_graph_filter_enabled;
2405unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2406unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2406 2407
2407static void * 2408static void *
@@ -2424,7 +2425,7 @@ static void *g_start(struct seq_file *m, loff_t *pos)
2424 mutex_lock(&graph_lock); 2425 mutex_lock(&graph_lock);
2425 2426
2426 /* Nothing, tell g_show to print all functions are enabled */ 2427 /* Nothing, tell g_show to print all functions are enabled */
2427 if (!ftrace_graph_count && !*pos) 2428 if (!ftrace_graph_filter_enabled && !*pos)
2428 return (void *)1; 2429 return (void *)1;
2429 2430
2430 return __g_next(m, pos); 2431 return __g_next(m, pos);
@@ -2470,6 +2471,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2470 mutex_lock(&graph_lock); 2471 mutex_lock(&graph_lock);
2471 if ((file->f_mode & FMODE_WRITE) && 2472 if ((file->f_mode & FMODE_WRITE) &&
2472 (file->f_flags & O_TRUNC)) { 2473 (file->f_flags & O_TRUNC)) {
2474 ftrace_graph_filter_enabled = 0;
2473 ftrace_graph_count = 0; 2475 ftrace_graph_count = 0;
2474 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2476 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2475 } 2477 }
@@ -2495,7 +2497,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2495 struct dyn_ftrace *rec; 2497 struct dyn_ftrace *rec;
2496 struct ftrace_page *pg; 2498 struct ftrace_page *pg;
2497 int search_len; 2499 int search_len;
2498 int found = 0; 2500 int fail = 1;
2499 int type, not; 2501 int type, not;
2500 char *search; 2502 char *search;
2501 bool exists; 2503 bool exists;
@@ -2506,37 +2508,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2506 2508
2507 /* decode regex */ 2509 /* decode regex */
2508 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 2510 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2509 if (not) 2511 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
2510 return -EINVAL; 2512 return -EBUSY;
2511 2513
2512 search_len = strlen(search); 2514 search_len = strlen(search);
2513 2515
2514 mutex_lock(&ftrace_lock); 2516 mutex_lock(&ftrace_lock);
2515 do_for_each_ftrace_rec(pg, rec) { 2517 do_for_each_ftrace_rec(pg, rec) {
2516 2518
2517 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2518 break;
2519
2520 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 2519 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
2521 continue; 2520 continue;
2522 2521
2523 if (ftrace_match_record(rec, search, search_len, type)) { 2522 if (ftrace_match_record(rec, search, search_len, type)) {
2524 /* ensure it is not already in the array */ 2523 /* if it is in the array */
2525 exists = false; 2524 exists = false;
2526 for (i = 0; i < *idx; i++) 2525 for (i = 0; i < *idx; i++) {
2527 if (array[i] == rec->ip) { 2526 if (array[i] == rec->ip) {
2528 exists = true; 2527 exists = true;
2529 break; 2528 break;
2530 } 2529 }
2531 if (!exists) 2530 }
2532 array[(*idx)++] = rec->ip; 2531
2533 found = 1; 2532 if (!not) {
2533 fail = 0;
2534 if (!exists) {
2535 array[(*idx)++] = rec->ip;
2536 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2537 goto out;
2538 }
2539 } else {
2540 if (exists) {
2541 array[i] = array[--(*idx)];
2542 array[*idx] = 0;
2543 fail = 0;
2544 }
2545 }
2534 } 2546 }
2535 } while_for_each_ftrace_rec(); 2547 } while_for_each_ftrace_rec();
2536 2548out:
2537 mutex_unlock(&ftrace_lock); 2549 mutex_unlock(&ftrace_lock);
2538 2550
2539 return found ? 0 : -EINVAL; 2551 if (fail)
2552 return -EINVAL;
2553
2554 ftrace_graph_filter_enabled = 1;
2555 return 0;
2540} 2556}
2541 2557
2542static ssize_t 2558static ssize_t
@@ -2546,16 +2562,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2546 struct trace_parser parser; 2562 struct trace_parser parser;
2547 ssize_t read, ret; 2563 ssize_t read, ret;
2548 2564
2549 if (!cnt || cnt < 0) 2565 if (!cnt)
2550 return 0; 2566 return 0;
2551 2567
2552 mutex_lock(&graph_lock); 2568 mutex_lock(&graph_lock);
2553 2569
2554 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
2555 ret = -EBUSY;
2556 goto out_unlock;
2557 }
2558
2559 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { 2570 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2560 ret = -ENOMEM; 2571 ret = -ENOMEM;
2561 goto out_unlock; 2572 goto out_unlock;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8c1b2d290718..0287f9f52f5a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -20,6 +20,7 @@
20#include <linux/cpu.h> 20#include <linux/cpu.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22 22
23#include <asm/local.h>
23#include "trace.h" 24#include "trace.h"
24 25
25/* 26/*
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index b2477caf09c2..df74c7982255 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -8,6 +8,7 @@
8#include <linux/kthread.h> 8#include <linux/kthread.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <asm/local.h>
11 12
12struct rb_page { 13struct rb_page {
13 u64 ts; 14 u64 ts;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index eac6875cb990..ed01fdba4a55 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -32,6 +32,7 @@
32#include <linux/splice.h> 32#include <linux/splice.h>
33#include <linux/kdebug.h> 33#include <linux/kdebug.h>
34#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/rwsem.h>
35#include <linux/ctype.h> 36#include <linux/ctype.h>
36#include <linux/init.h> 37#include <linux/init.h>
37#include <linux/poll.h> 38#include <linux/poll.h>
@@ -91,20 +92,17 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled);
91static inline void ftrace_disable_cpu(void) 92static inline void ftrace_disable_cpu(void)
92{ 93{
93 preempt_disable(); 94 preempt_disable();
94 __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); 95 __this_cpu_inc(ftrace_cpu_disabled);
95} 96}
96 97
97static inline void ftrace_enable_cpu(void) 98static inline void ftrace_enable_cpu(void)
98{ 99{
99 __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); 100 __this_cpu_dec(ftrace_cpu_disabled);
100 preempt_enable(); 101 preempt_enable();
101} 102}
102 103
103static cpumask_var_t __read_mostly tracing_buffer_mask; 104static cpumask_var_t __read_mostly tracing_buffer_mask;
104 105
105/* Define which cpu buffers are currently read in trace_pipe */
106static cpumask_var_t tracing_reader_cpumask;
107
108#define for_each_tracing_cpu(cpu) \ 106#define for_each_tracing_cpu(cpu) \
109 for_each_cpu(cpu, tracing_buffer_mask) 107 for_each_cpu(cpu, tracing_buffer_mask)
110 108
@@ -243,12 +241,91 @@ static struct tracer *current_trace __read_mostly;
243 241
244/* 242/*
245 * trace_types_lock is used to protect the trace_types list. 243 * trace_types_lock is used to protect the trace_types list.
246 * This lock is also used to keep user access serialized.
247 * Accesses from userspace will grab this lock while userspace
248 * activities happen inside the kernel.
249 */ 244 */
250static DEFINE_MUTEX(trace_types_lock); 245static DEFINE_MUTEX(trace_types_lock);
251 246
247/*
248 * serialize the access of the ring buffer
249 *
250 * ring buffer serializes readers, but it is low level protection.
251 * The validity of the events (which returns by ring_buffer_peek() ..etc)
252 * are not protected by ring buffer.
253 *
254 * The content of events may become garbage if we allow other process consumes
255 * these events concurrently:
256 * A) the page of the consumed events may become a normal page
257 * (not reader page) in ring buffer, and this page will be rewrited
258 * by events producer.
259 * B) The page of the consumed events may become a page for splice_read,
260 * and this page will be returned to system.
261 *
262 * These primitives allow multi process access to different cpu ring buffer
263 * concurrently.
264 *
265 * These primitives don't distinguish read-only and read-consume access.
266 * Multi read-only access are also serialized.
267 */
268
269#ifdef CONFIG_SMP
270static DECLARE_RWSEM(all_cpu_access_lock);
271static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
272
273static inline void trace_access_lock(int cpu)
274{
275 if (cpu == TRACE_PIPE_ALL_CPU) {
276 /* gain it for accessing the whole ring buffer. */
277 down_write(&all_cpu_access_lock);
278 } else {
279 /* gain it for accessing a cpu ring buffer. */
280
281 /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
282 down_read(&all_cpu_access_lock);
283
284 /* Secondly block other access to this @cpu ring buffer. */
285 mutex_lock(&per_cpu(cpu_access_lock, cpu));
286 }
287}
288
289static inline void trace_access_unlock(int cpu)
290{
291 if (cpu == TRACE_PIPE_ALL_CPU) {
292 up_write(&all_cpu_access_lock);
293 } else {
294 mutex_unlock(&per_cpu(cpu_access_lock, cpu));
295 up_read(&all_cpu_access_lock);
296 }
297}
298
299static inline void trace_access_lock_init(void)
300{
301 int cpu;
302
303 for_each_possible_cpu(cpu)
304 mutex_init(&per_cpu(cpu_access_lock, cpu));
305}
306
307#else
308
309static DEFINE_MUTEX(access_lock);
310
311static inline void trace_access_lock(int cpu)
312{
313 (void)cpu;
314 mutex_lock(&access_lock);
315}
316
317static inline void trace_access_unlock(int cpu)
318{
319 (void)cpu;
320 mutex_unlock(&access_lock);
321}
322
323static inline void trace_access_lock_init(void)
324{
325}
326
327#endif
328
252/* trace_wait is a waitqueue for tasks blocked on trace_poll */ 329/* trace_wait is a waitqueue for tasks blocked on trace_poll */
253static DECLARE_WAIT_QUEUE_HEAD(trace_wait); 330static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
254 331
@@ -1089,7 +1166,7 @@ trace_function(struct trace_array *tr,
1089 struct ftrace_entry *entry; 1166 struct ftrace_entry *entry;
1090 1167
1091 /* If we are reading the ring buffer, don't trace */ 1168 /* If we are reading the ring buffer, don't trace */
1092 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 1169 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
1093 return; 1170 return;
1094 1171
1095 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), 1172 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1320,8 +1397,10 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1320 entry->fmt = fmt; 1397 entry->fmt = fmt;
1321 1398
1322 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1399 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1323 if (!filter_check_discard(call, entry, buffer, event)) 1400 if (!filter_check_discard(call, entry, buffer, event)) {
1324 ring_buffer_unlock_commit(buffer, event); 1401 ring_buffer_unlock_commit(buffer, event);
1402 ftrace_trace_stack(buffer, flags, 6, pc);
1403 }
1325 1404
1326out_unlock: 1405out_unlock:
1327 arch_spin_unlock(&trace_buf_lock); 1406 arch_spin_unlock(&trace_buf_lock);
@@ -1394,8 +1473,10 @@ int trace_array_vprintk(struct trace_array *tr,
1394 1473
1395 memcpy(&entry->buf, trace_buf, len); 1474 memcpy(&entry->buf, trace_buf, len);
1396 entry->buf[len] = '\0'; 1475 entry->buf[len] = '\0';
1397 if (!filter_check_discard(call, entry, buffer, event)) 1476 if (!filter_check_discard(call, entry, buffer, event)) {
1398 ring_buffer_unlock_commit(buffer, event); 1477 ring_buffer_unlock_commit(buffer, event);
1478 ftrace_trace_stack(buffer, irq_flags, 6, pc);
1479 }
1399 1480
1400 out_unlock: 1481 out_unlock:
1401 arch_spin_unlock(&trace_buf_lock); 1482 arch_spin_unlock(&trace_buf_lock);
@@ -1585,12 +1666,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1585} 1666}
1586 1667
1587/* 1668/*
1588 * No necessary locking here. The worst thing which can
1589 * happen is loosing events consumed at the same time
1590 * by a trace_pipe reader.
1591 * Other than that, we don't risk to crash the ring buffer
1592 * because it serializes the readers.
1593 *
1594 * The current tracer is copied to avoid a global locking 1669 * The current tracer is copied to avoid a global locking
1595 * all around. 1670 * all around.
1596 */ 1671 */
@@ -1645,12 +1720,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1645 } 1720 }
1646 1721
1647 trace_event_read_lock(); 1722 trace_event_read_lock();
1723 trace_access_lock(cpu_file);
1648 return p; 1724 return p;
1649} 1725}
1650 1726
1651static void s_stop(struct seq_file *m, void *p) 1727static void s_stop(struct seq_file *m, void *p)
1652{ 1728{
1729 struct trace_iterator *iter = m->private;
1730
1653 atomic_dec(&trace_record_cmdline_disabled); 1731 atomic_dec(&trace_record_cmdline_disabled);
1732 trace_access_unlock(iter->cpu_file);
1654 trace_event_read_unlock(); 1733 trace_event_read_unlock();
1655} 1734}
1656 1735
@@ -2841,22 +2920,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2841 2920
2842 mutex_lock(&trace_types_lock); 2921 mutex_lock(&trace_types_lock);
2843 2922
2844 /* We only allow one reader per cpu */
2845 if (cpu_file == TRACE_PIPE_ALL_CPU) {
2846 if (!cpumask_empty(tracing_reader_cpumask)) {
2847 ret = -EBUSY;
2848 goto out;
2849 }
2850 cpumask_setall(tracing_reader_cpumask);
2851 } else {
2852 if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
2853 cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
2854 else {
2855 ret = -EBUSY;
2856 goto out;
2857 }
2858 }
2859
2860 /* create a buffer to store the information to pass to userspace */ 2923 /* create a buffer to store the information to pass to userspace */
2861 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2924 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2862 if (!iter) { 2925 if (!iter) {
@@ -2912,12 +2975,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2912 2975
2913 mutex_lock(&trace_types_lock); 2976 mutex_lock(&trace_types_lock);
2914 2977
2915 if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
2916 cpumask_clear(tracing_reader_cpumask);
2917 else
2918 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2919
2920
2921 if (iter->trace->pipe_close) 2978 if (iter->trace->pipe_close)
2922 iter->trace->pipe_close(iter); 2979 iter->trace->pipe_close(iter);
2923 2980
@@ -3079,6 +3136,7 @@ waitagain:
3079 iter->pos = -1; 3136 iter->pos = -1;
3080 3137
3081 trace_event_read_lock(); 3138 trace_event_read_lock();
3139 trace_access_lock(iter->cpu_file);
3082 while (find_next_entry_inc(iter) != NULL) { 3140 while (find_next_entry_inc(iter) != NULL) {
3083 enum print_line_t ret; 3141 enum print_line_t ret;
3084 int len = iter->seq.len; 3142 int len = iter->seq.len;
@@ -3095,6 +3153,7 @@ waitagain:
3095 if (iter->seq.len >= cnt) 3153 if (iter->seq.len >= cnt)
3096 break; 3154 break;
3097 } 3155 }
3156 trace_access_unlock(iter->cpu_file);
3098 trace_event_read_unlock(); 3157 trace_event_read_unlock();
3099 3158
3100 /* Now copy what we have to the user */ 3159 /* Now copy what we have to the user */
@@ -3220,6 +3279,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3220 } 3279 }
3221 3280
3222 trace_event_read_lock(); 3281 trace_event_read_lock();
3282 trace_access_lock(iter->cpu_file);
3223 3283
3224 /* Fill as many pages as possible. */ 3284 /* Fill as many pages as possible. */
3225 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3285 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
@@ -3243,6 +3303,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3243 trace_seq_init(&iter->seq); 3303 trace_seq_init(&iter->seq);
3244 } 3304 }
3245 3305
3306 trace_access_unlock(iter->cpu_file);
3246 trace_event_read_unlock(); 3307 trace_event_read_unlock();
3247 mutex_unlock(&iter->mutex); 3308 mutex_unlock(&iter->mutex);
3248 3309
@@ -3544,10 +3605,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3544 3605
3545 info->read = 0; 3606 info->read = 0;
3546 3607
3608 trace_access_lock(info->cpu);
3547 ret = ring_buffer_read_page(info->tr->buffer, 3609 ret = ring_buffer_read_page(info->tr->buffer,
3548 &info->spare, 3610 &info->spare,
3549 count, 3611 count,
3550 info->cpu, 0); 3612 info->cpu, 0);
3613 trace_access_unlock(info->cpu);
3551 if (ret < 0) 3614 if (ret < 0)
3552 return 0; 3615 return 0;
3553 3616
@@ -3675,6 +3738,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3675 len &= PAGE_MASK; 3738 len &= PAGE_MASK;
3676 } 3739 }
3677 3740
3741 trace_access_lock(info->cpu);
3678 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3742 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3679 3743
3680 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { 3744 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
@@ -3722,6 +3786,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3722 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3786 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3723 } 3787 }
3724 3788
3789 trace_access_unlock(info->cpu);
3725 spd.nr_pages = i; 3790 spd.nr_pages = i;
3726 3791
3727 /* did we read anything? */ 3792 /* did we read anything? */
@@ -4158,6 +4223,8 @@ static __init int tracer_init_debugfs(void)
4158 struct dentry *d_tracer; 4223 struct dentry *d_tracer;
4159 int cpu; 4224 int cpu;
4160 4225
4226 trace_access_lock_init();
4227
4161 d_tracer = tracing_init_dentry(); 4228 d_tracer = tracing_init_dentry();
4162 4229
4163 trace_create_file("tracing_enabled", 0644, d_tracer, 4230 trace_create_file("tracing_enabled", 0644, d_tracer,
@@ -4392,9 +4459,6 @@ __init static int tracer_alloc_buffers(void)
4392 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4459 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4393 goto out_free_buffer_mask; 4460 goto out_free_buffer_mask;
4394 4461
4395 if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4396 goto out_free_tracing_cpumask;
4397
4398 /* To save memory, keep the ring buffer size to its minimum */ 4462 /* To save memory, keep the ring buffer size to its minimum */
4399 if (ring_buffer_expanded) 4463 if (ring_buffer_expanded)
4400 ring_buf_size = trace_buf_size; 4464 ring_buf_size = trace_buf_size;
@@ -4452,8 +4516,6 @@ __init static int tracer_alloc_buffers(void)
4452 return 0; 4516 return 0;
4453 4517
4454out_free_cpumask: 4518out_free_cpumask:
4455 free_cpumask_var(tracing_reader_cpumask);
4456out_free_tracing_cpumask:
4457 free_cpumask_var(tracing_cpumask); 4519 free_cpumask_var(tracing_cpumask);
4458out_free_buffer_mask: 4520out_free_buffer_mask:
4459 free_cpumask_var(tracing_buffer_mask); 4521 free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4df6a77eb196..fd05bcaf91b0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -497,6 +497,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
497#ifdef CONFIG_DYNAMIC_FTRACE 497#ifdef CONFIG_DYNAMIC_FTRACE
498/* TODO: make this variable */ 498/* TODO: make this variable */
499#define FTRACE_GRAPH_MAX_FUNCS 32 499#define FTRACE_GRAPH_MAX_FUNCS 32
500extern int ftrace_graph_filter_enabled;
500extern int ftrace_graph_count; 501extern int ftrace_graph_count;
501extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; 502extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
502 503
@@ -504,7 +505,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
504{ 505{
505 int i; 506 int i;
506 507
507 if (!ftrace_graph_count || test_tsk_trace_graph(current)) 508 if (!ftrace_graph_filter_enabled)
508 return 1; 509 return 1;
509 510
510 for (i = 0; i < ftrace_graph_count; i++) { 511 for (i = 0; i < ftrace_graph_count; i++) {
@@ -791,7 +792,8 @@ extern const char *__stop___trace_bprintk_fmt[];
791 792
792#undef FTRACE_ENTRY 793#undef FTRACE_ENTRY
793#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ 794#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
794 extern struct ftrace_event_call event_##call; 795 extern struct ftrace_event_call \
796 __attribute__((__aligned__(4))) event_##call;
795#undef FTRACE_ENTRY_DUP 797#undef FTRACE_ENTRY_DUP
796#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ 798#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
797 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 799 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4a194f08f88c..b9bc4d470177 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -307,8 +307,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
307 return -1; 307 return -1;
308 if (percent_a > percent_b) 308 if (percent_a > percent_b)
309 return 1; 309 return 1;
310 else 310
311 return 0; 311 if (a->incorrect < b->incorrect)
312 return -1;
313 if (a->incorrect > b->incorrect)
314 return 1;
315
316 /*
317 * Since the above shows worse (incorrect) cases
318 * first, we continue that by showing best (correct)
319 * cases last.
320 */
321 if (a->correct > b->correct)
322 return -1;
323 if (a->correct < b->correct)
324 return 1;
325
326 return 0;
312} 327}
313 328
314static struct tracer_stat annotated_branch_stats = { 329static struct tracer_stat annotated_branch_stats = {
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 189b09baf4fb..3f972ad98d04 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -60,10 +60,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
60 return 0; 60 return 0;
61 61
62err: 62err:
63 if (field) { 63 if (field)
64 kfree(field->name); 64 kfree(field->name);
65 kfree(field->type);
66 }
67 kfree(field); 65 kfree(field);
68 66
69 return -ENOMEM; 67 return -ENOMEM;
@@ -520,41 +518,16 @@ out:
520 return ret; 518 return ret;
521} 519}
522 520
523extern char *__bad_type_size(void);
524
525#undef FIELD
526#define FIELD(type, name) \
527 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
528 #type, "common_" #name, offsetof(typeof(field), name), \
529 sizeof(field.name), is_signed_type(type)
530
531static int trace_write_header(struct trace_seq *s)
532{
533 struct trace_entry field;
534
535 /* struct trace_entry */
536 return trace_seq_printf(s,
537 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
538 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
539 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
540 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
541 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
542 "\n",
543 FIELD(unsigned short, type),
544 FIELD(unsigned char, flags),
545 FIELD(unsigned char, preempt_count),
546 FIELD(int, pid),
547 FIELD(int, lock_depth));
548}
549
550static ssize_t 521static ssize_t
551event_format_read(struct file *filp, char __user *ubuf, size_t cnt, 522event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
552 loff_t *ppos) 523 loff_t *ppos)
553{ 524{
554 struct ftrace_event_call *call = filp->private_data; 525 struct ftrace_event_call *call = filp->private_data;
526 struct ftrace_event_field *field;
555 struct trace_seq *s; 527 struct trace_seq *s;
528 int common_field_count = 5;
556 char *buf; 529 char *buf;
557 int r; 530 int r = 0;
558 531
559 if (*ppos) 532 if (*ppos)
560 return 0; 533 return 0;
@@ -565,14 +538,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
565 538
566 trace_seq_init(s); 539 trace_seq_init(s);
567 540
568 /* If any of the first writes fail, so will the show_format. */
569
570 trace_seq_printf(s, "name: %s\n", call->name); 541 trace_seq_printf(s, "name: %s\n", call->name);
571 trace_seq_printf(s, "ID: %d\n", call->id); 542 trace_seq_printf(s, "ID: %d\n", call->id);
572 trace_seq_printf(s, "format:\n"); 543 trace_seq_printf(s, "format:\n");
573 trace_write_header(s);
574 544
575 r = call->show_format(call, s); 545 list_for_each_entry_reverse(field, &call->fields, link) {
546 /*
547 * Smartly shows the array type(except dynamic array).
548 * Normal:
549 * field:TYPE VAR
550 * If TYPE := TYPE[LEN], it is shown:
551 * field:TYPE VAR[LEN]
552 */
553 const char *array_descriptor = strchr(field->type, '[');
554
555 if (!strncmp(field->type, "__data_loc", 10))
556 array_descriptor = NULL;
557
558 if (!array_descriptor) {
559 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
560 "\tsize:%u;\tsigned:%d;\n",
561 field->type, field->name, field->offset,
562 field->size, !!field->is_signed);
563 } else {
564 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
565 "\tsize:%u;\tsigned:%d;\n",
566 (int)(array_descriptor - field->type),
567 field->type, field->name,
568 array_descriptor, field->offset,
569 field->size, !!field->is_signed);
570 }
571
572 if (--common_field_count == 0)
573 r = trace_seq_printf(s, "\n");
574
575 if (!r)
576 break;
577 }
578
579 if (r)
580 r = trace_seq_printf(s, "\nprint fmt: %s\n",
581 call->print_fmt);
582
576 if (!r) { 583 if (!r) {
577 /* 584 /*
578 * ug! The format output is bigger than a PAGE!! 585 * ug! The format output is bigger than a PAGE!!
@@ -948,10 +955,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
948 filter); 955 filter);
949 } 956 }
950 957
951 /* A trace may not want to export its format */
952 if (!call->show_format)
953 return 0;
954
955 trace_create_file("format", 0444, call->dir, call, 958 trace_create_file("format", 0444, call->dir, call,
956 format); 959 format);
957 960
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d4fa5dc1ee4e..e091f64ba6ce 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \
62 62
63#include "trace_entries.h" 63#include "trace_entries.h"
64 64
65
66#undef __field
67#define __field(type, item) \
68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
70 offsetof(typeof(field), item), \
71 sizeof(field.item), is_signed_type(type)); \
72 if (!ret) \
73 return 0;
74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item), \
81 is_signed_type(type)); \
82 if (!ret) \
83 return 0;
84
85#undef __array
86#define __array(type, item, len) \
87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
89 offsetof(typeof(field), item), \
90 sizeof(field.item), is_signed_type(type)); \
91 if (!ret) \
92 return 0;
93
94#undef __array_desc
95#define __array_desc(type, container, item, len) \
96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
98 offsetof(typeof(field), container.item), \
99 sizeof(field.container.item), \
100 is_signed_type(type)); \
101 if (!ret) \
102 return 0;
103
104#undef __dynamic_array
105#define __dynamic_array(type, item) \
106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
110 if (!ret) \
111 return 0;
112
113#undef F_printk
114#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
115
116#undef __entry
117#define __entry REC
118
119#undef FTRACE_ENTRY
120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
121static int \
122ftrace_format_##name(struct ftrace_event_call *unused, \
123 struct trace_seq *s) \
124{ \
125 struct struct_name field __attribute__((unused)); \
126 int ret = 0; \
127 \
128 tstruct; \
129 \
130 trace_seq_printf(s, "\nprint fmt: " print); \
131 \
132 return ret; \
133}
134
135#include "trace_entries.h"
136
137#undef __field 65#undef __field
138#define __field(type, item) \ 66#define __field(type, item) \
139 ret = trace_define_field(event_call, #type, #item, \ 67 ret = trace_define_field(event_call, #type, #item, \
@@ -175,7 +103,12 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
175 return ret; 103 return ret;
176 104
177#undef __dynamic_array 105#undef __dynamic_array
178#define __dynamic_array(type, item) 106#define __dynamic_array(type, item) \
107 ret = trace_define_field(event_call, #type, #item, \
108 offsetof(typeof(field), item), \
109 0, is_signed_type(type), FILTER_OTHER);\
110 if (ret) \
111 return ret;
179 112
180#undef FTRACE_ENTRY 113#undef FTRACE_ENTRY
181#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 114#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
@@ -198,6 +131,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
198 return 0; 131 return 0;
199} 132}
200 133
134#undef __entry
135#define __entry REC
136
201#undef __field 137#undef __field
202#define __field(type, item) 138#define __field(type, item)
203 139
@@ -213,6 +149,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
213#undef __dynamic_array 149#undef __dynamic_array
214#define __dynamic_array(type, item) 150#define __dynamic_array(type, item)
215 151
152#undef F_printk
153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154
216#undef FTRACE_ENTRY 155#undef FTRACE_ENTRY
217#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 156#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
218 \ 157 \
@@ -223,7 +162,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
223 .id = type, \ 162 .id = type, \
224 .system = __stringify(TRACE_SYSTEM), \ 163 .system = __stringify(TRACE_SYSTEM), \
225 .raw_init = ftrace_raw_init_event, \ 164 .raw_init = ftrace_raw_init_event, \
226 .show_format = ftrace_format_##call, \ 165 .print_fmt = print, \
227 .define_fields = ftrace_define_fields_##call, \ 166 .define_fields = ftrace_define_fields_##call, \
228}; \ 167}; \
229 168
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b1342c5d37cf..3fc2a575664f 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -18,6 +18,7 @@ struct fgraph_cpu_data {
18 pid_t last_pid; 18 pid_t last_pid;
19 int depth; 19 int depth;
20 int ignore; 20 int ignore;
21 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
21}; 22};
22 23
23struct fgraph_data { 24struct fgraph_data {
@@ -187,7 +188,7 @@ static int __trace_graph_entry(struct trace_array *tr,
187 struct ring_buffer *buffer = tr->buffer; 188 struct ring_buffer *buffer = tr->buffer;
188 struct ftrace_graph_ent_entry *entry; 189 struct ftrace_graph_ent_entry *entry;
189 190
190 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 191 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
191 return 0; 192 return 0;
192 193
193 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, 194 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -212,13 +213,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
212 int cpu; 213 int cpu;
213 int pc; 214 int pc;
214 215
215 if (unlikely(!tr))
216 return 0;
217
218 if (!ftrace_trace_task(current)) 216 if (!ftrace_trace_task(current))
219 return 0; 217 return 0;
220 218
221 if (!ftrace_graph_addr(trace->func)) 219 /* trace it when it is-nested-in or is a function enabled. */
220 if (!(trace->depth || ftrace_graph_addr(trace->func)))
222 return 0; 221 return 0;
223 222
224 local_irq_save(flags); 223 local_irq_save(flags);
@@ -231,9 +230,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
231 } else { 230 } else {
232 ret = 0; 231 ret = 0;
233 } 232 }
234 /* Only do the atomic if it is not already set */
235 if (!test_tsk_trace_graph(current))
236 set_tsk_trace_graph(current);
237 233
238 atomic_dec(&data->disabled); 234 atomic_dec(&data->disabled);
239 local_irq_restore(flags); 235 local_irq_restore(flags);
@@ -251,7 +247,7 @@ static void __trace_graph_return(struct trace_array *tr,
251 struct ring_buffer *buffer = tr->buffer; 247 struct ring_buffer *buffer = tr->buffer;
252 struct ftrace_graph_ret_entry *entry; 248 struct ftrace_graph_ret_entry *entry;
253 249
254 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 250 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
255 return; 251 return;
256 252
257 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, 253 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -281,17 +277,24 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
281 pc = preempt_count(); 277 pc = preempt_count();
282 __trace_graph_return(tr, trace, flags, pc); 278 __trace_graph_return(tr, trace, flags, pc);
283 } 279 }
284 if (!trace->depth)
285 clear_tsk_trace_graph(current);
286 atomic_dec(&data->disabled); 280 atomic_dec(&data->disabled);
287 local_irq_restore(flags); 281 local_irq_restore(flags);
288} 282}
289 283
284void set_graph_array(struct trace_array *tr)
285{
286 graph_array = tr;
287
288 /* Make graph_array visible before we start tracing */
289
290 smp_mb();
291}
292
290static int graph_trace_init(struct trace_array *tr) 293static int graph_trace_init(struct trace_array *tr)
291{ 294{
292 int ret; 295 int ret;
293 296
294 graph_array = tr; 297 set_graph_array(tr);
295 ret = register_ftrace_graph(&trace_graph_return, 298 ret = register_ftrace_graph(&trace_graph_return,
296 &trace_graph_entry); 299 &trace_graph_entry);
297 if (ret) 300 if (ret)
@@ -301,11 +304,6 @@ static int graph_trace_init(struct trace_array *tr)
301 return 0; 304 return 0;
302} 305}
303 306
304void set_graph_array(struct trace_array *tr)
305{
306 graph_array = tr;
307}
308
309static void graph_trace_reset(struct trace_array *tr) 307static void graph_trace_reset(struct trace_array *tr)
310{ 308{
311 tracing_stop_cmdline_record(); 309 tracing_stop_cmdline_record();
@@ -673,15 +671,21 @@ print_graph_entry_leaf(struct trace_iterator *iter,
673 duration = graph_ret->rettime - graph_ret->calltime; 671 duration = graph_ret->rettime - graph_ret->calltime;
674 672
675 if (data) { 673 if (data) {
674 struct fgraph_cpu_data *cpu_data;
676 int cpu = iter->cpu; 675 int cpu = iter->cpu;
677 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 676
677 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
678 678
679 /* 679 /*
680 * Comments display at + 1 to depth. Since 680 * Comments display at + 1 to depth. Since
681 * this is a leaf function, keep the comments 681 * this is a leaf function, keep the comments
682 * equal to this depth. 682 * equal to this depth.
683 */ 683 */
684 *depth = call->depth - 1; 684 cpu_data->depth = call->depth - 1;
685
686 /* No need to keep this function around for this depth */
687 if (call->depth < FTRACE_RETFUNC_DEPTH)
688 cpu_data->enter_funcs[call->depth] = 0;
685 } 689 }
686 690
687 /* Overhead */ 691 /* Overhead */
@@ -721,10 +725,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
721 int i; 725 int i;
722 726
723 if (data) { 727 if (data) {
728 struct fgraph_cpu_data *cpu_data;
724 int cpu = iter->cpu; 729 int cpu = iter->cpu;
725 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
726 730
727 *depth = call->depth; 731 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
732 cpu_data->depth = call->depth;
733
734 /* Save this function pointer to see if the exit matches */
735 if (call->depth < FTRACE_RETFUNC_DEPTH)
736 cpu_data->enter_funcs[call->depth] = call->func;
728 } 737 }
729 738
730 /* No overhead */ 739 /* No overhead */
@@ -854,19 +863,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
854 struct fgraph_data *data = iter->private; 863 struct fgraph_data *data = iter->private;
855 pid_t pid = ent->pid; 864 pid_t pid = ent->pid;
856 int cpu = iter->cpu; 865 int cpu = iter->cpu;
866 int func_match = 1;
857 int ret; 867 int ret;
858 int i; 868 int i;
859 869
860 if (data) { 870 if (data) {
871 struct fgraph_cpu_data *cpu_data;
861 int cpu = iter->cpu; 872 int cpu = iter->cpu;
862 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 873
874 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
863 875
864 /* 876 /*
865 * Comments display at + 1 to depth. This is the 877 * Comments display at + 1 to depth. This is the
866 * return from a function, we now want the comments 878 * return from a function, we now want the comments
867 * to display at the same level of the bracket. 879 * to display at the same level of the bracket.
868 */ 880 */
869 *depth = trace->depth - 1; 881 cpu_data->depth = trace->depth - 1;
882
883 if (trace->depth < FTRACE_RETFUNC_DEPTH) {
884 if (cpu_data->enter_funcs[trace->depth] != trace->func)
885 func_match = 0;
886 cpu_data->enter_funcs[trace->depth] = 0;
887 }
870 } 888 }
871 889
872 if (print_graph_prologue(iter, s, 0, 0)) 890 if (print_graph_prologue(iter, s, 0, 0))
@@ -891,9 +909,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
891 return TRACE_TYPE_PARTIAL_LINE; 909 return TRACE_TYPE_PARTIAL_LINE;
892 } 910 }
893 911
894 ret = trace_seq_printf(s, "}\n"); 912 /*
895 if (!ret) 913 * If the return function does not have a matching entry,
896 return TRACE_TYPE_PARTIAL_LINE; 914 * then the entry was lost. Instead of just printing
915 * the '}' and letting the user guess what function this
916 * belongs to, write out the function name.
917 */
918 if (func_match) {
919 ret = trace_seq_printf(s, "}\n");
920 if (!ret)
921 return TRACE_TYPE_PARTIAL_LINE;
922 } else {
923 ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func);
924 if (!ret)
925 return TRACE_TYPE_PARTIAL_LINE;
926 }
897 927
898 /* Overrun */ 928 /* Overrun */
899 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 929 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 356c10227c98..505c92273b1a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -635,12 +635,12 @@ static int create_trace_probe(int argc, char **argv)
635 event = strchr(group, '/') + 1; 635 event = strchr(group, '/') + 1;
636 event[-1] = '\0'; 636 event[-1] = '\0';
637 if (strlen(group) == 0) { 637 if (strlen(group) == 0) {
638 pr_info("Group name is not specifiled\n"); 638 pr_info("Group name is not specified\n");
639 return -EINVAL; 639 return -EINVAL;
640 } 640 }
641 } 641 }
642 if (strlen(event) == 0) { 642 if (strlen(event) == 0) {
643 pr_info("Event name is not specifiled\n"); 643 pr_info("Event name is not specified\n");
644 return -EINVAL; 644 return -EINVAL;
645 } 645 }
646 } 646 }
@@ -1155,80 +1155,60 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1155 return 0; 1155 return 0;
1156} 1156}
1157 1157
1158static int __probe_event_show_format(struct trace_seq *s, 1158static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1159 struct trace_probe *tp, const char *fmt,
1160 const char *arg)
1161{ 1159{
1162 int i; 1160 int i;
1161 int pos = 0;
1163 1162
1164 /* Show format */ 1163 const char *fmt, *arg;
1165 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1166 return 0;
1167 1164
1168 for (i = 0; i < tp->nr_args; i++) 1165 if (!probe_is_return(tp)) {
1169 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) 1166 fmt = "(%lx)";
1170 return 0; 1167 arg = "REC->" FIELD_STRING_IP;
1168 } else {
1169 fmt = "(%lx <- %lx)";
1170 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
1171 }
1171 1172
1172 if (!trace_seq_printf(s, "\", %s", arg)) 1173 /* When len=0, we just calculate the needed length */
1173 return 0; 1174#define LEN_OR_ZERO (len ? len - pos : 0)
1174 1175
1175 for (i = 0; i < tp->nr_args; i++) 1176 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1176 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
1177 return 0;
1178 1177
1179 return trace_seq_puts(s, "\n"); 1178 for (i = 0; i < tp->nr_args; i++) {
1180} 1179 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx",
1180 tp->args[i].name);
1181 }
1181 1182
1182#undef SHOW_FIELD 1183 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1183#define SHOW_FIELD(type, item, name) \
1184 do { \
1185 ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \
1186 "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\
1187 (unsigned int)offsetof(typeof(field), item),\
1188 (unsigned int)sizeof(type), \
1189 is_signed_type(type)); \
1190 if (!ret) \
1191 return 0; \
1192 } while (0)
1193 1184
1194static int kprobe_event_show_format(struct ftrace_event_call *call, 1185 for (i = 0; i < tp->nr_args; i++) {
1195 struct trace_seq *s) 1186 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1196{ 1187 tp->args[i].name);
1197 struct kprobe_trace_entry field __attribute__((unused)); 1188 }
1198 int ret, i;
1199 struct trace_probe *tp = (struct trace_probe *)call->data;
1200
1201 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP);
1202 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1203 1189
1204 /* Show fields */ 1190#undef LEN_OR_ZERO
1205 for (i = 0; i < tp->nr_args; i++)
1206 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1207 trace_seq_puts(s, "\n");
1208 1191
1209 return __probe_event_show_format(s, tp, "(%lx)", 1192 /* return the length of print_fmt */
1210 "REC->" FIELD_STRING_IP); 1193 return pos;
1211} 1194}
1212 1195
1213static int kretprobe_event_show_format(struct ftrace_event_call *call, 1196static int set_print_fmt(struct trace_probe *tp)
1214 struct trace_seq *s)
1215{ 1197{
1216 struct kretprobe_trace_entry field __attribute__((unused)); 1198 int len;
1217 int ret, i; 1199 char *print_fmt;
1218 struct trace_probe *tp = (struct trace_probe *)call->data;
1219 1200
1220 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); 1201 /* First: called with 0 length to calculate the needed length */
1221 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); 1202 len = __set_print_fmt(tp, NULL, 0);
1222 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); 1203 print_fmt = kmalloc(len + 1, GFP_KERNEL);
1204 if (!print_fmt)
1205 return -ENOMEM;
1223 1206
1224 /* Show fields */ 1207 /* Second: actually write the @print_fmt */
1225 for (i = 0; i < tp->nr_args; i++) 1208 __set_print_fmt(tp, print_fmt, len + 1);
1226 SHOW_FIELD(unsigned long, args[i], tp->args[i].name); 1209 tp->call.print_fmt = print_fmt;
1227 trace_seq_puts(s, "\n");
1228 1210
1229 return __probe_event_show_format(s, tp, "(%lx <- %lx)", 1211 return 0;
1230 "REC->" FIELD_STRING_FUNC
1231 ", REC->" FIELD_STRING_RETIP);
1232} 1212}
1233 1213
1234#ifdef CONFIG_PERF_EVENTS 1214#ifdef CONFIG_PERF_EVENTS
@@ -1359,18 +1339,20 @@ static int register_probe_event(struct trace_probe *tp)
1359 if (probe_is_return(tp)) { 1339 if (probe_is_return(tp)) {
1360 tp->event.trace = print_kretprobe_event; 1340 tp->event.trace = print_kretprobe_event;
1361 call->raw_init = probe_event_raw_init; 1341 call->raw_init = probe_event_raw_init;
1362 call->show_format = kretprobe_event_show_format;
1363 call->define_fields = kretprobe_event_define_fields; 1342 call->define_fields = kretprobe_event_define_fields;
1364 } else { 1343 } else {
1365 tp->event.trace = print_kprobe_event; 1344 tp->event.trace = print_kprobe_event;
1366 call->raw_init = probe_event_raw_init; 1345 call->raw_init = probe_event_raw_init;
1367 call->show_format = kprobe_event_show_format;
1368 call->define_fields = kprobe_event_define_fields; 1346 call->define_fields = kprobe_event_define_fields;
1369 } 1347 }
1348 if (set_print_fmt(tp) < 0)
1349 return -ENOMEM;
1370 call->event = &tp->event; 1350 call->event = &tp->event;
1371 call->id = register_ftrace_event(&tp->event); 1351 call->id = register_ftrace_event(&tp->event);
1372 if (!call->id) 1352 if (!call->id) {
1353 kfree(call->print_fmt);
1373 return -ENODEV; 1354 return -ENODEV;
1355 }
1374 call->enabled = 0; 1356 call->enabled = 0;
1375 call->regfunc = probe_event_enable; 1357 call->regfunc = probe_event_enable;
1376 call->unregfunc = probe_event_disable; 1358 call->unregfunc = probe_event_disable;
@@ -1383,6 +1365,7 @@ static int register_probe_event(struct trace_probe *tp)
1383 ret = trace_add_event_call(call); 1365 ret = trace_add_event_call(call);
1384 if (ret) { 1366 if (ret) {
1385 pr_info("Failed to register kprobe event: %s\n", call->name); 1367 pr_info("Failed to register kprobe event: %s\n", call->name);
1368 kfree(call->print_fmt);
1386 unregister_ftrace_event(&tp->event); 1369 unregister_ftrace_event(&tp->event);
1387 } 1370 }
1388 return ret; 1371 return ret;
@@ -1392,6 +1375,7 @@ static void unregister_probe_event(struct trace_probe *tp)
1392{ 1375{
1393 /* tp->event is unregistered in trace_remove_event_call() */ 1376 /* tp->event is unregistered in trace_remove_event_call() */
1394 trace_remove_event_call(&tp->call); 1377 trace_remove_event_call(&tp->call);
1378 kfree(tp->call.print_fmt);
1395} 1379}
1396 1380
1397/* Make a debugfs interface for controling probe points */ 1381/* Make a debugfs interface for controling probe points */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 4e332b9e449c..cba47d7935cc 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -143,70 +143,65 @@ extern char *__bad_type_size(void);
143 #type, #name, offsetof(typeof(trace), name), \ 143 #type, #name, offsetof(typeof(trace), name), \
144 sizeof(trace.name), is_signed_type(type) 144 sizeof(trace.name), is_signed_type(type)
145 145
146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 146static
147int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
147{ 148{
148 int i; 149 int i;
149 int ret; 150 int pos = 0;
150 struct syscall_metadata *entry = call->data;
151 struct syscall_trace_enter trace;
152 int offset = offsetof(struct syscall_trace_enter, args);
153 151
154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 152 /* When len=0, we just calculate the needed length */
155 "\tsigned:%u;\n", 153#define LEN_OR_ZERO (len ? len - pos : 0)
156 SYSCALL_FIELD(int, nr));
157 if (!ret)
158 return 0;
159 154
155 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
160 for (i = 0; i < entry->nb_args; i++) { 156 for (i = 0; i < entry->nb_args; i++) {
161 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], 157 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
162 entry->args[i]); 158 entry->args[i], sizeof(unsigned long),
163 if (!ret) 159 i == entry->nb_args - 1 ? "" : ", ");
164 return 0;
165 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
166 "\tsigned:%u;\n", offset,
167 sizeof(unsigned long),
168 is_signed_type(unsigned long));
169 if (!ret)
170 return 0;
171 offset += sizeof(unsigned long);
172 } 160 }
161 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
173 162
174 trace_seq_puts(s, "\nprint fmt: \"");
175 for (i = 0; i < entry->nb_args; i++) { 163 for (i = 0; i < entry->nb_args; i++) {
176 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], 164 pos += snprintf(buf + pos, LEN_OR_ZERO,
177 sizeof(unsigned long), 165 ", ((unsigned long)(REC->%s))", entry->args[i]);
178 i == entry->nb_args - 1 ? "" : ", ");
179 if (!ret)
180 return 0;
181 } 166 }
182 trace_seq_putc(s, '"');
183 167
184 for (i = 0; i < entry->nb_args; i++) { 168#undef LEN_OR_ZERO
185 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
186 entry->args[i]);
187 if (!ret)
188 return 0;
189 }
190 169
191 return trace_seq_putc(s, '\n'); 170 /* return the length of print_fmt */
171 return pos;
192} 172}
193 173
194int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) 174static int set_syscall_print_fmt(struct ftrace_event_call *call)
195{ 175{
196 int ret; 176 char *print_fmt;
197 struct syscall_trace_exit trace; 177 int len;
178 struct syscall_metadata *entry = call->data;
198 179
199 ret = trace_seq_printf(s, 180 if (entry->enter_event != call) {
200 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 181 call->print_fmt = "\"0x%lx\", REC->ret";
201 "\tsigned:%u;\n"
202 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
203 "\tsigned:%u;\n",
204 SYSCALL_FIELD(int, nr),
205 SYSCALL_FIELD(long, ret));
206 if (!ret)
207 return 0; 182 return 0;
183 }
208 184
209 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); 185 /* First: called with 0 length to calculate the needed length */
186 len = __set_enter_print_fmt(entry, NULL, 0);
187
188 print_fmt = kmalloc(len + 1, GFP_KERNEL);
189 if (!print_fmt)
190 return -ENOMEM;
191
192 /* Second: actually write the @print_fmt */
193 __set_enter_print_fmt(entry, print_fmt, len + 1);
194 call->print_fmt = print_fmt;
195
196 return 0;
197}
198
199static void free_syscall_print_fmt(struct ftrace_event_call *call)
200{
201 struct syscall_metadata *entry = call->data;
202
203 if (entry->enter_event == call)
204 kfree(call->print_fmt);
210} 205}
211 206
212int syscall_enter_define_fields(struct ftrace_event_call *call) 207int syscall_enter_define_fields(struct ftrace_event_call *call)
@@ -386,12 +381,22 @@ int init_syscall_trace(struct ftrace_event_call *call)
386{ 381{
387 int id; 382 int id;
388 383
389 id = register_ftrace_event(call->event); 384 if (set_syscall_print_fmt(call) < 0)
390 if (!id) 385 return -ENOMEM;
391 return -ENODEV; 386
392 call->id = id; 387 id = trace_event_raw_init(call);
393 INIT_LIST_HEAD(&call->fields); 388
394 return 0; 389 if (id < 0) {
390 free_syscall_print_fmt(call);
391 return id;
392 }
393
394 return id;
395}
396
397unsigned long __init arch_syscall_addr(int nr)
398{
399 return (unsigned long)sys_call_table[nr];
395} 400}
396 401
397int __init init_ftrace_syscalls(void) 402int __init init_ftrace_syscalls(void)
@@ -552,7 +557,7 @@ int prof_sysexit_enable(struct ftrace_event_call *call)
552 ret = register_trace_sys_exit(prof_syscall_exit); 557 ret = register_trace_sys_exit(prof_syscall_exit);
553 if (ret) { 558 if (ret) {
554 pr_info("event trace: Could not activate" 559 pr_info("event trace: Could not activate"
555 "syscall entry trace point"); 560 "syscall exit trace point");
556 } else { 561 } else {
557 set_bit(num, enabled_prof_exit_syscalls); 562 set_bit(num, enabled_prof_exit_syscalls);
558 sys_prof_refcount_exit++; 563 sys_prof_refcount_exit++;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 00d59d048edf..0a67e041edf8 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -21,6 +21,7 @@
21#include <linux/tsacct_kern.h> 21#include <linux/tsacct_kern.h>
22#include <linux/acct.h> 22#include <linux/acct.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/mm.h>
24 25
25/* 26/*
26 * fill in basic accounting fields 27 * fill in basic accounting fields
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165ca70c..766467b3bcb7 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -56,9 +56,6 @@ struct user_struct root_user = {
56 .sigpending = ATOMIC_INIT(0), 56 .sigpending = ATOMIC_INIT(0),
57 .locked_shm = 0, 57 .locked_shm = 0,
58 .user_ns = &init_user_ns, 58 .user_ns = &init_user_ns,
59#ifdef CONFIG_USER_SCHED
60 .tg = &init_task_group,
61#endif
62}; 59};
63 60
64/* 61/*
@@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 72 put_user_ns(up->user_ns);
76} 73}
77 74
78#ifdef CONFIG_USER_SCHED
79
80static void sched_destroy_user(struct user_struct *up)
81{
82 sched_destroy_group(up->tg);
83}
84
85static int sched_create_user(struct user_struct *up)
86{
87 int rc = 0;
88
89 up->tg = sched_create_group(&root_task_group);
90 if (IS_ERR(up->tg))
91 rc = -ENOMEM;
92
93 set_tg_uid(up);
94
95 return rc;
96}
97
98#else /* CONFIG_USER_SCHED */
99
100static void sched_destroy_user(struct user_struct *up) { }
101static int sched_create_user(struct user_struct *up) { return 0; }
102
103#endif /* CONFIG_USER_SCHED */
104
105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
125static DEFINE_MUTEX(uids_mutex);
126
127static inline void uids_mutex_lock(void)
128{
129 mutex_lock(&uids_mutex);
130}
131
132static inline void uids_mutex_unlock(void)
133{
134 mutex_unlock(&uids_mutex);
135}
136
137/* uid directory attributes */
138#ifdef CONFIG_FAIR_GROUP_SCHED
139static ssize_t cpu_shares_show(struct kobject *kobj,
140 struct kobj_attribute *attr,
141 char *buf)
142{
143 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144
145 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
146}
147
148static ssize_t cpu_shares_store(struct kobject *kobj,
149 struct kobj_attribute *attr,
150 const char *buf, size_t size)
151{
152 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
153 unsigned long shares;
154 int rc;
155
156 sscanf(buf, "%lu", &shares);
157
158 rc = sched_group_set_shares(up->tg, shares);
159
160 return (rc ? rc : size);
161}
162
163static struct kobj_attribute cpu_share_attr =
164 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
165#endif
166
167#ifdef CONFIG_RT_GROUP_SCHED
168static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169 struct kobj_attribute *attr,
170 char *buf)
171{
172 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
173
174 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
175}
176
177static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
178 struct kobj_attribute *attr,
179 const char *buf, size_t size)
180{
181 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
182 unsigned long rt_runtime;
183 int rc;
184
185 sscanf(buf, "%ld", &rt_runtime);
186
187 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
188
189 return (rc ? rc : size);
190}
191
192static struct kobj_attribute cpu_rt_runtime_attr =
193 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
194
195static ssize_t cpu_rt_period_show(struct kobject *kobj,
196 struct kobj_attribute *attr,
197 char *buf)
198{
199 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
200
201 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
202}
203
204static ssize_t cpu_rt_period_store(struct kobject *kobj,
205 struct kobj_attribute *attr,
206 const char *buf, size_t size)
207{
208 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
209 unsigned long rt_period;
210 int rc;
211
212 sscanf(buf, "%lu", &rt_period);
213
214 rc = sched_group_set_rt_period(up->tg, rt_period);
215
216 return (rc ? rc : size);
217}
218
219static struct kobj_attribute cpu_rt_period_attr =
220 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
221#endif
222
223/* default attributes per uid directory */
224static struct attribute *uids_attributes[] = {
225#ifdef CONFIG_FAIR_GROUP_SCHED
226 &cpu_share_attr.attr,
227#endif
228#ifdef CONFIG_RT_GROUP_SCHED
229 &cpu_rt_runtime_attr.attr,
230 &cpu_rt_period_attr.attr,
231#endif
232 NULL
233};
234
235/* the lifetime of user_struct is not managed by the core (now) */
236static void uids_release(struct kobject *kobj)
237{
238 return;
239}
240
241static struct kobj_type uids_ktype = {
242 .sysfs_ops = &kobj_sysfs_ops,
243 .default_attrs = uids_attributes,
244 .release = uids_release,
245};
246
247/*
248 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
249 * We do not create this file for users in a user namespace (until
250 * sysfs tagging is implemented).
251 *
252 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
253 */
254static int uids_user_create(struct user_struct *up)
255{
256 struct kobject *kobj = &up->kobj;
257 int error;
258
259 memset(kobj, 0, sizeof(struct kobject));
260 if (up->user_ns != &init_user_ns)
261 return 0;
262 kobj->kset = uids_kset;
263 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
264 if (error) {
265 kobject_put(kobj);
266 goto done;
267 }
268
269 kobject_uevent(kobj, KOBJ_ADD);
270done:
271 return error;
272}
273
274/* create these entries in sysfs:
275 * "/sys/kernel/uids" directory
276 * "/sys/kernel/uids/0" directory (for root user)
277 * "/sys/kernel/uids/0/cpu_share" file (for root user)
278 */
279int __init uids_sysfs_init(void)
280{
281 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
282 if (!uids_kset)
283 return -ENOMEM;
284
285 return uids_user_create(&root_user);
286}
287
288/* delayed work function to remove sysfs directory for a user and free up
289 * corresponding structures.
290 */
291static void cleanup_user_struct(struct work_struct *w)
292{
293 struct user_struct *up = container_of(w, struct user_struct, work.work);
294 unsigned long flags;
295 int remove_user = 0;
296
297 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
298 * atomic.
299 */
300 uids_mutex_lock();
301
302 spin_lock_irqsave(&uidhash_lock, flags);
303 if (atomic_read(&up->__count) == 0) {
304 uid_hash_remove(up);
305 remove_user = 1;
306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
308
309 if (!remove_user)
310 goto done;
311
312 if (up->user_ns == &init_user_ns) {
313 kobject_uevent(&up->kobj, KOBJ_REMOVE);
314 kobject_del(&up->kobj);
315 kobject_put(&up->kobj);
316 }
317
318 sched_destroy_user(up);
319 key_put(up->uid_keyring);
320 key_put(up->session_keyring);
321 kmem_cache_free(uid_cachep, up);
322
323done:
324 uids_mutex_unlock();
325}
326
327/* IRQs are disabled and uidhash_lock is held upon function entry.
328 * IRQ state (as stored in flags) is restored and uidhash_lock released
329 * upon function exit.
330 */
331static void free_user(struct user_struct *up, unsigned long flags)
332{
333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336}
337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 75static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{ 76{
342 struct user_struct *user; 77 struct user_struct *user;
@@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
352 return NULL; 87 return NULL;
353} 88}
354 89
355int uids_sysfs_init(void) { return 0; }
356static inline int uids_user_create(struct user_struct *up) { return 0; }
357static inline void uids_mutex_lock(void) { }
358static inline void uids_mutex_unlock(void) { }
359
360/* IRQs are disabled and uidhash_lock is held upon function entry. 90/* IRQs are disabled and uidhash_lock is held upon function entry.
361 * IRQ state (as stored in flags) is restored and uidhash_lock released 91 * IRQ state (as stored in flags) is restored and uidhash_lock released
362 * upon function exit. 92 * upon function exit.
@@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags)
365{ 95{
366 uid_hash_remove(up); 96 uid_hash_remove(up);
367 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
368 sched_destroy_user(up);
369 key_put(up->uid_keyring); 98 key_put(up->uid_keyring);
370 key_put(up->session_keyring); 99 key_put(up->session_keyring);
371 kmem_cache_free(uid_cachep, up); 100 kmem_cache_free(uid_cachep, up);
372} 101}
373 102
374#endif
375
376#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
377/*
378 * We need to check if a setuid can take place. This function should be called
379 * before successfully completing the setuid.
380 */
381int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
382{
383
384 return sched_rt_can_attach(up->tg, tsk);
385
386}
387#else
388int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
389{
390 return 1;
391}
392#endif
393
394/* 103/*
395 * Locate the user_struct for the passed UID. If found, take a ref on it. The 104 * Locate the user_struct for the passed UID. If found, take a ref on it. The
396 * caller must undo that ref with free_uid(). 105 * caller must undo that ref with free_uid().
@@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
431 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() 140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
432 * atomic. 141 * atomic.
433 */ 142 */
434 uids_mutex_lock();
435
436 spin_lock_irq(&uidhash_lock); 143 spin_lock_irq(&uidhash_lock);
437 up = uid_hash_find(uid, hashent); 144 up = uid_hash_find(uid, hashent);
438 spin_unlock_irq(&uidhash_lock); 145 spin_unlock_irq(&uidhash_lock);
@@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
445 new->uid = uid; 152 new->uid = uid;
446 atomic_set(&new->__count, 1); 153 atomic_set(&new->__count, 1);
447 154
448 if (sched_create_user(new) < 0)
449 goto out_free_user;
450
451 new->user_ns = get_user_ns(ns); 155 new->user_ns = get_user_ns(ns);
452 156
453 if (uids_user_create(new))
454 goto out_destoy_sched;
455
456 /* 157 /*
457 * Before adding this, check whether we raced 158 * Before adding this, check whether we raced
458 * on adding the same user already.. 159 * on adding the same user already..
@@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
475 spin_unlock_irq(&uidhash_lock); 176 spin_unlock_irq(&uidhash_lock);
476 } 177 }
477 178
478 uids_mutex_unlock();
479
480 return up; 179 return up;
481 180
482out_destoy_sched:
483 sched_destroy_user(new);
484 put_user_ns(new->user_ns); 181 put_user_ns(new->user_ns);
485out_free_user:
486 kmem_cache_free(uid_cachep, new); 182 kmem_cache_free(uid_cachep, new);
487out_unlock: 183out_unlock:
488 uids_mutex_unlock();
489 return NULL; 184 return NULL;
490} 185}
491 186