about | summary | refs | log | tree | commit | diff | stats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Makefile 7
-rw-r--r-- kernel/audit_tree.c 113
-rw-r--r-- kernel/auditsc.c 8
-rw-r--r-- kernel/capability.c 4
-rw-r--r-- kernel/cgroup.c 23
-rw-r--r-- kernel/cpu.c 36
-rw-r--r-- kernel/cred.c 2
-rw-r--r-- kernel/early_res.c 578
-rw-r--r-- kernel/elfcore.c 28
-rw-r--r-- kernel/exit.c 55
-rw-r--r-- kernel/fork.c 37
-rw-r--r-- kernel/futex.c 57
-rw-r--r-- kernel/futex_compat.c 6
-rw-r--r-- kernel/hw_breakpoint.c 68
-rw-r--r-- kernel/irq/chip.c 52
-rw-r--r-- kernel/irq/handle.c 58
-rw-r--r-- kernel/irq/internals.h 6
-rw-r--r-- kernel/irq/numa_migrate.c 4
-rw-r--r-- kernel/kexec.c 6
-rw-r--r-- kernel/kfifo.c 410
-rw-r--r-- kernel/kgdb.c 9
-rw-r--r-- kernel/kmod.c 12
-rw-r--r-- kernel/kprobes.c 683
-rw-r--r-- kernel/ksysfs.c 8
-rw-r--r-- kernel/kthread.c 25
-rw-r--r-- kernel/lockdep.c 20
-rw-r--r-- kernel/module.c 59
-rw-r--r-- kernel/notifier.c 6
-rw-r--r-- kernel/padata.c 696
-rw-r--r-- kernel/panic.c 49
-rw-r--r-- kernel/params.c 1
-rw-r--r-- kernel/perf_event.c 684
-rw-r--r-- kernel/pid.c 4
-rw-r--r-- kernel/posix-cpu-timers.c 36
-rw-r--r-- kernel/posix-timers.c 2
-rw-r--r-- kernel/power/Kconfig 19
-rw-r--r-- kernel/power/hibernate.c 9
-rw-r--r-- kernel/power/main.c 31
-rw-r--r-- kernel/power/snapshot.c 4
-rw-r--r-- kernel/power/suspend.c 3
-rw-r--r-- kernel/power/swap.c 4
-rw-r--r-- kernel/power/swsusp.c 58
-rw-r--r-- kernel/power/user.c 23
-rw-r--r-- kernel/printk.c 60
-rw-r--r-- kernel/ptrace.c 88
-rw-r--r-- kernel/range.c 163
-rw-r--r-- kernel/rcupdate.c 29
-rw-r--r-- kernel/rcutorture.c 102
-rw-r--r-- kernel/rcutree.c 268
-rw-r--r-- kernel/rcutree.h 61
-rw-r--r-- kernel/rcutree_plugin.h 229
-rw-r--r-- kernel/rcutree_trace.c 14
-rw-r--r-- kernel/relay.c 5
-rw-r--r-- kernel/resource.c 96
-rw-r--r-- kernel/sched.c 2509
-rw-r--r-- kernel/sched_clock.c 23
-rw-r--r-- kernel/sched_cpupri.c 4
-rw-r--r-- kernel/sched_fair.c 1754
-rw-r--r-- kernel/sched_idletask.c 23
-rw-r--r-- kernel/sched_rt.c 63
-rw-r--r-- kernel/signal.c 73
-rw-r--r-- kernel/smp.c 10
-rw-r--r-- kernel/softirq.c 15
-rw-r--r-- kernel/softlockup.c 15
-rw-r--r-- kernel/srcu.c 52
-rw-r--r-- kernel/stop_machine.c 2
-rw-r--r-- kernel/sys.c 12
-rw-r--r-- kernel/sysctl.c 18
-rw-r--r-- kernel/sysctl_binary.c 38
-rw-r--r-- kernel/taskstats.c 6
-rw-r--r-- kernel/time.c 1
-rw-r--r-- kernel/time/clockevents.c 19
-rw-r--r-- kernel/time/clocksource.c 32
-rw-r--r-- kernel/time/ntp.c 10
-rw-r--r-- kernel/time/timekeeping.c 30
-rw-r--r-- kernel/time/timer_list.c 4
-rw-r--r-- kernel/timer.c 5
-rw-r--r-- kernel/trace/Kconfig 125
-rw-r--r-- kernel/trace/Makefile 4
-rw-r--r-- kernel/trace/blktrace.c 5
-rw-r--r-- kernel/trace/ftrace.c 111
-rw-r--r-- kernel/trace/ring_buffer.c 29
-rw-r--r-- kernel/trace/ring_buffer_benchmark.c 1
-rw-r--r-- kernel/trace/trace.c 157
-rw-r--r-- kernel/trace/trace.h 6
-rw-r--r-- kernel/trace/trace_branch.c 19
-rw-r--r-- kernel/trace/trace_event_profile.c 52
-rw-r--r-- kernel/trace/trace_events.c 81
-rw-r--r-- kernel/trace/trace_events_filter.c 33
-rw-r--r-- kernel/trace/trace_export.c 94
-rw-r--r-- kernel/trace/trace_functions_graph.c 82
-rw-r--r-- kernel/trace/trace_kprobe.c 336
-rw-r--r-- kernel/trace/trace_ksym.c 140
-rw-r--r-- kernel/trace/trace_stack.c 24
-rw-r--r-- kernel/trace/trace_syscalls.c 189
-rw-r--r-- kernel/trace/trace_sysprof.c 1
-rw-r--r-- kernel/tsacct.c 1
-rw-r--r-- kernel/user.c 305
98 files changed, 7100 insertions, 4531 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 864ff75d65f2..a987aa1676b5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o 13 async.o range.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
14obj-y += groups.o 15obj-y += groups.o
15 16
16ifdef CONFIG_FUNCTION_TRACER 17ifdef CONFIG_FUNCTION_TRACER
@@ -90,6 +91,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
90obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 91obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
91obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 92obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
92obj-$(CONFIG_LATENCYTOP) += latencytop.o 93obj-$(CONFIG_LATENCYTOP) += latencytop.o
94obj-$(CONFIG_BINFMT_ELF) += elfcore.o
95obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
96obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
93obj-$(CONFIG_FUNCTION_TRACER) += trace/ 97obj-$(CONFIG_FUNCTION_TRACER) += trace/
94obj-$(CONFIG_TRACING) += trace/ 98obj-$(CONFIG_TRACING) += trace/
95obj-$(CONFIG_X86_DS) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
@@ -100,6 +104,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
100obj-$(CONFIG_PERF_EVENTS) += perf_event.o 104obj-$(CONFIG_PERF_EVENTS) += perf_event.o
101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 105obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
102obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 106obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
107obj-$(CONFIG_PADATA) += padata.o
103 108
104ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 109ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
105# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 110# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 2451dc6f3282..028e85663f27 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -277,7 +277,7 @@ static void untag_chunk(struct node *p)
277 owner->root = NULL; 277 owner->root = NULL;
278 } 278 }
279 279
280 for (i = j = 0; i < size; i++, j++) { 280 for (i = j = 0; j <= size; i++, j++) {
281 struct audit_tree *s; 281 struct audit_tree *s;
282 if (&chunk->owners[j] == p) { 282 if (&chunk->owners[j] == p) {
283 list_del_init(&p->list); 283 list_del_init(&p->list);
@@ -290,7 +290,7 @@ static void untag_chunk(struct node *p)
290 if (!s) /* result of earlier fallback */ 290 if (!s) /* result of earlier fallback */
291 continue; 291 continue;
292 get_tree(s); 292 get_tree(s);
293 list_replace_init(&chunk->owners[i].list, &new->owners[j].list); 293 list_replace_init(&chunk->owners[j].list, &new->owners[i].list);
294 } 294 }
295 295
296 list_replace_rcu(&chunk->hash, &new->hash); 296 list_replace_rcu(&chunk->hash, &new->hash);
@@ -373,15 +373,17 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
373 for (n = 0; n < old->count; n++) { 373 for (n = 0; n < old->count; n++) {
374 if (old->owners[n].owner == tree) { 374 if (old->owners[n].owner == tree) {
375 spin_unlock(&hash_lock); 375 spin_unlock(&hash_lock);
376 put_inotify_watch(watch); 376 put_inotify_watch(&old->watch);
377 return 0; 377 return 0;
378 } 378 }
379 } 379 }
380 spin_unlock(&hash_lock); 380 spin_unlock(&hash_lock);
381 381
382 chunk = alloc_chunk(old->count + 1); 382 chunk = alloc_chunk(old->count + 1);
383 if (!chunk) 383 if (!chunk) {
384 put_inotify_watch(&old->watch);
384 return -ENOMEM; 385 return -ENOMEM;
386 }
385 387
386 mutex_lock(&inode->inotify_mutex); 388 mutex_lock(&inode->inotify_mutex);
387 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { 389 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
@@ -425,7 +427,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
425 spin_unlock(&hash_lock); 427 spin_unlock(&hash_lock);
426 inotify_evict_watch(&old->watch); 428 inotify_evict_watch(&old->watch);
427 mutex_unlock(&inode->inotify_mutex); 429 mutex_unlock(&inode->inotify_mutex);
428 put_inotify_watch(&old->watch); 430 put_inotify_watch(&old->watch); /* pair to inotify_find_watch */
431 put_inotify_watch(&old->watch); /* and kill it */
429 return 0; 432 return 0;
430} 433}
431 434
@@ -545,6 +548,11 @@ int audit_remove_tree_rule(struct audit_krule *rule)
545 return 0; 548 return 0;
546} 549}
547 550
551static int compare_root(struct vfsmount *mnt, void *arg)
552{
553 return mnt->mnt_root->d_inode == arg;
554}
555
548void audit_trim_trees(void) 556void audit_trim_trees(void)
549{ 557{
550 struct list_head cursor; 558 struct list_head cursor;
@@ -556,7 +564,6 @@ void audit_trim_trees(void)
556 struct path path; 564 struct path path;
557 struct vfsmount *root_mnt; 565 struct vfsmount *root_mnt;
558 struct node *node; 566 struct node *node;
559 struct list_head list;
560 int err; 567 int err;
561 568
562 tree = container_of(cursor.next, struct audit_tree, list); 569 tree = container_of(cursor.next, struct audit_tree, list);
@@ -574,24 +581,16 @@ void audit_trim_trees(void)
574 if (!root_mnt) 581 if (!root_mnt)
575 goto skip_it; 582 goto skip_it;
576 583
577 list_add_tail(&list, &root_mnt->mnt_list);
578 spin_lock(&hash_lock); 584 spin_lock(&hash_lock);
579 list_for_each_entry(node, &tree->chunks, list) { 585 list_for_each_entry(node, &tree->chunks, list) {
580 struct audit_chunk *chunk = find_chunk(node); 586 struct inode *inode = find_chunk(node)->watch.inode;
581 struct inode *inode = chunk->watch.inode;
582 struct vfsmount *mnt;
583 node->index |= 1U<<31; 587 node->index |= 1U<<31;
584 list_for_each_entry(mnt, &list, mnt_list) { 588 if (iterate_mounts(compare_root, inode, root_mnt))
585 if (mnt->mnt_root->d_inode == inode) { 589 node->index &= ~(1U<<31);
586 node->index &= ~(1U<<31);
587 break;
588 }
589 }
590 } 590 }
591 spin_unlock(&hash_lock); 591 spin_unlock(&hash_lock);
592 trim_marked(tree); 592 trim_marked(tree);
593 put_tree(tree); 593 put_tree(tree);
594 list_del_init(&list);
595 drop_collected_mounts(root_mnt); 594 drop_collected_mounts(root_mnt);
596skip_it: 595skip_it:
597 mutex_lock(&audit_filter_mutex); 596 mutex_lock(&audit_filter_mutex);
@@ -600,22 +599,6 @@ skip_it:
600 mutex_unlock(&audit_filter_mutex); 599 mutex_unlock(&audit_filter_mutex);
601} 600}
602 601
603static int is_under(struct vfsmount *mnt, struct dentry *dentry,
604 struct path *path)
605{
606 if (mnt != path->mnt) {
607 for (;;) {
608 if (mnt->mnt_parent == mnt)
609 return 0;
610 if (mnt->mnt_parent == path->mnt)
611 break;
612 mnt = mnt->mnt_parent;
613 }
614 dentry = mnt->mnt_mountpoint;
615 }
616 return is_subdir(dentry, path->dentry);
617}
618
619int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) 602int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
620{ 603{
621 604
@@ -635,13 +618,17 @@ void audit_put_tree(struct audit_tree *tree)
635 put_tree(tree); 618 put_tree(tree);
636} 619}
637 620
621static int tag_mount(struct vfsmount *mnt, void *arg)
622{
623 return tag_chunk(mnt->mnt_root->d_inode, arg);
624}
625
638/* called with audit_filter_mutex */ 626/* called with audit_filter_mutex */
639int audit_add_tree_rule(struct audit_krule *rule) 627int audit_add_tree_rule(struct audit_krule *rule)
640{ 628{
641 struct audit_tree *seed = rule->tree, *tree; 629 struct audit_tree *seed = rule->tree, *tree;
642 struct path path; 630 struct path path;
643 struct vfsmount *mnt, *p; 631 struct vfsmount *mnt;
644 struct list_head list;
645 int err; 632 int err;
646 633
647 list_for_each_entry(tree, &tree_list, list) { 634 list_for_each_entry(tree, &tree_list, list) {
@@ -667,16 +654,9 @@ int audit_add_tree_rule(struct audit_krule *rule)
667 err = -ENOMEM; 654 err = -ENOMEM;
668 goto Err; 655 goto Err;
669 } 656 }
670 list_add_tail(&list, &mnt->mnt_list);
671 657
672 get_tree(tree); 658 get_tree(tree);
673 list_for_each_entry(p, &list, mnt_list) { 659 err = iterate_mounts(tag_mount, tree, mnt);
674 err = tag_chunk(p->mnt_root->d_inode, tree);
675 if (err)
676 break;
677 }
678
679 list_del(&list);
680 drop_collected_mounts(mnt); 660 drop_collected_mounts(mnt);
681 661
682 if (!err) { 662 if (!err) {
@@ -711,31 +691,23 @@ int audit_tag_tree(char *old, char *new)
711{ 691{
712 struct list_head cursor, barrier; 692 struct list_head cursor, barrier;
713 int failed = 0; 693 int failed = 0;
714 struct path path; 694 struct path path1, path2;
715 struct vfsmount *tagged; 695 struct vfsmount *tagged;
716 struct list_head list;
717 struct vfsmount *mnt;
718 struct dentry *dentry;
719 int err; 696 int err;
720 697
721 err = kern_path(new, 0, &path); 698 err = kern_path(new, 0, &path2);
722 if (err) 699 if (err)
723 return err; 700 return err;
724 tagged = collect_mounts(&path); 701 tagged = collect_mounts(&path2);
725 path_put(&path); 702 path_put(&path2);
726 if (!tagged) 703 if (!tagged)
727 return -ENOMEM; 704 return -ENOMEM;
728 705
729 err = kern_path(old, 0, &path); 706 err = kern_path(old, 0, &path1);
730 if (err) { 707 if (err) {
731 drop_collected_mounts(tagged); 708 drop_collected_mounts(tagged);
732 return err; 709 return err;
733 } 710 }
734 mnt = mntget(path.mnt);
735 dentry = dget(path.dentry);
736 path_put(&path);
737
738 list_add_tail(&list, &tagged->mnt_list);
739 711
740 mutex_lock(&audit_filter_mutex); 712 mutex_lock(&audit_filter_mutex);
741 list_add(&barrier, &tree_list); 713 list_add(&barrier, &tree_list);
@@ -743,7 +715,7 @@ int audit_tag_tree(char *old, char *new)
743 715
744 while (cursor.next != &tree_list) { 716 while (cursor.next != &tree_list) {
745 struct audit_tree *tree; 717 struct audit_tree *tree;
746 struct vfsmount *p; 718 int good_one = 0;
747 719
748 tree = container_of(cursor.next, struct audit_tree, list); 720 tree = container_of(cursor.next, struct audit_tree, list);
749 get_tree(tree); 721 get_tree(tree);
@@ -751,30 +723,19 @@ int audit_tag_tree(char *old, char *new)
751 list_add(&cursor, &tree->list); 723 list_add(&cursor, &tree->list);
752 mutex_unlock(&audit_filter_mutex); 724 mutex_unlock(&audit_filter_mutex);
753 725
754 err = kern_path(tree->pathname, 0, &path); 726 err = kern_path(tree->pathname, 0, &path2);
755 if (err) { 727 if (!err) {
756 put_tree(tree); 728 good_one = path_is_under(&path1, &path2);
757 mutex_lock(&audit_filter_mutex); 729 path_put(&path2);
758 continue;
759 } 730 }
760 731
761 spin_lock(&vfsmount_lock); 732 if (!good_one) {
762 if (!is_under(mnt, dentry, &path)) {
763 spin_unlock(&vfsmount_lock);
764 path_put(&path);
765 put_tree(tree); 733 put_tree(tree);
766 mutex_lock(&audit_filter_mutex); 734 mutex_lock(&audit_filter_mutex);
767 continue; 735 continue;
768 } 736 }
769 spin_unlock(&vfsmount_lock);
770 path_put(&path);
771
772 list_for_each_entry(p, &list, mnt_list) {
773 failed = tag_chunk(p->mnt_root->d_inode, tree);
774 if (failed)
775 break;
776 }
777 737
738 failed = iterate_mounts(tag_mount, tree, tagged);
778 if (failed) { 739 if (failed) {
779 put_tree(tree); 740 put_tree(tree);
780 mutex_lock(&audit_filter_mutex); 741 mutex_lock(&audit_filter_mutex);
@@ -815,10 +776,8 @@ int audit_tag_tree(char *old, char *new)
815 } 776 }
816 list_del(&barrier); 777 list_del(&barrier);
817 list_del(&cursor); 778 list_del(&cursor);
818 list_del(&list);
819 mutex_unlock(&audit_filter_mutex); 779 mutex_unlock(&audit_filter_mutex);
820 dput(dentry); 780 path_put(&path1);
821 mntput(mnt);
822 drop_collected_mounts(tagged); 781 drop_collected_mounts(tagged);
823 return failed; 782 return failed;
824} 783}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 267e484f0198..f3a461c0970a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -250,7 +250,6 @@ struct audit_context {
250#endif 250#endif
251}; 251};
252 252
253#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
254static inline int open_arg(int flags, int mask) 253static inline int open_arg(int flags, int mask)
255{ 254{
256 int n = ACC_MODE(flags); 255 int n = ACC_MODE(flags);
@@ -1989,7 +1988,6 @@ void __audit_inode(const char *name, const struct dentry *dentry)
1989 1988
1990/** 1989/**
1991 * audit_inode_child - collect inode info for created/removed objects 1990 * audit_inode_child - collect inode info for created/removed objects
1992 * @dname: inode's dentry name
1993 * @dentry: dentry being audited 1991 * @dentry: dentry being audited
1994 * @parent: inode of dentry parent 1992 * @parent: inode of dentry parent
1995 * 1993 *
@@ -2001,13 +1999,14 @@ void __audit_inode(const char *name, const struct dentry *dentry)
2001 * must be hooked prior, in order to capture the target inode during 1999 * must be hooked prior, in order to capture the target inode during
2002 * unsuccessful attempts. 2000 * unsuccessful attempts.
2003 */ 2001 */
2004void __audit_inode_child(const char *dname, const struct dentry *dentry, 2002void __audit_inode_child(const struct dentry *dentry,
2005 const struct inode *parent) 2003 const struct inode *parent)
2006{ 2004{
2007 int idx; 2005 int idx;
2008 struct audit_context *context = current->audit_context; 2006 struct audit_context *context = current->audit_context;
2009 const char *found_parent = NULL, *found_child = NULL; 2007 const char *found_parent = NULL, *found_child = NULL;
2010 const struct inode *inode = dentry->d_inode; 2008 const struct inode *inode = dentry->d_inode;
2009 const char *dname = dentry->d_name.name;
2011 int dirlen = 0; 2010 int dirlen = 0;
2012 2011
2013 if (!context->in_syscall) 2012 if (!context->in_syscall)
@@ -2015,9 +2014,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
2015 2014
2016 if (inode) 2015 if (inode)
2017 handle_one(inode); 2016 handle_one(inode);
2018 /* determine matching parent */
2019 if (!dname)
2020 goto add_names;
2021 2017
2022 /* parent is more likely, look for it first */ 2018 /* parent is more likely, look for it first */
2023 for (idx = 0; idx < context->name_count; idx++) { 2019 for (idx = 0; idx < context->name_count; idx++) {
diff --git a/kernel/capability.c b/kernel/capability.c
index 7f876e60521f..9e4697e9b276 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -135,7 +135,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
135 if (pid && (pid != task_pid_vnr(current))) { 135 if (pid && (pid != task_pid_vnr(current))) {
136 struct task_struct *target; 136 struct task_struct *target;
137 137
138 read_lock(&tasklist_lock); 138 rcu_read_lock();
139 139
140 target = find_task_by_vpid(pid); 140 target = find_task_by_vpid(pid);
141 if (!target) 141 if (!target)
@@ -143,7 +143,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
143 else 143 else
144 ret = security_capget(target, pEp, pIp, pPp); 144 ret = security_capget(target, pEp, pIp, pPp);
145 145
146 read_unlock(&tasklist_lock); 146 rcu_read_unlock();
147 } else 147 } else
148 ret = security_capget(current, pEp, pIp, pPp); 148 ret = security_capget(current, pEp, pIp, pPp);
149 149
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0249f4be9b5c..4fd90e129772 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/cgroup.h> 25#include <linux/cgroup.h>
26#include <linux/module.h>
26#include <linux/ctype.h> 27#include <linux/ctype.h>
27#include <linux/errno.h> 28#include <linux/errno.h>
28#include <linux/fs.h> 29#include <linux/fs.h>
@@ -166,6 +167,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
166 */ 167 */
167static int need_forkexit_callback __read_mostly; 168static int need_forkexit_callback __read_mostly;
168 169
170#ifdef CONFIG_PROVE_LOCKING
171int cgroup_lock_is_held(void)
172{
173 return lockdep_is_held(&cgroup_mutex);
174}
175#else /* #ifdef CONFIG_PROVE_LOCKING */
176int cgroup_lock_is_held(void)
177{
178 return mutex_is_locked(&cgroup_mutex);
179}
180#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
181
182EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
183
169/* convenient tests for these bits */ 184/* convenient tests for these bits */
170inline int cgroup_is_removed(const struct cgroup *cgrp) 185inline int cgroup_is_removed(const struct cgroup *cgrp)
171{ 186{
@@ -2468,7 +2483,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2468 /* make sure l doesn't vanish out from under us */ 2483 /* make sure l doesn't vanish out from under us */
2469 down_write(&l->mutex); 2484 down_write(&l->mutex);
2470 mutex_unlock(&cgrp->pidlist_mutex); 2485 mutex_unlock(&cgrp->pidlist_mutex);
2471 l->use_count++;
2472 return l; 2486 return l;
2473 } 2487 }
2474 } 2488 }
@@ -2937,14 +2951,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2937 2951
2938 for_each_subsys(root, ss) { 2952 for_each_subsys(root, ss) {
2939 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 2953 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
2954
2940 if (IS_ERR(css)) { 2955 if (IS_ERR(css)) {
2941 err = PTR_ERR(css); 2956 err = PTR_ERR(css);
2942 goto err_destroy; 2957 goto err_destroy;
2943 } 2958 }
2944 init_cgroup_css(css, ss, cgrp); 2959 init_cgroup_css(css, ss, cgrp);
2945 if (ss->use_id) 2960 if (ss->use_id) {
2946 if (alloc_css_id(ss, parent, cgrp)) 2961 err = alloc_css_id(ss, parent, cgrp);
2962 if (err)
2947 goto err_destroy; 2963 goto err_destroy;
2964 }
2948 /* At error, ->destroy() callback has to free assigned ID. */ 2965 /* At error, ->destroy() callback has to free assigned ID. */
2949 } 2966 }
2950 2967
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 291ac586f37f..f8cced2692b3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -151,13 +151,13 @@ static inline void check_for_tasks(int cpu)
151 151
152 write_lock_irq(&tasklist_lock); 152 write_lock_irq(&tasklist_lock);
153 for_each_process(p) { 153 for_each_process(p) {
154 if (task_cpu(p) == cpu && 154 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
155 (!cputime_eq(p->utime, cputime_zero) || 155 (!cputime_eq(p->utime, cputime_zero) ||
156 !cputime_eq(p->stime, cputime_zero))) 156 !cputime_eq(p->stime, cputime_zero)))
157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
158 (state = %ld, flags = %x) \n", 158 "(state = %ld, flags = %x)\n",
159 p->comm, task_pid_nr(p), cpu, 159 p->comm, task_pid_nr(p), cpu,
160 p->state, p->flags); 160 p->state, p->flags);
161 } 161 }
162 write_unlock_irq(&tasklist_lock); 162 write_unlock_irq(&tasklist_lock);
163} 163}
@@ -209,6 +209,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
209 return -ENOMEM; 209 return -ENOMEM;
210 210
211 cpu_hotplug_begin(); 211 cpu_hotplug_begin();
212 set_cpu_active(cpu, false);
212 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 213 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
213 hcpu, -1, &nr_calls); 214 hcpu, -1, &nr_calls);
214 if (err == NOTIFY_BAD) { 215 if (err == NOTIFY_BAD) {
@@ -280,18 +281,6 @@ int __ref cpu_down(unsigned int cpu)
280 goto out; 281 goto out;
281 } 282 }
282 283
283 set_cpu_active(cpu, false);
284
285 /*
286 * Make sure the all cpus did the reschedule and are not
287 * using stale version of the cpu_active_mask.
288 * This is not strictly necessary becuase stop_machine()
289 * that we run down the line already provides the required
290 * synchronization. But it's really a side effect and we do not
291 * want to depend on the innards of the stop_machine here.
292 */
293 synchronize_sched();
294
295 err = _cpu_down(cpu, 0); 284 err = _cpu_down(cpu, 0);
296 285
297out: 286out:
@@ -349,7 +338,7 @@ int __cpuinit cpu_up(unsigned int cpu)
349 if (!cpu_possible(cpu)) { 338 if (!cpu_possible(cpu)) {
350 printk(KERN_ERR "can't online cpu %d because it is not " 339 printk(KERN_ERR "can't online cpu %d because it is not "
351 "configured as may-hotadd at boot time\n", cpu); 340 "configured as may-hotadd at boot time\n", cpu);
352#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 341#if defined(CONFIG_IA64)
353 printk(KERN_ERR "please check additional_cpus= boot " 342 printk(KERN_ERR "please check additional_cpus= boot "
354 "parameter\n"); 343 "parameter\n");
355#endif 344#endif
@@ -382,19 +371,12 @@ int disable_nonboot_cpus(void)
382 return error; 371 return error;
383 cpu_maps_update_begin(); 372 cpu_maps_update_begin();
384 first_cpu = cpumask_first(cpu_online_mask); 373 first_cpu = cpumask_first(cpu_online_mask);
385 /* We take down all of the non-boot CPUs in one shot to avoid races 374 /*
375 * We take down all of the non-boot CPUs in one shot to avoid races
386 * with the userspace trying to use the CPU hotplug at the same time 376 * with the userspace trying to use the CPU hotplug at the same time
387 */ 377 */
388 cpumask_clear(frozen_cpus); 378 cpumask_clear(frozen_cpus);
389 379
390 for_each_online_cpu(cpu) {
391 if (cpu == first_cpu)
392 continue;
393 set_cpu_active(cpu, false);
394 }
395
396 synchronize_sched();
397
398 printk("Disabling non-boot CPUs ...\n"); 380 printk("Disabling non-boot CPUs ...\n");
399 for_each_online_cpu(cpu) { 381 for_each_online_cpu(cpu) {
400 if (cpu == first_cpu) 382 if (cpu == first_cpu)
diff --git a/kernel/cred.c b/kernel/cred.c
index dd76cfe5f5b0..1ed8ca18790c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -224,7 +224,7 @@ struct cred *cred_alloc_blank(void)
224#ifdef CONFIG_KEYS 224#ifdef CONFIG_KEYS
225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); 225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
226 if (!new->tgcred) { 226 if (!new->tgcred) {
227 kfree(new); 227 kmem_cache_free(cred_jar, new);
228 return NULL; 228 return NULL;
229 } 229 }
230 atomic_set(&new->tgcred->usage, 1); 230 atomic_set(&new->tgcred->usage, 1);
diff --git a/kernel/early_res.c b/kernel/early_res.c
new file mode 100644
index 000000000000..3cb2c661bb78
--- /dev/null
+++ b/kernel/early_res.c
@@ -0,0 +1,578 @@
1/*
2 * early_res, could be used to replace bootmem
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/bootmem.h>
8#include <linux/mm.h>
9#include <linux/early_res.h>
10
11/*
12 * Early reserved memory areas.
13 */
14/*
15 * need to make sure this one is big enough before
16 * find_fw_memmap_area could be used
17 */
18#define MAX_EARLY_RES_X 32
19
20struct early_res {
21 u64 start, end;
22 char name[15];
23 char overlap_ok;
24};
25static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
26
27static int max_early_res __initdata = MAX_EARLY_RES_X;
28static struct early_res *early_res __initdata = &early_res_x[0];
29static int early_res_count __initdata;
30
31static int __init find_overlapped_early(u64 start, u64 end)
32{
33 int i;
34 struct early_res *r;
35
36 for (i = 0; i < max_early_res && early_res[i].end; i++) {
37 r = &early_res[i];
38 if (end > r->start && start < r->end)
39 break;
40 }
41
42 return i;
43}
44
45/*
46 * Drop the i-th range from the early reservation map,
47 * by copying any higher ranges down one over it, and
48 * clearing what had been the last slot.
49 */
50static void __init drop_range(int i)
51{
52 int j;
53
54 for (j = i + 1; j < max_early_res && early_res[j].end; j++)
55 ;
56
57 memmove(&early_res[i], &early_res[i + 1],
58 (j - 1 - i) * sizeof(struct early_res));
59
60 early_res[j - 1].end = 0;
61 early_res_count--;
62}
63
64static void __init drop_range_partial(int i, u64 start, u64 end)
65{
66 u64 common_start, common_end;
67 u64 old_start, old_end;
68
69 old_start = early_res[i].start;
70 old_end = early_res[i].end;
71 common_start = max(old_start, start);
72 common_end = min(old_end, end);
73
74 /* no overlap ? */
75 if (common_start >= common_end)
76 return;
77
78 if (old_start < common_start) {
79 /* make head segment */
80 early_res[i].end = common_start;
81 if (old_end > common_end) {
82 char name[15];
83
84 /*
85 * Save a local copy of the name, since the
86 * early_res array could get resized inside
87 * reserve_early_without_check() ->
88 * __check_and_double_early_res(), which would
89 * make the current name pointer invalid.
90 */
91 strncpy(name, early_res[i].name,
92 sizeof(early_res[i].name) - 1);
93 /* add another for left over on tail */
94 reserve_early_without_check(common_end, old_end, name);
95 }
96 return;
97 } else {
98 if (old_end > common_end) {
99 /* reuse the entry for tail left */
100 early_res[i].start = common_end;
101 return;
102 }
103 /* all covered */
104 drop_range(i);
105 }
106}
107
108/*
109 * Split any existing ranges that:
110 * 1) are marked 'overlap_ok', and
111 * 2) overlap with the stated range [start, end)
112 * into whatever portion (if any) of the existing range is entirely
113 * below or entirely above the stated range. Drop the portion
114 * of the existing range that overlaps with the stated range,
115 * which will allow the caller of this routine to then add that
116 * stated range without conflicting with any existing range.
117 */
118static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
119{
120 int i;
121 struct early_res *r;
122 u64 lower_start, lower_end;
123 u64 upper_start, upper_end;
124 char name[15];
125
126 for (i = 0; i < max_early_res && early_res[i].end; i++) {
127 r = &early_res[i];
128
129 /* Continue past non-overlapping ranges */
130 if (end <= r->start || start >= r->end)
131 continue;
132
133 /*
134 * Leave non-ok overlaps as is; let caller
135 * panic "Overlapping early reservations"
136 * when it hits this overlap.
137 */
138 if (!r->overlap_ok)
139 return;
140
141 /*
142 * We have an ok overlap. We will drop it from the early
143 * reservation map, and add back in any non-overlapping
144 * portions (lower or upper) as separate, overlap_ok,
145 * non-overlapping ranges.
146 */
147
148 /* 1. Note any non-overlapping (lower or upper) ranges. */
149 strncpy(name, r->name, sizeof(name) - 1);
150
151 lower_start = lower_end = 0;
152 upper_start = upper_end = 0;
153 if (r->start < start) {
154 lower_start = r->start;
155 lower_end = start;
156 }
157 if (r->end > end) {
158 upper_start = end;
159 upper_end = r->end;
160 }
161
162 /* 2. Drop the original ok overlapping range */
163 drop_range(i);
164
165 i--; /* resume for-loop on copied down entry */
166
167 /* 3. Add back in any non-overlapping ranges. */
168 if (lower_end)
169 reserve_early_overlap_ok(lower_start, lower_end, name);
170 if (upper_end)
171 reserve_early_overlap_ok(upper_start, upper_end, name);
172 }
173}
174
175static void __init __reserve_early(u64 start, u64 end, char *name,
176 int overlap_ok)
177{
178 int i;
179 struct early_res *r;
180
181 i = find_overlapped_early(start, end);
182 if (i >= max_early_res)
183 panic("Too many early reservations");
184 r = &early_res[i];
185 if (r->end)
186 panic("Overlapping early reservations "
187 "%llx-%llx %s to %llx-%llx %s\n",
188 start, end - 1, name ? name : "", r->start,
189 r->end - 1, r->name);
190 r->start = start;
191 r->end = end;
192 r->overlap_ok = overlap_ok;
193 if (name)
194 strncpy(r->name, name, sizeof(r->name) - 1);
195 early_res_count++;
196}
197
198/*
199 * A few early reservations come here.
200 *
201 * The 'overlap_ok' in the name of this routine does -not- mean it
202 * is ok for these reservations to overlap an earlier reservation.
203 * Rather it means that it is ok for subsequent reservations to
204 * overlap this one.
205 *
206 * Use this entry point to reserve early ranges when you are doing
207 * so out of "Paranoia", reserving perhaps more memory than you need,
208 * just in case, and don't mind a subsequent overlapping reservation
209 * that is known to be needed.
210 *
211 * The drop_overlaps_that_are_ok() call here isn't really needed.
212 * It would be needed if we had two colliding 'overlap_ok'
213 * reservations, so that the second such would not panic on the
214 * overlap with the first. We don't have any such as of this
215 * writing, but might as well tolerate such if it happens in
216 * the future.
217 */
218void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
219{
220 drop_overlaps_that_are_ok(start, end);
221 __reserve_early(start, end, name, 1);
222}
223
224static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
225{
226 u64 start, end, size, mem;
227 struct early_res *new;
228
229 /* do we have enough slots left ? */
230 if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
231 return;
232
233 /* double it */
234 mem = -1ULL;
235 size = sizeof(struct early_res) * max_early_res * 2;
236 if (early_res == early_res_x)
237 start = 0;
238 else
239 start = early_res[0].end;
240 end = ex_start;
241 if (start + size < end)
242 mem = find_fw_memmap_area(start, end, size,
243 sizeof(struct early_res));
244 if (mem == -1ULL) {
245 start = ex_end;
246 end = get_max_mapped();
247 if (start + size < end)
248 mem = find_fw_memmap_area(start, end, size,
249 sizeof(struct early_res));
250 }
251 if (mem == -1ULL)
252 panic("can not find more space for early_res array");
253
254 new = __va(mem);
255 /* save the first one for own */
256 new[0].start = mem;
257 new[0].end = mem + size;
258 new[0].overlap_ok = 0;
259 /* copy old to new */
260 if (early_res == early_res_x) {
261 memcpy(&new[1], &early_res[0],
262 sizeof(struct early_res) * max_early_res);
263 memset(&new[max_early_res+1], 0,
264 sizeof(struct early_res) * (max_early_res - 1));
265 early_res_count++;
266 } else {
267 memcpy(&new[1], &early_res[1],
268 sizeof(struct early_res) * (max_early_res - 1));
269 memset(&new[max_early_res], 0,
270 sizeof(struct early_res) * max_early_res);
271 }
272 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
273 early_res = new;
274 max_early_res *= 2;
275 printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
276 max_early_res, mem, mem + size - 1);
277}
278
279/*
280 * Most early reservations come here.
281 *
282 * We first have drop_overlaps_that_are_ok() drop any pre-existing
283 * 'overlap_ok' ranges, so that we can then reserve this memory
284 * range without risk of panic'ing on an overlapping overlap_ok
285 * early reservation.
286 */
287void __init reserve_early(u64 start, u64 end, char *name)
288{
289 if (start >= end)
290 return;
291
292 __check_and_double_early_res(start, end);
293
294 drop_overlaps_that_are_ok(start, end);
295 __reserve_early(start, end, name, 0);
296}
297
298void __init reserve_early_without_check(u64 start, u64 end, char *name)
299{
300 struct early_res *r;
301
302 if (start >= end)
303 return;
304
305 __check_and_double_early_res(start, end);
306
307 r = &early_res[early_res_count];
308
309 r->start = start;
310 r->end = end;
311 r->overlap_ok = 0;
312 if (name)
313 strncpy(r->name, name, sizeof(r->name) - 1);
314 early_res_count++;
315}
316
317void __init free_early(u64 start, u64 end)
318{
319 struct early_res *r;
320 int i;
321
322 i = find_overlapped_early(start, end);
323 r = &early_res[i];
324 if (i >= max_early_res || r->end != end || r->start != start)
325 panic("free_early on not reserved area: %llx-%llx!",
326 start, end - 1);
327
328 drop_range(i);
329}
330
331void __init free_early_partial(u64 start, u64 end)
332{
333 struct early_res *r;
334 int i;
335
336try_next:
337 i = find_overlapped_early(start, end);
338 if (i >= max_early_res)
339 return;
340
341 r = &early_res[i];
342 /* hole ? */
343 if (r->end >= end && r->start <= start) {
344 drop_range_partial(i, start, end);
345 return;
346 }
347
348 drop_range_partial(i, start, end);
349 goto try_next;
350}
351
352#ifdef CONFIG_NO_BOOTMEM
353static void __init subtract_early_res(struct range *range, int az)
354{
355 int i, count;
356 u64 final_start, final_end;
357 int idx = 0;
358
359 count = 0;
360 for (i = 0; i < max_early_res && early_res[i].end; i++)
361 count++;
362
363 /* need to skip first one ?*/
364 if (early_res != early_res_x)
365 idx = 1;
366
367#define DEBUG_PRINT_EARLY_RES 1
368
369#if DEBUG_PRINT_EARLY_RES
370 printk(KERN_INFO "Subtract (%d early reservations)\n", count);
371#endif
372 for (i = idx; i < count; i++) {
373 struct early_res *r = &early_res[i];
374#if DEBUG_PRINT_EARLY_RES
375 printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
376 r->start, r->end, r->name);
377#endif
378 final_start = PFN_DOWN(r->start);
379 final_end = PFN_UP(r->end);
380 if (final_start >= final_end)
381 continue;
382 subtract_range(range, az, final_start, final_end);
383 }
384
385}
386
387int __init get_free_all_memory_range(struct range **rangep, int nodeid)
388{
389 int i, count;
390 u64 start = 0, end;
391 u64 size;
392 u64 mem;
393 struct range *range;
394 int nr_range;
395
396 count = 0;
397 for (i = 0; i < max_early_res && early_res[i].end; i++)
398 count++;
399
400 count *= 2;
401
402 size = sizeof(struct range) * count;
403 end = get_max_mapped();
404#ifdef MAX_DMA32_PFN
405 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
406 start = MAX_DMA32_PFN << PAGE_SHIFT;
407#endif
408 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
409 if (mem == -1ULL)
410 panic("can not find more space for range free");
411
412 range = __va(mem);
413 /* use early_node_map[] and early_res to get range array at first */
414 memset(range, 0, size);
415 nr_range = 0;
416
417 /* need to go over early_node_map to find out good range for node */
418 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
419#ifdef CONFIG_X86_32
420 subtract_range(range, count, max_low_pfn, -1ULL);
421#endif
422 subtract_early_res(range, count);
423 nr_range = clean_sort_range(range, count);
424
425 /* need to clear it ? */
426 if (nodeid == MAX_NUMNODES) {
427 memset(&early_res[0], 0,
428 sizeof(struct early_res) * max_early_res);
429 early_res = NULL;
430 max_early_res = 0;
431 }
432
433 *rangep = range;
434 return nr_range;
435}
436#else
437void __init early_res_to_bootmem(u64 start, u64 end)
438{
439 int i, count;
440 u64 final_start, final_end;
441 int idx = 0;
442
443 count = 0;
444 for (i = 0; i < max_early_res && early_res[i].end; i++)
445 count++;
446
447 /* need to skip first one ?*/
448 if (early_res != early_res_x)
449 idx = 1;
450
451 printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
452 count - idx, max_early_res, start, end);
453 for (i = idx; i < count; i++) {
454 struct early_res *r = &early_res[i];
455 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
456 r->start, r->end, r->name);
457 final_start = max(start, r->start);
458 final_end = min(end, r->end);
459 if (final_start >= final_end) {
460 printk(KERN_CONT "\n");
461 continue;
462 }
463 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
464 final_start, final_end);
465 reserve_bootmem_generic(final_start, final_end - final_start,
466 BOOTMEM_DEFAULT);
467 }
468 /* clear them */
469 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
470 early_res = NULL;
471 max_early_res = 0;
472 early_res_count = 0;
473}
474#endif
475
476/* Check for already reserved areas */
477static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
478{
479 int i;
480 u64 addr = *addrp;
481 int changed = 0;
482 struct early_res *r;
483again:
484 i = find_overlapped_early(addr, addr + size);
485 r = &early_res[i];
486 if (i < max_early_res && r->end) {
487 *addrp = addr = round_up(r->end, align);
488 changed = 1;
489 goto again;
490 }
491 return changed;
492}
493
494/* Check for already reserved areas */
495static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
496{
497 int i;
498 u64 addr = *addrp, last;
499 u64 size = *sizep;
500 int changed = 0;
501again:
502 last = addr + size;
503 for (i = 0; i < max_early_res && early_res[i].end; i++) {
504 struct early_res *r = &early_res[i];
505 if (last > r->start && addr < r->start) {
506 size = r->start - addr;
507 changed = 1;
508 goto again;
509 }
510 if (last > r->end && addr < r->end) {
511 addr = round_up(r->end, align);
512 size = last - addr;
513 changed = 1;
514 goto again;
515 }
516 if (last <= r->end && addr >= r->start) {
517 (*sizep)++;
518 return 0;
519 }
520 }
521 if (changed) {
522 *addrp = addr;
523 *sizep = size;
524 }
525 return changed;
526}
527
528/*
529 * Find a free area with specified alignment in a specific range.
530 * only with the area.between start to end is active range from early_node_map
531 * so they are good as RAM
532 */
533u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
534 u64 size, u64 align)
535{
536 u64 addr, last;
537
538 addr = round_up(ei_start, align);
539 if (addr < start)
540 addr = round_up(start, align);
541 if (addr >= ei_last)
542 goto out;
543 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
544 ;
545 last = addr + size;
546 if (last > ei_last)
547 goto out;
548 if (last > end)
549 goto out;
550
551 return addr;
552
553out:
554 return -1ULL;
555}
556
557u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
558 u64 *sizep, u64 align)
559{
560 u64 addr, last;
561
562 addr = round_up(ei_start, align);
563 if (addr < start)
564 addr = round_up(start, align);
565 if (addr >= ei_last)
566 goto out;
567 *sizep = ei_last - addr;
568 while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
569 ;
570 last = addr + *sizep;
571 if (last > ei_last)
572 goto out;
573
574 return addr;
575
576out:
577 return -1ULL;
578}
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
new file mode 100644
index 000000000000..ff915efef66d
--- /dev/null
+++ b/kernel/elfcore.c
@@ -0,0 +1,28 @@
1#include <linux/elf.h>
2#include <linux/fs.h>
3#include <linux/mm.h>
4
5#include <asm/elf.h>
6
7
/*
 * Default implementation, marked __weak (presumably so that another
 * definition can take its place at link time): always returns 0.
 */
8Elf_Half __weak elf_core_extra_phdrs(void)
9{
10 return 0;
11}
12
/*
 * Default __weak implementation: ignores all of its arguments and
 * returns 1.
 * NOTE(review): 1 presumably tells the caller that writing succeeded;
 * confirm against the callers.
 */
13int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
14 unsigned long limit)
15{
16 return 1;
17}
18
/*
 * Default __weak implementation: ignores all of its arguments and
 * returns 1.
 * NOTE(review): 1 presumably tells the caller that writing succeeded;
 * confirm against the callers.
 */
19int __weak elf_core_write_extra_data(struct file *file, size_t *size,
20 unsigned long limit)
21{
22 return 1;
23}
24
/*
 * Default __weak implementation: always returns a size of 0.
 */
25size_t __weak elf_core_extra_data_size(void)
26{
27 return 0;
28}
diff --git a/kernel/exit.c b/kernel/exit.c
index 5962d7ccf243..ce1e48c2d93d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -68,10 +68,10 @@ static void __unhash_process(struct task_struct *p)
68 detach_pid(p, PIDTYPE_SID); 68 detach_pid(p, PIDTYPE_SID);
69 69
70 list_del_rcu(&p->tasks); 70 list_del_rcu(&p->tasks);
71 list_del_init(&p->sibling);
71 __get_cpu_var(process_counts)--; 72 __get_cpu_var(process_counts)--;
72 } 73 }
73 list_del_rcu(&p->thread_group); 74 list_del_rcu(&p->thread_group);
74 list_del_init(&p->sibling);
75} 75}
76 76
77/* 77/*
@@ -85,7 +85,9 @@ static void __exit_signal(struct task_struct *tsk)
85 BUG_ON(!sig); 85 BUG_ON(!sig);
86 BUG_ON(!atomic_read(&sig->count)); 86 BUG_ON(!atomic_read(&sig->count));
87 87
88 sighand = rcu_dereference(tsk->sighand); 88 sighand = rcu_dereference_check(tsk->sighand,
89 rcu_read_lock_held() ||
90 lockdep_is_held(&tasklist_lock));
89 spin_lock(&sighand->siglock); 91 spin_lock(&sighand->siglock);
90 92
91 posix_cpu_timers_exit(tsk); 93 posix_cpu_timers_exit(tsk);
@@ -170,8 +172,10 @@ void release_task(struct task_struct * p)
170repeat: 172repeat:
171 tracehook_prepare_release_task(p); 173 tracehook_prepare_release_task(p);
172 /* don't need to get the RCU readlock here - the process is dead and 174 /* don't need to get the RCU readlock here - the process is dead and
173 * can't be modifying its own credentials */ 175 * can't be modifying its own credentials. But shut RCU-lockdep up */
176 rcu_read_lock();
174 atomic_dec(&__task_cred(p)->user->processes); 177 atomic_dec(&__task_cred(p)->user->processes);
178 rcu_read_unlock();
175 179
176 proc_flush_task(p); 180 proc_flush_task(p);
177 181
@@ -473,9 +477,11 @@ static void close_files(struct files_struct * files)
473 /* 477 /*
474 * It is safe to dereference the fd table without RCU or 478 * It is safe to dereference the fd table without RCU or
475 * ->file_lock because this is the last reference to the 479 * ->file_lock because this is the last reference to the
476 * files structure. 480 * files structure. But use RCU to shut RCU-lockdep up.
477 */ 481 */
482 rcu_read_lock();
478 fdt = files_fdtable(files); 483 fdt = files_fdtable(files);
484 rcu_read_unlock();
479 for (;;) { 485 for (;;) {
480 unsigned long set; 486 unsigned long set;
481 i = j * __NFDBITS; 487 i = j * __NFDBITS;
@@ -521,10 +527,12 @@ void put_files_struct(struct files_struct *files)
521 * at the end of the RCU grace period. Otherwise, 527 * at the end of the RCU grace period. Otherwise,
522 * you can free files immediately. 528 * you can free files immediately.
523 */ 529 */
530 rcu_read_lock();
524 fdt = files_fdtable(files); 531 fdt = files_fdtable(files);
525 if (fdt != &files->fdtab) 532 if (fdt != &files->fdtab)
526 kmem_cache_free(files_cachep, files); 533 kmem_cache_free(files_cachep, files);
527 free_fdtable(fdt); 534 free_fdtable(fdt);
535 rcu_read_unlock();
528 } 536 }
529} 537}
530 538
@@ -736,12 +744,9 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
736/* 744/*
737* Any that need to be release_task'd are put on the @dead list. 745* Any that need to be release_task'd are put on the @dead list.
738 */ 746 */
739static void reparent_thread(struct task_struct *father, struct task_struct *p, 747static void reparent_leader(struct task_struct *father, struct task_struct *p,
740 struct list_head *dead) 748 struct list_head *dead)
741{ 749{
742 if (p->pdeath_signal)
743 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
744
745 list_move_tail(&p->sibling, &p->real_parent->children); 750 list_move_tail(&p->sibling, &p->real_parent->children);
746 751
747 if (task_detached(p)) 752 if (task_detached(p))
@@ -780,12 +785,18 @@ static void forget_original_parent(struct task_struct *father)
780 reaper = find_new_reaper(father); 785 reaper = find_new_reaper(father);
781 786
782 list_for_each_entry_safe(p, n, &father->children, sibling) { 787 list_for_each_entry_safe(p, n, &father->children, sibling) {
783 p->real_parent = reaper; 788 struct task_struct *t = p;
784 if (p->parent == father) { 789 do {
785 BUG_ON(task_ptrace(p)); 790 t->real_parent = reaper;
786 p->parent = p->real_parent; 791 if (t->parent == father) {
787 } 792 BUG_ON(task_ptrace(t));
788 reparent_thread(father, p, &dead_children); 793 t->parent = t->real_parent;
794 }
795 if (t->pdeath_signal)
796 group_send_sig_info(t->pdeath_signal,
797 SEND_SIG_NOINFO, t);
798 } while_each_thread(p, t);
799 reparent_leader(father, p, &dead_children);
789 } 800 }
790 write_unlock_irq(&tasklist_lock); 801 write_unlock_irq(&tasklist_lock);
791 802
@@ -941,7 +952,8 @@ NORET_TYPE void do_exit(long code)
941 preempt_count()); 952 preempt_count());
942 953
943 acct_update_integrals(tsk); 954 acct_update_integrals(tsk);
944 955 /* sync mm's RSS info before statistics gathering */
956 sync_mm_rss(tsk, tsk->mm);
945 group_dead = atomic_dec_and_test(&tsk->signal->live); 957 group_dead = atomic_dec_and_test(&tsk->signal->live);
946 if (group_dead) { 958 if (group_dead) {
947 hrtimer_cancel(&tsk->signal->real_timer); 959 hrtimer_cancel(&tsk->signal->real_timer);
@@ -1177,7 +1189,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1177 1189
1178 if (unlikely(wo->wo_flags & WNOWAIT)) { 1190 if (unlikely(wo->wo_flags & WNOWAIT)) {
1179 int exit_code = p->exit_code; 1191 int exit_code = p->exit_code;
1180 int why, status; 1192 int why;
1181 1193
1182 get_task_struct(p); 1194 get_task_struct(p);
1183 read_unlock(&tasklist_lock); 1195 read_unlock(&tasklist_lock);
@@ -1551,14 +1563,9 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1551 struct task_struct *p; 1563 struct task_struct *p;
1552 1564
1553 list_for_each_entry(p, &tsk->children, sibling) { 1565 list_for_each_entry(p, &tsk->children, sibling) {
1554 /* 1566 int ret = wait_consider_task(wo, 0, p);
1555 * Do not consider detached threads. 1567 if (ret)
1556 */ 1568 return ret;
1557 if (!task_detached(p)) {
1558 int ret = wait_consider_task(wo, 0, p);
1559 if (ret)
1560 return ret;
1561 }
1562 } 1569 }
1563 1570
1564 return 0; 1571 return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 202a0ba63d3c..b0ec34abc0bb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -86,6 +86,7 @@ int max_threads; /* tunable limit on nr_threads */
86DEFINE_PER_CPU(unsigned long, process_counts) = 0; 86DEFINE_PER_CPU(unsigned long, process_counts) = 0;
87 87
88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
89EXPORT_SYMBOL_GPL(tasklist_lock);
89 90
90int nr_processes(void) 91int nr_processes(void)
91{ 92{
@@ -328,15 +329,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
328 if (!tmp) 329 if (!tmp)
329 goto fail_nomem; 330 goto fail_nomem;
330 *tmp = *mpnt; 331 *tmp = *mpnt;
332 INIT_LIST_HEAD(&tmp->anon_vma_chain);
331 pol = mpol_dup(vma_policy(mpnt)); 333 pol = mpol_dup(vma_policy(mpnt));
332 retval = PTR_ERR(pol); 334 retval = PTR_ERR(pol);
333 if (IS_ERR(pol)) 335 if (IS_ERR(pol))
334 goto fail_nomem_policy; 336 goto fail_nomem_policy;
335 vma_set_policy(tmp, pol); 337 vma_set_policy(tmp, pol);
338 if (anon_vma_fork(tmp, mpnt))
339 goto fail_nomem_anon_vma_fork;
336 tmp->vm_flags &= ~VM_LOCKED; 340 tmp->vm_flags &= ~VM_LOCKED;
337 tmp->vm_mm = mm; 341 tmp->vm_mm = mm;
338 tmp->vm_next = NULL; 342 tmp->vm_next = NULL;
339 anon_vma_link(tmp);
340 file = tmp->vm_file; 343 file = tmp->vm_file;
341 if (file) { 344 if (file) {
342 struct inode *inode = file->f_path.dentry->d_inode; 345 struct inode *inode = file->f_path.dentry->d_inode;
@@ -391,6 +394,8 @@ out:
391 flush_tlb_mm(oldmm); 394 flush_tlb_mm(oldmm);
392 up_write(&oldmm->mmap_sem); 395 up_write(&oldmm->mmap_sem);
393 return retval; 396 return retval;
397fail_nomem_anon_vma_fork:
398 mpol_put(pol);
394fail_nomem_policy: 399fail_nomem_policy:
395 kmem_cache_free(vm_area_cachep, tmp); 400 kmem_cache_free(vm_area_cachep, tmp);
396fail_nomem: 401fail_nomem:
@@ -454,8 +459,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
454 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; 459 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
455 mm->core_state = NULL; 460 mm->core_state = NULL;
456 mm->nr_ptes = 0; 461 mm->nr_ptes = 0;
457 set_mm_counter(mm, file_rss, 0); 462 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
458 set_mm_counter(mm, anon_rss, 0);
459 spin_lock_init(&mm->page_table_lock); 463 spin_lock_init(&mm->page_table_lock);
460 mm->free_area_cache = TASK_UNMAPPED_BASE; 464 mm->free_area_cache = TASK_UNMAPPED_BASE;
461 mm->cached_hole_size = ~0UL; 465 mm->cached_hole_size = ~0UL;
@@ -824,6 +828,8 @@ void __cleanup_sighand(struct sighand_struct *sighand)
824 */ 828 */
825static void posix_cpu_timers_init_group(struct signal_struct *sig) 829static void posix_cpu_timers_init_group(struct signal_struct *sig)
826{ 830{
831 unsigned long cpu_limit;
832
827 /* Thread group counters. */ 833 /* Thread group counters. */
828 thread_group_cputime_init(sig); 834 thread_group_cputime_init(sig);
829 835
@@ -838,9 +844,9 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
838 sig->cputime_expires.virt_exp = cputime_zero; 844 sig->cputime_expires.virt_exp = cputime_zero;
839 sig->cputime_expires.sched_exp = 0; 845 sig->cputime_expires.sched_exp = 0;
840 846
841 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 847 cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
842 sig->cputime_expires.prof_exp = 848 if (cpu_limit != RLIM_INFINITY) {
843 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 849 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
844 sig->cputimer.running = 1; 850 sig->cputimer.running = 1;
845 } 851 }
846 852
@@ -1033,7 +1039,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1033#endif 1039#endif
1034 retval = -EAGAIN; 1040 retval = -EAGAIN;
1035 if (atomic_read(&p->real_cred->user->processes) >= 1041 if (atomic_read(&p->real_cred->user->processes) >=
1036 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 1042 task_rlimit(p, RLIMIT_NPROC)) {
1037 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1043 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
1038 p->real_cred->user != INIT_USER) 1044 p->real_cred->user != INIT_USER)
1039 goto bad_fork_free; 1045 goto bad_fork_free;
@@ -1241,21 +1247,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1241 /* Need tasklist lock for parent etc handling! */ 1247 /* Need tasklist lock for parent etc handling! */
1242 write_lock_irq(&tasklist_lock); 1248 write_lock_irq(&tasklist_lock);
1243 1249
1244 /*
1245 * The task hasn't been attached yet, so its cpus_allowed mask will
1246 * not be changed, nor will its assigned CPU.
1247 *
1248 * The cpus_allowed mask of the parent may have changed after it was
1249 * copied first time - so re-copy it here, then check the child's CPU
1250 * to ensure it is on a valid CPU (and if not, just force it back to
1251 * parent's CPU). This avoids alot of nasty races.
1252 */
1253 p->cpus_allowed = current->cpus_allowed;
1254 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1255 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1256 !cpu_online(task_cpu(p))))
1257 set_task_cpu(p, smp_processor_id());
1258
1259 /* CLONE_PARENT re-uses the old parent */ 1250 /* CLONE_PARENT re-uses the old parent */
1260 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { 1251 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
1261 p->real_parent = current->real_parent; 1252 p->real_parent = current->real_parent;
@@ -1291,7 +1282,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1291 } 1282 }
1292 1283
1293 if (likely(p->pid)) { 1284 if (likely(p->pid)) {
1294 list_add_tail(&p->sibling, &p->real_parent->children);
1295 tracehook_finish_clone(p, clone_flags, trace); 1285 tracehook_finish_clone(p, clone_flags, trace);
1296 1286
1297 if (thread_group_leader(p)) { 1287 if (thread_group_leader(p)) {
@@ -1303,6 +1293,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1303 p->signal->tty = tty_kref_get(current->signal->tty); 1293 p->signal->tty = tty_kref_get(current->signal->tty);
1304 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1294 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1305 attach_pid(p, PIDTYPE_SID, task_session(current)); 1295 attach_pid(p, PIDTYPE_SID, task_session(current));
1296 list_add_tail(&p->sibling, &p->real_parent->children);
1306 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1297 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1307 __get_cpu_var(process_counts)++; 1298 __get_cpu_var(process_counts)++;
1308 } 1299 }
diff --git a/kernel/futex.c b/kernel/futex.c
index 8e3c3ffe1b9a..e7a35f1039e7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key)
203 * @uaddr: virtual address of the futex 203 * @uaddr: virtual address of the futex
204 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 204 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
205 * @key: address where result is stored. 205 * @key: address where result is stored.
206 * @rw: mapping needs to be read/write (values: VERIFY_READ,
207 * VERIFY_WRITE)
208 * 206 *
209 * Returns a negative error code or 0 207 * Returns a negative error code or 0
210 * The key words are stored in *key on success. 208 * The key words are stored in *key on success.
@@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key)
216 * lock_page() might sleep, the caller should not hold a spinlock. 214 * lock_page() might sleep, the caller should not hold a spinlock.
217 */ 215 */
218static int 216static int
219get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) 217get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
220{ 218{
221 unsigned long address = (unsigned long)uaddr; 219 unsigned long address = (unsigned long)uaddr;
222 struct mm_struct *mm = current->mm; 220 struct mm_struct *mm = current->mm;
@@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
239 * but access_ok() should be faster than find_vma() 237 * but access_ok() should be faster than find_vma()
240 */ 238 */
241 if (!fshared) { 239 if (!fshared) {
242 if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) 240 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
243 return -EFAULT; 241 return -EFAULT;
244 key->private.mm = mm; 242 key->private.mm = mm;
245 key->private.address = address; 243 key->private.address = address;
@@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
248 } 246 }
249 247
250again: 248again:
251 err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page); 249 err = get_user_pages_fast(address, 1, 1, &page);
252 if (err < 0) 250 if (err < 0)
253 return err; 251 return err;
254 252
@@ -532,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
532 return -EINVAL; 530 return -EINVAL;
533 531
534 WARN_ON(!atomic_read(&pi_state->refcount)); 532 WARN_ON(!atomic_read(&pi_state->refcount));
535 WARN_ON(pid && pi_state->owner && 533
536 pi_state->owner->pid != pid); 534 /*
535 * When pi_state->owner is NULL then the owner died
536 * and another waiter is on the fly. pi_state->owner
537 * is fixed up by the task which acquires
538 * pi_state->rt_mutex.
539 *
540 * We do not check for pid == 0 which can happen when
541 * the owner died and robust_list_exit() cleared the
542 * TID.
543 */
544 if (pid && pi_state->owner) {
545 /*
546 * Bail out if user space manipulated the
547 * futex value.
548 */
549 if (pid != task_pid_vnr(pi_state->owner))
550 return -EINVAL;
551 }
537 552
538 atomic_inc(&pi_state->refcount); 553 atomic_inc(&pi_state->refcount);
539 *ps = pi_state; 554 *ps = pi_state;
@@ -760,6 +775,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
760 if (!pi_state) 775 if (!pi_state)
761 return -EINVAL; 776 return -EINVAL;
762 777
778 /*
779 * If current does not own the pi_state then the futex is
780 * inconsistent and user space fiddled with the futex value.
781 */
782 if (pi_state->owner != current)
783 return -EINVAL;
784
763 raw_spin_lock(&pi_state->pi_mutex.wait_lock); 785 raw_spin_lock(&pi_state->pi_mutex.wait_lock);
764 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 786 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
765 787
@@ -867,7 +889,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
867 if (!bitset) 889 if (!bitset)
868 return -EINVAL; 890 return -EINVAL;
869 891
870 ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ); 892 ret = get_futex_key(uaddr, fshared, &key);
871 if (unlikely(ret != 0)) 893 if (unlikely(ret != 0))
872 goto out; 894 goto out;
873 895
@@ -913,10 +935,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
913 int ret, op_ret; 935 int ret, op_ret;
914 936
915retry: 937retry:
916 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 938 ret = get_futex_key(uaddr1, fshared, &key1);
917 if (unlikely(ret != 0)) 939 if (unlikely(ret != 0))
918 goto out; 940 goto out;
919 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 941 ret = get_futex_key(uaddr2, fshared, &key2);
920 if (unlikely(ret != 0)) 942 if (unlikely(ret != 0))
921 goto out_put_key1; 943 goto out_put_key1;
922 944
@@ -1175,11 +1197,10 @@ retry:
1175 pi_state = NULL; 1197 pi_state = NULL;
1176 } 1198 }
1177 1199
1178 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 1200 ret = get_futex_key(uaddr1, fshared, &key1);
1179 if (unlikely(ret != 0)) 1201 if (unlikely(ret != 0))
1180 goto out; 1202 goto out;
1181 ret = get_futex_key(uaddr2, fshared, &key2, 1203 ret = get_futex_key(uaddr2, fshared, &key2);
1182 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1183 if (unlikely(ret != 0)) 1204 if (unlikely(ret != 0))
1184 goto out_put_key1; 1205 goto out_put_key1;
1185 1206
@@ -1738,7 +1759,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1738 */ 1759 */
1739retry: 1760retry:
1740 q->key = FUTEX_KEY_INIT; 1761 q->key = FUTEX_KEY_INIT;
1741 ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); 1762 ret = get_futex_key(uaddr, fshared, &q->key);
1742 if (unlikely(ret != 0)) 1763 if (unlikely(ret != 0))
1743 return ret; 1764 return ret;
1744 1765
@@ -1904,7 +1925,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1904 q.requeue_pi_key = NULL; 1925 q.requeue_pi_key = NULL;
1905retry: 1926retry:
1906 q.key = FUTEX_KEY_INIT; 1927 q.key = FUTEX_KEY_INIT;
1907 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); 1928 ret = get_futex_key(uaddr, fshared, &q.key);
1908 if (unlikely(ret != 0)) 1929 if (unlikely(ret != 0))
1909 goto out; 1930 goto out;
1910 1931
@@ -1974,7 +1995,7 @@ retry_private:
1974 /* Unqueue and drop the lock */ 1995 /* Unqueue and drop the lock */
1975 unqueue_me_pi(&q); 1996 unqueue_me_pi(&q);
1976 1997
1977 goto out; 1998 goto out_put_key;
1978 1999
1979out_unlock_put_key: 2000out_unlock_put_key:
1980 queue_unlock(&q, hb); 2001 queue_unlock(&q, hb);
@@ -2023,7 +2044,7 @@ retry:
2023 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2044 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
2024 return -EPERM; 2045 return -EPERM;
2025 2046
2026 ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE); 2047 ret = get_futex_key(uaddr, fshared, &key);
2027 if (unlikely(ret != 0)) 2048 if (unlikely(ret != 0))
2028 goto out; 2049 goto out;
2029 2050
@@ -2215,7 +2236,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2215 rt_waiter.task = NULL; 2236 rt_waiter.task = NULL;
2216 2237
2217 key2 = FUTEX_KEY_INIT; 2238 key2 = FUTEX_KEY_INIT;
2218 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 2239 ret = get_futex_key(uaddr2, fshared, &key2);
2219 if (unlikely(ret != 0)) 2240 if (unlikely(ret != 0))
2220 goto out; 2241 goto out;
2221 2242
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 235716556bf1..d49afb2395e5 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -146,7 +146,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
146 struct task_struct *p; 146 struct task_struct *p;
147 147
148 ret = -ESRCH; 148 ret = -ESRCH;
149 read_lock(&tasklist_lock); 149 rcu_read_lock();
150 p = find_task_by_vpid(pid); 150 p = find_task_by_vpid(pid);
151 if (!p) 151 if (!p)
152 goto err_unlock; 152 goto err_unlock;
@@ -157,7 +157,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
157 !capable(CAP_SYS_PTRACE)) 157 !capable(CAP_SYS_PTRACE))
158 goto err_unlock; 158 goto err_unlock;
159 head = p->compat_robust_list; 159 head = p->compat_robust_list;
160 read_unlock(&tasklist_lock); 160 rcu_read_unlock();
161 } 161 }
162 162
163 if (put_user(sizeof(*head), len_ptr)) 163 if (put_user(sizeof(*head), len_ptr))
@@ -165,7 +165,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
165 return put_user(ptr_to_compat(head), head_ptr); 165 return put_user(ptr_to_compat(head), head_ptr);
166 166
167err_unlock: 167err_unlock:
168 read_unlock(&tasklist_lock); 168 rcu_read_unlock();
169 169
170 return ret; 170 return ret;
171} 171}
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index dbcbf6a33a08..967e66143e11 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,6 +40,7 @@
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/cpu.h>
43#include <linux/smp.h> 44#include <linux/smp.h>
44 45
45#include <linux/hw_breakpoint.h> 46#include <linux/hw_breakpoint.h>
@@ -242,38 +243,70 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
242 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) 243 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
243 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM 244 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
244 */ 245 */
245int reserve_bp_slot(struct perf_event *bp) 246static int __reserve_bp_slot(struct perf_event *bp)
246{ 247{
247 struct bp_busy_slots slots = {0}; 248 struct bp_busy_slots slots = {0};
248 int ret = 0;
249
250 mutex_lock(&nr_bp_mutex);
251 249
252 fetch_bp_busy_slots(&slots, bp); 250 fetch_bp_busy_slots(&slots, bp);
253 251
254 /* Flexible counters need to keep at least one slot */ 252 /* Flexible counters need to keep at least one slot */
255 if (slots.pinned + (!!slots.flexible) == HBP_NUM) { 253 if (slots.pinned + (!!slots.flexible) == HBP_NUM)
256 ret = -ENOSPC; 254 return -ENOSPC;
257 goto end;
258 }
259 255
260 toggle_bp_slot(bp, true); 256 toggle_bp_slot(bp, true);
261 257
262end: 258 return 0;
259}
260
261int reserve_bp_slot(struct perf_event *bp)
262{
263 int ret;
264
265 mutex_lock(&nr_bp_mutex);
266
267 ret = __reserve_bp_slot(bp);
268
263 mutex_unlock(&nr_bp_mutex); 269 mutex_unlock(&nr_bp_mutex);
264 270
265 return ret; 271 return ret;
266} 272}
267 273
274static void __release_bp_slot(struct perf_event *bp)
275{
276 toggle_bp_slot(bp, false);
277}
278
268void release_bp_slot(struct perf_event *bp) 279void release_bp_slot(struct perf_event *bp)
269{ 280{
270 mutex_lock(&nr_bp_mutex); 281 mutex_lock(&nr_bp_mutex);
271 282
272 toggle_bp_slot(bp, false); 283 __release_bp_slot(bp);
273 284
274 mutex_unlock(&nr_bp_mutex); 285 mutex_unlock(&nr_bp_mutex);
275} 286}
276 287
288/*
289 * Allow the kernel debugger to reserve breakpoint slots without
290 * taking a lock using the dbg_* variants of the reserve and
291 * release breakpoint slots.
292 */
293int dbg_reserve_bp_slot(struct perf_event *bp)
294{
295 if (mutex_is_locked(&nr_bp_mutex))
296 return -1;
297
298 return __reserve_bp_slot(bp);
299}
300
301int dbg_release_bp_slot(struct perf_event *bp)
302{
303 if (mutex_is_locked(&nr_bp_mutex))
304 return -1;
305
306 __release_bp_slot(bp);
307
308 return 0;
309}
277 310
278int register_perf_hw_breakpoint(struct perf_event *bp) 311int register_perf_hw_breakpoint(struct perf_event *bp)
279{ 312{
@@ -295,6 +328,10 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
295 if (!bp->attr.disabled || !bp->overflow_handler) 328 if (!bp->attr.disabled || !bp->overflow_handler)
296 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); 329 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
297 330
331 /* if arch_validate_hwbkpt_settings() fails then release bp slot */
332 if (ret)
333 release_bp_slot(bp);
334
298 return ret; 335 return ret;
299} 336}
300 337
@@ -323,8 +360,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
323int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) 360int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
324{ 361{
325 u64 old_addr = bp->attr.bp_addr; 362 u64 old_addr = bp->attr.bp_addr;
363 u64 old_len = bp->attr.bp_len;
326 int old_type = bp->attr.bp_type; 364 int old_type = bp->attr.bp_type;
327 int old_len = bp->attr.bp_len;
328 int err = 0; 365 int err = 0;
329 366
330 perf_event_disable(bp); 367 perf_event_disable(bp);
@@ -388,7 +425,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
388 if (!cpu_events) 425 if (!cpu_events)
389 return ERR_PTR(-ENOMEM); 426 return ERR_PTR(-ENOMEM);
390 427
391 for_each_possible_cpu(cpu) { 428 get_online_cpus();
429 for_each_online_cpu(cpu) {
392 pevent = per_cpu_ptr(cpu_events, cpu); 430 pevent = per_cpu_ptr(cpu_events, cpu);
393 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); 431 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
394 432
@@ -399,18 +437,20 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
399 goto fail; 437 goto fail;
400 } 438 }
401 } 439 }
440 put_online_cpus();
402 441
403 return cpu_events; 442 return cpu_events;
404 443
405fail: 444fail:
406 for_each_possible_cpu(cpu) { 445 for_each_online_cpu(cpu) {
407 pevent = per_cpu_ptr(cpu_events, cpu); 446 pevent = per_cpu_ptr(cpu_events, cpu);
408 if (IS_ERR(*pevent)) 447 if (IS_ERR(*pevent))
409 break; 448 break;
410 unregister_hw_breakpoint(*pevent); 449 unregister_hw_breakpoint(*pevent);
411 } 450 }
451 put_online_cpus();
452
412 free_percpu(cpu_events); 453 free_percpu(cpu_events);
413 /* return the error if any */
414 return ERR_PTR(err); 454 return ERR_PTR(err);
415} 455}
416EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); 456EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ecc3fa28f666..d70394f12ee9 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,11 +18,7 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22 * dynamic_irq_init - initialize a dynamically allocated irq
23 * @irq: irq number to initialize
24 */
25void dynamic_irq_init(unsigned int irq)
26{ 22{
27 struct irq_desc *desc; 23 struct irq_desc *desc;
28 unsigned long flags; 24 unsigned long flags;
@@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq)
41 desc->depth = 1; 37 desc->depth = 1;
42 desc->msi_desc = NULL; 38 desc->msi_desc = NULL;
43 desc->handler_data = NULL; 39 desc->handler_data = NULL;
44 desc->chip_data = NULL; 40 if (!keep_chip_data)
41 desc->chip_data = NULL;
45 desc->action = NULL; 42 desc->action = NULL;
46 desc->irq_count = 0; 43 desc->irq_count = 0;
47 desc->irqs_unhandled = 0; 44 desc->irqs_unhandled = 0;
@@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq)
55} 52}
56 53
57/** 54/**
58 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 55 * dynamic_irq_init - initialize a dynamically allocated irq
59 * @irq: irq number to initialize 56 * @irq: irq number to initialize
60 */ 57 */
61void dynamic_irq_cleanup(unsigned int irq) 58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
62{ 75{
63 struct irq_desc *desc = irq_to_desc(irq); 76 struct irq_desc *desc = irq_to_desc(irq);
64 unsigned long flags; 77 unsigned long flags;
@@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq)
77 } 90 }
78 desc->msi_desc = NULL; 91 desc->msi_desc = NULL;
79 desc->handler_data = NULL; 92 desc->handler_data = NULL;
80 desc->chip_data = NULL; 93 if (!keep_chip_data)
94 desc->chip_data = NULL;
81 desc->handle_irq = handle_bad_irq; 95 desc->handle_irq = handle_bad_irq;
82 desc->chip = &no_irq_chip; 96 desc->chip = &no_irq_chip;
83 desc->name = NULL; 97 desc->name = NULL;
@@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq)
85 raw_spin_unlock_irqrestore(&desc->lock, flags); 99 raw_spin_unlock_irqrestore(&desc->lock, flags);
86} 100}
87 101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to initialize
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to initialize
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
120}
121
88 122
89/** 123/**
90 * set_irq_chip - set the irq chip for an irq 124 * set_irq_chip - set the irq chip for an irq
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 814940e7f485..76d5a671bfe1 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -19,7 +19,7 @@
19#include <linux/kernel_stat.h> 19#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 20#include <linux/rculist.h>
21#include <linux/hash.h> 21#include <linux/hash.h>
22#include <linux/bootmem.h> 22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 23#include <trace/events/irq.h>
24 24
25#include "internals.h" 25#include "internals.h"
@@ -87,12 +87,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{ 87{
88 void *ptr; 88 void *ptr;
89 89
90 if (slab_is_available()) 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), 91 GFP_ATOMIC, node);
92 GFP_ATOMIC, node);
93 else
94 ptr = alloc_bootmem_node(NODE_DATA(node),
95 nr * sizeof(*desc->kstat_irqs));
96 92
97 /* 93 /*
98 * don't overwite if can not get new one 94 * don't overwite if can not get new one
@@ -132,7 +128,26 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
132 */ 128 */
133DEFINE_RAW_SPINLOCK(sparse_irq_lock); 129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
134 130
135struct irq_desc **irq_desc_ptrs __read_mostly; 131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
136 151
137static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { 152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
138 [0 ... NR_IRQS_LEGACY-1] = { 153 [0 ... NR_IRQS_LEGACY-1] = {
@@ -164,9 +179,6 @@ int __init early_irq_init(void)
164 legacy_count = ARRAY_SIZE(irq_desc_legacy); 179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
165 node = first_online_node; 180 node = first_online_node;
166 181
167 /* allocate irq_desc_ptrs array based on nr_irqs */
168 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
169
170 /* allocate based on nr_cpu_ids */ 182 /* allocate based on nr_cpu_ids */
171 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * 183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
172 sizeof(int), GFP_NOWAIT, node); 184 sizeof(int), GFP_NOWAIT, node);
@@ -180,23 +192,12 @@ int __init early_irq_init(void)
180 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
181 alloc_desc_masks(&desc[i], node, true); 193 alloc_desc_masks(&desc[i], node, true);
182 init_desc_masks(&desc[i]); 194 init_desc_masks(&desc[i]);
183 irq_desc_ptrs[i] = desc + i; 195 set_irq_desc(i, &desc[i]);
184 } 196 }
185 197
186 for (i = legacy_count; i < nr_irqs; i++)
187 irq_desc_ptrs[i] = NULL;
188
189 return arch_early_irq_init(); 198 return arch_early_irq_init();
190} 199}
191 200
192struct irq_desc *irq_to_desc(unsigned int irq)
193{
194 if (irq_desc_ptrs && irq < nr_irqs)
195 return irq_desc_ptrs[irq];
196
197 return NULL;
198}
199
200struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) 201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
201{ 202{
202 struct irq_desc *desc; 203 struct irq_desc *desc;
@@ -208,21 +209,18 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
208 return NULL; 209 return NULL;
209 } 210 }
210 211
211 desc = irq_desc_ptrs[irq]; 212 desc = irq_to_desc(irq);
212 if (desc) 213 if (desc)
213 return desc; 214 return desc;
214 215
215 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
216 217
217 /* We have to check it to avoid races with another CPU */ 218 /* We have to check it to avoid races with another CPU */
218 desc = irq_desc_ptrs[irq]; 219 desc = irq_to_desc(irq);
219 if (desc) 220 if (desc)
220 goto out_unlock; 221 goto out_unlock;
221 222
222 if (slab_is_available()) 223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224 else
225 desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
226 224
227 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); 225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
228 if (!desc) { 226 if (!desc) {
@@ -231,7 +229,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
231 } 229 }
232 init_one_irq_desc(irq, desc, node); 230 init_one_irq_desc(irq, desc, node);
233 231
234 irq_desc_ptrs[irq] = desc; 232 set_irq_desc(irq, desc);
235 233
236out_unlock: 234out_unlock:
237 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b2821f070a3d..c63f3bc88f0b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -21,11 +21,7 @@ extern void clear_kstat_irqs(struct irq_desc *desc);
21extern raw_spinlock_t sparse_irq_lock; 21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */ 24void replace_irq_desc(unsigned int irq, struct irq_desc *desc);
25extern struct irq_desc **irq_desc_ptrs;
26#else
27/* irq_desc_ptrs is a fixed size array */
28extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
29#endif 25#endif
30 26
31#ifdef CONFIG_PROC_FS 27#ifdef CONFIG_PROC_FS
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 26bac9d8f860..963559dbd858 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -70,7 +70,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
70 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 70 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
71 71
72 /* We have to check it to avoid races with another CPU */ 72 /* We have to check it to avoid races with another CPU */
73 desc = irq_desc_ptrs[irq]; 73 desc = irq_to_desc(irq);
74 74
75 if (desc && old_desc != desc) 75 if (desc && old_desc != desc)
76 goto out_unlock; 76 goto out_unlock;
@@ -90,7 +90,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
90 goto out_unlock; 90 goto out_unlock;
91 } 91 }
92 92
93 irq_desc_ptrs[irq] = desc; 93 replace_irq_desc(irq, desc);
94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
95 95
96 /* free the old one */ 96 /* free the old one */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a9a93d9ee7a7..87ebe8adc474 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -32,6 +32,7 @@
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h>
35 36
36#include <asm/page.h> 37#include <asm/page.h>
37#include <asm/uaccess.h> 38#include <asm/uaccess.h>
@@ -40,7 +41,7 @@
40#include <asm/sections.h> 41#include <asm/sections.h>
41 42
42/* Per cpu memory for storing cpu states in case of system crash. */ 43/* Per cpu memory for storing cpu states in case of system crash. */
43note_buf_t* crash_notes; 44note_buf_t __percpu *crash_notes;
44 45
45/* vmcoreinfo stuff */ 46/* vmcoreinfo stuff */
46static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
@@ -1074,6 +1075,9 @@ void crash_kexec(struct pt_regs *regs)
1074 if (mutex_trylock(&kexec_mutex)) { 1075 if (mutex_trylock(&kexec_mutex)) {
1075 if (kexec_crash_image) { 1076 if (kexec_crash_image) {
1076 struct pt_regs fixed_regs; 1077 struct pt_regs fixed_regs;
1078
1079 kmsg_dump(KMSG_DUMP_KEXEC);
1080
1077 crash_setup_regs(&fixed_regs, regs); 1081 crash_setup_regs(&fixed_regs, regs);
1078 crash_save_vmcoreinfo(); 1082 crash_save_vmcoreinfo();
1079 machine_crash_shutdown(&fixed_regs); 1083 machine_crash_shutdown(&fixed_regs);
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 3765ff3c1bbe..35edbe22e9a9 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * A simple kernel FIFO implementation. 2 * A generic kernel FIFO implementation.
3 * 3 *
4 * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net>
4 * Copyright (C) 2004 Stelian Pop <stelian@popies.net> 5 * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
5 * 6 *
6 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
@@ -25,50 +26,48 @@
25#include <linux/err.h> 26#include <linux/err.h>
26#include <linux/kfifo.h> 27#include <linux/kfifo.h>
27#include <linux/log2.h> 28#include <linux/log2.h>
29#include <linux/uaccess.h>
30
31static void _kfifo_init(struct kfifo *fifo, void *buffer,
32 unsigned int size)
33{
34 fifo->buffer = buffer;
35 fifo->size = size;
36
37 kfifo_reset(fifo);
38}
28 39
29/** 40/**
30 * kfifo_init - allocates a new FIFO using a preallocated buffer 41 * kfifo_init - initialize a FIFO using a preallocated buffer
42 * @fifo: the fifo to assign the buffer
31 * @buffer: the preallocated buffer to be used. 43 * @buffer: the preallocated buffer to be used.
32 * @size: the size of the internal buffer, this have to be a power of 2. 44 * @size: the size of the internal buffer, this has to be a power of 2.
33 * @gfp_mask: get_free_pages mask, passed to kmalloc()
34 * @lock: the lock to be used to protect the fifo buffer
35 * 45 *
36 * Do NOT pass the kfifo to kfifo_free() after use! Simply free the
37 * &struct kfifo with kfree().
38 */ 46 */
39struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, 47void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size)
40 gfp_t gfp_mask, spinlock_t *lock)
41{ 48{
42 struct kfifo *fifo;
43
44 /* size must be a power of 2 */ 49 /* size must be a power of 2 */
45 BUG_ON(!is_power_of_2(size)); 50 BUG_ON(!is_power_of_2(size));
46 51
47 fifo = kmalloc(sizeof(struct kfifo), gfp_mask); 52 _kfifo_init(fifo, buffer, size);
48 if (!fifo)
49 return ERR_PTR(-ENOMEM);
50
51 fifo->buffer = buffer;
52 fifo->size = size;
53 fifo->in = fifo->out = 0;
54 fifo->lock = lock;
55
56 return fifo;
57} 53}
58EXPORT_SYMBOL(kfifo_init); 54EXPORT_SYMBOL(kfifo_init);
59 55
60/** 56/**
61 * kfifo_alloc - allocates a new FIFO and its internal buffer 57 * kfifo_alloc - allocates a new FIFO internal buffer
62 * @size: the size of the internal buffer to be allocated. 58 * @fifo: the fifo to assign then new buffer
59 * @size: the size of the buffer to be allocated, this have to be a power of 2.
63 * @gfp_mask: get_free_pages mask, passed to kmalloc() 60 * @gfp_mask: get_free_pages mask, passed to kmalloc()
64 * @lock: the lock to be used to protect the fifo buffer 61 *
62 * This function dynamically allocates a new fifo internal buffer
65 * 63 *
66 * The size will be rounded-up to a power of 2. 64 * The size will be rounded-up to a power of 2.
65 * The buffer will be released with kfifo_free().
66 * Return 0 if no error, otherwise an error code
67 */ 67 */
68struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) 68int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
69{ 69{
70 unsigned char *buffer; 70 unsigned char *buffer;
71 struct kfifo *ret;
72 71
73 /* 72 /*
74 * round up to the next power of 2, since our 'let the indices 73 * round up to the next power of 2, since our 'let the indices
@@ -80,48 +79,93 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
80 } 79 }
81 80
82 buffer = kmalloc(size, gfp_mask); 81 buffer = kmalloc(size, gfp_mask);
83 if (!buffer) 82 if (!buffer) {
84 return ERR_PTR(-ENOMEM); 83 _kfifo_init(fifo, NULL, 0);
85 84 return -ENOMEM;
86 ret = kfifo_init(buffer, size, gfp_mask, lock); 85 }
87 86
88 if (IS_ERR(ret)) 87 _kfifo_init(fifo, buffer, size);
89 kfree(buffer);
90 88
91 return ret; 89 return 0;
92} 90}
93EXPORT_SYMBOL(kfifo_alloc); 91EXPORT_SYMBOL(kfifo_alloc);
94 92
95/** 93/**
96 * kfifo_free - frees the FIFO 94 * kfifo_free - frees the FIFO internal buffer
97 * @fifo: the fifo to be freed. 95 * @fifo: the fifo to be freed.
98 */ 96 */
99void kfifo_free(struct kfifo *fifo) 97void kfifo_free(struct kfifo *fifo)
100{ 98{
101 kfree(fifo->buffer); 99 kfree(fifo->buffer);
102 kfree(fifo); 100 _kfifo_init(fifo, NULL, 0);
103} 101}
104EXPORT_SYMBOL(kfifo_free); 102EXPORT_SYMBOL(kfifo_free);
105 103
106/** 104/**
107 * __kfifo_put - puts some data into the FIFO, no locking version 105 * kfifo_skip - skip output data
108 * @fifo: the fifo to be used. 106 * @fifo: the fifo to be used.
109 * @buffer: the data to be added. 107 * @len: number of bytes to skip
110 * @len: the length of the data to be added.
111 *
112 * This function copies at most @len bytes from the @buffer into
113 * the FIFO depending on the free space, and returns the number of
114 * bytes copied.
115 *
116 * Note that with only one concurrent reader and one concurrent
117 * writer, you don't need extra locking to use these functions.
118 */ 108 */
119unsigned int __kfifo_put(struct kfifo *fifo, 109void kfifo_skip(struct kfifo *fifo, unsigned int len)
120 const unsigned char *buffer, unsigned int len) 110{
111 if (len < kfifo_len(fifo)) {
112 __kfifo_add_out(fifo, len);
113 return;
114 }
115 kfifo_reset_out(fifo);
116}
117EXPORT_SYMBOL(kfifo_skip);
118
119static inline void __kfifo_in_data(struct kfifo *fifo,
120 const void *from, unsigned int len, unsigned int off)
121{ 121{
122 unsigned int l; 122 unsigned int l;
123 123
124 len = min(len, fifo->size - fifo->in + fifo->out); 124 /*
125 * Ensure that we sample the fifo->out index -before- we
126 * start putting bytes into the kfifo.
127 */
128
129 smp_mb();
130
131 off = __kfifo_off(fifo, fifo->in + off);
132
133 /* first put the data starting from fifo->in to buffer end */
134 l = min(len, fifo->size - off);
135 memcpy(fifo->buffer + off, from, l);
136
137 /* then put the rest (if any) at the beginning of the buffer */
138 memcpy(fifo->buffer, from + l, len - l);
139}
140
141static inline void __kfifo_out_data(struct kfifo *fifo,
142 void *to, unsigned int len, unsigned int off)
143{
144 unsigned int l;
145
146 /*
147 * Ensure that we sample the fifo->in index -before- we
148 * start removing bytes from the kfifo.
149 */
150
151 smp_rmb();
152
153 off = __kfifo_off(fifo, fifo->out + off);
154
155 /* first get the data from fifo->out until the end of the buffer */
156 l = min(len, fifo->size - off);
157 memcpy(to, fifo->buffer + off, l);
158
159 /* then get the rest (if any) from the beginning of the buffer */
160 memcpy(to + l, fifo->buffer, len - l);
161}
162
163static inline int __kfifo_from_user_data(struct kfifo *fifo,
164 const void __user *from, unsigned int len, unsigned int off,
165 unsigned *lenout)
166{
167 unsigned int l;
168 int ret;
125 169
126 /* 170 /*
127 * Ensure that we sample the fifo->out index -before- we 171 * Ensure that we sample the fifo->out index -before- we
@@ -130,68 +174,272 @@ unsigned int __kfifo_put(struct kfifo *fifo,
130 174
131 smp_mb(); 175 smp_mb();
132 176
177 off = __kfifo_off(fifo, fifo->in + off);
178
133 /* first put the data starting from fifo->in to buffer end */ 179 /* first put the data starting from fifo->in to buffer end */
134 l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); 180 l = min(len, fifo->size - off);
135 memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); 181 ret = copy_from_user(fifo->buffer + off, from, l);
182 if (unlikely(ret)) {
183 *lenout = ret;
184 return -EFAULT;
185 }
186 *lenout = l;
136 187
137 /* then put the rest (if any) at the beginning of the buffer */ 188 /* then put the rest (if any) at the beginning of the buffer */
138 memcpy(fifo->buffer, buffer + l, len - l); 189 ret = copy_from_user(fifo->buffer, from + l, len - l);
190 *lenout += ret ? ret : len - l;
191 return ret ? -EFAULT : 0;
192}
193
194static inline int __kfifo_to_user_data(struct kfifo *fifo,
195 void __user *to, unsigned int len, unsigned int off, unsigned *lenout)
196{
197 unsigned int l;
198 int ret;
139 199
140 /* 200 /*
141 * Ensure that we add the bytes to the kfifo -before- 201 * Ensure that we sample the fifo->in index -before- we
142 * we update the fifo->in index. 202 * start removing bytes from the kfifo.
143 */ 203 */
144 204
145 smp_wmb(); 205 smp_rmb();
206
207 off = __kfifo_off(fifo, fifo->out + off);
208
209 /* first get the data from fifo->out until the end of the buffer */
210 l = min(len, fifo->size - off);
211 ret = copy_to_user(to, fifo->buffer + off, l);
212 *lenout = l;
213 if (unlikely(ret)) {
214 *lenout -= ret;
215 return -EFAULT;
216 }
217
218 /* then get the rest (if any) from the beginning of the buffer */
219 len -= l;
220 ret = copy_to_user(to + l, fifo->buffer, len);
221 if (unlikely(ret)) {
222 *lenout += len - ret;
223 return -EFAULT;
224 }
225 *lenout += len;
226 return 0;
227}
228
229unsigned int __kfifo_in_n(struct kfifo *fifo,
230 const void *from, unsigned int len, unsigned int recsize)
231{
232 if (kfifo_avail(fifo) < len + recsize)
233 return len + 1;
234
235 __kfifo_in_data(fifo, from, len, recsize);
236 return 0;
237}
238EXPORT_SYMBOL(__kfifo_in_n);
146 239
147 fifo->in += len; 240/**
241 * kfifo_in - puts some data into the FIFO
242 * @fifo: the fifo to be used.
243 * @from: the data to be added.
244 * @len: the length of the data to be added.
245 *
246 * This function copies at most @len bytes from the @from buffer into
247 * the FIFO depending on the free space, and returns the number of
248 * bytes copied.
249 *
250 * Note that with only one concurrent reader and one concurrent
251 * writer, you don't need extra locking to use these functions.
252 */
253unsigned int kfifo_in(struct kfifo *fifo, const void *from,
254 unsigned int len)
255{
256 len = min(kfifo_avail(fifo), len);
148 257
258 __kfifo_in_data(fifo, from, len, 0);
259 __kfifo_add_in(fifo, len);
149 return len; 260 return len;
150} 261}
151EXPORT_SYMBOL(__kfifo_put); 262EXPORT_SYMBOL(kfifo_in);
263
264unsigned int __kfifo_in_generic(struct kfifo *fifo,
265 const void *from, unsigned int len, unsigned int recsize)
266{
267 return __kfifo_in_rec(fifo, from, len, recsize);
268}
269EXPORT_SYMBOL(__kfifo_in_generic);
270
271unsigned int __kfifo_out_n(struct kfifo *fifo,
272 void *to, unsigned int len, unsigned int recsize)
273{
274 if (kfifo_len(fifo) < len + recsize)
275 return len;
276
277 __kfifo_out_data(fifo, to, len, recsize);
278 __kfifo_add_out(fifo, len + recsize);
279 return 0;
280}
281EXPORT_SYMBOL(__kfifo_out_n);
152 282
153/** 283/**
154 * __kfifo_get - gets some data from the FIFO, no locking version 284 * kfifo_out - gets some data from the FIFO
155 * @fifo: the fifo to be used. 285 * @fifo: the fifo to be used.
156 * @buffer: where the data must be copied. 286 * @to: where the data must be copied.
157 * @len: the size of the destination buffer. 287 * @len: the size of the destination buffer.
158 * 288 *
159 * This function copies at most @len bytes from the FIFO into the 289 * This function copies at most @len bytes from the FIFO into the
160 * @buffer and returns the number of copied bytes. 290 * @to buffer and returns the number of copied bytes.
161 * 291 *
162 * Note that with only one concurrent reader and one concurrent 292 * Note that with only one concurrent reader and one concurrent
163 * writer, you don't need extra locking to use these functions. 293 * writer, you don't need extra locking to use these functions.
164 */ 294 */
165unsigned int __kfifo_get(struct kfifo *fifo, 295unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len)
166 unsigned char *buffer, unsigned int len)
167{ 296{
168 unsigned int l; 297 len = min(kfifo_len(fifo), len);
169 298
170 len = min(len, fifo->in - fifo->out); 299 __kfifo_out_data(fifo, to, len, 0);
300 __kfifo_add_out(fifo, len);
171 301
172 /* 302 return len;
173 * Ensure that we sample the fifo->in index -before- we 303}
174 * start removing bytes from the kfifo. 304EXPORT_SYMBOL(kfifo_out);
175 */
176 305
177 smp_rmb(); 306/**
307 * kfifo_out_peek - copy some data from the FIFO, but do not remove it
308 * @fifo: the fifo to be used.
309 * @to: where the data must be copied.
310 * @len: the size of the destination buffer.
311 * @offset: offset into the fifo
312 *
313 * This function copies at most @len bytes at @offset from the FIFO
314 * into the @to buffer and returns the number of copied bytes.
315 * The data is not removed from the FIFO.
316 */
317unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len,
318 unsigned offset)
319{
320 len = min(kfifo_len(fifo), len + offset);
178 321
179 /* first get the data from fifo->out until the end of the buffer */ 322 __kfifo_out_data(fifo, to, len, offset);
180 l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); 323 return len;
181 memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); 324}
325EXPORT_SYMBOL(kfifo_out_peek);
182 326
183 /* then get the rest (if any) from the beginning of the buffer */ 327unsigned int __kfifo_out_generic(struct kfifo *fifo,
184 memcpy(buffer + l, fifo->buffer, len - l); 328 void *to, unsigned int len, unsigned int recsize,
329 unsigned int *total)
330{
331 return __kfifo_out_rec(fifo, to, len, recsize, total);
332}
333EXPORT_SYMBOL(__kfifo_out_generic);
185 334
186 /* 335unsigned int __kfifo_from_user_n(struct kfifo *fifo,
187 * Ensure that we remove the bytes from the kfifo -before- 336 const void __user *from, unsigned int len, unsigned int recsize)
188 * we update the fifo->out index. 337{
189 */ 338 unsigned total;
190 339
191 smp_mb(); 340 if (kfifo_avail(fifo) < len + recsize)
341 return len + 1;
192 342
193 fifo->out += len; 343 __kfifo_from_user_data(fifo, from, len, recsize, &total);
344 return total;
345}
346EXPORT_SYMBOL(__kfifo_from_user_n);
194 347
195 return len; 348/**
349 * kfifo_from_user - puts some data from user space into the FIFO
350 * @fifo: the fifo to be used.
351 * @from: pointer to the data to be added.
352 * @len: the length of the data to be added.
353 * @total: the actual returned data length.
354 *
355 * This function copies at most @len bytes from the @from into the
356 * FIFO depending on the free space, and returns -EFAULT/0.
357 *
358 * Note that with only one concurrent reader and one concurrent
359 * writer, you don't need extra locking to use these functions.
360 */
361int kfifo_from_user(struct kfifo *fifo,
362 const void __user *from, unsigned int len, unsigned *total)
363{
364 int ret;
365 len = min(kfifo_avail(fifo), len);
366 ret = __kfifo_from_user_data(fifo, from, len, 0, total);
367 if (ret)
368 return ret;
369 __kfifo_add_in(fifo, len);
370 return 0;
196} 371}
197EXPORT_SYMBOL(__kfifo_get); 372EXPORT_SYMBOL(kfifo_from_user);
373
374unsigned int __kfifo_from_user_generic(struct kfifo *fifo,
375 const void __user *from, unsigned int len, unsigned int recsize)
376{
377 return __kfifo_from_user_rec(fifo, from, len, recsize);
378}
379EXPORT_SYMBOL(__kfifo_from_user_generic);
380
381unsigned int __kfifo_to_user_n(struct kfifo *fifo,
382 void __user *to, unsigned int len, unsigned int reclen,
383 unsigned int recsize)
384{
385 unsigned int ret, total;
386
387 if (kfifo_len(fifo) < reclen + recsize)
388 return len;
389
390 ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total);
391
392 if (likely(ret == 0))
393 __kfifo_add_out(fifo, reclen + recsize);
394
395 return total;
396}
397EXPORT_SYMBOL(__kfifo_to_user_n);
398
399/**
400 * kfifo_to_user - gets data from the FIFO and write it to user space
401 * @fifo: the fifo to be used.
402 * @to: where the data must be copied.
403 * @len: the size of the destination buffer.
404 * @lenout: pointer to output variable with copied data
405 *
406 * This function copies at most @len bytes from the FIFO into the
407 * @to buffer and returns 0 or -EFAULT.
408 *
409 * Note that with only one concurrent reader and one concurrent
410 * writer, you don't need extra locking to use these functions.
411 */
412int kfifo_to_user(struct kfifo *fifo,
413 void __user *to, unsigned int len, unsigned *lenout)
414{
415 int ret;
416 len = min(kfifo_len(fifo), len);
417 ret = __kfifo_to_user_data(fifo, to, len, 0, lenout);
418 __kfifo_add_out(fifo, *lenout);
419 return ret;
420}
421EXPORT_SYMBOL(kfifo_to_user);
422
423unsigned int __kfifo_to_user_generic(struct kfifo *fifo,
424 void __user *to, unsigned int len, unsigned int recsize,
425 unsigned int *total)
426{
427 return __kfifo_to_user_rec(fifo, to, len, recsize, total);
428}
429EXPORT_SYMBOL(__kfifo_to_user_generic);
430
431unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize)
432{
433 if (recsize == 0)
434 return kfifo_avail(fifo);
435
436 return __kfifo_peek_n(fifo, recsize);
437}
438EXPORT_SYMBOL(__kfifo_peek_generic);
439
440void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize)
441{
442 __kfifo_skip_rec(fifo, recsize);
443}
444EXPORT_SYMBOL(__kfifo_skip_generic);
445
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 2eb517e23514..761fdd2b3034 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -583,6 +583,9 @@ static void kgdb_wait(struct pt_regs *regs)
583 smp_wmb(); 583 smp_wmb();
584 atomic_set(&cpu_in_kgdb[cpu], 1); 584 atomic_set(&cpu_in_kgdb[cpu], 1);
585 585
586 /* Disable any cpu specific hw breakpoints */
587 kgdb_disable_hw_debug(regs);
588
586 /* Wait till primary CPU is done with debugging */ 589 /* Wait till primary CPU is done with debugging */
587 while (atomic_read(&passive_cpu_wait[cpu])) 590 while (atomic_read(&passive_cpu_wait[cpu]))
588 cpu_relax(); 591 cpu_relax();
@@ -596,7 +599,7 @@ static void kgdb_wait(struct pt_regs *regs)
596 599
597 /* Signal the primary CPU that we are done: */ 600 /* Signal the primary CPU that we are done: */
598 atomic_set(&cpu_in_kgdb[cpu], 0); 601 atomic_set(&cpu_in_kgdb[cpu], 0);
599 touch_softlockup_watchdog(); 602 touch_softlockup_watchdog_sync();
600 clocksource_touch_watchdog(); 603 clocksource_touch_watchdog();
601 local_irq_restore(flags); 604 local_irq_restore(flags);
602} 605}
@@ -1450,7 +1453,7 @@ acquirelock:
1450 (kgdb_info[cpu].task && 1453 (kgdb_info[cpu].task &&
1451 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { 1454 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1452 atomic_set(&kgdb_active, -1); 1455 atomic_set(&kgdb_active, -1);
1453 touch_softlockup_watchdog(); 1456 touch_softlockup_watchdog_sync();
1454 clocksource_touch_watchdog(); 1457 clocksource_touch_watchdog();
1455 local_irq_restore(flags); 1458 local_irq_restore(flags);
1456 1459
@@ -1550,7 +1553,7 @@ kgdb_restore:
1550 } 1553 }
1551 /* Free kgdb_active */ 1554 /* Free kgdb_active */
1552 atomic_set(&kgdb_active, -1); 1555 atomic_set(&kgdb_active, -1);
1553 touch_softlockup_watchdog(); 1556 touch_softlockup_watchdog_sync();
1554 clocksource_touch_watchdog(); 1557 clocksource_touch_watchdog();
1555 local_irq_restore(flags); 1558 local_irq_restore(flags);
1556 1559
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 25b103190364..bf0e231d9702 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -520,13 +520,15 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
520 return -ENOMEM; 520 return -ENOMEM;
521 521
522 ret = call_usermodehelper_stdinpipe(sub_info, filp); 522 ret = call_usermodehelper_stdinpipe(sub_info, filp);
523 if (ret < 0) 523 if (ret < 0) {
524 goto out; 524 call_usermodehelper_freeinfo(sub_info);
525 return ret;
526 }
525 527
526 return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); 528 ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
529 if (ret < 0) /* Failed to execute helper, close pipe */
530 filp_close(*filp, NULL);
527 531
528 out:
529 call_usermodehelper_freeinfo(sub_info);
530 return ret; 532 return ret;
531} 533}
532EXPORT_SYMBOL(call_usermodehelper_pipe); 534EXPORT_SYMBOL(call_usermodehelper_pipe);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e5342a344c43..fa034d29cf73 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -42,8 +42,11 @@
42#include <linux/freezer.h> 42#include <linux/freezer.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/sysctl.h>
45#include <linux/kdebug.h> 46#include <linux/kdebug.h>
46#include <linux/memory.h> 47#include <linux/memory.h>
48#include <linux/ftrace.h>
49#include <linux/cpu.h>
47 50
48#include <asm-generic/sections.h> 51#include <asm-generic/sections.h>
49#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
@@ -93,6 +96,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
93 {"native_get_debugreg",}, 96 {"native_get_debugreg",},
94 {"irq_entries_start",}, 97 {"irq_entries_start",},
95 {"common_interrupt",}, 98 {"common_interrupt",},
99 {"mcount",}, /* mcount can be called from everywhere */
96 {NULL} /* Terminator */ 100 {NULL} /* Terminator */
97}; 101};
98 102
@@ -103,81 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
103 * stepping on the instruction on a vmalloced/kmalloced/data page 107 * stepping on the instruction on a vmalloced/kmalloced/data page
104 * is a recipe for disaster 108 * is a recipe for disaster
105 */ 109 */
106#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
107
/*
 * Bookkeeping for one page of instruction slots.  In the new revision
 * slot_used[] is a trailing flexible array holding one kprobe_slot_state
 * value per slot; its length is chosen when the page is allocated
 * (see KPROBE_INSN_PAGE_SIZE()).
 */
108struct kprobe_insn_page { 110struct kprobe_insn_page {
109 struct list_head list; 111 struct list_head list;
110 kprobe_opcode_t *insns; /* Page of instruction slots */ 112 kprobe_opcode_t *insns; /* Page of instruction slots */
111 char slot_used[INSNS_PER_PAGE];
112 int nused; 113 int nused;
113 int ngarbage; 114 int ngarbage;
115 char slot_used[];
114}; 116};
115 117
/*
 * Allocation size of a struct kprobe_insn_page whose trailing
 * slot_used[] array has @slots entries.
 */
118#define KPROBE_INSN_PAGE_SIZE(slots) \
119 (offsetof(struct kprobe_insn_page, slot_used) + \
120 (sizeof(char) * (slots)))
121
/*
 * A pool of instruction-slot pages that all share one slot size.
 * insn_size is counted in kprobe_opcode_t units (see slots_per_page());
 * nr_garbage counts slots freed as dirty and not yet collected.
 */
122struct kprobe_insn_cache {
123 struct list_head pages; /* list of kprobe_insn_page */
124 size_t insn_size; /* size of instruction slot */
125 int nr_garbage;
126};
127
128static int slots_per_page(struct kprobe_insn_cache *c)
129{
130 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
131}
132
/*
 * Per-slot state kept in kprobe_insn_page.slot_used[]:
 * SLOT_CLEAN - free, may be handed out by __get_insn_slot()
 * SLOT_DIRTY - freed as dirty, waiting for collect_garbage_slots()
 * SLOT_USED  - currently handed out
 */
116enum kprobe_slot_state { 133enum kprobe_slot_state {
117 SLOT_CLEAN = 0, 134 SLOT_CLEAN = 0,
118 SLOT_DIRTY = 1, 135 SLOT_DIRTY = 1,
119 SLOT_USED = 2, 136 SLOT_USED = 2,
120}; 137};
121 138
122static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 139static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */
123static LIST_HEAD(kprobe_insn_pages); 140static struct kprobe_insn_cache kprobe_insn_slots = {
124static int kprobe_garbage_slots; 141 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
125static int collect_garbage_slots(void); 142 .insn_size = MAX_INSN_SIZE,
126 143 .nr_garbage = 0,
127static int __kprobes check_safety(void) 144};
128{ 145static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
129 int ret = 0;
130#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER)
131 ret = freeze_processes();
132 if (ret == 0) {
133 struct task_struct *p, *q;
134 do_each_thread(p, q) {
135 if (p != current && p->state == TASK_RUNNING &&
136 p->pid != 0) {
137 printk("Check failed: %s is running\n",p->comm);
138 ret = -1;
139 goto loop_end;
140 }
141 } while_each_thread(p, q);
142 }
143loop_end:
144 thaw_processes();
145#else
146 synchronize_sched();
147#endif
148 return ret;
149}
150 146
151/** 147/**
152 * __get_insn_slot() - Find a slot on an executable page for an instruction. 148 * __get_insn_slot() - Find a slot on an executable page for an instruction.
153 * We allocate an executable page if there's no room on existing ones. 149 * We allocate an executable page if there's no room on existing ones.
154 */ 150 */
155static kprobe_opcode_t __kprobes *__get_insn_slot(void) 151static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
156{ 152{
157 struct kprobe_insn_page *kip; 153 struct kprobe_insn_page *kip;
158 154
159 retry: 155 retry:
160 list_for_each_entry(kip, &kprobe_insn_pages, list) { 156 list_for_each_entry(kip, &c->pages, list) {
161 if (kip->nused < INSNS_PER_PAGE) { 157 if (kip->nused < slots_per_page(c)) {
162 int i; 158 int i;
163 for (i = 0; i < INSNS_PER_PAGE; i++) { 159 for (i = 0; i < slots_per_page(c); i++) {
164 if (kip->slot_used[i] == SLOT_CLEAN) { 160 if (kip->slot_used[i] == SLOT_CLEAN) {
165 kip->slot_used[i] = SLOT_USED; 161 kip->slot_used[i] = SLOT_USED;
166 kip->nused++; 162 kip->nused++;
167 return kip->insns + (i * MAX_INSN_SIZE); 163 return kip->insns + (i * c->insn_size);
168 } 164 }
169 } 165 }
170 /* Surprise! No unused slots. Fix kip->nused. */ 166 /* kip->nused is broken. Fix it. */
171 kip->nused = INSNS_PER_PAGE; 167 kip->nused = slots_per_page(c);
168 WARN_ON(1);
172 } 169 }
173 } 170 }
174 171
175 /* If there are any garbage slots, collect it and try again. */ 172 /* If there are any garbage slots, collect it and try again. */
176 if (kprobe_garbage_slots && collect_garbage_slots() == 0) { 173 if (c->nr_garbage && collect_garbage_slots(c) == 0)
177 goto retry; 174 goto retry;
178 } 175
179 /* All out of space. Need to allocate a new page. Use slot 0. */ 176 /* All out of space. Need to allocate a new page. */
180 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 177 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
181 if (!kip) 178 if (!kip)
182 return NULL; 179 return NULL;
183 180
@@ -192,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
192 return NULL; 189 return NULL;
193 } 190 }
194 INIT_LIST_HEAD(&kip->list); 191 INIT_LIST_HEAD(&kip->list);
195 list_add(&kip->list, &kprobe_insn_pages); 192 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
196 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
197 kip->slot_used[0] = SLOT_USED; 193 kip->slot_used[0] = SLOT_USED;
198 kip->nused = 1; 194 kip->nused = 1;
199 kip->ngarbage = 0; 195 kip->ngarbage = 0;
196 list_add(&kip->list, &c->pages);
200 return kip->insns; 197 return kip->insns;
201} 198}
202 199
200
203kprobe_opcode_t __kprobes *get_insn_slot(void) 201kprobe_opcode_t __kprobes *get_insn_slot(void)
204{ 202{
205 kprobe_opcode_t *ret; 203 kprobe_opcode_t *ret = NULL;
204
206 mutex_lock(&kprobe_insn_mutex); 205 mutex_lock(&kprobe_insn_mutex);
207 ret = __get_insn_slot(); 206 ret = __get_insn_slot(&kprobe_insn_slots);
208 mutex_unlock(&kprobe_insn_mutex); 207 mutex_unlock(&kprobe_insn_mutex);
208
209 return ret; 209 return ret;
210} 210}
211 211
@@ -221,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
221 * so as not to have to set it up again the 221 * so as not to have to set it up again the
222 * next time somebody inserts a probe. 222 * next time somebody inserts a probe.
223 */ 223 */
224 if (!list_is_singular(&kprobe_insn_pages)) { 224 if (!list_is_singular(&kip->list)) {
225 list_del(&kip->list); 225 list_del(&kip->list);
226 module_free(NULL, kip->insns); 226 module_free(NULL, kip->insns);
227 kfree(kip); 227 kfree(kip);
@@ -231,52 +231,84 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
231 return 0; 231 return 0;
232} 232}
233 233
234static int __kprobes collect_garbage_slots(void) 234static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
235{ 235{
236 struct kprobe_insn_page *kip, *next; 236 struct kprobe_insn_page *kip, *next;
237 237
238 /* Ensure no-one is preepmted on the garbages */ 238 /* Ensure no-one is interrupted on the garbages */
239 if (check_safety()) 239 synchronize_sched();
240 return -EAGAIN;
241 240
242 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { 241 list_for_each_entry_safe(kip, next, &c->pages, list) {
243 int i; 242 int i;
244 if (kip->ngarbage == 0) 243 if (kip->ngarbage == 0)
245 continue; 244 continue;
246 kip->ngarbage = 0; /* we will collect all garbages */ 245 kip->ngarbage = 0; /* we will collect all garbages */
247 for (i = 0; i < INSNS_PER_PAGE; i++) { 246 for (i = 0; i < slots_per_page(c); i++) {
248 if (kip->slot_used[i] == SLOT_DIRTY && 247 if (kip->slot_used[i] == SLOT_DIRTY &&
249 collect_one_slot(kip, i)) 248 collect_one_slot(kip, i))
250 break; 249 break;
251 } 250 }
252 } 251 }
253 kprobe_garbage_slots = 0; 252 c->nr_garbage = 0;
254 return 0; 253 return 0;
255} 254}
256 255
257void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 256static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
257 kprobe_opcode_t *slot, int dirty)
258{ 258{
259 struct kprobe_insn_page *kip; 259 struct kprobe_insn_page *kip;
260 260
261 mutex_lock(&kprobe_insn_mutex); 261 list_for_each_entry(kip, &c->pages, list) {
262 list_for_each_entry(kip, &kprobe_insn_pages, list) { 262 long idx = ((long)slot - (long)kip->insns) / c->insn_size;
263 if (kip->insns <= slot && 263 if (idx >= 0 && idx < slots_per_page(c)) {
264 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 264 WARN_ON(kip->slot_used[idx] != SLOT_USED);
265 int i = (slot - kip->insns) / MAX_INSN_SIZE;
266 if (dirty) { 265 if (dirty) {
267 kip->slot_used[i] = SLOT_DIRTY; 266 kip->slot_used[idx] = SLOT_DIRTY;
268 kip->ngarbage++; 267 kip->ngarbage++;
268 if (++c->nr_garbage > slots_per_page(c))
269 collect_garbage_slots(c);
269 } else 270 } else
270 collect_one_slot(kip, i); 271 collect_one_slot(kip, idx);
271 break; 272 return;
272 } 273 }
273 } 274 }
275 /* Could not free this slot. */
276 WARN_ON(1);
277}
274 278
275 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) 279void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
276 collect_garbage_slots(); 280{
277 281 mutex_lock(&kprobe_insn_mutex);
282 __free_insn_slot(&kprobe_insn_slots, slot, dirty);
278 mutex_unlock(&kprobe_insn_mutex); 283 mutex_unlock(&kprobe_insn_mutex);
279} 284}
285#ifdef CONFIG_OPTPROBES
/*
 * Second slot cache with its own mutex, used by get_optinsn_slot() and
 * free_optinsn_slot().  insn_size is not set by this initializer.
 */
286/* For optimized_kprobe buffer */
287static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */
288static struct kprobe_insn_cache kprobe_optinsn_slots = {
289 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
290 /* .insn_size is initialized later */
291 .nr_garbage = 0,
292};
293/* Get a slot for optimized_kprobe buffer */
294kprobe_opcode_t __kprobes *get_optinsn_slot(void)
295{
296 kprobe_opcode_t *ret = NULL;
297
298 mutex_lock(&kprobe_optinsn_mutex);
299 ret = __get_insn_slot(&kprobe_optinsn_slots);
300 mutex_unlock(&kprobe_optinsn_mutex);
301
302 return ret;
303}
304
305void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
306{
307 mutex_lock(&kprobe_optinsn_mutex);
308 __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
309 mutex_unlock(&kprobe_optinsn_mutex);
310}
311#endif
280#endif 312#endif
281 313
282/* We have preemption disabled.. so it is safe to use __ versions */ 314/* We have preemption disabled.. so it is safe to use __ versions */
@@ -307,23 +339,401 @@ struct kprobe __kprobes *get_kprobe(void *addr)
307 if (p->addr == addr) 339 if (p->addr == addr)
308 return p; 340 return p;
309 } 341 }
342
310 return NULL; 343 return NULL;
311} 344}
312 345
346static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
347
348/* Return true if the kprobe is an aggregator */
349static inline int kprobe_aggrprobe(struct kprobe *p)
350{
351 return p->pre_handler == aggr_pre_handler;
352}
353
354/*
355 * Keep all fields in the kprobe consistent
356 */
357static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
358{
359 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
360 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
361}
362
363#ifdef CONFIG_OPTPROBES
364/* NOTE: change this value only with kprobe_mutex held */
365static bool kprobes_allow_optimization;
366
367/*
368 * Call all pre_handler on the list, but ignores its return value.
369 * This must be called from arch-dep optimized caller.
370 */
371void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
372{
373 struct kprobe *kp;
374
375 list_for_each_entry_rcu(kp, &p->list, list) {
376 if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
377 set_kprobe_instance(kp);
378 kp->pre_handler(kp, regs);
379 }
380 reset_kprobe_instance();
381 }
382}
383
384/* Return true(!0) if the kprobe is ready for optimization. */
385static inline int kprobe_optready(struct kprobe *p)
386{
387 struct optimized_kprobe *op;
388
389 if (kprobe_aggrprobe(p)) {
390 op = container_of(p, struct optimized_kprobe, kp);
391 return arch_prepared_optinsn(&op->optinsn);
392 }
393
394 return 0;
395}
396
397/*
398 * Return an optimized kprobe whose optimizing code replaces
399 * instructions including addr (exclude breakpoint).
400 */
401struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
402{
403 int i;
404 struct kprobe *p = NULL;
405 struct optimized_kprobe *op;
406
407 /* Don't check i == 0, since that is a breakpoint case. */
408 for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
409 p = get_kprobe((void *)(addr - i));
410
411 if (p && kprobe_optready(p)) {
412 op = container_of(p, struct optimized_kprobe, kp);
413 if (arch_within_optimized_kprobe(op, addr))
414 return p;
415 }
416
417 return NULL;
418}
419
420/* Optimization staging list, protected by kprobe_mutex */
421static LIST_HEAD(optimizing_list);
422
/* Deferred work item that runs kprobe_optimizer() over optimizing_list */
423static void kprobe_optimizer(struct work_struct *work);
424static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
/* Delay value passed to schedule_delayed_work() when queueing optimizing_work */
425#define OPTIMIZE_DELAY 5
426
427/* Kprobe jump optimizer */
428static __kprobes void kprobe_optimizer(struct work_struct *work)
429{
430 struct optimized_kprobe *op, *tmp;
431
432 /* Lock modules while optimizing kprobes */
433 mutex_lock(&module_mutex);
434 mutex_lock(&kprobe_mutex);
435 if (kprobes_all_disarmed || !kprobes_allow_optimization)
436 goto end;
437
438 /*
439 * Wait for quiesence period to ensure all running interrupts
440 * are done. Because optprobe may modify multiple instructions
441 * there is a chance that Nth instruction is interrupted. In that
442 * case, running interrupt can return to 2nd-Nth byte of jump
443 * instruction. This wait is for avoiding it.
444 */
445 synchronize_sched();
446
447 /*
448 * The optimization/unoptimization refers online_cpus via
449 * stop_machine() and cpu-hotplug modifies online_cpus.
450 * And same time, text_mutex will be held in cpu-hotplug and here.
451 * This combination can cause a deadlock (cpu-hotplug try to lock
452 * text_mutex but stop_machine can not be done because online_cpus
453 * has been changed)
454 * To avoid this deadlock, we need to call get_online_cpus()
455 * for preventing cpu-hotplug outside of text_mutex locking.
456 */
457 get_online_cpus();
458 mutex_lock(&text_mutex);
459 list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
460 WARN_ON(kprobe_disabled(&op->kp));
461 if (arch_optimize_kprobe(op) < 0)
462 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
463 list_del_init(&op->list);
464 }
465 mutex_unlock(&text_mutex);
466 put_online_cpus();
467end:
468 mutex_unlock(&kprobe_mutex);
469 mutex_unlock(&module_mutex);
470}
471
472/* Optimize kprobe if p is ready to be optimized */
473static __kprobes void optimize_kprobe(struct kprobe *p)
474{
475 struct optimized_kprobe *op;
476
477 /* Check if the kprobe is disabled or not ready for optimization. */
478 if (!kprobe_optready(p) || !kprobes_allow_optimization ||
479 (kprobe_disabled(p) || kprobes_all_disarmed))
480 return;
481
482 /* Both of break_handler and post_handler are not supported. */
483 if (p->break_handler || p->post_handler)
484 return;
485
486 op = container_of(p, struct optimized_kprobe, kp);
487
488 /* Check there is no other kprobes at the optimized instructions */
489 if (arch_check_optimized_kprobe(op) < 0)
490 return;
491
492 /* Check if it is already optimized. */
493 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
494 return;
495
496 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
497 list_add(&op->list, &optimizing_list);
498 if (!delayed_work_pending(&optimizing_work))
499 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
500}
501
502/* Unoptimize a kprobe if p is optimized */
503static __kprobes void unoptimize_kprobe(struct kprobe *p)
504{
505 struct optimized_kprobe *op;
506
507 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
508 op = container_of(p, struct optimized_kprobe, kp);
509 if (!list_empty(&op->list))
510 /* Dequeue from the optimization queue */
511 list_del_init(&op->list);
512 else
513 /* Replace jump with break */
514 arch_unoptimize_kprobe(op);
515 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
516 }
517}
518
519/* Remove optimized instructions */
520static void __kprobes kill_optimized_kprobe(struct kprobe *p)
521{
522 struct optimized_kprobe *op;
523
524 op = container_of(p, struct optimized_kprobe, kp);
525 if (!list_empty(&op->list)) {
526 /* Dequeue from the optimization queue */
527 list_del_init(&op->list);
528 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
529 }
530 /* Don't unoptimize, because the target code will be freed. */
531 arch_remove_optimized_kprobe(op);
532}
533
534/* Try to prepare optimized instructions */
535static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
536{
537 struct optimized_kprobe *op;
538
539 op = container_of(p, struct optimized_kprobe, kp);
540 arch_prepare_optimized_kprobe(op);
541}
542
543/* Free optimized instructions and optimized_kprobe */
544static __kprobes void free_aggr_kprobe(struct kprobe *p)
545{
546 struct optimized_kprobe *op;
547
548 op = container_of(p, struct optimized_kprobe, kp);
549 arch_remove_optimized_kprobe(op);
550 kfree(op);
551}
552
553/* Allocate new optimized_kprobe and try to prepare optimized instructions */
554static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
555{
556 struct optimized_kprobe *op;
557
558 op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
559 if (!op)
560 return NULL;
561
562 INIT_LIST_HEAD(&op->list);
563 op->kp.addr = p->addr;
564 arch_prepare_optimized_kprobe(op);
565
566 return &op->kp;
567}
568
569static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
570
571/*
572 * Prepare an optimized_kprobe and optimize it
573 * NOTE: p must be a normal registered kprobe
574 */
575static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
576{
577 struct kprobe *ap;
578 struct optimized_kprobe *op;
579
580 ap = alloc_aggr_kprobe(p);
581 if (!ap)
582 return;
583
584 op = container_of(ap, struct optimized_kprobe, kp);
585 if (!arch_prepared_optinsn(&op->optinsn)) {
586 /* If failed to setup optimizing, fallback to kprobe */
587 free_aggr_kprobe(ap);
588 return;
589 }
590
591 init_aggr_kprobe(ap, p);
592 optimize_kprobe(ap);
593}
594
595#ifdef CONFIG_SYSCTL
596static void __kprobes optimize_all_kprobes(void)
597{
598 struct hlist_head *head;
599 struct hlist_node *node;
600 struct kprobe *p;
601 unsigned int i;
602
603 /* If optimization is already allowed, just return */
604 if (kprobes_allow_optimization)
605 return;
606
607 kprobes_allow_optimization = true;
608 mutex_lock(&text_mutex);
609 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
610 head = &kprobe_table[i];
611 hlist_for_each_entry_rcu(p, node, head, hlist)
612 if (!kprobe_disabled(p))
613 optimize_kprobe(p);
614 }
615 mutex_unlock(&text_mutex);
616 printk(KERN_INFO "Kprobes globally optimized\n");
617}
618
619static void __kprobes unoptimize_all_kprobes(void)
620{
621 struct hlist_head *head;
622 struct hlist_node *node;
623 struct kprobe *p;
624 unsigned int i;
625
626 /* If optimization is already prohibited, just return */
627 if (!kprobes_allow_optimization)
628 return;
629
630 kprobes_allow_optimization = false;
631 printk(KERN_INFO "Kprobes globally unoptimized\n");
632 get_online_cpus(); /* For avoiding text_mutex deadlock */
633 mutex_lock(&text_mutex);
634 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
635 head = &kprobe_table[i];
636 hlist_for_each_entry_rcu(p, node, head, hlist) {
637 if (!kprobe_disabled(p))
638 unoptimize_kprobe(p);
639 }
640 }
641
642 mutex_unlock(&text_mutex);
643 put_online_cpus();
644 /* Allow all currently running kprobes to complete */
645 synchronize_sched();
646}
647
648int sysctl_kprobes_optimization;
649int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
650 void __user *buffer, size_t *length,
651 loff_t *ppos)
652{
653 int ret;
654
655 mutex_lock(&kprobe_mutex);
656 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
657 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
658
659 if (sysctl_kprobes_optimization)
660 optimize_all_kprobes();
661 else
662 unoptimize_all_kprobes();
663 mutex_unlock(&kprobe_mutex);
664
665 return ret;
666}
667#endif /* CONFIG_SYSCTL */
668
669static void __kprobes __arm_kprobe(struct kprobe *p)
670{
671 struct kprobe *old_p;
672
673 /* Check collision with other optimized kprobes */
674 old_p = get_optimized_kprobe((unsigned long)p->addr);
675 if (unlikely(old_p))
676 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
677
678 arch_arm_kprobe(p);
679 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
680}
681
682static void __kprobes __disarm_kprobe(struct kprobe *p)
683{
684 struct kprobe *old_p;
685
686 unoptimize_kprobe(p); /* Try to unoptimize */
687 arch_disarm_kprobe(p);
688
689 /* If another kprobe was blocked, optimize it. */
690 old_p = get_optimized_kprobe((unsigned long)p->addr);
691 if (unlikely(old_p))
692 optimize_kprobe(old_p);
693}
694
695#else /* !CONFIG_OPTPROBES */
696
/*
 * Without CONFIG_OPTPROBES the optimization hooks expand to nothing and
 * __arm_kprobe()/__disarm_kprobe() map directly onto the arch primitives.
 */
697#define optimize_kprobe(p) do {} while (0)
698#define unoptimize_kprobe(p) do {} while (0)
699#define kill_optimized_kprobe(p) do {} while (0)
700#define prepare_optimized_kprobe(p) do {} while (0)
701#define try_to_optimize_kprobe(p) do {} while (0)
702#define __arm_kprobe(p) arch_arm_kprobe(p)
703#define __disarm_kprobe(p) arch_disarm_kprobe(p)
704
/* !CONFIG_OPTPROBES: an aggregator is a plain struct kprobe; just kfree() it. */
705static __kprobes void free_aggr_kprobe(struct kprobe *p)
706{
707 kfree(p);
708}
709
/*
 * !CONFIG_OPTPROBES: allocate a zeroed plain struct kprobe as aggregator.
 * @p is not used here.  Returns NULL if the allocation fails.
 */
710static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
711{
712 return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
713}
714#endif /* CONFIG_OPTPROBES */
715
313/* Arm a kprobe with text_mutex */ 716/* Arm a kprobe with text_mutex */
314static void __kprobes arm_kprobe(struct kprobe *kp) 717static void __kprobes arm_kprobe(struct kprobe *kp)
315{ 718{
719 /*
720 * Here, since __arm_kprobe() doesn't use stop_machine(),
721 * this doesn't cause deadlock on text_mutex. So, we don't
722 * need get_online_cpus().
723 */
316 mutex_lock(&text_mutex); 724 mutex_lock(&text_mutex);
317 arch_arm_kprobe(kp); 725 __arm_kprobe(kp);
318 mutex_unlock(&text_mutex); 726 mutex_unlock(&text_mutex);
319} 727}
320 728
321/* Disarm a kprobe with text_mutex */ 729/* Disarm a kprobe with text_mutex */
322static void __kprobes disarm_kprobe(struct kprobe *kp) 730static void __kprobes disarm_kprobe(struct kprobe *kp)
323{ 731{
732 get_online_cpus(); /* For avoiding text_mutex deadlock */
324 mutex_lock(&text_mutex); 733 mutex_lock(&text_mutex);
325 arch_disarm_kprobe(kp); 734 __disarm_kprobe(kp);
326 mutex_unlock(&text_mutex); 735 mutex_unlock(&text_mutex);
736 put_online_cpus();
327} 737}
328 738
329/* 739/*
@@ -392,7 +802,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
392void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) 802void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
393{ 803{
394 struct kprobe *kp; 804 struct kprobe *kp;
395 if (p->pre_handler != aggr_pre_handler) { 805 if (!kprobe_aggrprobe(p)) {
396 p->nmissed++; 806 p->nmissed++;
397 } else { 807 } else {
398 list_for_each_entry_rcu(kp, &p->list, list) 808 list_for_each_entry_rcu(kp, &p->list, list)
@@ -516,21 +926,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
516} 926}
517 927
518/* 928/*
519 * Keep all fields in the kprobe consistent
520 */
521static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
522{
523 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
524 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
525}
526
527/*
528* Add the new probe to ap->list. Fail if this is the 929* Add the new probe to ap->list. Fail if this is the
529* second jprobe at the address - two jprobes can't coexist 930* second jprobe at the address - two jprobes can't coexist
530*/ 931*/
531static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) 932static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
532{ 933{
533 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 934 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
935
936 if (p->break_handler || p->post_handler)
937 unoptimize_kprobe(ap); /* Fall back to normal kprobe */
938
534 if (p->break_handler) { 939 if (p->break_handler) {
535 if (ap->break_handler) 940 if (ap->break_handler)
536 return -EEXIST; 941 return -EEXIST;
@@ -545,7 +950,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
545 ap->flags &= ~KPROBE_FLAG_DISABLED; 950 ap->flags &= ~KPROBE_FLAG_DISABLED;
546 if (!kprobes_all_disarmed) 951 if (!kprobes_all_disarmed)
547 /* Arm the breakpoint again. */ 952 /* Arm the breakpoint again. */
548 arm_kprobe(ap); 953 __arm_kprobe(ap);
549 } 954 }
550 return 0; 955 return 0;
551} 956}
@@ -554,12 +959,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
554 * Fill in the required fields of the "manager kprobe". Replace the 959 * Fill in the required fields of the "manager kprobe". Replace the
555 * earlier kprobe in the hlist with the manager kprobe 960 * earlier kprobe in the hlist with the manager kprobe
556 */ 961 */
557static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 962static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
558{ 963{
964 /* Copy p's insn slot to ap */
559 copy_kprobe(p, ap); 965 copy_kprobe(p, ap);
560 flush_insn_slot(ap); 966 flush_insn_slot(ap);
561 ap->addr = p->addr; 967 ap->addr = p->addr;
562 ap->flags = p->flags; 968 ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
563 ap->pre_handler = aggr_pre_handler; 969 ap->pre_handler = aggr_pre_handler;
564 ap->fault_handler = aggr_fault_handler; 970 ap->fault_handler = aggr_fault_handler;
565 /* We don't care the kprobe which has gone. */ 971 /* We don't care the kprobe which has gone. */
@@ -569,8 +975,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
569 ap->break_handler = aggr_break_handler; 975 ap->break_handler = aggr_break_handler;
570 976
571 INIT_LIST_HEAD(&ap->list); 977 INIT_LIST_HEAD(&ap->list);
572 list_add_rcu(&p->list, &ap->list); 978 INIT_HLIST_NODE(&ap->hlist);
573 979
980 list_add_rcu(&p->list, &ap->list);
574 hlist_replace_rcu(&p->hlist, &ap->hlist); 981 hlist_replace_rcu(&p->hlist, &ap->hlist);
575} 982}
576 983
@@ -584,12 +991,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
584 int ret = 0; 991 int ret = 0;
585 struct kprobe *ap = old_p; 992 struct kprobe *ap = old_p;
586 993
587 if (old_p->pre_handler != aggr_pre_handler) { 994 if (!kprobe_aggrprobe(old_p)) {
588 /* If old_p is not an aggr_probe, create new aggr_kprobe. */ 995 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
589 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 996 ap = alloc_aggr_kprobe(old_p);
590 if (!ap) 997 if (!ap)
591 return -ENOMEM; 998 return -ENOMEM;
592 add_aggr_kprobe(ap, old_p); 999 init_aggr_kprobe(ap, old_p);
593 } 1000 }
594 1001
595 if (kprobe_gone(ap)) { 1002 if (kprobe_gone(ap)) {
@@ -608,6 +1015,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
608 */ 1015 */
609 return ret; 1016 return ret;
610 1017
1018 /* Prepare optimized instructions if possible. */
1019 prepare_optimized_kprobe(ap);
1020
611 /* 1021 /*
612 * Clear gone flag to prevent allocating new slot again, and 1022 * Clear gone flag to prevent allocating new slot again, and
613 * set disabled flag because it is not armed yet. 1023 * set disabled flag because it is not armed yet.
@@ -616,6 +1026,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
616 | KPROBE_FLAG_DISABLED; 1026 | KPROBE_FLAG_DISABLED;
617 } 1027 }
618 1028
1029 /* Copy ap's insn slot to p */
619 copy_kprobe(ap, p); 1030 copy_kprobe(ap, p);
620 return add_new_kprobe(ap, p); 1031 return add_new_kprobe(ap, p);
621} 1032}
@@ -728,7 +1139,8 @@ int __kprobes register_kprobe(struct kprobe *p)
728 1139
729 preempt_disable(); 1140 preempt_disable();
730 if (!kernel_text_address((unsigned long) p->addr) || 1141 if (!kernel_text_address((unsigned long) p->addr) ||
731 in_kprobes_functions((unsigned long) p->addr)) { 1142 in_kprobes_functions((unsigned long) p->addr) ||
1143 ftrace_text_reserved(p->addr, p->addr)) {
732 preempt_enable(); 1144 preempt_enable();
733 return -EINVAL; 1145 return -EINVAL;
734 } 1146 }
@@ -765,27 +1177,34 @@ int __kprobes register_kprobe(struct kprobe *p)
765 p->nmissed = 0; 1177 p->nmissed = 0;
766 INIT_LIST_HEAD(&p->list); 1178 INIT_LIST_HEAD(&p->list);
767 mutex_lock(&kprobe_mutex); 1179 mutex_lock(&kprobe_mutex);
1180
1181 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1182 mutex_lock(&text_mutex);
1183
768 old_p = get_kprobe(p->addr); 1184 old_p = get_kprobe(p->addr);
769 if (old_p) { 1185 if (old_p) {
1186 /* Since this may unoptimize old_p, locking text_mutex. */
770 ret = register_aggr_kprobe(old_p, p); 1187 ret = register_aggr_kprobe(old_p, p);
771 goto out; 1188 goto out;
772 } 1189 }
773 1190
774 mutex_lock(&text_mutex);
775 ret = arch_prepare_kprobe(p); 1191 ret = arch_prepare_kprobe(p);
776 if (ret) 1192 if (ret)
777 goto out_unlock_text; 1193 goto out;
778 1194
779 INIT_HLIST_NODE(&p->hlist); 1195 INIT_HLIST_NODE(&p->hlist);
780 hlist_add_head_rcu(&p->hlist, 1196 hlist_add_head_rcu(&p->hlist,
781 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 1197 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
782 1198
783 if (!kprobes_all_disarmed && !kprobe_disabled(p)) 1199 if (!kprobes_all_disarmed && !kprobe_disabled(p))
784 arch_arm_kprobe(p); 1200 __arm_kprobe(p);
1201
1202 /* Try to optimize kprobe */
1203 try_to_optimize_kprobe(p);
785 1204
786out_unlock_text:
787 mutex_unlock(&text_mutex);
788out: 1205out:
1206 mutex_unlock(&text_mutex);
1207 put_online_cpus();
789 mutex_unlock(&kprobe_mutex); 1208 mutex_unlock(&kprobe_mutex);
790 1209
791 if (probed_mod) 1210 if (probed_mod)
@@ -807,7 +1226,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
807 return -EINVAL; 1226 return -EINVAL;
808 1227
809 if (old_p == p || 1228 if (old_p == p ||
810 (old_p->pre_handler == aggr_pre_handler && 1229 (kprobe_aggrprobe(old_p) &&
811 list_is_singular(&old_p->list))) { 1230 list_is_singular(&old_p->list))) {
812 /* 1231 /*
813 * Only probe on the hash list. Disarm only if kprobes are 1232 * Only probe on the hash list. Disarm only if kprobes are
@@ -815,7 +1234,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
815 * already have been removed. We save on flushing icache. 1234 * already have been removed. We save on flushing icache.
816 */ 1235 */
817 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1236 if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
818 disarm_kprobe(p); 1237 disarm_kprobe(old_p);
819 hlist_del_rcu(&old_p->hlist); 1238 hlist_del_rcu(&old_p->hlist);
820 } else { 1239 } else {
821 if (p->break_handler && !kprobe_gone(p)) 1240 if (p->break_handler && !kprobe_gone(p))
@@ -831,8 +1250,13 @@ noclean:
831 list_del_rcu(&p->list); 1250 list_del_rcu(&p->list);
832 if (!kprobe_disabled(old_p)) { 1251 if (!kprobe_disabled(old_p)) {
833 try_to_disable_aggr_kprobe(old_p); 1252 try_to_disable_aggr_kprobe(old_p);
834 if (!kprobes_all_disarmed && kprobe_disabled(old_p)) 1253 if (!kprobes_all_disarmed) {
835 disarm_kprobe(old_p); 1254 if (kprobe_disabled(old_p))
1255 disarm_kprobe(old_p);
1256 else
1257 /* Try to optimize this probe again */
1258 optimize_kprobe(old_p);
1259 }
836 } 1260 }
837 } 1261 }
838 return 0; 1262 return 0;
@@ -849,7 +1273,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
849 old_p = list_entry(p->list.next, struct kprobe, list); 1273 old_p = list_entry(p->list.next, struct kprobe, list);
850 list_del(&p->list); 1274 list_del(&p->list);
851 arch_remove_kprobe(old_p); 1275 arch_remove_kprobe(old_p);
852 kfree(old_p); 1276 free_aggr_kprobe(old_p);
853 } 1277 }
854} 1278}
855 1279
@@ -1035,7 +1459,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1035 /* Pre-allocate memory for max kretprobe instances */ 1459 /* Pre-allocate memory for max kretprobe instances */
1036 if (rp->maxactive <= 0) { 1460 if (rp->maxactive <= 0) {
1037#ifdef CONFIG_PREEMPT 1461#ifdef CONFIG_PREEMPT
1038 rp->maxactive = max(10, 2 * num_possible_cpus()); 1462 rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
1039#else 1463#else
1040 rp->maxactive = num_possible_cpus(); 1464 rp->maxactive = num_possible_cpus();
1041#endif 1465#endif
@@ -1145,7 +1569,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1145 struct kprobe *kp; 1569 struct kprobe *kp;
1146 1570
1147 p->flags |= KPROBE_FLAG_GONE; 1571 p->flags |= KPROBE_FLAG_GONE;
1148 if (p->pre_handler == aggr_pre_handler) { 1572 if (kprobe_aggrprobe(p)) {
1149 /* 1573 /*
1150 * If this is an aggr_kprobe, we have to list all the 1574 * If this is an aggr_kprobe, we have to list all the
1151 * chained probes and mark them GONE. 1575 * chained probes and mark them GONE.
@@ -1154,6 +1578,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1154 kp->flags |= KPROBE_FLAG_GONE; 1578 kp->flags |= KPROBE_FLAG_GONE;
1155 p->post_handler = NULL; 1579 p->post_handler = NULL;
1156 p->break_handler = NULL; 1580 p->break_handler = NULL;
1581 kill_optimized_kprobe(p);
1157 } 1582 }
1158 /* 1583 /*
1159 * Here, we can remove insn_slot safely, because no thread calls 1584 * Here, we can remove insn_slot safely, because no thread calls
@@ -1263,6 +1688,15 @@ static int __init init_kprobes(void)
1263 } 1688 }
1264 } 1689 }
1265 1690
1691#if defined(CONFIG_OPTPROBES)
1692#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
1693 /* Init kprobe_optinsn_slots */
1694 kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
1695#endif
1696 /* By default, kprobes can be optimized */
1697 kprobes_allow_optimization = true;
1698#endif
1699
1266 /* By default, kprobes are armed */ 1700 /* By default, kprobes are armed */
1267 kprobes_all_disarmed = false; 1701 kprobes_all_disarmed = false;
1268 1702
@@ -1281,7 +1715,7 @@ static int __init init_kprobes(void)
1281 1715
1282#ifdef CONFIG_DEBUG_FS 1716#ifdef CONFIG_DEBUG_FS
1283static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, 1717static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1284 const char *sym, int offset,char *modname) 1718 const char *sym, int offset, char *modname, struct kprobe *pp)
1285{ 1719{
1286 char *kprobe_type; 1720 char *kprobe_type;
1287 1721
@@ -1291,19 +1725,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1291 kprobe_type = "j"; 1725 kprobe_type = "j";
1292 else 1726 else
1293 kprobe_type = "k"; 1727 kprobe_type = "k";
1728
1294 if (sym) 1729 if (sym)
1295 seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", 1730 seq_printf(pi, "%p %s %s+0x%x %s ",
1296 p->addr, kprobe_type, sym, offset, 1731 p->addr, kprobe_type, sym, offset,
1297 (modname ? modname : " "), 1732 (modname ? modname : " "));
1298 (kprobe_gone(p) ? "[GONE]" : ""),
1299 ((kprobe_disabled(p) && !kprobe_gone(p)) ?
1300 "[DISABLED]" : ""));
1301 else 1733 else
1302 seq_printf(pi, "%p %s %p %s%s\n", 1734 seq_printf(pi, "%p %s %p ",
1303 p->addr, kprobe_type, p->addr, 1735 p->addr, kprobe_type, p->addr);
1304 (kprobe_gone(p) ? "[GONE]" : ""), 1736
1305 ((kprobe_disabled(p) && !kprobe_gone(p)) ? 1737 if (!pp)
1306 "[DISABLED]" : "")); 1738 pp = p;
1739 seq_printf(pi, "%s%s%s\n",
1740 (kprobe_gone(p) ? "[GONE]" : ""),
1741 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
1742 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
1307} 1743}
1308 1744
1309static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1745static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1339,11 +1775,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
1339 hlist_for_each_entry_rcu(p, node, head, hlist) { 1775 hlist_for_each_entry_rcu(p, node, head, hlist) {
1340 sym = kallsyms_lookup((unsigned long)p->addr, NULL, 1776 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
1341 &offset, &modname, namebuf); 1777 &offset, &modname, namebuf);
1342 if (p->pre_handler == aggr_pre_handler) { 1778 if (kprobe_aggrprobe(p)) {
1343 list_for_each_entry_rcu(kp, &p->list, list) 1779 list_for_each_entry_rcu(kp, &p->list, list)
1344 report_probe(pi, kp, sym, offset, modname); 1780 report_probe(pi, kp, sym, offset, modname, p);
1345 } else 1781 } else
1346 report_probe(pi, p, sym, offset, modname); 1782 report_probe(pi, p, sym, offset, modname, NULL);
1347 } 1783 }
1348 preempt_enable(); 1784 preempt_enable();
1349 return 0; 1785 return 0;
@@ -1421,12 +1857,13 @@ int __kprobes enable_kprobe(struct kprobe *kp)
1421 goto out; 1857 goto out;
1422 } 1858 }
1423 1859
1424 if (!kprobes_all_disarmed && kprobe_disabled(p))
1425 arm_kprobe(p);
1426
1427 p->flags &= ~KPROBE_FLAG_DISABLED;
1428 if (p != kp) 1860 if (p != kp)
1429 kp->flags &= ~KPROBE_FLAG_DISABLED; 1861 kp->flags &= ~KPROBE_FLAG_DISABLED;
1862
1863 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1864 p->flags &= ~KPROBE_FLAG_DISABLED;
1865 arm_kprobe(p);
1866 }
1430out: 1867out:
1431 mutex_unlock(&kprobe_mutex); 1868 mutex_unlock(&kprobe_mutex);
1432 return ret; 1869 return ret;
@@ -1446,12 +1883,13 @@ static void __kprobes arm_all_kprobes(void)
1446 if (!kprobes_all_disarmed) 1883 if (!kprobes_all_disarmed)
1447 goto already_enabled; 1884 goto already_enabled;
1448 1885
1886 /* Arming kprobes doesn't optimize kprobe itself */
1449 mutex_lock(&text_mutex); 1887 mutex_lock(&text_mutex);
1450 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1888 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1451 head = &kprobe_table[i]; 1889 head = &kprobe_table[i];
1452 hlist_for_each_entry_rcu(p, node, head, hlist) 1890 hlist_for_each_entry_rcu(p, node, head, hlist)
1453 if (!kprobe_disabled(p)) 1891 if (!kprobe_disabled(p))
1454 arch_arm_kprobe(p); 1892 __arm_kprobe(p);
1455 } 1893 }
1456 mutex_unlock(&text_mutex); 1894 mutex_unlock(&text_mutex);
1457 1895
@@ -1478,16 +1916,23 @@ static void __kprobes disarm_all_kprobes(void)
1478 1916
1479 kprobes_all_disarmed = true; 1917 kprobes_all_disarmed = true;
1480 printk(KERN_INFO "Kprobes globally disabled\n"); 1918 printk(KERN_INFO "Kprobes globally disabled\n");
1919
1920 /*
1921 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1922 * because disarming may also unoptimize kprobes.
1923 */
1924 get_online_cpus();
1481 mutex_lock(&text_mutex); 1925 mutex_lock(&text_mutex);
1482 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1926 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1483 head = &kprobe_table[i]; 1927 head = &kprobe_table[i];
1484 hlist_for_each_entry_rcu(p, node, head, hlist) { 1928 hlist_for_each_entry_rcu(p, node, head, hlist) {
1485 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 1929 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1486 arch_disarm_kprobe(p); 1930 __disarm_kprobe(p);
1487 } 1931 }
1488 } 1932 }
1489 1933
1490 mutex_unlock(&text_mutex); 1934 mutex_unlock(&text_mutex);
1935 put_online_cpus();
1491 mutex_unlock(&kprobe_mutex); 1936 mutex_unlock(&kprobe_mutex);
1492 /* Allow all currently running kprobes to complete */ 1937 /* Allow all currently running kprobes to complete */
1493 synchronize_sched(); 1938 synchronize_sched();
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3feaf5a74514..6b1ccc3f0205 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -197,16 +197,8 @@ static int __init ksysfs_init(void)
197 goto group_exit; 197 goto group_exit;
198 } 198 }
199 199
200 /* create the /sys/kernel/uids/ directory */
201 error = uids_sysfs_init();
202 if (error)
203 goto notes_exit;
204
205 return 0; 200 return 0;
206 201
207notes_exit:
208 if (notes_size > 0)
209 sysfs_remove_bin_file(kernel_kobj, &notes_attr);
210group_exit: 202group_exit:
211 sysfs_remove_group(kernel_kobj, &kernel_attr_group); 203 sysfs_remove_group(kernel_kobj, &kernel_attr_group);
212kset_exit: 204kset_exit:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ab7ae57773e1..82ed0ea15194 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create)
101 * 101 *
102 * Description: This helper function creates and names a kernel 102 * Description: This helper function creates and names a kernel
103 * thread. The thread will be stopped: use wake_up_process() to start 103 * thread. The thread will be stopped: use wake_up_process() to start
104 * it. See also kthread_run(), kthread_create_on_cpu(). 104 * it. See also kthread_run().
105 * 105 *
106 * When woken, the thread will run @threadfn() with @data as its 106 * When woken, the thread will run @threadfn() with @data as its
107 * argument. @threadfn() can either call do_exit() directly if it is a 107 * argument. @threadfn() can either call do_exit() directly if it is a
@@ -150,6 +150,29 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
150EXPORT_SYMBOL(kthread_create); 150EXPORT_SYMBOL(kthread_create);
151 151
152/** 152/**
153 * kthread_bind - bind a just-created kthread to a cpu.
154 * @p: thread created by kthread_create().
155 * @cpu: cpu (might not be online, must be possible) for @p to run on.
156 *
157 * Description: This function is equivalent to set_cpus_allowed(),
158 * except that @cpu doesn't need to be online, and the thread must be
159 * stopped (i.e., just returned from kthread_create()).
160 */
161void kthread_bind(struct task_struct *p, unsigned int cpu)
162{
163 /* Must have done schedule() in kthread() before we set_task_cpu */
164 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
165 WARN_ON(1);
166 return;
167 }
168
169 p->cpus_allowed = cpumask_of_cpu(cpu);
170 p->rt.nr_cpus_allowed = 1;
171 p->flags |= PF_THREAD_BOUND;
172}
173EXPORT_SYMBOL(kthread_bind);
174
175/**
153 * kthread_stop - stop a thread created by kthread_create(). 176 * kthread_stop - stop a thread created by kthread_create().
154 * @k: thread created by kthread_create(). 177 * @k: thread created by kthread_create().
155 * 178 *
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 5feaddcdbe49..0c30d0455de1 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2147,7 +2147,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
2147 return ret; 2147 return ret;
2148 2148
2149 return print_irq_inversion_bug(curr, &root, target_entry, 2149 return print_irq_inversion_bug(curr, &root, target_entry,
2150 this, 1, irqclass); 2150 this, 0, irqclass);
2151} 2151}
2152 2152
2153void print_irqtrace_events(struct task_struct *curr) 2153void print_irqtrace_events(struct task_struct *curr)
@@ -3809,3 +3809,21 @@ void lockdep_sys_exit(void)
3809 lockdep_print_held_locks(curr); 3809 lockdep_print_held_locks(curr);
3810 } 3810 }
3811} 3811}
3812
3813void lockdep_rcu_dereference(const char *file, const int line)
3814{
3815 struct task_struct *curr = current;
3816
3817 if (!debug_locks_off())
3818 return;
3819 printk("\n===================================================\n");
3820 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
3821 printk( "---------------------------------------------------\n");
3822 printk("%s:%d invoked rcu_dereference_check() without protection!\n",
3823 file, line);
3824 printk("\nother info that might help us debug this:\n\n");
3825 lockdep_print_held_locks(curr);
3826 printk("\nstack backtrace:\n");
3827 dump_stack();
3828}
3829EXPORT_SYMBOL_GPL(lockdep_rcu_dereference);
diff --git a/kernel/module.c b/kernel/module.c
index a65dc787a27b..e5538d5f00ad 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -474,9 +474,10 @@ static void module_unload_init(struct module *mod)
474 474
475 INIT_LIST_HEAD(&mod->modules_which_use_me); 475 INIT_LIST_HEAD(&mod->modules_which_use_me);
476 for_each_possible_cpu(cpu) 476 for_each_possible_cpu(cpu)
477 local_set(__module_ref_addr(mod, cpu), 0); 477 per_cpu_ptr(mod->refptr, cpu)->count = 0;
478
478 /* Hold reference count during initialization. */ 479 /* Hold reference count during initialization. */
479 local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); 480 __this_cpu_write(mod->refptr->count, 1);
480 /* Backwards compatibility macros put refcount during init. */ 481 /* Backwards compatibility macros put refcount during init. */
481 mod->waiter = current; 482 mod->waiter = current;
482} 483}
@@ -619,7 +620,7 @@ unsigned int module_refcount(struct module *mod)
619 int cpu; 620 int cpu;
620 621
621 for_each_possible_cpu(cpu) 622 for_each_possible_cpu(cpu)
622 total += local_read(__module_ref_addr(mod, cpu)); 623 total += per_cpu_ptr(mod->refptr, cpu)->count;
623 return total; 624 return total;
624} 625}
625EXPORT_SYMBOL(module_refcount); 626EXPORT_SYMBOL(module_refcount);
@@ -796,14 +797,15 @@ static struct module_attribute refcnt = {
796void module_put(struct module *module) 797void module_put(struct module *module)
797{ 798{
798 if (module) { 799 if (module) {
799 unsigned int cpu = get_cpu(); 800 preempt_disable();
800 local_dec(__module_ref_addr(module, cpu)); 801 __this_cpu_dec(module->refptr->count);
802
801 trace_module_put(module, _RET_IP_, 803 trace_module_put(module, _RET_IP_,
802 local_read(__module_ref_addr(module, cpu))); 804 __this_cpu_read(module->refptr->count));
803 /* Maybe they're waiting for us to drop reference? */ 805 /* Maybe they're waiting for us to drop reference? */
804 if (unlikely(!module_is_live(module))) 806 if (unlikely(!module_is_live(module)))
805 wake_up_process(module->waiter); 807 wake_up_process(module->waiter);
806 put_cpu(); 808 preempt_enable();
807 } 809 }
808} 810}
809EXPORT_SYMBOL(module_put); 811EXPORT_SYMBOL(module_put);
@@ -1010,6 +1012,12 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1010 * J. Corbet <corbet@lwn.net> 1012 * J. Corbet <corbet@lwn.net>
1011 */ 1013 */
1012#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 1014#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
1015
1016static inline bool sect_empty(const Elf_Shdr *sect)
1017{
1018 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
1019}
1020
1013struct module_sect_attr 1021struct module_sect_attr
1014{ 1022{
1015 struct module_attribute mattr; 1023 struct module_attribute mattr;
@@ -1051,8 +1059,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1051 1059
1052 /* Count loaded sections and allocate structures */ 1060 /* Count loaded sections and allocate structures */
1053 for (i = 0; i < nsect; i++) 1061 for (i = 0; i < nsect; i++)
1054 if (sechdrs[i].sh_flags & SHF_ALLOC 1062 if (!sect_empty(&sechdrs[i]))
1055 && sechdrs[i].sh_size)
1056 nloaded++; 1063 nloaded++;
1057 size[0] = ALIGN(sizeof(*sect_attrs) 1064 size[0] = ALIGN(sizeof(*sect_attrs)
1058 + nloaded * sizeof(sect_attrs->attrs[0]), 1065 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1070,9 +1077,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1070 sattr = &sect_attrs->attrs[0]; 1077 sattr = &sect_attrs->attrs[0];
1071 gattr = &sect_attrs->grp.attrs[0]; 1078 gattr = &sect_attrs->grp.attrs[0];
1072 for (i = 0; i < nsect; i++) { 1079 for (i = 0; i < nsect; i++) {
1073 if (! (sechdrs[i].sh_flags & SHF_ALLOC)) 1080 if (sect_empty(&sechdrs[i]))
1074 continue;
1075 if (!sechdrs[i].sh_size)
1076 continue; 1081 continue;
1077 sattr->address = sechdrs[i].sh_addr; 1082 sattr->address = sechdrs[i].sh_addr;
1078 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1083 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
@@ -1156,7 +1161,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1156 /* Count notes sections and allocate structures. */ 1161 /* Count notes sections and allocate structures. */
1157 notes = 0; 1162 notes = 0;
1158 for (i = 0; i < nsect; i++) 1163 for (i = 0; i < nsect; i++)
1159 if ((sechdrs[i].sh_flags & SHF_ALLOC) && 1164 if (!sect_empty(&sechdrs[i]) &&
1160 (sechdrs[i].sh_type == SHT_NOTE)) 1165 (sechdrs[i].sh_type == SHT_NOTE))
1161 ++notes; 1166 ++notes;
1162 1167
@@ -1172,7 +1177,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1172 notes_attrs->notes = notes; 1177 notes_attrs->notes = notes;
1173 nattr = &notes_attrs->attrs[0]; 1178 nattr = &notes_attrs->attrs[0];
1174 for (loaded = i = 0; i < nsect; ++i) { 1179 for (loaded = i = 0; i < nsect; ++i) {
1175 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 1180 if (sect_empty(&sechdrs[i]))
1176 continue; 1181 continue;
1177 if (sechdrs[i].sh_type == SHT_NOTE) { 1182 if (sechdrs[i].sh_type == SHT_NOTE) {
1178 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1183 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
@@ -1394,9 +1399,9 @@ static void free_module(struct module *mod)
1394 kfree(mod->args); 1399 kfree(mod->args);
1395 if (mod->percpu) 1400 if (mod->percpu)
1396 percpu_modfree(mod->percpu); 1401 percpu_modfree(mod->percpu);
1397#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 1402#if defined(CONFIG_MODULE_UNLOAD)
1398 if (mod->refptr) 1403 if (mod->refptr)
1399 percpu_modfree(mod->refptr); 1404 free_percpu(mod->refptr);
1400#endif 1405#endif
1401 /* Free lock-classes: */ 1406 /* Free lock-classes: */
1402 lockdep_free_key_range(mod->module_core, mod->core_size); 1407 lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -1910,9 +1915,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
1910 unsigned int i; 1915 unsigned int i;
1911 1916
1912 /* only scan the sections containing data */ 1917 /* only scan the sections containing data */
1913 kmemleak_scan_area(mod->module_core, (unsigned long)mod - 1918 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
1914 (unsigned long)mod->module_core,
1915 sizeof(struct module), GFP_KERNEL);
1916 1919
1917 for (i = 1; i < hdr->e_shnum; i++) { 1920 for (i = 1; i < hdr->e_shnum; i++) {
1918 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 1921 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
@@ -1921,8 +1924,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
1921 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) 1924 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
1922 continue; 1925 continue;
1923 1926
1924 kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr - 1927 kmemleak_scan_area((void *)sechdrs[i].sh_addr,
1925 (unsigned long)mod->module_core,
1926 sechdrs[i].sh_size, GFP_KERNEL); 1928 sechdrs[i].sh_size, GFP_KERNEL);
1927 } 1929 }
1928} 1930}
@@ -2162,9 +2164,8 @@ static noinline struct module *load_module(void __user *umod,
2162 mod = (void *)sechdrs[modindex].sh_addr; 2164 mod = (void *)sechdrs[modindex].sh_addr;
2163 kmemleak_load_module(mod, hdr, sechdrs, secstrings); 2165 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2164 2166
2165#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2167#if defined(CONFIG_MODULE_UNLOAD)
2166 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), 2168 mod->refptr = alloc_percpu(struct module_ref);
2167 mod->name);
2168 if (!mod->refptr) { 2169 if (!mod->refptr) {
2169 err = -ENOMEM; 2170 err = -ENOMEM;
2170 goto free_init; 2171 goto free_init;
@@ -2250,6 +2251,12 @@ static noinline struct module *load_module(void __user *umod,
2250 "_ftrace_events", 2251 "_ftrace_events",
2251 sizeof(*mod->trace_events), 2252 sizeof(*mod->trace_events),
2252 &mod->num_trace_events); 2253 &mod->num_trace_events);
2254 /*
2255 * This section contains pointers to allocated objects in the trace
2256 * code and not scanning it leads to false positives.
2257 */
2258 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2259 mod->num_trace_events, GFP_KERNEL);
2253#endif 2260#endif
2254#ifdef CONFIG_FTRACE_MCOUNT_RECORD 2261#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2255 /* sechdrs[0].sh_size is always zero */ 2262 /* sechdrs[0].sh_size is always zero */
@@ -2390,8 +2397,8 @@ static noinline struct module *load_module(void __user *umod,
2390 kobject_put(&mod->mkobj.kobj); 2397 kobject_put(&mod->mkobj.kobj);
2391 free_unload: 2398 free_unload:
2392 module_unload_free(mod); 2399 module_unload_free(mod);
2393#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2400#if defined(CONFIG_MODULE_UNLOAD)
2394 percpu_modfree(mod->refptr); 2401 free_percpu(mod->refptr);
2395 free_init: 2402 free_init:
2396#endif 2403#endif
2397 module_free(mod, mod->module_init); 2404 module_free(mod, mod->module_init);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index acd24e7643eb..2488ba7eb568 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -78,10 +78,10 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
78 int ret = NOTIFY_DONE; 78 int ret = NOTIFY_DONE;
79 struct notifier_block *nb, *next_nb; 79 struct notifier_block *nb, *next_nb;
80 80
81 nb = rcu_dereference(*nl); 81 nb = rcu_dereference_raw(*nl);
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference_raw(nb->next);
85 85
86#ifdef CONFIG_DEBUG_NOTIFIERS 86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { 87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
@@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
309 * racy then it does not matter what the result of the test 309 * racy then it does not matter what the result of the test
310 * is, we re-check the list after having taken the lock anyway: 310 * is, we re-check the list after having taken the lock anyway:
311 */ 311 */
312 if (rcu_dereference(nh->head)) { 312 if (rcu_dereference_raw(nh->head)) {
313 down_read(&nh->rwsem); 313 down_read(&nh->rwsem);
314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, 314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
315 nr_calls); 315 nr_calls);
diff --git a/kernel/padata.c b/kernel/padata.c
new file mode 100644
index 000000000000..93caf65ff57c
--- /dev/null
+++ b/kernel/padata.c
@@ -0,0 +1,696 @@
1/*
2 * padata.c - generic interface to process data streams in parallel
3 *
4 * Copyright (C) 2008, 2009 secunet Security Networks AG
5 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 */
20
21#include <linux/module.h>
22#include <linux/cpumask.h>
23#include <linux/err.h>
24#include <linux/cpu.h>
25#include <linux/padata.h>
26#include <linux/mutex.h>
27#include <linux/sched.h>
28#include <linux/rcupdate.h>
29
/*
 * Limits used by the padata core.
 *
 * Both replacement lists are fully parenthesized so that each macro expands
 * to a single operand.  Without the parentheses an expression such as
 * "MAX_SEQ_NR / n" would expand to "INT_MAX - NR_CPUS / n".
 *
 * MAX_OBJ_NUM bounds pd->refcnt in padata_do_parallel().
 */
#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
#define MAX_OBJ_NUM (10000 * NR_CPUS)
32
33static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
34{
35 int cpu, target_cpu;
36
37 target_cpu = cpumask_first(pd->cpumask);
38 for (cpu = 0; cpu < cpu_index; cpu++)
39 target_cpu = cpumask_next(target_cpu, pd->cpumask);
40
41 return target_cpu;
42}
43
44static int padata_cpu_hash(struct padata_priv *padata)
45{
46 int cpu_index;
47 struct parallel_data *pd;
48
49 pd = padata->pd;
50
51 /*
52 * Hash the sequence numbers to the cpus by taking
53 * seq_nr mod. number of cpus in use.
54 */
55 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask);
56
57 return padata_index_to_cpu(pd, cpu_index);
58}
59
60static void padata_parallel_worker(struct work_struct *work)
61{
62 struct padata_queue *queue;
63 struct parallel_data *pd;
64 struct padata_instance *pinst;
65 LIST_HEAD(local_list);
66
67 local_bh_disable();
68 queue = container_of(work, struct padata_queue, pwork);
69 pd = queue->pd;
70 pinst = pd->pinst;
71
72 spin_lock(&queue->parallel.lock);
73 list_replace_init(&queue->parallel.list, &local_list);
74 spin_unlock(&queue->parallel.lock);
75
76 while (!list_empty(&local_list)) {
77 struct padata_priv *padata;
78
79 padata = list_entry(local_list.next,
80 struct padata_priv, list);
81
82 list_del_init(&padata->list);
83
84 padata->parallel(padata);
85 }
86
87 local_bh_enable();
88}
89
90/*
91 * padata_do_parallel - padata parallelization function
92 *
93 * @pinst: padata instance
94 * @padata: object to be parallelized
95 * @cb_cpu: cpu the serialization callback function will run on,
96 * must be in the cpumask of padata.
97 *
98 * The parallelization callback function will run with BHs off.
99 * Note: Every object which is parallelized by padata_do_parallel
100 * must be seen by padata_do_serial.
101 */
102int padata_do_parallel(struct padata_instance *pinst,
103 struct padata_priv *padata, int cb_cpu)
104{
105 int target_cpu, err;
106 struct padata_queue *queue;
107 struct parallel_data *pd;
108
109 rcu_read_lock_bh();
110
111 pd = rcu_dereference(pinst->pd);
112
113 err = 0;
114 if (!(pinst->flags & PADATA_INIT))
115 goto out;
116
117 err = -EBUSY;
118 if ((pinst->flags & PADATA_RESET))
119 goto out;
120
121 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
122 goto out;
123
124 err = -EINVAL;
125 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
126 goto out;
127
128 err = -EINPROGRESS;
129 atomic_inc(&pd->refcnt);
130 padata->pd = pd;
131 padata->cb_cpu = cb_cpu;
132
133 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))
134 atomic_set(&pd->seq_nr, -1);
135
136 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
137
138 target_cpu = padata_cpu_hash(padata);
139 queue = per_cpu_ptr(pd->queue, target_cpu);
140
141 spin_lock(&queue->parallel.lock);
142 list_add_tail(&padata->list, &queue->parallel.list);
143 spin_unlock(&queue->parallel.lock);
144
145 queue_work_on(target_cpu, pinst->wq, &queue->pwork);
146
147out:
148 rcu_read_unlock_bh();
149
150 return err;
151}
152EXPORT_SYMBOL(padata_do_parallel);
153
154static struct padata_priv *padata_get_next(struct parallel_data *pd)
155{
156 int cpu, num_cpus, empty, calc_seq_nr;
157 int seq_nr, next_nr, overrun, next_overrun;
158 struct padata_queue *queue, *next_queue;
159 struct padata_priv *padata;
160 struct padata_list *reorder;
161
162 empty = 0;
163 next_nr = -1;
164 next_overrun = 0;
165 next_queue = NULL;
166
167 num_cpus = cpumask_weight(pd->cpumask);
168
169 for_each_cpu(cpu, pd->cpumask) {
170 queue = per_cpu_ptr(pd->queue, cpu);
171 reorder = &queue->reorder;
172
173 /*
174 * Calculate the seq_nr of the object that should be
175 * next in this queue.
176 */
177 overrun = 0;
178 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
179 + queue->cpu_index;
180
181 if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
182 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
183 overrun = 1;
184 }
185
186 if (!list_empty(&reorder->list)) {
187 padata = list_entry(reorder->list.next,
188 struct padata_priv, list);
189
190 seq_nr = padata->seq_nr;
191 BUG_ON(calc_seq_nr != seq_nr);
192 } else {
193 seq_nr = calc_seq_nr;
194 empty++;
195 }
196
197 if (next_nr < 0 || seq_nr < next_nr
198 || (next_overrun && !overrun)) {
199 next_nr = seq_nr;
200 next_overrun = overrun;
201 next_queue = queue;
202 }
203 }
204
205 padata = NULL;
206
207 if (empty == num_cpus)
208 goto out;
209
210 reorder = &next_queue->reorder;
211
212 if (!list_empty(&reorder->list)) {
213 padata = list_entry(reorder->list.next,
214 struct padata_priv, list);
215
216 if (unlikely(next_overrun)) {
217 for_each_cpu(cpu, pd->cpumask) {
218 queue = per_cpu_ptr(pd->queue, cpu);
219 atomic_set(&queue->num_obj, 0);
220 }
221 }
222
223 spin_lock(&reorder->lock);
224 list_del_init(&padata->list);
225 atomic_dec(&pd->reorder_objects);
226 spin_unlock(&reorder->lock);
227
228 atomic_inc(&next_queue->num_obj);
229
230 goto out;
231 }
232
233 if (next_nr % num_cpus == next_queue->cpu_index) {
234 padata = ERR_PTR(-ENODATA);
235 goto out;
236 }
237
238 padata = ERR_PTR(-EINPROGRESS);
239out:
240 return padata;
241}
242
243static void padata_reorder(struct parallel_data *pd)
244{
245 struct padata_priv *padata;
246 struct padata_queue *queue;
247 struct padata_instance *pinst = pd->pinst;
248
249try_again:
250 if (!spin_trylock_bh(&pd->lock))
251 goto out;
252
253 while (1) {
254 padata = padata_get_next(pd);
255
256 if (!padata || PTR_ERR(padata) == -EINPROGRESS)
257 break;
258
259 if (PTR_ERR(padata) == -ENODATA) {
260 spin_unlock_bh(&pd->lock);
261 goto out;
262 }
263
264 queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
265
266 spin_lock(&queue->serial.lock);
267 list_add_tail(&padata->list, &queue->serial.list);
268 spin_unlock(&queue->serial.lock);
269
270 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork);
271 }
272
273 spin_unlock_bh(&pd->lock);
274
275 if (atomic_read(&pd->reorder_objects))
276 goto try_again;
277
278out:
279 return;
280}
281
282static void padata_serial_worker(struct work_struct *work)
283{
284 struct padata_queue *queue;
285 struct parallel_data *pd;
286 LIST_HEAD(local_list);
287
288 local_bh_disable();
289 queue = container_of(work, struct padata_queue, swork);
290 pd = queue->pd;
291
292 spin_lock(&queue->serial.lock);
293 list_replace_init(&queue->serial.list, &local_list);
294 spin_unlock(&queue->serial.lock);
295
296 while (!list_empty(&local_list)) {
297 struct padata_priv *padata;
298
299 padata = list_entry(local_list.next,
300 struct padata_priv, list);
301
302 list_del_init(&padata->list);
303
304 padata->serial(padata);
305 atomic_dec(&pd->refcnt);
306 }
307 local_bh_enable();
308}
309
310/*
311 * padata_do_serial - padata serialization function
312 *
313 * @padata: object to be serialized.
314 *
315 * padata_do_serial must be called for every parallelized object.
316 * The serialization callback function will run with BHs off.
317 */
318void padata_do_serial(struct padata_priv *padata)
319{
320 int cpu;
321 struct padata_queue *queue;
322 struct parallel_data *pd;
323
324 pd = padata->pd;
325
326 cpu = get_cpu();
327 queue = per_cpu_ptr(pd->queue, cpu);
328
329 spin_lock(&queue->reorder.lock);
330 atomic_inc(&pd->reorder_objects);
331 list_add_tail(&padata->list, &queue->reorder.list);
332 spin_unlock(&queue->reorder.lock);
333
334 put_cpu();
335
336 padata_reorder(pd);
337}
338EXPORT_SYMBOL(padata_do_serial);
339
340static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
341 const struct cpumask *cpumask)
342{
343 int cpu, cpu_index, num_cpus;
344 struct padata_queue *queue;
345 struct parallel_data *pd;
346
347 cpu_index = 0;
348
349 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
350 if (!pd)
351 goto err;
352
353 pd->queue = alloc_percpu(struct padata_queue);
354 if (!pd->queue)
355 goto err_free_pd;
356
357 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
358 goto err_free_queue;
359
360 for_each_possible_cpu(cpu) {
361 queue = per_cpu_ptr(pd->queue, cpu);
362
363 queue->pd = pd;
364
365 if (cpumask_test_cpu(cpu, cpumask)
366 && cpumask_test_cpu(cpu, cpu_active_mask)) {
367 queue->cpu_index = cpu_index;
368 cpu_index++;
369 } else
370 queue->cpu_index = -1;
371
372 INIT_LIST_HEAD(&queue->reorder.list);
373 INIT_LIST_HEAD(&queue->parallel.list);
374 INIT_LIST_HEAD(&queue->serial.list);
375 spin_lock_init(&queue->reorder.lock);
376 spin_lock_init(&queue->parallel.lock);
377 spin_lock_init(&queue->serial.lock);
378
379 INIT_WORK(&queue->pwork, padata_parallel_worker);
380 INIT_WORK(&queue->swork, padata_serial_worker);
381 atomic_set(&queue->num_obj, 0);
382 }
383
384 cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
385
386 num_cpus = cpumask_weight(pd->cpumask);
387 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
388
389 atomic_set(&pd->seq_nr, -1);
390 atomic_set(&pd->reorder_objects, 0);
391 atomic_set(&pd->refcnt, 0);
392 pd->pinst = pinst;
393 spin_lock_init(&pd->lock);
394
395 return pd;
396
397err_free_queue:
398 free_percpu(pd->queue);
399err_free_pd:
400 kfree(pd);
401err:
402 return NULL;
403}
404
405static void padata_free_pd(struct parallel_data *pd)
406{
407 free_cpumask_var(pd->cpumask);
408 free_percpu(pd->queue);
409 kfree(pd);
410}
411
412static void padata_replace(struct padata_instance *pinst,
413 struct parallel_data *pd_new)
414{
415 struct parallel_data *pd_old = pinst->pd;
416
417 pinst->flags |= PADATA_RESET;
418
419 rcu_assign_pointer(pinst->pd, pd_new);
420
421 synchronize_rcu();
422
423 while (atomic_read(&pd_old->refcnt) != 0)
424 yield();
425
426 flush_workqueue(pinst->wq);
427
428 padata_free_pd(pd_old);
429
430 pinst->flags &= ~PADATA_RESET;
431}
432
433/*
434 * padata_set_cpumask - set the cpumask that padata should use
435 *
436 * @pinst: padata instance
437 * @cpumask: the cpumask to use
438 */
439int padata_set_cpumask(struct padata_instance *pinst,
440 cpumask_var_t cpumask)
441{
442 struct parallel_data *pd;
443 int err = 0;
444
445 might_sleep();
446
447 mutex_lock(&pinst->lock);
448
449 pd = padata_alloc_pd(pinst, cpumask);
450 if (!pd) {
451 err = -ENOMEM;
452 goto out;
453 }
454
455 cpumask_copy(pinst->cpumask, cpumask);
456
457 padata_replace(pinst, pd);
458
459out:
460 mutex_unlock(&pinst->lock);
461
462 return err;
463}
464EXPORT_SYMBOL(padata_set_cpumask);
465
466static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
467{
468 struct parallel_data *pd;
469
470 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
471 pd = padata_alloc_pd(pinst, pinst->cpumask);
472 if (!pd)
473 return -ENOMEM;
474
475 padata_replace(pinst, pd);
476 }
477
478 return 0;
479}
480
481/*
482 * padata_add_cpu - add a cpu to the padata cpumask
483 *
484 * @pinst: padata instance
485 * @cpu: cpu to add
486 */
487int padata_add_cpu(struct padata_instance *pinst, int cpu)
488{
489 int err;
490
491 might_sleep();
492
493 mutex_lock(&pinst->lock);
494
495 cpumask_set_cpu(cpu, pinst->cpumask);
496 err = __padata_add_cpu(pinst, cpu);
497
498 mutex_unlock(&pinst->lock);
499
500 return err;
501}
502EXPORT_SYMBOL(padata_add_cpu);
503
504static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
505{
506 struct parallel_data *pd;
507
508 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
509 pd = padata_alloc_pd(pinst, pinst->cpumask);
510 if (!pd)
511 return -ENOMEM;
512
513 padata_replace(pinst, pd);
514 }
515
516 return 0;
517}
518
519/*
520 * padata_remove_cpu - remove a cpu from the padata cpumask
521 *
522 * @pinst: padata instance
523 * @cpu: cpu to remove
524 */
525int padata_remove_cpu(struct padata_instance *pinst, int cpu)
526{
527 int err;
528
529 might_sleep();
530
531 mutex_lock(&pinst->lock);
532
533 cpumask_clear_cpu(cpu, pinst->cpumask);
534 err = __padata_remove_cpu(pinst, cpu);
535
536 mutex_unlock(&pinst->lock);
537
538 return err;
539}
540EXPORT_SYMBOL(padata_remove_cpu);
541
542/*
543 * padata_start - start the parallel processing
544 *
545 * @pinst: padata instance to start
546 */
547void padata_start(struct padata_instance *pinst)
548{
549 might_sleep();
550
551 mutex_lock(&pinst->lock);
552 pinst->flags |= PADATA_INIT;
553 mutex_unlock(&pinst->lock);
554}
555EXPORT_SYMBOL(padata_start);
556
557/*
558 * padata_stop - stop the parallel processing
559 *
560 * @pinst: padata instance to stop
561 */
562void padata_stop(struct padata_instance *pinst)
563{
564 might_sleep();
565
566 mutex_lock(&pinst->lock);
567 pinst->flags &= ~PADATA_INIT;
568 mutex_unlock(&pinst->lock);
569}
570EXPORT_SYMBOL(padata_stop);
571
572static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
573 unsigned long action, void *hcpu)
574{
575 int err;
576 struct padata_instance *pinst;
577 int cpu = (unsigned long)hcpu;
578
579 pinst = container_of(nfb, struct padata_instance, cpu_notifier);
580
581 switch (action) {
582 case CPU_ONLINE:
583 case CPU_ONLINE_FROZEN:
584 if (!cpumask_test_cpu(cpu, pinst->cpumask))
585 break;
586 mutex_lock(&pinst->lock);
587 err = __padata_add_cpu(pinst, cpu);
588 mutex_unlock(&pinst->lock);
589 if (err)
590 return NOTIFY_BAD;
591 break;
592
593 case CPU_DOWN_PREPARE:
594 case CPU_DOWN_PREPARE_FROZEN:
595 if (!cpumask_test_cpu(cpu, pinst->cpumask))
596 break;
597 mutex_lock(&pinst->lock);
598 err = __padata_remove_cpu(pinst, cpu);
599 mutex_unlock(&pinst->lock);
600 if (err)
601 return NOTIFY_BAD;
602 break;
603
604 case CPU_UP_CANCELED:
605 case CPU_UP_CANCELED_FROZEN:
606 if (!cpumask_test_cpu(cpu, pinst->cpumask))
607 break;
608 mutex_lock(&pinst->lock);
609 __padata_remove_cpu(pinst, cpu);
610 mutex_unlock(&pinst->lock);
611
612 case CPU_DOWN_FAILED:
613 case CPU_DOWN_FAILED_FROZEN:
614 if (!cpumask_test_cpu(cpu, pinst->cpumask))
615 break;
616 mutex_lock(&pinst->lock);
617 __padata_add_cpu(pinst, cpu);
618 mutex_unlock(&pinst->lock);
619 }
620
621 return NOTIFY_OK;
622}
623
624/*
625 * padata_alloc - allocate and initialize a padata instance
626 *
627 * @cpumask: cpumask that padata uses for parallelization
628 * @wq: workqueue to use for the allocated padata instance
629 */
630struct padata_instance *padata_alloc(const struct cpumask *cpumask,
631 struct workqueue_struct *wq)
632{
633 int err;
634 struct padata_instance *pinst;
635 struct parallel_data *pd;
636
637 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
638 if (!pinst)
639 goto err;
640
641 pd = padata_alloc_pd(pinst, cpumask);
642 if (!pd)
643 goto err_free_inst;
644
645 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
646 goto err_free_pd;
647
648 rcu_assign_pointer(pinst->pd, pd);
649
650 pinst->wq = wq;
651
652 cpumask_copy(pinst->cpumask, cpumask);
653
654 pinst->flags = 0;
655
656 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
657 pinst->cpu_notifier.priority = 0;
658 err = register_hotcpu_notifier(&pinst->cpu_notifier);
659 if (err)
660 goto err_free_cpumask;
661
662 mutex_init(&pinst->lock);
663
664 return pinst;
665
666err_free_cpumask:
667 free_cpumask_var(pinst->cpumask);
668err_free_pd:
669 padata_free_pd(pd);
670err_free_inst:
671 kfree(pinst);
672err:
673 return NULL;
674}
675EXPORT_SYMBOL(padata_alloc);
676
677/*
678 * padata_free - free a padata instance
679 *
680 * @ padata_inst: padata instance to free
681 */
682void padata_free(struct padata_instance *pinst)
683{
684 padata_stop(pinst);
685
686 synchronize_rcu();
687
688 while (atomic_read(&pinst->pd->refcnt) != 0)
689 yield();
690
691 unregister_hotcpu_notifier(&pinst->cpu_notifier);
692 padata_free_pd(pinst->pd);
693 free_cpumask_var(pinst->cpumask);
694 kfree(pinst);
695}
696EXPORT_SYMBOL(padata_free);
diff --git a/kernel/panic.c b/kernel/panic.c
index 5827f7b97254..13d966b4c14a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -36,15 +36,36 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
36 36
37EXPORT_SYMBOL(panic_notifier_list); 37EXPORT_SYMBOL(panic_notifier_list);
38 38
39static long no_blink(long time)
40{
41 return 0;
42}
43
44/* Returns how long it waited in ms */ 39/* Returns how long it waited in ms */
45long (*panic_blink)(long time); 40long (*panic_blink)(long time);
46EXPORT_SYMBOL(panic_blink); 41EXPORT_SYMBOL(panic_blink);
47 42
43static void panic_blink_one_second(void)
44{
45 static long i = 0, end;
46
47 if (panic_blink) {
48 end = i + MSEC_PER_SEC;
49
50 while (i < end) {
51 i += panic_blink(i);
52 mdelay(1);
53 i++;
54 }
55 } else {
56 /*
57 * When running under a hypervisor a small mdelay may get
58 * rounded up to the hypervisor timeslice. For example, with
59 * a 1ms in 10ms hypervisor timeslice we might inflate a
60 * mdelay(1) loop by 10x.
61 *
62 * If we have nothing to blink, spin on 1 second calls to
63 * mdelay to avoid this.
64 */
65 mdelay(MSEC_PER_SEC);
66 }
67}
68
48/** 69/**
49 * panic - halt the system 70 * panic - halt the system
50 * @fmt: The text string to print 71 * @fmt: The text string to print
@@ -75,7 +96,6 @@ NORET_TYPE void panic(const char * fmt, ...)
75 dump_stack(); 96 dump_stack();
76#endif 97#endif
77 98
78 kmsg_dump(KMSG_DUMP_PANIC);
79 /* 99 /*
80 * If we have crashed and we have a crash kernel loaded let it handle 100 * If we have crashed and we have a crash kernel loaded let it handle
81 * everything else. 101 * everything else.
@@ -83,6 +103,8 @@ NORET_TYPE void panic(const char * fmt, ...)
83 */ 103 */
84 crash_kexec(NULL); 104 crash_kexec(NULL);
85 105
106 kmsg_dump(KMSG_DUMP_PANIC);
107
86 /* 108 /*
87 * Note smp_send_stop is the usual smp shutdown function, which 109 * Note smp_send_stop is the usual smp shutdown function, which
88 * unfortunately means it may not be hardened to work in a panic 110 * unfortunately means it may not be hardened to work in a panic
@@ -94,9 +116,6 @@ NORET_TYPE void panic(const char * fmt, ...)
94 116
95 bust_spinlocks(0); 117 bust_spinlocks(0);
96 118
97 if (!panic_blink)
98 panic_blink = no_blink;
99
100 if (panic_timeout > 0) { 119 if (panic_timeout > 0) {
101 /* 120 /*
102 * Delay timeout seconds before rebooting the machine. 121 * Delay timeout seconds before rebooting the machine.
@@ -104,11 +123,9 @@ NORET_TYPE void panic(const char * fmt, ...)
104 */ 123 */
105 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); 124 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
106 125
107 for (i = 0; i < panic_timeout*1000; ) { 126 for (i = 0; i < panic_timeout; i++) {
108 touch_nmi_watchdog(); 127 touch_nmi_watchdog();
109 i += panic_blink(i); 128 panic_blink_one_second();
110 mdelay(1);
111 i++;
112 } 129 }
113 /* 130 /*
114 * This will not be a clean reboot, with everything 131 * This will not be a clean reboot, with everything
@@ -134,11 +151,9 @@ NORET_TYPE void panic(const char * fmt, ...)
134 } 151 }
135#endif 152#endif
136 local_irq_enable(); 153 local_irq_enable();
137 for (i = 0; ; ) { 154 while (1) {
138 touch_softlockup_watchdog(); 155 touch_softlockup_watchdog();
139 i += panic_blink(i); 156 panic_blink_one_second();
140 mdelay(1);
141 i++;
142 } 157 }
143} 158}
144 159
diff --git a/kernel/params.c b/kernel/params.c
index cf1b69183127..8d95f5451b22 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,7 +24,6 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27#include <linux/string.h>
28 27
29#if 0 28#if 0
30#define DEBUGP printk 29#define DEBUGP printk
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 8ab86988bd24..8e352c756ba7 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -98,11 +98,12 @@ void __weak hw_perf_enable(void) { barrier(); }
98 98
99void __weak hw_perf_event_setup(int cpu) { barrier(); } 99void __weak hw_perf_event_setup(int cpu) { barrier(); }
100void __weak hw_perf_event_setup_online(int cpu) { barrier(); } 100void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
101void __weak hw_perf_event_setup_offline(int cpu) { barrier(); }
101 102
102int __weak 103int __weak
103hw_perf_group_sched_in(struct perf_event *group_leader, 104hw_perf_group_sched_in(struct perf_event *group_leader,
104 struct perf_cpu_context *cpuctx, 105 struct perf_cpu_context *cpuctx,
105 struct perf_event_context *ctx, int cpu) 106 struct perf_event_context *ctx)
106{ 107{
107 return 0; 108 return 0;
108} 109}
@@ -248,7 +249,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
248 249
249static inline u64 perf_clock(void) 250static inline u64 perf_clock(void)
250{ 251{
251 return cpu_clock(smp_processor_id()); 252 return cpu_clock(raw_smp_processor_id());
252} 253}
253 254
254/* 255/*
@@ -289,6 +290,15 @@ static void update_event_times(struct perf_event *event)
289 event->total_time_running = run_end - event->tstamp_running; 290 event->total_time_running = run_end - event->tstamp_running;
290} 291}
291 292
293static struct list_head *
294ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
295{
296 if (event->attr.pinned)
297 return &ctx->pinned_groups;
298 else
299 return &ctx->flexible_groups;
300}
301
292/* 302/*
293 * Add a event from the lists for its context. 303 * Add a event from the lists for its context.
294 * Must be called with ctx->mutex and ctx->lock held. 304 * Must be called with ctx->mutex and ctx->lock held.
@@ -303,9 +313,19 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
303 * add it straight to the context's event list, or to the group 313 * add it straight to the context's event list, or to the group
304 * leader's sibling list: 314 * leader's sibling list:
305 */ 315 */
306 if (group_leader == event) 316 if (group_leader == event) {
307 list_add_tail(&event->group_entry, &ctx->group_list); 317 struct list_head *list;
308 else { 318
319 if (is_software_event(event))
320 event->group_flags |= PERF_GROUP_SOFTWARE;
321
322 list = ctx_group_list(event, ctx);
323 list_add_tail(&event->group_entry, list);
324 } else {
325 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
326 !is_software_event(event))
327 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
328
309 list_add_tail(&event->group_entry, &group_leader->sibling_list); 329 list_add_tail(&event->group_entry, &group_leader->sibling_list);
310 group_leader->nr_siblings++; 330 group_leader->nr_siblings++;
311 } 331 }
@@ -355,9 +375,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
355 * to the context list directly: 375 * to the context list directly:
356 */ 376 */
357 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 377 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
378 struct list_head *list;
358 379
359 list_move_tail(&sibling->group_entry, &ctx->group_list); 380 list = ctx_group_list(event, ctx);
381 list_move_tail(&sibling->group_entry, list);
360 sibling->group_leader = sibling; 382 sibling->group_leader = sibling;
383
384 /* Inherit group flags from the previous leader */
385 sibling->group_flags = event->group_flags;
361 } 386 }
362} 387}
363 388
@@ -608,14 +633,13 @@ void perf_event_disable(struct perf_event *event)
608static int 633static int
609event_sched_in(struct perf_event *event, 634event_sched_in(struct perf_event *event,
610 struct perf_cpu_context *cpuctx, 635 struct perf_cpu_context *cpuctx,
611 struct perf_event_context *ctx, 636 struct perf_event_context *ctx)
612 int cpu)
613{ 637{
614 if (event->state <= PERF_EVENT_STATE_OFF) 638 if (event->state <= PERF_EVENT_STATE_OFF)
615 return 0; 639 return 0;
616 640
617 event->state = PERF_EVENT_STATE_ACTIVE; 641 event->state = PERF_EVENT_STATE_ACTIVE;
618 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ 642 event->oncpu = smp_processor_id();
619 /* 643 /*
620 * The new state must be visible before we turn it on in the hardware: 644 * The new state must be visible before we turn it on in the hardware:
621 */ 645 */
@@ -642,8 +666,7 @@ event_sched_in(struct perf_event *event,
642static int 666static int
643group_sched_in(struct perf_event *group_event, 667group_sched_in(struct perf_event *group_event,
644 struct perf_cpu_context *cpuctx, 668 struct perf_cpu_context *cpuctx,
645 struct perf_event_context *ctx, 669 struct perf_event_context *ctx)
646 int cpu)
647{ 670{
648 struct perf_event *event, *partial_group; 671 struct perf_event *event, *partial_group;
649 int ret; 672 int ret;
@@ -651,18 +674,18 @@ group_sched_in(struct perf_event *group_event,
651 if (group_event->state == PERF_EVENT_STATE_OFF) 674 if (group_event->state == PERF_EVENT_STATE_OFF)
652 return 0; 675 return 0;
653 676
654 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); 677 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
655 if (ret) 678 if (ret)
656 return ret < 0 ? ret : 0; 679 return ret < 0 ? ret : 0;
657 680
658 if (event_sched_in(group_event, cpuctx, ctx, cpu)) 681 if (event_sched_in(group_event, cpuctx, ctx))
659 return -EAGAIN; 682 return -EAGAIN;
660 683
661 /* 684 /*
662 * Schedule in siblings as one group (if any): 685 * Schedule in siblings as one group (if any):
663 */ 686 */
664 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 687 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
665 if (event_sched_in(event, cpuctx, ctx, cpu)) { 688 if (event_sched_in(event, cpuctx, ctx)) {
666 partial_group = event; 689 partial_group = event;
667 goto group_error; 690 goto group_error;
668 } 691 }
@@ -686,24 +709,6 @@ group_error:
686} 709}
687 710
688/* 711/*
689 * Return 1 for a group consisting entirely of software events,
690 * 0 if the group contains any hardware events.
691 */
692static int is_software_only_group(struct perf_event *leader)
693{
694 struct perf_event *event;
695
696 if (!is_software_event(leader))
697 return 0;
698
699 list_for_each_entry(event, &leader->sibling_list, group_entry)
700 if (!is_software_event(event))
701 return 0;
702
703 return 1;
704}
705
706/*
707 * Work out whether we can put this event group on the CPU now. 712 * Work out whether we can put this event group on the CPU now.
708 */ 713 */
709static int group_can_go_on(struct perf_event *event, 714static int group_can_go_on(struct perf_event *event,
@@ -713,7 +718,7 @@ static int group_can_go_on(struct perf_event *event,
713 /* 718 /*
714 * Groups consisting entirely of software events can always go on. 719 * Groups consisting entirely of software events can always go on.
715 */ 720 */
716 if (is_software_only_group(event)) 721 if (event->group_flags & PERF_GROUP_SOFTWARE)
717 return 1; 722 return 1;
718 /* 723 /*
719 * If an exclusive group is already on, no other hardware 724 * If an exclusive group is already on, no other hardware
@@ -754,7 +759,6 @@ static void __perf_install_in_context(void *info)
754 struct perf_event *event = info; 759 struct perf_event *event = info;
755 struct perf_event_context *ctx = event->ctx; 760 struct perf_event_context *ctx = event->ctx;
756 struct perf_event *leader = event->group_leader; 761 struct perf_event *leader = event->group_leader;
757 int cpu = smp_processor_id();
758 int err; 762 int err;
759 763
760 /* 764 /*
@@ -801,7 +805,7 @@ static void __perf_install_in_context(void *info)
801 if (!group_can_go_on(event, cpuctx, 1)) 805 if (!group_can_go_on(event, cpuctx, 1))
802 err = -EEXIST; 806 err = -EEXIST;
803 else 807 else
804 err = event_sched_in(event, cpuctx, ctx, cpu); 808 err = event_sched_in(event, cpuctx, ctx);
805 809
806 if (err) { 810 if (err) {
807 /* 811 /*
@@ -943,11 +947,9 @@ static void __perf_event_enable(void *info)
943 } else { 947 } else {
944 perf_disable(); 948 perf_disable();
945 if (event == leader) 949 if (event == leader)
946 err = group_sched_in(event, cpuctx, ctx, 950 err = group_sched_in(event, cpuctx, ctx);
947 smp_processor_id());
948 else 951 else
949 err = event_sched_in(event, cpuctx, ctx, 952 err = event_sched_in(event, cpuctx, ctx);
950 smp_processor_id());
951 perf_enable(); 953 perf_enable();
952 } 954 }
953 955
@@ -1043,8 +1045,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1043 return 0; 1045 return 0;
1044} 1046}
1045 1047
1046void __perf_event_sched_out(struct perf_event_context *ctx, 1048enum event_type_t {
1047 struct perf_cpu_context *cpuctx) 1049 EVENT_FLEXIBLE = 0x1,
1050 EVENT_PINNED = 0x2,
1051 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1052};
1053
1054static void ctx_sched_out(struct perf_event_context *ctx,
1055 struct perf_cpu_context *cpuctx,
1056 enum event_type_t event_type)
1048{ 1057{
1049 struct perf_event *event; 1058 struct perf_event *event;
1050 1059
@@ -1055,10 +1064,18 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1055 update_context_time(ctx); 1064 update_context_time(ctx);
1056 1065
1057 perf_disable(); 1066 perf_disable();
1058 if (ctx->nr_active) { 1067 if (!ctx->nr_active)
1059 list_for_each_entry(event, &ctx->group_list, group_entry) 1068 goto out_enable;
1069
1070 if (event_type & EVENT_PINNED)
1071 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1060 group_sched_out(event, cpuctx, ctx); 1072 group_sched_out(event, cpuctx, ctx);
1061 } 1073
1074 if (event_type & EVENT_FLEXIBLE)
1075 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1076 group_sched_out(event, cpuctx, ctx);
1077
1078 out_enable:
1062 perf_enable(); 1079 perf_enable();
1063 out: 1080 out:
1064 raw_spin_unlock(&ctx->lock); 1081 raw_spin_unlock(&ctx->lock);
@@ -1170,9 +1187,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1170 * not restart the event. 1187 * not restart the event.
1171 */ 1188 */
1172void perf_event_task_sched_out(struct task_struct *task, 1189void perf_event_task_sched_out(struct task_struct *task,
1173 struct task_struct *next, int cpu) 1190 struct task_struct *next)
1174{ 1191{
1175 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1192 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1176 struct perf_event_context *ctx = task->perf_event_ctxp; 1193 struct perf_event_context *ctx = task->perf_event_ctxp;
1177 struct perf_event_context *next_ctx; 1194 struct perf_event_context *next_ctx;
1178 struct perf_event_context *parent; 1195 struct perf_event_context *parent;
@@ -1220,15 +1237,13 @@ void perf_event_task_sched_out(struct task_struct *task,
1220 rcu_read_unlock(); 1237 rcu_read_unlock();
1221 1238
1222 if (do_switch) { 1239 if (do_switch) {
1223 __perf_event_sched_out(ctx, cpuctx); 1240 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1224 cpuctx->task_ctx = NULL; 1241 cpuctx->task_ctx = NULL;
1225 } 1242 }
1226} 1243}
1227 1244
1228/* 1245static void task_ctx_sched_out(struct perf_event_context *ctx,
1229 * Called with IRQs disabled 1246 enum event_type_t event_type)
1230 */
1231static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1232{ 1247{
1233 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1248 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1234 1249
@@ -1238,47 +1253,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1238 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 1253 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1239 return; 1254 return;
1240 1255
1241 __perf_event_sched_out(ctx, cpuctx); 1256 ctx_sched_out(ctx, cpuctx, event_type);
1242 cpuctx->task_ctx = NULL; 1257 cpuctx->task_ctx = NULL;
1243} 1258}
1244 1259
1245/* 1260/*
1246 * Called with IRQs disabled 1261 * Called with IRQs disabled
1247 */ 1262 */
1248static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) 1263static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1264{
1265 task_ctx_sched_out(ctx, EVENT_ALL);
1266}
1267
1268/*
1269 * Called with IRQs disabled
1270 */
1271static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1272 enum event_type_t event_type)
1249{ 1273{
1250 __perf_event_sched_out(&cpuctx->ctx, cpuctx); 1274 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1251} 1275}
1252 1276
1253static void 1277static void
1254__perf_event_sched_in(struct perf_event_context *ctx, 1278ctx_pinned_sched_in(struct perf_event_context *ctx,
1255 struct perf_cpu_context *cpuctx, int cpu) 1279 struct perf_cpu_context *cpuctx)
1256{ 1280{
1257 struct perf_event *event; 1281 struct perf_event *event;
1258 int can_add_hw = 1;
1259
1260 raw_spin_lock(&ctx->lock);
1261 ctx->is_active = 1;
1262 if (likely(!ctx->nr_events))
1263 goto out;
1264 1282
1265 ctx->timestamp = perf_clock(); 1283 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1266 1284 if (event->state <= PERF_EVENT_STATE_OFF)
1267 perf_disable();
1268
1269 /*
1270 * First go through the list and put on any pinned groups
1271 * in order to give them the best chance of going on.
1272 */
1273 list_for_each_entry(event, &ctx->group_list, group_entry) {
1274 if (event->state <= PERF_EVENT_STATE_OFF ||
1275 !event->attr.pinned)
1276 continue; 1285 continue;
1277 if (event->cpu != -1 && event->cpu != cpu) 1286 if (event->cpu != -1 && event->cpu != smp_processor_id())
1278 continue; 1287 continue;
1279 1288
1280 if (group_can_go_on(event, cpuctx, 1)) 1289 if (group_can_go_on(event, cpuctx, 1))
1281 group_sched_in(event, cpuctx, ctx, cpu); 1290 group_sched_in(event, cpuctx, ctx);
1282 1291
1283 /* 1292 /*
1284 * If this pinned group hasn't been scheduled, 1293 * If this pinned group hasn't been scheduled,
@@ -1289,32 +1298,83 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1289 event->state = PERF_EVENT_STATE_ERROR; 1298 event->state = PERF_EVENT_STATE_ERROR;
1290 } 1299 }
1291 } 1300 }
1301}
1292 1302
1293 list_for_each_entry(event, &ctx->group_list, group_entry) { 1303static void
1294 /* 1304ctx_flexible_sched_in(struct perf_event_context *ctx,
1295 * Ignore events in OFF or ERROR state, and 1305 struct perf_cpu_context *cpuctx)
1296 * ignore pinned events since we did them already. 1306{
1297 */ 1307 struct perf_event *event;
1298 if (event->state <= PERF_EVENT_STATE_OFF || 1308 int can_add_hw = 1;
1299 event->attr.pinned)
1300 continue;
1301 1309
1310 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1311 /* Ignore events in OFF or ERROR state */
1312 if (event->state <= PERF_EVENT_STATE_OFF)
1313 continue;
1302 /* 1314 /*
1303 * Listen to the 'cpu' scheduling filter constraint 1315 * Listen to the 'cpu' scheduling filter constraint
1304 * of events: 1316 * of events:
1305 */ 1317 */
1306 if (event->cpu != -1 && event->cpu != cpu) 1318 if (event->cpu != -1 && event->cpu != smp_processor_id())
1307 continue; 1319 continue;
1308 1320
1309 if (group_can_go_on(event, cpuctx, can_add_hw)) 1321 if (group_can_go_on(event, cpuctx, can_add_hw))
1310 if (group_sched_in(event, cpuctx, ctx, cpu)) 1322 if (group_sched_in(event, cpuctx, ctx))
1311 can_add_hw = 0; 1323 can_add_hw = 0;
1312 } 1324 }
1325}
1326
1327static void
1328ctx_sched_in(struct perf_event_context *ctx,
1329 struct perf_cpu_context *cpuctx,
1330 enum event_type_t event_type)
1331{
1332 raw_spin_lock(&ctx->lock);
1333 ctx->is_active = 1;
1334 if (likely(!ctx->nr_events))
1335 goto out;
1336
1337 ctx->timestamp = perf_clock();
1338
1339 perf_disable();
1340
1341 /*
1342 * First go through the list and put on any pinned groups
1343 * in order to give them the best chance of going on.
1344 */
1345 if (event_type & EVENT_PINNED)
1346 ctx_pinned_sched_in(ctx, cpuctx);
1347
1348 /* Then walk through the lower prio flexible groups */
1349 if (event_type & EVENT_FLEXIBLE)
1350 ctx_flexible_sched_in(ctx, cpuctx);
1351
1313 perf_enable(); 1352 perf_enable();
1314 out: 1353 out:
1315 raw_spin_unlock(&ctx->lock); 1354 raw_spin_unlock(&ctx->lock);
1316} 1355}
1317 1356
1357static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1358 enum event_type_t event_type)
1359{
1360 struct perf_event_context *ctx = &cpuctx->ctx;
1361
1362 ctx_sched_in(ctx, cpuctx, event_type);
1363}
1364
1365static void task_ctx_sched_in(struct task_struct *task,
1366 enum event_type_t event_type)
1367{
1368 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1369 struct perf_event_context *ctx = task->perf_event_ctxp;
1370
1371 if (likely(!ctx))
1372 return;
1373 if (cpuctx->task_ctx == ctx)
1374 return;
1375 ctx_sched_in(ctx, cpuctx, event_type);
1376 cpuctx->task_ctx = ctx;
1377}
1318/* 1378/*
1319 * Called from scheduler to add the events of the current task 1379 * Called from scheduler to add the events of the current task
1320 * with interrupts disabled. 1380 * with interrupts disabled.
@@ -1326,38 +1386,128 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1326 * accessing the event control register. If a NMI hits, then it will 1386 * accessing the event control register. If a NMI hits, then it will
1327 * keep the event running. 1387 * keep the event running.
1328 */ 1388 */
1329void perf_event_task_sched_in(struct task_struct *task, int cpu) 1389void perf_event_task_sched_in(struct task_struct *task)
1330{ 1390{
1331 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1391 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1332 struct perf_event_context *ctx = task->perf_event_ctxp; 1392 struct perf_event_context *ctx = task->perf_event_ctxp;
1333 1393
1334 if (likely(!ctx)) 1394 if (likely(!ctx))
1335 return; 1395 return;
1396
1336 if (cpuctx->task_ctx == ctx) 1397 if (cpuctx->task_ctx == ctx)
1337 return; 1398 return;
1338 __perf_event_sched_in(ctx, cpuctx, cpu); 1399
1400 /*
1401 * We want to keep the following priority order:
1402 * cpu pinned (that don't need to move), task pinned,
1403 * cpu flexible, task flexible.
1404 */
1405 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1406
1407 ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
1408 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1409 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1410
1339 cpuctx->task_ctx = ctx; 1411 cpuctx->task_ctx = ctx;
1340} 1412}
1341 1413
1342static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) 1414#define MAX_INTERRUPTS (~0ULL)
1415
1416static void perf_log_throttle(struct perf_event *event, int enable);
1417
1418static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1419{
1420 u64 frequency = event->attr.sample_freq;
1421 u64 sec = NSEC_PER_SEC;
1422 u64 divisor, dividend;
1423
1424 int count_fls, nsec_fls, frequency_fls, sec_fls;
1425
1426 count_fls = fls64(count);
1427 nsec_fls = fls64(nsec);
1428 frequency_fls = fls64(frequency);
1429 sec_fls = 30;
1430
1431 /*
1432 * We got @count in @nsec, with a target of sample_freq HZ
1433 * the target period becomes:
1434 *
1435 * @count * 10^9
1436 * period = -------------------
1437 * @nsec * sample_freq
1438 *
1439 */
1440
1441 /*
1442 * Reduce accuracy by one bit such that @a and @b converge
1443 * to a similar magnitude.
1444 */
1445#define REDUCE_FLS(a, b) \
1446do { \
1447 if (a##_fls > b##_fls) { \
1448 a >>= 1; \
1449 a##_fls--; \
1450 } else { \
1451 b >>= 1; \
1452 b##_fls--; \
1453 } \
1454} while (0)
1455
1456 /*
1457 * Reduce accuracy until either term fits in a u64, then proceed with
1458 * the other, so that finally we can do a u64/u64 division.
1459 */
1460 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1461 REDUCE_FLS(nsec, frequency);
1462 REDUCE_FLS(sec, count);
1463 }
1464
1465 if (count_fls + sec_fls > 64) {
1466 divisor = nsec * frequency;
1467
1468 while (count_fls + sec_fls > 64) {
1469 REDUCE_FLS(count, sec);
1470 divisor >>= 1;
1471 }
1472
1473 dividend = count * sec;
1474 } else {
1475 dividend = count * sec;
1476
1477 while (nsec_fls + frequency_fls > 64) {
1478 REDUCE_FLS(nsec, frequency);
1479 dividend >>= 1;
1480 }
1481
1482 divisor = nsec * frequency;
1483 }
1484
1485 return div64_u64(dividend, divisor);
1486}
1487
1488static void perf_event_stop(struct perf_event *event)
1343{ 1489{
1344 struct perf_event_context *ctx = &cpuctx->ctx; 1490 if (!event->pmu->stop)
1491 return event->pmu->disable(event);
1345 1492
1346 __perf_event_sched_in(ctx, cpuctx, cpu); 1493 return event->pmu->stop(event);
1347} 1494}
1348 1495
1349#define MAX_INTERRUPTS (~0ULL) 1496static int perf_event_start(struct perf_event *event)
1497{
1498 if (!event->pmu->start)
1499 return event->pmu->enable(event);
1350 1500
1351static void perf_log_throttle(struct perf_event *event, int enable); 1501 return event->pmu->start(event);
1502}
1352 1503
1353static void perf_adjust_period(struct perf_event *event, u64 events) 1504static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1354{ 1505{
1355 struct hw_perf_event *hwc = &event->hw; 1506 struct hw_perf_event *hwc = &event->hw;
1356 u64 period, sample_period; 1507 u64 period, sample_period;
1357 s64 delta; 1508 s64 delta;
1358 1509
1359 events *= hwc->sample_period; 1510 period = perf_calculate_period(event, nsec, count);
1360 period = div64_u64(events, event->attr.sample_freq);
1361 1511
1362 delta = (s64)(period - hwc->sample_period); 1512 delta = (s64)(period - hwc->sample_period);
1363 delta = (delta + 7) / 8; /* low pass filter */ 1513 delta = (delta + 7) / 8; /* low pass filter */
@@ -1368,19 +1518,31 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
1368 sample_period = 1; 1518 sample_period = 1;
1369 1519
1370 hwc->sample_period = sample_period; 1520 hwc->sample_period = sample_period;
1521
1522 if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1523 perf_disable();
1524 perf_event_stop(event);
1525 atomic64_set(&hwc->period_left, 0);
1526 perf_event_start(event);
1527 perf_enable();
1528 }
1371} 1529}
1372 1530
1373static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1531static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1374{ 1532{
1375 struct perf_event *event; 1533 struct perf_event *event;
1376 struct hw_perf_event *hwc; 1534 struct hw_perf_event *hwc;
1377 u64 interrupts, freq; 1535 u64 interrupts, now;
1536 s64 delta;
1378 1537
1379 raw_spin_lock(&ctx->lock); 1538 raw_spin_lock(&ctx->lock);
1380 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 1539 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1381 if (event->state != PERF_EVENT_STATE_ACTIVE) 1540 if (event->state != PERF_EVENT_STATE_ACTIVE)
1382 continue; 1541 continue;
1383 1542
1543 if (event->cpu != -1 && event->cpu != smp_processor_id())
1544 continue;
1545
1384 hwc = &event->hw; 1546 hwc = &event->hw;
1385 1547
1386 interrupts = hwc->interrupts; 1548 interrupts = hwc->interrupts;
@@ -1392,44 +1554,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1392 if (interrupts == MAX_INTERRUPTS) { 1554 if (interrupts == MAX_INTERRUPTS) {
1393 perf_log_throttle(event, 1); 1555 perf_log_throttle(event, 1);
1394 event->pmu->unthrottle(event); 1556 event->pmu->unthrottle(event);
1395 interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1396 } 1557 }
1397 1558
1398 if (!event->attr.freq || !event->attr.sample_freq) 1559 if (!event->attr.freq || !event->attr.sample_freq)
1399 continue; 1560 continue;
1400 1561
1401 /* 1562 event->pmu->read(event);
1402 * if the specified freq < HZ then we need to skip ticks 1563 now = atomic64_read(&event->count);
1403 */ 1564 delta = now - hwc->freq_count_stamp;
1404 if (event->attr.sample_freq < HZ) { 1565 hwc->freq_count_stamp = now;
1405 freq = event->attr.sample_freq;
1406
1407 hwc->freq_count += freq;
1408 hwc->freq_interrupts += interrupts;
1409
1410 if (hwc->freq_count < HZ)
1411 continue;
1412
1413 interrupts = hwc->freq_interrupts;
1414 hwc->freq_interrupts = 0;
1415 hwc->freq_count -= HZ;
1416 } else
1417 freq = HZ;
1418
1419 perf_adjust_period(event, freq * interrupts);
1420 1566
1421 /* 1567 if (delta > 0)
1422 * In order to avoid being stalled by an (accidental) huge 1568 perf_adjust_period(event, TICK_NSEC, delta);
1423 * sample period, force reset the sample period if we didn't
1424 * get any events in this freq period.
1425 */
1426 if (!interrupts) {
1427 perf_disable();
1428 event->pmu->disable(event);
1429 atomic64_set(&hwc->period_left, 0);
1430 event->pmu->enable(event);
1431 perf_enable();
1432 }
1433 } 1569 }
1434 raw_spin_unlock(&ctx->lock); 1570 raw_spin_unlock(&ctx->lock);
1435} 1571}
@@ -1439,26 +1575,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1439 */ 1575 */
1440static void rotate_ctx(struct perf_event_context *ctx) 1576static void rotate_ctx(struct perf_event_context *ctx)
1441{ 1577{
1442 struct perf_event *event;
1443
1444 if (!ctx->nr_events) 1578 if (!ctx->nr_events)
1445 return; 1579 return;
1446 1580
1447 raw_spin_lock(&ctx->lock); 1581 raw_spin_lock(&ctx->lock);
1448 /* 1582
1449 * Rotate the first entry last (works just fine for group events too): 1583 /* Rotate the first entry last of non-pinned groups */
1450 */ 1584 list_rotate_left(&ctx->flexible_groups);
1451 perf_disable();
1452 list_for_each_entry(event, &ctx->group_list, group_entry) {
1453 list_move_tail(&event->group_entry, &ctx->group_list);
1454 break;
1455 }
1456 perf_enable();
1457 1585
1458 raw_spin_unlock(&ctx->lock); 1586 raw_spin_unlock(&ctx->lock);
1459} 1587}
1460 1588
1461void perf_event_task_tick(struct task_struct *curr, int cpu) 1589void perf_event_task_tick(struct task_struct *curr)
1462{ 1590{
1463 struct perf_cpu_context *cpuctx; 1591 struct perf_cpu_context *cpuctx;
1464 struct perf_event_context *ctx; 1592 struct perf_event_context *ctx;
@@ -1466,24 +1594,43 @@ void perf_event_task_tick(struct task_struct *curr, int cpu)
1466 if (!atomic_read(&nr_events)) 1594 if (!atomic_read(&nr_events))
1467 return; 1595 return;
1468 1596
1469 cpuctx = &per_cpu(perf_cpu_context, cpu); 1597 cpuctx = &__get_cpu_var(perf_cpu_context);
1470 ctx = curr->perf_event_ctxp; 1598 ctx = curr->perf_event_ctxp;
1471 1599
1600 perf_disable();
1601
1472 perf_ctx_adjust_freq(&cpuctx->ctx); 1602 perf_ctx_adjust_freq(&cpuctx->ctx);
1473 if (ctx) 1603 if (ctx)
1474 perf_ctx_adjust_freq(ctx); 1604 perf_ctx_adjust_freq(ctx);
1475 1605
1476 perf_event_cpu_sched_out(cpuctx); 1606 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1477 if (ctx) 1607 if (ctx)
1478 __perf_event_task_sched_out(ctx); 1608 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
1479 1609
1480 rotate_ctx(&cpuctx->ctx); 1610 rotate_ctx(&cpuctx->ctx);
1481 if (ctx) 1611 if (ctx)
1482 rotate_ctx(ctx); 1612 rotate_ctx(ctx);
1483 1613
1484 perf_event_cpu_sched_in(cpuctx, cpu); 1614 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1485 if (ctx) 1615 if (ctx)
1486 perf_event_task_sched_in(curr, cpu); 1616 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1617
1618 perf_enable();
1619}
1620
1621static int event_enable_on_exec(struct perf_event *event,
1622 struct perf_event_context *ctx)
1623{
1624 if (!event->attr.enable_on_exec)
1625 return 0;
1626
1627 event->attr.enable_on_exec = 0;
1628 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1629 return 0;
1630
1631 __perf_event_mark_enabled(event, ctx);
1632
1633 return 1;
1487} 1634}
1488 1635
1489/* 1636/*
@@ -1496,6 +1643,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1496 struct perf_event *event; 1643 struct perf_event *event;
1497 unsigned long flags; 1644 unsigned long flags;
1498 int enabled = 0; 1645 int enabled = 0;
1646 int ret;
1499 1647
1500 local_irq_save(flags); 1648 local_irq_save(flags);
1501 ctx = task->perf_event_ctxp; 1649 ctx = task->perf_event_ctxp;
@@ -1506,14 +1654,16 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1506 1654
1507 raw_spin_lock(&ctx->lock); 1655 raw_spin_lock(&ctx->lock);
1508 1656
1509 list_for_each_entry(event, &ctx->group_list, group_entry) { 1657 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1510 if (!event->attr.enable_on_exec) 1658 ret = event_enable_on_exec(event, ctx);
1511 continue; 1659 if (ret)
1512 event->attr.enable_on_exec = 0; 1660 enabled = 1;
1513 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1661 }
1514 continue; 1662
1515 __perf_event_mark_enabled(event, ctx); 1663 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1516 enabled = 1; 1664 ret = event_enable_on_exec(event, ctx);
1665 if (ret)
1666 enabled = 1;
1517 } 1667 }
1518 1668
1519 /* 1669 /*
@@ -1524,7 +1674,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1524 1674
1525 raw_spin_unlock(&ctx->lock); 1675 raw_spin_unlock(&ctx->lock);
1526 1676
1527 perf_event_task_sched_in(task, smp_processor_id()); 1677 perf_event_task_sched_in(task);
1528 out: 1678 out:
1529 local_irq_restore(flags); 1679 local_irq_restore(flags);
1530} 1680}
@@ -1587,7 +1737,8 @@ __perf_event_init_context(struct perf_event_context *ctx,
1587{ 1737{
1588 raw_spin_lock_init(&ctx->lock); 1738 raw_spin_lock_init(&ctx->lock);
1589 mutex_init(&ctx->mutex); 1739 mutex_init(&ctx->mutex);
1590 INIT_LIST_HEAD(&ctx->group_list); 1740 INIT_LIST_HEAD(&ctx->pinned_groups);
1741 INIT_LIST_HEAD(&ctx->flexible_groups);
1591 INIT_LIST_HEAD(&ctx->event_list); 1742 INIT_LIST_HEAD(&ctx->event_list);
1592 atomic_set(&ctx->refcount, 1); 1743 atomic_set(&ctx->refcount, 1);
1593 ctx->task = task; 1744 ctx->task = task;
@@ -1614,7 +1765,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1614 * offline CPU and activate it when the CPU comes up, but 1765 * offline CPU and activate it when the CPU comes up, but
1615 * that's for later. 1766 * that's for later.
1616 */ 1767 */
1617 if (!cpu_isset(cpu, cpu_online_map)) 1768 if (!cpu_online(cpu))
1618 return ERR_PTR(-ENODEV); 1769 return ERR_PTR(-ENODEV);
1619 1770
1620 cpuctx = &per_cpu(perf_cpu_context, cpu); 1771 cpuctx = &per_cpu(perf_cpu_context, cpu);
@@ -2459,7 +2610,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2459 if (user_locked > user_lock_limit) 2610 if (user_locked > user_lock_limit)
2460 extra = user_locked - user_lock_limit; 2611 extra = user_locked - user_lock_limit;
2461 2612
2462 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 2613 lock_limit = rlimit(RLIMIT_MEMLOCK);
2463 lock_limit >>= PAGE_SHIFT; 2614 lock_limit >>= PAGE_SHIFT;
2464 locked = vma->vm_mm->locked_vm + extra; 2615 locked = vma->vm_mm->locked_vm + extra;
2465 2616
@@ -3256,8 +3407,6 @@ static void perf_event_task_output(struct perf_event *event,
3256 task_event->event_id.tid = perf_event_tid(event, task); 3407 task_event->event_id.tid = perf_event_tid(event, task);
3257 task_event->event_id.ptid = perf_event_tid(event, current); 3408 task_event->event_id.ptid = perf_event_tid(event, current);
3258 3409
3259 task_event->event_id.time = perf_clock();
3260
3261 perf_output_put(&handle, task_event->event_id); 3410 perf_output_put(&handle, task_event->event_id);
3262 3411
3263 perf_output_end(&handle); 3412 perf_output_end(&handle);
@@ -3265,6 +3414,12 @@ static void perf_event_task_output(struct perf_event *event,
3265 3414
3266static int perf_event_task_match(struct perf_event *event) 3415static int perf_event_task_match(struct perf_event *event)
3267{ 3416{
3417 if (event->state < PERF_EVENT_STATE_INACTIVE)
3418 return 0;
3419
3420 if (event->cpu != -1 && event->cpu != smp_processor_id())
3421 return 0;
3422
3268 if (event->attr.comm || event->attr.mmap || event->attr.task) 3423 if (event->attr.comm || event->attr.mmap || event->attr.task)
3269 return 1; 3424 return 1;
3270 3425
@@ -3290,12 +3445,11 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3290 rcu_read_lock(); 3445 rcu_read_lock();
3291 cpuctx = &get_cpu_var(perf_cpu_context); 3446 cpuctx = &get_cpu_var(perf_cpu_context);
3292 perf_event_task_ctx(&cpuctx->ctx, task_event); 3447 perf_event_task_ctx(&cpuctx->ctx, task_event);
3293 put_cpu_var(perf_cpu_context);
3294
3295 if (!ctx) 3448 if (!ctx)
3296 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3449 ctx = rcu_dereference(current->perf_event_ctxp);
3297 if (ctx) 3450 if (ctx)
3298 perf_event_task_ctx(ctx, task_event); 3451 perf_event_task_ctx(ctx, task_event);
3452 put_cpu_var(perf_cpu_context);
3299 rcu_read_unlock(); 3453 rcu_read_unlock();
3300} 3454}
3301 3455
@@ -3323,6 +3477,7 @@ static void perf_event_task(struct task_struct *task,
3323 /* .ppid */ 3477 /* .ppid */
3324 /* .tid */ 3478 /* .tid */
3325 /* .ptid */ 3479 /* .ptid */
3480 .time = perf_clock(),
3326 }, 3481 },
3327 }; 3482 };
3328 3483
@@ -3372,6 +3527,12 @@ static void perf_event_comm_output(struct perf_event *event,
3372 3527
3373static int perf_event_comm_match(struct perf_event *event) 3528static int perf_event_comm_match(struct perf_event *event)
3374{ 3529{
3530 if (event->state < PERF_EVENT_STATE_INACTIVE)
3531 return 0;
3532
3533 if (event->cpu != -1 && event->cpu != smp_processor_id())
3534 return 0;
3535
3375 if (event->attr.comm) 3536 if (event->attr.comm)
3376 return 1; 3537 return 1;
3377 3538
@@ -3408,15 +3569,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3408 rcu_read_lock(); 3569 rcu_read_lock();
3409 cpuctx = &get_cpu_var(perf_cpu_context); 3570 cpuctx = &get_cpu_var(perf_cpu_context);
3410 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3571 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3411 put_cpu_var(perf_cpu_context);
3412
3413 /*
3414 * doesn't really matter which of the child contexts the
3415 * events ends up in.
3416 */
3417 ctx = rcu_dereference(current->perf_event_ctxp); 3572 ctx = rcu_dereference(current->perf_event_ctxp);
3418 if (ctx) 3573 if (ctx)
3419 perf_event_comm_ctx(ctx, comm_event); 3574 perf_event_comm_ctx(ctx, comm_event);
3575 put_cpu_var(perf_cpu_context);
3420 rcu_read_unlock(); 3576 rcu_read_unlock();
3421} 3577}
3422 3578
@@ -3491,6 +3647,12 @@ static void perf_event_mmap_output(struct perf_event *event,
3491static int perf_event_mmap_match(struct perf_event *event, 3647static int perf_event_mmap_match(struct perf_event *event,
3492 struct perf_mmap_event *mmap_event) 3648 struct perf_mmap_event *mmap_event)
3493{ 3649{
3650 if (event->state < PERF_EVENT_STATE_INACTIVE)
3651 return 0;
3652
3653 if (event->cpu != -1 && event->cpu != smp_processor_id())
3654 return 0;
3655
3494 if (event->attr.mmap) 3656 if (event->attr.mmap)
3495 return 1; 3657 return 1;
3496 3658
@@ -3564,15 +3726,10 @@ got_name:
3564 rcu_read_lock(); 3726 rcu_read_lock();
3565 cpuctx = &get_cpu_var(perf_cpu_context); 3727 cpuctx = &get_cpu_var(perf_cpu_context);
3566 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3728 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3567 put_cpu_var(perf_cpu_context);
3568
3569 /*
3570 * doesn't really matter which of the child contexts the
3571 * events ends up in.
3572 */
3573 ctx = rcu_dereference(current->perf_event_ctxp); 3729 ctx = rcu_dereference(current->perf_event_ctxp);
3574 if (ctx) 3730 if (ctx)
3575 perf_event_mmap_ctx(ctx, mmap_event); 3731 perf_event_mmap_ctx(ctx, mmap_event);
3732 put_cpu_var(perf_cpu_context);
3576 rcu_read_unlock(); 3733 rcu_read_unlock();
3577 3734
3578 kfree(buf); 3735 kfree(buf);
@@ -3599,7 +3756,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3599 /* .tid */ 3756 /* .tid */
3600 .start = vma->vm_start, 3757 .start = vma->vm_start,
3601 .len = vma->vm_end - vma->vm_start, 3758 .len = vma->vm_end - vma->vm_start,
3602 .pgoff = vma->vm_pgoff, 3759 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
3603 }, 3760 },
3604 }; 3761 };
3605 3762
@@ -3679,12 +3836,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3679 3836
3680 if (event->attr.freq) { 3837 if (event->attr.freq) {
3681 u64 now = perf_clock(); 3838 u64 now = perf_clock();
3682 s64 delta = now - hwc->freq_stamp; 3839 s64 delta = now - hwc->freq_time_stamp;
3683 3840
3684 hwc->freq_stamp = now; 3841 hwc->freq_time_stamp = now;
3685 3842
3686 if (delta > 0 && delta < TICK_NSEC) 3843 if (delta > 0 && delta < 2*TICK_NSEC)
3687 perf_adjust_period(event, NSEC_PER_SEC / (int)delta); 3844 perf_adjust_period(event, delta, hwc->last_period);
3688 } 3845 }
3689 3846
3690 /* 3847 /*
@@ -3863,6 +4020,9 @@ static int perf_swevent_match(struct perf_event *event,
3863 struct perf_sample_data *data, 4020 struct perf_sample_data *data,
3864 struct pt_regs *regs) 4021 struct pt_regs *regs)
3865{ 4022{
4023 if (event->cpu != -1 && event->cpu != smp_processor_id())
4024 return 0;
4025
3866 if (!perf_swevent_is_counting(event)) 4026 if (!perf_swevent_is_counting(event))
3867 return 0; 4027 return 0;
3868 4028
@@ -4172,7 +4332,7 @@ static const struct pmu perf_ops_task_clock = {
4172 .read = task_clock_perf_event_read, 4332 .read = task_clock_perf_event_read,
4173}; 4333};
4174 4334
4175#ifdef CONFIG_EVENT_PROFILE 4335#ifdef CONFIG_EVENT_TRACING
4176 4336
4177void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4337void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4178 int entry_size) 4338 int entry_size)
@@ -4277,7 +4437,7 @@ static void perf_event_free_filter(struct perf_event *event)
4277{ 4437{
4278} 4438}
4279 4439
4280#endif /* CONFIG_EVENT_PROFILE */ 4440#endif /* CONFIG_EVENT_TRACING */
4281 4441
4282#ifdef CONFIG_HAVE_HW_BREAKPOINT 4442#ifdef CONFIG_HAVE_HW_BREAKPOINT
4283static void bp_perf_event_destroy(struct perf_event *event) 4443static void bp_perf_event_destroy(struct perf_event *event)
@@ -4567,7 +4727,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
4567 if (attr->type >= PERF_TYPE_MAX) 4727 if (attr->type >= PERF_TYPE_MAX)
4568 return -EINVAL; 4728 return -EINVAL;
4569 4729
4570 if (attr->__reserved_1 || attr->__reserved_2) 4730 if (attr->__reserved_1)
4571 return -EINVAL; 4731 return -EINVAL;
4572 4732
4573 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 4733 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -4720,7 +4880,7 @@ SYSCALL_DEFINE5(perf_event_open,
4720 if (IS_ERR(event)) 4880 if (IS_ERR(event))
4721 goto err_put_context; 4881 goto err_put_context;
4722 4882
4723 err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); 4883 err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
4724 if (err < 0) 4884 if (err < 0)
4725 goto err_free_put_context; 4885 goto err_free_put_context;
4726 4886
@@ -4858,8 +5018,15 @@ inherit_event(struct perf_event *parent_event,
4858 else 5018 else
4859 child_event->state = PERF_EVENT_STATE_OFF; 5019 child_event->state = PERF_EVENT_STATE_OFF;
4860 5020
4861 if (parent_event->attr.freq) 5021 if (parent_event->attr.freq) {
4862 child_event->hw.sample_period = parent_event->hw.sample_period; 5022 u64 sample_period = parent_event->hw.sample_period;
5023 struct hw_perf_event *hwc = &child_event->hw;
5024
5025 hwc->sample_period = sample_period;
5026 hwc->last_period = sample_period;
5027
5028 atomic64_set(&hwc->period_left, sample_period);
5029 }
4863 5030
4864 child_event->overflow_handler = parent_event->overflow_handler; 5031 child_event->overflow_handler = parent_event->overflow_handler;
4865 5032
@@ -5027,7 +5194,11 @@ void perf_event_exit_task(struct task_struct *child)
5027 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); 5194 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
5028 5195
5029again: 5196again:
5030 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, 5197 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
5198 group_entry)
5199 __perf_event_exit_task(child_event, child_ctx, child);
5200
5201 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
5031 group_entry) 5202 group_entry)
5032 __perf_event_exit_task(child_event, child_ctx, child); 5203 __perf_event_exit_task(child_event, child_ctx, child);
5033 5204
@@ -5036,7 +5207,8 @@ again:
5036 * its siblings to the list, but we obtained 'tmp' before that which 5207 * its siblings to the list, but we obtained 'tmp' before that which
5037 * will still point to the list head terminating the iteration. 5208 * will still point to the list head terminating the iteration.
5038 */ 5209 */
5039 if (!list_empty(&child_ctx->group_list)) 5210 if (!list_empty(&child_ctx->pinned_groups) ||
5211 !list_empty(&child_ctx->flexible_groups))
5040 goto again; 5212 goto again;
5041 5213
5042 mutex_unlock(&child_ctx->mutex); 5214 mutex_unlock(&child_ctx->mutex);
@@ -5044,6 +5216,24 @@ again:
5044 put_ctx(child_ctx); 5216 put_ctx(child_ctx);
5045} 5217}
5046 5218
5219static void perf_free_event(struct perf_event *event,
5220 struct perf_event_context *ctx)
5221{
5222 struct perf_event *parent = event->parent;
5223
5224 if (WARN_ON_ONCE(!parent))
5225 return;
5226
5227 mutex_lock(&parent->child_mutex);
5228 list_del_init(&event->child_list);
5229 mutex_unlock(&parent->child_mutex);
5230
5231 fput(parent->filp);
5232
5233 list_del_event(event, ctx);
5234 free_event(event);
5235}
5236
5047/* 5237/*
5048 * free an unexposed, unused context as created by inheritance by 5238 * free an unexposed, unused context as created by inheritance by
5049 * init_task below, used by fork() in case of fail. 5239 * init_task below, used by fork() in case of fail.
@@ -5058,36 +5248,70 @@ void perf_event_free_task(struct task_struct *task)
5058 5248
5059 mutex_lock(&ctx->mutex); 5249 mutex_lock(&ctx->mutex);
5060again: 5250again:
5061 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { 5251 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5062 struct perf_event *parent = event->parent; 5252 perf_free_event(event, ctx);
5063 5253
5064 if (WARN_ON_ONCE(!parent)) 5254 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5065 continue; 5255 group_entry)
5256 perf_free_event(event, ctx);
5066 5257
5067 mutex_lock(&parent->child_mutex); 5258 if (!list_empty(&ctx->pinned_groups) ||
5068 list_del_init(&event->child_list); 5259 !list_empty(&ctx->flexible_groups))
5069 mutex_unlock(&parent->child_mutex); 5260 goto again;
5070 5261
5071 fput(parent->filp); 5262 mutex_unlock(&ctx->mutex);
5072 5263
5073 list_del_event(event, ctx); 5264 put_ctx(ctx);
5074 free_event(event); 5265}
5266
5267static int
5268inherit_task_group(struct perf_event *event, struct task_struct *parent,
5269 struct perf_event_context *parent_ctx,
5270 struct task_struct *child,
5271 int *inherited_all)
5272{
5273 int ret;
5274 struct perf_event_context *child_ctx = child->perf_event_ctxp;
5275
5276 if (!event->attr.inherit) {
5277 *inherited_all = 0;
5278 return 0;
5075 } 5279 }
5076 5280
5077 if (!list_empty(&ctx->group_list)) 5281 if (!child_ctx) {
5078 goto again; 5282 /*
5283 * This is executed from the parent task context, so
5284 * inherit events that have been marked for cloning.
5285 * First allocate and initialize a context for the
5286 * child.
5287 */
5079 5288
5080 mutex_unlock(&ctx->mutex); 5289 child_ctx = kzalloc(sizeof(struct perf_event_context),
5290 GFP_KERNEL);
5291 if (!child_ctx)
5292 return -ENOMEM;
5081 5293
5082 put_ctx(ctx); 5294 __perf_event_init_context(child_ctx, child);
5295 child->perf_event_ctxp = child_ctx;
5296 get_task_struct(child);
5297 }
5298
5299 ret = inherit_group(event, parent, parent_ctx,
5300 child, child_ctx);
5301
5302 if (ret)
5303 *inherited_all = 0;
5304
5305 return ret;
5083} 5306}
5084 5307
5308
5085/* 5309/*
5086 * Initialize the perf_event context in task_struct 5310 * Initialize the perf_event context in task_struct
5087 */ 5311 */
5088int perf_event_init_task(struct task_struct *child) 5312int perf_event_init_task(struct task_struct *child)
5089{ 5313{
5090 struct perf_event_context *child_ctx = NULL, *parent_ctx; 5314 struct perf_event_context *child_ctx, *parent_ctx;
5091 struct perf_event_context *cloned_ctx; 5315 struct perf_event_context *cloned_ctx;
5092 struct perf_event *event; 5316 struct perf_event *event;
5093 struct task_struct *parent = current; 5317 struct task_struct *parent = current;
@@ -5125,42 +5349,23 @@ int perf_event_init_task(struct task_struct *child)
5125 * We dont have to disable NMIs - we are only looking at 5349 * We dont have to disable NMIs - we are only looking at
5126 * the list, not manipulating it: 5350 * the list, not manipulating it:
5127 */ 5351 */
5128 list_for_each_entry(event, &parent_ctx->group_list, group_entry) { 5352 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5129 5353 ret = inherit_task_group(event, parent, parent_ctx, child,
5130 if (!event->attr.inherit) { 5354 &inherited_all);
5131 inherited_all = 0; 5355 if (ret)
5132 continue; 5356 break;
5133 } 5357 }
5134
5135 if (!child->perf_event_ctxp) {
5136 /*
5137 * This is executed from the parent task context, so
5138 * inherit events that have been marked for cloning.
5139 * First allocate and initialize a context for the
5140 * child.
5141 */
5142
5143 child_ctx = kzalloc(sizeof(struct perf_event_context),
5144 GFP_KERNEL);
5145 if (!child_ctx) {
5146 ret = -ENOMEM;
5147 goto exit;
5148 }
5149
5150 __perf_event_init_context(child_ctx, child);
5151 child->perf_event_ctxp = child_ctx;
5152 get_task_struct(child);
5153 }
5154 5358
5155 ret = inherit_group(event, parent, parent_ctx, 5359 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5156 child, child_ctx); 5360 ret = inherit_task_group(event, parent, parent_ctx, child,
5157 if (ret) { 5361 &inherited_all);
5158 inherited_all = 0; 5362 if (ret)
5159 break; 5363 break;
5160 }
5161 } 5364 }
5162 5365
5163 if (inherited_all) { 5366 child_ctx = child->perf_event_ctxp;
5367
5368 if (child_ctx && inherited_all) {
5164 /* 5369 /*
5165 * Mark the child context as a clone of the parent 5370 * Mark the child context as a clone of the parent
5166 * context, or of whatever the parent is a clone of. 5371 * context, or of whatever the parent is a clone of.
@@ -5180,7 +5385,6 @@ int perf_event_init_task(struct task_struct *child)
5180 get_ctx(child_ctx->parent_ctx); 5385 get_ctx(child_ctx->parent_ctx);
5181 } 5386 }
5182 5387
5183exit:
5184 mutex_unlock(&parent_ctx->mutex); 5388 mutex_unlock(&parent_ctx->mutex);
5185 5389
5186 perf_unpin_context(parent_ctx); 5390 perf_unpin_context(parent_ctx);
@@ -5209,7 +5413,9 @@ static void __perf_event_exit_cpu(void *info)
5209 struct perf_event_context *ctx = &cpuctx->ctx; 5413 struct perf_event_context *ctx = &cpuctx->ctx;
5210 struct perf_event *event, *tmp; 5414 struct perf_event *event, *tmp;
5211 5415
5212 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) 5416 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5417 __perf_event_remove_from_context(event);
5418 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5213 __perf_event_remove_from_context(event); 5419 __perf_event_remove_from_context(event);
5214} 5420}
5215static void perf_event_exit_cpu(int cpu) 5421static void perf_event_exit_cpu(int cpu)
@@ -5247,6 +5453,10 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5247 perf_event_exit_cpu(cpu); 5453 perf_event_exit_cpu(cpu);
5248 break; 5454 break;
5249 5455
5456 case CPU_DEAD:
5457 hw_perf_event_setup_offline(cpu);
5458 break;
5459
5250 default: 5460 default:
5251 break; 5461 break;
5252 } 5462 }
diff --git a/kernel/pid.c b/kernel/pid.c
index 2e17c9c92cbe..86b296943e5f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -367,7 +367,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
367 struct task_struct *result = NULL; 367 struct task_struct *result = NULL;
368 if (pid) { 368 if (pid) {
369 struct hlist_node *first; 369 struct hlist_node *first;
370 first = rcu_dereference(pid->tasks[type].first); 370 first = rcu_dereference_check(pid->tasks[type].first, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock));
371 if (first) 371 if (first)
372 result = hlist_entry(first, struct task_struct, pids[(type)].node); 372 result = hlist_entry(first, struct task_struct, pids[(type)].node);
373 } 373 }
@@ -376,7 +376,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
376EXPORT_SYMBOL(pid_task); 376EXPORT_SYMBOL(pid_task);
377 377
378/* 378/*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 379 * Must be called under rcu_read_lock().
380 */ 380 */
381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382{ 382{
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 438ff4523513..1a22dfd42df9 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -982,6 +982,7 @@ static void check_thread_timers(struct task_struct *tsk,
982 int maxfire; 982 int maxfire;
983 struct list_head *timers = tsk->cpu_timers; 983 struct list_head *timers = tsk->cpu_timers;
984 struct signal_struct *const sig = tsk->signal; 984 struct signal_struct *const sig = tsk->signal;
985 unsigned long soft;
985 986
986 maxfire = 20; 987 maxfire = 20;
987 tsk->cputime_expires.prof_exp = cputime_zero; 988 tsk->cputime_expires.prof_exp = cputime_zero;
@@ -1030,9 +1031,10 @@ static void check_thread_timers(struct task_struct *tsk,
1030 /* 1031 /*
1031 * Check for the special case thread timers. 1032 * Check for the special case thread timers.
1032 */ 1033 */
1033 if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { 1034 soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
1034 unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; 1035 if (soft != RLIM_INFINITY) {
1035 unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; 1036 unsigned long hard =
1037 ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
1036 1038
1037 if (hard != RLIM_INFINITY && 1039 if (hard != RLIM_INFINITY &&
1038 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { 1040 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -1043,14 +1045,13 @@ static void check_thread_timers(struct task_struct *tsk,
1043 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1045 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1044 return; 1046 return;
1045 } 1047 }
1046 if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { 1048 if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
1047 /* 1049 /*
1048 * At the soft limit, send a SIGXCPU every second. 1050 * At the soft limit, send a SIGXCPU every second.
1049 */ 1051 */
1050 if (sig->rlim[RLIMIT_RTTIME].rlim_cur 1052 if (soft < hard) {
1051 < sig->rlim[RLIMIT_RTTIME].rlim_max) { 1053 soft += USEC_PER_SEC;
1052 sig->rlim[RLIMIT_RTTIME].rlim_cur += 1054 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
1053 USEC_PER_SEC;
1054 } 1055 }
1055 printk(KERN_INFO 1056 printk(KERN_INFO
1056 "RT Watchdog Timeout: %s[%d]\n", 1057 "RT Watchdog Timeout: %s[%d]\n",
@@ -1121,6 +1122,7 @@ static void check_process_timers(struct task_struct *tsk,
1121 unsigned long long sum_sched_runtime, sched_expires; 1122 unsigned long long sum_sched_runtime, sched_expires;
1122 struct list_head *timers = sig->cpu_timers; 1123 struct list_head *timers = sig->cpu_timers;
1123 struct task_cputime cputime; 1124 struct task_cputime cputime;
1125 unsigned long soft;
1124 1126
1125 /* 1127 /*
1126 * Don't sample the current process CPU clocks if there are no timers. 1128 * Don't sample the current process CPU clocks if there are no timers.
@@ -1193,11 +1195,13 @@ static void check_process_timers(struct task_struct *tsk,
1193 SIGPROF); 1195 SIGPROF);
1194 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, 1196 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1195 SIGVTALRM); 1197 SIGVTALRM);
1196 1198 soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1197 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 1199 if (soft != RLIM_INFINITY) {
1198 unsigned long psecs = cputime_to_secs(ptime); 1200 unsigned long psecs = cputime_to_secs(ptime);
1201 unsigned long hard =
1202 ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
1199 cputime_t x; 1203 cputime_t x;
1200 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { 1204 if (psecs >= hard) {
1201 /* 1205 /*
1202 * At the hard limit, we just die. 1206 * At the hard limit, we just die.
1203 * No need to calculate anything else now. 1207 * No need to calculate anything else now.
@@ -1205,17 +1209,17 @@ static void check_process_timers(struct task_struct *tsk,
1205 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1209 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1206 return; 1210 return;
1207 } 1211 }
1208 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { 1212 if (psecs >= soft) {
1209 /* 1213 /*
1210 * At the soft limit, send a SIGXCPU every second. 1214 * At the soft limit, send a SIGXCPU every second.
1211 */ 1215 */
1212 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 1216 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1213 if (sig->rlim[RLIMIT_CPU].rlim_cur 1217 if (soft < hard) {
1214 < sig->rlim[RLIMIT_CPU].rlim_max) { 1218 soft++;
1215 sig->rlim[RLIMIT_CPU].rlim_cur++; 1219 sig->rlim[RLIMIT_CPU].rlim_cur = soft;
1216 } 1220 }
1217 } 1221 }
1218 x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 1222 x = secs_to_cputime(soft);
1219 if (cputime_eq(prof_expires, cputime_zero) || 1223 if (cputime_eq(prof_expires, cputime_zero) ||
1220 cputime_lt(x, prof_expires)) { 1224 cputime_lt(x, prof_expires)) {
1221 prof_expires = x; 1225 prof_expires = x;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 495440779ce3..00d1fda58ab6 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -256,7 +256,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
256 return 0; 256 return 0;
257} 257}
258 258
259int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) 259static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
260{ 260{
261 *tp = ktime_to_timespec(KTIME_LOW_RES); 261 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0; 262 return 0;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 91e09d3b2eb2..5c36ea9d55d2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,15 @@ config PM_DEBUG
27 code. This is helpful when debugging and reporting PM bugs, like 27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support. 28 suspend support.
29 29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
30config PM_VERBOSE 39config PM_VERBOSE
31 bool "Verbose Power Management debugging" 40 bool "Verbose Power Management debugging"
32 depends on PM_DEBUG 41 depends on PM_DEBUG
@@ -85,6 +94,11 @@ config PM_SLEEP
85 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE 94 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
86 default y 95 default y
87 96
97config PM_SLEEP_ADVANCED_DEBUG
98 bool
99 depends on PM_ADVANCED_DEBUG
100 default n
101
88config SUSPEND 102config SUSPEND
89 bool "Suspend to RAM and standby" 103 bool "Suspend to RAM and standby"
90 depends on PM && ARCH_SUSPEND_POSSIBLE 104 depends on PM && ARCH_SUSPEND_POSSIBLE
@@ -222,3 +236,8 @@ config PM_RUNTIME
222 and the bus type drivers of the buses the devices are on are 236 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and 237 responsible for the actual handling of the autosuspend requests and
224 wake-up events. 238 wake-up events.
239
240config PM_OPS
241 bool
242 depends on PM_SLEEP || PM_RUNTIME
243 default y
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index bbfe472d7524..da5288ec2392 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -323,6 +323,7 @@ static int create_image(int platform_mode)
323int hibernation_snapshot(int platform_mode) 323int hibernation_snapshot(int platform_mode)
324{ 324{
325 int error; 325 int error;
326 gfp_t saved_mask;
326 327
327 error = platform_begin(platform_mode); 328 error = platform_begin(platform_mode);
328 if (error) 329 if (error)
@@ -334,6 +335,7 @@ int hibernation_snapshot(int platform_mode)
334 goto Close; 335 goto Close;
335 336
336 suspend_console(); 337 suspend_console();
338 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
337 error = dpm_suspend_start(PMSG_FREEZE); 339 error = dpm_suspend_start(PMSG_FREEZE);
338 if (error) 340 if (error)
339 goto Recover_platform; 341 goto Recover_platform;
@@ -351,6 +353,7 @@ int hibernation_snapshot(int platform_mode)
351 353
352 dpm_resume_end(in_suspend ? 354 dpm_resume_end(in_suspend ?
353 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 355 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
356 set_gfp_allowed_mask(saved_mask);
354 resume_console(); 357 resume_console();
355 Close: 358 Close:
356 platform_end(platform_mode); 359 platform_end(platform_mode);
@@ -445,14 +448,17 @@ static int resume_target_kernel(bool platform_mode)
445int hibernation_restore(int platform_mode) 448int hibernation_restore(int platform_mode)
446{ 449{
447 int error; 450 int error;
451 gfp_t saved_mask;
448 452
449 pm_prepare_console(); 453 pm_prepare_console();
450 suspend_console(); 454 suspend_console();
455 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
451 error = dpm_suspend_start(PMSG_QUIESCE); 456 error = dpm_suspend_start(PMSG_QUIESCE);
452 if (!error) { 457 if (!error) {
453 error = resume_target_kernel(platform_mode); 458 error = resume_target_kernel(platform_mode);
454 dpm_resume_end(PMSG_RECOVER); 459 dpm_resume_end(PMSG_RECOVER);
455 } 460 }
461 set_gfp_allowed_mask(saved_mask);
456 resume_console(); 462 resume_console();
457 pm_restore_console(); 463 pm_restore_console();
458 return error; 464 return error;
@@ -466,6 +472,7 @@ int hibernation_restore(int platform_mode)
466int hibernation_platform_enter(void) 472int hibernation_platform_enter(void)
467{ 473{
468 int error; 474 int error;
475 gfp_t saved_mask;
469 476
470 if (!hibernation_ops) 477 if (!hibernation_ops)
471 return -ENOSYS; 478 return -ENOSYS;
@@ -481,6 +488,7 @@ int hibernation_platform_enter(void)
481 488
482 entering_platform_hibernation = true; 489 entering_platform_hibernation = true;
483 suspend_console(); 490 suspend_console();
491 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
484 error = dpm_suspend_start(PMSG_HIBERNATE); 492 error = dpm_suspend_start(PMSG_HIBERNATE);
485 if (error) { 493 if (error) {
486 if (hibernation_ops->recover) 494 if (hibernation_ops->recover)
@@ -518,6 +526,7 @@ int hibernation_platform_enter(void)
518 Resume_devices: 526 Resume_devices:
519 entering_platform_hibernation = false; 527 entering_platform_hibernation = false;
520 dpm_resume_end(PMSG_RESTORE); 528 dpm_resume_end(PMSG_RESTORE);
529 set_gfp_allowed_mask(saved_mask);
521 resume_console(); 530 resume_console();
522 531
523 Close: 532 Close:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0998c7139053..b58800b21fc0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val)
44 == NOTIFY_BAD) ? -EINVAL : 0; 44 == NOTIFY_BAD) ? -EINVAL : 0;
45} 45}
46 46
47/* If set, devices may be suspended and resumed asynchronously. */
48int pm_async_enabled = 1;
49
50static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
51 char *buf)
52{
53 return sprintf(buf, "%d\n", pm_async_enabled);
54}
55
56static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
57 const char *buf, size_t n)
58{
59 unsigned long val;
60
61 if (strict_strtoul(buf, 10, &val))
62 return -EINVAL;
63
64 if (val > 1)
65 return -EINVAL;
66
67 pm_async_enabled = val;
68 return n;
69}
70
71power_attr(pm_async);
72
47#ifdef CONFIG_PM_DEBUG 73#ifdef CONFIG_PM_DEBUG
48int pm_test_level = TEST_NONE; 74int pm_test_level = TEST_NONE;
49 75
@@ -208,9 +234,12 @@ static struct attribute * g[] = {
208#ifdef CONFIG_PM_TRACE 234#ifdef CONFIG_PM_TRACE
209 &pm_trace_attr.attr, 235 &pm_trace_attr.attr,
210#endif 236#endif
211#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) 237#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr,
239#ifdef CONFIG_PM_DEBUG
212 &pm_test_attr.attr, 240 &pm_test_attr.attr,
213#endif 241#endif
242#endif
214 NULL, 243 NULL,
215}; 244};
216 245
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 36cb168e4330..830cadecbdfc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1181,7 +1181,7 @@ static void free_unnecessary_pages(void)
1181 1181
1182 memory_bm_position_reset(&copy_bm); 1182 memory_bm_position_reset(&copy_bm);
1183 1183
1184 while (to_free_normal > 0 && to_free_highmem > 0) { 1184 while (to_free_normal > 0 || to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn); 1186 struct page *page = pfn_to_page(pfn);
1187 1187
@@ -1500,7 +1500,7 @@ asmlinkage int swsusp_save(void)
1500{ 1500{
1501 unsigned int nr_pages, nr_highmem; 1501 unsigned int nr_pages, nr_highmem;
1502 1502
1503 printk(KERN_INFO "PM: Creating hibernation image: \n"); 1503 printk(KERN_INFO "PM: Creating hibernation image:\n");
1504 1504
1505 drain_local_pages(NULL); 1505 drain_local_pages(NULL);
1506 nr_pages = count_data_pages(); 1506 nr_pages = count_data_pages();
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6f10dfc2d3e9..44cce10b582d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -189,6 +189,7 @@ static int suspend_enter(suspend_state_t state)
189int suspend_devices_and_enter(suspend_state_t state) 189int suspend_devices_and_enter(suspend_state_t state)
190{ 190{
191 int error; 191 int error;
192 gfp_t saved_mask;
192 193
193 if (!suspend_ops) 194 if (!suspend_ops)
194 return -ENOSYS; 195 return -ENOSYS;
@@ -199,6 +200,7 @@ int suspend_devices_and_enter(suspend_state_t state)
199 goto Close; 200 goto Close;
200 } 201 }
201 suspend_console(); 202 suspend_console();
203 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
202 suspend_test_start(); 204 suspend_test_start();
203 error = dpm_suspend_start(PMSG_SUSPEND); 205 error = dpm_suspend_start(PMSG_SUSPEND);
204 if (error) { 206 if (error) {
@@ -215,6 +217,7 @@ int suspend_devices_and_enter(suspend_state_t state)
215 suspend_test_start(); 217 suspend_test_start();
216 dpm_resume_end(PMSG_RESUME); 218 dpm_resume_end(PMSG_RESUME);
217 suspend_test_finish("resume devices"); 219 suspend_test_finish("resume devices");
220 set_gfp_allowed_mask(saved_mask);
218 resume_console(); 221 resume_console();
219 Close: 222 Close:
220 if (suspend_ops->end) 223 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 09b2b0ae9e9d..1d575733d4e1 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -657,10 +657,6 @@ int swsusp_read(unsigned int *flags_p)
657 struct swsusp_info *header; 657 struct swsusp_info *header;
658 658
659 *flags_p = swsusp_header->flags; 659 *flags_p = swsusp_header->flags;
660 if (IS_ERR(resume_bdev)) {
661 pr_debug("PM: Image device not initialised\n");
662 return PTR_ERR(resume_bdev);
663 }
664 660
665 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 661 memset(&snapshot, 0, sizeof(struct snapshot_handle));
666 error = snapshot_write_next(&snapshot, PAGE_SIZE); 662 error = snapshot_write_next(&snapshot, PAGE_SIZE);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
deleted file mode 100644
index 5b3601bd1893..000000000000
--- a/kernel/power/swsusp.c
+++ /dev/null
@@ -1,58 +0,0 @@
1/*
2 * linux/kernel/power/swsusp.c
3 *
4 * This file provides code to write suspend image to swap and read it back.
5 *
6 * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8 *
9 * This file is released under the GPLv2.
10 *
11 * I'd like to thank the following people for their work:
12 *
13 * Pavel Machek <pavel@ucw.cz>:
14 * Modifications, defectiveness pointing, being with me at the very beginning,
15 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
16 *
17 * Steve Doddi <dirk@loth.demon.co.uk>:
18 * Support the possibility of hardware state restoring.
19 *
20 * Raph <grey.havens@earthling.net>:
21 * Support for preserving states of network devices and virtual console
22 * (including X and svgatextmode)
23 *
24 * Kurt Garloff <garloff@suse.de>:
25 * Straightened the critical function in order to prevent compilers from
26 * playing tricks with local variables.
27 *
28 * Andreas Mohr <a.mohr@mailto.de>
29 *
30 * Alex Badea <vampire@go.ro>:
31 * Fixed runaway init
32 *
33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Reworked the freeing of memory and the handling of swap
35 *
36 * More state savers are welcome. Especially for the scsi layer...
37 *
38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
39 */
40
41#include <linux/mm.h>
42#include <linux/suspend.h>
43#include <linux/spinlock.h>
44#include <linux/kernel.h>
45#include <linux/major.h>
46#include <linux/swap.h>
47#include <linux/pm.h>
48#include <linux/swapops.h>
49#include <linux/bootmem.h>
50#include <linux/syscalls.h>
51#include <linux/highmem.h>
52#include <linux/time.h>
53#include <linux/rbtree.h>
54#include <linux/io.h>
55
56#include "power.h"
57
58int in_suspend __nosavedata = 0;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bf0014d6a5f0..4d2289626a84 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -195,6 +195,15 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
195 return res; 195 return res;
196} 196}
197 197
198static void snapshot_deprecated_ioctl(unsigned int cmd)
199{
200 if (printk_ratelimit())
201 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
202 "be removed soon, update your suspend-to-disk "
203 "utilities\n",
204 __builtin_return_address(0), cmd);
205}
206
198static long snapshot_ioctl(struct file *filp, unsigned int cmd, 207static long snapshot_ioctl(struct file *filp, unsigned int cmd,
199 unsigned long arg) 208 unsigned long arg)
200{ 209{
@@ -246,8 +255,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
246 data->frozen = 0; 255 data->frozen = 0;
247 break; 256 break;
248 257
249 case SNAPSHOT_CREATE_IMAGE:
250 case SNAPSHOT_ATOMIC_SNAPSHOT: 258 case SNAPSHOT_ATOMIC_SNAPSHOT:
259 snapshot_deprecated_ioctl(cmd);
260 case SNAPSHOT_CREATE_IMAGE:
251 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 261 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
252 error = -EPERM; 262 error = -EPERM;
253 break; 263 break;
@@ -275,8 +285,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 data->ready = 0; 285 data->ready = 0;
276 break; 286 break;
277 287
278 case SNAPSHOT_PREF_IMAGE_SIZE:
279 case SNAPSHOT_SET_IMAGE_SIZE: 288 case SNAPSHOT_SET_IMAGE_SIZE:
289 snapshot_deprecated_ioctl(cmd);
290 case SNAPSHOT_PREF_IMAGE_SIZE:
280 image_size = arg; 291 image_size = arg;
281 break; 292 break;
282 293
@@ -290,15 +301,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
290 error = put_user(size, (loff_t __user *)arg); 301 error = put_user(size, (loff_t __user *)arg);
291 break; 302 break;
292 303
293 case SNAPSHOT_AVAIL_SWAP_SIZE:
294 case SNAPSHOT_AVAIL_SWAP: 304 case SNAPSHOT_AVAIL_SWAP:
305 snapshot_deprecated_ioctl(cmd);
306 case SNAPSHOT_AVAIL_SWAP_SIZE:
295 size = count_swap_pages(data->swap, 1); 307 size = count_swap_pages(data->swap, 1);
296 size <<= PAGE_SHIFT; 308 size <<= PAGE_SHIFT;
297 error = put_user(size, (loff_t __user *)arg); 309 error = put_user(size, (loff_t __user *)arg);
298 break; 310 break;
299 311
300 case SNAPSHOT_ALLOC_SWAP_PAGE:
301 case SNAPSHOT_GET_SWAP_PAGE: 312 case SNAPSHOT_GET_SWAP_PAGE:
313 snapshot_deprecated_ioctl(cmd);
314 case SNAPSHOT_ALLOC_SWAP_PAGE:
302 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 315 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
303 error = -ENODEV; 316 error = -ENODEV;
304 break; 317 break;
@@ -321,6 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
321 break; 334 break;
322 335
323 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ 336 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
337 snapshot_deprecated_ioctl(cmd);
324 if (!swsusp_swap_in_use()) { 338 if (!swsusp_swap_in_use()) {
325 /* 339 /*
326 * User space encodes device types as two-byte values, 340 * User space encodes device types as two-byte values,
@@ -362,6 +376,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
362 break; 376 break;
363 377
364 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ 378 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
379 snapshot_deprecated_ioctl(cmd);
365 error = -EINVAL; 380 error = -EINVAL;
366 381
367 switch (arg) { 382 switch (arg) {
diff --git a/kernel/printk.c b/kernel/printk.c
index 1ded8e7dd19b..75077ad0b537 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -35,6 +35,7 @@
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h> 36#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h> 37#include <linux/kmsg_dump.h>
38#include <linux/syslog.h>
38 39
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40 41
@@ -69,8 +70,6 @@ int console_printk[4] = {
69 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 70 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
70}; 71};
71 72
72static int saved_console_loglevel = -1;
73
74/* 73/*
75 * Low level drivers may need that to know if they can schedule in 74 * Low level drivers may need that to know if they can schedule in
76 * their unblank() callback or not. So let's export it. 75 * their unblank() callback or not. So let's export it.
@@ -145,6 +144,7 @@ static char __log_buf[__LOG_BUF_LEN];
145static char *log_buf = __log_buf; 144static char *log_buf = __log_buf;
146static int log_buf_len = __LOG_BUF_LEN; 145static int log_buf_len = __LOG_BUF_LEN;
147static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 146static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
147static int saved_console_loglevel = -1;
148 148
149#ifdef CONFIG_KEXEC 149#ifdef CONFIG_KEXEC
150/* 150/*
@@ -258,38 +258,23 @@ static inline void boot_delay_msec(void)
258} 258}
259#endif 259#endif
260 260
261/* 261int do_syslog(int type, char __user *buf, int len, bool from_file)
262 * Commands to do_syslog:
263 *
264 * 0 -- Close the log. Currently a NOP.
265 * 1 -- Open the log. Currently a NOP.
266 * 2 -- Read from the log.
267 * 3 -- Read all messages remaining in the ring buffer.
268 * 4 -- Read and clear all messages remaining in the ring buffer
269 * 5 -- Clear ring buffer.
270 * 6 -- Disable printk's to console
271 * 7 -- Enable printk's to console
272 * 8 -- Set level of messages printed to console
273 * 9 -- Return number of unread characters in the log buffer
274 * 10 -- Return size of the log buffer
275 */
276int do_syslog(int type, char __user *buf, int len)
277{ 262{
278 unsigned i, j, limit, count; 263 unsigned i, j, limit, count;
279 int do_clear = 0; 264 int do_clear = 0;
280 char c; 265 char c;
281 int error = 0; 266 int error = 0;
282 267
283 error = security_syslog(type); 268 error = security_syslog(type, from_file);
284 if (error) 269 if (error)
285 return error; 270 return error;
286 271
287 switch (type) { 272 switch (type) {
288 case 0: /* Close log */ 273 case SYSLOG_ACTION_CLOSE: /* Close log */
289 break; 274 break;
290 case 1: /* Open log */ 275 case SYSLOG_ACTION_OPEN: /* Open log */
291 break; 276 break;
292 case 2: /* Read from log */ 277 case SYSLOG_ACTION_READ: /* Read from log */
293 error = -EINVAL; 278 error = -EINVAL;
294 if (!buf || len < 0) 279 if (!buf || len < 0)
295 goto out; 280 goto out;
@@ -320,10 +305,12 @@ int do_syslog(int type, char __user *buf, int len)
320 if (!error) 305 if (!error)
321 error = i; 306 error = i;
322 break; 307 break;
323 case 4: /* Read/clear last kernel messages */ 308 /* Read/clear last kernel messages */
309 case SYSLOG_ACTION_READ_CLEAR:
324 do_clear = 1; 310 do_clear = 1;
325 /* FALL THRU */ 311 /* FALL THRU */
326 case 3: /* Read last kernel messages */ 312 /* Read last kernel messages */
313 case SYSLOG_ACTION_READ_ALL:
327 error = -EINVAL; 314 error = -EINVAL;
328 if (!buf || len < 0) 315 if (!buf || len < 0)
329 goto out; 316 goto out;
@@ -376,21 +363,25 @@ int do_syslog(int type, char __user *buf, int len)
376 } 363 }
377 } 364 }
378 break; 365 break;
379 case 5: /* Clear ring buffer */ 366 /* Clear ring buffer */
367 case SYSLOG_ACTION_CLEAR:
380 logged_chars = 0; 368 logged_chars = 0;
381 break; 369 break;
382 case 6: /* Disable logging to console */ 370 /* Disable logging to console */
371 case SYSLOG_ACTION_CONSOLE_OFF:
383 if (saved_console_loglevel == -1) 372 if (saved_console_loglevel == -1)
384 saved_console_loglevel = console_loglevel; 373 saved_console_loglevel = console_loglevel;
385 console_loglevel = minimum_console_loglevel; 374 console_loglevel = minimum_console_loglevel;
386 break; 375 break;
387 case 7: /* Enable logging to console */ 376 /* Enable logging to console */
377 case SYSLOG_ACTION_CONSOLE_ON:
388 if (saved_console_loglevel != -1) { 378 if (saved_console_loglevel != -1) {
389 console_loglevel = saved_console_loglevel; 379 console_loglevel = saved_console_loglevel;
390 saved_console_loglevel = -1; 380 saved_console_loglevel = -1;
391 } 381 }
392 break; 382 break;
393 case 8: /* Set level of messages printed to console */ 383 /* Set level of messages printed to console */
384 case SYSLOG_ACTION_CONSOLE_LEVEL:
394 error = -EINVAL; 385 error = -EINVAL;
395 if (len < 1 || len > 8) 386 if (len < 1 || len > 8)
396 goto out; 387 goto out;
@@ -401,10 +392,12 @@ int do_syslog(int type, char __user *buf, int len)
401 saved_console_loglevel = -1; 392 saved_console_loglevel = -1;
402 error = 0; 393 error = 0;
403 break; 394 break;
404 case 9: /* Number of chars in the log buffer */ 395 /* Number of chars in the log buffer */
396 case SYSLOG_ACTION_SIZE_UNREAD:
405 error = log_end - log_start; 397 error = log_end - log_start;
406 break; 398 break;
407 case 10: /* Size of the log buffer */ 399 /* Size of the log buffer */
400 case SYSLOG_ACTION_SIZE_BUFFER:
408 error = log_buf_len; 401 error = log_buf_len;
409 break; 402 break;
410 default: 403 default:
@@ -417,7 +410,7 @@ out:
417 410
418SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 411SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
419{ 412{
420 return do_syslog(type, buf, len); 413 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
421} 414}
422 415
423/* 416/*
@@ -1412,7 +1405,7 @@ static LIST_HEAD(dump_list);
1412 1405
1413/** 1406/**
1414 * kmsg_dump_register - register a kernel log dumper. 1407 * kmsg_dump_register - register a kernel log dumper.
1415 * @dump: pointer to the kmsg_dumper structure 1408 * @dumper: pointer to the kmsg_dumper structure
1416 * 1409 *
1417 * Adds a kernel log dumper to the system. The dump callback in the 1410 * Adds a kernel log dumper to the system. The dump callback in the
1418 * structure will be called when the kernel oopses or panics and must be 1411 * structure will be called when the kernel oopses or panics and must be
@@ -1442,7 +1435,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_register);
1442 1435
1443/** 1436/**
1444 * kmsg_dump_unregister - unregister a kmsg dumper. 1437 * kmsg_dump_unregister - unregister a kmsg dumper.
1445 * @dump: pointer to the kmsg_dumper structure 1438 * @dumper: pointer to the kmsg_dumper structure
1446 * 1439 *
1447 * Removes a dump device from the system. Returns zero on success and 1440 * Removes a dump device from the system. Returns zero on success and
1448 * %-EINVAL otherwise. 1441 * %-EINVAL otherwise.
@@ -1467,6 +1460,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1467static const char const *kmsg_reasons[] = { 1460static const char const *kmsg_reasons[] = {
1468 [KMSG_DUMP_OOPS] = "oops", 1461 [KMSG_DUMP_OOPS] = "oops",
1469 [KMSG_DUMP_PANIC] = "panic", 1462 [KMSG_DUMP_PANIC] = "panic",
1463 [KMSG_DUMP_KEXEC] = "kexec",
1470}; 1464};
1471 1465
1472static const char *kmsg_to_str(enum kmsg_dump_reason reason) 1466static const char *kmsg_to_str(enum kmsg_dump_reason reason)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 23bd09cd042e..42ad8ae729a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/regset.h>
25 26
26 27
27/* 28/*
@@ -511,6 +512,47 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
511 return 0; 512 return 0;
512} 513}
513 514
515#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
516
517static const struct user_regset *
518find_regset(const struct user_regset_view *view, unsigned int type)
519{
520 const struct user_regset *regset;
521 int n;
522
523 for (n = 0; n < view->n; ++n) {
524 regset = view->regsets + n;
525 if (regset->core_note_type == type)
526 return regset;
527 }
528
529 return NULL;
530}
531
532static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
533 struct iovec *kiov)
534{
535 const struct user_regset_view *view = task_user_regset_view(task);
536 const struct user_regset *regset = find_regset(view, type);
537 int regset_no;
538
539 if (!regset || (kiov->iov_len % regset->size) != 0)
540 return -EINVAL;
541
542 regset_no = regset - view->regsets;
543 kiov->iov_len = min(kiov->iov_len,
544 (__kernel_size_t) (regset->n * regset->size));
545
546 if (req == PTRACE_GETREGSET)
547 return copy_regset_to_user(task, view, regset_no, 0,
548 kiov->iov_len, kiov->iov_base);
549 else
550 return copy_regset_from_user(task, view, regset_no, 0,
551 kiov->iov_len, kiov->iov_base);
552}
553
554#endif
555
514int ptrace_request(struct task_struct *child, long request, 556int ptrace_request(struct task_struct *child, long request,
515 long addr, long data) 557 long addr, long data)
516{ 558{
@@ -573,6 +615,26 @@ int ptrace_request(struct task_struct *child, long request,
573 return 0; 615 return 0;
574 return ptrace_resume(child, request, SIGKILL); 616 return ptrace_resume(child, request, SIGKILL);
575 617
618#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
619 case PTRACE_GETREGSET:
620 case PTRACE_SETREGSET:
621 {
622 struct iovec kiov;
623 struct iovec __user *uiov = (struct iovec __user *) data;
624
625 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
626 return -EFAULT;
627
628 if (__get_user(kiov.iov_base, &uiov->iov_base) ||
629 __get_user(kiov.iov_len, &uiov->iov_len))
630 return -EFAULT;
631
632 ret = ptrace_regset(child, request, addr, &kiov);
633 if (!ret)
634 ret = __put_user(kiov.iov_len, &uiov->iov_len);
635 break;
636 }
637#endif
576 default: 638 default:
577 break; 639 break;
578 } 640 }
@@ -711,6 +773,32 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
711 else 773 else
712 ret = ptrace_setsiginfo(child, &siginfo); 774 ret = ptrace_setsiginfo(child, &siginfo);
713 break; 775 break;
776#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
777 case PTRACE_GETREGSET:
778 case PTRACE_SETREGSET:
779 {
780 struct iovec kiov;
781 struct compat_iovec __user *uiov =
782 (struct compat_iovec __user *) datap;
783 compat_uptr_t ptr;
784 compat_size_t len;
785
786 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
787 return -EFAULT;
788
789 if (__get_user(ptr, &uiov->iov_base) ||
790 __get_user(len, &uiov->iov_len))
791 return -EFAULT;
792
793 kiov.iov_base = compat_ptr(ptr);
794 kiov.iov_len = len;
795
796 ret = ptrace_regset(child, request, addr, &kiov);
797 if (!ret)
798 ret = __put_user(kiov.iov_len, &uiov->iov_len);
799 break;
800 }
801#endif
714 802
715 default: 803 default:
716 ret = ptrace_request(child, request, addr, data); 804 ret = ptrace_request(child, request, addr, data);
diff --git a/kernel/range.c b/kernel/range.c
new file mode 100644
index 000000000000..74e2e6114927
--- /dev/null
+++ b/kernel/range.c
@@ -0,0 +1,163 @@
1/*
2 * Range add and subtract
3 */
4#include <linux/module.h>
5#include <linux/init.h>
6#include <linux/sort.h>
7
8#include <linux/range.h>
9
10#ifndef ARRAY_SIZE
11#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
12#endif
13
14int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
15{
16 if (start >= end)
17 return nr_range;
18
19 /* Out of slots: */
20 if (nr_range >= az)
21 return nr_range;
22
23 range[nr_range].start = start;
24 range[nr_range].end = end;
25
26 nr_range++;
27
28 return nr_range;
29}
30
31int add_range_with_merge(struct range *range, int az, int nr_range,
32 u64 start, u64 end)
33{
34 int i;
35
36 if (start >= end)
37 return nr_range;
38
39 /* Try to merge it with old one: */
40 for (i = 0; i < nr_range; i++) {
41 u64 final_start, final_end;
42 u64 common_start, common_end;
43
44 if (!range[i].end)
45 continue;
46
47 common_start = max(range[i].start, start);
48 common_end = min(range[i].end, end);
49 if (common_start > common_end)
50 continue;
51
52 final_start = min(range[i].start, start);
53 final_end = max(range[i].end, end);
54
55 range[i].start = final_start;
56 range[i].end = final_end;
57 return nr_range;
58 }
59
60 /* Need to add it: */
61 return add_range(range, az, nr_range, start, end);
62}
63
64void subtract_range(struct range *range, int az, u64 start, u64 end)
65{
66 int i, j;
67
68 if (start >= end)
69 return;
70
71 for (j = 0; j < az; j++) {
72 if (!range[j].end)
73 continue;
74
75 if (start <= range[j].start && end >= range[j].end) {
76 range[j].start = 0;
77 range[j].end = 0;
78 continue;
79 }
80
81 if (start <= range[j].start && end < range[j].end &&
82 range[j].start < end) {
83 range[j].start = end;
84 continue;
85 }
86
87
88 if (start > range[j].start && end >= range[j].end &&
89 range[j].end > start) {
90 range[j].end = start;
91 continue;
92 }
93
94 if (start > range[j].start && end < range[j].end) {
95 /* Find the new spare: */
96 for (i = 0; i < az; i++) {
97 if (range[i].end == 0)
98 break;
99 }
100 if (i < az) {
101 range[i].end = range[j].end;
102 range[i].start = end;
103 } else {
104 printk(KERN_ERR "run of slot in ranges\n");
105 }
106 range[j].end = start;
107 continue;
108 }
109 }
110}
111
112static int cmp_range(const void *x1, const void *x2)
113{
114 const struct range *r1 = x1;
115 const struct range *r2 = x2;
116 s64 start1, start2;
117
118 start1 = r1->start;
119 start2 = r2->start;
120
121 return start1 - start2;
122}
123
124int clean_sort_range(struct range *range, int az)
125{
126 int i, j, k = az - 1, nr_range = 0;
127
128 for (i = 0; i < k; i++) {
129 if (range[i].end)
130 continue;
131 for (j = k; j > i; j--) {
132 if (range[j].end) {
133 k = j;
134 break;
135 }
136 }
137 if (j == i)
138 break;
139 range[i].start = range[k].start;
140 range[i].end = range[k].end;
141 range[k].start = 0;
142 range[k].end = 0;
143 k--;
144 }
145 /* count it */
146 for (i = 0; i < az; i++) {
147 if (!range[i].end) {
148 nr_range = i;
149 break;
150 }
151 }
152
153 /* sort them */
154 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
155
156 return nr_range;
157}
158
159void sort_range(struct range *range, int nr_range)
160{
161 /* sort them */
162 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
163}
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 9b7fd4723878..f1125c1a6321 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,14 +44,43 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
47 48
48#ifdef CONFIG_DEBUG_LOCK_ALLOC 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
49static struct lock_class_key rcu_lock_key; 50static struct lock_class_key rcu_lock_key;
50struct lockdep_map rcu_lock_map = 51struct lockdep_map rcu_lock_map =
51 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); 52 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
52EXPORT_SYMBOL_GPL(rcu_lock_map); 53EXPORT_SYMBOL_GPL(rcu_lock_map);
54
55static struct lock_class_key rcu_bh_lock_key;
56struct lockdep_map rcu_bh_lock_map =
57 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key);
58EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
59
60static struct lock_class_key rcu_sched_lock_key;
61struct lockdep_map rcu_sched_lock_map =
62 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
63EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
53#endif 64#endif
54 65
66int rcu_scheduler_active __read_mostly;
67EXPORT_SYMBOL_GPL(rcu_scheduler_active);
68
69/*
70 * This function is invoked towards the end of the scheduler's initialization
71 * process. Before this is called, the idle task might contain
72 * RCU read-side critical sections (during which time, this idle
73 * task is booting the system). After this function is called, the
74 * idle tasks are prohibited from containing RCU read-side critical
75 * sections.
76 */
77void rcu_scheduler_starting(void)
78{
79 WARN_ON(num_online_cpus() != 1);
80 WARN_ON(nr_context_switches() > 0);
81 rcu_scheduler_active = 1;
82}
83
55/* 84/*
56 * Awaken the corresponding synchronize_rcu() instance now that a 85 * Awaken the corresponding synchronize_rcu() instance now that a
57 * grace period has elapsed. 86 * grace period has elapsed.
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9bb52177af02..58df55bf83ed 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -61,6 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */
64static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 67static char *torture_type = "rcu"; /* What RCU implementation to torture. */
65 68
66module_param(nreaders, int, 0444); 69module_param(nreaders, int, 0444);
@@ -79,6 +82,12 @@ module_param(stutter, int, 0444);
79MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); 82MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
80module_param(irqreader, int, 0444); 83module_param(irqreader, int, 0444);
81MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); 84MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
85module_param(fqs_duration, int, 0444);
86MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)");
87module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
82module_param(torture_type, charp, 0444); 91module_param(torture_type, charp, 0444);
83MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
84 93
@@ -99,6 +108,7 @@ static struct task_struct **reader_tasks;
99static struct task_struct *stats_task; 108static struct task_struct *stats_task;
100static struct task_struct *shuffler_task; 109static struct task_struct *shuffler_task;
101static struct task_struct *stutter_task; 110static struct task_struct *stutter_task;
111static struct task_struct *fqs_task;
102 112
103#define RCU_TORTURE_PIPE_LEN 10 113#define RCU_TORTURE_PIPE_LEN 10
104 114
@@ -263,6 +273,7 @@ struct rcu_torture_ops {
263 void (*deferred_free)(struct rcu_torture *p); 273 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 274 void (*sync)(void);
265 void (*cb_barrier)(void); 275 void (*cb_barrier)(void);
276 void (*fqs)(void);
266 int (*stats)(char *page); 277 int (*stats)(char *page);
267 int irq_capable; 278 int irq_capable;
268 char *name; 279 char *name;
@@ -347,6 +358,7 @@ static struct rcu_torture_ops rcu_ops = {
347 .deferred_free = rcu_torture_deferred_free, 358 .deferred_free = rcu_torture_deferred_free,
348 .sync = synchronize_rcu, 359 .sync = synchronize_rcu,
349 .cb_barrier = rcu_barrier, 360 .cb_barrier = rcu_barrier,
361 .fqs = rcu_force_quiescent_state,
350 .stats = NULL, 362 .stats = NULL,
351 .irq_capable = 1, 363 .irq_capable = 1,
352 .name = "rcu" 364 .name = "rcu"
@@ -388,6 +400,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
388 .deferred_free = rcu_sync_torture_deferred_free, 400 .deferred_free = rcu_sync_torture_deferred_free,
389 .sync = synchronize_rcu, 401 .sync = synchronize_rcu,
390 .cb_barrier = NULL, 402 .cb_barrier = NULL,
403 .fqs = rcu_force_quiescent_state,
391 .stats = NULL, 404 .stats = NULL,
392 .irq_capable = 1, 405 .irq_capable = 1,
393 .name = "rcu_sync" 406 .name = "rcu_sync"
@@ -403,6 +416,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
403 .deferred_free = rcu_sync_torture_deferred_free, 416 .deferred_free = rcu_sync_torture_deferred_free,
404 .sync = synchronize_rcu_expedited, 417 .sync = synchronize_rcu_expedited,
405 .cb_barrier = NULL, 418 .cb_barrier = NULL,
419 .fqs = rcu_force_quiescent_state,
406 .stats = NULL, 420 .stats = NULL,
407 .irq_capable = 1, 421 .irq_capable = 1,
408 .name = "rcu_expedited" 422 .name = "rcu_expedited"
@@ -465,6 +479,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
465 .deferred_free = rcu_bh_torture_deferred_free, 479 .deferred_free = rcu_bh_torture_deferred_free,
466 .sync = rcu_bh_torture_synchronize, 480 .sync = rcu_bh_torture_synchronize,
467 .cb_barrier = rcu_barrier_bh, 481 .cb_barrier = rcu_barrier_bh,
482 .fqs = rcu_bh_force_quiescent_state,
468 .stats = NULL, 483 .stats = NULL,
469 .irq_capable = 1, 484 .irq_capable = 1,
470 .name = "rcu_bh" 485 .name = "rcu_bh"
@@ -480,6 +495,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
480 .deferred_free = rcu_sync_torture_deferred_free, 495 .deferred_free = rcu_sync_torture_deferred_free,
481 .sync = rcu_bh_torture_synchronize, 496 .sync = rcu_bh_torture_synchronize,
482 .cb_barrier = NULL, 497 .cb_barrier = NULL,
498 .fqs = rcu_bh_force_quiescent_state,
483 .stats = NULL, 499 .stats = NULL,
484 .irq_capable = 1, 500 .irq_capable = 1,
485 .name = "rcu_bh_sync" 501 .name = "rcu_bh_sync"
@@ -621,6 +637,7 @@ static struct rcu_torture_ops sched_ops = {
621 .deferred_free = rcu_sched_torture_deferred_free, 637 .deferred_free = rcu_sched_torture_deferred_free,
622 .sync = sched_torture_synchronize, 638 .sync = sched_torture_synchronize,
623 .cb_barrier = rcu_barrier_sched, 639 .cb_barrier = rcu_barrier_sched,
640 .fqs = rcu_sched_force_quiescent_state,
624 .stats = NULL, 641 .stats = NULL,
625 .irq_capable = 1, 642 .irq_capable = 1,
626 .name = "sched" 643 .name = "sched"
@@ -636,6 +653,7 @@ static struct rcu_torture_ops sched_sync_ops = {
636 .deferred_free = rcu_sync_torture_deferred_free, 653 .deferred_free = rcu_sync_torture_deferred_free,
637 .sync = sched_torture_synchronize, 654 .sync = sched_torture_synchronize,
638 .cb_barrier = NULL, 655 .cb_barrier = NULL,
656 .fqs = rcu_sched_force_quiescent_state,
639 .stats = NULL, 657 .stats = NULL,
640 .name = "sched_sync" 658 .name = "sched_sync"
641}; 659};
@@ -650,12 +668,45 @@ static struct rcu_torture_ops sched_expedited_ops = {
650 .deferred_free = rcu_sync_torture_deferred_free, 668 .deferred_free = rcu_sync_torture_deferred_free,
651 .sync = synchronize_sched_expedited, 669 .sync = synchronize_sched_expedited,
652 .cb_barrier = NULL, 670 .cb_barrier = NULL,
671 .fqs = rcu_sched_force_quiescent_state,
653 .stats = rcu_expedited_torture_stats, 672 .stats = rcu_expedited_torture_stats,
654 .irq_capable = 1, 673 .irq_capable = 1,
655 .name = "sched_expedited" 674 .name = "sched_expedited"
656}; 675};
657 676
658/* 677/*
678 * RCU torture force-quiescent-state kthread. Repeatedly induces
679 * bursts of calls to force_quiescent_state(), increasing the probability
680 * of occurrence of some important types of race conditions.
681 */
682static int
683rcu_torture_fqs(void *arg)
684{
685 unsigned long fqs_resume_time;
686 int fqs_burst_remaining;
687
688 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
689 do {
690 fqs_resume_time = jiffies + fqs_stutter * HZ;
691 while (jiffies - fqs_resume_time > LONG_MAX) {
692 schedule_timeout_interruptible(1);
693 }
694 fqs_burst_remaining = fqs_duration;
695 while (fqs_burst_remaining > 0) {
696 cur_ops->fqs();
697 udelay(fqs_holdoff);
698 fqs_burst_remaining -= fqs_holdoff;
699 }
700 rcu_stutter_wait("rcu_torture_fqs");
701 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
702 VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping");
703 rcutorture_shutdown_absorb("rcu_torture_fqs");
704 while (!kthread_should_stop())
705 schedule_timeout_uninterruptible(1);
706 return 0;
707}
708
709/*
659 * RCU torture writer kthread. Repeatedly substitutes a new structure 710 * RCU torture writer kthread. Repeatedly substitutes a new structure
660 * for that pointed to by rcu_torture_current, freeing the old structure 711 * for that pointed to by rcu_torture_current, freeing the old structure
661 * after a series of grace periods (the "pipeline"). 712 * after a series of grace periods (the "pipeline").
@@ -745,7 +796,11 @@ static void rcu_torture_timer(unsigned long unused)
745 796
746 idx = cur_ops->readlock(); 797 idx = cur_ops->readlock();
747 completed = cur_ops->completed(); 798 completed = cur_ops->completed();
748 p = rcu_dereference(rcu_torture_current); 799 p = rcu_dereference_check(rcu_torture_current,
800 rcu_read_lock_held() ||
801 rcu_read_lock_bh_held() ||
802 rcu_read_lock_sched_held() ||
803 srcu_read_lock_held(&srcu_ctl));
749 if (p == NULL) { 804 if (p == NULL) {
750 /* Leave because rcu_torture_writer is not yet underway */ 805 /* Leave because rcu_torture_writer is not yet underway */
751 cur_ops->readunlock(idx); 806 cur_ops->readunlock(idx);
@@ -763,13 +818,13 @@ static void rcu_torture_timer(unsigned long unused)
763 /* Should not happen, but... */ 818 /* Should not happen, but... */
764 pipe_count = RCU_TORTURE_PIPE_LEN; 819 pipe_count = RCU_TORTURE_PIPE_LEN;
765 } 820 }
766 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 821 __this_cpu_inc(rcu_torture_count[pipe_count]);
767 completed = cur_ops->completed() - completed; 822 completed = cur_ops->completed() - completed;
768 if (completed > RCU_TORTURE_PIPE_LEN) { 823 if (completed > RCU_TORTURE_PIPE_LEN) {
769 /* Should not happen, but... */ 824 /* Should not happen, but... */
770 completed = RCU_TORTURE_PIPE_LEN; 825 completed = RCU_TORTURE_PIPE_LEN;
771 } 826 }
772 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 827 __this_cpu_inc(rcu_torture_batch[completed]);
773 preempt_enable(); 828 preempt_enable();
774 cur_ops->readunlock(idx); 829 cur_ops->readunlock(idx);
775} 830}
@@ -798,11 +853,15 @@ rcu_torture_reader(void *arg)
798 do { 853 do {
799 if (irqreader && cur_ops->irq_capable) { 854 if (irqreader && cur_ops->irq_capable) {
800 if (!timer_pending(&t)) 855 if (!timer_pending(&t))
801 mod_timer(&t, 1); 856 mod_timer(&t, jiffies + 1);
802 } 857 }
803 idx = cur_ops->readlock(); 858 idx = cur_ops->readlock();
804 completed = cur_ops->completed(); 859 completed = cur_ops->completed();
805 p = rcu_dereference(rcu_torture_current); 860 p = rcu_dereference_check(rcu_torture_current,
861 rcu_read_lock_held() ||
862 rcu_read_lock_bh_held() ||
863 rcu_read_lock_sched_held() ||
864 srcu_read_lock_held(&srcu_ctl));
806 if (p == NULL) { 865 if (p == NULL) {
807 /* Wait for rcu_torture_writer to get underway */ 866 /* Wait for rcu_torture_writer to get underway */
808 cur_ops->readunlock(idx); 867 cur_ops->readunlock(idx);
@@ -818,13 +877,13 @@ rcu_torture_reader(void *arg)
818 /* Should not happen, but... */ 877 /* Should not happen, but... */
819 pipe_count = RCU_TORTURE_PIPE_LEN; 878 pipe_count = RCU_TORTURE_PIPE_LEN;
820 } 879 }
821 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 880 __this_cpu_inc(rcu_torture_count[pipe_count]);
822 completed = cur_ops->completed() - completed; 881 completed = cur_ops->completed() - completed;
823 if (completed > RCU_TORTURE_PIPE_LEN) { 882 if (completed > RCU_TORTURE_PIPE_LEN) {
824 /* Should not happen, but... */ 883 /* Should not happen, but... */
825 completed = RCU_TORTURE_PIPE_LEN; 884 completed = RCU_TORTURE_PIPE_LEN;
826 } 885 }
827 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 886 __this_cpu_inc(rcu_torture_batch[completed]);
828 preempt_enable(); 887 preempt_enable();
829 cur_ops->readunlock(idx); 888 cur_ops->readunlock(idx);
830 schedule(); 889 schedule();
@@ -1030,10 +1089,11 @@ rcu_torture_print_module_parms(char *tag)
1030 printk(KERN_ALERT "%s" TORTURE_FLAG 1089 printk(KERN_ALERT "%s" TORTURE_FLAG
1031 "--- %s: nreaders=%d nfakewriters=%d " 1090 "--- %s: nreaders=%d nfakewriters=%d "
1032 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1091 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1033 "shuffle_interval=%d stutter=%d irqreader=%d\n", 1092 "shuffle_interval=%d stutter=%d irqreader=%d "
1093 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
1034 torture_type, tag, nrealreaders, nfakewriters, 1094 torture_type, tag, nrealreaders, nfakewriters,
1035 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1095 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1036 stutter, irqreader); 1096 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
1037} 1097}
1038 1098
1039static struct notifier_block rcutorture_nb = { 1099static struct notifier_block rcutorture_nb = {
@@ -1109,6 +1169,12 @@ rcu_torture_cleanup(void)
1109 } 1169 }
1110 stats_task = NULL; 1170 stats_task = NULL;
1111 1171
1172 if (fqs_task) {
1173 VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
1174 kthread_stop(fqs_task);
1175 }
1176 fqs_task = NULL;
1177
1112 /* Wait for all RCU callbacks to fire. */ 1178 /* Wait for all RCU callbacks to fire. */
1113 1179
1114 if (cur_ops->cb_barrier != NULL) 1180 if (cur_ops->cb_barrier != NULL)
@@ -1154,6 +1220,11 @@ rcu_torture_init(void)
1154 mutex_unlock(&fullstop_mutex); 1220 mutex_unlock(&fullstop_mutex);
1155 return -EINVAL; 1221 return -EINVAL;
1156 } 1222 }
1223 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1224 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
1225 "fqs_duration, fqs disabled.\n");
1226 fqs_duration = 0;
1227 }
1157 if (cur_ops->init) 1228 if (cur_ops->init)
1158 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1229 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
1159 1230
@@ -1282,6 +1353,19 @@ rcu_torture_init(void)
1282 goto unwind; 1353 goto unwind;
1283 } 1354 }
1284 } 1355 }
1356 if (fqs_duration < 0)
1357 fqs_duration = 0;
1358 if (fqs_duration) {
1359 /* Create the fqs thread */
1360 fqs_task = kthread_run(rcu_torture_fqs, NULL,
1361 "rcu_torture_fqs");
1362 if (IS_ERR(fqs_task)) {
1363 firsterr = PTR_ERR(fqs_task);
1364 VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
1365 fqs_task = NULL;
1366 goto unwind;
1367 }
1368 }
1285 register_reboot_notifier(&rcutorture_nb); 1369 register_reboot_notifier(&rcutorture_nb);
1286 mutex_unlock(&fullstop_mutex); 1370 mutex_unlock(&fullstop_mutex);
1287 return 0; 1371 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 53ae9598f798..3ec8160fc75f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,7 +46,6 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
50 49
51#include "rcutree.h" 50#include "rcutree.h"
52 51
@@ -66,11 +65,11 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
66 .signaled = RCU_GP_IDLE, \ 65 .signaled = RCU_GP_IDLE, \
67 .gpnum = -300, \ 66 .gpnum = -300, \
68 .completed = -300, \ 67 .completed = -300, \
69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ 68 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \
70 .orphan_cbs_list = NULL, \ 69 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &name.orphan_cbs_list, \ 70 .orphan_cbs_tail = &name.orphan_cbs_list, \
72 .orphan_qlen = 0, \ 71 .orphan_qlen = 0, \
73 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ 72 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \
74 .n_force_qs = 0, \ 73 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 74 .n_force_qs_ngp = 0, \
76} 75}
@@ -81,9 +80,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 80struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 81DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
83 82
84static int rcu_scheduler_active __read_mostly;
85
86
87/* 83/*
88 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 84 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
89 * permit this function to be invoked without holding the root rcu_node 85 * permit this function to be invoked without holding the root rcu_node
@@ -157,6 +153,24 @@ long rcu_batches_completed_bh(void)
157EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); 153EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
158 154
159/* 155/*
156 * Force a quiescent state for RCU BH.
157 */
158void rcu_bh_force_quiescent_state(void)
159{
160 force_quiescent_state(&rcu_bh_state, 0);
161}
162EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
163
164/*
165 * Force a quiescent state for RCU-sched.
166 */
167void rcu_sched_force_quiescent_state(void)
168{
169 force_quiescent_state(&rcu_sched_state, 0);
170}
171EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
172
173/*
160 * Does the CPU have callbacks ready to be invoked? 174 * Does the CPU have callbacks ready to be invoked?
161 */ 175 */
162static int 176static int
@@ -439,10 +453,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
439 453
440 /* Only let one CPU complain about others per time interval. */ 454 /* Only let one CPU complain about others per time interval. */
441 455
442 spin_lock_irqsave(&rnp->lock, flags); 456 raw_spin_lock_irqsave(&rnp->lock, flags);
443 delta = jiffies - rsp->jiffies_stall; 457 delta = jiffies - rsp->jiffies_stall;
444 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 458 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
445 spin_unlock_irqrestore(&rnp->lock, flags); 459 raw_spin_unlock_irqrestore(&rnp->lock, flags);
446 return; 460 return;
447 } 461 }
448 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 462 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
@@ -452,13 +466,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
452 * due to CPU offlining. 466 * due to CPU offlining.
453 */ 467 */
454 rcu_print_task_stall(rnp); 468 rcu_print_task_stall(rnp);
455 spin_unlock_irqrestore(&rnp->lock, flags); 469 raw_spin_unlock_irqrestore(&rnp->lock, flags);
456 470
457 /* OK, time to rat on our buddy... */ 471 /* OK, time to rat on our buddy... */
458 472
459 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 473 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
460 rcu_for_each_leaf_node(rsp, rnp) { 474 rcu_for_each_leaf_node(rsp, rnp) {
475 raw_spin_lock_irqsave(&rnp->lock, flags);
461 rcu_print_task_stall(rnp); 476 rcu_print_task_stall(rnp);
477 raw_spin_unlock_irqrestore(&rnp->lock, flags);
462 if (rnp->qsmask == 0) 478 if (rnp->qsmask == 0)
463 continue; 479 continue;
464 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 480 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
@@ -469,6 +485,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
469 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 485 smp_processor_id(), (long)(jiffies - rsp->gp_start));
470 trigger_all_cpu_backtrace(); 486 trigger_all_cpu_backtrace();
471 487
488 /* If so configured, complain about tasks blocking the grace period. */
489
490 rcu_print_detail_task_stall(rsp);
491
472 force_quiescent_state(rsp, 0); /* Kick them all. */ 492 force_quiescent_state(rsp, 0); /* Kick them all. */
473} 493}
474 494
@@ -481,11 +501,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
481 smp_processor_id(), jiffies - rsp->gp_start); 501 smp_processor_id(), jiffies - rsp->gp_start);
482 trigger_all_cpu_backtrace(); 502 trigger_all_cpu_backtrace();
483 503
484 spin_lock_irqsave(&rnp->lock, flags); 504 raw_spin_lock_irqsave(&rnp->lock, flags);
485 if ((long)(jiffies - rsp->jiffies_stall) >= 0) 505 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
486 rsp->jiffies_stall = 506 rsp->jiffies_stall =
487 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 507 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
488 spin_unlock_irqrestore(&rnp->lock, flags); 508 raw_spin_unlock_irqrestore(&rnp->lock, flags);
489 509
490 set_need_resched(); /* kick ourselves to get things going. */ 510 set_need_resched(); /* kick ourselves to get things going. */
491} 511}
@@ -545,12 +565,12 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
545 local_irq_save(flags); 565 local_irq_save(flags);
546 rnp = rdp->mynode; 566 rnp = rdp->mynode;
547 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ 567 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
548 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 568 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
549 local_irq_restore(flags); 569 local_irq_restore(flags);
550 return; 570 return;
551 } 571 }
552 __note_new_gpnum(rsp, rnp, rdp); 572 __note_new_gpnum(rsp, rnp, rdp);
553 spin_unlock_irqrestore(&rnp->lock, flags); 573 raw_spin_unlock_irqrestore(&rnp->lock, flags);
554} 574}
555 575
556/* 576/*
@@ -609,12 +629,12 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
609 local_irq_save(flags); 629 local_irq_save(flags);
610 rnp = rdp->mynode; 630 rnp = rdp->mynode;
611 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ 631 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
612 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 632 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
613 local_irq_restore(flags); 633 local_irq_restore(flags);
614 return; 634 return;
615 } 635 }
616 __rcu_process_gp_end(rsp, rnp, rdp); 636 __rcu_process_gp_end(rsp, rnp, rdp);
617 spin_unlock_irqrestore(&rnp->lock, flags); 637 raw_spin_unlock_irqrestore(&rnp->lock, flags);
618} 638}
619 639
620/* 640/*
@@ -659,12 +679,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
659 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 679 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
660 struct rcu_node *rnp = rcu_get_root(rsp); 680 struct rcu_node *rnp = rcu_get_root(rsp);
661 681
662 if (!cpu_needs_another_gp(rsp, rdp)) { 682 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
683 if (cpu_needs_another_gp(rsp, rdp))
684 rsp->fqs_need_gp = 1;
663 if (rnp->completed == rsp->completed) { 685 if (rnp->completed == rsp->completed) {
664 spin_unlock_irqrestore(&rnp->lock, flags); 686 raw_spin_unlock_irqrestore(&rnp->lock, flags);
665 return; 687 return;
666 } 688 }
667 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 689 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
668 690
669 /* 691 /*
670 * Propagate new ->completed value to rcu_node structures 692 * Propagate new ->completed value to rcu_node structures
@@ -672,9 +694,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
672 * of the next grace period to process their callbacks. 694 * of the next grace period to process their callbacks.
673 */ 695 */
674 rcu_for_each_node_breadth_first(rsp, rnp) { 696 rcu_for_each_node_breadth_first(rsp, rnp) {
675 spin_lock(&rnp->lock); /* irqs already disabled. */ 697 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
676 rnp->completed = rsp->completed; 698 rnp->completed = rsp->completed;
677 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 699 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
678 } 700 }
679 local_irq_restore(flags); 701 local_irq_restore(flags);
680 return; 702 return;
@@ -695,15 +717,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
695 rnp->completed = rsp->completed; 717 rnp->completed = rsp->completed;
696 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 718 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
697 rcu_start_gp_per_cpu(rsp, rnp, rdp); 719 rcu_start_gp_per_cpu(rsp, rnp, rdp);
698 spin_unlock_irqrestore(&rnp->lock, flags); 720 raw_spin_unlock_irqrestore(&rnp->lock, flags);
699 return; 721 return;
700 } 722 }
701 723
702 spin_unlock(&rnp->lock); /* leave irqs disabled. */ 724 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
703 725
704 726
705 /* Exclude any concurrent CPU-hotplug operations. */ 727 /* Exclude any concurrent CPU-hotplug operations. */
706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 728 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
707 729
708 /* 730 /*
709 * Set the quiescent-state-needed bits in all the rcu_node 731 * Set the quiescent-state-needed bits in all the rcu_node
@@ -723,21 +745,21 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
723 * irqs disabled. 745 * irqs disabled.
724 */ 746 */
725 rcu_for_each_node_breadth_first(rsp, rnp) { 747 rcu_for_each_node_breadth_first(rsp, rnp) {
726 spin_lock(&rnp->lock); /* irqs already disabled. */ 748 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
727 rcu_preempt_check_blocked_tasks(rnp); 749 rcu_preempt_check_blocked_tasks(rnp);
728 rnp->qsmask = rnp->qsmaskinit; 750 rnp->qsmask = rnp->qsmaskinit;
729 rnp->gpnum = rsp->gpnum; 751 rnp->gpnum = rsp->gpnum;
730 rnp->completed = rsp->completed; 752 rnp->completed = rsp->completed;
731 if (rnp == rdp->mynode) 753 if (rnp == rdp->mynode)
732 rcu_start_gp_per_cpu(rsp, rnp, rdp); 754 rcu_start_gp_per_cpu(rsp, rnp, rdp);
733 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 755 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
734 } 756 }
735 757
736 rnp = rcu_get_root(rsp); 758 rnp = rcu_get_root(rsp);
737 spin_lock(&rnp->lock); /* irqs already disabled. */ 759 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
738 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 760 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
739 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 761 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
740 spin_unlock_irqrestore(&rsp->onofflock, flags); 762 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
741} 763}
742 764
743/* 765/*
@@ -776,14 +798,14 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
776 if (!(rnp->qsmask & mask)) { 798 if (!(rnp->qsmask & mask)) {
777 799
778 /* Our bit has already been cleared, so done. */ 800 /* Our bit has already been cleared, so done. */
779 spin_unlock_irqrestore(&rnp->lock, flags); 801 raw_spin_unlock_irqrestore(&rnp->lock, flags);
780 return; 802 return;
781 } 803 }
782 rnp->qsmask &= ~mask; 804 rnp->qsmask &= ~mask;
783 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 805 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
784 806
785 /* Other bits still set at this level, so done. */ 807 /* Other bits still set at this level, so done. */
786 spin_unlock_irqrestore(&rnp->lock, flags); 808 raw_spin_unlock_irqrestore(&rnp->lock, flags);
787 return; 809 return;
788 } 810 }
789 mask = rnp->grpmask; 811 mask = rnp->grpmask;
@@ -793,10 +815,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
793 815
794 break; 816 break;
795 } 817 }
796 spin_unlock_irqrestore(&rnp->lock, flags); 818 raw_spin_unlock_irqrestore(&rnp->lock, flags);
797 rnp_c = rnp; 819 rnp_c = rnp;
798 rnp = rnp->parent; 820 rnp = rnp->parent;
799 spin_lock_irqsave(&rnp->lock, flags); 821 raw_spin_lock_irqsave(&rnp->lock, flags);
800 WARN_ON_ONCE(rnp_c->qsmask); 822 WARN_ON_ONCE(rnp_c->qsmask);
801 } 823 }
802 824
@@ -825,7 +847,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
825 struct rcu_node *rnp; 847 struct rcu_node *rnp;
826 848
827 rnp = rdp->mynode; 849 rnp = rdp->mynode;
828 spin_lock_irqsave(&rnp->lock, flags); 850 raw_spin_lock_irqsave(&rnp->lock, flags);
829 if (lastcomp != rnp->completed) { 851 if (lastcomp != rnp->completed) {
830 852
831 /* 853 /*
@@ -837,12 +859,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
837 * race occurred. 859 * race occurred.
838 */ 860 */
839 rdp->passed_quiesc = 0; /* try again later! */ 861 rdp->passed_quiesc = 0; /* try again later! */
840 spin_unlock_irqrestore(&rnp->lock, flags); 862 raw_spin_unlock_irqrestore(&rnp->lock, flags);
841 return; 863 return;
842 } 864 }
843 mask = rdp->grpmask; 865 mask = rdp->grpmask;
844 if ((rnp->qsmask & mask) == 0) { 866 if ((rnp->qsmask & mask) == 0) {
845 spin_unlock_irqrestore(&rnp->lock, flags); 867 raw_spin_unlock_irqrestore(&rnp->lock, flags);
846 } else { 868 } else {
847 rdp->qs_pending = 0; 869 rdp->qs_pending = 0;
848 870
@@ -906,7 +928,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
906 928
907 if (rdp->nxtlist == NULL) 929 if (rdp->nxtlist == NULL)
908 return; /* irqs disabled, so comparison is stable. */ 930 return; /* irqs disabled, so comparison is stable. */
909 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 931 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
910 *rsp->orphan_cbs_tail = rdp->nxtlist; 932 *rsp->orphan_cbs_tail = rdp->nxtlist;
911 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 933 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
912 rdp->nxtlist = NULL; 934 rdp->nxtlist = NULL;
@@ -914,7 +936,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
914 rdp->nxttail[i] = &rdp->nxtlist; 936 rdp->nxttail[i] = &rdp->nxtlist;
915 rsp->orphan_qlen += rdp->qlen; 937 rsp->orphan_qlen += rdp->qlen;
916 rdp->qlen = 0; 938 rdp->qlen = 0;
917 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 939 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
918} 940}
919 941
920/* 942/*
@@ -925,10 +947,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
925 unsigned long flags; 947 unsigned long flags;
926 struct rcu_data *rdp; 948 struct rcu_data *rdp;
927 949
928 spin_lock_irqsave(&rsp->onofflock, flags); 950 raw_spin_lock_irqsave(&rsp->onofflock, flags);
929 rdp = rsp->rda[smp_processor_id()]; 951 rdp = rsp->rda[smp_processor_id()];
930 if (rsp->orphan_cbs_list == NULL) { 952 if (rsp->orphan_cbs_list == NULL) {
931 spin_unlock_irqrestore(&rsp->onofflock, flags); 953 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
932 return; 954 return;
933 } 955 }
934 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; 956 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
@@ -937,7 +959,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
937 rsp->orphan_cbs_list = NULL; 959 rsp->orphan_cbs_list = NULL;
938 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; 960 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
939 rsp->orphan_qlen = 0; 961 rsp->orphan_qlen = 0;
940 spin_unlock_irqrestore(&rsp->onofflock, flags); 962 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
941} 963}
942 964
943/* 965/*
@@ -953,23 +975,23 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
953 struct rcu_node *rnp; 975 struct rcu_node *rnp;
954 976
955 /* Exclude any attempts to start a new grace period. */ 977 /* Exclude any attempts to start a new grace period. */
956 spin_lock_irqsave(&rsp->onofflock, flags); 978 raw_spin_lock_irqsave(&rsp->onofflock, flags);
957 979
958 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 980 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
959 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ 981 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
960 mask = rdp->grpmask; /* rnp->grplo is constant. */ 982 mask = rdp->grpmask; /* rnp->grplo is constant. */
961 do { 983 do {
962 spin_lock(&rnp->lock); /* irqs already disabled. */ 984 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
963 rnp->qsmaskinit &= ~mask; 985 rnp->qsmaskinit &= ~mask;
964 if (rnp->qsmaskinit != 0) { 986 if (rnp->qsmaskinit != 0) {
965 if (rnp != rdp->mynode) 987 if (rnp != rdp->mynode)
966 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 988 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
967 break; 989 break;
968 } 990 }
969 if (rnp == rdp->mynode) 991 if (rnp == rdp->mynode)
970 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 992 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
971 else 993 else
972 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 994 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
973 mask = rnp->grpmask; 995 mask = rnp->grpmask;
974 rnp = rnp->parent; 996 rnp = rnp->parent;
975 } while (rnp != NULL); 997 } while (rnp != NULL);
@@ -980,12 +1002,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
980 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock 1002 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
981 * held leads to deadlock. 1003 * held leads to deadlock.
982 */ 1004 */
983 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1005 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
984 rnp = rdp->mynode; 1006 rnp = rdp->mynode;
985 if (need_report & RCU_OFL_TASKS_NORM_GP) 1007 if (need_report & RCU_OFL_TASKS_NORM_GP)
986 rcu_report_unblock_qs_rnp(rnp, flags); 1008 rcu_report_unblock_qs_rnp(rnp, flags);
987 else 1009 else
988 spin_unlock_irqrestore(&rnp->lock, flags); 1010 raw_spin_unlock_irqrestore(&rnp->lock, flags);
989 if (need_report & RCU_OFL_TASKS_EXP_GP) 1011 if (need_report & RCU_OFL_TASKS_EXP_GP)
990 rcu_report_exp_rnp(rsp, rnp); 1012 rcu_report_exp_rnp(rsp, rnp);
991 1013
@@ -1144,11 +1166,9 @@ void rcu_check_callbacks(int cpu, int user)
1144/* 1166/*
1145 * Scan the leaf rcu_node structures, processing dyntick state for any that 1167 * Scan the leaf rcu_node structures, processing dyntick state for any that
1146 * have not yet encountered a quiescent state, using the function specified. 1168 * have not yet encountered a quiescent state, using the function specified.
1147 * Returns 1 if the current grace period ends while scanning (possibly 1169 * The caller must have suppressed start of new grace periods.
1148 * because we made it end).
1149 */ 1170 */
1150static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, 1171static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1151 int (*f)(struct rcu_data *))
1152{ 1172{
1153 unsigned long bit; 1173 unsigned long bit;
1154 int cpu; 1174 int cpu;
@@ -1158,13 +1178,13 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1158 1178
1159 rcu_for_each_leaf_node(rsp, rnp) { 1179 rcu_for_each_leaf_node(rsp, rnp) {
1160 mask = 0; 1180 mask = 0;
1161 spin_lock_irqsave(&rnp->lock, flags); 1181 raw_spin_lock_irqsave(&rnp->lock, flags);
1162 if (rnp->completed != lastcomp) { 1182 if (!rcu_gp_in_progress(rsp)) {
1163 spin_unlock_irqrestore(&rnp->lock, flags); 1183 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1164 return 1; 1184 return;
1165 } 1185 }
1166 if (rnp->qsmask == 0) { 1186 if (rnp->qsmask == 0) {
1167 spin_unlock_irqrestore(&rnp->lock, flags); 1187 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1168 continue; 1188 continue;
1169 } 1189 }
1170 cpu = rnp->grplo; 1190 cpu = rnp->grplo;
@@ -1173,15 +1193,14 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1173 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1193 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1174 mask |= bit; 1194 mask |= bit;
1175 } 1195 }
1176 if (mask != 0 && rnp->completed == lastcomp) { 1196 if (mask != 0) {
1177 1197
1178 /* rcu_report_qs_rnp() releases rnp->lock. */ 1198 /* rcu_report_qs_rnp() releases rnp->lock. */
1179 rcu_report_qs_rnp(mask, rsp, rnp, flags); 1199 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1180 continue; 1200 continue;
1181 } 1201 }
1182 spin_unlock_irqrestore(&rnp->lock, flags); 1202 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1183 } 1203 }
1184 return 0;
1185} 1204}
1186 1205
1187/* 1206/*
@@ -1191,32 +1210,26 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1191static void force_quiescent_state(struct rcu_state *rsp, int relaxed) 1210static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1192{ 1211{
1193 unsigned long flags; 1212 unsigned long flags;
1194 long lastcomp;
1195 struct rcu_node *rnp = rcu_get_root(rsp); 1213 struct rcu_node *rnp = rcu_get_root(rsp);
1196 u8 signaled;
1197 u8 forcenow;
1198 1214
1199 if (!rcu_gp_in_progress(rsp)) 1215 if (!rcu_gp_in_progress(rsp))
1200 return; /* No grace period in progress, nothing to force. */ 1216 return; /* No grace period in progress, nothing to force. */
1201 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { 1217 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1202 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1218 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1203 return; /* Someone else is already on the job. */ 1219 return; /* Someone else is already on the job. */
1204 } 1220 }
1205 if (relaxed && 1221 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
1206 (long)(rsp->jiffies_force_qs - jiffies) >= 0) 1222 goto unlock_fqs_ret; /* no emergency and done recently. */
1207 goto unlock_ret; /* no emergency and done recently. */
1208 rsp->n_force_qs++; 1223 rsp->n_force_qs++;
1209 spin_lock(&rnp->lock); 1224 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1210 lastcomp = rsp->gpnum - 1;
1211 signaled = rsp->signaled;
1212 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1225 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1213 if(!rcu_gp_in_progress(rsp)) { 1226 if(!rcu_gp_in_progress(rsp)) {
1214 rsp->n_force_qs_ngp++; 1227 rsp->n_force_qs_ngp++;
1215 spin_unlock(&rnp->lock); 1228 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1216 goto unlock_ret; /* no GP in progress, time updated. */ 1229 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1217 } 1230 }
1218 spin_unlock(&rnp->lock); 1231 rsp->fqs_active = 1;
1219 switch (signaled) { 1232 switch (rsp->signaled) {
1220 case RCU_GP_IDLE: 1233 case RCU_GP_IDLE:
1221 case RCU_GP_INIT: 1234 case RCU_GP_INIT:
1222 1235
@@ -1224,45 +1237,38 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1224 1237
1225 case RCU_SAVE_DYNTICK: 1238 case RCU_SAVE_DYNTICK:
1226 1239
1240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1227 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1241 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1228 break; /* So gcc recognizes the dead code. */ 1242 break; /* So gcc recognizes the dead code. */
1229 1243
1230 /* Record dyntick-idle state. */ 1244 /* Record dyntick-idle state. */
1231 if (rcu_process_dyntick(rsp, lastcomp, 1245 force_qs_rnp(rsp, dyntick_save_progress_counter);
1232 dyntick_save_progress_counter)) 1246 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1233 goto unlock_ret; 1247 if (rcu_gp_in_progress(rsp))
1234 /* fall into next case. */
1235
1236 case RCU_SAVE_COMPLETED:
1237
1238 /* Update state, record completion counter. */
1239 forcenow = 0;
1240 spin_lock(&rnp->lock);
1241 if (lastcomp + 1 == rsp->gpnum &&
1242 lastcomp == rsp->completed &&
1243 rsp->signaled == signaled) {
1244 rsp->signaled = RCU_FORCE_QS; 1248 rsp->signaled = RCU_FORCE_QS;
1245 rsp->completed_fqs = lastcomp; 1249 break;
1246 forcenow = signaled == RCU_SAVE_COMPLETED;
1247 }
1248 spin_unlock(&rnp->lock);
1249 if (!forcenow)
1250 break;
1251 /* fall into next case. */
1252 1250
1253 case RCU_FORCE_QS: 1251 case RCU_FORCE_QS:
1254 1252
1255 /* Check dyntick-idle state, send IPI to laggarts. */ 1253 /* Check dyntick-idle state, send IPI to laggarts. */
1256 if (rcu_process_dyntick(rsp, rsp->completed_fqs, 1254 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1257 rcu_implicit_dynticks_qs)) 1255 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1258 goto unlock_ret;
1259 1256
1260 /* Leave state in case more forcing is required. */ 1257 /* Leave state in case more forcing is required. */
1261 1258
1259 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1262 break; 1260 break;
1263 } 1261 }
1264unlock_ret: 1262 rsp->fqs_active = 0;
1265 spin_unlock_irqrestore(&rsp->fqslock, flags); 1263 if (rsp->fqs_need_gp) {
1264 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1265 rsp->fqs_need_gp = 0;
1266 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1267 return;
1268 }
1269 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1270unlock_fqs_ret:
1271 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1266} 1272}
1267 1273
1268#else /* #ifdef CONFIG_SMP */ 1274#else /* #ifdef CONFIG_SMP */
@@ -1290,7 +1296,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1290 * If an RCU GP has gone long enough, go check for dyntick 1296 * If an RCU GP has gone long enough, go check for dyntick
1291 * idle CPUs and, if needed, send resched IPIs. 1297 * idle CPUs and, if needed, send resched IPIs.
1292 */ 1298 */
1293 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1299 if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1294 force_quiescent_state(rsp, 1); 1300 force_quiescent_state(rsp, 1);
1295 1301
1296 /* 1302 /*
@@ -1304,7 +1310,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1304 1310
1305 /* Does this CPU require a not-yet-started grace period? */ 1311 /* Does this CPU require a not-yet-started grace period? */
1306 if (cpu_needs_another_gp(rsp, rdp)) { 1312 if (cpu_needs_another_gp(rsp, rdp)) {
1307 spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); 1313 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1308 rcu_start_gp(rsp, flags); /* releases above lock */ 1314 rcu_start_gp(rsp, flags); /* releases above lock */
1309 } 1315 }
1310 1316
@@ -1335,6 +1341,9 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1335 * grace-period manipulations above. 1341 * grace-period manipulations above.
1336 */ 1342 */
1337 smp_mb(); /* See above block comment. */ 1343 smp_mb(); /* See above block comment. */
1344
1345 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1346 rcu_needs_cpu_flush();
1338} 1347}
1339 1348
1340static void 1349static void
@@ -1369,7 +1378,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1369 unsigned long nestflag; 1378 unsigned long nestflag;
1370 struct rcu_node *rnp_root = rcu_get_root(rsp); 1379 struct rcu_node *rnp_root = rcu_get_root(rsp);
1371 1380
1372 spin_lock_irqsave(&rnp_root->lock, nestflag); 1381 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1373 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ 1382 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1374 } 1383 }
1375 1384
@@ -1387,7 +1396,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1387 force_quiescent_state(rsp, 0); 1396 force_quiescent_state(rsp, 0);
1388 rdp->n_force_qs_snap = rsp->n_force_qs; 1397 rdp->n_force_qs_snap = rsp->n_force_qs;
1389 rdp->qlen_last_fqs_check = rdp->qlen; 1398 rdp->qlen_last_fqs_check = rdp->qlen;
1390 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1399 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1391 force_quiescent_state(rsp, 1); 1400 force_quiescent_state(rsp, 1);
1392 local_irq_restore(flags); 1401 local_irq_restore(flags);
1393} 1402}
@@ -1520,7 +1529,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1520 1529
1521 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1530 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1522 if (rcu_gp_in_progress(rsp) && 1531 if (rcu_gp_in_progress(rsp) &&
1523 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { 1532 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
1524 rdp->n_rp_need_fqs++; 1533 rdp->n_rp_need_fqs++;
1525 return 1; 1534 return 1;
1526 } 1535 }
@@ -1545,10 +1554,9 @@ static int rcu_pending(int cpu)
1545/* 1554/*
1546 * Check to see if any future RCU-related work will need to be done 1555 * Check to see if any future RCU-related work will need to be done
1547 * by the current CPU, even if none need be done immediately, returning 1556 * by the current CPU, even if none need be done immediately, returning
1548 * 1 if so. This function is part of the RCU implementation; it is -not- 1557 * 1 if so.
1549 * an exported member of the RCU API.
1550 */ 1558 */
1551int rcu_needs_cpu(int cpu) 1559static int rcu_needs_cpu_quick_check(int cpu)
1552{ 1560{
1553 /* RCU callbacks either ready or pending? */ 1561 /* RCU callbacks either ready or pending? */
1554 return per_cpu(rcu_sched_data, cpu).nxtlist || 1562 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1556,21 +1564,6 @@ int rcu_needs_cpu(int cpu)
1556 rcu_preempt_needs_cpu(cpu); 1564 rcu_preempt_needs_cpu(cpu);
1557} 1565}
1558 1566
1559/*
1560 * This function is invoked towards the end of the scheduler's initialization
1561 * process. Before this is called, the idle task might contain
1562 * RCU read-side critical sections (during which time, this idle
1563 * task is booting the system). After this function is called, the
1564 * idle tasks are prohibited from containing RCU read-side critical
1565 * sections.
1566 */
1567void rcu_scheduler_starting(void)
1568{
1569 WARN_ON(num_online_cpus() != 1);
1570 WARN_ON(nr_context_switches() > 0);
1571 rcu_scheduler_active = 1;
1572}
1573
1574static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 1567static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1575static atomic_t rcu_barrier_cpu_count; 1568static atomic_t rcu_barrier_cpu_count;
1576static DEFINE_MUTEX(rcu_barrier_mutex); 1569static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -1659,7 +1652,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1659 struct rcu_node *rnp = rcu_get_root(rsp); 1652 struct rcu_node *rnp = rcu_get_root(rsp);
1660 1653
1661 /* Set up local state, ensuring consistent view of global state. */ 1654 /* Set up local state, ensuring consistent view of global state. */
1662 spin_lock_irqsave(&rnp->lock, flags); 1655 raw_spin_lock_irqsave(&rnp->lock, flags);
1663 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 1656 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1664 rdp->nxtlist = NULL; 1657 rdp->nxtlist = NULL;
1665 for (i = 0; i < RCU_NEXT_SIZE; i++) 1658 for (i = 0; i < RCU_NEXT_SIZE; i++)
@@ -1669,7 +1662,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1669 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 1662 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1670#endif /* #ifdef CONFIG_NO_HZ */ 1663#endif /* #ifdef CONFIG_NO_HZ */
1671 rdp->cpu = cpu; 1664 rdp->cpu = cpu;
1672 spin_unlock_irqrestore(&rnp->lock, flags); 1665 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1673} 1666}
1674 1667
1675/* 1668/*
@@ -1687,7 +1680,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1687 struct rcu_node *rnp = rcu_get_root(rsp); 1680 struct rcu_node *rnp = rcu_get_root(rsp);
1688 1681
1689 /* Set up local state, ensuring consistent view of global state. */ 1682 /* Set up local state, ensuring consistent view of global state. */
1690 spin_lock_irqsave(&rnp->lock, flags); 1683 raw_spin_lock_irqsave(&rnp->lock, flags);
1691 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1684 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1692 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1685 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1693 rdp->beenonline = 1; /* We have now been online. */ 1686 rdp->beenonline = 1; /* We have now been online. */
@@ -1695,7 +1688,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1695 rdp->qlen_last_fqs_check = 0; 1688 rdp->qlen_last_fqs_check = 0;
1696 rdp->n_force_qs_snap = rsp->n_force_qs; 1689 rdp->n_force_qs_snap = rsp->n_force_qs;
1697 rdp->blimit = blimit; 1690 rdp->blimit = blimit;
1698 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1691 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1699 1692
1700 /* 1693 /*
1701 * A new grace period might start here. If so, we won't be part 1694 * A new grace period might start here. If so, we won't be part
@@ -1703,14 +1696,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1703 */ 1696 */
1704 1697
1705 /* Exclude any attempts to start a new GP on large systems. */ 1698 /* Exclude any attempts to start a new GP on large systems. */
1706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1699 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1707 1700
1708 /* Add CPU to rcu_node bitmasks. */ 1701 /* Add CPU to rcu_node bitmasks. */
1709 rnp = rdp->mynode; 1702 rnp = rdp->mynode;
1710 mask = rdp->grpmask; 1703 mask = rdp->grpmask;
1711 do { 1704 do {
1712 /* Exclude any attempts to start a new GP on small systems. */ 1705 /* Exclude any attempts to start a new GP on small systems. */
1713 spin_lock(&rnp->lock); /* irqs already disabled. */ 1706 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1714 rnp->qsmaskinit |= mask; 1707 rnp->qsmaskinit |= mask;
1715 mask = rnp->grpmask; 1708 mask = rnp->grpmask;
1716 if (rnp == rdp->mynode) { 1709 if (rnp == rdp->mynode) {
@@ -1718,11 +1711,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1718 rdp->completed = rnp->completed; 1711 rdp->completed = rnp->completed;
1719 rdp->passed_quiesc_completed = rnp->completed - 1; 1712 rdp->passed_quiesc_completed = rnp->completed - 1;
1720 } 1713 }
1721 spin_unlock(&rnp->lock); /* irqs already disabled. */ 1714 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
1722 rnp = rnp->parent; 1715 rnp = rnp->parent;
1723 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1716 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1724 1717
1725 spin_unlock_irqrestore(&rsp->onofflock, flags); 1718 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1726} 1719}
1727 1720
1728static void __cpuinit rcu_online_cpu(int cpu) 1721static void __cpuinit rcu_online_cpu(int cpu)
@@ -1806,11 +1799,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1806 */ 1799 */
1807static void __init rcu_init_one(struct rcu_state *rsp) 1800static void __init rcu_init_one(struct rcu_state *rsp)
1808{ 1801{
1802 static char *buf[] = { "rcu_node_level_0",
1803 "rcu_node_level_1",
1804 "rcu_node_level_2",
1805 "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */
1809 int cpustride = 1; 1806 int cpustride = 1;
1810 int i; 1807 int i;
1811 int j; 1808 int j;
1812 struct rcu_node *rnp; 1809 struct rcu_node *rnp;
1813 1810
1811 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
1812
1814 /* Initialize the level-tracking arrays. */ 1813 /* Initialize the level-tracking arrays. */
1815 1814
1816 for (i = 1; i < NUM_RCU_LVLS; i++) 1815 for (i = 1; i < NUM_RCU_LVLS; i++)
@@ -1823,8 +1822,9 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1823 cpustride *= rsp->levelspread[i]; 1822 cpustride *= rsp->levelspread[i];
1824 rnp = rsp->level[i]; 1823 rnp = rsp->level[i];
1825 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1824 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1826 spin_lock_init(&rnp->lock); 1825 raw_spin_lock_init(&rnp->lock);
1827 lockdep_set_class(&rnp->lock, &rcu_node_class[i]); 1826 lockdep_set_class_and_name(&rnp->lock,
1827 &rcu_node_class[i], buf[i]);
1828 rnp->gpnum = 0; 1828 rnp->gpnum = 0;
1829 rnp->qsmask = 0; 1829 rnp->qsmask = 0;
1830 rnp->qsmaskinit = 0; 1830 rnp->qsmaskinit = 0;
@@ -1876,7 +1876,7 @@ do { \
1876 1876
1877void __init rcu_init(void) 1877void __init rcu_init(void)
1878{ 1878{
1879 int i; 1879 int cpu;
1880 1880
1881 rcu_bootup_announce(); 1881 rcu_bootup_announce();
1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
@@ -1896,8 +1896,8 @@ void __init rcu_init(void)
1896 * or the scheduler are operational. 1896 * or the scheduler are operational.
1897 */ 1897 */
1898 cpu_notifier(rcu_cpu_notify, 0); 1898 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(i) 1899 for_each_online_cpu(cpu)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); 1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1901} 1901}
1902 1902
1903#include "rcutree_plugin.h" 1903#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index d2a0046f63b2..1439eb504c22 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -90,12 +90,12 @@ struct rcu_dynticks {
90 * Definition for node within the RCU grace-period-detection hierarchy. 90 * Definition for node within the RCU grace-period-detection hierarchy.
91 */ 91 */
92struct rcu_node { 92struct rcu_node {
93 spinlock_t lock; /* Root rcu_node's lock protects some */ 93 raw_spinlock_t lock; /* Root rcu_node's lock protects some */
94 /* rcu_state fields as well as following. */ 94 /* rcu_state fields as well as following. */
95 long gpnum; /* Current grace period for this node. */ 95 unsigned long gpnum; /* Current grace period for this node. */
96 /* This will either be equal to or one */ 96 /* This will either be equal to or one */
97 /* behind the root rcu_node's gpnum. */ 97 /* behind the root rcu_node's gpnum. */
98 long completed; /* Last grace period completed for this node. */ 98 unsigned long completed; /* Last GP completed for this node. */
99 /* This will either be equal to or one */ 99 /* This will either be equal to or one */
100 /* behind the root rcu_node's gpnum. */ 100 /* behind the root rcu_node's gpnum. */
101 unsigned long qsmask; /* CPUs or groups that need to switch in */ 101 unsigned long qsmask; /* CPUs or groups that need to switch in */
@@ -161,11 +161,11 @@ struct rcu_node {
161/* Per-CPU data for read-copy update. */ 161/* Per-CPU data for read-copy update. */
162struct rcu_data { 162struct rcu_data {
163 /* 1) quiescent-state and grace-period handling : */ 163 /* 1) quiescent-state and grace-period handling : */
164 long completed; /* Track rsp->completed gp number */ 164 unsigned long completed; /* Track rsp->completed gp number */
165 /* in order to detect GP end. */ 165 /* in order to detect GP end. */
166 long gpnum; /* Highest gp number that this CPU */ 166 unsigned long gpnum; /* Highest gp number that this CPU */
167 /* is aware of having started. */ 167 /* is aware of having started. */
168 long passed_quiesc_completed; 168 unsigned long passed_quiesc_completed;
169 /* Value of completed at time of qs. */ 169 /* Value of completed at time of qs. */
170 bool passed_quiesc; /* User-mode/idle loop etc. */ 170 bool passed_quiesc; /* User-mode/idle loop etc. */
171 bool qs_pending; /* Core waits for quiesc state. */ 171 bool qs_pending; /* Core waits for quiesc state. */
@@ -221,14 +221,14 @@ struct rcu_data {
221 unsigned long resched_ipi; /* Sent a resched IPI. */ 221 unsigned long resched_ipi; /* Sent a resched IPI. */
222 222
223 /* 5) __rcu_pending() statistics. */ 223 /* 5) __rcu_pending() statistics. */
224 long n_rcu_pending; /* rcu_pending() calls since boot. */ 224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
225 long n_rp_qs_pending; 225 unsigned long n_rp_qs_pending;
226 long n_rp_cb_ready; 226 unsigned long n_rp_cb_ready;
227 long n_rp_cpu_needs_gp; 227 unsigned long n_rp_cpu_needs_gp;
228 long n_rp_gp_completed; 228 unsigned long n_rp_gp_completed;
229 long n_rp_gp_started; 229 unsigned long n_rp_gp_started;
230 long n_rp_need_fqs; 230 unsigned long n_rp_need_fqs;
231 long n_rp_need_nothing; 231 unsigned long n_rp_need_nothing;
232 232
233 int cpu; 233 int cpu;
234}; 234};
@@ -237,12 +237,11 @@ struct rcu_data {
237#define RCU_GP_IDLE 0 /* No grace period in progress. */ 237#define RCU_GP_IDLE 0 /* No grace period in progress. */
238#define RCU_GP_INIT 1 /* Grace period being initialized. */ 238#define RCU_GP_INIT 1 /* Grace period being initialized. */
239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
240#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */ 240#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
241#define RCU_FORCE_QS 4 /* Need to force quiescent state. */
242#ifdef CONFIG_NO_HZ 241#ifdef CONFIG_NO_HZ
243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 242#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
244#else /* #ifdef CONFIG_NO_HZ */ 243#else /* #ifdef CONFIG_NO_HZ */
245#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED 244#define RCU_SIGNAL_INIT RCU_FORCE_QS
246#endif /* #else #ifdef CONFIG_NO_HZ */ 245#endif /* #else #ifdef CONFIG_NO_HZ */
247 246
248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 247#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
@@ -256,6 +255,9 @@ struct rcu_data {
256 255
257#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 256#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
258 257
258#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
259#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
260
259/* 261/*
260 * RCU global state, including node hierarchy. This hierarchy is 262 * RCU global state, including node hierarchy. This hierarchy is
261 * represented in "heap" form in a dense array. The root (first level) 263 * represented in "heap" form in a dense array. The root (first level)
@@ -277,12 +279,19 @@ struct rcu_state {
277 279
278 u8 signaled ____cacheline_internodealigned_in_smp; 280 u8 signaled ____cacheline_internodealigned_in_smp;
279 /* Force QS state. */ 281 /* Force QS state. */
280 long gpnum; /* Current gp number. */ 282 u8 fqs_active; /* force_quiescent_state() */
281 long completed; /* # of last completed gp. */ 283 /* is running. */
284 u8 fqs_need_gp; /* A CPU was prevented from */
285 /* starting a new grace */
286 /* period because */
287 /* force_quiescent_state() */
288 /* was running. */
289 unsigned long gpnum; /* Current gp number. */
290 unsigned long completed; /* # of last completed gp. */
282 291
283 /* End of fields guarded by root rcu_node's lock. */ 292 /* End of fields guarded by root rcu_node's lock. */
284 293
285 spinlock_t onofflock; /* exclude on/offline and */ 294 raw_spinlock_t onofflock; /* exclude on/offline and */
286 /* starting new GP. Also */ 295 /* starting new GP. Also */
287 /* protects the following */ 296 /* protects the following */
288 /* orphan_cbs fields. */ 297 /* orphan_cbs fields. */
@@ -292,10 +301,8 @@ struct rcu_state {
292 /* going offline. */ 301 /* going offline. */
293 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ 302 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
294 long orphan_qlen; /* Number of orphaned cbs. */ 303 long orphan_qlen; /* Number of orphaned cbs. */
295 spinlock_t fqslock; /* Only one task forcing */ 304 raw_spinlock_t fqslock; /* Only one task forcing */
296 /* quiescent states. */ 305 /* quiescent states. */
297 long completed_fqs; /* Value of completed @ snap. */
298 /* Protected by fqslock. */
299 unsigned long jiffies_force_qs; /* Time at which to invoke */ 306 unsigned long jiffies_force_qs; /* Time at which to invoke */
300 /* force_quiescent_state(). */ 307 /* force_quiescent_state(). */
301 unsigned long n_force_qs; /* Number of calls to */ 308 unsigned long n_force_qs; /* Number of calls to */
@@ -319,8 +326,6 @@ struct rcu_state {
319#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ 326#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
320 /* GP were moved to root. */ 327 /* GP were moved to root. */
321 328
322#ifdef RCU_TREE_NONCORE
323
324/* 329/*
325 * RCU implementation internal declarations: 330 * RCU implementation internal declarations:
326 */ 331 */
@@ -335,7 +340,7 @@ extern struct rcu_state rcu_preempt_state;
335DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 340DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
336#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 341#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
337 342
338#else /* #ifdef RCU_TREE_NONCORE */ 343#ifndef RCU_TREE_NONCORE
339 344
340/* Forward declarations for rcutree_plugin.h */ 345/* Forward declarations for rcutree_plugin.h */
341static void rcu_bootup_announce(void); 346static void rcu_bootup_announce(void);
@@ -347,6 +352,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
347 unsigned long flags); 352 unsigned long flags);
348#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 353#endif /* #ifdef CONFIG_HOTPLUG_CPU */
349#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 354#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
355static void rcu_print_detail_task_stall(struct rcu_state *rsp);
350static void rcu_print_task_stall(struct rcu_node *rnp); 356static void rcu_print_task_stall(struct rcu_node *rnp);
351#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 357#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
352static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 358static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
@@ -367,5 +373,6 @@ static int rcu_preempt_needs_cpu(int cpu);
367static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 373static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
368static void rcu_preempt_send_cbs_to_orphanage(void); 374static void rcu_preempt_send_cbs_to_orphanage(void);
369static void __init __rcu_init_preempt(void); 375static void __init __rcu_init_preempt(void);
376static void rcu_needs_cpu_flush(void);
370 377
371#endif /* #else #ifdef RCU_TREE_NONCORE */ 378#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 37fbccdf41d5..464ad2cdee00 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -62,6 +62,15 @@ long rcu_batches_completed(void)
62EXPORT_SYMBOL_GPL(rcu_batches_completed); 62EXPORT_SYMBOL_GPL(rcu_batches_completed);
63 63
64/* 64/*
65 * Force a quiescent state for preemptible RCU.
66 */
67void rcu_force_quiescent_state(void)
68{
69 force_quiescent_state(&rcu_preempt_state, 0);
70}
71EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
72
73/*
65 * Record a preemptable-RCU quiescent state for the specified CPU. Note 74 * Record a preemptable-RCU quiescent state for the specified CPU. Note
66 * that this just means that the task currently running on the CPU is 75 * that this just means that the task currently running on the CPU is
67 * not in a quiescent state. There might be any number of tasks blocked 76 * not in a quiescent state. There might be any number of tasks blocked
@@ -102,7 +111,7 @@ static void rcu_preempt_note_context_switch(int cpu)
102 /* Possibly blocking in an RCU read-side critical section. */ 111 /* Possibly blocking in an RCU read-side critical section. */
103 rdp = rcu_preempt_state.rda[cpu]; 112 rdp = rcu_preempt_state.rda[cpu];
104 rnp = rdp->mynode; 113 rnp = rdp->mynode;
105 spin_lock_irqsave(&rnp->lock, flags); 114 raw_spin_lock_irqsave(&rnp->lock, flags);
106 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 115 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
107 t->rcu_blocked_node = rnp; 116 t->rcu_blocked_node = rnp;
108 117
@@ -123,7 +132,7 @@ static void rcu_preempt_note_context_switch(int cpu)
123 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 132 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
124 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 133 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
125 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 134 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
126 spin_unlock_irqrestore(&rnp->lock, flags); 135 raw_spin_unlock_irqrestore(&rnp->lock, flags);
127 } 136 }
128 137
129 /* 138 /*
@@ -180,7 +189,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
180 struct rcu_node *rnp_p; 189 struct rcu_node *rnp_p;
181 190
182 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 191 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
183 spin_unlock_irqrestore(&rnp->lock, flags); 192 raw_spin_unlock_irqrestore(&rnp->lock, flags);
184 return; /* Still need more quiescent states! */ 193 return; /* Still need more quiescent states! */
185 } 194 }
186 195
@@ -197,8 +206,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
197 206
198 /* Report up the rest of the hierarchy. */ 207 /* Report up the rest of the hierarchy. */
199 mask = rnp->grpmask; 208 mask = rnp->grpmask;
200 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 209 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
201 spin_lock(&rnp_p->lock); /* irqs already disabled. */ 210 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
202 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); 211 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
203} 212}
204 213
@@ -248,10 +257,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
248 */ 257 */
249 for (;;) { 258 for (;;) {
250 rnp = t->rcu_blocked_node; 259 rnp = t->rcu_blocked_node;
251 spin_lock(&rnp->lock); /* irqs already disabled. */ 260 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
252 if (rnp == t->rcu_blocked_node) 261 if (rnp == t->rcu_blocked_node)
253 break; 262 break;
254 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 263 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
255 } 264 }
256 empty = !rcu_preempted_readers(rnp); 265 empty = !rcu_preempted_readers(rnp);
257 empty_exp = !rcu_preempted_readers_exp(rnp); 266 empty_exp = !rcu_preempted_readers_exp(rnp);
@@ -265,7 +274,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
265 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 274 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
266 */ 275 */
267 if (empty) 276 if (empty)
268 spin_unlock_irqrestore(&rnp->lock, flags); 277 raw_spin_unlock_irqrestore(&rnp->lock, flags);
269 else 278 else
270 rcu_report_unblock_qs_rnp(rnp, flags); 279 rcu_report_unblock_qs_rnp(rnp, flags);
271 280
@@ -295,29 +304,73 @@ void __rcu_read_unlock(void)
295 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 304 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
296 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 305 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
297 rcu_read_unlock_special(t); 306 rcu_read_unlock_special(t);
307#ifdef CONFIG_PROVE_LOCKING
308 WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0);
309#endif /* #ifdef CONFIG_PROVE_LOCKING */
298} 310}
299EXPORT_SYMBOL_GPL(__rcu_read_unlock); 311EXPORT_SYMBOL_GPL(__rcu_read_unlock);
300 312
301#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 313#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
302 314
315#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
316
317/*
318 * Dump detailed information for all tasks blocking the current RCU
319 * grace period on the specified rcu_node structure.
320 */
321static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
322{
323 unsigned long flags;
324 struct list_head *lp;
325 int phase;
326 struct task_struct *t;
327
328 if (rcu_preempted_readers(rnp)) {
329 raw_spin_lock_irqsave(&rnp->lock, flags);
330 phase = rnp->gpnum & 0x1;
331 lp = &rnp->blocked_tasks[phase];
332 list_for_each_entry(t, lp, rcu_node_entry)
333 sched_show_task(t);
334 raw_spin_unlock_irqrestore(&rnp->lock, flags);
335 }
336}
337
338/*
339 * Dump detailed information for all tasks blocking the current RCU
340 * grace period.
341 */
342static void rcu_print_detail_task_stall(struct rcu_state *rsp)
343{
344 struct rcu_node *rnp = rcu_get_root(rsp);
345
346 rcu_print_detail_task_stall_rnp(rnp);
347 rcu_for_each_leaf_node(rsp, rnp)
348 rcu_print_detail_task_stall_rnp(rnp);
349}
350
351#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
352
353static void rcu_print_detail_task_stall(struct rcu_state *rsp)
354{
355}
356
357#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
358
303/* 359/*
304 * Scan the current list of tasks blocked within RCU read-side critical 360 * Scan the current list of tasks blocked within RCU read-side critical
305 * sections, printing out the tid of each. 361 * sections, printing out the tid of each.
306 */ 362 */
307static void rcu_print_task_stall(struct rcu_node *rnp) 363static void rcu_print_task_stall(struct rcu_node *rnp)
308{ 364{
309 unsigned long flags;
310 struct list_head *lp; 365 struct list_head *lp;
311 int phase; 366 int phase;
312 struct task_struct *t; 367 struct task_struct *t;
313 368
314 if (rcu_preempted_readers(rnp)) { 369 if (rcu_preempted_readers(rnp)) {
315 spin_lock_irqsave(&rnp->lock, flags);
316 phase = rnp->gpnum & 0x1; 370 phase = rnp->gpnum & 0x1;
317 lp = &rnp->blocked_tasks[phase]; 371 lp = &rnp->blocked_tasks[phase];
318 list_for_each_entry(t, lp, rcu_node_entry) 372 list_for_each_entry(t, lp, rcu_node_entry)
319 printk(" P%d", t->pid); 373 printk(" P%d", t->pid);
320 spin_unlock_irqrestore(&rnp->lock, flags);
321 } 374 }
322} 375}
323 376
@@ -388,11 +441,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
388 lp_root = &rnp_root->blocked_tasks[i]; 441 lp_root = &rnp_root->blocked_tasks[i];
389 while (!list_empty(lp)) { 442 while (!list_empty(lp)) {
390 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 443 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
391 spin_lock(&rnp_root->lock); /* irqs already disabled */ 444 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
392 list_del(&tp->rcu_node_entry); 445 list_del(&tp->rcu_node_entry);
393 tp->rcu_blocked_node = rnp_root; 446 tp->rcu_blocked_node = rnp_root;
394 list_add(&tp->rcu_node_entry, lp_root); 447 list_add(&tp->rcu_node_entry, lp_root);
395 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 448 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */
396 } 449 }
397 } 450 }
398 return retval; 451 return retval;
@@ -516,7 +569,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
516 unsigned long flags; 569 unsigned long flags;
517 unsigned long mask; 570 unsigned long mask;
518 571
519 spin_lock_irqsave(&rnp->lock, flags); 572 raw_spin_lock_irqsave(&rnp->lock, flags);
520 for (;;) { 573 for (;;) {
521 if (!sync_rcu_preempt_exp_done(rnp)) 574 if (!sync_rcu_preempt_exp_done(rnp))
522 break; 575 break;
@@ -525,12 +578,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
525 break; 578 break;
526 } 579 }
527 mask = rnp->grpmask; 580 mask = rnp->grpmask;
528 spin_unlock(&rnp->lock); /* irqs remain disabled */ 581 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
529 rnp = rnp->parent; 582 rnp = rnp->parent;
530 spin_lock(&rnp->lock); /* irqs already disabled */ 583 raw_spin_lock(&rnp->lock); /* irqs already disabled */
531 rnp->expmask &= ~mask; 584 rnp->expmask &= ~mask;
532 } 585 }
533 spin_unlock_irqrestore(&rnp->lock, flags); 586 raw_spin_unlock_irqrestore(&rnp->lock, flags);
534} 587}
535 588
536/* 589/*
@@ -545,11 +598,11 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
545{ 598{
546 int must_wait; 599 int must_wait;
547 600
548 spin_lock(&rnp->lock); /* irqs already disabled */ 601 raw_spin_lock(&rnp->lock); /* irqs already disabled */
549 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 602 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
550 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 603 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
551 must_wait = rcu_preempted_readers_exp(rnp); 604 must_wait = rcu_preempted_readers_exp(rnp);
552 spin_unlock(&rnp->lock); /* irqs remain disabled */ 605 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
553 if (!must_wait) 606 if (!must_wait)
554 rcu_report_exp_rnp(rsp, rnp); 607 rcu_report_exp_rnp(rsp, rnp);
555} 608}
@@ -594,13 +647,13 @@ void synchronize_rcu_expedited(void)
594 /* force all RCU readers onto blocked_tasks[]. */ 647 /* force all RCU readers onto blocked_tasks[]. */
595 synchronize_sched_expedited(); 648 synchronize_sched_expedited();
596 649
597 spin_lock_irqsave(&rsp->onofflock, flags); 650 raw_spin_lock_irqsave(&rsp->onofflock, flags);
598 651
599 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 652 /* Initialize ->expmask for all non-leaf rcu_node structures. */
600 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 653 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
601 spin_lock(&rnp->lock); /* irqs already disabled. */ 654 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
602 rnp->expmask = rnp->qsmaskinit; 655 rnp->expmask = rnp->qsmaskinit;
603 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 656 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
604 } 657 }
605 658
606 /* Snapshot current state of ->blocked_tasks[] lists. */ 659 /* Snapshot current state of ->blocked_tasks[] lists. */
@@ -609,7 +662,7 @@ void synchronize_rcu_expedited(void)
609 if (NUM_RCU_NODES > 1) 662 if (NUM_RCU_NODES > 1)
610 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); 663 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
611 664
612 spin_unlock_irqrestore(&rsp->onofflock, flags); 665 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
613 666
614 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 667 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
615 rnp = rcu_get_root(rsp); 668 rnp = rcu_get_root(rsp);
@@ -713,6 +766,16 @@ long rcu_batches_completed(void)
713EXPORT_SYMBOL_GPL(rcu_batches_completed); 766EXPORT_SYMBOL_GPL(rcu_batches_completed);
714 767
715/* 768/*
769 * Force a quiescent state for RCU, which, because there is no preemptible
770 * RCU, becomes the same as rcu-sched.
771 */
772void rcu_force_quiescent_state(void)
773{
774 rcu_sched_force_quiescent_state();
775}
776EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
777
778/*
716 * Because preemptable RCU does not exist, we never have to check for 779 * Because preemptable RCU does not exist, we never have to check for
717 * CPUs being in quiescent states. 780 * CPUs being in quiescent states.
718 */ 781 */
@@ -734,7 +797,7 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
734/* Because preemptible RCU does not exist, no quieting of tasks. */ 797/* Because preemptible RCU does not exist, no quieting of tasks. */
735static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 798static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
736{ 799{
737 spin_unlock_irqrestore(&rnp->lock, flags); 800 raw_spin_unlock_irqrestore(&rnp->lock, flags);
738} 801}
739 802
740#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 803#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -745,6 +808,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
745 * Because preemptable RCU does not exist, we never have to check for 808 * Because preemptable RCU does not exist, we never have to check for
746 * tasks blocked within RCU read-side critical sections. 809 * tasks blocked within RCU read-side critical sections.
747 */ 810 */
811static void rcu_print_detail_task_stall(struct rcu_state *rsp)
812{
813}
814
815/*
816 * Because preemptable RCU does not exist, we never have to check for
817 * tasks blocked within RCU read-side critical sections.
818 */
748static void rcu_print_task_stall(struct rcu_node *rnp) 819static void rcu_print_task_stall(struct rcu_node *rnp)
749{ 820{
750} 821}
@@ -884,3 +955,113 @@ static void __init __rcu_init_preempt(void)
884} 955}
885 956
886#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 957#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
958
959#if !defined(CONFIG_RCU_FAST_NO_HZ)
960
961/*
962 * Check to see if any future RCU-related work will need to be done
963 * by the current CPU, even if none need be done immediately, returning
964 * 1 if so. This function is part of the RCU implementation; it is -not-
965 * an exported member of the RCU API.
966 *
967 * Because we have preemptible RCU, just check whether this CPU needs
968 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption
969 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
970 */
971int rcu_needs_cpu(int cpu)
972{
973 return rcu_needs_cpu_quick_check(cpu);
974}
975
976/*
977 * Check to see if we need to continue a callback-flush operations to
978 * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle
979 * entry is not configured, so we never do need to.
980 */
981static void rcu_needs_cpu_flush(void)
982{
983}
984
985#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
986
987#define RCU_NEEDS_CPU_FLUSHES 5
988static DEFINE_PER_CPU(int, rcu_dyntick_drain);
989static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
990
991/*
992 * Check to see if any future RCU-related work will need to be done
993 * by the current CPU, even if none need be done immediately, returning
994 * 1 if so. This function is part of the RCU implementation; it is -not-
995 * an exported member of the RCU API.
996 *
997 * Because we are not supporting preemptible RCU, attempt to accelerate
998 * any current grace periods so that RCU no longer needs this CPU, but
999 * only if all other CPUs are already in dynticks-idle mode. This will
1000 * allow the CPU cores to be powered down immediately, as opposed to after
1001 * waiting many milliseconds for grace periods to elapse.
1002 *
1003 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1004 * disabled, we do one pass of force_quiescent_state(), then do a
1005 * raise_softirq() to cause rcu_process_callbacks() to be invoked later.
1006 * The per-cpu rcu_dyntick_drain variable controls the sequencing.
1007 */
1008int rcu_needs_cpu(int cpu)
1009{
1010 int c = 0;
1011 int thatcpu;
1012
1013 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1014 for_each_cpu_not(thatcpu, nohz_cpu_mask)
1015 if (thatcpu != cpu) {
1016 per_cpu(rcu_dyntick_drain, cpu) = 0;
1017 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1018 return rcu_needs_cpu_quick_check(cpu);
1019 }
1020
1021 /* Check and update the rcu_dyntick_drain sequencing. */
1022 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1023 /* First time through, initialize the counter. */
1024 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
1025 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1026 /* We have hit the limit, so time to give up. */
1027 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1028 return rcu_needs_cpu_quick_check(cpu);
1029 }
1030
1031 /* Do one step pushing remaining RCU callbacks through. */
1032 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1033 rcu_sched_qs(cpu);
1034 force_quiescent_state(&rcu_sched_state, 0);
1035 c = c || per_cpu(rcu_sched_data, cpu).nxtlist;
1036 }
1037 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1038 rcu_bh_qs(cpu);
1039 force_quiescent_state(&rcu_bh_state, 0);
1040 c = c || per_cpu(rcu_bh_data, cpu).nxtlist;
1041 }
1042
1043 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1044 if (c) {
1045 raise_softirq(RCU_SOFTIRQ);
1046 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1047 }
1048 return c;
1049}
1050
1051/*
1052 * Check to see if we need to continue a callback-flush operations to
1053 * allow the last CPU to enter dyntick-idle mode.
1054 */
1055static void rcu_needs_cpu_flush(void)
1056{
1057 int cpu = smp_processor_id();
1058 unsigned long flags;
1059
1060 if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
1061 return;
1062 local_irq_save(flags);
1063 (void)rcu_needs_cpu(cpu);
1064 local_irq_restore(flags);
1065}
1066
1067#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 9d2c88423b31..d45db2e35d27 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -50,7 +50,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 50{
51 if (!rdp->beenonline) 51 if (!rdp->beenonline)
52 return; 52 return;
53 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d", 53 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d",
54 rdp->cpu, 54 rdp->cpu,
55 cpu_is_offline(rdp->cpu) ? '!' : ' ', 55 cpu_is_offline(rdp->cpu) ? '!' : ' ',
56 rdp->completed, rdp->gpnum, 56 rdp->completed, rdp->gpnum,
@@ -105,7 +105,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
105{ 105{
106 if (!rdp->beenonline) 106 if (!rdp->beenonline)
107 return; 107 return;
108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d",
109 rdp->cpu, 109 rdp->cpu,
110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
111 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
@@ -155,13 +155,13 @@ static const struct file_operations rcudata_csv_fops = {
155 155
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 157{
158 long gpnum; 158 unsigned long gpnum;
159 int level = 0; 159 int level = 0;
160 int phase; 160 int phase;
161 struct rcu_node *rnp; 161 struct rcu_node *rnp;
162 162
163 gpnum = rsp->gpnum; 163 gpnum = rsp->gpnum;
164 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 164 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
166 rsp->completed, gpnum, rsp->signaled, 166 rsp->completed, gpnum, rsp->signaled,
167 (long)(rsp->jiffies_force_qs - jiffies), 167 (long)(rsp->jiffies_force_qs - jiffies),
@@ -215,12 +215,12 @@ static const struct file_operations rcuhier_fops = {
215static int show_rcugp(struct seq_file *m, void *unused) 215static int show_rcugp(struct seq_file *m, void *unused)
216{ 216{
217#ifdef CONFIG_TREE_PREEMPT_RCU 217#ifdef CONFIG_TREE_PREEMPT_RCU
218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n", 218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n",
219 rcu_preempt_state.completed, rcu_preempt_state.gpnum); 219 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", 221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n",
222 rcu_sched_state.completed, rcu_sched_state.gpnum); 222 rcu_sched_state.completed, rcu_sched_state.gpnum);
223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
224 rcu_bh_state.completed, rcu_bh_state.gpnum); 224 rcu_bh_state.completed, rcu_bh_state.gpnum);
225 return 0; 225 return 0;
226} 226}
diff --git a/kernel/relay.c b/kernel/relay.c
index c705a41b4ba3..3d97f2821611 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
1215/* 1215/*
1216 * subbuf_splice_actor - splice up to one subbuf's worth of data 1216 * subbuf_splice_actor - splice up to one subbuf's worth of data
1217 */ 1217 */
1218static int subbuf_splice_actor(struct file *in, 1218static ssize_t subbuf_splice_actor(struct file *in,
1219 loff_t *ppos, 1219 loff_t *ppos,
1220 struct pipe_inode_info *pipe, 1220 struct pipe_inode_info *pipe,
1221 size_t len, 1221 size_t len,
1222 unsigned int flags, 1222 unsigned int flags,
1223 int *nonpad_ret) 1223 int *nonpad_ret)
1224{ 1224{
1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; 1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
1226 struct rchan_buf *rbuf = in->private_data; 1226 struct rchan_buf *rbuf = in->private_data;
1227 unsigned int subbuf_size = rbuf->chan->subbuf_size; 1227 unsigned int subbuf_size = rbuf->chan->subbuf_size;
1228 uint64_t pos = (uint64_t) *ppos; 1228 uint64_t pos = (uint64_t) *ppos;
@@ -1241,6 +1241,7 @@ static int subbuf_splice_actor(struct file *in,
1241 .ops = &relay_pipe_buf_ops, 1241 .ops = &relay_pipe_buf_ops,
1242 .spd_release = relay_page_release, 1242 .spd_release = relay_page_release,
1243 }; 1243 };
1244 ssize_t ret;
1244 1245
1245 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1246 return 0; 1247 return 0;
diff --git a/kernel/resource.c b/kernel/resource.c
index dc15686b7a77..2d5be5d9bf5f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -188,6 +188,36 @@ static int __release_resource(struct resource *old)
188 return -EINVAL; 188 return -EINVAL;
189} 189}
190 190
191static void __release_child_resources(struct resource *r)
192{
193 struct resource *tmp, *p;
194 resource_size_t size;
195
196 p = r->child;
197 r->child = NULL;
198 while (p) {
199 tmp = p;
200 p = p->sibling;
201
202 tmp->parent = NULL;
203 tmp->sibling = NULL;
204 __release_child_resources(tmp);
205
206 printk(KERN_DEBUG "release child resource %pR\n", tmp);
207 /* need to restore size, and keep flags */
208 size = resource_size(tmp);
209 tmp->start = 0;
210 tmp->end = size - 1;
211 }
212}
213
214void release_child_resources(struct resource *r)
215{
216 write_lock(&resource_lock);
217 __release_child_resources(r);
218 write_unlock(&resource_lock);
219}
220
191/** 221/**
192 * request_resource - request and reserve an I/O or memory resource 222 * request_resource - request and reserve an I/O or memory resource
193 * @root: root resource descriptor 223 * @root: root resource descriptor
@@ -274,7 +304,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
274 void *arg, int (*func)(unsigned long, unsigned long, void *)) 304 void *arg, int (*func)(unsigned long, unsigned long, void *))
275{ 305{
276 struct resource res; 306 struct resource res;
277 unsigned long pfn, len; 307 unsigned long pfn, end_pfn;
278 u64 orig_end; 308 u64 orig_end;
279 int ret = -1; 309 int ret = -1;
280 310
@@ -284,9 +314,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
284 orig_end = res.end; 314 orig_end = res.end;
285 while ((res.start < res.end) && 315 while ((res.start < res.end) &&
286 (find_next_system_ram(&res, "System RAM") >= 0)) { 316 (find_next_system_ram(&res, "System RAM") >= 0)) {
287 pfn = (unsigned long)(res.start >> PAGE_SHIFT); 317 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
288 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); 318 end_pfn = (res.end + 1) >> PAGE_SHIFT;
289 ret = (*func)(pfn, len, arg); 319 if (end_pfn > pfn)
320 ret = (*func)(pfn, end_pfn - pfn, arg);
290 if (ret) 321 if (ret)
291 break; 322 break;
292 res.start = res.end + 1; 323 res.start = res.end + 1;
@@ -297,48 +328,63 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
297 328
298#endif 329#endif
299 330
331static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
332{
333 return 1;
334}
335/*
336 * This generic page_is_ram() returns true if specified address is
337 * registered as "System RAM" in iomem_resource list.
338 */
339int __weak page_is_ram(unsigned long pfn)
340{
341 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
342}
343
300/* 344/*
301 * Find empty slot in the resource tree given range and alignment. 345 * Find empty slot in the resource tree given range and alignment.
302 */ 346 */
303static int find_resource(struct resource *root, struct resource *new, 347static int find_resource(struct resource *root, struct resource *new,
304 resource_size_t size, resource_size_t min, 348 resource_size_t size, resource_size_t min,
305 resource_size_t max, resource_size_t align, 349 resource_size_t max, resource_size_t align,
306 void (*alignf)(void *, struct resource *, 350 resource_size_t (*alignf)(void *,
307 resource_size_t, resource_size_t), 351 const struct resource *,
352 resource_size_t,
353 resource_size_t),
308 void *alignf_data) 354 void *alignf_data)
309{ 355{
310 struct resource *this = root->child; 356 struct resource *this = root->child;
311 resource_size_t start, end; 357 struct resource tmp = *new;
312 358
313 start = root->start; 359 tmp.start = root->start;
314 /* 360 /*
315 * Skip past an allocated resource that starts at 0, since the assignment 361 * Skip past an allocated resource that starts at 0, since the assignment
316 * of this->start - 1 to new->end below would cause an underflow. 362 * of this->start - 1 to tmp->end below would cause an underflow.
317 */ 363 */
318 if (this && this->start == 0) { 364 if (this && this->start == 0) {
319 start = this->end + 1; 365 tmp.start = this->end + 1;
320 this = this->sibling; 366 this = this->sibling;
321 } 367 }
322 for(;;) { 368 for(;;) {
323 if (this) 369 if (this)
324 end = this->start - 1; 370 tmp.end = this->start - 1;
325 else 371 else
326 end = root->end; 372 tmp.end = root->end;
327 if (start < min) 373 if (tmp.start < min)
328 start = min; 374 tmp.start = min;
329 if (end > max) 375 if (tmp.end > max)
330 end = max; 376 tmp.end = max;
331 start = ALIGN(start, align); 377 tmp.start = ALIGN(tmp.start, align);
332 if (alignf) 378 if (alignf)
333 alignf(alignf_data, new, size, align); 379 tmp.start = alignf(alignf_data, &tmp, size, align);
334 if (start < end && end - start >= size - 1) { 380 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
335 new->start = start; 381 new->start = tmp.start;
336 new->end = start + size - 1; 382 new->end = tmp.start + size - 1;
337 return 0; 383 return 0;
338 } 384 }
339 if (!this) 385 if (!this)
340 break; 386 break;
341 start = this->end + 1; 387 tmp.start = this->end + 1;
342 this = this->sibling; 388 this = this->sibling;
343 } 389 }
344 return -EBUSY; 390 return -EBUSY;
@@ -358,8 +404,10 @@ static int find_resource(struct resource *root, struct resource *new,
358int allocate_resource(struct resource *root, struct resource *new, 404int allocate_resource(struct resource *root, struct resource *new,
359 resource_size_t size, resource_size_t min, 405 resource_size_t size, resource_size_t min,
360 resource_size_t max, resource_size_t align, 406 resource_size_t max, resource_size_t align,
361 void (*alignf)(void *, struct resource *, 407 resource_size_t (*alignf)(void *,
362 resource_size_t, resource_size_t), 408 const struct resource *,
409 resource_size_t,
410 resource_size_t),
363 void *alignf_data) 411 void *alignf_data)
364{ 412{
365 int err; 413 int err;
diff --git a/kernel/sched.c b/kernel/sched.c
index 18cceeecce35..b47ceeec1a91 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233 */ 233 */
234static DEFINE_MUTEX(sched_domains_mutex); 234static DEFINE_MUTEX(sched_domains_mutex);
235 235
236#ifdef CONFIG_GROUP_SCHED 236#ifdef CONFIG_CGROUP_SCHED
237 237
238#include <linux/cgroup.h> 238#include <linux/cgroup.h>
239 239
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
243 243
244/* task group related information */ 244/* task group related information */
245struct task_group { 245struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css; 246 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253 247
254#ifdef CONFIG_FAIR_GROUP_SCHED 248#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */ 249 /* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
274 struct list_head children; 268 struct list_head children;
275}; 269};
276 270
277#ifdef CONFIG_USER_SCHED
278
279/* Helper function to pass uid information to create_sched_user() */
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285/*
286 * Root task group.
287 * Every UID task group (including init_task_group aka UID-0) will
288 * be a child to this group.
289 */
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293/* Default task group's sched entity on each cpu */
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295/* Default task group's cfs_rq on each cpu */
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif /* CONFIG_FAIR_GROUP_SCHED */
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 271#define root_task_group init_task_group
305#endif /* CONFIG_USER_SCHED */
306 272
307/* task_group_lock serializes add/remove of task groups and also changes to 273/* task_group_lock serializes add/remove of task groups and also changes to
308 * a task group's cpu shares. 274 * a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
318} 284}
319#endif 285#endif
320 286
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else /* !CONFIG_USER_SCHED */
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 287# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif /* CONFIG_USER_SCHED */
326 288
327/* 289/*
328 * A weight of 0 or 1 can cause arithmetics problems. 290 * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
348{ 310{
349 struct task_group *tg; 311 struct task_group *tg;
350 312
351#ifdef CONFIG_USER_SCHED 313#ifdef CONFIG_CGROUP_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 314 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css); 315 struct task_group, css);
358#else 316#else
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
383 return NULL; 341 return NULL;
384} 342}
385 343
386#endif /* CONFIG_GROUP_SCHED */ 344#endif /* CONFIG_CGROUP_SCHED */
387 345
388/* CFS-related fields in a runqueue */ 346/* CFS-related fields in a runqueue */
389struct cfs_rq { 347struct cfs_rq {
@@ -478,7 +436,6 @@ struct rt_rq {
478 struct rq *rq; 436 struct rq *rq;
479 struct list_head leaf_rt_rq_list; 437 struct list_head leaf_rt_rq_list;
480 struct task_group *tg; 438 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif 439#endif
483}; 440};
484 441
@@ -645,6 +602,11 @@ static inline int cpu_of(struct rq *rq)
645#endif 602#endif
646} 603}
647 604
605#define rcu_dereference_check_sched_domain(p) \
606 rcu_dereference_check((p), \
607 rcu_read_lock_sched_held() || \
608 lockdep_is_held(&sched_domains_mutex))
609
648/* 610/*
649 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 611 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
650 * See detach_destroy_domains: synchronize_sched for details. 612 * See detach_destroy_domains: synchronize_sched for details.
@@ -653,7 +615,7 @@ static inline int cpu_of(struct rq *rq)
653 * preempt-disabled sections. 615 * preempt-disabled sections.
654 */ 616 */
655#define for_each_domain(cpu, __sd) \ 617#define for_each_domain(cpu, __sd) \
656 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 618 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
657 619
658#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 620#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
659#define this_rq() (&__get_cpu_var(runqueues)) 621#define this_rq() (&__get_cpu_var(runqueues))
@@ -941,16 +903,33 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
941#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 903#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
942 904
943/* 905/*
906 * Check whether the task is waking, we use this to synchronize against
907 * ttwu() so that task_cpu() reports a stable number.
908 *
909 * We need to make an exception for PF_STARTING tasks because the fork
910 * path might require task_rq_lock() to work, eg. it can call
911 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
912 */
913static inline int task_is_waking(struct task_struct *p)
914{
915 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
916}
917
918/*
944 * __task_rq_lock - lock the runqueue a given task resides on. 919 * __task_rq_lock - lock the runqueue a given task resides on.
945 * Must be called interrupts disabled. 920 * Must be called interrupts disabled.
946 */ 921 */
947static inline struct rq *__task_rq_lock(struct task_struct *p) 922static inline struct rq *__task_rq_lock(struct task_struct *p)
948 __acquires(rq->lock) 923 __acquires(rq->lock)
949{ 924{
925 struct rq *rq;
926
950 for (;;) { 927 for (;;) {
951 struct rq *rq = task_rq(p); 928 while (task_is_waking(p))
929 cpu_relax();
930 rq = task_rq(p);
952 raw_spin_lock(&rq->lock); 931 raw_spin_lock(&rq->lock);
953 if (likely(rq == task_rq(p))) 932 if (likely(rq == task_rq(p) && !task_is_waking(p)))
954 return rq; 933 return rq;
955 raw_spin_unlock(&rq->lock); 934 raw_spin_unlock(&rq->lock);
956 } 935 }
@@ -967,10 +946,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
967 struct rq *rq; 946 struct rq *rq;
968 947
969 for (;;) { 948 for (;;) {
949 while (task_is_waking(p))
950 cpu_relax();
970 local_irq_save(*flags); 951 local_irq_save(*flags);
971 rq = task_rq(p); 952 rq = task_rq(p);
972 raw_spin_lock(&rq->lock); 953 raw_spin_lock(&rq->lock);
973 if (likely(rq == task_rq(p))) 954 if (likely(rq == task_rq(p) && !task_is_waking(p)))
974 return rq; 955 return rq;
975 raw_spin_unlock_irqrestore(&rq->lock, *flags); 956 raw_spin_unlock_irqrestore(&rq->lock, *flags);
976 } 957 }
@@ -1390,32 +1371,6 @@ static const u32 prio_to_wmult[40] = {
1390 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1371 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1391}; 1372};
1392 1373
1393static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1394
1395/*
1396 * runqueue iterator, to support SMP load-balancing between different
1397 * scheduling classes, without having to expose their internal data
1398 * structures to the load-balancing proper:
1399 */
1400struct rq_iterator {
1401 void *arg;
1402 struct task_struct *(*start)(void *);
1403 struct task_struct *(*next)(void *);
1404};
1405
1406#ifdef CONFIG_SMP
1407static unsigned long
1408balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1409 unsigned long max_load_move, struct sched_domain *sd,
1410 enum cpu_idle_type idle, int *all_pinned,
1411 int *this_best_prio, struct rq_iterator *iterator);
1412
1413static int
1414iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1415 struct sched_domain *sd, enum cpu_idle_type idle,
1416 struct rq_iterator *iterator);
1417#endif
1418
1419/* Time spent by the tasks of the cpu accounting group executing in ... */ 1374/* Time spent by the tasks of the cpu accounting group executing in ... */
1420enum cpuacct_stat_index { 1375enum cpuacct_stat_index {
1421 CPUACCT_STAT_USER, /* ... user mode */ 1376 CPUACCT_STAT_USER, /* ... user mode */
@@ -1531,7 +1486,7 @@ static unsigned long target_load(int cpu, int type)
1531 1486
1532static struct sched_group *group_of(int cpu) 1487static struct sched_group *group_of(int cpu)
1533{ 1488{
1534 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); 1489 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1535 1490
1536 if (!sd) 1491 if (!sd)
1537 return NULL; 1492 return NULL;
@@ -1566,7 +1521,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1566 1521
1567#ifdef CONFIG_FAIR_GROUP_SCHED 1522#ifdef CONFIG_FAIR_GROUP_SCHED
1568 1523
1569static __read_mostly unsigned long *update_shares_data; 1524static __read_mostly unsigned long __percpu *update_shares_data;
1570 1525
1571static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1526static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1572 1527
@@ -1701,16 +1656,6 @@ static void update_shares(struct sched_domain *sd)
1701 } 1656 }
1702} 1657}
1703 1658
1704static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1705{
1706 if (root_task_group_empty())
1707 return;
1708
1709 raw_spin_unlock(&rq->lock);
1710 update_shares(sd);
1711 raw_spin_lock(&rq->lock);
1712}
1713
1714static void update_h_load(long cpu) 1659static void update_h_load(long cpu)
1715{ 1660{
1716 if (root_task_group_empty()) 1661 if (root_task_group_empty())
@@ -1725,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd)
1725{ 1670{
1726} 1671}
1727 1672
1728static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1729{
1730}
1731
1732#endif 1673#endif
1733 1674
1734#ifdef CONFIG_PREEMPT 1675#ifdef CONFIG_PREEMPT
@@ -1805,6 +1746,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 raw_spin_unlock(&busiest->lock); 1746 raw_spin_unlock(&busiest->lock);
1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1747 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1807} 1748}
1749
1750/*
1751 * double_rq_lock - safely lock two runqueues
1752 *
1753 * Note this does not disable interrupts like task_rq_lock,
1754 * you need to do so manually before calling.
1755 */
1756static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1757 __acquires(rq1->lock)
1758 __acquires(rq2->lock)
1759{
1760 BUG_ON(!irqs_disabled());
1761 if (rq1 == rq2) {
1762 raw_spin_lock(&rq1->lock);
1763 __acquire(rq2->lock); /* Fake it out ;) */
1764 } else {
1765 if (rq1 < rq2) {
1766 raw_spin_lock(&rq1->lock);
1767 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1768 } else {
1769 raw_spin_lock(&rq2->lock);
1770 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1771 }
1772 }
1773 update_rq_clock(rq1);
1774 update_rq_clock(rq2);
1775}
1776
1777/*
1778 * double_rq_unlock - safely unlock two runqueues
1779 *
1780 * Note this does not restore interrupts like task_rq_unlock,
1781 * you need to do so manually after calling.
1782 */
1783static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1784 __releases(rq1->lock)
1785 __releases(rq2->lock)
1786{
1787 raw_spin_unlock(&rq1->lock);
1788 if (rq1 != rq2)
1789 raw_spin_unlock(&rq2->lock);
1790 else
1791 __release(rq2->lock);
1792}
1793
1808#endif 1794#endif
1809 1795
1810#ifdef CONFIG_FAIR_GROUP_SCHED 1796#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1834,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1834#endif 1820#endif
1835} 1821}
1836 1822
1837#include "sched_stats.h" 1823static const struct sched_class rt_sched_class;
1838#include "sched_idletask.c"
1839#include "sched_fair.c"
1840#include "sched_rt.c"
1841#ifdef CONFIG_SCHED_DEBUG
1842# include "sched_debug.c"
1843#endif
1844 1824
1845#define sched_class_highest (&rt_sched_class) 1825#define sched_class_highest (&rt_sched_class)
1846#define for_each_class(class) \ 1826#define for_each_class(class) \
1847 for (class = sched_class_highest; class; class = class->next) 1827 for (class = sched_class_highest; class; class = class->next)
1848 1828
1829#include "sched_stats.h"
1830
1849static void inc_nr_running(struct rq *rq) 1831static void inc_nr_running(struct rq *rq)
1850{ 1832{
1851 rq->nr_running++; 1833 rq->nr_running++;
@@ -1883,13 +1865,14 @@ static void update_avg(u64 *avg, u64 sample)
1883 *avg += diff >> 3; 1865 *avg += diff >> 3;
1884} 1866}
1885 1867
1886static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1868static void
1869enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1887{ 1870{
1888 if (wakeup) 1871 if (wakeup)
1889 p->se.start_runtime = p->se.sum_exec_runtime; 1872 p->se.start_runtime = p->se.sum_exec_runtime;
1890 1873
1891 sched_info_queued(p); 1874 sched_info_queued(p);
1892 p->sched_class->enqueue_task(rq, p, wakeup); 1875 p->sched_class->enqueue_task(rq, p, wakeup, head);
1893 p->se.on_rq = 1; 1876 p->se.on_rq = 1;
1894} 1877}
1895 1878
@@ -1912,6 +1895,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1912} 1895}
1913 1896
1914/* 1897/*
1898 * activate_task - move a task to the runqueue.
1899 */
1900static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1901{
1902 if (task_contributes_to_load(p))
1903 rq->nr_uninterruptible--;
1904
1905 enqueue_task(rq, p, wakeup, false);
1906 inc_nr_running(rq);
1907}
1908
1909/*
1910 * deactivate_task - remove a task from the runqueue.
1911 */
1912static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1913{
1914 if (task_contributes_to_load(p))
1915 rq->nr_uninterruptible++;
1916
1917 dequeue_task(rq, p, sleep);
1918 dec_nr_running(rq);
1919}
1920
1921#include "sched_idletask.c"
1922#include "sched_fair.c"
1923#include "sched_rt.c"
1924#ifdef CONFIG_SCHED_DEBUG
1925# include "sched_debug.c"
1926#endif
1927
1928/*
1915 * __normal_prio - return the priority that is based on the static prio 1929 * __normal_prio - return the priority that is based on the static prio
1916 */ 1930 */
1917static inline int __normal_prio(struct task_struct *p) 1931static inline int __normal_prio(struct task_struct *p)
@@ -1957,30 +1971,6 @@ static int effective_prio(struct task_struct *p)
1957 return p->prio; 1971 return p->prio;
1958} 1972}
1959 1973
1960/*
1961 * activate_task - move a task to the runqueue.
1962 */
1963static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1964{
1965 if (task_contributes_to_load(p))
1966 rq->nr_uninterruptible--;
1967
1968 enqueue_task(rq, p, wakeup);
1969 inc_nr_running(rq);
1970}
1971
1972/*
1973 * deactivate_task - remove a task from the runqueue.
1974 */
1975static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1976{
1977 if (task_contributes_to_load(p))
1978 rq->nr_uninterruptible++;
1979
1980 dequeue_task(rq, p, sleep);
1981 dec_nr_running(rq);
1982}
1983
1984/** 1974/**
1985 * task_curr - is this task currently executing on a CPU? 1975 * task_curr - is this task currently executing on a CPU?
1986 * @p: the task in question. 1976 * @p: the task in question.
@@ -2002,39 +1992,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2002 p->sched_class->prio_changed(rq, p, oldprio, running); 1992 p->sched_class->prio_changed(rq, p, oldprio, running);
2003} 1993}
2004 1994
2005/**
2006 * kthread_bind - bind a just-created kthread to a cpu.
2007 * @p: thread created by kthread_create().
2008 * @cpu: cpu (might not be online, must be possible) for @k to run on.
2009 *
2010 * Description: This function is equivalent to set_cpus_allowed(),
2011 * except that @cpu doesn't need to be online, and the thread must be
2012 * stopped (i.e., just returned from kthread_create()).
2013 *
2014 * Function lives here instead of kthread.c because it messes with
2015 * scheduler internals which require locking.
2016 */
2017void kthread_bind(struct task_struct *p, unsigned int cpu)
2018{
2019 struct rq *rq = cpu_rq(cpu);
2020 unsigned long flags;
2021
2022 /* Must have done schedule() in kthread() before we set_task_cpu */
2023 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
2024 WARN_ON(1);
2025 return;
2026 }
2027
2028 raw_spin_lock_irqsave(&rq->lock, flags);
2029 update_rq_clock(rq);
2030 set_task_cpu(p, cpu);
2031 p->cpus_allowed = cpumask_of_cpu(cpu);
2032 p->rt.nr_cpus_allowed = 1;
2033 p->flags |= PF_THREAD_BOUND;
2034 raw_spin_unlock_irqrestore(&rq->lock, flags);
2035}
2036EXPORT_SYMBOL(kthread_bind);
2037
2038#ifdef CONFIG_SMP 1995#ifdef CONFIG_SMP
2039/* 1996/*
2040 * Is this task likely cache-hot: 1997 * Is this task likely cache-hot:
@@ -2044,6 +2001,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2044{ 2001{
2045 s64 delta; 2002 s64 delta;
2046 2003
2004 if (p->sched_class != &fair_sched_class)
2005 return 0;
2006
2047 /* 2007 /*
2048 * Buddy candidates are cache hot: 2008 * Buddy candidates are cache hot:
2049 */ 2009 */
@@ -2052,9 +2012,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2052 &p->se == cfs_rq_of(&p->se)->last)) 2012 &p->se == cfs_rq_of(&p->se)->last))
2053 return 1; 2013 return 1;
2054 2014
2055 if (p->sched_class != &fair_sched_class)
2056 return 0;
2057
2058 if (sysctl_sched_migration_cost == -1) 2015 if (sysctl_sched_migration_cost == -1)
2059 return 1; 2016 return 1;
2060 if (sysctl_sched_migration_cost == 0) 2017 if (sysctl_sched_migration_cost == 0)
@@ -2065,22 +2022,23 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2065 return delta < (s64)sysctl_sched_migration_cost; 2022 return delta < (s64)sysctl_sched_migration_cost;
2066} 2023}
2067 2024
2068
2069void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2025void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2070{ 2026{
2071 int old_cpu = task_cpu(p); 2027#ifdef CONFIG_SCHED_DEBUG
2072 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2028 /*
2073 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2029 * We should never call set_task_cpu() on a blocked task,
2030 * ttwu() will sort out the placement.
2031 */
2032 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2033 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2034#endif
2074 2035
2075 trace_sched_migrate_task(p, new_cpu); 2036 trace_sched_migrate_task(p, new_cpu);
2076 2037
2077 if (old_cpu != new_cpu) { 2038 if (task_cpu(p) != new_cpu) {
2078 p->se.nr_migrations++; 2039 p->se.nr_migrations++;
2079 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2040 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2080 1, 1, NULL, 0);
2081 } 2041 }
2082 p->se.vruntime -= old_cfsrq->min_vruntime -
2083 new_cfsrq->min_vruntime;
2084 2042
2085 __set_task_cpu(p, new_cpu); 2043 __set_task_cpu(p, new_cpu);
2086} 2044}
@@ -2105,13 +2063,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2105 2063
2106 /* 2064 /*
2107 * If the task is not on a runqueue (and not running), then 2065 * If the task is not on a runqueue (and not running), then
2108 * it is sufficient to simply update the task's cpu field. 2066 * the next wake-up will properly place the task.
2109 */ 2067 */
2110 if (!p->se.on_rq && !task_running(rq, p)) { 2068 if (!p->se.on_rq && !task_running(rq, p))
2111 update_rq_clock(rq);
2112 set_task_cpu(p, dest_cpu);
2113 return 0; 2069 return 0;
2114 }
2115 2070
2116 init_completion(&req->done); 2071 init_completion(&req->done);
2117 req->task = p; 2072 req->task = p;
@@ -2317,10 +2272,71 @@ void task_oncpu_function_call(struct task_struct *p,
2317} 2272}
2318 2273
2319#ifdef CONFIG_SMP 2274#ifdef CONFIG_SMP
2275static int select_fallback_rq(int cpu, struct task_struct *p)
2276{
2277 int dest_cpu;
2278 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2279
2280 /* Look for allowed, online CPU in same node. */
2281 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2282 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2283 return dest_cpu;
2284
2285 /* Any allowed, online CPU? */
2286 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2287 if (dest_cpu < nr_cpu_ids)
2288 return dest_cpu;
2289
2290 /* No more Mr. Nice Guy. */
2291 if (dest_cpu >= nr_cpu_ids) {
2292 rcu_read_lock();
2293 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2294 rcu_read_unlock();
2295 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2296
2297 /*
2298 * Don't tell them about moving exiting tasks or
2299 * kernel threads (both mm NULL), since they never
2300 * leave kernel.
2301 */
2302 if (p->mm && printk_ratelimit()) {
2303 printk(KERN_INFO "process %d (%s) no "
2304 "longer affine to cpu%d\n",
2305 task_pid_nr(p), p->comm, cpu);
2306 }
2307 }
2308
2309 return dest_cpu;
2310}
2311
2312/*
2313 * Gets called from 3 sites (exec, fork, wakeup), since it is called without
2314 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2315 * by:
2316 *
2317 * exec: is unstable, retry loop
2318 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2319 */
2320static inline 2320static inline
2321int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2321int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2322{ 2322{
2323 return p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2323 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2324
2325 /*
2326 * In order not to call set_task_cpu() on a blocking task we need
2327 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2328 * cpu.
2329 *
2330 * Since this is common to all placement strategies, this lives here.
2331 *
2332 * [ this allows ->select_task() to simply return task_cpu(p) and
2333 * not worry about this generic constraint ]
2334 */
2335 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2336 !cpu_online(cpu)))
2337 cpu = select_fallback_rq(task_cpu(p), p);
2338
2339 return cpu;
2324} 2340}
2325#endif 2341#endif
2326 2342
@@ -2375,17 +2391,34 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2375 if (task_contributes_to_load(p)) 2391 if (task_contributes_to_load(p))
2376 rq->nr_uninterruptible--; 2392 rq->nr_uninterruptible--;
2377 p->state = TASK_WAKING; 2393 p->state = TASK_WAKING;
2394
2395 if (p->sched_class->task_waking)
2396 p->sched_class->task_waking(rq, p);
2397
2378 __task_rq_unlock(rq); 2398 __task_rq_unlock(rq);
2379 2399
2380 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2400 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2381 if (cpu != orig_cpu) 2401 if (cpu != orig_cpu) {
2402 /*
2403 * Since we migrate the task without holding any rq->lock,
2404 * we need to be careful with task_rq_lock(), since that
2405 * might end up locking an invalid rq.
2406 */
2382 set_task_cpu(p, cpu); 2407 set_task_cpu(p, cpu);
2408 }
2383 2409
2384 rq = __task_rq_lock(p); 2410 rq = cpu_rq(cpu);
2411 raw_spin_lock(&rq->lock);
2385 update_rq_clock(rq); 2412 update_rq_clock(rq);
2386 2413
2414 /*
2415 * We migrated the task without holding either rq->lock, however
2416 * since the task is not on the task list itself, nobody else
2417 * will try and migrate the task, hence the rq should match the
2418 * cpu we just moved it to.
2419 */
2420 WARN_ON(task_cpu(p) != cpu);
2387 WARN_ON(p->state != TASK_WAKING); 2421 WARN_ON(p->state != TASK_WAKING);
2388 cpu = task_cpu(p);
2389 2422
2390#ifdef CONFIG_SCHEDSTATS 2423#ifdef CONFIG_SCHEDSTATS
2391 schedstat_inc(rq, ttwu_count); 2424 schedstat_inc(rq, ttwu_count);
@@ -2438,8 +2471,8 @@ out_running:
2438 2471
2439 p->state = TASK_RUNNING; 2472 p->state = TASK_RUNNING;
2440#ifdef CONFIG_SMP 2473#ifdef CONFIG_SMP
2441 if (p->sched_class->task_wake_up) 2474 if (p->sched_class->task_woken)
2442 p->sched_class->task_wake_up(rq, p); 2475 p->sched_class->task_woken(rq, p);
2443 2476
2444 if (unlikely(rq->idle_stamp)) { 2477 if (unlikely(rq->idle_stamp)) {
2445 u64 delta = rq->clock - rq->idle_stamp; 2478 u64 delta = rq->clock - rq->idle_stamp;
@@ -2538,14 +2571,6 @@ static void __sched_fork(struct task_struct *p)
2538#ifdef CONFIG_PREEMPT_NOTIFIERS 2571#ifdef CONFIG_PREEMPT_NOTIFIERS
2539 INIT_HLIST_HEAD(&p->preempt_notifiers); 2572 INIT_HLIST_HEAD(&p->preempt_notifiers);
2540#endif 2573#endif
2541
2542 /*
2543 * We mark the process as running here, but have not actually
2544 * inserted it onto the runqueue yet. This guarantees that
2545 * nobody will actually run it, and a signal or other external
2546 * event cannot wake it up and insert it on the runqueue either.
2547 */
2548 p->state = TASK_RUNNING;
2549} 2574}
2550 2575
2551/* 2576/*
@@ -2556,6 +2581,12 @@ void sched_fork(struct task_struct *p, int clone_flags)
2556 int cpu = get_cpu(); 2581 int cpu = get_cpu();
2557 2582
2558 __sched_fork(p); 2583 __sched_fork(p);
2584 /*
2585 * We mark the process as waking here. This guarantees that
2586 * nobody will actually run it, and a signal or other external
2587 * event cannot wake it up and insert it on the runqueue either.
2588 */
2589 p->state = TASK_WAKING;
2559 2590
2560 /* 2591 /*
2561 * Revert to default priority/policy on fork if requested. 2592 * Revert to default priority/policy on fork if requested.
@@ -2590,9 +2621,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
2590 if (p->sched_class->task_fork) 2621 if (p->sched_class->task_fork)
2591 p->sched_class->task_fork(p); 2622 p->sched_class->task_fork(p);
2592 2623
2593#ifdef CONFIG_SMP
2594 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2595#endif
2596 set_task_cpu(p, cpu); 2624 set_task_cpu(p, cpu);
2597 2625
2598#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2626#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2622,18 +2650,41 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2622{ 2650{
2623 unsigned long flags; 2651 unsigned long flags;
2624 struct rq *rq; 2652 struct rq *rq;
2653 int cpu = get_cpu();
2625 2654
2626 rq = task_rq_lock(p, &flags); 2655#ifdef CONFIG_SMP
2627 BUG_ON(p->state != TASK_RUNNING); 2656 /*
2657 * Fork balancing, do it here and not earlier because:
2658 * - cpus_allowed can change in the fork path
2659 * - any previously selected cpu might disappear through hotplug
2660 *
2661 * We still have TASK_WAKING but PF_STARTING is gone now, meaning
2662 * ->cpus_allowed is stable, we have preemption disabled, meaning
2663 * cpu_online_mask is stable.
2664 */
2665 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2666 set_task_cpu(p, cpu);
2667#endif
2668
2669 /*
2670 * Since the task is not on the rq and we still have TASK_WAKING set
2671 * nobody else will migrate this task.
2672 */
2673 rq = cpu_rq(cpu);
2674 raw_spin_lock_irqsave(&rq->lock, flags);
2675
2676 BUG_ON(p->state != TASK_WAKING);
2677 p->state = TASK_RUNNING;
2628 update_rq_clock(rq); 2678 update_rq_clock(rq);
2629 activate_task(rq, p, 0); 2679 activate_task(rq, p, 0);
2630 trace_sched_wakeup_new(rq, p, 1); 2680 trace_sched_wakeup_new(rq, p, 1);
2631 check_preempt_curr(rq, p, WF_FORK); 2681 check_preempt_curr(rq, p, WF_FORK);
2632#ifdef CONFIG_SMP 2682#ifdef CONFIG_SMP
2633 if (p->sched_class->task_wake_up) 2683 if (p->sched_class->task_woken)
2634 p->sched_class->task_wake_up(rq, p); 2684 p->sched_class->task_woken(rq, p);
2635#endif 2685#endif
2636 task_rq_unlock(rq, &flags); 2686 task_rq_unlock(rq, &flags);
2687 put_cpu();
2637} 2688}
2638 2689
2639#ifdef CONFIG_PREEMPT_NOTIFIERS 2690#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2752,7 +2803,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2752 */ 2803 */
2753 prev_state = prev->state; 2804 prev_state = prev->state;
2754 finish_arch_switch(prev); 2805 finish_arch_switch(prev);
2755 perf_event_task_sched_in(current, cpu_of(rq)); 2806#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2807 local_irq_disable();
2808#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2809 perf_event_task_sched_in(current);
2810#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2811 local_irq_enable();
2812#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2756 finish_lock_switch(rq, prev); 2813 finish_lock_switch(rq, prev);
2757 2814
2758 fire_sched_in_preempt_notifiers(current); 2815 fire_sched_in_preempt_notifiers(current);
@@ -3057,65 +3114,36 @@ static void update_cpu_load(struct rq *this_rq)
3057#ifdef CONFIG_SMP 3114#ifdef CONFIG_SMP
3058 3115
3059/* 3116/*
3060 * double_rq_lock - safely lock two runqueues 3117 * sched_exec - execve() is a valuable balancing opportunity, because at
3061 * 3118 * this point the task has the smallest effective memory and cache footprint.
3062 * Note this does not disable interrupts like task_rq_lock,
3063 * you need to do so manually before calling.
3064 */
3065static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3066 __acquires(rq1->lock)
3067 __acquires(rq2->lock)
3068{
3069 BUG_ON(!irqs_disabled());
3070 if (rq1 == rq2) {
3071 raw_spin_lock(&rq1->lock);
3072 __acquire(rq2->lock); /* Fake it out ;) */
3073 } else {
3074 if (rq1 < rq2) {
3075 raw_spin_lock(&rq1->lock);
3076 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3077 } else {
3078 raw_spin_lock(&rq2->lock);
3079 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3080 }
3081 }
3082 update_rq_clock(rq1);
3083 update_rq_clock(rq2);
3084}
3085
3086/*
3087 * double_rq_unlock - safely unlock two runqueues
3088 *
3089 * Note this does not restore interrupts like task_rq_unlock,
3090 * you need to do so manually after calling.
3091 */
3092static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3093 __releases(rq1->lock)
3094 __releases(rq2->lock)
3095{
3096 raw_spin_unlock(&rq1->lock);
3097 if (rq1 != rq2)
3098 raw_spin_unlock(&rq2->lock);
3099 else
3100 __release(rq2->lock);
3101}
3102
3103/*
3104 * If dest_cpu is allowed for this process, migrate the task to it.
3105 * This is accomplished by forcing the cpu_allowed mask to only
3106 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
3107 * the cpu_allowed mask is restored.
3108 */ 3119 */
3109static void sched_migrate_task(struct task_struct *p, int dest_cpu) 3120void sched_exec(void)
3110{ 3121{
3122 struct task_struct *p = current;
3111 struct migration_req req; 3123 struct migration_req req;
3124 int dest_cpu, this_cpu;
3112 unsigned long flags; 3125 unsigned long flags;
3113 struct rq *rq; 3126 struct rq *rq;
3114 3127
3128again:
3129 this_cpu = get_cpu();
3130 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3131 if (dest_cpu == this_cpu) {
3132 put_cpu();
3133 return;
3134 }
3135
3115 rq = task_rq_lock(p, &flags); 3136 rq = task_rq_lock(p, &flags);
3137 put_cpu();
3138
3139 /*
3140 * select_task_rq() can race against ->cpus_allowed
3141 */
3116 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3142 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
3117 || unlikely(!cpu_active(dest_cpu))) 3143 || unlikely(!cpu_active(dest_cpu))) {
3118 goto out; 3144 task_rq_unlock(rq, &flags);
3145 goto again;
3146 }
3119 3147
3120 /* force the process onto the specified CPU */ 3148 /* force the process onto the specified CPU */
3121 if (migrate_task(p, dest_cpu, &req)) { 3149 if (migrate_task(p, dest_cpu, &req)) {
@@ -3130,1788 +3158,9 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
3130 3158
3131 return; 3159 return;
3132 } 3160 }
3133out:
3134 task_rq_unlock(rq, &flags); 3161 task_rq_unlock(rq, &flags);
3135} 3162}
3136 3163
3137/*
3138 * sched_exec - execve() is a valuable balancing opportunity, because at
3139 * this point the task has the smallest effective memory and cache footprint.
3140 */
3141void sched_exec(void)
3142{
3143 int new_cpu, this_cpu = get_cpu();
3144 new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0);
3145 put_cpu();
3146 if (new_cpu != this_cpu)
3147 sched_migrate_task(current, new_cpu);
3148}
3149
/*
 * pull_task - move task p from the remote runqueue src_rq onto the local
 * runqueue this_rq, which belongs to this_cpu.
 *
 * Both runqueues must be locked by the caller.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
		      struct rq *this_rq, int this_cpu)
{
	/* Take p off the source queue and retarget it to this_cpu ... */
	deactivate_task(src_rq, p, 0);
	set_task_cpu(p, this_cpu);

	/* ... then queue it locally and run the preemption check. */
	activate_task(this_rq, p, 0);
	check_preempt_curr(this_rq, p, 0);
}
3162
3163/*
3164 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3165 */
3166static
3167int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3168 struct sched_domain *sd, enum cpu_idle_type idle,
3169 int *all_pinned)
3170{
3171 int tsk_cache_hot = 0;
3172 /*
3173 * We do not migrate tasks that are:
3174 * 1) running (obviously), or
3175 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3176 * 3) are cache-hot on their current CPU.
3177 */
3178 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3179 schedstat_inc(p, se.nr_failed_migrations_affine);
3180 return 0;
3181 }
3182 *all_pinned = 0;
3183
3184 if (task_running(rq, p)) {
3185 schedstat_inc(p, se.nr_failed_migrations_running);
3186 return 0;
3187 }
3188
3189 /*
3190 * Aggressive migration if:
3191 * 1) task is cache cold, or
3192 * 2) too many balance attempts have failed.
3193 */
3194
3195 tsk_cache_hot = task_hot(p, rq->clock, sd);
3196 if (!tsk_cache_hot ||
3197 sd->nr_balance_failed > sd->cache_nice_tries) {
3198#ifdef CONFIG_SCHEDSTATS
3199 if (tsk_cache_hot) {
3200 schedstat_inc(sd, lb_hot_gained[idle]);
3201 schedstat_inc(p, se.nr_forced_migrations);
3202 }
3203#endif
3204 return 1;
3205 }
3206
3207 if (tsk_cache_hot) {
3208 schedstat_inc(p, se.nr_failed_migrations_hot);
3209 return 0;
3210 }
3211 return 1;
3212}
3213
3214static unsigned long
3215balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3216 unsigned long max_load_move, struct sched_domain *sd,
3217 enum cpu_idle_type idle, int *all_pinned,
3218 int *this_best_prio, struct rq_iterator *iterator)
3219{
3220 int loops = 0, pulled = 0, pinned = 0;
3221 struct task_struct *p;
3222 long rem_load_move = max_load_move;
3223
3224 if (max_load_move == 0)
3225 goto out;
3226
3227 pinned = 1;
3228
3229 /*
3230 * Start the load-balancing iterator:
3231 */
3232 p = iterator->start(iterator->arg);
3233next:
3234 if (!p || loops++ > sysctl_sched_nr_migrate)
3235 goto out;
3236
3237 if ((p->se.load.weight >> 1) > rem_load_move ||
3238 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3239 p = iterator->next(iterator->arg);
3240 goto next;
3241 }
3242
3243 pull_task(busiest, p, this_rq, this_cpu);
3244 pulled++;
3245 rem_load_move -= p->se.load.weight;
3246
3247#ifdef CONFIG_PREEMPT
3248 /*
3249 * NEWIDLE balancing is a source of latency, so preemptible kernels
3250 * will stop after the first task is pulled to minimize the critical
3251 * section.
3252 */
3253 if (idle == CPU_NEWLY_IDLE)
3254 goto out;
3255#endif
3256
3257 /*
3258 * We only want to steal up to the prescribed amount of weighted load.
3259 */
3260 if (rem_load_move > 0) {
3261 if (p->prio < *this_best_prio)
3262 *this_best_prio = p->prio;
3263 p = iterator->next(iterator->arg);
3264 goto next;
3265 }
3266out:
3267 /*
3268 * Right now, this is one of only two places pull_task() is called,
3269 * so we can safely collect pull_task() stats here rather than
3270 * inside pull_task().
3271 */
3272 schedstat_add(sd, lb_gained[idle], pulled);
3273
3274 if (all_pinned)
3275 *all_pinned = pinned;
3276
3277 return max_load_move - rem_load_move;
3278}
3279
3280/*
3281 * move_tasks tries to move up to max_load_move weighted load from busiest to
3282 * this_rq, as part of a balancing operation within domain "sd".
3283 * Returns 1 if successful and 0 otherwise.
3284 *
3285 * Called with both runqueues locked.
3286 */
3287static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3288 unsigned long max_load_move,
3289 struct sched_domain *sd, enum cpu_idle_type idle,
3290 int *all_pinned)
3291{
3292 const struct sched_class *class = sched_class_highest;
3293 unsigned long total_load_moved = 0;
3294 int this_best_prio = this_rq->curr->prio;
3295
3296 do {
3297 total_load_moved +=
3298 class->load_balance(this_rq, this_cpu, busiest,
3299 max_load_move - total_load_moved,
3300 sd, idle, all_pinned, &this_best_prio);
3301 class = class->next;
3302
3303#ifdef CONFIG_PREEMPT
3304 /*
3305 * NEWIDLE balancing is a source of latency, so preemptible
3306 * kernels will stop after the first task is pulled to minimize
3307 * the critical section.
3308 */
3309 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3310 break;
3311#endif
3312 } while (class && max_load_move > total_load_moved);
3313
3314 return total_load_moved > 0;
3315}
3316
3317static int
3318iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3319 struct sched_domain *sd, enum cpu_idle_type idle,
3320 struct rq_iterator *iterator)
3321{
3322 struct task_struct *p = iterator->start(iterator->arg);
3323 int pinned = 0;
3324
3325 while (p) {
3326 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3327 pull_task(busiest, p, this_rq, this_cpu);
3328 /*
3329 * Right now, this is only the second place pull_task()
3330 * is called, so we can safely collect pull_task()
3331 * stats here rather than inside pull_task().
3332 */
3333 schedstat_inc(sd, lb_gained[idle]);
3334
3335 return 1;
3336 }
3337 p = iterator->next(iterator->arg);
3338 }
3339
3340 return 0;
3341}
3342
3343/*
3344 * move_one_task tries to move exactly one task from busiest to this_rq, as
3345 * part of active balancing operations within "domain".
3346 * Returns 1 if successful and 0 otherwise.
3347 *
3348 * Called with both runqueues locked.
3349 */
3350static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3351 struct sched_domain *sd, enum cpu_idle_type idle)
3352{
3353 const struct sched_class *class;
3354
3355 for_each_class(class) {
3356 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3357 return 1;
3358 }
3359
3360 return 0;
3361}
3362/********** Helpers for find_busiest_group ************************/
3363/*
3364 * sd_lb_stats - Structure to store the statistics of a sched_domain
3365 * during load balancing.
 *
 * find_busiest_group() zeroes an instance with memset() and has it filled
 * in by update_sd_lb_stats().
3366 */
3367struct sd_lb_stats {
3368 struct sched_group *busiest; /* Busiest group in this sd */
3369 struct sched_group *this; /* Local group in this sd */
3370 unsigned long total_load; /* Total load of all groups in sd */
3371 unsigned long total_pwr; /* Total power of all groups in sd */
3372 unsigned long avg_load; /* Average load across all groups in sd */
3373
3374 /* Statistics of this (the local) group */
3375 unsigned long this_load;
3376 unsigned long this_load_per_task;
3377 unsigned long this_nr_running;
3378
3379 /* Statistics of the busiest group */
3380 unsigned long max_load;
3381 unsigned long busiest_load_per_task;
3382 unsigned long busiest_nr_running;
3383
3384 int group_imb; /* group_imb of the busiest group */
3385#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3386 int power_savings_balance; /* Is powersave balance needed for this sd */
3387 struct sched_group *group_min; /* Least loaded group in sd */
3388 struct sched_group *group_leader; /* Group which relieves group_min */
3389 unsigned long min_load_per_task; /* load_per_task in group_min */
3390 unsigned long leader_nr_running; /* Nr running of group_leader */
3391 unsigned long min_nr_running; /* Nr running of group_min */
3392#endif
3393};
3394
3395/*
3396 * sg_lb_stats - stats of a sched_group required for load_balancing
 *
 * update_sd_lb_stats() zeroes one instance per group and has it filled
 * in by update_sg_lb_stats().
3397 */
3398struct sg_lb_stats {
3399 unsigned long avg_load; /* Avg load across the CPUs of the group */
3400 unsigned long group_load; /* Total load over the CPUs of the group */
3401 unsigned long sum_nr_running; /* Nr tasks running in the group */
3402 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3403 unsigned long group_capacity; /* cpu_power in units of SCHED_LOAD_SCALE, rounded */
3404 int group_imb; /* Is there an imbalance in the group ? */
3405};
3406
/**
 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
 * @group: The group whose first cpu is to be returned.
 */
static inline unsigned int group_first_cpu(struct sched_group *group)
{
	const struct cpumask *group_cpus = sched_group_cpus(group);

	return cpumask_first(group_cpus);
}
3415
3416/**
3417 * get_sd_load_idx - Obtain the load index for a given sched domain.
3418 * @sd: The sched_domain whose load_idx is to be obtained.
3419 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3420 */
3421static inline int get_sd_load_idx(struct sched_domain *sd,
3422 enum cpu_idle_type idle)
3423{
3424 int load_idx;
3425
3426 switch (idle) {
3427 case CPU_NOT_IDLE:
3428 load_idx = sd->busy_idx;
3429 break;
3430
3431 case CPU_NEWLY_IDLE:
3432 load_idx = sd->newidle_idx;
3433 break;
3434 default:
3435 load_idx = sd->idle_idx;
3436 break;
3437 }
3438
3439 return load_idx;
3440}
3441
3442
3443#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3444/**
3445 * init_sd_power_savings_stats - Initialize power savings statistics for
3446 * the given sched_domain, during load balancing.
3447 *
3448 * @sd: Sched domain whose power-savings statistics are to be initialized.
3449 * @sds: Variable containing the statistics for sd.
3450 * @idle: Idle status of the CPU at which we're performing load-balancing.
3451 */
3452static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3453 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3454{
3455 /*
3456 * Busy processors will not participate in power savings
3457 * balance.
3458 */
3459 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3460 sds->power_savings_balance = 0;
3461 else {
3462 sds->power_savings_balance = 1;
3463 sds->min_nr_running = ULONG_MAX;
3464 sds->leader_nr_running = 0;
3465 }
3466}
3467
3468/**
3469 * update_sd_power_savings_stats - Update the power saving stats for a
3470 * sched_domain while performing load balancing.
3471 *
3472 * @group: sched_group belonging to the sched_domain under consideration.
3473 * @sds: Variable containing the statistics of the sched_domain
3474 * @local_group: Does group contain the CPU for which we're performing
3475 * load balancing ?
3476 * @sgs: Variable containing the statistics of the group.
3477 */
3478static inline void update_sd_power_savings_stats(struct sched_group *group,
3479 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3480{
3481
3482 if (!sds->power_savings_balance)
3483 return;
3484
3485 /*
3486 * If the local group is idle or completely loaded
3487 * no need to do power savings balance at this domain
3488 */
3489 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3490 !sds->this_nr_running))
3491 sds->power_savings_balance = 0;
3492
3493 /*
3494 * If a group is already running at full capacity or idle,
3495 * don't include that group in power savings calculations
3496 */
3497 if (!sds->power_savings_balance ||
3498 sgs->sum_nr_running >= sgs->group_capacity ||
3499 !sgs->sum_nr_running)
3500 return;
3501
3502 /*
3503 * Calculate the group which has the least non-idle load.
3504 * This is the group from where we need to pick up the load
3505 * for saving power
3506 */
3507 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3508 (sgs->sum_nr_running == sds->min_nr_running &&
3509 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3510 sds->group_min = group;
3511 sds->min_nr_running = sgs->sum_nr_running;
3512 sds->min_load_per_task = sgs->sum_weighted_load /
3513 sgs->sum_nr_running;
3514 }
3515
3516 /*
3517 * Calculate the group which is almost near its
3518 * capacity but still has some space to pick up some load
3519 * from other group and save more power
3520 */
3521 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3522 return;
3523
3524 if (sgs->sum_nr_running > sds->leader_nr_running ||
3525 (sgs->sum_nr_running == sds->leader_nr_running &&
3526 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3527 sds->group_leader = group;
3528 sds->leader_nr_running = sgs->sum_nr_running;
3529 }
3530}
3531
3532/**
3533 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3534 * @sds: Variable containing the statistics of the sched_domain
3535 * under consideration.
3536 * @this_cpu: Cpu at which we're currently performing load-balancing.
3537 * @imbalance: Variable to store the imbalance.
3538 *
3539 * Description:
3540 * Check if we have potential to perform some power-savings balance.
3541 * If yes, set the busiest group to be the least loaded group in the
3542 * sched_domain, so that it's CPUs can be put to idle.
3543 *
3544 * Returns 1 if there is potential to perform power-savings balance.
3545 * Else returns 0.
3546 */
3547static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3548 int this_cpu, unsigned long *imbalance)
3549{
3550 if (!sds->power_savings_balance)
3551 return 0;
3552
3553 if (sds->this != sds->group_leader ||
3554 sds->group_leader == sds->group_min)
3555 return 0;
3556
3557 *imbalance = sds->min_load_per_task;
3558 sds->busiest = sds->group_min;
3559
3560 return 1;
3561
3562}
3563#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3564static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3565 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3566{
3567 return;
3568}
3569
/* Stub for !CONFIG_SCHED_MC && !CONFIG_SCHED_SMT: nothing to update. */
static inline void update_sd_power_savings_stats(struct sched_group *group,
	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
}
3575
/*
 * Stub for !CONFIG_SCHED_MC && !CONFIG_SCHED_SMT: always reports that no
 * power-savings balance is possible and leaves *imbalance untouched.
 */
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
	int this_cpu, unsigned long *imbalance)
{
	return 0;
}
3581#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3582
3583
3584unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3585{
3586 return SCHED_LOAD_SCALE;
3587}
3588
3589unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3590{
3591 return default_scale_freq_power(sd, cpu);
3592}
3593
3594unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3595{
3596 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3597 unsigned long smt_gain = sd->smt_gain;
3598
3599 smt_gain /= weight;
3600
3601 return smt_gain;
3602}
3603
3604unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3605{
3606 return default_scale_smt_power(sd, cpu);
3607}
3608
3609unsigned long scale_rt_power(int cpu)
3610{
3611 struct rq *rq = cpu_rq(cpu);
3612 u64 total, available;
3613
3614 sched_avg_update(rq);
3615
3616 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3617 available = total - rq->rt_avg;
3618
3619 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3620 total = SCHED_LOAD_SCALE;
3621
3622 total >>= SCHED_LOAD_SHIFT;
3623
3624 return div_u64(available, total);
3625}
3626
3627static void update_cpu_power(struct sched_domain *sd, int cpu)
3628{
3629 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3630 unsigned long power = SCHED_LOAD_SCALE;
3631 struct sched_group *sdg = sd->groups;
3632
3633 if (sched_feat(ARCH_POWER))
3634 power *= arch_scale_freq_power(sd, cpu);
3635 else
3636 power *= default_scale_freq_power(sd, cpu);
3637
3638 power >>= SCHED_LOAD_SHIFT;
3639
3640 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3641 if (sched_feat(ARCH_POWER))
3642 power *= arch_scale_smt_power(sd, cpu);
3643 else
3644 power *= default_scale_smt_power(sd, cpu);
3645
3646 power >>= SCHED_LOAD_SHIFT;
3647 }
3648
3649 power *= scale_rt_power(cpu);
3650 power >>= SCHED_LOAD_SHIFT;
3651
3652 if (!power)
3653 power = 1;
3654
3655 sdg->cpu_power = power;
3656}
3657
3658static void update_group_power(struct sched_domain *sd, int cpu)
3659{
3660 struct sched_domain *child = sd->child;
3661 struct sched_group *group, *sdg = sd->groups;
3662 unsigned long power;
3663
3664 if (!child) {
3665 update_cpu_power(sd, cpu);
3666 return;
3667 }
3668
3669 power = 0;
3670
3671 group = child->groups;
3672 do {
3673 power += group->cpu_power;
3674 group = group->next;
3675 } while (group != child->groups);
3676
3677 sdg->cpu_power = power;
3678}
3679
3680/**
3681 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3682 * @sd: The sched_domain whose statistics are to be updated.
3683 * @group: sched_group whose statistics are to be updated.
3684 * @this_cpu: Cpu for which load balance is currently performed.
3685 * @idle: Idle status of this_cpu
3686 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3687 * @sd_idle: Idle status of the sched_domain containing group.
3688 * @local_group: Does group contain this_cpu.
3689 * @cpus: Set of cpus considered for load balancing.
3690 * @balance: Should we balance.
3691 * @sgs: variable to hold the statistics for this group.
3692 */
3693static inline void update_sg_lb_stats(struct sched_domain *sd,
3694 struct sched_group *group, int this_cpu,
3695 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3696 int local_group, const struct cpumask *cpus,
3697 int *balance, struct sg_lb_stats *sgs)
3698{
3699 unsigned long load, max_cpu_load, min_cpu_load;
3700 int i;
3701 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3702 unsigned long sum_avg_load_per_task;
3703 unsigned long avg_load_per_task;
3704
3705 if (local_group) {
3706 balance_cpu = group_first_cpu(group);
3707 if (balance_cpu == this_cpu)
3708 update_group_power(sd, this_cpu);
3709 }
3710
3711 /* Tally up the load of all CPUs in the group */
3712 sum_avg_load_per_task = avg_load_per_task = 0;
3713 max_cpu_load = 0;
3714 min_cpu_load = ~0UL;
3715
3716 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3717 struct rq *rq = cpu_rq(i);
3718
3719 if (*sd_idle && rq->nr_running)
3720 *sd_idle = 0;
3721
3722 /* Bias balancing toward cpus of our domain */
3723 if (local_group) {
3724 if (idle_cpu(i) && !first_idle_cpu) {
3725 first_idle_cpu = 1;
3726 balance_cpu = i;
3727 }
3728
3729 load = target_load(i, load_idx);
3730 } else {
3731 load = source_load(i, load_idx);
3732 if (load > max_cpu_load)
3733 max_cpu_load = load;
3734 if (min_cpu_load > load)
3735 min_cpu_load = load;
3736 }
3737
3738 sgs->group_load += load;
3739 sgs->sum_nr_running += rq->nr_running;
3740 sgs->sum_weighted_load += weighted_cpuload(i);
3741
3742 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3743 }
3744
3745 /*
3746 * First idle cpu or the first cpu(busiest) in this sched group
3747 * is eligible for doing load balancing at this and above
3748 * domains. In the newly idle case, we will allow all the cpu's
3749 * to do the newly idle load balance.
3750 */
3751 if (idle != CPU_NEWLY_IDLE && local_group &&
3752 balance_cpu != this_cpu && balance) {
3753 *balance = 0;
3754 return;
3755 }
3756
3757 /* Adjust by relative CPU power of the group */
3758 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3759
3760
3761 /*
3762 * Consider the group unbalanced when the imbalance is larger
3763 * than the average weight of two tasks.
3764 *
3765 * APZ: with cgroup the avg task weight can vary wildly and
3766 * might not be a suitable number - should we keep a
3767 * normalized nr_running number somewhere that negates
3768 * the hierarchy?
3769 */
3770 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3771 group->cpu_power;
3772
3773 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3774 sgs->group_imb = 1;
3775
3776 sgs->group_capacity =
3777 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3778}
3779
3780/**
3781 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3782 * @sd: sched_domain whose statistics are to be updated.
3783 * @this_cpu: Cpu for which load balance is currently performed.
3784 * @idle: Idle status of this_cpu
3785 * @sd_idle: Idle status of the sched_domain containing group.
3786 * @cpus: Set of cpus considered for load balancing.
3787 * @balance: Should we balance.
3788 * @sds: variable to hold the statistics for this sched_domain.
3789 */
3790static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3791 enum cpu_idle_type idle, int *sd_idle,
3792 const struct cpumask *cpus, int *balance,
3793 struct sd_lb_stats *sds)
3794{
3795 struct sched_domain *child = sd->child;
3796 struct sched_group *group = sd->groups;
3797 struct sg_lb_stats sgs;
3798 int load_idx, prefer_sibling = 0;
3799
3800 if (child && child->flags & SD_PREFER_SIBLING)
3801 prefer_sibling = 1;
3802
3803 init_sd_power_savings_stats(sd, sds, idle);
3804 load_idx = get_sd_load_idx(sd, idle);
3805
3806 do {
3807 int local_group;
3808
3809 local_group = cpumask_test_cpu(this_cpu,
3810 sched_group_cpus(group));
3811 memset(&sgs, 0, sizeof(sgs));
3812 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3813 local_group, cpus, balance, &sgs);
3814
3815 if (local_group && balance && !(*balance))
3816 return;
3817
3818 sds->total_load += sgs.group_load;
3819 sds->total_pwr += group->cpu_power;
3820
3821 /*
3822 * In case the child domain prefers tasks go to siblings
3823 * first, lower the group capacity to one so that we'll try
3824 * and move all the excess tasks away.
3825 */
3826 if (prefer_sibling)
3827 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3828
3829 if (local_group) {
3830 sds->this_load = sgs.avg_load;
3831 sds->this = group;
3832 sds->this_nr_running = sgs.sum_nr_running;
3833 sds->this_load_per_task = sgs.sum_weighted_load;
3834 } else if (sgs.avg_load > sds->max_load &&
3835 (sgs.sum_nr_running > sgs.group_capacity ||
3836 sgs.group_imb)) {
3837 sds->max_load = sgs.avg_load;
3838 sds->busiest = group;
3839 sds->busiest_nr_running = sgs.sum_nr_running;
3840 sds->busiest_load_per_task = sgs.sum_weighted_load;
3841 sds->group_imb = sgs.group_imb;
3842 }
3843
3844 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3845 group = group->next;
3846 } while (group != sd->groups);
3847}
3848
3849/**
3850 * fix_small_imbalance - Calculate the minor imbalance that exists
3851 * amongst the groups of a sched_domain, during
3852 * load balancing.
3853 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3854 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3855 * @imbalance: Variable to store the imbalance.
3856 */
3857static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3858 int this_cpu, unsigned long *imbalance)
3859{
3860 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3861 unsigned int imbn = 2;
3862
3863 if (sds->this_nr_running) {
3864 sds->this_load_per_task /= sds->this_nr_running;
3865 if (sds->busiest_load_per_task >
3866 sds->this_load_per_task)
3867 imbn = 1;
3868 } else
3869 sds->this_load_per_task =
3870 cpu_avg_load_per_task(this_cpu);
3871
3872 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3873 sds->busiest_load_per_task * imbn) {
3874 *imbalance = sds->busiest_load_per_task;
3875 return;
3876 }
3877
3878 /*
3879 * OK, we don't have enough imbalance to justify moving tasks,
3880 * however we may be able to increase total CPU power used by
3881 * moving them.
3882 */
3883
3884 pwr_now += sds->busiest->cpu_power *
3885 min(sds->busiest_load_per_task, sds->max_load);
3886 pwr_now += sds->this->cpu_power *
3887 min(sds->this_load_per_task, sds->this_load);
3888 pwr_now /= SCHED_LOAD_SCALE;
3889
3890 /* Amount of load we'd subtract */
3891 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3892 sds->busiest->cpu_power;
3893 if (sds->max_load > tmp)
3894 pwr_move += sds->busiest->cpu_power *
3895 min(sds->busiest_load_per_task, sds->max_load - tmp);
3896
3897 /* Amount of load we'd add */
3898 if (sds->max_load * sds->busiest->cpu_power <
3899 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3900 tmp = (sds->max_load * sds->busiest->cpu_power) /
3901 sds->this->cpu_power;
3902 else
3903 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3904 sds->this->cpu_power;
3905 pwr_move += sds->this->cpu_power *
3906 min(sds->this_load_per_task, sds->this_load + tmp);
3907 pwr_move /= SCHED_LOAD_SCALE;
3908
3909 /* Move if we gain throughput */
3910 if (pwr_move > pwr_now)
3911 *imbalance = sds->busiest_load_per_task;
3912}
3913
3914/**
3915 * calculate_imbalance - Calculate the amount of imbalance present within the
3916 * groups of a given sched_domain during load balance.
3917 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3918 * @this_cpu: Cpu for which currently load balance is being performed.
3919 * @imbalance: The variable to store the imbalance.
3920 */
3921static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3922 unsigned long *imbalance)
3923{
3924 unsigned long max_pull;
3925 /*
3926 * In the presence of smp nice balancing, certain scenarios can have
3927 * max load less than avg load(as we skip the groups at or below
3928 * its cpu_power, while calculating max_load..)
3929 */
3930 if (sds->max_load < sds->avg_load) {
3931 *imbalance = 0;
3932 return fix_small_imbalance(sds, this_cpu, imbalance);
3933 }
3934
3935 /* Don't want to pull so many tasks that a group would go idle */
3936 max_pull = min(sds->max_load - sds->avg_load,
3937 sds->max_load - sds->busiest_load_per_task);
3938
3939 /* How much load to actually move to equalise the imbalance */
3940 *imbalance = min(max_pull * sds->busiest->cpu_power,
3941 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3942 / SCHED_LOAD_SCALE;
3943
3944 /*
3945 * if *imbalance is less than the average load per runnable task
3946 * there is no gaurantee that any tasks will be moved so we'll have
3947 * a think about bumping its value to force at least one task to be
3948 * moved
3949 */
3950 if (*imbalance < sds->busiest_load_per_task)
3951 return fix_small_imbalance(sds, this_cpu, imbalance);
3952
3953}
3954/******* find_busiest_group() helpers end here *********************/
3955
3956/**
3957 * find_busiest_group - Returns the busiest group within the sched_domain
3958 * if there is an imbalance. If there isn't an imbalance, and
3959 * the user has opted for power-savings, it returns a group whose
3960 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3961 * such a group exists.
3962 *
3963 * Also calculates the amount of weighted load which should be moved
3964 * to restore balance.
3965 *
3966 * @sd: The sched_domain whose busiest group is to be returned.
3967 * @this_cpu: The cpu for which load balancing is currently being performed.
3968 * @imbalance: Variable which stores amount of weighted load which should
3969 * be moved to restore balance/put a group to idle.
3970 * @idle: The idle status of this_cpu.
3971 * @sd_idle: The idleness of sd
3972 * @cpus: The set of CPUs under consideration for load-balancing.
3973 * @balance: Pointer to a variable indicating if this_cpu
3974 * is the appropriate cpu to perform load balancing at this_level.
3975 *
3976 * Returns: - the busiest group if imbalance exists.
3977 * - If no imbalance and user has opted for power-savings balance,
3978 * return the least loaded group whose CPUs can be
3979 * put to idle by rebalancing its tasks onto our group.
3980 */
3981static struct sched_group *
3982find_busiest_group(struct sched_domain *sd, int this_cpu,
3983 unsigned long *imbalance, enum cpu_idle_type idle,
3984 int *sd_idle, const struct cpumask *cpus, int *balance)
3985{
3986 struct sd_lb_stats sds;
3987
3988 memset(&sds, 0, sizeof(sds));
3989
3990 /*
3991 * Compute the various statistics relavent for load balancing at
3992 * this level.
3993 */
3994 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
3995 balance, &sds);
3996
3997 /* Cases where imbalance does not exist from POV of this_cpu */
3998 /* 1) this_cpu is not the appropriate cpu to perform load balancing
3999 * at this level.
4000 * 2) There is no busy sibling group to pull from.
4001 * 3) This group is the busiest group.
4002 * 4) This group is more busy than the avg busieness at this
4003 * sched_domain.
4004 * 5) The imbalance is within the specified limit.
4005 * 6) Any rebalance would lead to ping-pong
4006 */
4007 if (balance && !(*balance))
4008 goto ret;
4009
4010 if (!sds.busiest || sds.busiest_nr_running == 0)
4011 goto out_balanced;
4012
4013 if (sds.this_load >= sds.max_load)
4014 goto out_balanced;
4015
4016 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4017
4018 if (sds.this_load >= sds.avg_load)
4019 goto out_balanced;
4020
4021 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4022 goto out_balanced;
4023
4024 sds.busiest_load_per_task /= sds.busiest_nr_running;
4025 if (sds.group_imb)
4026 sds.busiest_load_per_task =
4027 min(sds.busiest_load_per_task, sds.avg_load);
4028
4029 /*
4030 * We're trying to get all the cpus to the average_load, so we don't
4031 * want to push ourselves above the average load, nor do we wish to
4032 * reduce the max loaded cpu below the average load, as either of these
4033 * actions would just result in more rebalancing later, and ping-pong
4034 * tasks around. Thus we look for the minimum possible imbalance.
4035 * Negative imbalances (*we* are more loaded than anyone else) will
4036 * be counted as no imbalance for these purposes -- we can't fix that
4037 * by pulling tasks to us. Be careful of negative numbers as they'll
4038 * appear as very large values with unsigned longs.
4039 */
4040 if (sds.max_load <= sds.busiest_load_per_task)
4041 goto out_balanced;
4042
4043 /* Looks like there is an imbalance. Compute it */
4044 calculate_imbalance(&sds, this_cpu, imbalance);
4045 return sds.busiest;
4046
4047out_balanced:
4048 /*
4049 * There is no obvious imbalance. But check if we can do some balancing
4050 * to save power.
4051 */
4052 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4053 return sds.busiest;
4054ret:
4055 *imbalance = 0;
4056 return NULL;
4057}
4058
4059/*
4060 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4061 */
4062static struct rq *
4063find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4064 unsigned long imbalance, const struct cpumask *cpus)
4065{
4066 struct rq *busiest = NULL, *rq;
4067 unsigned long max_load = 0;
4068 int i;
4069
4070 for_each_cpu(i, sched_group_cpus(group)) {
4071 unsigned long power = power_of(i);
4072 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4073 unsigned long wl;
4074
4075 if (!cpumask_test_cpu(i, cpus))
4076 continue;
4077
4078 rq = cpu_rq(i);
4079 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4080 wl /= power;
4081
4082 if (capacity && rq->nr_running == 1 && wl > imbalance)
4083 continue;
4084
4085 if (wl > max_load) {
4086 max_load = wl;
4087 busiest = rq;
4088 }
4089 }
4090
4091 return busiest;
4092}
4093
4094/*
4095 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4096 * so long as it is large enough.
4097 */
4098#define MAX_PINNED_INTERVAL 512
4099
4100/* Working cpumask for load_balance and load_balance_newidle. */
4101static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4102
4103/*
4104 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4105 * tasks if there is an imbalance.
 *
 * Returns ld_moved: the result of move_tasks() on the main path (0 if it was
 * not called), or 0 on the balanced/pinned exits. In either case a zero
 * result is turned into -1 when sd_idle is clear, @sd has SD_SHARE_CPUPOWER
 * set and its parent does not have SD_POWERSAVINGS_BALANCE.
 *
 * @balance is handed to find_busiest_group(); if *@balance is 0 afterwards
 * the domain is treated as balanced.
4106 */
4107static int load_balance(int this_cpu, struct rq *this_rq,
4108 struct sched_domain *sd, enum cpu_idle_type idle,
4109 int *balance)
4110{
4111 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4112 struct sched_group *group;
4113 unsigned long imbalance;
4114 struct rq *busiest;
4115 unsigned long flags;
4116 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4117
 /* Start from all active cpus; pinned-only cpus are removed on "redo". */
4118 cpumask_copy(cpus, cpu_active_mask);
4119
4120 /*
4121 * When power savings policy is enabled for the parent domain, idle
4122 * sibling can pick up load irrespective of busy siblings. In this case,
4123 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4124 * portraying it as CPU_NOT_IDLE.
4125 */
4126 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4127 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4128 sd_idle = 1;
4129
4130 schedstat_inc(sd, lb_count[idle]);
4131
4132redo:
4133 update_shares(sd);
4134 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4135 cpus, balance);
4136
4137 if (*balance == 0)
4138 goto out_balanced;
4139
4140 if (!group) {
4141 schedstat_inc(sd, lb_nobusyg[idle]);
4142 goto out_balanced;
4143 }
4144
4145 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4146 if (!busiest) {
4147 schedstat_inc(sd, lb_nobusyq[idle]);
4148 goto out_balanced;
4149 }
4150
4151 BUG_ON(busiest == this_rq);
4152
4153 schedstat_add(sd, lb_imbalance[idle], imbalance);
4154
4155 ld_moved = 0;
4156 if (busiest->nr_running > 1) {
4157 /*
4158 * Attempt to move tasks. If find_busiest_group has found
4159 * an imbalance but busiest->nr_running <= 1, the group is
4160 * still unbalanced. ld_moved simply stays zero, so it is
4161 * correctly treated as an imbalance.
4162 */
4163 local_irq_save(flags);
4164 double_rq_lock(this_rq, busiest);
4165 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4166 imbalance, sd, idle, &all_pinned);
4167 double_rq_unlock(this_rq, busiest);
4168 local_irq_restore(flags);
4169
4170 /*
4171 * some other cpu did the load balance for us.
4172 */
4173 if (ld_moved && this_cpu != smp_processor_id())
4174 resched_cpu(this_cpu);
4175
4176 /* All tasks on this runqueue were pinned by CPU affinity */
4177 if (unlikely(all_pinned)) {
 /* Drop that cpu from the candidates and retry with the rest. */
4178 cpumask_clear_cpu(cpu_of(busiest), cpus);
4179 if (!cpumask_empty(cpus))
4180 goto redo;
4181 goto out_balanced;
4182 }
4183 }
4184
4185 if (!ld_moved) {
4186 schedstat_inc(sd, lb_failed[idle]);
4187 sd->nr_balance_failed++;
4188
4189 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4190
4191 raw_spin_lock_irqsave(&busiest->lock, flags);
4192
4193 /* don't kick the migration_thread, if the curr
4194 * task on busiest cpu can't be moved to this_cpu
4195 */
4196 if (!cpumask_test_cpu(this_cpu,
4197 &busiest->curr->cpus_allowed)) {
4198 raw_spin_unlock_irqrestore(&busiest->lock,
4199 flags);
4200 all_pinned = 1;
4201 goto out_one_pinned;
4202 }
4203
 /* Request an active push to this_cpu unless one is already pending. */
4204 if (!busiest->active_balance) {
4205 busiest->active_balance = 1;
4206 busiest->push_cpu = this_cpu;
4207 active_balance = 1;
4208 }
4209 raw_spin_unlock_irqrestore(&busiest->lock, flags);
 /* The wakeup is issued only after busiest->lock has been dropped. */
4210 if (active_balance)
4211 wake_up_process(busiest->migration_thread);
4212
4213 /*
4214 * We've kicked active balancing, reset the failure
4215 * counter.
4216 */
4217 sd->nr_balance_failed = sd->cache_nice_tries+1;
4218 }
4219 } else
4220 sd->nr_balance_failed = 0;
4221
4222 if (likely(!active_balance)) {
4223 /* We were unbalanced, so reset the balancing interval */
4224 sd->balance_interval = sd->min_interval;
4225 } else {
4226 /*
4227 * If we've begun active balancing, start to back off. This
4228 * case may not be covered by the all_pinned logic if there
4229 * is only 1 task on the busy runqueue (because we don't call
4230 * move_tasks).
4231 */
4232 if (sd->balance_interval < sd->max_interval)
4233 sd->balance_interval *= 2;
4234 }
4235
4236 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4237 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4238 ld_moved = -1;
4239
4240 goto out;
4241
4242out_balanced:
4243 schedstat_inc(sd, lb_balanced[idle]);
4244
4245 sd->nr_balance_failed = 0;
4246
4247out_one_pinned:
4248 /* tune up the balancing interval */
4249 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4250 (sd->balance_interval < sd->max_interval))
4251 sd->balance_interval *= 2;
4252
4253 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4254 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4255 ld_moved = -1;
4256 else
4257 ld_moved = 0;
4258out:
 /* Any non-zero result, including -1, triggers a shares update. */
4259 if (ld_moved)
4260 update_shares(sd);
4261 return ld_moved;
4262}
4263
4264/*
4265 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4266 * tasks if there is an imbalance.
4267 *
4268 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4269 * this_rq is locked.
 *
 * Returns the result of move_tasks() when it is non-zero. When nothing was
 * moved, returns -1 if one of the early-exit checks below applies, otherwise
 * 0 after optionally requesting an active balance from the busiest runqueue.
 * On the balanced path, returns -1 or 0 depending on the same sd_idle /
 * SD_SHARE_CPUPOWER / parent SD_POWERSAVINGS_BALANCE test.
4270 */
4271static int
4272load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4273{
4274 struct sched_group *group;
4275 struct rq *busiest = NULL;
4276 unsigned long imbalance;
4277 int ld_moved = 0;
4278 int sd_idle = 0;
4279 int all_pinned = 0;
4280 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4281
4282 cpumask_copy(cpus, cpu_active_mask);
4283
4284 /*
4285 * When power savings policy is enabled for the parent domain, idle
4286 * sibling can pick up load irrespective of busy siblings. In this case,
4287 * let the state of idle sibling percolate up as IDLE, instead of
4288 * portraying it as CPU_NOT_IDLE.
4289 */
4290 if (sd->flags & SD_SHARE_CPUPOWER &&
4291 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4292 sd_idle = 1;
4293
4294 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4295redo:
4296 update_shares_locked(this_rq, sd);
4297 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4298 &sd_idle, cpus, NULL);
4299 if (!group) {
4300 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4301 goto out_balanced;
4302 }
4303
4304 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4305 if (!busiest) {
4306 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4307 goto out_balanced;
4308 }
4309
4310 BUG_ON(busiest == this_rq);
4311
4312 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4313
4314 ld_moved = 0;
4315 if (busiest->nr_running > 1) {
4316 /* Attempt to move tasks */
4317 double_lock_balance(this_rq, busiest);
4318 /* this_rq->clock is already updated */
4319 update_rq_clock(busiest);
4320 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4321 imbalance, sd, CPU_NEWLY_IDLE,
4322 &all_pinned);
4323 double_unlock_balance(this_rq, busiest);
4324
 /*
 * Everything on busiest was pinned: drop that cpu and retry. If no
 * candidate cpu is left, fall through to the !ld_moved handling.
 */
4325 if (unlikely(all_pinned)) {
4326 cpumask_clear_cpu(cpu_of(busiest), cpus);
4327 if (!cpumask_empty(cpus))
4328 goto redo;
4329 }
4330 }
4331
4332 if (!ld_moved) {
4333 int active_balance = 0;
4334
4335 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4336 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4337 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4338 return -1;
4339
 /* Active balancing below is only attempted at the WAKEUP level. */
4340 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4341 return -1;
4342
 /* Require repeated failures before kicking the migration thread. */
4343 if (sd->nr_balance_failed++ < 2)
4344 return -1;
4345
4346 /*
4347 * The only task running in a non-idle cpu can be moved to this
4348 * cpu in an attempt to completely free up the other CPU
4349 * package. The same method used to move task in load_balance()
4350 * has been extended for load_balance_newidle() to speed up
4351 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
4352 *
4353 * The package power saving logic comes from
4354 * find_busiest_group(). If there is no imbalance, then
4355 * f_b_g() will return NULL. However when sched_mc={1,2} then
4356 * f_b_g() will select a group from which a running task may be
4357 * pulled to this cpu in order to make the other package idle.
4358 * If there is no opportunity to make a package idle and if
4359 * there is no imbalance, then f_b_g() will return NULL and no
4360 * action will be taken in load_balance_newidle().
4361 *
4362 * Under normal task pull operation due to imbalance, there
4363 * will be more than one task in the source run queue and
4364 * move_tasks() will succeed. ld_moved will be true and this
4365 * active balance code will not be triggered.
4366 */
4367
4368 /* Lock busiest in correct order while this_rq is held */
4369 double_lock_balance(this_rq, busiest);
4370
4371 /*
4372 * don't kick the migration_thread, if the curr
4373 * task on busiest cpu can't be moved to this_cpu
4374 */
4375 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4376 double_unlock_balance(this_rq, busiest);
 /*
 * NOTE(review): all_pinned is a local that is not read
 * after this store; the assignment has no effect here.
 */
4377 all_pinned = 1;
4378 return ld_moved;
4379 }
4380
4381 if (!busiest->active_balance) {
4382 busiest->active_balance = 1;
4383 busiest->push_cpu = this_cpu;
4384 active_balance = 1;
4385 }
4386
4387 double_unlock_balance(this_rq, busiest);
4388 /*
4389 * Should not call ttwu while holding a rq->lock
4390 */
4391 raw_spin_unlock(&this_rq->lock);
4392 if (active_balance)
4393 wake_up_process(busiest->migration_thread);
4394 raw_spin_lock(&this_rq->lock);
4395
4396 } else
4397 sd->nr_balance_failed = 0;
4398
4399 update_shares_locked(this_rq, sd);
4400 return ld_moved;
4401
4402out_balanced:
4403 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4404 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4405 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4406 return -1;
4407 sd->nr_balance_failed = 0;
4408
4409 return 0;
4410}
4411
4412/*
4413 * idle_balance is called by schedule() if this_cpu is about to become
4414 * idle. Attempts to pull tasks from other CPUs.
 *
 * Records the start of the idle period in this_rq->idle_stamp, then walks
 * the cpu's domains and runs load_balance_newidle() on those that have
 * SD_BALANCE_NEWIDLE set. The walk stops at the first domain for which
 * load_balance_newidle() returns non-zero (this includes -1), and
 * idle_stamp is cleared in that case.
4415 */
4416static void idle_balance(int this_cpu, struct rq *this_rq)
4417{
4418 struct sched_domain *sd;
4419 int pulled_task = 0;
4420 unsigned long next_balance = jiffies + HZ;
4421
4422 this_rq->idle_stamp = this_rq->clock;
4423
 /* Skip balancing when the average idle time is below the migration cost. */
4424 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4425 return;
4426
4427 for_each_domain(this_cpu, sd) {
4428 unsigned long interval;
4429
4430 if (!(sd->flags & SD_LOAD_BALANCE))
4431 continue;
4432
4433 if (sd->flags & SD_BALANCE_NEWIDLE)
4434 /* If we've pulled tasks over stop searching: */
4435 pulled_task = load_balance_newidle(this_cpu, this_rq,
4436 sd);
4437
 /* Track the earliest time any visited domain is due for balancing. */
4438 interval = msecs_to_jiffies(sd->balance_interval);
4439 if (time_after(next_balance, sd->last_balance + interval))
4440 next_balance = sd->last_balance + interval;
4441 if (pulled_task) {
4442 this_rq->idle_stamp = 0;
4443 break;
4444 }
4445 }
4446 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4447 /*
4448 * We are going idle. next_balance may be set based on
4449 * a busy processor. So reset next_balance.
4450 */
4451 this_rq->next_balance = next_balance;
4452 }
4453}
4454
4455/*
4456 * active_load_balance is run by migration threads. It pushes running tasks
4457 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4458 * running on each physical CPU where possible, and avoids physical /
4459 * logical imbalances.
4460 *
4461 * Called with busiest_rq locked.
 *
 * The destination is busiest_rq->push_cpu. At most one task is moved, via
 * move_one_task(), and only if a domain of the target cpu with
 * SD_LOAD_BALANCE set spans @busiest_cpu.
4462 */
4463static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4464{
4465 int target_cpu = busiest_rq->push_cpu;
4466 struct sched_domain *sd;
4467 struct rq *target_rq;
4468
4469 /* Is there any task to move? */
4470 if (busiest_rq->nr_running <= 1)
4471 return;
4472
4473 target_rq = cpu_rq(target_cpu);
4474
4475 /*
4476 * This condition is "impossible", if it occurs
4477 * we need to fix it. Originally reported by
4478 * Bjorn Helgaas on a 128-cpu setup.
4479 */
4480 BUG_ON(busiest_rq == target_rq);
4481
4482 /* move a task from busiest_rq to target_rq */
4483 double_lock_balance(busiest_rq, target_rq);
4484 update_rq_clock(busiest_rq);
4485 update_rq_clock(target_rq);
4486
4487 /* Search for an sd spanning us and the target CPU. */
4488 for_each_domain(target_cpu, sd) {
4489 if ((sd->flags & SD_LOAD_BALANCE) &&
4490 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4491 break;
4492 }
4493
 /* sd is NULL here when the walk ended without finding such a domain. */
4494 if (likely(sd)) {
4495 schedstat_inc(sd, alb_count);
4496
4497 if (move_one_task(target_rq, target_cpu, busiest_rq,
4498 sd, CPU_IDLE))
4499 schedstat_inc(sd, alb_pushed);
4500 else
4501 schedstat_inc(sd, alb_failed);
4502 }
4503 double_unlock_balance(busiest_rq, target_rq);
4504}
4505
4506#ifdef CONFIG_NO_HZ
/*
 * State shared by the idle load balancing (ilb) code:
 * load_balancer     - cpu id of the current ilb owner, -1 when there is none
 * cpu_mask          - cpus that called select_nohz_load_balancer() with
 *                     stop_tick set and have not cleared themselves since
 * ilb_grp_nohz_mask - scratch mask written by is_semi_idle_group()
 */
4507static struct {
4508 atomic_t load_balancer;
4509 cpumask_var_t cpu_mask;
4510 cpumask_var_t ilb_grp_nohz_mask;
4511} nohz ____cacheline_aligned = {
4512 .load_balancer = ATOMIC_INIT(-1),
4513};
4514
/* Returns the current value of nohz.load_balancer (-1 when no owner is set). */
4515int get_nohz_load_balancer(void)
4516{
4517 return atomic_read(&nohz.load_balancer);
4518}
4519
4520#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4521/**
4522 * lowest_flag_domain - Return lowest sched_domain containing flag.
4523 * @cpu: The cpu whose lowest level of sched domain is to
4524 * be returned.
4525 * @flag: The flag to check for the lowest sched_domain
4526 * for the given cpu.
4527 *
4528 * Returns the lowest sched_domain of a cpu which contains the given flag.
 * If the walk finishes without a match, the value left in the iterator by
 * for_each_domain() is returned (presumably NULL - confirm against the
 * for_each_domain() definition).
4529 */
4530static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4531{
4532 struct sched_domain *sd;
4533
4534 for_each_domain(cpu, sd)
4535 if (sd && (sd->flags & flag))
4536 break;
4537
4538 return sd;
4539}
4540
4541/**
4542 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4543 * @cpu: The cpu whose domains we're iterating over.
4544 * @sd: variable holding the value of the power_savings_sd
4545 * for cpu.
4546 * @flag: The flag to filter the sched_domains to be iterated.
4547 *
4548 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4549 * set, starting from the lowest sched_domain to the highest.
 *
 * The iteration starts at lowest_flag_domain(cpu, flag), follows sd->parent
 * and stops at the first domain that does not have 'flag' set.
 *
 * NOTE(review): 'sd' and 'flag' are expanded without parentheses, so callers
 * should pass a plain variable and a simple flag expression.
4550 */
4551#define for_each_flag_domain(cpu, sd, flag) \
4552 for (sd = lowest_flag_domain(cpu, flag); \
4553 (sd && (sd->flags & flag)); sd = sd->parent)
4554
4555/**
4556 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4557 * @ilb_group: group to be checked for semi-idleness
4558 *
4559 * Returns: 1 if the group is semi-idle. 0 otherwise.
4560 *
4561 * We define a sched_group to be semi idle if it has at least one idle-CPU
4562 * and at least one non-idle CPU. This helper function checks if the given
4563 * sched_group is semi-idle or not.
 *
 * Side effect: nohz.ilb_grp_nohz_mask is overwritten with the intersection
 * of nohz.cpu_mask and the cpus of @ilb_group.
4564 */
4565static inline int is_semi_idle_group(struct sched_group *ilb_group)
4566{
4567 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4568 sched_group_cpus(ilb_group));
4569
4570 /*
4571 * A sched_group is semi-idle when it has at least one busy cpu
4572 * and at least one idle cpu.
4573 */
 /* No cpu of the group is in nohz.cpu_mask. */
4574 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4575 return 0;
4576
 /* Every cpu of the group is in nohz.cpu_mask. */
4577 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4578 return 0;
4579
4580 return 1;
4581}
4582/**
4583 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4584 * @cpu: The cpu which is nominating a new idle_load_balancer.
4585 *
4586 * Returns: Returns the id of the idle load balancer if it exists,
4587 * Else, returns >= nr_cpu_ids.
4588 *
4589 * This algorithm picks the idle load balancer such that it belongs to a
4590 * semi-idle powersavings sched_domain. The idea is to try and avoid
4591 * completely idle packages/cores just for the purpose of idle load balancing
4592 * when there are other idle cpu's which are better suited for that job.
 *
 * When no semi-idle group is found, or power-aware balancing is disabled,
 * or fewer than two cpus are set in nohz.cpu_mask, the first cpu set in
 * nohz.cpu_mask is returned instead.
4593 */
4594static int find_new_ilb(int cpu)
4595{
4596 struct sched_domain *sd;
4597 struct sched_group *ilb_group;
4598
4599 /*
4600 * Have idle load balancer selection from semi-idle packages only
4601 * when power-aware load balancing is enabled
4602 */
4603 if (!(sched_smt_power_savings || sched_mc_power_savings))
4604 goto out_done;
4605
4606 /*
4607 * Optimize for the case when we have no idle CPUs or only one
4608 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4609 */
4610 if (cpumask_weight(nohz.cpu_mask) < 2)
4611 goto out_done;
4612
4613 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4614 ilb_group = sd->groups;
4615
 /* Visit every group of this domain once, following the ->next links. */
4616 do {
 /*
 * is_semi_idle_group() leaves the group's nohz cpus in
 * nohz.ilb_grp_nohz_mask; pick the first of them.
 */
4617 if (is_semi_idle_group(ilb_group))
4618 return cpumask_first(nohz.ilb_grp_nohz_mask);
4619
4620 ilb_group = ilb_group->next;
4621
4622 } while (ilb_group != sd->groups);
4623 }
4624
4625out_done:
4626 return cpumask_first(nohz.cpu_mask);
4627}
4628#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
/*
 * Variant used when neither CONFIG_SCHED_MC nor CONFIG_SCHED_SMT is set:
 * returns the first cpu set in nohz.cpu_mask. @call_cpu is not used.
 */
4629static inline int find_new_ilb(int call_cpu)
4630{
4631 return cpumask_first(nohz.cpu_mask);
4632}
4633#endif
4634
4635/*
4636 * This routine will try to nominate the ilb (idle load balancing)
4637 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4638 * load balancing on behalf of all those cpus. If all the cpus in the system
4639 * go into this tickless mode, then there will be no ilb owner (as there is
4640 * no need for one) and all the cpus will sleep till the next wakeup event
4641 * arrives...
4642 *
4643 * For the ilb owner, tick is not stopped. And this tick will be used
4644 * for idle load balancing. ilb owner will still be part of
4645 * nohz.cpu_mask..
4646 *
4647 * While stopping the tick, this cpu will become the ilb owner if there
4648 * is no other owner. And will be the owner till that cpu becomes busy
4649 * or if all cpus in the system stop their ticks at which point
4650 * there is no need for ilb owner.
4651 *
4652 * When the ilb owner becomes busy, it nominates another owner, during the
4653 * next busy scheduler_tick()
 *
 * Returns 1 when the calling cpu becomes or remains the ilb owner, 0 in
 * every other case (including all calls with @stop_tick == 0).
4654 */
4655int select_nohz_load_balancer(int stop_tick)
4656{
4657 int cpu = smp_processor_id();
4658
4659 if (stop_tick) {
4660 cpu_rq(cpu)->in_nohz_recently = 1;
4661
4662 if (!cpu_active(cpu)) {
4663 if (atomic_read(&nohz.load_balancer) != cpu)
4664 return 0;
4665
4666 /*
4667 * If we are going offline and still the leader,
4668 * give up!
4669 */
4670 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4671 BUG();
4672
4673 return 0;
4674 }
4675
4676 cpumask_set_cpu(cpu, nohz.cpu_mask);
4677
4678 /* time for ilb owner also to sleep */
4679 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4680 if (atomic_read(&nohz.load_balancer) == cpu)
4681 atomic_set(&nohz.load_balancer, -1);
4682 return 0;
4683 }
4684
4685 if (atomic_read(&nohz.load_balancer) == -1) {
4686 /* make me the ilb owner */
 /* If the cmpxchg fails, fall through to the final "return 0". */
4687 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4688 return 1;
4689 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4690 int new_ilb;
4691
4692 if (!(sched_smt_power_savings ||
4693 sched_mc_power_savings))
4694 return 1;
4695 /*
4696 * Check to see if there is a more power-efficient
4697 * ilb.
4698 */
4699 new_ilb = find_new_ilb(cpu);
4700 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
 /* Give up ownership and kick the better candidate. */
4701 atomic_set(&nohz.load_balancer, -1);
4702 resched_cpu(new_ilb);
4703 return 0;
4704 }
4705 return 1;
4706 }
4707 } else {
 /* Tick restart: leave nohz.cpu_mask and drop ownership if held. */
4708 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4709 return 0;
4710
4711 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4712
4713 if (atomic_read(&nohz.load_balancer) == cpu)
4714 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4715 BUG();
4716 }
4717 return 0;
4718}
4719#endif
4720
4721static DEFINE_SPINLOCK(balancing);
4722
4723/*
4724 * It checks each scheduling domain to see if it is due to be balanced,
4725 * and initiates a balancing operation if so.
4726 *
4727 * Balancing parameters are set up in arch_init_sched_domains.
 *
 * For every domain of @cpu with SD_LOAD_BALANCE set, the balance interval
 * (multiplied by sd->busy_factor unless @idle is CPU_IDLE) is converted to
 * jiffies and clamped to [1, HZ*NR_CPUS/10]. load_balance() runs once that
 * interval has elapsed since sd->last_balance. Domains with SD_SERIALIZE
 * set are balanced only if the global 'balancing' lock can be taken without
 * waiting. rq->next_balance is set to the earliest due time found.
4728 */
4729static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4730{
4731 int balance = 1;
4732 struct rq *rq = cpu_rq(cpu);
4733 unsigned long interval;
4734 struct sched_domain *sd;
4735 /* Earliest time when we have to do rebalance again */
4736 unsigned long next_balance = jiffies + 60*HZ;
4737 int update_next_balance = 0;
4738 int need_serialize;
4739
4740 for_each_domain(cpu, sd) {
4741 if (!(sd->flags & SD_LOAD_BALANCE))
4742 continue;
4743
4744 interval = sd->balance_interval;
4745 if (idle != CPU_IDLE)
4746 interval *= sd->busy_factor;
4747
4748 /* scale ms to jiffies */
4749 interval = msecs_to_jiffies(interval);
4750 if (unlikely(!interval))
4751 interval = 1;
4752 if (interval > HZ*NR_CPUS/10)
4753 interval = HZ*NR_CPUS/10;
4754
4755 need_serialize = sd->flags & SD_SERIALIZE;
4756
4757 if (need_serialize) {
 /* Lock is contended: skip balancing this domain for now. */
4758 if (!spin_trylock(&balancing))
4759 goto out;
4760 }
4761
4762 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4763 if (load_balance(cpu, rq, sd, idle, &balance)) {
4764 /*
4765 * We've pulled tasks over so either we're no
4766 * longer idle, or one of our SMT siblings is
4767 * not idle.
4768 */
4769 idle = CPU_NOT_IDLE;
4770 }
4771 sd->last_balance = jiffies;
4772 }
4773 if (need_serialize)
4774 spin_unlock(&balancing);
4775out:
4776 if (time_after(next_balance, sd->last_balance + interval)) {
4777 next_balance = sd->last_balance + interval;
4778 update_next_balance = 1;
4779 }
4780
4781 /*
4782 * Stop the load balance at this level. There is another
4783 * CPU in our sched group which is doing load balancing more
4784 * actively.
4785 */
4786 if (!balance)
4787 break;
4788 }
4789
4790 /*
4791 * next_balance will be updated only when there is a need.
4792 * When the cpu is attached to null domain for ex, it will not be
4793 * updated.
4794 */
4795 if (likely(update_next_balance))
4796 rq->next_balance = next_balance;
4797}
4798
4799/*
4800 * run_rebalance_domains is triggered when needed from the scheduler tick.
4801 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4802 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 *
 * @h is not used in the body.
4803 */
4804static void run_rebalance_domains(struct softirq_action *h)
4805{
4806 int this_cpu = smp_processor_id();
4807 struct rq *this_rq = cpu_rq(this_cpu);
4808 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4809 CPU_IDLE : CPU_NOT_IDLE;
4810
4811 rebalance_domains(this_cpu, idle);
4812
4813#ifdef CONFIG_NO_HZ
4814 /*
4815 * If this cpu is the owner for idle load balancing, then do the
4816 * balancing on behalf of the other idle cpus whose ticks are
4817 * stopped.
4818 */
4819 if (this_rq->idle_at_tick &&
4820 atomic_read(&nohz.load_balancer) == this_cpu) {
4821 struct rq *rq;
4822 int balance_cpu;
4823
4824 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4825 if (balance_cpu == this_cpu)
4826 continue;
4827
4828 /*
4829 * If this cpu gets work to do, stop the load balancing
4830 * work being done for other cpus. Next load
4831 * balancing owner will pick it up.
4832 */
4833 if (need_resched())
4834 break;
4835
4836 rebalance_domains(balance_cpu, CPU_IDLE);
4837
 /* Keep this cpu's next_balance no later than that of the cpus it serves. */
4838 rq = cpu_rq(balance_cpu);
4839 if (time_after(this_rq->next_balance, rq->next_balance))
4840 this_rq->next_balance = rq->next_balance;
4841 }
4842 }
4843#endif
4844}
4845
/* Returns non-zero when the runqueue of @cpu has no sched_domain attached. */
4846static inline int on_null_domain(int cpu)
4847{
4848 return !rcu_dereference(cpu_rq(cpu)->sd);
4849}
4850
4851/*
4852 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4853 *
4854 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4855 * idle load balancing owner or decide to stop the periodic load balancing,
4856 * if the whole system is idle.
 *
 * The softirq is raised only when jiffies has reached rq->next_balance and
 * @cpu is attached to a sched_domain.
4857 */
4858static inline void trigger_load_balance(struct rq *rq, int cpu)
4859{
4860#ifdef CONFIG_NO_HZ
4861 /*
4862 * If we were in the nohz mode recently and busy at the current
4863 * scheduler tick, then check if we need to nominate new idle
4864 * load balancer.
4865 */
4866 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4867 rq->in_nohz_recently = 0;
4868
 /* This busy cpu was the owner: leave the nohz mask and drop ownership. */
4869 if (atomic_read(&nohz.load_balancer) == cpu) {
4870 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4871 atomic_set(&nohz.load_balancer, -1);
4872 }
4873
 /* No owner: kick a candidate chosen by find_new_ilb(). */
4874 if (atomic_read(&nohz.load_balancer) == -1) {
4875 int ilb = find_new_ilb(cpu);
4876
4877 if (ilb < nr_cpu_ids)
4878 resched_cpu(ilb);
4879 }
4880 }
4881
4882 /*
4883 * If this cpu is idle and doing idle load balancing for all the
4884 * cpus with ticks stopped, is it time for that to stop?
 *
 * NOTE(review): select_nohz_load_balancer() compares the nohz mask
 * weight against num_active_cpus(), while this test uses
 * num_online_cpus() - confirm the difference is intended.
4885 */
4886 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4887 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4888 resched_cpu(cpu);
4889 return;
4890 }
4891
4892 /*
4893 * If this cpu is idle and the idle load balancing is done by
4894 * someone else, then no need raise the SCHED_SOFTIRQ
4895 */
4896 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4897 cpumask_test_cpu(cpu, nohz.cpu_mask))
4898 return;
4899#endif
4900 /* Don't need to rebalance while attached to NULL domain */
4901 if (time_after_eq(jiffies, rq->next_balance) &&
4902 likely(!on_null_domain(cpu)))
4903 raise_softirq(SCHED_SOFTIRQ);
4904}
4905
4906#else /* CONFIG_SMP */
4907
4908/*
4909 * on UP we do not need to balance between CPUs:
 * this variant of idle_balance() has an empty body, so a call to it does
 * nothing. Both parameters are unused.
4910 */
4911static inline void idle_balance(int cpu, struct rq *rq)
4912{
4913}
4914
4915#endif 3164#endif
4916 3165
4917DEFINE_PER_CPU(struct kernel_stat, kstat); 3166DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -5266,7 +3515,7 @@ void scheduler_tick(void)
5266 curr->sched_class->task_tick(rq, curr, 0); 3515 curr->sched_class->task_tick(rq, curr, 0);
5267 raw_spin_unlock(&rq->lock); 3516 raw_spin_unlock(&rq->lock);
5268 3517
5269 perf_event_task_tick(curr, cpu); 3518 perf_event_task_tick(curr);
5270 3519
5271#ifdef CONFIG_SMP 3520#ifdef CONFIG_SMP
5272 rq->idle_at_tick = idle_cpu(cpu); 3521 rq->idle_at_tick = idle_cpu(cpu);
@@ -5480,7 +3729,7 @@ need_resched_nonpreemptible:
5480 3729
5481 if (likely(prev != next)) { 3730 if (likely(prev != next)) {
5482 sched_info_switch(prev, next); 3731 sched_info_switch(prev, next);
5483 perf_event_task_sched_out(prev, next, cpu); 3732 perf_event_task_sched_out(prev, next);
5484 3733
5485 rq->nr_switches++; 3734 rq->nr_switches++;
5486 rq->curr = next; 3735 rq->curr = next;
@@ -5498,8 +3747,11 @@ need_resched_nonpreemptible:
5498 3747
5499 post_schedule(rq); 3748 post_schedule(rq);
5500 3749
5501 if (unlikely(reacquire_kernel_lock(current) < 0)) 3750 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3751 prev = rq->curr;
3752 switch_count = &prev->nivcsw;
5502 goto need_resched_nonpreemptible; 3753 goto need_resched_nonpreemptible;
3754 }
5503 3755
5504 preempt_enable_no_resched(); 3756 preempt_enable_no_resched();
5505 if (need_resched()) 3757 if (need_resched())
@@ -5911,14 +4163,15 @@ EXPORT_SYMBOL(wait_for_completion_killable);
5911 */ 4163 */
5912bool try_wait_for_completion(struct completion *x) 4164bool try_wait_for_completion(struct completion *x)
5913{ 4165{
4166 unsigned long flags;
5914 int ret = 1; 4167 int ret = 1;
5915 4168
5916 spin_lock_irq(&x->wait.lock); 4169 spin_lock_irqsave(&x->wait.lock, flags);
5917 if (!x->done) 4170 if (!x->done)
5918 ret = 0; 4171 ret = 0;
5919 else 4172 else
5920 x->done--; 4173 x->done--;
5921 spin_unlock_irq(&x->wait.lock); 4174 spin_unlock_irqrestore(&x->wait.lock, flags);
5922 return ret; 4175 return ret;
5923} 4176}
5924EXPORT_SYMBOL(try_wait_for_completion); 4177EXPORT_SYMBOL(try_wait_for_completion);
@@ -5933,12 +4186,13 @@ EXPORT_SYMBOL(try_wait_for_completion);
5933 */ 4186 */
5934bool completion_done(struct completion *x) 4187bool completion_done(struct completion *x)
5935{ 4188{
4189 unsigned long flags;
5936 int ret = 1; 4190 int ret = 1;
5937 4191
5938 spin_lock_irq(&x->wait.lock); 4192 spin_lock_irqsave(&x->wait.lock, flags);
5939 if (!x->done) 4193 if (!x->done)
5940 ret = 0; 4194 ret = 0;
5941 spin_unlock_irq(&x->wait.lock); 4195 spin_unlock_irqrestore(&x->wait.lock, flags);
5942 return ret; 4196 return ret;
5943} 4197}
5944EXPORT_SYMBOL(completion_done); 4198EXPORT_SYMBOL(completion_done);
@@ -6006,7 +4260,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6006 unsigned long flags; 4260 unsigned long flags;
6007 int oldprio, on_rq, running; 4261 int oldprio, on_rq, running;
6008 struct rq *rq; 4262 struct rq *rq;
6009 const struct sched_class *prev_class = p->sched_class; 4263 const struct sched_class *prev_class;
6010 4264
6011 BUG_ON(prio < 0 || prio > MAX_PRIO); 4265 BUG_ON(prio < 0 || prio > MAX_PRIO);
6012 4266
@@ -6014,6 +4268,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6014 update_rq_clock(rq); 4268 update_rq_clock(rq);
6015 4269
6016 oldprio = p->prio; 4270 oldprio = p->prio;
4271 prev_class = p->sched_class;
6017 on_rq = p->se.on_rq; 4272 on_rq = p->se.on_rq;
6018 running = task_current(rq, p); 4273 running = task_current(rq, p);
6019 if (on_rq) 4274 if (on_rq)
@@ -6031,7 +4286,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6031 if (running) 4286 if (running)
6032 p->sched_class->set_curr_task(rq); 4287 p->sched_class->set_curr_task(rq);
6033 if (on_rq) { 4288 if (on_rq) {
6034 enqueue_task(rq, p, 0); 4289 enqueue_task(rq, p, 0, oldprio < prio);
6035 4290
6036 check_class_changed(rq, p, prev_class, oldprio, running); 4291 check_class_changed(rq, p, prev_class, oldprio, running);
6037 } 4292 }
@@ -6075,7 +4330,7 @@ void set_user_nice(struct task_struct *p, long nice)
6075 delta = p->prio - old_prio; 4330 delta = p->prio - old_prio;
6076 4331
6077 if (on_rq) { 4332 if (on_rq) {
6078 enqueue_task(rq, p, 0); 4333 enqueue_task(rq, p, 0, false);
6079 /* 4334 /*
6080 * If the task increased its priority or is running and 4335 * If the task increased its priority or is running and
6081 * lowered its priority, then reschedule its CPU: 4336 * lowered its priority, then reschedule its CPU:
@@ -6098,7 +4353,7 @@ int can_nice(const struct task_struct *p, const int nice)
6098 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4353 /* convert nice value [19,-20] to rlimit style value [1,40] */
6099 int nice_rlim = 20 - nice; 4354 int nice_rlim = 20 - nice;
6100 4355
6101 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 4356 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
6102 capable(CAP_SYS_NICE)); 4357 capable(CAP_SYS_NICE));
6103} 4358}
6104 4359
@@ -6233,7 +4488,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6233{ 4488{
6234 int retval, oldprio, oldpolicy = -1, on_rq, running; 4489 int retval, oldprio, oldpolicy = -1, on_rq, running;
6235 unsigned long flags; 4490 unsigned long flags;
6236 const struct sched_class *prev_class = p->sched_class; 4491 const struct sched_class *prev_class;
6237 struct rq *rq; 4492 struct rq *rq;
6238 int reset_on_fork; 4493 int reset_on_fork;
6239 4494
@@ -6275,7 +4530,7 @@ recheck:
6275 4530
6276 if (!lock_task_sighand(p, &flags)) 4531 if (!lock_task_sighand(p, &flags))
6277 return -ESRCH; 4532 return -ESRCH;
6278 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4533 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
6279 unlock_task_sighand(p, &flags); 4534 unlock_task_sighand(p, &flags);
6280 4535
6281 /* can't set/change the rt policy */ 4536 /* can't set/change the rt policy */
@@ -6347,6 +4602,7 @@ recheck:
6347 p->sched_reset_on_fork = reset_on_fork; 4602 p->sched_reset_on_fork = reset_on_fork;
6348 4603
6349 oldprio = p->prio; 4604 oldprio = p->prio;
4605 prev_class = p->sched_class;
6350 __setscheduler(rq, p, policy, param->sched_priority); 4606 __setscheduler(rq, p, policy, param->sched_priority);
6351 4607
6352 if (running) 4608 if (running)
@@ -6457,7 +4713,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6457 return -EINVAL; 4713 return -EINVAL;
6458 4714
6459 retval = -ESRCH; 4715 retval = -ESRCH;
6460 read_lock(&tasklist_lock); 4716 rcu_read_lock();
6461 p = find_process_by_pid(pid); 4717 p = find_process_by_pid(pid);
6462 if (p) { 4718 if (p) {
6463 retval = security_task_getscheduler(p); 4719 retval = security_task_getscheduler(p);
@@ -6465,7 +4721,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6465 retval = p->policy 4721 retval = p->policy
6466 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 4722 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6467 } 4723 }
6468 read_unlock(&tasklist_lock); 4724 rcu_read_unlock();
6469 return retval; 4725 return retval;
6470} 4726}
6471 4727
@@ -6483,7 +4739,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6483 if (!param || pid < 0) 4739 if (!param || pid < 0)
6484 return -EINVAL; 4740 return -EINVAL;
6485 4741
6486 read_lock(&tasklist_lock); 4742 rcu_read_lock();
6487 p = find_process_by_pid(pid); 4743 p = find_process_by_pid(pid);
6488 retval = -ESRCH; 4744 retval = -ESRCH;
6489 if (!p) 4745 if (!p)
@@ -6494,7 +4750,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6494 goto out_unlock; 4750 goto out_unlock;
6495 4751
6496 lp.sched_priority = p->rt_priority; 4752 lp.sched_priority = p->rt_priority;
6497 read_unlock(&tasklist_lock); 4753 rcu_read_unlock();
6498 4754
6499 /* 4755 /*
6500 * This one might sleep, we cannot do it with a spinlock held ... 4756 * This one might sleep, we cannot do it with a spinlock held ...
@@ -6504,7 +4760,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6504 return retval; 4760 return retval;
6505 4761
6506out_unlock: 4762out_unlock:
6507 read_unlock(&tasklist_lock); 4763 rcu_read_unlock();
6508 return retval; 4764 return retval;
6509} 4765}
6510 4766
@@ -6515,22 +4771,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6515 int retval; 4771 int retval;
6516 4772
6517 get_online_cpus(); 4773 get_online_cpus();
6518 read_lock(&tasklist_lock); 4774 rcu_read_lock();
6519 4775
6520 p = find_process_by_pid(pid); 4776 p = find_process_by_pid(pid);
6521 if (!p) { 4777 if (!p) {
6522 read_unlock(&tasklist_lock); 4778 rcu_read_unlock();
6523 put_online_cpus(); 4779 put_online_cpus();
6524 return -ESRCH; 4780 return -ESRCH;
6525 } 4781 }
6526 4782
6527 /* 4783 /* Prevent p going away */
6528 * It is not safe to call set_cpus_allowed with the
6529 * tasklist_lock held. We will bump the task_struct's
6530 * usage count and then drop tasklist_lock.
6531 */
6532 get_task_struct(p); 4784 get_task_struct(p);
6533 read_unlock(&tasklist_lock); 4785 rcu_read_unlock();
6534 4786
6535 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4787 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
6536 retval = -ENOMEM; 4788 retval = -ENOMEM;
@@ -6616,7 +4868,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6616 int retval; 4868 int retval;
6617 4869
6618 get_online_cpus(); 4870 get_online_cpus();
6619 read_lock(&tasklist_lock); 4871 rcu_read_lock();
6620 4872
6621 retval = -ESRCH; 4873 retval = -ESRCH;
6622 p = find_process_by_pid(pid); 4874 p = find_process_by_pid(pid);
@@ -6632,7 +4884,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6632 task_rq_unlock(rq, &flags); 4884 task_rq_unlock(rq, &flags);
6633 4885
6634out_unlock: 4886out_unlock:
6635 read_unlock(&tasklist_lock); 4887 rcu_read_unlock();
6636 put_online_cpus(); 4888 put_online_cpus();
6637 4889
6638 return retval; 4890 return retval;
@@ -6876,7 +5128,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6876 return -EINVAL; 5128 return -EINVAL;
6877 5129
6878 retval = -ESRCH; 5130 retval = -ESRCH;
6879 read_lock(&tasklist_lock); 5131 rcu_read_lock();
6880 p = find_process_by_pid(pid); 5132 p = find_process_by_pid(pid);
6881 if (!p) 5133 if (!p)
6882 goto out_unlock; 5134 goto out_unlock;
@@ -6889,13 +5141,13 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6889 time_slice = p->sched_class->get_rr_interval(rq, p); 5141 time_slice = p->sched_class->get_rr_interval(rq, p);
6890 task_rq_unlock(rq, &flags); 5142 task_rq_unlock(rq, &flags);
6891 5143
6892 read_unlock(&tasklist_lock); 5144 rcu_read_unlock();
6893 jiffies_to_timespec(time_slice, &t); 5145 jiffies_to_timespec(time_slice, &t);
6894 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 5146 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
6895 return retval; 5147 return retval;
6896 5148
6897out_unlock: 5149out_unlock:
6898 read_unlock(&tasklist_lock); 5150 rcu_read_unlock();
6899 return retval; 5151 return retval;
6900} 5152}
6901 5153
@@ -6986,6 +5238,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6986 raw_spin_lock_irqsave(&rq->lock, flags); 5238 raw_spin_lock_irqsave(&rq->lock, flags);
6987 5239
6988 __sched_fork(idle); 5240 __sched_fork(idle);
5241 idle->state = TASK_RUNNING;
6989 idle->se.exec_start = sched_clock(); 5242 idle->se.exec_start = sched_clock();
6990 5243
6991 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5244 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
@@ -7101,6 +5354,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7101 int ret = 0; 5354 int ret = 0;
7102 5355
7103 rq = task_rq_lock(p, &flags); 5356 rq = task_rq_lock(p, &flags);
5357
7104 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5358 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7105 ret = -EINVAL; 5359 ret = -EINVAL;
7106 goto out; 5360 goto out;
@@ -7156,7 +5410,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
7156static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 5410static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7157{ 5411{
7158 struct rq *rq_dest, *rq_src; 5412 struct rq *rq_dest, *rq_src;
7159 int ret = 0, on_rq; 5413 int ret = 0;
7160 5414
7161 if (unlikely(!cpu_active(dest_cpu))) 5415 if (unlikely(!cpu_active(dest_cpu)))
7162 return ret; 5416 return ret;
@@ -7172,12 +5426,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7172 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 5426 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7173 goto fail; 5427 goto fail;
7174 5428
7175 on_rq = p->se.on_rq; 5429 /*
7176 if (on_rq) 5430 * If we're not on a rq, the next wake-up will ensure we're
5431 * placed properly.
5432 */
5433 if (p->se.on_rq) {
7177 deactivate_task(rq_src, p, 0); 5434 deactivate_task(rq_src, p, 0);
7178 5435 set_task_cpu(p, dest_cpu);
7179 set_task_cpu(p, dest_cpu);
7180 if (on_rq) {
7181 activate_task(rq_dest, p, 0); 5436 activate_task(rq_dest, p, 0);
7182 check_preempt_curr(rq_dest, p, 0); 5437 check_preempt_curr(rq_dest, p, 0);
7183 } 5438 }
@@ -7273,37 +5528,10 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
7273static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5528static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7274{ 5529{
7275 int dest_cpu; 5530 int dest_cpu;
7276 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
7277 5531
7278again: 5532again:
7279 /* Look for allowed, online CPU in same node. */ 5533 dest_cpu = select_fallback_rq(dead_cpu, p);
7280 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
7281 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7282 goto move;
7283 5534
7284 /* Any allowed, online CPU? */
7285 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
7286 if (dest_cpu < nr_cpu_ids)
7287 goto move;
7288
7289 /* No more Mr. Nice Guy. */
7290 if (dest_cpu >= nr_cpu_ids) {
7291 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7292 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
7293
7294 /*
7295 * Don't tell them about moving exiting tasks or
7296 * kernel threads (both mm NULL), since they never
7297 * leave kernel.
7298 */
7299 if (p->mm && printk_ratelimit()) {
7300 printk(KERN_INFO "process %d (%s) no "
7301 "longer affine to cpu%d\n",
7302 task_pid_nr(p), p->comm, dead_cpu);
7303 }
7304 }
7305
7306move:
7307 /* It can have affinity changed while we were choosing. */ 5535 /* It can have affinity changed while we were choosing. */
7308 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 5536 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
7309 goto again; 5537 goto again;
@@ -9413,7 +7641,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9413 tg->rt_rq[cpu] = rt_rq; 7641 tg->rt_rq[cpu] = rt_rq;
9414 init_rt_rq(rt_rq, rq); 7642 init_rt_rq(rt_rq, rq);
9415 rt_rq->tg = tg; 7643 rt_rq->tg = tg;
9416 rt_rq->rt_se = rt_se;
9417 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7644 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9418 if (add) 7645 if (add)
9419 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7646 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9444,9 +7671,6 @@ void __init sched_init(void)
9444#ifdef CONFIG_RT_GROUP_SCHED 7671#ifdef CONFIG_RT_GROUP_SCHED
9445 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7672 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9446#endif 7673#endif
9447#ifdef CONFIG_USER_SCHED
9448 alloc_size *= 2;
9449#endif
9450#ifdef CONFIG_CPUMASK_OFFSTACK 7674#ifdef CONFIG_CPUMASK_OFFSTACK
9451 alloc_size += num_possible_cpus() * cpumask_size(); 7675 alloc_size += num_possible_cpus() * cpumask_size();
9452#endif 7676#endif
@@ -9460,13 +7684,6 @@ void __init sched_init(void)
9460 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7684 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9461 ptr += nr_cpu_ids * sizeof(void **); 7685 ptr += nr_cpu_ids * sizeof(void **);
9462 7686
9463#ifdef CONFIG_USER_SCHED
9464 root_task_group.se = (struct sched_entity **)ptr;
9465 ptr += nr_cpu_ids * sizeof(void **);
9466
9467 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9468 ptr += nr_cpu_ids * sizeof(void **);
9469#endif /* CONFIG_USER_SCHED */
9470#endif /* CONFIG_FAIR_GROUP_SCHED */ 7687#endif /* CONFIG_FAIR_GROUP_SCHED */
9471#ifdef CONFIG_RT_GROUP_SCHED 7688#ifdef CONFIG_RT_GROUP_SCHED
9472 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7689 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9475,13 +7692,6 @@ void __init sched_init(void)
9475 init_task_group.rt_rq = (struct rt_rq **)ptr; 7692 init_task_group.rt_rq = (struct rt_rq **)ptr;
9476 ptr += nr_cpu_ids * sizeof(void **); 7693 ptr += nr_cpu_ids * sizeof(void **);
9477 7694
9478#ifdef CONFIG_USER_SCHED
9479 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9480 ptr += nr_cpu_ids * sizeof(void **);
9481
9482 root_task_group.rt_rq = (struct rt_rq **)ptr;
9483 ptr += nr_cpu_ids * sizeof(void **);
9484#endif /* CONFIG_USER_SCHED */
9485#endif /* CONFIG_RT_GROUP_SCHED */ 7695#endif /* CONFIG_RT_GROUP_SCHED */
9486#ifdef CONFIG_CPUMASK_OFFSTACK 7696#ifdef CONFIG_CPUMASK_OFFSTACK
9487 for_each_possible_cpu(i) { 7697 for_each_possible_cpu(i) {
@@ -9501,22 +7711,13 @@ void __init sched_init(void)
9501#ifdef CONFIG_RT_GROUP_SCHED 7711#ifdef CONFIG_RT_GROUP_SCHED
9502 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7712 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9503 global_rt_period(), global_rt_runtime()); 7713 global_rt_period(), global_rt_runtime());
9504#ifdef CONFIG_USER_SCHED
9505 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9506 global_rt_period(), RUNTIME_INF);
9507#endif /* CONFIG_USER_SCHED */
9508#endif /* CONFIG_RT_GROUP_SCHED */ 7714#endif /* CONFIG_RT_GROUP_SCHED */
9509 7715
9510#ifdef CONFIG_GROUP_SCHED 7716#ifdef CONFIG_CGROUP_SCHED
9511 list_add(&init_task_group.list, &task_groups); 7717 list_add(&init_task_group.list, &task_groups);
9512 INIT_LIST_HEAD(&init_task_group.children); 7718 INIT_LIST_HEAD(&init_task_group.children);
9513 7719
9514#ifdef CONFIG_USER_SCHED 7720#endif /* CONFIG_CGROUP_SCHED */
9515 INIT_LIST_HEAD(&root_task_group.children);
9516 init_task_group.parent = &root_task_group;
9517 list_add(&init_task_group.siblings, &root_task_group.children);
9518#endif /* CONFIG_USER_SCHED */
9519#endif /* CONFIG_GROUP_SCHED */
9520 7721
9521#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7722#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9522 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7723 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9556,25 +7757,6 @@ void __init sched_init(void)
9556 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7757 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9557 */ 7758 */
9558 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7759 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9559#elif defined CONFIG_USER_SCHED
9560 root_task_group.shares = NICE_0_LOAD;
9561 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9562 /*
9563 * In case of task-groups formed thr' the user id of tasks,
9564 * init_task_group represents tasks belonging to root user.
9565 * Hence it forms a sibling of all subsequent groups formed.
9566 * In this case, init_task_group gets only a fraction of overall
9567 * system cpu resource, based on the weight assigned to root
9568 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9569 * by letting tasks of init_task_group sit in a separate cfs_rq
9570 * (init_tg_cfs_rq) and having one entity represent this group of
9571 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9572 */
9573 init_tg_cfs_entry(&init_task_group,
9574 &per_cpu(init_tg_cfs_rq, i),
9575 &per_cpu(init_sched_entity, i), i, 1,
9576 root_task_group.se[i]);
9577
9578#endif 7760#endif
9579#endif /* CONFIG_FAIR_GROUP_SCHED */ 7761#endif /* CONFIG_FAIR_GROUP_SCHED */
9580 7762
@@ -9583,12 +7765,6 @@ void __init sched_init(void)
9583 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7765 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9584#ifdef CONFIG_CGROUP_SCHED 7766#ifdef CONFIG_CGROUP_SCHED
9585 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7767 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9586#elif defined CONFIG_USER_SCHED
9587 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9588 init_tg_rt_entry(&init_task_group,
9589 &per_cpu(init_rt_rq_var, i),
9590 &per_cpu(init_sched_rt_entity, i), i, 1,
9591 root_task_group.rt_se[i]);
9592#endif 7768#endif
9593#endif 7769#endif
9594 7770
@@ -9668,12 +7844,12 @@ void __init sched_init(void)
9668#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 7844#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9669static inline int preempt_count_equals(int preempt_offset) 7845static inline int preempt_count_equals(int preempt_offset)
9670{ 7846{
9671 int nested = preempt_count() & ~PREEMPT_ACTIVE; 7847 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
9672 7848
9673 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7849 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9674} 7850}
9675 7851
9676void __might_sleep(char *file, int line, int preempt_offset) 7852void __might_sleep(const char *file, int line, int preempt_offset)
9677{ 7853{
9678#ifdef in_atomic 7854#ifdef in_atomic
9679 static unsigned long prev_jiffy; /* ratelimiting */ 7855 static unsigned long prev_jiffy; /* ratelimiting */
@@ -9984,7 +8160,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
9984} 8160}
9985#endif /* CONFIG_RT_GROUP_SCHED */ 8161#endif /* CONFIG_RT_GROUP_SCHED */
9986 8162
9987#ifdef CONFIG_GROUP_SCHED 8163#ifdef CONFIG_CGROUP_SCHED
9988static void free_sched_group(struct task_group *tg) 8164static void free_sched_group(struct task_group *tg)
9989{ 8165{
9990 free_fair_sched_group(tg); 8166 free_fair_sched_group(tg);
@@ -10083,17 +8259,17 @@ void sched_move_task(struct task_struct *tsk)
10083 8259
10084#ifdef CONFIG_FAIR_GROUP_SCHED 8260#ifdef CONFIG_FAIR_GROUP_SCHED
10085 if (tsk->sched_class->moved_group) 8261 if (tsk->sched_class->moved_group)
10086 tsk->sched_class->moved_group(tsk); 8262 tsk->sched_class->moved_group(tsk, on_rq);
10087#endif 8263#endif
10088 8264
10089 if (unlikely(running)) 8265 if (unlikely(running))
10090 tsk->sched_class->set_curr_task(rq); 8266 tsk->sched_class->set_curr_task(rq);
10091 if (on_rq) 8267 if (on_rq)
10092 enqueue_task(rq, tsk, 0); 8268 enqueue_task(rq, tsk, 0, false);
10093 8269
10094 task_rq_unlock(rq, &flags); 8270 task_rq_unlock(rq, &flags);
10095} 8271}
10096#endif /* CONFIG_GROUP_SCHED */ 8272#endif /* CONFIG_CGROUP_SCHED */
10097 8273
10098#ifdef CONFIG_FAIR_GROUP_SCHED 8274#ifdef CONFIG_FAIR_GROUP_SCHED
10099static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8275static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10235,13 +8411,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10235 runtime = d->rt_runtime; 8411 runtime = d->rt_runtime;
10236 } 8412 }
10237 8413
10238#ifdef CONFIG_USER_SCHED
10239 if (tg == &root_task_group) {
10240 period = global_rt_period();
10241 runtime = global_rt_runtime();
10242 }
10243#endif
10244
10245 /* 8414 /*
10246 * Cannot have more runtime than the period. 8415 * Cannot have more runtime than the period.
10247 */ 8416 */
@@ -10644,7 +8813,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
10644struct cpuacct { 8813struct cpuacct {
10645 struct cgroup_subsys_state css; 8814 struct cgroup_subsys_state css;
10646 /* cpuusage holds pointer to a u64-type object on every cpu */ 8815 /* cpuusage holds pointer to a u64-type object on every cpu */
10647 u64 *cpuusage; 8816 u64 __percpu *cpuusage;
10648 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8817 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
10649 struct cpuacct *parent; 8818 struct cpuacct *parent;
10650}; 8819};
@@ -10861,12 +9030,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10861} 9030}
10862 9031
10863/* 9032/*
9033 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9034 * in cputime_t units. As a result, cpuacct_update_stats calls
9035 * percpu_counter_add with values large enough to always overflow the
9036 * per cpu batch limit causing bad SMP scalability.
9037 *
9038 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9039 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9040 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9041 */
9042#ifdef CONFIG_SMP
9043#define CPUACCT_BATCH \
9044 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9045#else
9046#define CPUACCT_BATCH 0
9047#endif
9048
9049/*
10864 * Charge the system/user time to the task's accounting group. 9050 * Charge the system/user time to the task's accounting group.
10865 */ 9051 */
10866static void cpuacct_update_stats(struct task_struct *tsk, 9052static void cpuacct_update_stats(struct task_struct *tsk,
10867 enum cpuacct_stat_index idx, cputime_t val) 9053 enum cpuacct_stat_index idx, cputime_t val)
10868{ 9054{
10869 struct cpuacct *ca; 9055 struct cpuacct *ca;
9056 int batch = CPUACCT_BATCH;
10870 9057
10871 if (unlikely(!cpuacct_subsys.active)) 9058 if (unlikely(!cpuacct_subsys.active))
10872 return; 9059 return;
@@ -10875,7 +9062,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10875 ca = task_ca(tsk); 9062 ca = task_ca(tsk);
10876 9063
10877 do { 9064 do {
10878 percpu_counter_add(&ca->cpustat[idx], val); 9065 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10879 ca = ca->parent; 9066 ca = ca->parent;
10880 } while (ca); 9067 } while (ca);
10881 rcu_read_unlock(); 9068 rcu_read_unlock();
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 479ce5682d7c..5b496132c28a 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -236,6 +236,18 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
236} 236}
237EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 237EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
238 238
239unsigned long long cpu_clock(int cpu)
240{
241 unsigned long long clock;
242 unsigned long flags;
243
244 local_irq_save(flags);
245 clock = sched_clock_cpu(cpu);
246 local_irq_restore(flags);
247
248 return clock;
249}
250
239#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 251#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
240 252
241void sched_clock_init(void) 253void sched_clock_init(void)
@@ -251,17 +263,12 @@ u64 sched_clock_cpu(int cpu)
251 return sched_clock(); 263 return sched_clock();
252} 264}
253 265
254#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
255 266
256unsigned long long cpu_clock(int cpu) 267unsigned long long cpu_clock(int cpu)
257{ 268{
258 unsigned long long clock; 269 return sched_clock_cpu(cpu);
259 unsigned long flags; 270}
260 271
261 local_irq_save(flags); 272#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
262 clock = sched_clock_cpu(cpu);
263 local_irq_restore(flags);
264 273
265 return clock;
266}
267EXPORT_SYMBOL_GPL(cpu_clock); 274EXPORT_SYMBOL_GPL(cpu_clock);
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 597b33099dfa..82095bf2099f 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,7 @@ static int convert_prio(int prio)
47} 47}
48 48
49#define for_each_cpupri_active(array, idx) \ 49#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ 50 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53 51
54/** 52/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system 53 * cpupri_find - find the best (lowest-pri) CPU in the system
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5bedf6e3ebf3..3e1fd96c6cf9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -510,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
510 curr->sum_exec_runtime += delta_exec; 510 curr->sum_exec_runtime += delta_exec;
511 schedstat_add(cfs_rq, exec_clock, delta_exec); 511 schedstat_add(cfs_rq, exec_clock, delta_exec);
512 delta_exec_weighted = calc_delta_fair(delta_exec, curr); 512 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
513
513 curr->vruntime += delta_exec_weighted; 514 curr->vruntime += delta_exec_weighted;
514 update_min_vruntime(cfs_rq); 515 update_min_vruntime(cfs_rq);
515} 516}
@@ -765,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
765 se->vruntime = vruntime; 766 se->vruntime = vruntime;
766} 767}
767 768
769#define ENQUEUE_WAKEUP 1
770#define ENQUEUE_MIGRATE 2
771
768static void 772static void
769enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) 773enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
770{ 774{
771 /* 775 /*
776 * Update the normalized vruntime before updating min_vruntime
777 * through callig update_curr().
778 */
779 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
780 se->vruntime += cfs_rq->min_vruntime;
781
782 /*
772 * Update run-time statistics of the 'current'. 783 * Update run-time statistics of the 'current'.
773 */ 784 */
774 update_curr(cfs_rq); 785 update_curr(cfs_rq);
775 account_entity_enqueue(cfs_rq, se); 786 account_entity_enqueue(cfs_rq, se);
776 787
777 if (wakeup) { 788 if (flags & ENQUEUE_WAKEUP) {
778 place_entity(cfs_rq, se, 0); 789 place_entity(cfs_rq, se, 0);
779 enqueue_sleeper(cfs_rq, se); 790 enqueue_sleeper(cfs_rq, se);
780 } 791 }
@@ -828,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
828 __dequeue_entity(cfs_rq, se); 839 __dequeue_entity(cfs_rq, se);
829 account_entity_dequeue(cfs_rq, se); 840 account_entity_dequeue(cfs_rq, se);
830 update_min_vruntime(cfs_rq); 841 update_min_vruntime(cfs_rq);
842
843 /*
844 * Normalize the entity after updating the min_vruntime because the
845 * update can refer to the ->curr item and we need to reflect this
846 * movement in our normalized position.
847 */
848 if (!sleep)
849 se->vruntime -= cfs_rq->min_vruntime;
831} 850}
832 851
833/* 852/*
@@ -1034,17 +1053,24 @@ static inline void hrtick_update(struct rq *rq)
1034 * increased. Here we update the fair scheduling stats and 1053 * increased. Here we update the fair scheduling stats and
1035 * then put the task into the rbtree: 1054 * then put the task into the rbtree:
1036 */ 1055 */
1037static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 1056static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1038{ 1058{
1039 struct cfs_rq *cfs_rq; 1059 struct cfs_rq *cfs_rq;
1040 struct sched_entity *se = &p->se; 1060 struct sched_entity *se = &p->se;
1061 int flags = 0;
1062
1063 if (wakeup)
1064 flags |= ENQUEUE_WAKEUP;
1065 if (p->state == TASK_WAKING)
1066 flags |= ENQUEUE_MIGRATE;
1041 1067
1042 for_each_sched_entity(se) { 1068 for_each_sched_entity(se) {
1043 if (se->on_rq) 1069 if (se->on_rq)
1044 break; 1070 break;
1045 cfs_rq = cfs_rq_of(se); 1071 cfs_rq = cfs_rq_of(se);
1046 enqueue_entity(cfs_rq, se, wakeup); 1072 enqueue_entity(cfs_rq, se, flags);
1047 wakeup = 1; 1073 flags = ENQUEUE_WAKEUP;
1048 } 1074 }
1049 1075
1050 hrtick_update(rq); 1076 hrtick_update(rq);
@@ -1120,6 +1146,14 @@ static void yield_task_fair(struct rq *rq)
1120 1146
1121#ifdef CONFIG_SMP 1147#ifdef CONFIG_SMP
1122 1148
1149static void task_waking_fair(struct rq *rq, struct task_struct *p)
1150{
1151 struct sched_entity *se = &p->se;
1152 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1153
1154 se->vruntime -= cfs_rq->min_vruntime;
1155}
1156
1123#ifdef CONFIG_FAIR_GROUP_SCHED 1157#ifdef CONFIG_FAIR_GROUP_SCHED
1124/* 1158/*
1125 * effective_load() calculates the load change as seen from the root_task_group 1159 * effective_load() calculates the load change as seen from the root_task_group
@@ -1429,6 +1463,9 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1429 } 1463 }
1430 1464
1431 for_each_domain(cpu, tmp) { 1465 for_each_domain(cpu, tmp) {
1466 if (!(tmp->flags & SD_LOAD_BALANCE))
1467 continue;
1468
1432 /* 1469 /*
1433 * If power savings logic is enabled for a domain, see if we 1470 * If power savings logic is enabled for a domain, see if we
1434 * are not overloaded, if so, don't balance wider. 1471 * are not overloaded, if so, don't balance wider.
@@ -1472,7 +1509,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1472 * If there's an idle sibling in this domain, make that 1509 * If there's an idle sibling in this domain, make that
1473 * the wake_affine target instead of the current cpu. 1510 * the wake_affine target instead of the current cpu.
1474 */ 1511 */
1475 if (tmp->flags & SD_PREFER_SIBLING) 1512 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1476 target = select_idle_sibling(p, tmp, target); 1513 target = select_idle_sibling(p, tmp, target);
1477 1514
1478 if (target >= 0) { 1515 if (target >= 0) {
@@ -1779,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1779 */ 1816 */
1780 1817
1781/* 1818/*
1782 * Load-balancing iterator. Note: while the runqueue stays locked 1819 * pull_task - move a task from a remote runqueue to the local runqueue.
1783 * during the whole iteration, the current task might be 1820 * Both runqueues must be locked.
1784 * dequeued so the iterator has to be dequeue-safe. Here we
1785 * achieve that by always pre-iterating before returning
1786 * the current task:
1787 */ 1821 */
1788static struct task_struct * 1822static void pull_task(struct rq *src_rq, struct task_struct *p,
1789__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) 1823 struct rq *this_rq, int this_cpu)
1790{ 1824{
1791 struct task_struct *p = NULL; 1825 deactivate_task(src_rq, p, 0);
1792 struct sched_entity *se; 1826 set_task_cpu(p, this_cpu);
1827 activate_task(this_rq, p, 0);
1828 check_preempt_curr(this_rq, p, 0);
1829}
1793 1830
1794 if (next == &cfs_rq->tasks) 1831/*
1795 return NULL; 1832 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1833 */
1834static
1835int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1836 struct sched_domain *sd, enum cpu_idle_type idle,
1837 int *all_pinned)
1838{
1839 int tsk_cache_hot = 0;
1840 /*
1841 * We do not migrate tasks that are:
1842 * 1) running (obviously), or
1843 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1844 * 3) are cache-hot on their current CPU.
1845 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine);
1848 return 0;
1849 }
1850 *all_pinned = 0;
1796 1851
1797 se = list_entry(next, struct sched_entity, group_node); 1852 if (task_running(rq, p)) {
1798 p = task_of(se); 1853 schedstat_inc(p, se.nr_failed_migrations_running);
1799 cfs_rq->balance_iterator = next->next; 1854 return 0;
1855 }
1800 1856
1801 return p; 1857 /*
1802} 1858 * Aggressive migration if:
1859 * 1) task is cache cold, or
1860 * 2) too many balance attempts have failed.
1861 */
1803 1862
1804static struct task_struct *load_balance_start_fair(void *arg) 1863 tsk_cache_hot = task_hot(p, rq->clock, sd);
1805{ 1864 if (!tsk_cache_hot ||
1806 struct cfs_rq *cfs_rq = arg; 1865 sd->nr_balance_failed > sd->cache_nice_tries) {
1866#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations);
1870 }
1871#endif
1872 return 1;
1873 }
1807 1874
1808 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); 1875 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot);
1877 return 0;
1878 }
1879 return 1;
1809} 1880}
1810 1881
1811static struct task_struct *load_balance_next_fair(void *arg) 1882/*
1883 * move_one_task tries to move exactly one task from busiest to this_rq, as
1884 * part of active balancing operations within "domain".
1885 * Returns 1 if successful and 0 otherwise.
1886 *
1887 * Called with both runqueues locked.
1888 */
1889static int
1890move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1891 struct sched_domain *sd, enum cpu_idle_type idle)
1812{ 1892{
1813 struct cfs_rq *cfs_rq = arg; 1893 struct task_struct *p, *n;
1894 struct cfs_rq *cfs_rq;
1895 int pinned = 0;
1896
1897 for_each_leaf_cfs_rq(busiest, cfs_rq) {
1898 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
1899
1900 if (!can_migrate_task(p, busiest, this_cpu,
1901 sd, idle, &pinned))
1902 continue;
1814 1903
1815 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1904 pull_task(busiest, p, this_rq, this_cpu);
1905 /*
1906 * Right now, this is only the second place pull_task()
1907 * is called, so we can safely collect pull_task()
1908 * stats here rather than inside pull_task().
1909 */
1910 schedstat_inc(sd, lb_gained[idle]);
1911 return 1;
1912 }
1913 }
1914
1915 return 0;
1816} 1916}
1817 1917
1818static unsigned long 1918static unsigned long
1819__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1919balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1820 unsigned long max_load_move, struct sched_domain *sd, 1920 unsigned long max_load_move, struct sched_domain *sd,
1821 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, 1921 enum cpu_idle_type idle, int *all_pinned,
1822 struct cfs_rq *cfs_rq) 1922 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
1823{ 1923{
1824 struct rq_iterator cfs_rq_iterator; 1924 int loops = 0, pulled = 0, pinned = 0;
1925 long rem_load_move = max_load_move;
1926 struct task_struct *p, *n;
1825 1927
1826 cfs_rq_iterator.start = load_balance_start_fair; 1928 if (max_load_move == 0)
1827 cfs_rq_iterator.next = load_balance_next_fair; 1929 goto out;
1828 cfs_rq_iterator.arg = cfs_rq;
1829 1930
1830 return balance_tasks(this_rq, this_cpu, busiest, 1931 pinned = 1;
1831 max_load_move, sd, idle, all_pinned, 1932
1832 this_best_prio, &cfs_rq_iterator); 1933 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1934 if (loops++ > sysctl_sched_nr_migrate)
1935 break;
1936
1937 if ((p->se.load.weight >> 1) > rem_load_move ||
1938 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
1939 continue;
1940
1941 pull_task(busiest, p, this_rq, this_cpu);
1942 pulled++;
1943 rem_load_move -= p->se.load.weight;
1944
1945#ifdef CONFIG_PREEMPT
1946 /*
1947 * NEWIDLE balancing is a source of latency, so preemptible
1948 * kernels will stop after the first task is pulled to minimize
1949 * the critical section.
1950 */
1951 if (idle == CPU_NEWLY_IDLE)
1952 break;
1953#endif
1954
1955 /*
1956 * We only want to steal up to the prescribed amount of
1957 * weighted load.
1958 */
1959 if (rem_load_move <= 0)
1960 break;
1961
1962 if (p->prio < *this_best_prio)
1963 *this_best_prio = p->prio;
1964 }
1965out:
1966 /*
1967 * Right now, this is one of only two places pull_task() is called,
1968 * so we can safely collect pull_task() stats here rather than
1969 * inside pull_task().
1970 */
1971 schedstat_add(sd, lb_gained[idle], pulled);
1972
1973 if (all_pinned)
1974 *all_pinned = pinned;
1975
1976 return max_load_move - rem_load_move;
1833} 1977}
1834 1978
1835#ifdef CONFIG_FAIR_GROUP_SCHED 1979#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1861,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1861 rem_load = (u64)rem_load_move * busiest_weight; 2005 rem_load = (u64)rem_load_move * busiest_weight;
1862 rem_load = div_u64(rem_load, busiest_h_load + 1); 2006 rem_load = div_u64(rem_load, busiest_h_load + 1);
1863 2007
1864 moved_load = __load_balance_fair(this_rq, this_cpu, busiest, 2008 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1865 rem_load, sd, idle, all_pinned, this_best_prio, 2009 rem_load, sd, idle, all_pinned, this_best_prio,
1866 tg->cfs_rq[busiest_cpu]); 2010 busiest_cfs_rq);
1867 2011
1868 if (!moved_load) 2012 if (!moved_load)
1869 continue; 2013 continue;
@@ -1886,35 +2030,1509 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1886 struct sched_domain *sd, enum cpu_idle_type idle, 2030 struct sched_domain *sd, enum cpu_idle_type idle,
1887 int *all_pinned, int *this_best_prio) 2031 int *all_pinned, int *this_best_prio)
1888{ 2032{
1889 return __load_balance_fair(this_rq, this_cpu, busiest, 2033 return balance_tasks(this_rq, this_cpu, busiest,
1890 max_load_move, sd, idle, all_pinned, 2034 max_load_move, sd, idle, all_pinned,
1891 this_best_prio, &busiest->cfs); 2035 this_best_prio, &busiest->cfs);
1892} 2036}
1893#endif 2037#endif
1894 2038
1895static int 2039/*
1896move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2040 * move_tasks tries to move up to max_load_move weighted load from busiest to
1897 struct sched_domain *sd, enum cpu_idle_type idle) 2041 * this_rq, as part of a balancing operation within domain "sd".
2042 * Returns 1 if successful and 0 otherwise.
2043 *
2044 * Called with both runqueues locked.
2045 */
2046static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2047 unsigned long max_load_move,
2048 struct sched_domain *sd, enum cpu_idle_type idle,
2049 int *all_pinned)
2050{
2051 unsigned long total_load_moved = 0, load_moved;
2052 int this_best_prio = this_rq->curr->prio;
2053
2054 do {
2055 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2056 max_load_move - total_load_moved,
2057 sd, idle, all_pinned, &this_best_prio);
2058
2059 total_load_moved += load_moved;
2060
2061#ifdef CONFIG_PREEMPT
2062 /*
2063 * NEWIDLE balancing is a source of latency, so preemptible
2064 * kernels will stop after the first task is pulled to minimize
2065 * the critical section.
2066 */
2067 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
2068 break;
2069
2070 if (raw_spin_is_contended(&this_rq->lock) ||
2071 raw_spin_is_contended(&busiest->lock))
2072 break;
2073#endif
2074 } while (load_moved && max_load_move > total_load_moved);
2075
2076 return total_load_moved > 0;
2077}
2078
2079/********** Helpers for find_busiest_group ************************/
2080/*
2081 * sd_lb_stats - Structure to store the statistics of a sched_domain
2082 * during load balancing.
2083 */
2084struct sd_lb_stats {
2085 struct sched_group *busiest; /* Busiest group in this sd */
2086 struct sched_group *this; /* Local group in this sd */
2087 unsigned long total_load; /* Total load of all groups in sd */
2088 unsigned long total_pwr; /* Total power of all groups in sd */
2089 unsigned long avg_load; /* Average load across all groups in sd */
2090
2091 /** Statistics of this group */
2092 unsigned long this_load;
2093 unsigned long this_load_per_task;
2094 unsigned long this_nr_running;
2095
2096 /* Statistics of the busiest group */
2097 unsigned long max_load;
2098 unsigned long busiest_load_per_task;
2099 unsigned long busiest_nr_running;
2100 unsigned long busiest_group_capacity;
2101
2102 int group_imb; /* Is there imbalance in this sd */
2103#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2104 int power_savings_balance; /* Is powersave balance needed for this sd */
2105 struct sched_group *group_min; /* Least loaded group in sd */
2106 struct sched_group *group_leader; /* Group which relieves group_min */
2107 unsigned long min_load_per_task; /* load_per_task in group_min */
2108 unsigned long leader_nr_running; /* Nr running of group_leader */
2109 unsigned long min_nr_running; /* Nr running of group_min */
2110#endif
2111};
2112
2113/*
2114 * sg_lb_stats - stats of a sched_group required for load_balancing
2115 */
2116struct sg_lb_stats {
2117 unsigned long avg_load; /*Avg load across the CPUs of the group */
2118 unsigned long group_load; /* Total load over the CPUs of the group */
2119 unsigned long sum_nr_running; /* Nr tasks running in the group */
2120 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2121 unsigned long group_capacity;
2122 int group_imb; /* Is there an imbalance in the group ? */
2123};
2124
2125/**
2126 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
2127 * @group: The group whose first cpu is to be returned.
2128 */
2129static inline unsigned int group_first_cpu(struct sched_group *group)
2130{
2131 return cpumask_first(sched_group_cpus(group));
2132}
2133
2134/**
2135 * get_sd_load_idx - Obtain the load index for a given sched domain.
2136 * @sd: The sched_domain whose load_idx is to be obtained.
2137 * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
2138 */
2139static inline int get_sd_load_idx(struct sched_domain *sd,
2140 enum cpu_idle_type idle)
2141{
2142 int load_idx;
2143
2144 switch (idle) {
2145 case CPU_NOT_IDLE:
2146 load_idx = sd->busy_idx;
2147 break;
2148
2149 case CPU_NEWLY_IDLE:
2150 load_idx = sd->newidle_idx;
2151 break;
2152 default:
2153 load_idx = sd->idle_idx;
2154 break;
2155 }
2156
2157 return load_idx;
2158}
2159
2160
2161#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2162/**
2163 * init_sd_power_savings_stats - Initialize power savings statistics for
2164 * the given sched_domain, during load balancing.
2165 *
2166 * @sd: Sched domain whose power-savings statistics are to be initialized.
2167 * @sds: Variable containing the statistics for sd.
2168 * @idle: Idle status of the CPU at which we're performing load-balancing.
2169 */
2170static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2171 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2172{
2173 /*
2174 * Busy processors will not participate in power savings
2175 * balance.
2176 */
2177 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2178 sds->power_savings_balance = 0;
2179 else {
2180 sds->power_savings_balance = 1;
2181 sds->min_nr_running = ULONG_MAX;
2182 sds->leader_nr_running = 0;
2183 }
2184}
2185
2186/**
2187 * update_sd_power_savings_stats - Update the power saving stats for a
2188 * sched_domain while performing load balancing.
2189 *
2190 * @group: sched_group belonging to the sched_domain under consideration.
2191 * @sds: Variable containing the statistics of the sched_domain
2192 * @local_group: Does group contain the CPU for which we're performing
2193 * load balancing ?
2194 * @sgs: Variable containing the statistics of the group.
2195 */
2196static inline void update_sd_power_savings_stats(struct sched_group *group,
2197 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2198{
2199
2200 if (!sds->power_savings_balance)
2201 return;
2202
2203 /*
2204 * If the local group is idle or completely loaded
2205 * no need to do power savings balance at this domain
2206 */
2207 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
2208 !sds->this_nr_running))
2209 sds->power_savings_balance = 0;
2210
2211 /*
2212 * If a group is already running at full capacity or idle,
2213 * don't include that group in power savings calculations
2214 */
2215 if (!sds->power_savings_balance ||
2216 sgs->sum_nr_running >= sgs->group_capacity ||
2217 !sgs->sum_nr_running)
2218 return;
2219
2220 /*
2221 * Calculate the group which has the least non-idle load.
2222 * This is the group from where we need to pick up the load
2223 * for saving power
2224 */
2225 if ((sgs->sum_nr_running < sds->min_nr_running) ||
2226 (sgs->sum_nr_running == sds->min_nr_running &&
2227 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
2228 sds->group_min = group;
2229 sds->min_nr_running = sgs->sum_nr_running;
2230 sds->min_load_per_task = sgs->sum_weighted_load /
2231 sgs->sum_nr_running;
2232 }
2233
2234 /*
2235 * Calculate the group which is almost near its
2236 * capacity but still has some space to pick up some load
2237 * from other group and save more power
2238 */
2239 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
2240 return;
2241
2242 if (sgs->sum_nr_running > sds->leader_nr_running ||
2243 (sgs->sum_nr_running == sds->leader_nr_running &&
2244 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
2245 sds->group_leader = group;
2246 sds->leader_nr_running = sgs->sum_nr_running;
2247 }
2248}
2249
2250/**
2251 * check_power_save_busiest_group - see if there is potential for some power-savings balance
2252 * @sds: Variable containing the statistics of the sched_domain
2253 * under consideration.
2254 * @this_cpu: Cpu at which we're currently performing load-balancing.
2255 * @imbalance: Variable to store the imbalance.
2256 *
2257 * Description:
2258 * Check if we have potential to perform some power-savings balance.
2259 * If yes, set the busiest group to be the least loaded group in the
2260 * sched_domain, so that its CPUs can be put to idle.
2261 *
2262 * Returns 1 if there is potential to perform power-savings balance.
2263 * Else returns 0.
2264 */
2265static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2266 int this_cpu, unsigned long *imbalance)
2267{
2268 if (!sds->power_savings_balance)
2269 return 0;
2270
2271 if (sds->this != sds->group_leader ||
2272 sds->group_leader == sds->group_min)
2273 return 0;
2274
2275 *imbalance = sds->min_load_per_task;
2276 sds->busiest = sds->group_min;
2277
2278 return 1;
2279
2280}
2281#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2282static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2283 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2284{
2285 return;
2286}
2287
2288static inline void update_sd_power_savings_stats(struct sched_group *group,
2289 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2290{
2291 return;
2292}
2293
2294static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2295 int this_cpu, unsigned long *imbalance)
2296{
2297 return 0;
2298}
2299#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2300
2301
2302unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2303{
2304 return SCHED_LOAD_SCALE;
2305}
2306
2307unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2308{
2309 return default_scale_freq_power(sd, cpu);
2310}
2311
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2315 unsigned long smt_gain = sd->smt_gain;
2316
2317 smt_gain /= weight;
2318
2319 return smt_gain;
2320}
2321
2322unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
2323{
2324 return default_scale_smt_power(sd, cpu);
2325}
2326
2327unsigned long scale_rt_power(int cpu)
2328{
2329 struct rq *rq = cpu_rq(cpu);
2330 u64 total, available;
2331
2332 sched_avg_update(rq);
2333
2334 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2335 available = total - rq->rt_avg;
2336
2337 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2338 total = SCHED_LOAD_SCALE;
2339
2340 total >>= SCHED_LOAD_SHIFT;
2341
2342 return div_u64(available, total);
2343}
2344
2345static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2348 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups;
2350
2351 if (sched_feat(ARCH_POWER))
2352 power *= arch_scale_freq_power(sd, cpu);
2353 else
2354 power *= default_scale_freq_power(sd, cpu);
2355
2356 power >>= SCHED_LOAD_SHIFT;
2357
2358 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2359 if (sched_feat(ARCH_POWER))
2360 power *= arch_scale_smt_power(sd, cpu);
2361 else
2362 power *= default_scale_smt_power(sd, cpu);
2363
2364 power >>= SCHED_LOAD_SHIFT;
2365 }
2366
2367 power *= scale_rt_power(cpu);
2368 power >>= SCHED_LOAD_SHIFT;
2369
2370 if (!power)
2371 power = 1;
2372
2373 sdg->cpu_power = power;
2374}
2375
2376static void update_group_power(struct sched_domain *sd, int cpu)
2377{
2378 struct sched_domain *child = sd->child;
2379 struct sched_group *group, *sdg = sd->groups;
2380 unsigned long power;
2381
2382 if (!child) {
2383 update_cpu_power(sd, cpu);
2384 return;
2385 }
2386
2387 power = 0;
2388
2389 group = child->groups;
2390 do {
2391 power += group->cpu_power;
2392 group = group->next;
2393 } while (group != child->groups);
2394
2395 sdg->cpu_power = power;
2396}
2397
2398/**
2399 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2400 * @sd: The sched_domain whose statistics are to be updated.
2401 * @group: sched_group whose statistics are to be updated.
2402 * @this_cpu: Cpu for which load balance is currently performed.
2403 * @idle: Idle status of this_cpu
2404 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2405 * @sd_idle: Idle status of the sched_domain containing group.
2406 * @local_group: Does group contain this_cpu.
2407 * @cpus: Set of cpus considered for load balancing.
2408 * @balance: Should we balance.
2409 * @sgs: variable to hold the statistics for this group.
2410 */
2411static inline void update_sg_lb_stats(struct sched_domain *sd,
2412 struct sched_group *group, int this_cpu,
2413 enum cpu_idle_type idle, int load_idx, int *sd_idle,
2414 int local_group, const struct cpumask *cpus,
2415 int *balance, struct sg_lb_stats *sgs)
2416{
2417 unsigned long load, max_cpu_load, min_cpu_load;
2418 int i;
2419 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2420 unsigned long avg_load_per_task = 0;
2421
2422 if (local_group)
2423 balance_cpu = group_first_cpu(group);
2424
2425 /* Tally up the load of all CPUs in the group */
2426 max_cpu_load = 0;
2427 min_cpu_load = ~0UL;
2428
2429 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2430 struct rq *rq = cpu_rq(i);
2431
2432 if (*sd_idle && rq->nr_running)
2433 *sd_idle = 0;
2434
2435 /* Bias balancing toward cpus of our domain */
2436 if (local_group) {
2437 if (idle_cpu(i) && !first_idle_cpu) {
2438 first_idle_cpu = 1;
2439 balance_cpu = i;
2440 }
2441
2442 load = target_load(i, load_idx);
2443 } else {
2444 load = source_load(i, load_idx);
2445 if (load > max_cpu_load)
2446 max_cpu_load = load;
2447 if (min_cpu_load > load)
2448 min_cpu_load = load;
2449 }
2450
2451 sgs->group_load += load;
2452 sgs->sum_nr_running += rq->nr_running;
2453 sgs->sum_weighted_load += weighted_cpuload(i);
2454
2455 }
2456
2457 /*
2458 * First idle cpu or the first cpu(busiest) in this sched group
2459 * is eligible for doing load balancing at this and above
2460 * domains. In the newly idle case, we will allow all the cpu's
2461 * to do the newly idle load balance.
2462 */
2463 if (idle != CPU_NEWLY_IDLE && local_group &&
2464 balance_cpu != this_cpu) {
2465 *balance = 0;
2466 return;
2467 }
2468
2469 update_group_power(sd, this_cpu);
2470
2471 /* Adjust by relative CPU power of the group */
2472 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2473
2474 /*
2475 * Consider the group unbalanced when the imbalance is larger
2476 * than the average weight of two tasks.
2477 *
2478 * APZ: with cgroup the avg task weight can vary wildly and
2479 * might not be a suitable number - should we keep a
2480 * normalized nr_running number somewhere that negates
2481 * the hierarchy?
2482 */
2483 if (sgs->sum_nr_running)
2484 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2485
2486 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
2487 sgs->group_imb = 1;
2488
2489 sgs->group_capacity =
2490 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2491}
2492
2493/**
2494 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
2495 * @sd: sched_domain whose statistics are to be updated.
2496 * @this_cpu: Cpu for which load balance is currently performed.
2497 * @idle: Idle status of this_cpu
2498 * @sd_idle: Idle status of the sched_domain containing group.
2499 * @cpus: Set of cpus considered for load balancing.
2500 * @balance: Should we balance.
2501 * @sds: variable to hold the statistics for this sched_domain.
2502 */
2503static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2504 enum cpu_idle_type idle, int *sd_idle,
2505 const struct cpumask *cpus, int *balance,
2506 struct sd_lb_stats *sds)
2507{
2508 struct sched_domain *child = sd->child;
2509 struct sched_group *group = sd->groups;
2510 struct sg_lb_stats sgs;
2511 int load_idx, prefer_sibling = 0;
2512
2513 if (child && child->flags & SD_PREFER_SIBLING)
2514 prefer_sibling = 1;
2515
2516 init_sd_power_savings_stats(sd, sds, idle);
2517 load_idx = get_sd_load_idx(sd, idle);
2518
2519 do {
2520 int local_group;
2521
2522 local_group = cpumask_test_cpu(this_cpu,
2523 sched_group_cpus(group));
2524 memset(&sgs, 0, sizeof(sgs));
2525 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
2526 local_group, cpus, balance, &sgs);
2527
2528 if (local_group && !(*balance))
2529 return;
2530
2531 sds->total_load += sgs.group_load;
2532 sds->total_pwr += group->cpu_power;
2533
2534 /*
2535 * In case the child domain prefers tasks go to siblings
2536 * first, lower the group capacity to one so that we'll try
2537 * and move all the excess tasks away.
2538 */
2539 if (prefer_sibling)
2540 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2541
2542 if (local_group) {
2543 sds->this_load = sgs.avg_load;
2544 sds->this = group;
2545 sds->this_nr_running = sgs.sum_nr_running;
2546 sds->this_load_per_task = sgs.sum_weighted_load;
2547 } else if (sgs.avg_load > sds->max_load &&
2548 (sgs.sum_nr_running > sgs.group_capacity ||
2549 sgs.group_imb)) {
2550 sds->max_load = sgs.avg_load;
2551 sds->busiest = group;
2552 sds->busiest_nr_running = sgs.sum_nr_running;
2553 sds->busiest_group_capacity = sgs.group_capacity;
2554 sds->busiest_load_per_task = sgs.sum_weighted_load;
2555 sds->group_imb = sgs.group_imb;
2556 }
2557
2558 update_sd_power_savings_stats(group, sds, local_group, &sgs);
2559 group = group->next;
2560 } while (group != sd->groups);
2561}
2562
2563/**
2564 * fix_small_imbalance - Calculate the minor imbalance that exists
2565 * amongst the groups of a sched_domain, during
2566 * load balancing.
2567 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
2568 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2569 * @imbalance: Variable to store the imbalance.
2570 */
2571static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2572 int this_cpu, unsigned long *imbalance)
2573{
2574 unsigned long tmp, pwr_now = 0, pwr_move = 0;
2575 unsigned int imbn = 2;
2576 unsigned long scaled_busy_load_per_task;
2577
2578 if (sds->this_nr_running) {
2579 sds->this_load_per_task /= sds->this_nr_running;
2580 if (sds->busiest_load_per_task >
2581 sds->this_load_per_task)
2582 imbn = 1;
2583 } else
2584 sds->this_load_per_task =
2585 cpu_avg_load_per_task(this_cpu);
2586
2587 scaled_busy_load_per_task = sds->busiest_load_per_task
2588 * SCHED_LOAD_SCALE;
2589 scaled_busy_load_per_task /= sds->busiest->cpu_power;
2590
2591 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2592 (scaled_busy_load_per_task * imbn)) {
2593 *imbalance = sds->busiest_load_per_task;
2594 return;
2595 }
2596
2597 /*
2598 * OK, we don't have enough imbalance to justify moving tasks,
2599 * however we may be able to increase total CPU power used by
2600 * moving them.
2601 */
2602
2603 pwr_now += sds->busiest->cpu_power *
2604 min(sds->busiest_load_per_task, sds->max_load);
2605 pwr_now += sds->this->cpu_power *
2606 min(sds->this_load_per_task, sds->this_load);
2607 pwr_now /= SCHED_LOAD_SCALE;
2608
2609 /* Amount of load we'd subtract */
2610 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2611 sds->busiest->cpu_power;
2612 if (sds->max_load > tmp)
2613 pwr_move += sds->busiest->cpu_power *
2614 min(sds->busiest_load_per_task, sds->max_load - tmp);
2615
2616 /* Amount of load we'd add */
2617 if (sds->max_load * sds->busiest->cpu_power <
2618 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
2619 tmp = (sds->max_load * sds->busiest->cpu_power) /
2620 sds->this->cpu_power;
2621 else
2622 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2623 sds->this->cpu_power;
2624 pwr_move += sds->this->cpu_power *
2625 min(sds->this_load_per_task, sds->this_load + tmp);
2626 pwr_move /= SCHED_LOAD_SCALE;
2627
2628 /* Move if we gain throughput */
2629 if (pwr_move > pwr_now)
2630 *imbalance = sds->busiest_load_per_task;
2631}
2632
2633/**
2634 * calculate_imbalance - Calculate the amount of imbalance present within the
2635 * groups of a given sched_domain during load balance.
2636 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
2637 * @this_cpu: Cpu for which currently load balance is being performed.
2638 * @imbalance: The variable to store the imbalance.
2639 */
2640static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2641 unsigned long *imbalance)
2642{
2643 unsigned long max_pull, load_above_capacity = ~0UL;
2644
2645 sds->busiest_load_per_task /= sds->busiest_nr_running;
2646 if (sds->group_imb) {
2647 sds->busiest_load_per_task =
2648 min(sds->busiest_load_per_task, sds->avg_load);
2649 }
2650
2651 /*
2652 * In the presence of smp nice balancing, certain scenarios can have
2653 * max load less than avg load(as we skip the groups at or below
2654 * its cpu_power, while calculating max_load..)
2655 */
2656 if (sds->max_load < sds->avg_load) {
2657 *imbalance = 0;
2658 return fix_small_imbalance(sds, this_cpu, imbalance);
2659 }
2660
2661 if (!sds->group_imb) {
2662 /*
2663 * Don't want to pull so many tasks that a group would go idle.
2664 */
2665 load_above_capacity = (sds->busiest_nr_running -
2666 sds->busiest_group_capacity);
2667
2668 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
2669
2670 load_above_capacity /= sds->busiest->cpu_power;
2671 }
2672
2673 /*
2674 * We're trying to get all the cpus to the average_load, so we don't
2675 * want to push ourselves above the average load, nor do we wish to
2676 * reduce the max loaded cpu below the average load. At the same time,
2677 * we also don't want to reduce the group load below the group capacity
2678 * (so that we can implement power-savings policies etc). Thus we look
2679 * for the minimum possible imbalance.
2680 * Be careful of negative numbers as they'll appear as very large values
2681 * with unsigned longs.
2682 */
2683 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
2684
2685 /* How much load to actually move to equalise the imbalance */
2686 *imbalance = min(max_pull * sds->busiest->cpu_power,
2687 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
2688 / SCHED_LOAD_SCALE;
2689
2690 /*
2691 * if *imbalance is less than the average load per runnable task
2692 * there is no guarantee that any tasks will be moved so we'll have
2693 * a think about bumping its value to force at least one task to be
2694 * moved
2695 */
2696 if (*imbalance < sds->busiest_load_per_task)
2697 return fix_small_imbalance(sds, this_cpu, imbalance);
2698
2699}
2700/******* find_busiest_group() helpers end here *********************/
2701
2702/**
2703 * find_busiest_group - Returns the busiest group within the sched_domain
2704 * if there is an imbalance. If there isn't an imbalance, and
2705 * the user has opted for power-savings, it returns a group whose
2706 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
2707 * such a group exists.
2708 *
2709 * Also calculates the amount of weighted load which should be moved
2710 * to restore balance.
2711 *
2712 * @sd: The sched_domain whose busiest group is to be returned.
2713 * @this_cpu: The cpu for which load balancing is currently being performed.
2714 * @imbalance: Variable which stores amount of weighted load which should
2715 * be moved to restore balance/put a group to idle.
2716 * @idle: The idle status of this_cpu.
2717 * @sd_idle: The idleness of sd
2718 * @cpus: The set of CPUs under consideration for load-balancing.
2719 * @balance: Pointer to a variable indicating if this_cpu
2720 * is the appropriate cpu to perform load balancing at this_level.
2721 *
2722 * Returns: - the busiest group if imbalance exists.
2723 * - If no imbalance and user has opted for power-savings balance,
2724 * return the least loaded group whose CPUs can be
2725 * put to idle by rebalancing its tasks onto our group.
2726 */
2727static struct sched_group *
2728find_busiest_group(struct sched_domain *sd, int this_cpu,
2729 unsigned long *imbalance, enum cpu_idle_type idle,
2730 int *sd_idle, const struct cpumask *cpus, int *balance)
2731{
2732 struct sd_lb_stats sds;
2733
2734 memset(&sds, 0, sizeof(sds));
2735
2736 /*
2737 * Compute the various statistics relevant for load balancing at
2738 * this level.
2739 */
2740 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
2741 balance, &sds);
2742
2743 /* Cases where imbalance does not exist from POV of this_cpu */
2744 /* 1) this_cpu is not the appropriate cpu to perform load balancing
2745 * at this level.
2746 * 2) There is no busy sibling group to pull from.
2747 * 3) This group is the busiest group.
2748 * 4) This group is more busy than the avg busyness at this
2749 * sched_domain.
2750 * 5) The imbalance is within the specified limit.
2751 */
2752 if (!(*balance))
2753 goto ret;
2754
2755 if (!sds.busiest || sds.busiest_nr_running == 0)
2756 goto out_balanced;
2757
2758 if (sds.this_load >= sds.max_load)
2759 goto out_balanced;
2760
2761 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
2762
2763 if (sds.this_load >= sds.avg_load)
2764 goto out_balanced;
2765
2766 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2767 goto out_balanced;
2768
2769 /* Looks like there is an imbalance. Compute it */
2770 calculate_imbalance(&sds, this_cpu, imbalance);
2771 return sds.busiest;
2772
2773out_balanced:
2774 /*
2775 * There is no obvious imbalance. But check if we can do some balancing
2776 * to save power.
2777 */
2778 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
2779 return sds.busiest;
2780ret:
2781 *imbalance = 0;
2782 return NULL;
2783}
2784
2785/*
2786 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2787 */
2788static struct rq *
2789find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2790 unsigned long imbalance, const struct cpumask *cpus)
2791{
2792 struct rq *busiest = NULL, *rq;
2793 unsigned long max_load = 0;
2794 int i;
2795
2796 for_each_cpu(i, sched_group_cpus(group)) {
2797 unsigned long power = power_of(i);
2798 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2799 unsigned long wl;
2800
2801 if (!cpumask_test_cpu(i, cpus))
2802 continue;
2803
2804 rq = cpu_rq(i);
2805 wl = weighted_cpuload(i);
2806
2807 /*
2808 * When comparing with imbalance, use weighted_cpuload()
2809 * which is not scaled with the cpu power.
2810 */
2811 if (capacity && rq->nr_running == 1 && wl > imbalance)
2812 continue;
2813
2814 /*
2815 * For the load comparisons with the other cpu's, consider
2816 * the weighted_cpuload() scaled with the cpu power, so that
2817 * the load can be moved away from the cpu that is potentially
2818 * running at a lower capacity.
2819 */
2820 wl = (wl * SCHED_LOAD_SCALE) / power;
2821
2822 if (wl > max_load) {
2823 max_load = wl;
2824 busiest = rq;
2825 }
2826 }
2827
2828 return busiest;
2829}
2830
2831/*
2832 * Max backoff if we encounter pinned tasks. The value is fairly arbitrary;
2833 * it only needs to be large enough.
2834 */
2835#define MAX_PINNED_INTERVAL 512
2836
2837/* Working cpumask for load_balance and load_balance_newidle. */
2838static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2839
2840static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
1898{ 2841{
1899 struct cfs_rq *busy_cfs_rq; 2842 if (idle == CPU_NEWLY_IDLE) {
1900 struct rq_iterator cfs_rq_iterator; 2843 /*
2844 * The only task running in a non-idle cpu can be moved to this
2845 * cpu in an attempt to completely freeup the other CPU
2846 * package.
2847 *
2848 * The package power saving logic comes from
2849 * find_busiest_group(). If there is no imbalance, then
2850 * f_b_g() will return NULL. However when sched_mc={1,2} then
2851 * f_b_g() will select a group from which a running task may be
2852 * pulled to this cpu in order to make the other package idle.
2853 * If there is no opportunity to make a package idle and if
2854 * there is no imbalance, then f_b_g() will return NULL and no
2855 * action will be taken in load_balance_newidle().
2856 *
2857 * Under normal task pull operation due to imbalance, there
2858 * will be more than one task in the source run queue and
2859 * move_tasks() will succeed. ld_moved will be true and this
2860 * active balance code will not be triggered.
2861 */
2862 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2863 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2864 return 0;
2865
2866 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
2867 return 0;
2868 }
2869
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871}
2872
2873/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance.
2876 */
2877static int load_balance(int this_cpu, struct rq *this_rq,
2878 struct sched_domain *sd, enum cpu_idle_type idle,
2879 int *balance)
2880{
2881 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2882 struct sched_group *group;
2883 unsigned long imbalance;
2884 struct rq *busiest;
2885 unsigned long flags;
2886 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
2887
2888 cpumask_copy(cpus, cpu_active_mask);
2889
2890 /*
2891 * When power savings policy is enabled for the parent domain, idle
2892 * sibling can pick up load irrespective of busy siblings. In this case,
2893 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2894 * portraying it as CPU_NOT_IDLE.
2895 */
2896 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2897 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2898 sd_idle = 1;
2899
2900 schedstat_inc(sd, lb_count[idle]);
1901 2901
1902 cfs_rq_iterator.start = load_balance_start_fair; 2902redo:
1903 cfs_rq_iterator.next = load_balance_next_fair; 2903 update_shares(sd);
2904 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2905 cpus, balance);
1904 2906
1905 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 2907 if (*balance == 0)
2908 goto out_balanced;
2909
2910 if (!group) {
2911 schedstat_inc(sd, lb_nobusyg[idle]);
2912 goto out_balanced;
2913 }
2914
2915 busiest = find_busiest_queue(group, idle, imbalance, cpus);
2916 if (!busiest) {
2917 schedstat_inc(sd, lb_nobusyq[idle]);
2918 goto out_balanced;
2919 }
2920
2921 BUG_ON(busiest == this_rq);
2922
2923 schedstat_add(sd, lb_imbalance[idle], imbalance);
2924
2925 ld_moved = 0;
2926 if (busiest->nr_running > 1) {
1906 /* 2927 /*
1907 * pass busy_cfs_rq argument into 2928 * Attempt to move tasks. If find_busiest_group has found
1908 * load_balance_[start|next]_fair iterators 2929 * an imbalance but busiest->nr_running <= 1, the group is
2930 * still unbalanced. ld_moved simply stays zero, so it is
2931 * correctly treated as an imbalance.
1909 */ 2932 */
1910 cfs_rq_iterator.arg = busy_cfs_rq; 2933 local_irq_save(flags);
1911 if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 2934 double_rq_lock(this_rq, busiest);
1912 &cfs_rq_iterator)) 2935 ld_moved = move_tasks(this_rq, this_cpu, busiest,
1913 return 1; 2936 imbalance, sd, idle, &all_pinned);
2937 double_rq_unlock(this_rq, busiest);
2938 local_irq_restore(flags);
2939
2940 /*
2941 * some other cpu did the load balance for us.
2942 */
2943 if (ld_moved && this_cpu != smp_processor_id())
2944 resched_cpu(this_cpu);
2945
2946 /* All tasks on this runqueue were pinned by CPU affinity */
2947 if (unlikely(all_pinned)) {
2948 cpumask_clear_cpu(cpu_of(busiest), cpus);
2949 if (!cpumask_empty(cpus))
2950 goto redo;
2951 goto out_balanced;
2952 }
1914 } 2953 }
1915 2954
2955 if (!ld_moved) {
2956 schedstat_inc(sd, lb_failed[idle]);
2957 sd->nr_balance_failed++;
2958
2959 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags);
2961
2962 /* don't kick the migration_thread, if the curr
2963 * task on busiest cpu can't be moved to this_cpu
2964 */
2965 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) {
2967 raw_spin_unlock_irqrestore(&busiest->lock,
2968 flags);
2969 all_pinned = 1;
2970 goto out_one_pinned;
2971 }
2972
2973 if (!busiest->active_balance) {
2974 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu;
2976 active_balance = 1;
2977 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2979 if (active_balance)
2980 wake_up_process(busiest->migration_thread);
2981
2982 /*
2983 * We've kicked active balancing, reset the failure
2984 * counter.
2985 */
2986 sd->nr_balance_failed = sd->cache_nice_tries+1;
2987 }
2988 } else
2989 sd->nr_balance_failed = 0;
2990
2991 if (likely(!active_balance)) {
2992 /* We were unbalanced, so reset the balancing interval */
2993 sd->balance_interval = sd->min_interval;
2994 } else {
2995 /*
2996 * If we've begun active balancing, start to back off. This
2997 * case may not be covered by the all_pinned logic if there
2998 * is only 1 task on the busy runqueue (because we don't call
2999 * move_tasks).
3000 */
3001 if (sd->balance_interval < sd->max_interval)
3002 sd->balance_interval *= 2;
3003 }
3004
3005 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3006 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3007 ld_moved = -1;
3008
3009 goto out;
3010
3011out_balanced:
3012 schedstat_inc(sd, lb_balanced[idle]);
3013
3014 sd->nr_balance_failed = 0;
3015
3016out_one_pinned:
3017 /* tune up the balancing interval */
3018 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3019 (sd->balance_interval < sd->max_interval))
3020 sd->balance_interval *= 2;
3021
3022 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3023 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3024 ld_moved = -1;
3025 else
3026 ld_moved = 0;
3027out:
3028 if (ld_moved)
3029 update_shares(sd);
3030 return ld_moved;
3031}
3032
3033/*
3034 * idle_balance is called by schedule() if this_cpu is about to become
3035 * idle. Attempts to pull tasks from other CPUs.
3036 */
3037static void idle_balance(int this_cpu, struct rq *this_rq)
3038{
3039 struct sched_domain *sd;
3040 int pulled_task = 0;
3041 unsigned long next_balance = jiffies + HZ;
3042
3043 this_rq->idle_stamp = this_rq->clock;
3044
3045 if (this_rq->avg_idle < sysctl_sched_migration_cost)
3046 return;
3047
3048 /*
3049 * Drop the rq->lock, but keep IRQ/preempt disabled.
3050 */
3051 raw_spin_unlock(&this_rq->lock);
3052
3053 for_each_domain(this_cpu, sd) {
3054 unsigned long interval;
3055 int balance = 1;
3056
3057 if (!(sd->flags & SD_LOAD_BALANCE))
3058 continue;
3059
3060 if (sd->flags & SD_BALANCE_NEWIDLE) {
3061 /* If we've pulled tasks over stop searching: */
3062 pulled_task = load_balance(this_cpu, this_rq,
3063 sd, CPU_NEWLY_IDLE, &balance);
3064 }
3065
3066 interval = msecs_to_jiffies(sd->balance_interval);
3067 if (time_after(next_balance, sd->last_balance + interval))
3068 next_balance = sd->last_balance + interval;
3069 if (pulled_task) {
3070 this_rq->idle_stamp = 0;
3071 break;
3072 }
3073 }
3074
3075 raw_spin_lock(&this_rq->lock);
3076
3077 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3078 /*
3079 * We are going idle. next_balance may be set based on
3080 * a busy processor. So reset next_balance.
3081 */
3082 this_rq->next_balance = next_balance;
3083 }
3084}
3085
3086/*
3087 * active_load_balance is run by migration threads. It pushes running tasks
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
3089 * running on each physical CPU where possible, and avoids physical /
3090 * logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3095{
3096 int target_cpu = busiest_rq->push_cpu;
3097 struct sched_domain *sd;
3098 struct rq *target_rq;
3099
3100 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1)
3102 return;
3103
3104 target_rq = cpu_rq(target_cpu);
3105
3106 /*
3107 * This condition is "impossible", if it occurs
3108 * we need to fix it. Originally reported by
3109 * Bjorn Helgaas on a 128-cpu setup.
3110 */
3111 BUG_ON(busiest_rq == target_rq);
3112
3113 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117
3118 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) {
3120 if ((sd->flags & SD_LOAD_BALANCE) &&
3121 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3122 break;
3123 }
3124
3125 if (likely(sd)) {
3126 schedstat_inc(sd, alb_count);
3127
3128 if (move_one_task(target_rq, target_cpu, busiest_rq,
3129 sd, CPU_IDLE))
3130 schedstat_inc(sd, alb_pushed);
3131 else
3132 schedstat_inc(sd, alb_failed);
3133 }
3134 double_unlock_balance(busiest_rq, target_rq);
3135}
3136
3137#ifdef CONFIG_NO_HZ
3138static struct {
3139 atomic_t load_balancer;
3140 cpumask_var_t cpu_mask;
3141 cpumask_var_t ilb_grp_nohz_mask;
3142} nohz ____cacheline_aligned = {
3143 .load_balancer = ATOMIC_INIT(-1),
3144};
3145
3146int get_nohz_load_balancer(void)
3147{
3148 return atomic_read(&nohz.load_balancer);
3149}
3150
3151#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3152/**
3153 * lowest_flag_domain - Return lowest sched_domain containing flag.
3154 * @cpu: The cpu whose lowest level of sched domain is to
3155 * be returned.
3156 * @flag: The flag to check for the lowest sched_domain
3157 * for the given cpu.
3158 *
3159 * Returns the lowest sched_domain of a cpu which contains the given flag.
3160 */
3161static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3162{
3163 struct sched_domain *sd;
3164
3165 for_each_domain(cpu, sd)
3166 if (sd && (sd->flags & flag))
3167 break;
3168
3169 return sd;
3170}
3171
3172/**
3173 * for_each_flag_domain - Iterates over sched_domains containing the flag.
3174 * @cpu: The cpu whose domains we're iterating over.
3175 * @sd: variable holding the value of the power_savings_sd
3176 * for cpu.
3177 * @flag: The flag to filter the sched_domains to be iterated.
3178 *
3179 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
3180 * set, starting from the lowest sched_domain to the highest.
3181 */
3182#define for_each_flag_domain(cpu, sd, flag) \
3183 for (sd = lowest_flag_domain(cpu, flag); \
3184 (sd && (sd->flags & flag)); sd = sd->parent)
3185
3186/**
3187 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
3188 * @ilb_group: group to be checked for semi-idleness
3189 *
3190 * Returns: 1 if the group is semi-idle. 0 otherwise.
3191 *
3192 * We define a sched_group to be semi idle if it has atleast one idle-CPU
3193 * and atleast one non-idle CPU. This helper function checks if the given
3194 * sched_group is semi-idle or not.
3195 */
3196static inline int is_semi_idle_group(struct sched_group *ilb_group)
3197{
3198 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
3199 sched_group_cpus(ilb_group));
3200
3201 /*
3202 * A sched_group is semi-idle when it has atleast one busy cpu
3203 * and atleast one idle cpu.
3204 */
3205 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
3206 return 0;
3207
3208 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
3209 return 0;
3210
3211 return 1;
3212}
3213/**
3214 * find_new_ilb - Finds the optimum idle load balancer for nomination.
3215 * @cpu: The cpu which is nominating a new idle_load_balancer.
3216 *
3217 * Returns: Returns the id of the idle load balancer if it exists,
3218 * Else, returns >= nr_cpu_ids.
3219 *
3220 * This algorithm picks the idle load balancer such that it belongs to a
3221 * semi-idle powersavings sched_domain. The idea is to try and avoid
3222 * completely idle packages/cores just for the purpose of idle load balancing
3223 * when there are other idle cpu's which are better suited for that job.
3224 */
3225static int find_new_ilb(int cpu)
3226{
3227 struct sched_domain *sd;
3228 struct sched_group *ilb_group;
3229
3230 /*
3231 * Have idle load balancer selection from semi-idle packages only
3232 * when power-aware load balancing is enabled
3233 */
3234 if (!(sched_smt_power_savings || sched_mc_power_savings))
3235 goto out_done;
3236
3237 /*
3238 * Optimize for the case when we have no idle CPUs or only one
3239 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3240 */
3241 if (cpumask_weight(nohz.cpu_mask) < 2)
3242 goto out_done;
3243
3244 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3245 ilb_group = sd->groups;
3246
3247 do {
3248 if (is_semi_idle_group(ilb_group))
3249 return cpumask_first(nohz.ilb_grp_nohz_mask);
3250
3251 ilb_group = ilb_group->next;
3252
3253 } while (ilb_group != sd->groups);
3254 }
3255
3256out_done:
3257 return cpumask_first(nohz.cpu_mask);
3258}
3259#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3260static inline int find_new_ilb(int call_cpu)
3261{
3262 return cpumask_first(nohz.cpu_mask);
3263}
3264#endif
3265
3266/*
3267 * This routine will try to nominate the ilb (idle load balancing)
3268 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3269 * load balancing on behalf of all those cpus. If all the cpus in the system
3270 * go into this tickless mode, then there will be no ilb owner (as there is
3271 * no need for one) and all the cpus will sleep till the next wakeup event
3272 * arrives...
3273 *
3274 * For the ilb owner, tick is not stopped. And this tick will be used
3275 * for idle load balancing. ilb owner will still be part of
3276 * nohz.cpu_mask..
3277 *
3278 * While stopping the tick, this cpu will become the ilb owner if there
3279 * is no other owner. And will be the owner till that cpu becomes busy
3280 * or if all cpus in the system stop their ticks at which point
3281 * there is no need for ilb owner.
3282 *
3283 * When the ilb owner becomes busy, it nominates another owner, during the
3284 * next busy scheduler_tick()
3285 */
3286int select_nohz_load_balancer(int stop_tick)
3287{
3288 int cpu = smp_processor_id();
3289
3290 if (stop_tick) {
3291 cpu_rq(cpu)->in_nohz_recently = 1;
3292
3293 if (!cpu_active(cpu)) {
3294 if (atomic_read(&nohz.load_balancer) != cpu)
3295 return 0;
3296
3297 /*
3298 * If we are going offline and still the leader,
3299 * give up!
3300 */
3301 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3302 BUG();
3303
3304 return 0;
3305 }
3306
3307 cpumask_set_cpu(cpu, nohz.cpu_mask);
3308
3309 /* time for ilb owner also to sleep */
3310 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
3311 if (atomic_read(&nohz.load_balancer) == cpu)
3312 atomic_set(&nohz.load_balancer, -1);
3313 return 0;
3314 }
3315
3316 if (atomic_read(&nohz.load_balancer) == -1) {
3317 /* make me the ilb owner */
3318 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3319 return 1;
3320 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3321 int new_ilb;
3322
3323 if (!(sched_smt_power_savings ||
3324 sched_mc_power_savings))
3325 return 1;
3326 /*
3327 * Check to see if there is a more power-efficient
3328 * ilb.
3329 */
3330 new_ilb = find_new_ilb(cpu);
3331 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3332 atomic_set(&nohz.load_balancer, -1);
3333 resched_cpu(new_ilb);
3334 return 0;
3335 }
3336 return 1;
3337 }
3338 } else {
3339 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3340 return 0;
3341
3342 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3343
3344 if (atomic_read(&nohz.load_balancer) == cpu)
3345 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3346 BUG();
3347 }
1916 return 0; 3348 return 0;
1917} 3349}
3350#endif
3351
3352static DEFINE_SPINLOCK(balancing);
3353
3354/*
3355 * It checks each scheduling domain to see if it is due to be balanced,
3356 * and initiates a balancing operation if so.
3357 *
3358 * Balancing parameters are set up in arch_init_sched_domains.
3359 */
3360static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3361{
3362 int balance = 1;
3363 struct rq *rq = cpu_rq(cpu);
3364 unsigned long interval;
3365 struct sched_domain *sd;
3366 /* Earliest time when we have to do rebalance again */
3367 unsigned long next_balance = jiffies + 60*HZ;
3368 int update_next_balance = 0;
3369 int need_serialize;
3370
3371 for_each_domain(cpu, sd) {
3372 if (!(sd->flags & SD_LOAD_BALANCE))
3373 continue;
3374
3375 interval = sd->balance_interval;
3376 if (idle != CPU_IDLE)
3377 interval *= sd->busy_factor;
3378
3379 /* scale ms to jiffies */
3380 interval = msecs_to_jiffies(interval);
3381 if (unlikely(!interval))
3382 interval = 1;
3383 if (interval > HZ*NR_CPUS/10)
3384 interval = HZ*NR_CPUS/10;
3385
3386 need_serialize = sd->flags & SD_SERIALIZE;
3387
3388 if (need_serialize) {
3389 if (!spin_trylock(&balancing))
3390 goto out;
3391 }
3392
3393 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3394 if (load_balance(cpu, rq, sd, idle, &balance)) {
3395 /*
3396 * We've pulled tasks over so either we're no
3397 * longer idle, or one of our SMT siblings is
3398 * not idle.
3399 */
3400 idle = CPU_NOT_IDLE;
3401 }
3402 sd->last_balance = jiffies;
3403 }
3404 if (need_serialize)
3405 spin_unlock(&balancing);
3406out:
3407 if (time_after(next_balance, sd->last_balance + interval)) {
3408 next_balance = sd->last_balance + interval;
3409 update_next_balance = 1;
3410 }
3411
3412 /*
3413 * Stop the load balance at this level. There is another
3414 * CPU in our sched group which is doing load balancing more
3415 * actively.
3416 */
3417 if (!balance)
3418 break;
3419 }
3420
3421 /*
3422 * next_balance will be updated only when there is a need.
3423 * When the cpu is attached to null domain for ex, it will not be
3424 * updated.
3425 */
3426 if (likely(update_next_balance))
3427 rq->next_balance = next_balance;
3428}
3429
3430/*
3431 * run_rebalance_domains is triggered when needed from the scheduler tick.
3432 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3433 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3434 */
3435static void run_rebalance_domains(struct softirq_action *h)
3436{
3437 int this_cpu = smp_processor_id();
3438 struct rq *this_rq = cpu_rq(this_cpu);
3439 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3440 CPU_IDLE : CPU_NOT_IDLE;
3441
3442 rebalance_domains(this_cpu, idle);
3443
3444#ifdef CONFIG_NO_HZ
3445 /*
3446 * If this cpu is the owner for idle load balancing, then do the
3447 * balancing on behalf of the other idle cpus whose ticks are
3448 * stopped.
3449 */
3450 if (this_rq->idle_at_tick &&
3451 atomic_read(&nohz.load_balancer) == this_cpu) {
3452 struct rq *rq;
3453 int balance_cpu;
3454
3455 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3456 if (balance_cpu == this_cpu)
3457 continue;
3458
3459 /*
3460 * If this cpu gets work to do, stop the load balancing
3461 * work being done for other cpus. Next load
3462 * balancing owner will pick it up.
3463 */
3464 if (need_resched())
3465 break;
3466
3467 rebalance_domains(balance_cpu, CPU_IDLE);
3468
3469 rq = cpu_rq(balance_cpu);
3470 if (time_after(this_rq->next_balance, rq->next_balance))
3471 this_rq->next_balance = rq->next_balance;
3472 }
3473 }
3474#endif
3475}
3476
3477static inline int on_null_domain(int cpu)
3478{
3479 return !rcu_dereference(cpu_rq(cpu)->sd);
3480}
3481
3482/*
3483 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3484 *
3485 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3486 * idle load balancing owner or decide to stop the periodic load balancing,
3487 * if the whole system is idle.
3488 */
3489static inline void trigger_load_balance(struct rq *rq, int cpu)
3490{
3491#ifdef CONFIG_NO_HZ
3492 /*
3493 * If we were in the nohz mode recently and busy at the current
3494 * scheduler tick, then check if we need to nominate new idle
3495 * load balancer.
3496 */
3497 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3498 rq->in_nohz_recently = 0;
3499
3500 if (atomic_read(&nohz.load_balancer) == cpu) {
3501 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3502 atomic_set(&nohz.load_balancer, -1);
3503 }
3504
3505 if (atomic_read(&nohz.load_balancer) == -1) {
3506 int ilb = find_new_ilb(cpu);
3507
3508 if (ilb < nr_cpu_ids)
3509 resched_cpu(ilb);
3510 }
3511 }
3512
3513 /*
3514 * If this cpu is idle and doing idle load balancing for all the
3515 * cpus with ticks stopped, is it time for that to stop?
3516 */
3517 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3518 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3519 resched_cpu(cpu);
3520 return;
3521 }
3522
3523 /*
3524 * If this cpu is idle and the idle load balancing is done by
3525 * someone else, then no need raise the SCHED_SOFTIRQ
3526 */
3527 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3528 cpumask_test_cpu(cpu, nohz.cpu_mask))
3529 return;
3530#endif
3531 /* Don't need to rebalance while attached to NULL domain */
3532 if (time_after_eq(jiffies, rq->next_balance) &&
3533 likely(!on_null_domain(cpu)))
3534 raise_softirq(SCHED_SOFTIRQ);
3535}
1918 3536
1919static void rq_online_fair(struct rq *rq) 3537static void rq_online_fair(struct rq *rq)
1920{ 3538{
@@ -1926,6 +3544,15 @@ static void rq_offline_fair(struct rq *rq)
1926 update_sysctl(); 3544 update_sysctl();
1927} 3545}
1928 3546
3547#else /* CONFIG_SMP */
3548
3549/*
3550 * on UP we do not need to balance between CPUs:
3551 */
3552static inline void idle_balance(int cpu, struct rq *rq)
3553{
3554}
3555
1929#endif /* CONFIG_SMP */ 3556#endif /* CONFIG_SMP */
1930 3557
1931/* 3558/*
@@ -1975,6 +3602,8 @@ static void task_fork_fair(struct task_struct *p)
1975 resched_task(rq->curr); 3602 resched_task(rq->curr);
1976 } 3603 }
1977 3604
3605 se->vruntime -= cfs_rq->min_vruntime;
3606
1978 raw_spin_unlock_irqrestore(&rq->lock, flags); 3607 raw_spin_unlock_irqrestore(&rq->lock, flags);
1979} 3608}
1980 3609
@@ -2028,16 +3657,17 @@ static void set_curr_task_fair(struct rq *rq)
2028} 3657}
2029 3658
2030#ifdef CONFIG_FAIR_GROUP_SCHED 3659#ifdef CONFIG_FAIR_GROUP_SCHED
2031static void moved_group_fair(struct task_struct *p) 3660static void moved_group_fair(struct task_struct *p, int on_rq)
2032{ 3661{
2033 struct cfs_rq *cfs_rq = task_cfs_rq(p); 3662 struct cfs_rq *cfs_rq = task_cfs_rq(p);
2034 3663
2035 update_curr(cfs_rq); 3664 update_curr(cfs_rq);
2036 place_entity(cfs_rq, &p->se, 1); 3665 if (!on_rq)
3666 place_entity(cfs_rq, &p->se, 1);
2037} 3667}
2038#endif 3668#endif
2039 3669
2040unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 3670static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
2041{ 3671{
2042 struct sched_entity *se = &task->se; 3672 struct sched_entity *se = &task->se;
2043 unsigned int rr_interval = 0; 3673 unsigned int rr_interval = 0;
@@ -2069,10 +3699,10 @@ static const struct sched_class fair_sched_class = {
2069#ifdef CONFIG_SMP 3699#ifdef CONFIG_SMP
2070 .select_task_rq = select_task_rq_fair, 3700 .select_task_rq = select_task_rq_fair,
2071 3701
2072 .load_balance = load_balance_fair,
2073 .move_one_task = move_one_task_fair,
2074 .rq_online = rq_online_fair, 3702 .rq_online = rq_online_fair,
2075 .rq_offline = rq_offline_fair, 3703 .rq_offline = rq_offline_fair,
3704
3705 .task_waking = task_waking_fair,
2076#endif 3706#endif
2077 3707
2078 .set_curr_task = set_curr_task_fair, 3708 .set_curr_task = set_curr_task_fair,
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 5f93b570d383..a8a6d8a50947 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
44{ 44{
45} 45}
46 46
47#ifdef CONFIG_SMP
48static unsigned long
49load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
50 unsigned long max_load_move,
51 struct sched_domain *sd, enum cpu_idle_type idle,
52 int *all_pinned, int *this_best_prio)
53{
54 return 0;
55}
56
57static int
58move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
59 struct sched_domain *sd, enum cpu_idle_type idle)
60{
61 return 0;
62}
63#endif
64
65static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 47static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
66{ 48{
67} 49}
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 79 check_preempt_curr(rq, p, 0);
98} 80}
99 81
100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 83{
102 return 0; 84 return 0;
103} 85}
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = {
119 101
120#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
121 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
122
123 .load_balance = load_balance_idle,
124 .move_one_task = move_one_task_idle,
125#endif 104#endif
126 105
127 .set_curr_task = set_curr_task_idle, 106 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d2ea2828164e..5a6ed1f0990a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
194 return rt_se->my_q; 194 return rt_se->my_q;
195} 195}
196 196
197static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 197static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
198static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 198static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
199 199
200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 int this_cpu = smp_processor_id();
202 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 203 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
203 struct sched_rt_entity *rt_se = rt_rq->rt_se; 204 struct sched_rt_entity *rt_se;
205
206 rt_se = rt_rq->tg->rt_se[this_cpu];
204 207
205 if (rt_rq->rt_nr_running) { 208 if (rt_rq->rt_nr_running) {
206 if (rt_se && !on_rt_rq(rt_se)) 209 if (rt_se && !on_rt_rq(rt_se))
207 enqueue_rt_entity(rt_se); 210 enqueue_rt_entity(rt_se, false);
208 if (rt_rq->highest_prio.curr < curr->prio) 211 if (rt_rq->highest_prio.curr < curr->prio)
209 resched_task(curr); 212 resched_task(curr);
210 } 213 }
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212 215
213static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 216static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
214{ 217{
215 struct sched_rt_entity *rt_se = rt_rq->rt_se; 218 int this_cpu = smp_processor_id();
219 struct sched_rt_entity *rt_se;
220
221 rt_se = rt_rq->tg->rt_se[this_cpu];
216 222
217 if (rt_se && on_rt_rq(rt_se)) 223 if (rt_se && on_rt_rq(rt_se))
218 dequeue_rt_entity(rt_se); 224 dequeue_rt_entity(rt_se);
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
803 dec_rt_group(rt_se, rt_rq); 809 dec_rt_group(rt_se, rt_rq);
804} 810}
805 811
806static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 812static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
807{ 813{
808 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 814 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
809 struct rt_prio_array *array = &rt_rq->active; 815 struct rt_prio_array *array = &rt_rq->active;
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
819 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
820 return; 826 return;
821 827
822 list_add_tail(&rt_se->run_list, queue); 828 if (head)
829 list_add(&rt_se->run_list, queue);
830 else
831 list_add_tail(&rt_se->run_list, queue);
823 __set_bit(rt_se_prio(rt_se), array->bitmap); 832 __set_bit(rt_se_prio(rt_se), array->bitmap);
824 833
825 inc_rt_tasks(rt_se, rt_rq); 834 inc_rt_tasks(rt_se, rt_rq);
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
856 } 865 }
857} 866}
858 867
859static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 868static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
860{ 869{
861 dequeue_rt_stack(rt_se); 870 dequeue_rt_stack(rt_se);
862 for_each_sched_rt_entity(rt_se) 871 for_each_sched_rt_entity(rt_se)
863 __enqueue_rt_entity(rt_se); 872 __enqueue_rt_entity(rt_se, head);
864} 873}
865 874
866static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 875static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
@@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
871 struct rt_rq *rt_rq = group_rt_rq(rt_se); 880 struct rt_rq *rt_rq = group_rt_rq(rt_se);
872 881
873 if (rt_rq && rt_rq->rt_nr_running) 882 if (rt_rq && rt_rq->rt_nr_running)
874 __enqueue_rt_entity(rt_se); 883 __enqueue_rt_entity(rt_se, false);
875 } 884 }
876} 885}
877 886
878/* 887/*
879 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
880 */ 889 */
881static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
882{ 892{
883 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
884 894
885 if (wakeup) 895 if (wakeup)
886 rt_se->timeout = 0; 896 rt_se->timeout = 0;
887 897
888 enqueue_rt_entity(rt_se); 898 enqueue_rt_entity(rt_se, head);
889 899
890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
891 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
@@ -1472,7 +1482,7 @@ static void post_schedule_rt(struct rq *rq)
1472 * If we are not running and we are not going to reschedule soon, we should 1482 * If we are not running and we are not going to reschedule soon, we should
1473 * try to push tasks away now 1483 * try to push tasks away now
1474 */ 1484 */
1475static void task_wake_up_rt(struct rq *rq, struct task_struct *p) 1485static void task_woken_rt(struct rq *rq, struct task_struct *p)
1476{ 1486{
1477 if (!task_running(rq, p) && 1487 if (!task_running(rq, p) &&
1478 !test_tsk_need_resched(rq->curr) && 1488 !test_tsk_need_resched(rq->curr) &&
@@ -1481,24 +1491,6 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
1481 push_rt_tasks(rq); 1491 push_rt_tasks(rq);
1482} 1492}
1483 1493
1484static unsigned long
1485load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1486 unsigned long max_load_move,
1487 struct sched_domain *sd, enum cpu_idle_type idle,
1488 int *all_pinned, int *this_best_prio)
1489{
1490 /* don't touch RT tasks */
1491 return 0;
1492}
1493
1494static int
1495move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1496 struct sched_domain *sd, enum cpu_idle_type idle)
1497{
1498 /* don't touch RT tasks */
1499 return 0;
1500}
1501
1502static void set_cpus_allowed_rt(struct task_struct *p, 1494static void set_cpus_allowed_rt(struct task_struct *p,
1503 const struct cpumask *new_mask) 1495 const struct cpumask *new_mask)
1504{ 1496{
@@ -1670,8 +1662,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1670 if (!p->signal) 1662 if (!p->signal)
1671 return; 1663 return;
1672 1664
1673 soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; 1665 /* max may change after cur was read, this will be fixed next tick */
1674 hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; 1666 soft = task_rlimit(p, RLIMIT_RTTIME);
1667 hard = task_rlimit_max(p, RLIMIT_RTTIME);
1675 1668
1676 if (soft != RLIM_INFINITY) { 1669 if (soft != RLIM_INFINITY) {
1677 unsigned long next; 1670 unsigned long next;
@@ -1721,7 +1714,7 @@ static void set_curr_task_rt(struct rq *rq)
1721 dequeue_pushable_task(rq, p); 1714 dequeue_pushable_task(rq, p);
1722} 1715}
1723 1716
1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 1717static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1725{ 1718{
1726 /* 1719 /*
1727 * Time slice is 0 for SCHED_FIFO tasks 1720 * Time slice is 0 for SCHED_FIFO tasks
@@ -1746,14 +1739,12 @@ static const struct sched_class rt_sched_class = {
1746#ifdef CONFIG_SMP 1739#ifdef CONFIG_SMP
1747 .select_task_rq = select_task_rq_rt, 1740 .select_task_rq = select_task_rq_rt,
1748 1741
1749 .load_balance = load_balance_rt,
1750 .move_one_task = move_one_task_rt,
1751 .set_cpus_allowed = set_cpus_allowed_rt, 1742 .set_cpus_allowed = set_cpus_allowed_rt,
1752 .rq_online = rq_online_rt, 1743 .rq_online = rq_online_rt,
1753 .rq_offline = rq_offline_rt, 1744 .rq_offline = rq_offline_rt,
1754 .pre_schedule = pre_schedule_rt, 1745 .pre_schedule = pre_schedule_rt,
1755 .post_schedule = post_schedule_rt, 1746 .post_schedule = post_schedule_rt,
1756 .task_wake_up = task_wake_up_rt, 1747 .task_woken = task_woken_rt,
1757 .switched_from = switched_from_rt, 1748 .switched_from = switched_from_rt,
1758#endif 1749#endif
1759 1750
diff --git a/kernel/signal.c b/kernel/signal.c
index 1814e68e4de3..dbd7fe073c55 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -159,6 +159,10 @@ void recalc_sigpending(void)
159 159
160/* Given the mask, find the first available signal that should be serviced. */ 160/* Given the mask, find the first available signal that should be serviced. */
161 161
162#define SYNCHRONOUS_MASK \
163 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
164 sigmask(SIGTRAP) | sigmask(SIGFPE))
165
162int next_signal(struct sigpending *pending, sigset_t *mask) 166int next_signal(struct sigpending *pending, sigset_t *mask)
163{ 167{
164 unsigned long i, *s, *m, x; 168 unsigned long i, *s, *m, x;
@@ -166,26 +170,39 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
166 170
167 s = pending->signal.sig; 171 s = pending->signal.sig;
168 m = mask->sig; 172 m = mask->sig;
173
174 /*
175 * Handle the first word specially: it contains the
176 * synchronous signals that need to be dequeued first.
177 */
178 x = *s &~ *m;
179 if (x) {
180 if (x & SYNCHRONOUS_MASK)
181 x &= SYNCHRONOUS_MASK;
182 sig = ffz(~x) + 1;
183 return sig;
184 }
185
169 switch (_NSIG_WORDS) { 186 switch (_NSIG_WORDS) {
170 default: 187 default:
171 for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) 188 for (i = 1; i < _NSIG_WORDS; ++i) {
172 if ((x = *s &~ *m) != 0) { 189 x = *++s &~ *++m;
173 sig = ffz(~x) + i*_NSIG_BPW + 1; 190 if (!x)
174 break; 191 continue;
175 } 192 sig = ffz(~x) + i*_NSIG_BPW + 1;
193 break;
194 }
176 break; 195 break;
177 196
178 case 2: if ((x = s[0] &~ m[0]) != 0) 197 case 2:
179 sig = 1; 198 x = s[1] &~ m[1];
180 else if ((x = s[1] &~ m[1]) != 0) 199 if (!x)
181 sig = _NSIG_BPW + 1;
182 else
183 break; 200 break;
184 sig += ffz(~x); 201 sig = ffz(~x) + _NSIG_BPW + 1;
185 break; 202 break;
186 203
187 case 1: if ((x = *s &~ *m) != 0) 204 case 1:
188 sig = ffz(~x) + 1; 205 /* Nothing to do */
189 break; 206 break;
190 } 207 }
191 208
@@ -218,17 +235,17 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
218 struct user_struct *user; 235 struct user_struct *user;
219 236
220 /* 237 /*
221 * We won't get problems with the target's UID changing under us 238 * Protect access to @t credentials. This can go away when all
222 * because changing it requires RCU be used, and if t != current, the 239 * callers hold rcu read lock.
223 * caller must be holding the RCU readlock (by way of a spinlock) and
224 * we use RCU protection here
225 */ 240 */
241 rcu_read_lock();
226 user = get_uid(__task_cred(t)->user); 242 user = get_uid(__task_cred(t)->user);
227 atomic_inc(&user->sigpending); 243 atomic_inc(&user->sigpending);
244 rcu_read_unlock();
228 245
229 if (override_rlimit || 246 if (override_rlimit ||
230 atomic_read(&user->sigpending) <= 247 atomic_read(&user->sigpending) <=
231 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { 248 task_rlimit(t, RLIMIT_SIGPENDING)) {
232 q = kmem_cache_alloc(sigqueue_cachep, flags); 249 q = kmem_cache_alloc(sigqueue_cachep, flags);
233 } else { 250 } else {
234 print_dropped_signal(sig); 251 print_dropped_signal(sig);
@@ -979,7 +996,8 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
979 for (i = 0; i < 16; i++) { 996 for (i = 0; i < 16; i++) {
980 unsigned char insn; 997 unsigned char insn;
981 998
982 __get_user(insn, (unsigned char *)(regs->ip + i)); 999 if (get_user(insn, (unsigned char *)(regs->ip + i)))
1000 break;
983 printk("%02x ", insn); 1001 printk("%02x ", insn);
984 } 1002 }
985 } 1003 }
@@ -1179,11 +1197,12 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1179 int ret = -EINVAL; 1197 int ret = -EINVAL;
1180 struct task_struct *p; 1198 struct task_struct *p;
1181 const struct cred *pcred; 1199 const struct cred *pcred;
1200 unsigned long flags;
1182 1201
1183 if (!valid_signal(sig)) 1202 if (!valid_signal(sig))
1184 return ret; 1203 return ret;
1185 1204
1186 read_lock(&tasklist_lock); 1205 rcu_read_lock();
1187 p = pid_task(pid, PIDTYPE_PID); 1206 p = pid_task(pid, PIDTYPE_PID);
1188 if (!p) { 1207 if (!p) {
1189 ret = -ESRCH; 1208 ret = -ESRCH;
@@ -1199,14 +1218,16 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1199 ret = security_task_kill(p, info, sig, secid); 1218 ret = security_task_kill(p, info, sig, secid);
1200 if (ret) 1219 if (ret)
1201 goto out_unlock; 1220 goto out_unlock;
1202 if (sig && p->sighand) { 1221
1203 unsigned long flags; 1222 if (sig) {
1204 spin_lock_irqsave(&p->sighand->siglock, flags); 1223 if (lock_task_sighand(p, &flags)) {
1205 ret = __send_signal(sig, info, p, 1, 0); 1224 ret = __send_signal(sig, info, p, 1, 0);
1206 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1225 unlock_task_sighand(p, &flags);
1226 } else
1227 ret = -ESRCH;
1207 } 1228 }
1208out_unlock: 1229out_unlock:
1209 read_unlock(&tasklist_lock); 1230 rcu_read_unlock();
1210 return ret; 1231 return ret;
1211} 1232}
1212EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); 1233EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
diff --git a/kernel/smp.c b/kernel/smp.c
index de735a6637d0..9867b6bfefce 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -12,8 +12,6 @@
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/cpu.h> 13#include <linux/cpu.h>
14 14
15static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16
17static struct { 15static struct {
18 struct list_head queue; 16 struct list_head queue;
19 raw_spinlock_t lock; 17 raw_spinlock_t lock;
@@ -33,12 +31,14 @@ struct call_function_data {
33 cpumask_var_t cpumask; 31 cpumask_var_t cpumask;
34}; 32};
35 33
34static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
35
36struct call_single_queue { 36struct call_single_queue {
37 struct list_head list; 37 struct list_head list;
38 raw_spinlock_t lock; 38 raw_spinlock_t lock;
39}; 39};
40 40
41static DEFINE_PER_CPU(struct call_function_data, cfd_data); 41static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue);
42 42
43static int 43static int
44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -256,7 +256,7 @@ void generic_smp_call_function_single_interrupt(void)
256 } 256 }
257} 257}
258 258
259static DEFINE_PER_CPU(struct call_single_data, csd_data); 259static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
260 260
261/* 261/*
262 * smp_call_function_single - Run a function on a specific CPU 262 * smp_call_function_single - Run a function on a specific CPU
@@ -347,7 +347,7 @@ int smp_call_function_any(const struct cpumask *mask,
347 goto call; 347 goto call;
348 348
349 /* Try for same node. */ 349 /* Try for same node. */
350 nodemask = cpumask_of_node(cpu); 350 nodemask = cpumask_of_node(cpu_to_node(cpu));
351 for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; 351 for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
352 cpu = cpumask_next_and(cpu, nodemask, mask)) { 352 cpu = cpumask_next_and(cpu, nodemask, mask)) {
353 if (cpu_online(cpu)) 353 if (cpu_online(cpu))
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a09502e2ef75..7c1a67ef0274 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill);
500 */ 500 */
501 501
502/* 502/*
503 * The trampoline is called when the hrtimer expires. If this is 503 * The trampoline is called when the hrtimer expires. It schedules a tasklet
504 * called from the hrtimer interrupt then we schedule the tasklet as 504 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
505 * the timer callback function expects to run in softirq context. If 505 * hrtimer callback, but from softirq context.
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */ 506 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) 507static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{ 508{
511 struct tasklet_hrtimer *ttimer = 509 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer); 510 container_of(timer, struct tasklet_hrtimer, timer);
513 511
514 if (hrtimer_is_hres_active(timer)) { 512 tasklet_hi_schedule(&ttimer->tasklet);
515 tasklet_hi_schedule(&ttimer->tasklet); 513 return HRTIMER_NORESTART;
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519} 514}
520 515
521/* 516/*
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d22579087e27..0d4c7898ab80 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock);
25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ 25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ 26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
28 29
29static int __read_mostly did_panic; 30static int __read_mostly did_panic;
30int __read_mostly softlockup_thresh = 60; 31int __read_mostly softlockup_thresh = 60;
@@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void)
79} 80}
80EXPORT_SYMBOL(touch_softlockup_watchdog); 81EXPORT_SYMBOL(touch_softlockup_watchdog);
81 82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
82void touch_all_softlockup_watchdogs(void) 89void touch_all_softlockup_watchdogs(void)
83{ 90{
84 int cpu; 91 int cpu;
@@ -118,6 +125,14 @@ void softlockup_tick(void)
118 } 125 }
119 126
120 if (touch_ts == 0) { 127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
121 __touch_softlockup_watchdog(); 136 __touch_softlockup_watchdog();
122 return; 137 return;
123 } 138 }
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 818d7d9aa03c..bde4295774c8 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -34,6 +34,30 @@
34#include <linux/smp.h> 34#include <linux/smp.h>
35#include <linux/srcu.h> 35#include <linux/srcu.h>
36 36
37static int init_srcu_struct_fields(struct srcu_struct *sp)
38{
39 sp->completed = 0;
40 mutex_init(&sp->mutex);
41 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
42 return sp->per_cpu_ref ? 0 : -ENOMEM;
43}
44
45#ifdef CONFIG_DEBUG_LOCK_ALLOC
46
47int __init_srcu_struct(struct srcu_struct *sp, const char *name,
48 struct lock_class_key *key)
49{
50#ifdef CONFIG_DEBUG_LOCK_ALLOC
51 /* Don't re-initialize a lock while it is held. */
52 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
53 lockdep_init_map(&sp->dep_map, name, key, 0);
54#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
55 return init_srcu_struct_fields(sp);
56}
57EXPORT_SYMBOL_GPL(__init_srcu_struct);
58
59#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
60
37/** 61/**
38 * init_srcu_struct - initialize a sleep-RCU structure 62 * init_srcu_struct - initialize a sleep-RCU structure
39 * @sp: structure to initialize. 63 * @sp: structure to initialize.
@@ -44,13 +68,12 @@
44 */ 68 */
45int init_srcu_struct(struct srcu_struct *sp) 69int init_srcu_struct(struct srcu_struct *sp)
46{ 70{
47 sp->completed = 0; 71 return init_srcu_struct_fields(sp);
48 mutex_init(&sp->mutex);
49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
50 return (sp->per_cpu_ref ? 0 : -ENOMEM);
51} 72}
52EXPORT_SYMBOL_GPL(init_srcu_struct); 73EXPORT_SYMBOL_GPL(init_srcu_struct);
53 74
75#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
76
54/* 77/*
55 * srcu_readers_active_idx -- returns approximate number of readers 78 * srcu_readers_active_idx -- returns approximate number of readers
56 * active on the specified rank of per-CPU counters. 79 * active on the specified rank of per-CPU counters.
@@ -100,15 +123,12 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
100} 123}
101EXPORT_SYMBOL_GPL(cleanup_srcu_struct); 124EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
102 125
103/** 126/*
104 * srcu_read_lock - register a new reader for an SRCU-protected structure.
105 * @sp: srcu_struct in which to register the new reader.
106 *
107 * Counts the new reader in the appropriate per-CPU element of the 127 * Counts the new reader in the appropriate per-CPU element of the
108 * srcu_struct. Must be called from process context. 128 * srcu_struct. Must be called from process context.
109 * Returns an index that must be passed to the matching srcu_read_unlock(). 129 * Returns an index that must be passed to the matching srcu_read_unlock().
110 */ 130 */
111int srcu_read_lock(struct srcu_struct *sp) 131int __srcu_read_lock(struct srcu_struct *sp)
112{ 132{
113 int idx; 133 int idx;
114 134
@@ -120,31 +140,27 @@ int srcu_read_lock(struct srcu_struct *sp)
120 preempt_enable(); 140 preempt_enable();
121 return idx; 141 return idx;
122} 142}
123EXPORT_SYMBOL_GPL(srcu_read_lock); 143EXPORT_SYMBOL_GPL(__srcu_read_lock);
124 144
125/** 145/*
126 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
127 * @sp: srcu_struct in which to unregister the old reader.
128 * @idx: return value from corresponding srcu_read_lock().
129 *
130 * Removes the count for the old reader from the appropriate per-CPU 146 * Removes the count for the old reader from the appropriate per-CPU
131 * element of the srcu_struct. Note that this may well be a different 147 * element of the srcu_struct. Note that this may well be a different
132 * CPU than that which was incremented by the corresponding srcu_read_lock(). 148 * CPU than that which was incremented by the corresponding srcu_read_lock().
133 * Must be called from process context. 149 * Must be called from process context.
134 */ 150 */
135void srcu_read_unlock(struct srcu_struct *sp, int idx) 151void __srcu_read_unlock(struct srcu_struct *sp, int idx)
136{ 152{
137 preempt_disable(); 153 preempt_disable();
138 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 154 srcu_barrier(); /* ensure compiler won't misorder critical section. */
139 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 155 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
140 preempt_enable(); 156 preempt_enable();
141} 157}
142EXPORT_SYMBOL_GPL(srcu_read_unlock); 158EXPORT_SYMBOL_GPL(__srcu_read_unlock);
143 159
144/* 160/*
145 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 161 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
146 */ 162 */
147void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 163static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
148{ 164{
149 int idx; 165 int idx;
150 166
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 912823e2a11b..9bb9fb1bd79c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -45,7 +45,7 @@ static int refcount;
45static struct workqueue_struct *stop_machine_wq; 45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle; 46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus; 47static const struct cpumask *active_cpus;
48static void *stop_machine_work; 48static void __percpu *stop_machine_work;
49 49
50static void set_state(enum stopmachine_state newstate) 50static void set_state(enum stopmachine_state newstate)
51{ 51{
diff --git a/kernel/sys.c b/kernel/sys.c
index 20ccfb5da6af..9814e43fb23b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -162,6 +162,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
162 if (niceval > 19) 162 if (niceval > 19)
163 niceval = 19; 163 niceval = 19;
164 164
165 rcu_read_lock();
165 read_lock(&tasklist_lock); 166 read_lock(&tasklist_lock);
166 switch (which) { 167 switch (which) {
167 case PRIO_PROCESS: 168 case PRIO_PROCESS:
@@ -199,6 +200,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
199 } 200 }
200out_unlock: 201out_unlock:
201 read_unlock(&tasklist_lock); 202 read_unlock(&tasklist_lock);
203 rcu_read_unlock();
202out: 204out:
203 return error; 205 return error;
204} 206}
@@ -220,6 +222,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
220 if (which > PRIO_USER || which < PRIO_PROCESS) 222 if (which > PRIO_USER || which < PRIO_PROCESS)
221 return -EINVAL; 223 return -EINVAL;
222 224
225 rcu_read_lock();
223 read_lock(&tasklist_lock); 226 read_lock(&tasklist_lock);
224 switch (which) { 227 switch (which) {
225 case PRIO_PROCESS: 228 case PRIO_PROCESS:
@@ -265,6 +268,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
265 } 268 }
266out_unlock: 269out_unlock:
267 read_unlock(&tasklist_lock); 270 read_unlock(&tasklist_lock);
271 rcu_read_unlock();
268 272
269 return retval; 273 return retval;
270} 274}
@@ -567,13 +571,7 @@ static int set_user(struct cred *new)
567 if (!new_user) 571 if (!new_user)
568 return -EAGAIN; 572 return -EAGAIN;
569 573
570 if (!task_can_switch_user(new_user, current)) { 574 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
571 free_uid(new_user);
572 return -EINVAL;
573 }
574
575 if (atomic_read(&new_user->processes) >=
576 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
577 new_user != INIT_USER) { 575 new_user != INIT_USER) {
578 free_uid(new_user); 576 free_uid(new_user);
579 return -EAGAIN; 577 return -EAGAIN;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 45e4bef0012a..0ef19c614f6d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -50,6 +50,7 @@
50#include <linux/ftrace.h> 50#include <linux/ftrace.h>
51#include <linux/slow-work.h> 51#include <linux/slow-work.h>
52#include <linux/perf_event.h> 52#include <linux/perf_event.h>
53#include <linux/kprobes.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/processor.h> 56#include <asm/processor.h>
@@ -1131,7 +1132,7 @@ static struct ctl_table vm_table[] = {
1131 .data = &sysctl_max_map_count, 1132 .data = &sysctl_max_map_count,
1132 .maxlen = sizeof(sysctl_max_map_count), 1133 .maxlen = sizeof(sysctl_max_map_count),
1133 .mode = 0644, 1134 .mode = 0644,
1134 .proc_handler = proc_dointvec, 1135 .proc_handler = proc_dointvec_minmax,
1135 .extra1 = &zero, 1136 .extra1 = &zero,
1136 }, 1137 },
1137#else 1138#else
@@ -1214,6 +1215,7 @@ static struct ctl_table vm_table[] = {
1214 .proc_handler = proc_dointvec_jiffies, 1215 .proc_handler = proc_dointvec_jiffies,
1215 }, 1216 },
1216#endif 1217#endif
1218#ifdef CONFIG_MMU
1217 { 1219 {
1218 .procname = "mmap_min_addr", 1220 .procname = "mmap_min_addr",
1219 .data = &dac_mmap_min_addr, 1221 .data = &dac_mmap_min_addr,
@@ -1221,6 +1223,7 @@ static struct ctl_table vm_table[] = {
1221 .mode = 0644, 1223 .mode = 0644,
1222 .proc_handler = mmap_min_addr_handler, 1224 .proc_handler = mmap_min_addr_handler,
1223 }, 1225 },
1226#endif
1224#ifdef CONFIG_NUMA 1227#ifdef CONFIG_NUMA
1225 { 1228 {
1226 .procname = "numa_zonelist_order", 1229 .procname = "numa_zonelist_order",
@@ -1439,7 +1442,7 @@ static struct ctl_table fs_table[] = {
1439}; 1442};
1440 1443
1441static struct ctl_table debug_table[] = { 1444static struct ctl_table debug_table[] = {
1442#if defined(CONFIG_X86) || defined(CONFIG_PPC) 1445#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC)
1443 { 1446 {
1444 .procname = "exception-trace", 1447 .procname = "exception-trace",
1445 .data = &show_unhandled_signals, 1448 .data = &show_unhandled_signals,
@@ -1448,6 +1451,17 @@ static struct ctl_table debug_table[] = {
1448 .proc_handler = proc_dointvec 1451 .proc_handler = proc_dointvec
1449 }, 1452 },
1450#endif 1453#endif
1454#if defined(CONFIG_OPTPROBES)
1455 {
1456 .procname = "kprobes-optimization",
1457 .data = &sysctl_kprobes_optimization,
1458 .maxlen = sizeof(int),
1459 .mode = 0644,
1460 .proc_handler = proc_kprobes_optimization_handler,
1461 .extra1 = &zero,
1462 .extra2 = &one,
1463 },
1464#endif
1451 { } 1465 { }
1452}; 1466};
1453 1467
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 112533d5fc08..8cd50d8f9bde 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1331,7 +1331,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1331 ssize_t result; 1331 ssize_t result;
1332 char *pathname; 1332 char *pathname;
1333 int flags; 1333 int flags;
1334 int acc_mode, fmode; 1334 int acc_mode;
1335 1335
1336 pathname = sysctl_getname(name, nlen, &table); 1336 pathname = sysctl_getname(name, nlen, &table);
1337 result = PTR_ERR(pathname); 1337 result = PTR_ERR(pathname);
@@ -1342,15 +1342,12 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1342 if (oldval && oldlen && newval && newlen) { 1342 if (oldval && oldlen && newval && newlen) {
1343 flags = O_RDWR; 1343 flags = O_RDWR;
1344 acc_mode = MAY_READ | MAY_WRITE; 1344 acc_mode = MAY_READ | MAY_WRITE;
1345 fmode = FMODE_READ | FMODE_WRITE;
1346 } else if (newval && newlen) { 1345 } else if (newval && newlen) {
1347 flags = O_WRONLY; 1346 flags = O_WRONLY;
1348 acc_mode = MAY_WRITE; 1347 acc_mode = MAY_WRITE;
1349 fmode = FMODE_WRITE;
1350 } else if (oldval && oldlen) { 1348 } else if (oldval && oldlen) {
1351 flags = O_RDONLY; 1349 flags = O_RDONLY;
1352 acc_mode = MAY_READ; 1350 acc_mode = MAY_READ;
1353 fmode = FMODE_READ;
1354 } else { 1351 } else {
1355 result = 0; 1352 result = 0;
1356 goto out_putname; 1353 goto out_putname;
@@ -1361,7 +1358,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1361 if (result) 1358 if (result)
1362 goto out_putname; 1359 goto out_putname;
1363 1360
1364 result = may_open(&nd.path, acc_mode, fmode); 1361 result = may_open(&nd.path, acc_mode, flags);
1365 if (result) 1362 if (result)
1366 goto out_putpath; 1363 goto out_putpath;
1367 1364
@@ -1417,6 +1414,35 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
1417 return; 1414 return;
1418} 1415}
1419 1416
1417#define WARN_ONCE_HASH_BITS 8
1418#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS)
1419
1420static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE);
1421
1422#define FNV32_OFFSET 2166136261U
1423#define FNV32_PRIME 0x01000193
1424
1425/*
1426 * Print each legacy sysctl (approximately) only once.
1427 * To avoid making the tables non-const use a external
1428 * hash-table instead.
1429 * Worst case hash collision: 6, but very rarely.
1430 * NOTE! We don't use the SMP-safe bit tests. We simply
1431 * don't care enough.
1432 */
1433static void warn_on_bintable(const int *name, int nlen)
1434{
1435 int i;
1436 u32 hash = FNV32_OFFSET;
1437
1438 for (i = 0; i < nlen; i++)
1439 hash = (hash ^ name[i]) * FNV32_PRIME;
1440 hash %= WARN_ONCE_HASH_SIZE;
1441 if (__test_and_set_bit(hash, warn_once_bitmap))
1442 return;
1443 deprecated_sysctl_warning(name, nlen);
1444}
1445
1420static ssize_t do_sysctl(int __user *args_name, int nlen, 1446static ssize_t do_sysctl(int __user *args_name, int nlen,
1421 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1447 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1422{ 1448{
@@ -1431,7 +1457,7 @@ static ssize_t do_sysctl(int __user *args_name, int nlen,
1431 if (get_user(name[i], args_name + i)) 1457 if (get_user(name[i], args_name + i))
1432 return -EFAULT; 1458 return -EFAULT;
1433 1459
1434 deprecated_sysctl_warning(name, nlen); 1460 warn_on_bintable(name, nlen);
1435 1461
1436 return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); 1462 return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
1437} 1463}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea8384d3caa7..899ca51be5e8 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -46,15 +46,13 @@ static struct genl_family family = {
46 .maxattr = TASKSTATS_CMD_ATTR_MAX, 46 .maxattr = TASKSTATS_CMD_ATTR_MAX,
47}; 47};
48 48
49static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] 49static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
50__read_mostly = {
51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 50 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 51 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 52 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 53 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
55 54
56static struct nla_policy 55static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
57cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 56 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
59}; 57};
60 58
diff --git a/kernel/time.c b/kernel/time.c
index c6324d96009e..804798005d19 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -136,6 +136,7 @@ static inline void warp_clock(void)
136 write_seqlock_irq(&xtime_lock); 136 write_seqlock_irq(&xtime_lock);
137 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 137 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
138 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 138 xtime.tv_sec += sys_tz.tz_minuteswest * 60;
139 update_xtime_cache(0);
139 write_sequnlock_irq(&xtime_lock); 140 write_sequnlock_irq(&xtime_lock);
140 clock_was_set(); 141 clock_was_set();
141} 142}
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 3d5fc0fd1cca..d7395fdfb9f3 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -238,8 +238,9 @@ void clockevents_exchange_device(struct clock_event_device *old,
238 */ 238 */
239void clockevents_notify(unsigned long reason, void *arg) 239void clockevents_notify(unsigned long reason, void *arg)
240{ 240{
241 struct list_head *node, *tmp; 241 struct clock_event_device *dev, *tmp;
242 unsigned long flags; 242 unsigned long flags;
243 int cpu;
243 244
244 raw_spin_lock_irqsave(&clockevents_lock, flags); 245 raw_spin_lock_irqsave(&clockevents_lock, flags);
245 clockevents_do_notify(reason, arg); 246 clockevents_do_notify(reason, arg);
@@ -250,8 +251,20 @@ void clockevents_notify(unsigned long reason, void *arg)
250 * Unregister the clock event devices which were 251 * Unregister the clock event devices which were
251 * released from the users in the notify chain. 252 * released from the users in the notify chain.
252 */ 253 */
253 list_for_each_safe(node, tmp, &clockevents_released) 254 list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
254 list_del(node); 255 list_del(&dev->list);
256 /*
257 * Now check whether the CPU has left unused per cpu devices
258 */
259 cpu = *((int *)arg);
260 list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
261 if (cpumask_test_cpu(cpu, dev->cpumask) &&
262 cpumask_weight(dev->cpumask) == 1 &&
263 !tick_is_broadcast_device(dev)) {
264 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
265 list_del(&dev->list);
266 }
267 }
255 break; 268 break;
256 default: 269 default:
257 break; 270 break;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e85c23404d34..1f663d23e85e 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -343,7 +343,19 @@ static void clocksource_resume_watchdog(void)
343{ 343{
344 unsigned long flags; 344 unsigned long flags;
345 345
346 spin_lock_irqsave(&watchdog_lock, flags); 346 /*
347 * We use trylock here to avoid a potential dead lock when
348 * kgdb calls this code after the kernel has been stopped with
349 * watchdog_lock held. When watchdog_lock is held we just
350 * return and accept, that the watchdog might trigger and mark
351 * the monitored clock source (usually TSC) unstable.
352 *
353 * This does not affect the other caller clocksource_resume()
354 * because at this point the kernel is UP, interrupts are
355 * disabled and nothing can hold watchdog_lock.
356 */
357 if (!spin_trylock_irqsave(&watchdog_lock, flags))
358 return;
347 clocksource_reset_watchdog(); 359 clocksource_reset_watchdog();
348 spin_unlock_irqrestore(&watchdog_lock, flags); 360 spin_unlock_irqrestore(&watchdog_lock, flags);
349} 361}
@@ -441,6 +453,18 @@ static inline int clocksource_watchdog_kthread(void *data) { return 0; }
441#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 453#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
442 454
443/** 455/**
456 * clocksource_suspend - suspend the clocksource(s)
457 */
458void clocksource_suspend(void)
459{
460 struct clocksource *cs;
461
462 list_for_each_entry_reverse(cs, &clocksource_list, list)
463 if (cs->suspend)
464 cs->suspend(cs);
465}
466
467/**
444 * clocksource_resume - resume the clocksource(s) 468 * clocksource_resume - resume the clocksource(s)
445 */ 469 */
446void clocksource_resume(void) 470void clocksource_resume(void)
@@ -449,7 +473,7 @@ void clocksource_resume(void)
449 473
450 list_for_each_entry(cs, &clocksource_list, list) 474 list_for_each_entry(cs, &clocksource_list, list)
451 if (cs->resume) 475 if (cs->resume)
452 cs->resume(); 476 cs->resume(cs);
453 477
454 clocksource_resume_watchdog(); 478 clocksource_resume_watchdog();
455} 479}
@@ -458,8 +482,8 @@ void clocksource_resume(void)
458 * clocksource_touch_watchdog - Update watchdog 482 * clocksource_touch_watchdog - Update watchdog
459 * 483 *
460 * Update the watchdog after exception contexts such as kgdb so as not 484 * Update the watchdog after exception contexts such as kgdb so as not
461 * to incorrectly trip the watchdog. 485 * to incorrectly trip the watchdog. This might fail when the kernel
462 * 486 * was stopped in code which holds watchdog_lock.
463 */ 487 */
464void clocksource_touch_watchdog(void) 488void clocksource_touch_watchdog(void)
465{ 489{
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4800f933910e..7c0f180d6e9d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -58,10 +58,10 @@ static s64 time_offset;
58static long time_constant = 2; 58static long time_constant = 2;
59 59
60/* maximum error (usecs): */ 60/* maximum error (usecs): */
61long time_maxerror = NTP_PHASE_LIMIT; 61static long time_maxerror = NTP_PHASE_LIMIT;
62 62
63/* estimated error (usecs): */ 63/* estimated error (usecs): */
64long time_esterror = NTP_PHASE_LIMIT; 64static long time_esterror = NTP_PHASE_LIMIT;
65 65
66/* frequency offset (scaled nsecs/secs): */ 66/* frequency offset (scaled nsecs/secs): */
67static s64 time_freq; 67static s64 time_freq;
@@ -142,11 +142,11 @@ static void ntp_update_offset(long offset)
142 * Select how the frequency is to be controlled 142 * Select how the frequency is to be controlled
143 * and in which mode (PLL or FLL). 143 * and in which mode (PLL or FLL).
144 */ 144 */
145 secs = xtime.tv_sec - time_reftime; 145 secs = get_seconds() - time_reftime;
146 if (unlikely(time_status & STA_FREQHOLD)) 146 if (unlikely(time_status & STA_FREQHOLD))
147 secs = 0; 147 secs = 0;
148 148
149 time_reftime = xtime.tv_sec; 149 time_reftime = get_seconds();
150 150
151 offset64 = offset; 151 offset64 = offset;
152 freq_adj = (offset64 * secs) << 152 freq_adj = (offset64 * secs) <<
@@ -368,7 +368,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
368 * reference time to current time. 368 * reference time to current time.
369 */ 369 */
370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) 370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
371 time_reftime = xtime.tv_sec; 371 time_reftime = get_seconds();
372 372
373 /* only set allowed bits */ 373 /* only set allowed bits */
374 time_status &= STA_RONLY; 374 time_status &= STA_RONLY;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index af4135f05825..16736379a9ca 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -165,6 +165,13 @@ struct timespec raw_time;
165/* flag for if timekeeping is suspended */ 165/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 166int __read_mostly timekeeping_suspended;
167 167
168static struct timespec xtime_cache __attribute__ ((aligned (16)));
169void update_xtime_cache(u64 nsec)
170{
171 xtime_cache = xtime;
172 timespec_add_ns(&xtime_cache, nsec);
173}
174
168/* must hold xtime_lock */ 175/* must hold xtime_lock */
169void timekeeping_leap_insert(int leapsecond) 176void timekeeping_leap_insert(int leapsecond)
170{ 177{
@@ -325,6 +332,8 @@ int do_settimeofday(struct timespec *tv)
325 332
326 xtime = *tv; 333 xtime = *tv;
327 334
335 update_xtime_cache(0);
336
328 timekeeper.ntp_error = 0; 337 timekeeper.ntp_error = 0;
329 ntp_clear(); 338 ntp_clear();
330 339
@@ -550,6 +559,7 @@ void __init timekeeping_init(void)
550 } 559 }
551 set_normalized_timespec(&wall_to_monotonic, 560 set_normalized_timespec(&wall_to_monotonic,
552 -boot.tv_sec, -boot.tv_nsec); 561 -boot.tv_sec, -boot.tv_nsec);
562 update_xtime_cache(0);
553 total_sleep_time.tv_sec = 0; 563 total_sleep_time.tv_sec = 0;
554 total_sleep_time.tv_nsec = 0; 564 total_sleep_time.tv_nsec = 0;
555 write_sequnlock_irqrestore(&xtime_lock, flags); 565 write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -583,6 +593,7 @@ static int timekeeping_resume(struct sys_device *dev)
583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 593 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
584 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 594 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
585 } 595 }
596 update_xtime_cache(0);
586 /* re-base the last cycle value */ 597 /* re-base the last cycle value */
587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 598 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
588 timekeeper.ntp_error = 0; 599 timekeeper.ntp_error = 0;
@@ -611,6 +622,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
611 write_sequnlock_irqrestore(&xtime_lock, flags); 622 write_sequnlock_irqrestore(&xtime_lock, flags);
612 623
613 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 624 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
625 clocksource_suspend();
614 626
615 return 0; 627 return 0;
616} 628}
@@ -722,6 +734,7 @@ static void timekeeping_adjust(s64 offset)
722 timekeeper.ntp_error_shift; 734 timekeeper.ntp_error_shift;
723} 735}
724 736
737
725/** 738/**
726 * logarithmic_accumulation - shifted accumulation of cycles 739 * logarithmic_accumulation - shifted accumulation of cycles
727 * 740 *
@@ -765,6 +778,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
765 return offset; 778 return offset;
766} 779}
767 780
781
768/** 782/**
769 * update_wall_time - Uses the current clocksource to increment the wall time 783 * update_wall_time - Uses the current clocksource to increment the wall time
770 * 784 *
@@ -774,6 +788,7 @@ void update_wall_time(void)
774{ 788{
775 struct clocksource *clock; 789 struct clocksource *clock;
776 cycle_t offset; 790 cycle_t offset;
791 u64 nsecs;
777 int shift = 0, maxshift; 792 int shift = 0, maxshift;
778 793
779 /* Make sure we're fully resumed: */ 794 /* Make sure we're fully resumed: */
@@ -839,6 +854,9 @@ void update_wall_time(void)
839 timekeeper.ntp_error += timekeeper.xtime_nsec << 854 timekeeper.ntp_error += timekeeper.xtime_nsec <<
840 timekeeper.ntp_error_shift; 855 timekeeper.ntp_error_shift;
841 856
857 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
858 update_xtime_cache(nsecs);
859
842 /* check to see if there is a new clocksource to use */ 860 /* check to see if there is a new clocksource to use */
843 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 861 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
844} 862}
@@ -863,6 +881,7 @@ void getboottime(struct timespec *ts)
863 881
864 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 882 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
865} 883}
884EXPORT_SYMBOL_GPL(getboottime);
866 885
867/** 886/**
868 * monotonic_to_bootbased - Convert the monotonic time to boot based. 887 * monotonic_to_bootbased - Convert the monotonic time to boot based.
@@ -872,16 +891,17 @@ void monotonic_to_bootbased(struct timespec *ts)
872{ 891{
873 *ts = timespec_add_safe(*ts, total_sleep_time); 892 *ts = timespec_add_safe(*ts, total_sleep_time);
874} 893}
894EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
875 895
876unsigned long get_seconds(void) 896unsigned long get_seconds(void)
877{ 897{
878 return xtime.tv_sec; 898 return xtime_cache.tv_sec;
879} 899}
880EXPORT_SYMBOL(get_seconds); 900EXPORT_SYMBOL(get_seconds);
881 901
882struct timespec __current_kernel_time(void) 902struct timespec __current_kernel_time(void)
883{ 903{
884 return xtime; 904 return xtime_cache;
885} 905}
886 906
887struct timespec current_kernel_time(void) 907struct timespec current_kernel_time(void)
@@ -891,7 +911,8 @@ struct timespec current_kernel_time(void)
891 911
892 do { 912 do {
893 seq = read_seqbegin(&xtime_lock); 913 seq = read_seqbegin(&xtime_lock);
894 now = xtime; 914
915 now = xtime_cache;
895 } while (read_seqretry(&xtime_lock, seq)); 916 } while (read_seqretry(&xtime_lock, seq));
896 917
897 return now; 918 return now;
@@ -905,7 +926,8 @@ struct timespec get_monotonic_coarse(void)
905 926
906 do { 927 do {
907 seq = read_seqbegin(&xtime_lock); 928 seq = read_seqbegin(&xtime_lock);
908 now = xtime; 929
930 now = xtime_cache;
909 mono = wall_to_monotonic; 931 mono = wall_to_monotonic;
910 } while (read_seqretry(&xtime_lock, seq)); 932 } while (read_seqretry(&xtime_lock, seq));
911 933
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 28265636b6c2..bdfb8dd1050c 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -237,10 +237,10 @@ static void timer_list_show_tickdevices(struct seq_file *m)
237#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 237#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
238 print_tickdevice(m, tick_get_broadcast_device(), -1); 238 print_tickdevice(m, tick_get_broadcast_device(), -1);
239 SEQ_printf(m, "tick_broadcast_mask: %08lx\n", 239 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
240 tick_get_broadcast_mask()->bits[0]); 240 cpumask_bits(tick_get_broadcast_mask())[0]);
241#ifdef CONFIG_TICK_ONESHOT 241#ifdef CONFIG_TICK_ONESHOT
242 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", 242 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
243 tick_get_broadcast_oneshot_mask()->bits[0]); 243 cpumask_bits(tick_get_broadcast_oneshot_mask())[0]);
244#endif 244#endif
245 SEQ_printf(m, "\n"); 245 SEQ_printf(m, "\n");
246#endif 246#endif
diff --git a/kernel/timer.c b/kernel/timer.c
index 5db5a8d26811..c61a7949387f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -656,8 +656,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
656 656
657 debug_activate(timer, expires); 657 debug_activate(timer, expires);
658 658
659 new_base = __get_cpu_var(tvec_bases);
660
661 cpu = smp_processor_id(); 659 cpu = smp_processor_id();
662 660
663#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 661#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
@@ -1200,6 +1198,7 @@ void update_process_times(int user_tick)
1200 run_local_timers(); 1198 run_local_timers();
1201 rcu_check_callbacks(cpu, user_tick); 1199 rcu_check_callbacks(cpu, user_tick);
1202 printk_tick(); 1200 printk_tick();
1201 perf_event_do_pending();
1203 scheduler_tick(); 1202 scheduler_tick();
1204 run_posix_cpu_timers(p); 1203 run_posix_cpu_timers(p);
1205} 1204}
@@ -1211,8 +1210,6 @@ static void run_timer_softirq(struct softirq_action *h)
1211{ 1210{
1212 struct tvec_base *base = __get_cpu_var(tvec_bases); 1211 struct tvec_base *base = __get_cpu_var(tvec_bases);
1213 1212
1214 perf_event_do_pending();
1215
1216 hrtimer_run_pending(); 1213 hrtimer_run_pending();
1217 1214
1218 if (time_after_eq(jiffies, base->timer_jiffies)) 1215 if (time_after_eq(jiffies, base->timer_jiffies))
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d006554888dc..13e13d428cd3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -12,39 +12,37 @@ config NOP_TRACER
12config HAVE_FTRACE_NMI_ENTER 12config HAVE_FTRACE_NMI_ENTER
13 bool 13 bool
14 help 14 help
15 See Documentation/trace/ftrace-implementation.txt 15 See Documentation/trace/ftrace-design.txt
16 16
17config HAVE_FUNCTION_TRACER 17config HAVE_FUNCTION_TRACER
18 bool 18 bool
19 help 19 help
20 See Documentation/trace/ftrace-implementation.txt 20 See Documentation/trace/ftrace-design.txt
21 21
22config HAVE_FUNCTION_GRAPH_TRACER 22config HAVE_FUNCTION_GRAPH_TRACER
23 bool 23 bool
24 help 24 help
25 See Documentation/trace/ftrace-implementation.txt 25 See Documentation/trace/ftrace-design.txt
26 26
27config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
28 bool 28 bool
29 help 29 help
30 An arch may pass in a unique value (frame pointer) to both the 30 See Documentation/trace/ftrace-design.txt
31 entering and exiting of a function. On exit, the value is compared
32 and if it does not match, then it will panic the kernel.
33 31
34config HAVE_FUNCTION_TRACE_MCOUNT_TEST 32config HAVE_FUNCTION_TRACE_MCOUNT_TEST
35 bool 33 bool
36 help 34 help
37 See Documentation/trace/ftrace-implementation.txt 35 See Documentation/trace/ftrace-design.txt
38 36
39config HAVE_DYNAMIC_FTRACE 37config HAVE_DYNAMIC_FTRACE
40 bool 38 bool
41 help 39 help
42 See Documentation/trace/ftrace-implementation.txt 40 See Documentation/trace/ftrace-design.txt
43 41
44config HAVE_FTRACE_MCOUNT_RECORD 42config HAVE_FTRACE_MCOUNT_RECORD
45 bool 43 bool
46 help 44 help
47 See Documentation/trace/ftrace-implementation.txt 45 See Documentation/trace/ftrace-design.txt
48 46
49config HAVE_HW_BRANCH_TRACER 47config HAVE_HW_BRANCH_TRACER
50 bool 48 bool
@@ -52,7 +50,7 @@ config HAVE_HW_BRANCH_TRACER
52config HAVE_SYSCALL_TRACEPOINTS 50config HAVE_SYSCALL_TRACEPOINTS
53 bool 51 bool
54 help 52 help
55 See Documentation/trace/ftrace-implementation.txt 53 See Documentation/trace/ftrace-design.txt
56 54
57config TRACER_MAX_TRACE 55config TRACER_MAX_TRACE
58 bool 56 bool
@@ -83,7 +81,7 @@ config RING_BUFFER_ALLOW_SWAP
83# This allows those options to appear when no other tracer is selected. But the 81# This allows those options to appear when no other tracer is selected. But the
84# options do not appear when something else selects it. We need the two options 82# options do not appear when something else selects it. We need the two options
85# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the 83# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
86# hidding of the automatic options. 84# hiding of the automatic options.
87 85
88config TRACING 86config TRACING
89 bool 87 bool
@@ -119,7 +117,7 @@ menuconfig FTRACE
119 bool "Tracers" 117 bool "Tracers"
120 default y if DEBUG_KERNEL 118 default y if DEBUG_KERNEL
121 help 119 help
122 Enable the kernel tracing infrastructure. 120 Enable the kernel tracing infrastructure.
123 121
124if FTRACE 122if FTRACE
125 123
@@ -133,7 +131,7 @@ config FUNCTION_TRACER
133 help 131 help
134 Enable the kernel to trace every kernel function. This is done 132 Enable the kernel to trace every kernel function. This is done
135 by using a compiler feature to insert a small, 5-byte No-Operation 133 by using a compiler feature to insert a small, 5-byte No-Operation
136 instruction to the beginning of every kernel function, which NOP 134 instruction at the beginning of every kernel function, which NOP
137 sequence is then dynamically patched into a tracer call when 135 sequence is then dynamically patched into a tracer call when
138 tracing is enabled by the administrator. If it's runtime disabled 136 tracing is enabled by the administrator. If it's runtime disabled
139 (the bootup default), then the overhead of the instructions is very 137 (the bootup default), then the overhead of the instructions is very
@@ -150,7 +148,7 @@ config FUNCTION_GRAPH_TRACER
150 and its entry. 148 and its entry.
151 Its first purpose is to trace the duration of functions and 149 Its first purpose is to trace the duration of functions and
152 draw a call graph for each thread with some information like 150 draw a call graph for each thread with some information like
153 the return value. This is done by setting the current return 151 the return value. This is done by setting the current return
154 address on the current task structure into a stack of calls. 152 address on the current task structure into a stack of calls.
155 153
156 154
@@ -173,7 +171,7 @@ config IRQSOFF_TRACER
173 171
174 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency 172 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
175 173
176 (Note that kernel size and overhead increases with this option 174 (Note that kernel size and overhead increase with this option
177 enabled. This option and the preempt-off timing option can be 175 enabled. This option and the preempt-off timing option can be
178 used together or separately.) 176 used together or separately.)
179 177
@@ -186,7 +184,7 @@ config PREEMPT_TRACER
186 select TRACER_MAX_TRACE 184 select TRACER_MAX_TRACE
187 select RING_BUFFER_ALLOW_SWAP 185 select RING_BUFFER_ALLOW_SWAP
188 help 186 help
189 This option measures the time spent in preemption off critical 187 This option measures the time spent in preemption-off critical
190 sections, with microsecond accuracy. 188 sections, with microsecond accuracy.
191 189
192 The default measurement method is a maximum search, which is 190 The default measurement method is a maximum search, which is
@@ -195,7 +193,7 @@ config PREEMPT_TRACER
195 193
196 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency 194 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
197 195
198 (Note that kernel size and overhead increases with this option 196 (Note that kernel size and overhead increase with this option
199 enabled. This option and the irqs-off timing option can be 197 enabled. This option and the irqs-off timing option can be
200 used together or separately.) 198 used together or separately.)
201 199
@@ -222,7 +220,7 @@ config ENABLE_DEFAULT_TRACERS
222 depends on !GENERIC_TRACER 220 depends on !GENERIC_TRACER
223 select TRACING 221 select TRACING
224 help 222 help
225 This tracer hooks to various trace points in the kernel 223 This tracer hooks to various trace points in the kernel,
226 allowing the user to pick and choose which trace point they 224 allowing the user to pick and choose which trace point they
227 want to trace. It also includes the sched_switch tracer plugin. 225 want to trace. It also includes the sched_switch tracer plugin.
228 226
@@ -265,19 +263,19 @@ choice
265 The likely/unlikely profiler only looks at the conditions that 263 The likely/unlikely profiler only looks at the conditions that
266 are annotated with a likely or unlikely macro. 264 are annotated with a likely or unlikely macro.
267 265
268 The "all branch" profiler will profile every if statement in the 266 The "all branch" profiler will profile every if-statement in the
269 kernel. This profiler will also enable the likely/unlikely 267 kernel. This profiler will also enable the likely/unlikely
270 profiler as well. 268 profiler.
271 269
272 Either of the above profilers add a bit of overhead to the system. 270 Either of the above profilers adds a bit of overhead to the system.
273 If unsure choose "No branch profiling". 271 If unsure, choose "No branch profiling".
274 272
275config BRANCH_PROFILE_NONE 273config BRANCH_PROFILE_NONE
276 bool "No branch profiling" 274 bool "No branch profiling"
277 help 275 help
278 No branch profiling. Branch profiling adds a bit of overhead. 276 No branch profiling. Branch profiling adds a bit of overhead.
279 Only enable it if you want to analyse the branching behavior. 277 Only enable it if you want to analyse the branching behavior.
280 Otherwise keep it disabled. 278 Otherwise keep it disabled.
281 279
282config PROFILE_ANNOTATED_BRANCHES 280config PROFILE_ANNOTATED_BRANCHES
283 bool "Trace likely/unlikely profiler" 281 bool "Trace likely/unlikely profiler"
@@ -288,7 +286,7 @@ config PROFILE_ANNOTATED_BRANCHES
288 286
289 /sys/kernel/debug/tracing/profile_annotated_branch 287 /sys/kernel/debug/tracing/profile_annotated_branch
290 288
291 Note: this will add a significant overhead, only turn this 289 Note: this will add a significant overhead; only turn this
292 on if you need to profile the system's use of these macros. 290 on if you need to profile the system's use of these macros.
293 291
294config PROFILE_ALL_BRANCHES 292config PROFILE_ALL_BRANCHES
@@ -305,7 +303,7 @@ config PROFILE_ALL_BRANCHES
305 303
306 This configuration, when enabled, will impose a great overhead 304 This configuration, when enabled, will impose a great overhead
307 on the system. This should only be enabled when the system 305 on the system. This should only be enabled when the system
308 is to be analyzed 306 is to be analyzed in much detail.
309endchoice 307endchoice
310 308
311config TRACING_BRANCHES 309config TRACING_BRANCHES
@@ -330,15 +328,6 @@ config BRANCH_TRACER
330 328
331 Say N if unsure. 329 Say N if unsure.
332 330
333config POWER_TRACER
334 bool "Trace power consumption behavior"
335 depends on X86
336 select GENERIC_TRACER
337 help
338 This tracer helps developers to analyze and optimize the kernels
339 power management decisions, specifically the C-state and P-state
340 behavior.
341
342config KSYM_TRACER 331config KSYM_TRACER
343 bool "Trace read and write access on kernel memory locations" 332 bool "Trace read and write access on kernel memory locations"
344 depends on HAVE_HW_BREAKPOINT 333 depends on HAVE_HW_BREAKPOINT
@@ -391,14 +380,14 @@ config HW_BRANCH_TRACER
391 select GENERIC_TRACER 380 select GENERIC_TRACER
392 help 381 help
393 This tracer records all branches on the system in a circular 382 This tracer records all branches on the system in a circular
394 buffer giving access to the last N branches for each cpu. 383 buffer, giving access to the last N branches for each cpu.
395 384
396config KMEMTRACE 385config KMEMTRACE
397 bool "Trace SLAB allocations" 386 bool "Trace SLAB allocations"
398 select GENERIC_TRACER 387 select GENERIC_TRACER
399 help 388 help
400 kmemtrace provides tracing for slab allocator functions, such as 389 kmemtrace provides tracing for slab allocator functions, such as
401 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected 390 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
402 data is then fed to the userspace application in order to analyse 391 data is then fed to the userspace application in order to analyse
403 allocation hotspots, internal fragmentation and so on, making it 392 allocation hotspots, internal fragmentation and so on, making it
404 possible to see how well an allocator performs, as well as debug 393 possible to see how well an allocator performs, as well as debug
@@ -417,15 +406,15 @@ config WORKQUEUE_TRACER
417 bool "Trace workqueues" 406 bool "Trace workqueues"
418 select GENERIC_TRACER 407 select GENERIC_TRACER
419 help 408 help
420 The workqueue tracer provides some statistical informations 409 The workqueue tracer provides some statistical information
421 about each cpu workqueue thread such as the number of the 410 about each cpu workqueue thread such as the number of the
422 works inserted and executed since their creation. It can help 411 works inserted and executed since their creation. It can help
423 to evaluate the amount of work each of them have to perform. 412 to evaluate the amount of work each of them has to perform.
424 For example it can help a developer to decide whether he should 413 For example it can help a developer to decide whether he should
425 choose a per cpu workqueue instead of a singlethreaded one. 414 choose a per-cpu workqueue instead of a singlethreaded one.
426 415
427config BLK_DEV_IO_TRACE 416config BLK_DEV_IO_TRACE
428 bool "Support for tracing block io actions" 417 bool "Support for tracing block IO actions"
429 depends on SYSFS 418 depends on SYSFS
430 depends on BLOCK 419 depends on BLOCK
431 select RELAY 420 select RELAY
@@ -451,20 +440,20 @@ config BLK_DEV_IO_TRACE
451 440
452config KPROBE_EVENT 441config KPROBE_EVENT
453 depends on KPROBES 442 depends on KPROBES
454 depends on X86 443 depends on HAVE_REGS_AND_STACK_ACCESS_API
455 bool "Enable kprobes-based dynamic events" 444 bool "Enable kprobes-based dynamic events"
456 select TRACING 445 select TRACING
457 default y 446 default y
458 help 447 help
459 This allows the user to add tracing events (similar to tracepoints) on the fly 448 This allows the user to add tracing events (similar to tracepoints)
460 via the ftrace interface. See Documentation/trace/kprobetrace.txt 449 on the fly via the ftrace interface. See
461 for more details. 450 Documentation/trace/kprobetrace.txt for more details.
462 451
463 Those events can be inserted wherever kprobes can probe, and record 452 Those events can be inserted wherever kprobes can probe, and record
464 various register and memory values. 453 various register and memory values.
465 454
466 This option is also required by perf-probe subcommand of perf tools. If 455 This option is also required by perf-probe subcommand of perf tools.
467 you want to use perf tools, this option is strongly recommended. 456 If you want to use perf tools, this option is strongly recommended.
468 457
469config DYNAMIC_FTRACE 458config DYNAMIC_FTRACE
470 bool "enable/disable ftrace tracepoints dynamically" 459 bool "enable/disable ftrace tracepoints dynamically"
@@ -472,32 +461,32 @@ config DYNAMIC_FTRACE
472 depends on HAVE_DYNAMIC_FTRACE 461 depends on HAVE_DYNAMIC_FTRACE
473 default y 462 default y
474 help 463 help
475 This option will modify all the calls to ftrace dynamically 464 This option will modify all the calls to ftrace dynamically
476 (will patch them out of the binary image and replaces them 465 (will patch them out of the binary image and replace them
477 with a No-Op instruction) as they are called. A table is 466 with a No-Op instruction) as they are called. A table is
478 created to dynamically enable them again. 467 created to dynamically enable them again.
479 468
480 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise 469 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
481 has native performance as long as no tracing is active. 470 otherwise has native performance as long as no tracing is active.
482 471
483 The changes to the code are done by a kernel thread that 472 The changes to the code are done by a kernel thread that
484 wakes up once a second and checks to see if any ftrace calls 473 wakes up once a second and checks to see if any ftrace calls
485 were made. If so, it runs stop_machine (stops all CPUS) 474 were made. If so, it runs stop_machine (stops all CPUS)
486 and modifies the code to jump over the call to ftrace. 475 and modifies the code to jump over the call to ftrace.
487 476
488config FUNCTION_PROFILER 477config FUNCTION_PROFILER
489 bool "Kernel function profiler" 478 bool "Kernel function profiler"
490 depends on FUNCTION_TRACER 479 depends on FUNCTION_TRACER
491 default n 480 default n
492 help 481 help
493 This option enables the kernel function profiler. A file is created 482 This option enables the kernel function profiler. A file is created
494 in debugfs called function_profile_enabled which defaults to zero. 483 in debugfs called function_profile_enabled which defaults to zero.
495 When a 1 is echoed into this file profiling begins, and when a 484 When a 1 is echoed into this file profiling begins, and when a
496 zero is entered, profiling stops. A file in the trace_stats 485 zero is entered, profiling stops. A "functions" file is created in
497 directory called functions, that show the list of functions that 486 the trace_stats directory; this file shows the list of functions that
498 have been hit and their counters. 487 have been hit and their counters.
499 488
500 If in doubt, say N 489 If in doubt, say N.
501 490
502config FTRACE_MCOUNT_RECORD 491config FTRACE_MCOUNT_RECORD
503 def_bool y 492 def_bool y
@@ -556,8 +545,8 @@ config RING_BUFFER_BENCHMARK
556 tristate "Ring buffer benchmark stress tester" 545 tristate "Ring buffer benchmark stress tester"
557 depends on RING_BUFFER 546 depends on RING_BUFFER
558 help 547 help
559 This option creates a test to stress the ring buffer and bench mark it. 548 This option creates a test to stress the ring buffer and benchmark it.
560 It creates its own ring buffer such that it will not interfer with 549 It creates its own ring buffer such that it will not interfere with
561 any other users of the ring buffer (such as ftrace). It then creates 550 any other users of the ring buffer (such as ftrace). It then creates
562 a producer and consumer that will run for 10 seconds and sleep for 551 a producer and consumer that will run for 10 seconds and sleep for
563 10 seconds. Each interval it will print out the number of events 552 10 seconds. Each interval it will print out the number of events
@@ -566,7 +555,7 @@ config RING_BUFFER_BENCHMARK
566 It does not disable interrupts or raise its priority, so it may be 555 It does not disable interrupts or raise its priority, so it may be
567 affected by processes that are running. 556 affected by processes that are running.
568 557
569 If unsure, say N 558 If unsure, say N.
570 559
571endif # FTRACE 560endif # FTRACE
572 561
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index cd9ecd89ec77..d00c6fe23f54 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -51,7 +51,9 @@ endif
51obj-$(CONFIG_EVENT_TRACING) += trace_events.o 51obj-$(CONFIG_EVENT_TRACING) += trace_events.o
52obj-$(CONFIG_EVENT_TRACING) += trace_export.o 52obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54ifeq ($(CONFIG_PERF_EVENTS),y)
55obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o
56endif
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 57obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 58obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o 59obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d9d6206e0b14..07f945a99430 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -540,9 +540,10 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
540 if (ret) 540 if (ret)
541 return ret; 541 return ret;
542 542
543 if (copy_to_user(arg, &buts, sizeof(buts))) 543 if (copy_to_user(arg, &buts, sizeof(buts))) {
544 blk_trace_remove(q);
544 return -EFAULT; 545 return -EFAULT;
545 546 }
546 return 0; 547 return 0;
547} 548}
548EXPORT_SYMBOL_GPL(blk_trace_setup); 549EXPORT_SYMBOL_GPL(blk_trace_setup);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7968762c8167..83783579378f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,7 +22,6 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/kprobes.h>
26#include <linux/ftrace.h> 25#include <linux/ftrace.h>
27#include <linux/sysctl.h> 26#include <linux/sysctl.h>
28#include <linux/ctype.h> 27#include <linux/ctype.h>
@@ -898,36 +897,6 @@ static struct dyn_ftrace *ftrace_free_records;
898 } \ 897 } \
899 } 898 }
900 899
901#ifdef CONFIG_KPROBES
902
903static int frozen_record_count;
904
905static inline void freeze_record(struct dyn_ftrace *rec)
906{
907 if (!(rec->flags & FTRACE_FL_FROZEN)) {
908 rec->flags |= FTRACE_FL_FROZEN;
909 frozen_record_count++;
910 }
911}
912
913static inline void unfreeze_record(struct dyn_ftrace *rec)
914{
915 if (rec->flags & FTRACE_FL_FROZEN) {
916 rec->flags &= ~FTRACE_FL_FROZEN;
917 frozen_record_count--;
918 }
919}
920
921static inline int record_frozen(struct dyn_ftrace *rec)
922{
923 return rec->flags & FTRACE_FL_FROZEN;
924}
925#else
926# define freeze_record(rec) ({ 0; })
927# define unfreeze_record(rec) ({ 0; })
928# define record_frozen(rec) ({ 0; })
929#endif /* CONFIG_KPROBES */
930
931static void ftrace_free_rec(struct dyn_ftrace *rec) 900static void ftrace_free_rec(struct dyn_ftrace *rec)
932{ 901{
933 rec->freelist = ftrace_free_records; 902 rec->freelist = ftrace_free_records;
@@ -1025,6 +994,21 @@ static void ftrace_bug(int failed, unsigned long ip)
1025} 994}
1026 995
1027 996
997/* Return 1 if the address range is reserved for ftrace */
998int ftrace_text_reserved(void *start, void *end)
999{
1000 struct dyn_ftrace *rec;
1001 struct ftrace_page *pg;
1002
1003 do_for_each_ftrace_rec(pg, rec) {
1004 if (rec->ip <= (unsigned long)end &&
1005 rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
1006 return 1;
1007 } while_for_each_ftrace_rec();
1008 return 0;
1009}
1010
1011
1028static int 1012static int
1029__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1013__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1030{ 1014{
@@ -1076,14 +1060,6 @@ static void ftrace_replace_code(int enable)
1076 !(rec->flags & FTRACE_FL_CONVERTED)) 1060 !(rec->flags & FTRACE_FL_CONVERTED))
1077 continue; 1061 continue;
1078 1062
1079 /* ignore updates to this record's mcount site */
1080 if (get_kprobe((void *)rec->ip)) {
1081 freeze_record(rec);
1082 continue;
1083 } else {
1084 unfreeze_record(rec);
1085 }
1086
1087 failed = __ftrace_replace_code(rec, enable); 1063 failed = __ftrace_replace_code(rec, enable);
1088 if (failed) { 1064 if (failed) {
1089 rec->flags |= FTRACE_FL_FAILED; 1065 rec->flags |= FTRACE_FL_FAILED;
@@ -1690,7 +1666,7 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
1690static int ftrace_match(char *str, char *regex, int len, int type) 1666static int ftrace_match(char *str, char *regex, int len, int type)
1691{ 1667{
1692 int matched = 0; 1668 int matched = 0;
1693 char *ptr; 1669 int slen;
1694 1670
1695 switch (type) { 1671 switch (type) {
1696 case MATCH_FULL: 1672 case MATCH_FULL:
@@ -1706,8 +1682,8 @@ static int ftrace_match(char *str, char *regex, int len, int type)
1706 matched = 1; 1682 matched = 1;
1707 break; 1683 break;
1708 case MATCH_END_ONLY: 1684 case MATCH_END_ONLY:
1709 ptr = strstr(str, regex); 1685 slen = strlen(str);
1710 if (ptr && (ptr[len] == 0)) 1686 if (slen >= len && memcmp(str + slen - len, regex, len) == 0)
1711 matched = 1; 1687 matched = 1;
1712 break; 1688 break;
1713 } 1689 }
@@ -2426,6 +2402,7 @@ static const struct file_operations ftrace_notrace_fops = {
2426static DEFINE_MUTEX(graph_lock); 2402static DEFINE_MUTEX(graph_lock);
2427 2403
2428int ftrace_graph_count; 2404int ftrace_graph_count;
2405int ftrace_graph_filter_enabled;
2429unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2406unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2430 2407
2431static void * 2408static void *
@@ -2448,7 +2425,7 @@ static void *g_start(struct seq_file *m, loff_t *pos)
2448 mutex_lock(&graph_lock); 2425 mutex_lock(&graph_lock);
2449 2426
2450 /* Nothing, tell g_show to print all functions are enabled */ 2427 /* Nothing, tell g_show to print all functions are enabled */
2451 if (!ftrace_graph_count && !*pos) 2428 if (!ftrace_graph_filter_enabled && !*pos)
2452 return (void *)1; 2429 return (void *)1;
2453 2430
2454 return __g_next(m, pos); 2431 return __g_next(m, pos);
@@ -2494,6 +2471,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2494 mutex_lock(&graph_lock); 2471 mutex_lock(&graph_lock);
2495 if ((file->f_mode & FMODE_WRITE) && 2472 if ((file->f_mode & FMODE_WRITE) &&
2496 (file->f_flags & O_TRUNC)) { 2473 (file->f_flags & O_TRUNC)) {
2474 ftrace_graph_filter_enabled = 0;
2497 ftrace_graph_count = 0; 2475 ftrace_graph_count = 0;
2498 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2476 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2499 } 2477 }
@@ -2519,7 +2497,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2519 struct dyn_ftrace *rec; 2497 struct dyn_ftrace *rec;
2520 struct ftrace_page *pg; 2498 struct ftrace_page *pg;
2521 int search_len; 2499 int search_len;
2522 int found = 0; 2500 int fail = 1;
2523 int type, not; 2501 int type, not;
2524 char *search; 2502 char *search;
2525 bool exists; 2503 bool exists;
@@ -2530,37 +2508,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2530 2508
2531 /* decode regex */ 2509 /* decode regex */
2532 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 2510 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2533 if (not) 2511 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
2534 return -EINVAL; 2512 return -EBUSY;
2535 2513
2536 search_len = strlen(search); 2514 search_len = strlen(search);
2537 2515
2538 mutex_lock(&ftrace_lock); 2516 mutex_lock(&ftrace_lock);
2539 do_for_each_ftrace_rec(pg, rec) { 2517 do_for_each_ftrace_rec(pg, rec) {
2540 2518
2541 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2542 break;
2543
2544 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 2519 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
2545 continue; 2520 continue;
2546 2521
2547 if (ftrace_match_record(rec, search, search_len, type)) { 2522 if (ftrace_match_record(rec, search, search_len, type)) {
2548 /* ensure it is not already in the array */ 2523 /* if it is in the array */
2549 exists = false; 2524 exists = false;
2550 for (i = 0; i < *idx; i++) 2525 for (i = 0; i < *idx; i++) {
2551 if (array[i] == rec->ip) { 2526 if (array[i] == rec->ip) {
2552 exists = true; 2527 exists = true;
2553 break; 2528 break;
2554 } 2529 }
2555 if (!exists) 2530 }
2556 array[(*idx)++] = rec->ip; 2531
2557 found = 1; 2532 if (!not) {
2533 fail = 0;
2534 if (!exists) {
2535 array[(*idx)++] = rec->ip;
2536 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2537 goto out;
2538 }
2539 } else {
2540 if (exists) {
2541 array[i] = array[--(*idx)];
2542 array[*idx] = 0;
2543 fail = 0;
2544 }
2545 }
2558 } 2546 }
2559 } while_for_each_ftrace_rec(); 2547 } while_for_each_ftrace_rec();
2560 2548out:
2561 mutex_unlock(&ftrace_lock); 2549 mutex_unlock(&ftrace_lock);
2562 2550
2563 return found ? 0 : -EINVAL; 2551 if (fail)
2552 return -EINVAL;
2553
2554 ftrace_graph_filter_enabled = 1;
2555 return 0;
2564} 2556}
2565 2557
2566static ssize_t 2558static ssize_t
@@ -2570,16 +2562,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2570 struct trace_parser parser; 2562 struct trace_parser parser;
2571 ssize_t read, ret; 2563 ssize_t read, ret;
2572 2564
2573 if (!cnt || cnt < 0) 2565 if (!cnt)
2574 return 0; 2566 return 0;
2575 2567
2576 mutex_lock(&graph_lock); 2568 mutex_lock(&graph_lock);
2577 2569
2578 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
2579 ret = -EBUSY;
2580 goto out_unlock;
2581 }
2582
2583 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { 2570 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2584 ret = -ENOMEM; 2571 ret = -ENOMEM;
2585 goto out_unlock; 2572 goto out_unlock;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2326b04c95c4..0287f9f52f5a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -20,6 +20,7 @@
20#include <linux/cpu.h> 20#include <linux/cpu.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22 22
23#include <asm/local.h>
23#include "trace.h" 24#include "trace.h"
24 25
25/* 26/*
@@ -464,6 +465,8 @@ struct ring_buffer_iter {
464 struct ring_buffer_per_cpu *cpu_buffer; 465 struct ring_buffer_per_cpu *cpu_buffer;
465 unsigned long head; 466 unsigned long head;
466 struct buffer_page *head_page; 467 struct buffer_page *head_page;
468 struct buffer_page *cache_reader_page;
469 unsigned long cache_read;
467 u64 read_stamp; 470 u64 read_stamp;
468}; 471};
469 472
@@ -2716,6 +2719,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2716 iter->read_stamp = cpu_buffer->read_stamp; 2719 iter->read_stamp = cpu_buffer->read_stamp;
2717 else 2720 else
2718 iter->read_stamp = iter->head_page->page->time_stamp; 2721 iter->read_stamp = iter->head_page->page->time_stamp;
2722 iter->cache_reader_page = cpu_buffer->reader_page;
2723 iter->cache_read = cpu_buffer->read;
2719} 2724}
2720 2725
2721/** 2726/**
@@ -2869,7 +2874,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2869 * Splice the empty reader page into the list around the head. 2874 * Splice the empty reader page into the list around the head.
2870 */ 2875 */
2871 reader = rb_set_head_page(cpu_buffer); 2876 reader = rb_set_head_page(cpu_buffer);
2872 cpu_buffer->reader_page->list.next = reader->list.next; 2877 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
2873 cpu_buffer->reader_page->list.prev = reader->list.prev; 2878 cpu_buffer->reader_page->list.prev = reader->list.prev;
2874 2879
2875 /* 2880 /*
@@ -2906,7 +2911,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2906 * 2911 *
2907 * Now make the new head point back to the reader page. 2912 * Now make the new head point back to the reader page.
2908 */ 2913 */
2909 reader->list.next->prev = &cpu_buffer->reader_page->list; 2914 rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
2910 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 2915 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2911 2916
2912 /* Finally update the reader page to the new head */ 2917 /* Finally update the reader page to the new head */
@@ -3060,13 +3065,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3060 struct ring_buffer_event *event; 3065 struct ring_buffer_event *event;
3061 int nr_loops = 0; 3066 int nr_loops = 0;
3062 3067
3063 if (ring_buffer_iter_empty(iter))
3064 return NULL;
3065
3066 cpu_buffer = iter->cpu_buffer; 3068 cpu_buffer = iter->cpu_buffer;
3067 buffer = cpu_buffer->buffer; 3069 buffer = cpu_buffer->buffer;
3068 3070
3071 /*
3072 * Check if someone performed a consuming read to
3073 * the buffer. A consuming read invalidates the iterator
3074 * and we need to reset the iterator in this case.
3075 */
3076 if (unlikely(iter->cache_read != cpu_buffer->read ||
3077 iter->cache_reader_page != cpu_buffer->reader_page))
3078 rb_iter_reset(iter);
3079
3069 again: 3080 again:
3081 if (ring_buffer_iter_empty(iter))
3082 return NULL;
3083
3070 /* 3084 /*
3071 * We repeat when a timestamp is encountered. 3085 * We repeat when a timestamp is encountered.
3072 * We can get multiple timestamps by nested interrupts or also 3086 * We can get multiple timestamps by nested interrupts or also
@@ -3081,6 +3095,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3081 if (rb_per_cpu_empty(cpu_buffer)) 3095 if (rb_per_cpu_empty(cpu_buffer))
3082 return NULL; 3096 return NULL;
3083 3097
3098 if (iter->head >= local_read(&iter->head_page->page->commit)) {
3099 rb_inc_iter(iter);
3100 goto again;
3101 }
3102
3084 event = rb_iter_head_event(iter); 3103 event = rb_iter_head_event(iter);
3085 3104
3086 switch (event->type_len) { 3105 switch (event->type_len) {
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index b2477caf09c2..df74c7982255 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -8,6 +8,7 @@
8#include <linux/kthread.h> 8#include <linux/kthread.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <asm/local.h>
11 12
12struct rb_page { 13struct rb_page {
13 u64 ts; 14 u64 ts;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8b9f20ab8eed..ed01fdba4a55 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -32,6 +32,7 @@
32#include <linux/splice.h> 32#include <linux/splice.h>
33#include <linux/kdebug.h> 33#include <linux/kdebug.h>
34#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/rwsem.h>
35#include <linux/ctype.h> 36#include <linux/ctype.h>
36#include <linux/init.h> 37#include <linux/init.h>
37#include <linux/poll.h> 38#include <linux/poll.h>
@@ -91,20 +92,17 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled);
91static inline void ftrace_disable_cpu(void) 92static inline void ftrace_disable_cpu(void)
92{ 93{
93 preempt_disable(); 94 preempt_disable();
94 __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); 95 __this_cpu_inc(ftrace_cpu_disabled);
95} 96}
96 97
97static inline void ftrace_enable_cpu(void) 98static inline void ftrace_enable_cpu(void)
98{ 99{
99 __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); 100 __this_cpu_dec(ftrace_cpu_disabled);
100 preempt_enable(); 101 preempt_enable();
101} 102}
102 103
103static cpumask_var_t __read_mostly tracing_buffer_mask; 104static cpumask_var_t __read_mostly tracing_buffer_mask;
104 105
105/* Define which cpu buffers are currently read in trace_pipe */
106static cpumask_var_t tracing_reader_cpumask;
107
108#define for_each_tracing_cpu(cpu) \ 106#define for_each_tracing_cpu(cpu) \
109 for_each_cpu(cpu, tracing_buffer_mask) 107 for_each_cpu(cpu, tracing_buffer_mask)
110 108
@@ -243,12 +241,91 @@ static struct tracer *current_trace __read_mostly;
243 241
244/* 242/*
245 * trace_types_lock is used to protect the trace_types list. 243 * trace_types_lock is used to protect the trace_types list.
246 * This lock is also used to keep user access serialized.
247 * Accesses from userspace will grab this lock while userspace
248 * activities happen inside the kernel.
249 */ 244 */
250static DEFINE_MUTEX(trace_types_lock); 245static DEFINE_MUTEX(trace_types_lock);
251 246
247/*
248 * serialize the access of the ring buffer
249 *
250 * ring buffer serializes readers, but it is low level protection.
251 * The validity of the events (which returns by ring_buffer_peek() ..etc)
252 * are not protected by ring buffer.
253 *
254 * The content of events may become garbage if we allow other process consumes
255 * these events concurrently:
256 * A) the page of the consumed events may become a normal page
257 * (not reader page) in ring buffer, and this page will be rewrited
258 * by events producer.
259 * B) The page of the consumed events may become a page for splice_read,
260 * and this page will be returned to system.
261 *
262 * These primitives allow multi process access to different cpu ring buffer
263 * concurrently.
264 *
265 * These primitives don't distinguish read-only and read-consume access.
266 * Multi read-only access are also serialized.
267 */
268
269#ifdef CONFIG_SMP
270static DECLARE_RWSEM(all_cpu_access_lock);
271static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
272
273static inline void trace_access_lock(int cpu)
274{
275 if (cpu == TRACE_PIPE_ALL_CPU) {
276 /* gain it for accessing the whole ring buffer. */
277 down_write(&all_cpu_access_lock);
278 } else {
279 /* gain it for accessing a cpu ring buffer. */
280
281 /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
282 down_read(&all_cpu_access_lock);
283
284 /* Secondly block other access to this @cpu ring buffer. */
285 mutex_lock(&per_cpu(cpu_access_lock, cpu));
286 }
287}
288
289static inline void trace_access_unlock(int cpu)
290{
291 if (cpu == TRACE_PIPE_ALL_CPU) {
292 up_write(&all_cpu_access_lock);
293 } else {
294 mutex_unlock(&per_cpu(cpu_access_lock, cpu));
295 up_read(&all_cpu_access_lock);
296 }
297}
298
299static inline void trace_access_lock_init(void)
300{
301 int cpu;
302
303 for_each_possible_cpu(cpu)
304 mutex_init(&per_cpu(cpu_access_lock, cpu));
305}
306
307#else
308
309static DEFINE_MUTEX(access_lock);
310
311static inline void trace_access_lock(int cpu)
312{
313 (void)cpu;
314 mutex_lock(&access_lock);
315}
316
317static inline void trace_access_unlock(int cpu)
318{
319 (void)cpu;
320 mutex_unlock(&access_lock);
321}
322
323static inline void trace_access_lock_init(void)
324{
325}
326
327#endif
328
252/* trace_wait is a waitqueue for tasks blocked on trace_poll */ 329/* trace_wait is a waitqueue for tasks blocked on trace_poll */
253static DECLARE_WAIT_QUEUE_HEAD(trace_wait); 330static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
254 331
@@ -951,6 +1028,11 @@ void trace_find_cmdline(int pid, char comm[])
951 return; 1028 return;
952 } 1029 }
953 1030
1031 if (WARN_ON_ONCE(pid < 0)) {
1032 strcpy(comm, "<XXX>");
1033 return;
1034 }
1035
954 if (pid > PID_MAX_DEFAULT) { 1036 if (pid > PID_MAX_DEFAULT) {
955 strcpy(comm, "<...>"); 1037 strcpy(comm, "<...>");
956 return; 1038 return;
@@ -1084,7 +1166,7 @@ trace_function(struct trace_array *tr,
1084 struct ftrace_entry *entry; 1166 struct ftrace_entry *entry;
1085 1167
1086 /* If we are reading the ring buffer, don't trace */ 1168 /* If we are reading the ring buffer, don't trace */
1087 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 1169 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
1088 return; 1170 return;
1089 1171
1090 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), 1172 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1315,8 +1397,10 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1315 entry->fmt = fmt; 1397 entry->fmt = fmt;
1316 1398
1317 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1399 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1318 if (!filter_check_discard(call, entry, buffer, event)) 1400 if (!filter_check_discard(call, entry, buffer, event)) {
1319 ring_buffer_unlock_commit(buffer, event); 1401 ring_buffer_unlock_commit(buffer, event);
1402 ftrace_trace_stack(buffer, flags, 6, pc);
1403 }
1320 1404
1321out_unlock: 1405out_unlock:
1322 arch_spin_unlock(&trace_buf_lock); 1406 arch_spin_unlock(&trace_buf_lock);
@@ -1389,8 +1473,10 @@ int trace_array_vprintk(struct trace_array *tr,
1389 1473
1390 memcpy(&entry->buf, trace_buf, len); 1474 memcpy(&entry->buf, trace_buf, len);
1391 entry->buf[len] = '\0'; 1475 entry->buf[len] = '\0';
1392 if (!filter_check_discard(call, entry, buffer, event)) 1476 if (!filter_check_discard(call, entry, buffer, event)) {
1393 ring_buffer_unlock_commit(buffer, event); 1477 ring_buffer_unlock_commit(buffer, event);
1478 ftrace_trace_stack(buffer, irq_flags, 6, pc);
1479 }
1394 1480
1395 out_unlock: 1481 out_unlock:
1396 arch_spin_unlock(&trace_buf_lock); 1482 arch_spin_unlock(&trace_buf_lock);
@@ -1580,12 +1666,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1580} 1666}
1581 1667
1582/* 1668/*
1583 * No necessary locking here. The worst thing which can
1584 * happen is loosing events consumed at the same time
1585 * by a trace_pipe reader.
1586 * Other than that, we don't risk to crash the ring buffer
1587 * because it serializes the readers.
1588 *
1589 * The current tracer is copied to avoid a global locking 1669 * The current tracer is copied to avoid a global locking
1590 * all around. 1670 * all around.
1591 */ 1671 */
@@ -1640,12 +1720,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1640 } 1720 }
1641 1721
1642 trace_event_read_lock(); 1722 trace_event_read_lock();
1723 trace_access_lock(cpu_file);
1643 return p; 1724 return p;
1644} 1725}
1645 1726
1646static void s_stop(struct seq_file *m, void *p) 1727static void s_stop(struct seq_file *m, void *p)
1647{ 1728{
1729 struct trace_iterator *iter = m->private;
1730
1648 atomic_dec(&trace_record_cmdline_disabled); 1731 atomic_dec(&trace_record_cmdline_disabled);
1732 trace_access_unlock(iter->cpu_file);
1649 trace_event_read_unlock(); 1733 trace_event_read_unlock();
1650} 1734}
1651 1735
@@ -2836,22 +2920,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2836 2920
2837 mutex_lock(&trace_types_lock); 2921 mutex_lock(&trace_types_lock);
2838 2922
2839 /* We only allow one reader per cpu */
2840 if (cpu_file == TRACE_PIPE_ALL_CPU) {
2841 if (!cpumask_empty(tracing_reader_cpumask)) {
2842 ret = -EBUSY;
2843 goto out;
2844 }
2845 cpumask_setall(tracing_reader_cpumask);
2846 } else {
2847 if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
2848 cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
2849 else {
2850 ret = -EBUSY;
2851 goto out;
2852 }
2853 }
2854
2855 /* create a buffer to store the information to pass to userspace */ 2923 /* create a buffer to store the information to pass to userspace */
2856 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2924 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2857 if (!iter) { 2925 if (!iter) {
@@ -2907,12 +2975,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2907 2975
2908 mutex_lock(&trace_types_lock); 2976 mutex_lock(&trace_types_lock);
2909 2977
2910 if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
2911 cpumask_clear(tracing_reader_cpumask);
2912 else
2913 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2914
2915
2916 if (iter->trace->pipe_close) 2978 if (iter->trace->pipe_close)
2917 iter->trace->pipe_close(iter); 2979 iter->trace->pipe_close(iter);
2918 2980
@@ -3074,6 +3136,7 @@ waitagain:
3074 iter->pos = -1; 3136 iter->pos = -1;
3075 3137
3076 trace_event_read_lock(); 3138 trace_event_read_lock();
3139 trace_access_lock(iter->cpu_file);
3077 while (find_next_entry_inc(iter) != NULL) { 3140 while (find_next_entry_inc(iter) != NULL) {
3078 enum print_line_t ret; 3141 enum print_line_t ret;
3079 int len = iter->seq.len; 3142 int len = iter->seq.len;
@@ -3090,6 +3153,7 @@ waitagain:
3090 if (iter->seq.len >= cnt) 3153 if (iter->seq.len >= cnt)
3091 break; 3154 break;
3092 } 3155 }
3156 trace_access_unlock(iter->cpu_file);
3093 trace_event_read_unlock(); 3157 trace_event_read_unlock();
3094 3158
3095 /* Now copy what we have to the user */ 3159 /* Now copy what we have to the user */
@@ -3215,6 +3279,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3215 } 3279 }
3216 3280
3217 trace_event_read_lock(); 3281 trace_event_read_lock();
3282 trace_access_lock(iter->cpu_file);
3218 3283
3219 /* Fill as many pages as possible. */ 3284 /* Fill as many pages as possible. */
3220 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3285 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
@@ -3238,6 +3303,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3238 trace_seq_init(&iter->seq); 3303 trace_seq_init(&iter->seq);
3239 } 3304 }
3240 3305
3306 trace_access_unlock(iter->cpu_file);
3241 trace_event_read_unlock(); 3307 trace_event_read_unlock();
3242 mutex_unlock(&iter->mutex); 3308 mutex_unlock(&iter->mutex);
3243 3309
@@ -3539,10 +3605,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3539 3605
3540 info->read = 0; 3606 info->read = 0;
3541 3607
3608 trace_access_lock(info->cpu);
3542 ret = ring_buffer_read_page(info->tr->buffer, 3609 ret = ring_buffer_read_page(info->tr->buffer,
3543 &info->spare, 3610 &info->spare,
3544 count, 3611 count,
3545 info->cpu, 0); 3612 info->cpu, 0);
3613 trace_access_unlock(info->cpu);
3546 if (ret < 0) 3614 if (ret < 0)
3547 return 0; 3615 return 0;
3548 3616
@@ -3670,6 +3738,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3670 len &= PAGE_MASK; 3738 len &= PAGE_MASK;
3671 } 3739 }
3672 3740
3741 trace_access_lock(info->cpu);
3673 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3742 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3674 3743
3675 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { 3744 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
@@ -3717,6 +3786,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3717 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3786 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3718 } 3787 }
3719 3788
3789 trace_access_unlock(info->cpu);
3720 spd.nr_pages = i; 3790 spd.nr_pages = i;
3721 3791
3722 /* did we read anything? */ 3792 /* did we read anything? */
@@ -3949,7 +4019,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
3949 if (!!(topt->flags->val & topt->opt->bit) != val) { 4019 if (!!(topt->flags->val & topt->opt->bit) != val) {
3950 mutex_lock(&trace_types_lock); 4020 mutex_lock(&trace_types_lock);
3951 ret = __set_tracer_option(current_trace, topt->flags, 4021 ret = __set_tracer_option(current_trace, topt->flags,
3952 topt->opt, val); 4022 topt->opt, !val);
3953 mutex_unlock(&trace_types_lock); 4023 mutex_unlock(&trace_types_lock);
3954 if (ret) 4024 if (ret)
3955 return ret; 4025 return ret;
@@ -4153,6 +4223,8 @@ static __init int tracer_init_debugfs(void)
4153 struct dentry *d_tracer; 4223 struct dentry *d_tracer;
4154 int cpu; 4224 int cpu;
4155 4225
4226 trace_access_lock_init();
4227
4156 d_tracer = tracing_init_dentry(); 4228 d_tracer = tracing_init_dentry();
4157 4229
4158 trace_create_file("tracing_enabled", 0644, d_tracer, 4230 trace_create_file("tracing_enabled", 0644, d_tracer,
@@ -4387,9 +4459,6 @@ __init static int tracer_alloc_buffers(void)
4387 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4459 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4388 goto out_free_buffer_mask; 4460 goto out_free_buffer_mask;
4389 4461
4390 if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4391 goto out_free_tracing_cpumask;
4392
4393 /* To save memory, keep the ring buffer size to its minimum */ 4462 /* To save memory, keep the ring buffer size to its minimum */
4394 if (ring_buffer_expanded) 4463 if (ring_buffer_expanded)
4395 ring_buf_size = trace_buf_size; 4464 ring_buf_size = trace_buf_size;
@@ -4447,8 +4516,6 @@ __init static int tracer_alloc_buffers(void)
4447 return 0; 4516 return 0;
4448 4517
4449out_free_cpumask: 4518out_free_cpumask:
4450 free_cpumask_var(tracing_reader_cpumask);
4451out_free_tracing_cpumask:
4452 free_cpumask_var(tracing_cpumask); 4519 free_cpumask_var(tracing_cpumask);
4453out_free_buffer_mask: 4520out_free_buffer_mask:
4454 free_cpumask_var(tracing_buffer_mask); 4521 free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4df6a77eb196..fd05bcaf91b0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -497,6 +497,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
497#ifdef CONFIG_DYNAMIC_FTRACE 497#ifdef CONFIG_DYNAMIC_FTRACE
498/* TODO: make this variable */ 498/* TODO: make this variable */
499#define FTRACE_GRAPH_MAX_FUNCS 32 499#define FTRACE_GRAPH_MAX_FUNCS 32
500extern int ftrace_graph_filter_enabled;
500extern int ftrace_graph_count; 501extern int ftrace_graph_count;
501extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; 502extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
502 503
@@ -504,7 +505,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
504{ 505{
505 int i; 506 int i;
506 507
507 if (!ftrace_graph_count || test_tsk_trace_graph(current)) 508 if (!ftrace_graph_filter_enabled)
508 return 1; 509 return 1;
509 510
510 for (i = 0; i < ftrace_graph_count; i++) { 511 for (i = 0; i < ftrace_graph_count; i++) {
@@ -791,7 +792,8 @@ extern const char *__stop___trace_bprintk_fmt[];
791 792
792#undef FTRACE_ENTRY 793#undef FTRACE_ENTRY
793#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ 794#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
794 extern struct ftrace_event_call event_##call; 795 extern struct ftrace_event_call \
796 __attribute__((__aligned__(4))) event_##call;
795#undef FTRACE_ENTRY_DUP 797#undef FTRACE_ENTRY_DUP
796#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ 798#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
797 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 799 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4a194f08f88c..b9bc4d470177 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -307,8 +307,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
307 return -1; 307 return -1;
308 if (percent_a > percent_b) 308 if (percent_a > percent_b)
309 return 1; 309 return 1;
310 else 310
311 return 0; 311 if (a->incorrect < b->incorrect)
312 return -1;
313 if (a->incorrect > b->incorrect)
314 return 1;
315
316 /*
317 * Since the above shows worse (incorrect) cases
318 * first, we continue that by showing best (correct)
319 * cases last.
320 */
321 if (a->correct > b->correct)
322 return -1;
323 if (a->correct < b->correct)
324 return 1;
325
326 return 0;
312} 327}
313 328
314static struct tracer_stat annotated_branch_stats = { 329static struct tracer_stat annotated_branch_stats = {
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 9e25573242cf..f0d693005075 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -6,14 +6,12 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/kprobes.h>
9#include "trace.h" 10#include "trace.h"
10 11
11 12
12char *perf_trace_buf; 13static char *perf_trace_buf;
13EXPORT_SYMBOL_GPL(perf_trace_buf); 14static char *perf_trace_buf_nmi;
14
15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
17 15
18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; 16typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
19 17
@@ -120,3 +118,47 @@ void ftrace_profile_disable(int event_id)
120 } 118 }
121 mutex_unlock(&event_mutex); 119 mutex_unlock(&event_mutex);
122} 120}
121
122__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
123 int *rctxp, unsigned long *irq_flags)
124{
125 struct trace_entry *entry;
126 char *trace_buf, *raw_data;
127 int pc, cpu;
128
129 pc = preempt_count();
130
131 /* Protect the per cpu buffer, begin the rcu read side */
132 local_irq_save(*irq_flags);
133
134 *rctxp = perf_swevent_get_recursion_context();
135 if (*rctxp < 0)
136 goto err_recursion;
137
138 cpu = smp_processor_id();
139
140 if (in_nmi())
141 trace_buf = rcu_dereference(perf_trace_buf_nmi);
142 else
143 trace_buf = rcu_dereference(perf_trace_buf);
144
145 if (!trace_buf)
146 goto err;
147
148 raw_data = per_cpu_ptr(trace_buf, cpu);
149
150 /* zero the dead bytes from align to not leak stack to user */
151 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
152
153 entry = (struct trace_entry *)raw_data;
154 tracing_generic_entry_update(entry, *irq_flags, pc);
155 entry->type = type;
156
157 return raw_data;
158err:
159 perf_swevent_put_recursion_context(*rctxp);
160err_recursion:
161 local_irq_restore(*irq_flags);
162 return NULL;
163}
164EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 189b09baf4fb..3f972ad98d04 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -60,10 +60,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
60 return 0; 60 return 0;
61 61
62err: 62err:
63 if (field) { 63 if (field)
64 kfree(field->name); 64 kfree(field->name);
65 kfree(field->type);
66 }
67 kfree(field); 65 kfree(field);
68 66
69 return -ENOMEM; 67 return -ENOMEM;
@@ -520,41 +518,16 @@ out:
520 return ret; 518 return ret;
521} 519}
522 520
523extern char *__bad_type_size(void);
524
525#undef FIELD
526#define FIELD(type, name) \
527 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
528 #type, "common_" #name, offsetof(typeof(field), name), \
529 sizeof(field.name), is_signed_type(type)
530
531static int trace_write_header(struct trace_seq *s)
532{
533 struct trace_entry field;
534
535 /* struct trace_entry */
536 return trace_seq_printf(s,
537 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
538 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
539 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
540 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
541 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
542 "\n",
543 FIELD(unsigned short, type),
544 FIELD(unsigned char, flags),
545 FIELD(unsigned char, preempt_count),
546 FIELD(int, pid),
547 FIELD(int, lock_depth));
548}
549
550static ssize_t 521static ssize_t
551event_format_read(struct file *filp, char __user *ubuf, size_t cnt, 522event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
552 loff_t *ppos) 523 loff_t *ppos)
553{ 524{
554 struct ftrace_event_call *call = filp->private_data; 525 struct ftrace_event_call *call = filp->private_data;
526 struct ftrace_event_field *field;
555 struct trace_seq *s; 527 struct trace_seq *s;
528 int common_field_count = 5;
556 char *buf; 529 char *buf;
557 int r; 530 int r = 0;
558 531
559 if (*ppos) 532 if (*ppos)
560 return 0; 533 return 0;
@@ -565,14 +538,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
565 538
566 trace_seq_init(s); 539 trace_seq_init(s);
567 540
568 /* If any of the first writes fail, so will the show_format. */
569
570 trace_seq_printf(s, "name: %s\n", call->name); 541 trace_seq_printf(s, "name: %s\n", call->name);
571 trace_seq_printf(s, "ID: %d\n", call->id); 542 trace_seq_printf(s, "ID: %d\n", call->id);
572 trace_seq_printf(s, "format:\n"); 543 trace_seq_printf(s, "format:\n");
573 trace_write_header(s);
574 544
575 r = call->show_format(call, s); 545 list_for_each_entry_reverse(field, &call->fields, link) {
546 /*
547 * Smartly shows the array type(except dynamic array).
548 * Normal:
549 * field:TYPE VAR
550 * If TYPE := TYPE[LEN], it is shown:
551 * field:TYPE VAR[LEN]
552 */
553 const char *array_descriptor = strchr(field->type, '[');
554
555 if (!strncmp(field->type, "__data_loc", 10))
556 array_descriptor = NULL;
557
558 if (!array_descriptor) {
559 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
560 "\tsize:%u;\tsigned:%d;\n",
561 field->type, field->name, field->offset,
562 field->size, !!field->is_signed);
563 } else {
564 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
565 "\tsize:%u;\tsigned:%d;\n",
566 (int)(array_descriptor - field->type),
567 field->type, field->name,
568 array_descriptor, field->offset,
569 field->size, !!field->is_signed);
570 }
571
572 if (--common_field_count == 0)
573 r = trace_seq_printf(s, "\n");
574
575 if (!r)
576 break;
577 }
578
579 if (r)
580 r = trace_seq_printf(s, "\nprint fmt: %s\n",
581 call->print_fmt);
582
576 if (!r) { 583 if (!r) {
577 /* 584 /*
578 * ug! The format output is bigger than a PAGE!! 585 * ug! The format output is bigger than a PAGE!!
@@ -948,10 +955,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
948 filter); 955 filter);
949 } 956 }
950 957
951 /* A trace may not want to export its format */
952 if (!call->show_format)
953 return 0;
954
955 trace_create_file("format", 0444, call->dir, call, 958 trace_create_file("format", 0444, call->dir, call,
956 format); 959 format);
957 960
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 50504cb228de..4615f62a04f1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -211,8 +211,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
211{ 211{
212 char **addr = (char **)(event + pred->offset); 212 char **addr = (char **)(event + pred->offset);
213 int cmp, match; 213 int cmp, match;
214 int len = strlen(*addr) + 1; /* including tailing '\0' */
214 215
215 cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len); 216 cmp = pred->regex.match(*addr, &pred->regex, len);
216 217
217 match = cmp ^ pred->not; 218 match = cmp ^ pred->not;
218 219
@@ -251,7 +252,18 @@ static int filter_pred_none(struct filter_pred *pred, void *event,
251 return 0; 252 return 0;
252} 253}
253 254
254/* Basic regex callbacks */ 255/*
256 * regex_match_foo - Basic regex callbacks
257 *
258 * @str: the string to be searched
259 * @r: the regex structure containing the pattern string
260 * @len: the length of the string to be searched (including '\0')
261 *
262 * Note:
263 * - @str might not be NULL-terminated if it's of type DYN_STRING
264 * or STATIC_STRING
265 */
266
255static int regex_match_full(char *str, struct regex *r, int len) 267static int regex_match_full(char *str, struct regex *r, int len)
256{ 268{
257 if (strncmp(str, r->pattern, len) == 0) 269 if (strncmp(str, r->pattern, len) == 0)
@@ -261,23 +273,24 @@ static int regex_match_full(char *str, struct regex *r, int len)
261 273
262static int regex_match_front(char *str, struct regex *r, int len) 274static int regex_match_front(char *str, struct regex *r, int len)
263{ 275{
264 if (strncmp(str, r->pattern, len) == 0) 276 if (strncmp(str, r->pattern, r->len) == 0)
265 return 1; 277 return 1;
266 return 0; 278 return 0;
267} 279}
268 280
269static int regex_match_middle(char *str, struct regex *r, int len) 281static int regex_match_middle(char *str, struct regex *r, int len)
270{ 282{
271 if (strstr(str, r->pattern)) 283 if (strnstr(str, r->pattern, len))
272 return 1; 284 return 1;
273 return 0; 285 return 0;
274} 286}
275 287
276static int regex_match_end(char *str, struct regex *r, int len) 288static int regex_match_end(char *str, struct regex *r, int len)
277{ 289{
278 char *ptr = strstr(str, r->pattern); 290 int strlen = len - 1;
279 291
280 if (ptr && (ptr[r->len] == 0)) 292 if (strlen >= r->len &&
293 memcmp(str + strlen - r->len, r->pattern, r->len) == 0)
281 return 1; 294 return 1;
282 return 0; 295 return 0;
283} 296}
@@ -781,10 +794,8 @@ static int filter_add_pred(struct filter_parse_state *ps,
781 pred->regex.field_len = field->size; 794 pred->regex.field_len = field->size;
782 } else if (field->filter_type == FILTER_DYN_STRING) 795 } else if (field->filter_type == FILTER_DYN_STRING)
783 fn = filter_pred_strloc; 796 fn = filter_pred_strloc;
784 else { 797 else
785 fn = filter_pred_pchar; 798 fn = filter_pred_pchar;
786 pred->regex.field_len = strlen(pred->regex.pattern);
787 }
788 } else { 799 } else {
789 if (field->is_signed) 800 if (field->is_signed)
790 ret = strict_strtoll(pred->regex.pattern, 0, &val); 801 ret = strict_strtoll(pred->regex.pattern, 0, &val);
@@ -1360,7 +1371,7 @@ out_unlock:
1360 return err; 1371 return err;
1361} 1372}
1362 1373
1363#ifdef CONFIG_EVENT_PROFILE 1374#ifdef CONFIG_PERF_EVENTS
1364 1375
1365void ftrace_profile_free_filter(struct perf_event *event) 1376void ftrace_profile_free_filter(struct perf_event *event)
1366{ 1377{
@@ -1428,5 +1439,5 @@ out_unlock:
1428 return err; 1439 return err;
1429} 1440}
1430 1441
1431#endif /* CONFIG_EVENT_PROFILE */ 1442#endif /* CONFIG_PERF_EVENTS */
1432 1443
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 458e5bfe26d0..e091f64ba6ce 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \
62 62
63#include "trace_entries.h" 63#include "trace_entries.h"
64 64
65
66#undef __field
67#define __field(type, item) \
68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
70 offsetof(typeof(field), item), \
71 sizeof(field.item), is_signed_type(type)); \
72 if (!ret) \
73 return 0;
74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item), \
81 is_signed_type(type)); \
82 if (!ret) \
83 return 0;
84
85#undef __array
86#define __array(type, item, len) \
87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
89 offsetof(typeof(field), item), \
90 sizeof(field.item), is_signed_type(type)); \
91 if (!ret) \
92 return 0;
93
94#undef __array_desc
95#define __array_desc(type, container, item, len) \
96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
98 offsetof(typeof(field), container.item), \
99 sizeof(field.container.item), \
100 is_signed_type(type)); \
101 if (!ret) \
102 return 0;
103
104#undef __dynamic_array
105#define __dynamic_array(type, item) \
106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
110 if (!ret) \
111 return 0;
112
113#undef F_printk
114#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
115
116#undef __entry
117#define __entry REC
118
119#undef FTRACE_ENTRY
120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
121static int \
122ftrace_format_##name(struct ftrace_event_call *unused, \
123 struct trace_seq *s) \
124{ \
125 struct struct_name field __attribute__((unused)); \
126 int ret = 0; \
127 \
128 tstruct; \
129 \
130 trace_seq_printf(s, "\nprint fmt: " print); \
131 \
132 return ret; \
133}
134
135#include "trace_entries.h"
136
137#undef __field 65#undef __field
138#define __field(type, item) \ 66#define __field(type, item) \
139 ret = trace_define_field(event_call, #type, #item, \ 67 ret = trace_define_field(event_call, #type, #item, \
@@ -158,7 +86,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
158 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 86 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
159 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 87 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
160 offsetof(typeof(field), item), \ 88 offsetof(typeof(field), item), \
161 sizeof(field.item), 0, FILTER_OTHER); \ 89 sizeof(field.item), \
90 is_signed_type(type), FILTER_OTHER); \
162 if (ret) \ 91 if (ret) \
163 return ret; 92 return ret;
164 93
@@ -168,13 +97,18 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
168 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 97 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
169 offsetof(typeof(field), \ 98 offsetof(typeof(field), \
170 container.item), \ 99 container.item), \
171 sizeof(field.container.item), 0, \ 100 sizeof(field.container.item), \
172 FILTER_OTHER); \ 101 is_signed_type(type), FILTER_OTHER); \
173 if (ret) \ 102 if (ret) \
174 return ret; 103 return ret;
175 104
176#undef __dynamic_array 105#undef __dynamic_array
177#define __dynamic_array(type, item) 106#define __dynamic_array(type, item) \
107 ret = trace_define_field(event_call, #type, #item, \
108 offsetof(typeof(field), item), \
109 0, is_signed_type(type), FILTER_OTHER);\
110 if (ret) \
111 return ret;
178 112
179#undef FTRACE_ENTRY 113#undef FTRACE_ENTRY
180#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 114#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
@@ -197,6 +131,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
197 return 0; 131 return 0;
198} 132}
199 133
134#undef __entry
135#define __entry REC
136
200#undef __field 137#undef __field
201#define __field(type, item) 138#define __field(type, item)
202 139
@@ -212,6 +149,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
212#undef __dynamic_array 149#undef __dynamic_array
213#define __dynamic_array(type, item) 150#define __dynamic_array(type, item)
214 151
152#undef F_printk
153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154
215#undef FTRACE_ENTRY 155#undef FTRACE_ENTRY
216#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 156#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
217 \ 157 \
@@ -222,7 +162,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
222 .id = type, \ 162 .id = type, \
223 .system = __stringify(TRACE_SYSTEM), \ 163 .system = __stringify(TRACE_SYSTEM), \
224 .raw_init = ftrace_raw_init_event, \ 164 .raw_init = ftrace_raw_init_event, \
225 .show_format = ftrace_format_##call, \ 165 .print_fmt = print, \
226 .define_fields = ftrace_define_fields_##call, \ 166 .define_fields = ftrace_define_fields_##call, \
227}; \ 167}; \
228 168
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b1342c5d37cf..3fc2a575664f 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -18,6 +18,7 @@ struct fgraph_cpu_data {
18 pid_t last_pid; 18 pid_t last_pid;
19 int depth; 19 int depth;
20 int ignore; 20 int ignore;
21 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
21}; 22};
22 23
23struct fgraph_data { 24struct fgraph_data {
@@ -187,7 +188,7 @@ static int __trace_graph_entry(struct trace_array *tr,
187 struct ring_buffer *buffer = tr->buffer; 188 struct ring_buffer *buffer = tr->buffer;
188 struct ftrace_graph_ent_entry *entry; 189 struct ftrace_graph_ent_entry *entry;
189 190
190 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 191 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
191 return 0; 192 return 0;
192 193
193 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, 194 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -212,13 +213,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
212 int cpu; 213 int cpu;
213 int pc; 214 int pc;
214 215
215 if (unlikely(!tr))
216 return 0;
217
218 if (!ftrace_trace_task(current)) 216 if (!ftrace_trace_task(current))
219 return 0; 217 return 0;
220 218
221 if (!ftrace_graph_addr(trace->func)) 219 /* trace it when it is-nested-in or is a function enabled. */
220 if (!(trace->depth || ftrace_graph_addr(trace->func)))
222 return 0; 221 return 0;
223 222
224 local_irq_save(flags); 223 local_irq_save(flags);
@@ -231,9 +230,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
231 } else { 230 } else {
232 ret = 0; 231 ret = 0;
233 } 232 }
234 /* Only do the atomic if it is not already set */
235 if (!test_tsk_trace_graph(current))
236 set_tsk_trace_graph(current);
237 233
238 atomic_dec(&data->disabled); 234 atomic_dec(&data->disabled);
239 local_irq_restore(flags); 235 local_irq_restore(flags);
@@ -251,7 +247,7 @@ static void __trace_graph_return(struct trace_array *tr,
251 struct ring_buffer *buffer = tr->buffer; 247 struct ring_buffer *buffer = tr->buffer;
252 struct ftrace_graph_ret_entry *entry; 248 struct ftrace_graph_ret_entry *entry;
253 249
254 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 250 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
255 return; 251 return;
256 252
257 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, 253 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -281,17 +277,24 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
281 pc = preempt_count(); 277 pc = preempt_count();
282 __trace_graph_return(tr, trace, flags, pc); 278 __trace_graph_return(tr, trace, flags, pc);
283 } 279 }
284 if (!trace->depth)
285 clear_tsk_trace_graph(current);
286 atomic_dec(&data->disabled); 280 atomic_dec(&data->disabled);
287 local_irq_restore(flags); 281 local_irq_restore(flags);
288} 282}
289 283
284void set_graph_array(struct trace_array *tr)
285{
286 graph_array = tr;
287
288 /* Make graph_array visible before we start tracing */
289
290 smp_mb();
291}
292
290static int graph_trace_init(struct trace_array *tr) 293static int graph_trace_init(struct trace_array *tr)
291{ 294{
292 int ret; 295 int ret;
293 296
294 graph_array = tr; 297 set_graph_array(tr);
295 ret = register_ftrace_graph(&trace_graph_return, 298 ret = register_ftrace_graph(&trace_graph_return,
296 &trace_graph_entry); 299 &trace_graph_entry);
297 if (ret) 300 if (ret)
@@ -301,11 +304,6 @@ static int graph_trace_init(struct trace_array *tr)
301 return 0; 304 return 0;
302} 305}
303 306
304void set_graph_array(struct trace_array *tr)
305{
306 graph_array = tr;
307}
308
309static void graph_trace_reset(struct trace_array *tr) 307static void graph_trace_reset(struct trace_array *tr)
310{ 308{
311 tracing_stop_cmdline_record(); 309 tracing_stop_cmdline_record();
@@ -673,15 +671,21 @@ print_graph_entry_leaf(struct trace_iterator *iter,
673 duration = graph_ret->rettime - graph_ret->calltime; 671 duration = graph_ret->rettime - graph_ret->calltime;
674 672
675 if (data) { 673 if (data) {
674 struct fgraph_cpu_data *cpu_data;
676 int cpu = iter->cpu; 675 int cpu = iter->cpu;
677 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 676
677 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
678 678
679 /* 679 /*
680 * Comments display at + 1 to depth. Since 680 * Comments display at + 1 to depth. Since
681 * this is a leaf function, keep the comments 681 * this is a leaf function, keep the comments
682 * equal to this depth. 682 * equal to this depth.
683 */ 683 */
684 *depth = call->depth - 1; 684 cpu_data->depth = call->depth - 1;
685
686 /* No need to keep this function around for this depth */
687 if (call->depth < FTRACE_RETFUNC_DEPTH)
688 cpu_data->enter_funcs[call->depth] = 0;
685 } 689 }
686 690
687 /* Overhead */ 691 /* Overhead */
@@ -721,10 +725,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
721 int i; 725 int i;
722 726
723 if (data) { 727 if (data) {
728 struct fgraph_cpu_data *cpu_data;
724 int cpu = iter->cpu; 729 int cpu = iter->cpu;
725 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
726 730
727 *depth = call->depth; 731 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
732 cpu_data->depth = call->depth;
733
734 /* Save this function pointer to see if the exit matches */
735 if (call->depth < FTRACE_RETFUNC_DEPTH)
736 cpu_data->enter_funcs[call->depth] = call->func;
728 } 737 }
729 738
730 /* No overhead */ 739 /* No overhead */
@@ -854,19 +863,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
854 struct fgraph_data *data = iter->private; 863 struct fgraph_data *data = iter->private;
855 pid_t pid = ent->pid; 864 pid_t pid = ent->pid;
856 int cpu = iter->cpu; 865 int cpu = iter->cpu;
866 int func_match = 1;
857 int ret; 867 int ret;
858 int i; 868 int i;
859 869
860 if (data) { 870 if (data) {
871 struct fgraph_cpu_data *cpu_data;
861 int cpu = iter->cpu; 872 int cpu = iter->cpu;
862 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 873
874 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
863 875
864 /* 876 /*
865 * Comments display at + 1 to depth. This is the 877 * Comments display at + 1 to depth. This is the
866 * return from a function, we now want the comments 878 * return from a function, we now want the comments
867 * to display at the same level of the bracket. 879 * to display at the same level of the bracket.
868 */ 880 */
869 *depth = trace->depth - 1; 881 cpu_data->depth = trace->depth - 1;
882
883 if (trace->depth < FTRACE_RETFUNC_DEPTH) {
884 if (cpu_data->enter_funcs[trace->depth] != trace->func)
885 func_match = 0;
886 cpu_data->enter_funcs[trace->depth] = 0;
887 }
870 } 888 }
871 889
872 if (print_graph_prologue(iter, s, 0, 0)) 890 if (print_graph_prologue(iter, s, 0, 0))
@@ -891,9 +909,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
891 return TRACE_TYPE_PARTIAL_LINE; 909 return TRACE_TYPE_PARTIAL_LINE;
892 } 910 }
893 911
894 ret = trace_seq_printf(s, "}\n"); 912 /*
895 if (!ret) 913 * If the return function does not have a matching entry,
896 return TRACE_TYPE_PARTIAL_LINE; 914 * then the entry was lost. Instead of just printing
915 * the '}' and letting the user guess what function this
916 * belongs to, write out the function name.
917 */
918 if (func_match) {
919 ret = trace_seq_printf(s, "}\n");
920 if (!ret)
921 return TRACE_TYPE_PARTIAL_LINE;
922 } else {
923 ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func);
924 if (!ret)
925 return TRACE_TYPE_PARTIAL_LINE;
926 }
897 927
898 /* Overrun */ 928 /* Overrun */
899 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 929 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 7ecab06547a5..505c92273b1a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -91,11 +91,6 @@ static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
91 return retval; 91 return retval;
92} 92}
93 93
94static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
95{
96 return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
97}
98
99static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, 94static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
100 void *dummy) 95 void *dummy)
101{ 96{
@@ -231,9 +226,7 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
231{ 226{
232 int ret = -EINVAL; 227 int ret = -EINVAL;
233 228
234 if (ff->func == fetch_argument) 229 if (ff->func == fetch_register) {
235 ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
236 else if (ff->func == fetch_register) {
237 const char *name; 230 const char *name;
238 name = regs_query_register_name((unsigned int)((long)ff->data)); 231 name = regs_query_register_name((unsigned int)((long)ff->data));
239 ret = snprintf(buf, n, "%%%s", name); 232 ret = snprintf(buf, n, "%%%s", name);
@@ -282,6 +275,18 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
282static int kretprobe_dispatcher(struct kretprobe_instance *ri, 275static int kretprobe_dispatcher(struct kretprobe_instance *ri,
283 struct pt_regs *regs); 276 struct pt_regs *regs);
284 277
278/* Check the name is good for event/group */
279static int check_event_name(const char *name)
280{
281 if (!isalpha(*name) && *name != '_')
282 return 0;
283 while (*++name != '\0') {
284 if (!isalpha(*name) && !isdigit(*name) && *name != '_')
285 return 0;
286 }
287 return 1;
288}
289
285/* 290/*
286 * Allocate new trace_probe and initialize it (including kprobes). 291 * Allocate new trace_probe and initialize it (including kprobes).
287 */ 292 */
@@ -293,10 +298,11 @@ static struct trace_probe *alloc_trace_probe(const char *group,
293 int nargs, int is_return) 298 int nargs, int is_return)
294{ 299{
295 struct trace_probe *tp; 300 struct trace_probe *tp;
301 int ret = -ENOMEM;
296 302
297 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); 303 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
298 if (!tp) 304 if (!tp)
299 return ERR_PTR(-ENOMEM); 305 return ERR_PTR(ret);
300 306
301 if (symbol) { 307 if (symbol) {
302 tp->symbol = kstrdup(symbol, GFP_KERNEL); 308 tp->symbol = kstrdup(symbol, GFP_KERNEL);
@@ -312,14 +318,20 @@ static struct trace_probe *alloc_trace_probe(const char *group,
312 else 318 else
313 tp->rp.kp.pre_handler = kprobe_dispatcher; 319 tp->rp.kp.pre_handler = kprobe_dispatcher;
314 320
315 if (!event) 321 if (!event || !check_event_name(event)) {
322 ret = -EINVAL;
316 goto error; 323 goto error;
324 }
325
317 tp->call.name = kstrdup(event, GFP_KERNEL); 326 tp->call.name = kstrdup(event, GFP_KERNEL);
318 if (!tp->call.name) 327 if (!tp->call.name)
319 goto error; 328 goto error;
320 329
321 if (!group) 330 if (!group || !check_event_name(group)) {
331 ret = -EINVAL;
322 goto error; 332 goto error;
333 }
334
323 tp->call.system = kstrdup(group, GFP_KERNEL); 335 tp->call.system = kstrdup(group, GFP_KERNEL);
324 if (!tp->call.system) 336 if (!tp->call.system)
325 goto error; 337 goto error;
@@ -330,7 +342,7 @@ error:
330 kfree(tp->call.name); 342 kfree(tp->call.name);
331 kfree(tp->symbol); 343 kfree(tp->symbol);
332 kfree(tp); 344 kfree(tp);
333 return ERR_PTR(-ENOMEM); 345 return ERR_PTR(ret);
334} 346}
335 347
336static void free_probe_arg(struct probe_arg *arg) 348static void free_probe_arg(struct probe_arg *arg)
@@ -470,14 +482,6 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
470 } 482 }
471 } else 483 } else
472 ret = -EINVAL; 484 ret = -EINVAL;
473 } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
474 ret = strict_strtoul(arg + 3, 10, &param);
475 if (ret || param > PARAM_MAX_ARGS)
476 ret = -EINVAL;
477 else {
478 ff->func = fetch_argument;
479 ff->data = (void *)param;
480 }
481 } else 485 } else
482 ret = -EINVAL; 486 ret = -EINVAL;
483 return ret; 487 return ret;
@@ -592,7 +596,6 @@ static int create_trace_probe(int argc, char **argv)
592 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] 596 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
593 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] 597 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
594 * Fetch args: 598 * Fetch args:
595 * $argN : fetch Nth of function argument. (N:0-)
596 * $retval : fetch return value 599 * $retval : fetch return value
597 * $stack : fetch stack address 600 * $stack : fetch stack address
598 * $stackN : fetch Nth of stack (N:0-) 601 * $stackN : fetch Nth of stack (N:0-)
@@ -632,12 +635,12 @@ static int create_trace_probe(int argc, char **argv)
632 event = strchr(group, '/') + 1; 635 event = strchr(group, '/') + 1;
633 event[-1] = '\0'; 636 event[-1] = '\0';
634 if (strlen(group) == 0) { 637 if (strlen(group) == 0) {
635 pr_info("Group name is not specifiled\n"); 638 pr_info("Group name is not specified\n");
636 return -EINVAL; 639 return -EINVAL;
637 } 640 }
638 } 641 }
639 if (strlen(event) == 0) { 642 if (strlen(event) == 0) {
640 pr_info("Event name is not specifiled\n"); 643 pr_info("Event name is not specified\n");
641 return -EINVAL; 644 return -EINVAL;
642 } 645 }
643 } 646 }
@@ -670,7 +673,7 @@ static int create_trace_probe(int argc, char **argv)
670 return -EINVAL; 673 return -EINVAL;
671 } 674 }
672 /* an address specified */ 675 /* an address specified */
673 ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); 676 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
674 if (ret) { 677 if (ret) {
675 pr_info("Failed to parse address.\n"); 678 pr_info("Failed to parse address.\n");
676 return ret; 679 return ret;
@@ -695,10 +698,10 @@ static int create_trace_probe(int argc, char **argv)
695 if (!event) { 698 if (!event) {
696 /* Make a new event name */ 699 /* Make a new event name */
697 if (symbol) 700 if (symbol)
698 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld", 701 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
699 is_return ? 'r' : 'p', symbol, offset); 702 is_return ? 'r' : 'p', symbol, offset);
700 else 703 else
701 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p", 704 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
702 is_return ? 'r' : 'p', addr); 705 is_return ? 'r' : 'p', addr);
703 event = buf; 706 event = buf;
704 } 707 }
@@ -939,7 +942,7 @@ static const struct file_operations kprobe_profile_ops = {
939}; 942};
940 943
941/* Kprobe handler */ 944/* Kprobe handler */
942static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 945static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
943{ 946{
944 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 947 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
945 struct kprobe_trace_entry *entry; 948 struct kprobe_trace_entry *entry;
@@ -959,7 +962,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
959 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 962 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
960 irq_flags, pc); 963 irq_flags, pc);
961 if (!event) 964 if (!event)
962 return 0; 965 return;
963 966
964 entry = ring_buffer_event_data(event); 967 entry = ring_buffer_event_data(event);
965 entry->nargs = tp->nr_args; 968 entry->nargs = tp->nr_args;
@@ -969,11 +972,10 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
969 972
970 if (!filter_current_check_discard(buffer, call, entry, event)) 973 if (!filter_current_check_discard(buffer, call, entry, event))
971 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 974 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
972 return 0;
973} 975}
974 976
975/* Kretprobe handler */ 977/* Kretprobe handler */
976static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, 978static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
977 struct pt_regs *regs) 979 struct pt_regs *regs)
978{ 980{
979 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 981 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -992,7 +994,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
992 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 994 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
993 irq_flags, pc); 995 irq_flags, pc);
994 if (!event) 996 if (!event)
995 return 0; 997 return;
996 998
997 entry = ring_buffer_event_data(event); 999 entry = ring_buffer_event_data(event);
998 entry->nargs = tp->nr_args; 1000 entry->nargs = tp->nr_args;
@@ -1003,8 +1005,6 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
1003 1005
1004 if (!filter_current_check_discard(buffer, call, entry, event)) 1006 if (!filter_current_check_discard(buffer, call, entry, event))
1005 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1007 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
1006
1007 return 0;
1008} 1008}
1009 1009
1010/* Event entry printers */ 1010/* Event entry printers */
@@ -1155,212 +1155,123 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1155 return 0; 1155 return 0;
1156} 1156}
1157 1157
1158static int __probe_event_show_format(struct trace_seq *s, 1158static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1159 struct trace_probe *tp, const char *fmt,
1160 const char *arg)
1161{ 1159{
1162 int i; 1160 int i;
1161 int pos = 0;
1163 1162
1164 /* Show format */ 1163 const char *fmt, *arg;
1165 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1166 return 0;
1167
1168 for (i = 0; i < tp->nr_args; i++)
1169 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name))
1170 return 0;
1171 1164
1172 if (!trace_seq_printf(s, "\", %s", arg)) 1165 if (!probe_is_return(tp)) {
1173 return 0; 1166 fmt = "(%lx)";
1167 arg = "REC->" FIELD_STRING_IP;
1168 } else {
1169 fmt = "(%lx <- %lx)";
1170 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
1171 }
1174 1172
1175 for (i = 0; i < tp->nr_args; i++) 1173 /* When len=0, we just calculate the needed length */
1176 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) 1174#define LEN_OR_ZERO (len ? len - pos : 0)
1177 return 0;
1178 1175
1179 return trace_seq_puts(s, "\n"); 1176 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1180}
1181 1177
1182#undef SHOW_FIELD 1178 for (i = 0; i < tp->nr_args; i++) {
1183#define SHOW_FIELD(type, item, name) \ 1179 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx",
1184 do { \ 1180 tp->args[i].name);
1185 ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \ 1181 }
1186 "offset:%u;\tsize:%u;\n", name, \
1187 (unsigned int)offsetof(typeof(field), item),\
1188 (unsigned int)sizeof(type)); \
1189 if (!ret) \
1190 return 0; \
1191 } while (0)
1192 1182
1193static int kprobe_event_show_format(struct ftrace_event_call *call, 1183 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1194 struct trace_seq *s)
1195{
1196 struct kprobe_trace_entry field __attribute__((unused));
1197 int ret, i;
1198 struct trace_probe *tp = (struct trace_probe *)call->data;
1199 1184
1200 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); 1185 for (i = 0; i < tp->nr_args; i++) {
1201 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); 1186 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1187 tp->args[i].name);
1188 }
1202 1189
1203 /* Show fields */ 1190#undef LEN_OR_ZERO
1204 for (i = 0; i < tp->nr_args; i++)
1205 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1206 trace_seq_puts(s, "\n");
1207 1191
1208 return __probe_event_show_format(s, tp, "(%lx)", 1192 /* return the length of print_fmt */
1209 "REC->" FIELD_STRING_IP); 1193 return pos;
1210} 1194}
1211 1195
1212static int kretprobe_event_show_format(struct ftrace_event_call *call, 1196static int set_print_fmt(struct trace_probe *tp)
1213 struct trace_seq *s)
1214{ 1197{
1215 struct kretprobe_trace_entry field __attribute__((unused)); 1198 int len;
1216 int ret, i; 1199 char *print_fmt;
1217 struct trace_probe *tp = (struct trace_probe *)call->data;
1218 1200
1219 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); 1201 /* First: called with 0 length to calculate the needed length */
1220 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); 1202 len = __set_print_fmt(tp, NULL, 0);
1221 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); 1203 print_fmt = kmalloc(len + 1, GFP_KERNEL);
1204 if (!print_fmt)
1205 return -ENOMEM;
1222 1206
1223 /* Show fields */ 1207 /* Second: actually write the @print_fmt */
1224 for (i = 0; i < tp->nr_args; i++) 1208 __set_print_fmt(tp, print_fmt, len + 1);
1225 SHOW_FIELD(unsigned long, args[i], tp->args[i].name); 1209 tp->call.print_fmt = print_fmt;
1226 trace_seq_puts(s, "\n");
1227 1210
1228 return __probe_event_show_format(s, tp, "(%lx <- %lx)", 1211 return 0;
1229 "REC->" FIELD_STRING_FUNC
1230 ", REC->" FIELD_STRING_RETIP);
1231} 1212}
1232 1213
1233#ifdef CONFIG_EVENT_PROFILE 1214#ifdef CONFIG_PERF_EVENTS
1234 1215
1235/* Kprobe profile handler */ 1216/* Kprobe profile handler */
1236static __kprobes int kprobe_profile_func(struct kprobe *kp, 1217static __kprobes void kprobe_profile_func(struct kprobe *kp,
1237 struct pt_regs *regs) 1218 struct pt_regs *regs)
1238{ 1219{
1239 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1220 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1240 struct ftrace_event_call *call = &tp->call; 1221 struct ftrace_event_call *call = &tp->call;
1241 struct kprobe_trace_entry *entry; 1222 struct kprobe_trace_entry *entry;
1242 struct trace_entry *ent; 1223 int size, __size, i;
1243 int size, __size, i, pc, __cpu;
1244 unsigned long irq_flags; 1224 unsigned long irq_flags;
1245 char *trace_buf;
1246 char *raw_data;
1247 int rctx; 1225 int rctx;
1248 1226
1249 pc = preempt_count();
1250 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1227 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1251 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1228 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1252 size -= sizeof(u32); 1229 size -= sizeof(u32);
1253 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1230 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1254 "profile buffer not large enough")) 1231 "profile buffer not large enough"))
1255 return 0; 1232 return;
1256 1233
1257 /* 1234 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
1258 * Protect the non nmi buffer 1235 if (!entry)
1259 * This also protects the rcu read side 1236 return;
1260 */
1261 local_irq_save(irq_flags);
1262
1263 rctx = perf_swevent_get_recursion_context();
1264 if (rctx < 0)
1265 goto end_recursion;
1266
1267 __cpu = smp_processor_id();
1268
1269 if (in_nmi())
1270 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1271 else
1272 trace_buf = rcu_dereference(perf_trace_buf);
1273
1274 if (!trace_buf)
1275 goto end;
1276
1277 raw_data = per_cpu_ptr(trace_buf, __cpu);
1278
1279 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1280 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1281 entry = (struct kprobe_trace_entry *)raw_data;
1282 ent = &entry->ent;
1283 1237
1284 tracing_generic_entry_update(ent, irq_flags, pc);
1285 ent->type = call->id;
1286 entry->nargs = tp->nr_args; 1238 entry->nargs = tp->nr_args;
1287 entry->ip = (unsigned long)kp->addr; 1239 entry->ip = (unsigned long)kp->addr;
1288 for (i = 0; i < tp->nr_args; i++) 1240 for (i = 0; i < tp->nr_args; i++)
1289 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1241 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1290 perf_tp_event(call->id, entry->ip, 1, entry, size);
1291
1292end:
1293 perf_swevent_put_recursion_context(rctx);
1294end_recursion:
1295 local_irq_restore(irq_flags);
1296 1242
1297 return 0; 1243 ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags);
1298} 1244}
1299 1245
1300/* Kretprobe profile handler */ 1246/* Kretprobe profile handler */
1301static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, 1247static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
1302 struct pt_regs *regs) 1248 struct pt_regs *regs)
1303{ 1249{
1304 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1250 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1305 struct ftrace_event_call *call = &tp->call; 1251 struct ftrace_event_call *call = &tp->call;
1306 struct kretprobe_trace_entry *entry; 1252 struct kretprobe_trace_entry *entry;
1307 struct trace_entry *ent; 1253 int size, __size, i;
1308 int size, __size, i, pc, __cpu;
1309 unsigned long irq_flags; 1254 unsigned long irq_flags;
1310 char *trace_buf;
1311 char *raw_data;
1312 int rctx; 1255 int rctx;
1313 1256
1314 pc = preempt_count();
1315 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1257 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1316 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1258 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1317 size -= sizeof(u32); 1259 size -= sizeof(u32);
1318 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1260 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1319 "profile buffer not large enough")) 1261 "profile buffer not large enough"))
1320 return 0; 1262 return;
1321
1322 /*
1323 * Protect the non nmi buffer
1324 * This also protects the rcu read side
1325 */
1326 local_irq_save(irq_flags);
1327
1328 rctx = perf_swevent_get_recursion_context();
1329 if (rctx < 0)
1330 goto end_recursion;
1331
1332 __cpu = smp_processor_id();
1333
1334 if (in_nmi())
1335 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1336 else
1337 trace_buf = rcu_dereference(perf_trace_buf);
1338
1339 if (!trace_buf)
1340 goto end;
1341
1342 raw_data = per_cpu_ptr(trace_buf, __cpu);
1343 1263
1344 /* Zero dead bytes from alignment to avoid buffer leak to userspace */ 1264 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
1345 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 1265 if (!entry)
1346 entry = (struct kretprobe_trace_entry *)raw_data; 1266 return;
1347 ent = &entry->ent;
1348 1267
1349 tracing_generic_entry_update(ent, irq_flags, pc);
1350 ent->type = call->id;
1351 entry->nargs = tp->nr_args; 1268 entry->nargs = tp->nr_args;
1352 entry->func = (unsigned long)tp->rp.kp.addr; 1269 entry->func = (unsigned long)tp->rp.kp.addr;
1353 entry->ret_ip = (unsigned long)ri->ret_addr; 1270 entry->ret_ip = (unsigned long)ri->ret_addr;
1354 for (i = 0; i < tp->nr_args; i++) 1271 for (i = 0; i < tp->nr_args; i++)
1355 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1272 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1356 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1357
1358end:
1359 perf_swevent_put_recursion_context(rctx);
1360end_recursion:
1361 local_irq_restore(irq_flags);
1362 1273
1363 return 0; 1274 ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags);
1364} 1275}
1365 1276
1366static int probe_profile_enable(struct ftrace_event_call *call) 1277static int probe_profile_enable(struct ftrace_event_call *call)
@@ -1388,7 +1299,7 @@ static void probe_profile_disable(struct ftrace_event_call *call)
1388 disable_kprobe(&tp->rp.kp); 1299 disable_kprobe(&tp->rp.kp);
1389 } 1300 }
1390} 1301}
1391#endif /* CONFIG_EVENT_PROFILE */ 1302#endif /* CONFIG_PERF_EVENTS */
1392 1303
1393 1304
1394static __kprobes 1305static __kprobes
@@ -1398,10 +1309,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1398 1309
1399 if (tp->flags & TP_FLAG_TRACE) 1310 if (tp->flags & TP_FLAG_TRACE)
1400 kprobe_trace_func(kp, regs); 1311 kprobe_trace_func(kp, regs);
1401#ifdef CONFIG_EVENT_PROFILE 1312#ifdef CONFIG_PERF_EVENTS
1402 if (tp->flags & TP_FLAG_PROFILE) 1313 if (tp->flags & TP_FLAG_PROFILE)
1403 kprobe_profile_func(kp, regs); 1314 kprobe_profile_func(kp, regs);
1404#endif /* CONFIG_EVENT_PROFILE */ 1315#endif
1405 return 0; /* We don't tweek kernel, so just return 0 */ 1316 return 0; /* We don't tweek kernel, so just return 0 */
1406} 1317}
1407 1318
@@ -1412,10 +1323,10 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1412 1323
1413 if (tp->flags & TP_FLAG_TRACE) 1324 if (tp->flags & TP_FLAG_TRACE)
1414 kretprobe_trace_func(ri, regs); 1325 kretprobe_trace_func(ri, regs);
1415#ifdef CONFIG_EVENT_PROFILE 1326#ifdef CONFIG_PERF_EVENTS
1416 if (tp->flags & TP_FLAG_PROFILE) 1327 if (tp->flags & TP_FLAG_PROFILE)
1417 kretprobe_profile_func(ri, regs); 1328 kretprobe_profile_func(ri, regs);
1418#endif /* CONFIG_EVENT_PROFILE */ 1329#endif
1419 return 0; /* We don't tweek kernel, so just return 0 */ 1330 return 0; /* We don't tweek kernel, so just return 0 */
1420} 1331}
1421 1332
@@ -1428,23 +1339,25 @@ static int register_probe_event(struct trace_probe *tp)
1428 if (probe_is_return(tp)) { 1339 if (probe_is_return(tp)) {
1429 tp->event.trace = print_kretprobe_event; 1340 tp->event.trace = print_kretprobe_event;
1430 call->raw_init = probe_event_raw_init; 1341 call->raw_init = probe_event_raw_init;
1431 call->show_format = kretprobe_event_show_format;
1432 call->define_fields = kretprobe_event_define_fields; 1342 call->define_fields = kretprobe_event_define_fields;
1433 } else { 1343 } else {
1434 tp->event.trace = print_kprobe_event; 1344 tp->event.trace = print_kprobe_event;
1435 call->raw_init = probe_event_raw_init; 1345 call->raw_init = probe_event_raw_init;
1436 call->show_format = kprobe_event_show_format;
1437 call->define_fields = kprobe_event_define_fields; 1346 call->define_fields = kprobe_event_define_fields;
1438 } 1347 }
1348 if (set_print_fmt(tp) < 0)
1349 return -ENOMEM;
1439 call->event = &tp->event; 1350 call->event = &tp->event;
1440 call->id = register_ftrace_event(&tp->event); 1351 call->id = register_ftrace_event(&tp->event);
1441 if (!call->id) 1352 if (!call->id) {
1353 kfree(call->print_fmt);
1442 return -ENODEV; 1354 return -ENODEV;
1355 }
1443 call->enabled = 0; 1356 call->enabled = 0;
1444 call->regfunc = probe_event_enable; 1357 call->regfunc = probe_event_enable;
1445 call->unregfunc = probe_event_disable; 1358 call->unregfunc = probe_event_disable;
1446 1359
1447#ifdef CONFIG_EVENT_PROFILE 1360#ifdef CONFIG_PERF_EVENTS
1448 call->profile_enable = probe_profile_enable; 1361 call->profile_enable = probe_profile_enable;
1449 call->profile_disable = probe_profile_disable; 1362 call->profile_disable = probe_profile_disable;
1450#endif 1363#endif
@@ -1452,6 +1365,7 @@ static int register_probe_event(struct trace_probe *tp)
1452 ret = trace_add_event_call(call); 1365 ret = trace_add_event_call(call);
1453 if (ret) { 1366 if (ret) {
1454 pr_info("Failed to register kprobe event: %s\n", call->name); 1367 pr_info("Failed to register kprobe event: %s\n", call->name);
1368 kfree(call->print_fmt);
1455 unregister_ftrace_event(&tp->event); 1369 unregister_ftrace_event(&tp->event);
1456 } 1370 }
1457 return ret; 1371 return ret;
@@ -1461,6 +1375,7 @@ static void unregister_probe_event(struct trace_probe *tp)
1461{ 1375{
1462 /* tp->event is unregistered in trace_remove_event_call() */ 1376 /* tp->event is unregistered in trace_remove_event_call() */
1463 trace_remove_event_call(&tp->call); 1377 trace_remove_event_call(&tp->call);
1378 kfree(tp->call.print_fmt);
1464} 1379}
1465 1380
1466/* Make a debugfs interface for controling probe points */ 1381/* Make a debugfs interface for controling probe points */
@@ -1503,28 +1418,67 @@ static int kprobe_trace_selftest_target(int a1, int a2, int a3,
1503 1418
1504static __init int kprobe_trace_self_tests_init(void) 1419static __init int kprobe_trace_self_tests_init(void)
1505{ 1420{
1506 int ret; 1421 int ret, warn = 0;
1507 int (*target)(int, int, int, int, int, int); 1422 int (*target)(int, int, int, int, int, int);
1423 struct trace_probe *tp;
1508 1424
1509 target = kprobe_trace_selftest_target; 1425 target = kprobe_trace_selftest_target;
1510 1426
1511 pr_info("Testing kprobe tracing: "); 1427 pr_info("Testing kprobe tracing: ");
1512 1428
1513 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " 1429 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1514 "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); 1430 "$stack $stack0 +0($stack)");
1515 if (WARN_ON_ONCE(ret)) 1431 if (WARN_ON_ONCE(ret)) {
1516 pr_warning("error enabling function entry\n"); 1432 pr_warning("error on probing function entry.\n");
1433 warn++;
1434 } else {
1435 /* Enable trace point */
1436 tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM);
1437 if (WARN_ON_ONCE(tp == NULL)) {
1438 pr_warning("error on getting new probe.\n");
1439 warn++;
1440 } else
1441 probe_event_enable(&tp->call);
1442 }
1517 1443
1518 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 1444 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1519 "$retval"); 1445 "$retval");
1520 if (WARN_ON_ONCE(ret)) 1446 if (WARN_ON_ONCE(ret)) {
1521 pr_warning("error enabling function return\n"); 1447 pr_warning("error on probing function return.\n");
1448 warn++;
1449 } else {
1450 /* Enable trace point */
1451 tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM);
1452 if (WARN_ON_ONCE(tp == NULL)) {
1453 pr_warning("error on getting new probe.\n");
1454 warn++;
1455 } else
1456 probe_event_enable(&tp->call);
1457 }
1458
1459 if (warn)
1460 goto end;
1522 1461
1523 ret = target(1, 2, 3, 4, 5, 6); 1462 ret = target(1, 2, 3, 4, 5, 6);
1524 1463
1525 cleanup_all_probes(); 1464 ret = command_trace_probe("-:testprobe");
1465 if (WARN_ON_ONCE(ret)) {
1466 pr_warning("error on deleting a probe.\n");
1467 warn++;
1468 }
1469
1470 ret = command_trace_probe("-:testprobe2");
1471 if (WARN_ON_ONCE(ret)) {
1472 pr_warning("error on deleting a probe.\n");
1473 warn++;
1474 }
1526 1475
1527 pr_cont("OK\n"); 1476end:
1477 cleanup_all_probes();
1478 if (warn)
1479 pr_cont("NG: Some tests are failed. Please check them.\n");
1480 else
1481 pr_cont("OK\n");
1528 return 0; 1482 return 0;
1529} 1483}
1530 1484
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index faf37fa4408c..94103cdcf9d8 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -26,12 +26,13 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27 27
28#include "trace_output.h" 28#include "trace_output.h"
29#include "trace_stat.h"
30#include "trace.h" 29#include "trace.h"
31 30
32#include <linux/hw_breakpoint.h> 31#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h> 32#include <asm/hw_breakpoint.h>
34 33
34#include <asm/atomic.h>
35
35/* 36/*
36 * For now, let us restrict the no. of symbols traced simultaneously to number 37 * For now, let us restrict the no. of symbols traced simultaneously to number
37 * of available hardware breakpoint registers. 38 * of available hardware breakpoint registers.
@@ -44,7 +45,7 @@ struct trace_ksym {
44 struct perf_event **ksym_hbp; 45 struct perf_event **ksym_hbp;
45 struct perf_event_attr attr; 46 struct perf_event_attr attr;
46#ifdef CONFIG_PROFILE_KSYM_TRACER 47#ifdef CONFIG_PROFILE_KSYM_TRACER
47 unsigned long counter; 48 atomic64_t counter;
48#endif 49#endif
49 struct hlist_node ksym_hlist; 50 struct hlist_node ksym_hlist;
50}; 51};
@@ -69,9 +70,8 @@ void ksym_collect_stats(unsigned long hbp_hit_addr)
69 70
70 rcu_read_lock(); 71 rcu_read_lock();
71 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { 72 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
72 if ((entry->attr.bp_addr == hbp_hit_addr) && 73 if (entry->attr.bp_addr == hbp_hit_addr) {
73 (entry->counter <= MAX_UL_INT)) { 74 atomic64_inc(&entry->counter);
74 entry->counter++;
75 break; 75 break;
76 } 76 }
77 } 77 }
@@ -197,7 +197,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
197 entry->attr.bp_addr = addr; 197 entry->attr.bp_addr = addr;
198 entry->attr.bp_len = HW_BREAKPOINT_LEN_4; 198 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
199 199
200 ret = -EAGAIN;
201 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, 200 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
202 ksym_hbp_handler); 201 ksym_hbp_handler);
203 202
@@ -300,8 +299,8 @@ static ssize_t ksym_trace_filter_write(struct file *file,
300 * 2: echo 0 > ksym_trace_filter 299 * 2: echo 0 > ksym_trace_filter
301 * 3: echo "*:---" > ksym_trace_filter 300 * 3: echo "*:---" > ksym_trace_filter
302 */ 301 */
303 if (!buf[0] || !strcmp(buf, "0") || 302 if (!input_string[0] || !strcmp(input_string, "0") ||
304 !strcmp(buf, "*:---")) { 303 !strcmp(input_string, "*:---")) {
305 __ksym_trace_reset(); 304 __ksym_trace_reset();
306 ret = 0; 305 ret = 0;
307 goto out; 306 goto out;
@@ -444,102 +443,77 @@ struct tracer ksym_tracer __read_mostly =
444 .print_line = ksym_trace_output 443 .print_line = ksym_trace_output
445}; 444};
446 445
447__init static int init_ksym_trace(void)
448{
449 struct dentry *d_tracer;
450 struct dentry *entry;
451
452 d_tracer = tracing_init_dentry();
453 ksym_filter_entry_count = 0;
454
455 entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
456 NULL, &ksym_tracing_fops);
457 if (!entry)
458 pr_warning("Could not create debugfs "
459 "'ksym_trace_filter' file\n");
460
461 return register_tracer(&ksym_tracer);
462}
463device_initcall(init_ksym_trace);
464
465
466#ifdef CONFIG_PROFILE_KSYM_TRACER 446#ifdef CONFIG_PROFILE_KSYM_TRACER
467static int ksym_tracer_stat_headers(struct seq_file *m) 447static int ksym_profile_show(struct seq_file *m, void *v)
468{ 448{
449 struct hlist_node *node;
450 struct trace_ksym *entry;
451 int access_type = 0;
452 char fn_name[KSYM_NAME_LEN];
453
469 seq_puts(m, " Access Type "); 454 seq_puts(m, " Access Type ");
470 seq_puts(m, " Symbol Counter\n"); 455 seq_puts(m, " Symbol Counter\n");
471 seq_puts(m, " ----------- "); 456 seq_puts(m, " ----------- ");
472 seq_puts(m, " ------ -------\n"); 457 seq_puts(m, " ------ -------\n");
473 return 0;
474}
475 458
476static int ksym_tracer_stat_show(struct seq_file *m, void *v) 459 rcu_read_lock();
477{ 460 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
478 struct hlist_node *stat = v;
479 struct trace_ksym *entry;
480 int access_type = 0;
481 char fn_name[KSYM_NAME_LEN];
482 461
483 entry = hlist_entry(stat, struct trace_ksym, ksym_hlist); 462 access_type = entry->attr.bp_type;
484 463
485 access_type = entry->attr.bp_type; 464 switch (access_type) {
465 case HW_BREAKPOINT_R:
466 seq_puts(m, " R ");
467 break;
468 case HW_BREAKPOINT_W:
469 seq_puts(m, " W ");
470 break;
471 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
472 seq_puts(m, " RW ");
473 break;
474 default:
475 seq_puts(m, " NA ");
476 }
486 477
487 switch (access_type) { 478 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
488 case HW_BREAKPOINT_R: 479 seq_printf(m, " %-36s", fn_name);
489 seq_puts(m, " R "); 480 else
490 break; 481 seq_printf(m, " %-36s", "<NA>");
491 case HW_BREAKPOINT_W: 482 seq_printf(m, " %15llu\n",
492 seq_puts(m, " W "); 483 (unsigned long long)atomic64_read(&entry->counter));
493 break;
494 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
495 seq_puts(m, " RW ");
496 break;
497 default:
498 seq_puts(m, " NA ");
499 } 484 }
500 485 rcu_read_unlock();
501 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
502 seq_printf(m, " %-36s", fn_name);
503 else
504 seq_printf(m, " %-36s", "<NA>");
505 seq_printf(m, " %15lu\n", entry->counter);
506 486
507 return 0; 487 return 0;
508} 488}
509 489
510static void *ksym_tracer_stat_start(struct tracer_stat *trace) 490static int ksym_profile_open(struct inode *node, struct file *file)
511{ 491{
512 return ksym_filter_head.first; 492 return single_open(file, ksym_profile_show, NULL);
513}
514
515static void *
516ksym_tracer_stat_next(void *v, int idx)
517{
518 struct hlist_node *stat = v;
519
520 return stat->next;
521} 493}
522 494
523static struct tracer_stat ksym_tracer_stats = { 495static const struct file_operations ksym_profile_fops = {
524 .name = "ksym_tracer", 496 .open = ksym_profile_open,
525 .stat_start = ksym_tracer_stat_start, 497 .read = seq_read,
526 .stat_next = ksym_tracer_stat_next, 498 .llseek = seq_lseek,
527 .stat_headers = ksym_tracer_stat_headers, 499 .release = single_release,
528 .stat_show = ksym_tracer_stat_show
529}; 500};
501#endif /* CONFIG_PROFILE_KSYM_TRACER */
530 502
531__init static int ksym_tracer_stat_init(void) 503__init static int init_ksym_trace(void)
532{ 504{
533 int ret; 505 struct dentry *d_tracer;
534 506
535 ret = register_stat_tracer(&ksym_tracer_stats); 507 d_tracer = tracing_init_dentry();
536 if (ret) {
537 printk(KERN_WARNING "Warning: could not register "
538 "ksym tracer stats\n");
539 return 1;
540 }
541 508
542 return 0; 509 trace_create_file("ksym_trace_filter", 0644, d_tracer,
510 NULL, &ksym_tracing_fops);
511
512#ifdef CONFIG_PROFILE_KSYM_TRACER
513 trace_create_file("ksym_profile", 0444, d_tracer,
514 NULL, &ksym_profile_fops);
515#endif
516
517 return register_tracer(&ksym_tracer);
543} 518}
544fs_initcall(ksym_tracer_stat_init); 519device_initcall(init_ksym_trace);
545#endif /* CONFIG_PROFILE_KSYM_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 678a5120ee30..f4bc9b27de5f 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
157 unsigned long val, flags; 157 unsigned long val, flags;
158 char buf[64]; 158 char buf[64];
159 int ret; 159 int ret;
160 int cpu;
160 161
161 if (count >= sizeof(buf)) 162 if (count >= sizeof(buf))
162 return -EINVAL; 163 return -EINVAL;
@@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
171 return ret; 172 return ret;
172 173
173 local_irq_save(flags); 174 local_irq_save(flags);
175
176 /*
177 * In case we trace inside arch_spin_lock() or after (NMI),
178 * we will cause circular lock, so we also need to increase
179 * the percpu trace_active here.
180 */
181 cpu = smp_processor_id();
182 per_cpu(trace_active, cpu)++;
183
174 arch_spin_lock(&max_stack_lock); 184 arch_spin_lock(&max_stack_lock);
175 *ptr = val; 185 *ptr = val;
176 arch_spin_unlock(&max_stack_lock); 186 arch_spin_unlock(&max_stack_lock);
187
188 per_cpu(trace_active, cpu)--;
177 local_irq_restore(flags); 189 local_irq_restore(flags);
178 190
179 return count; 191 return count;
@@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
206 218
207static void *t_start(struct seq_file *m, loff_t *pos) 219static void *t_start(struct seq_file *m, loff_t *pos)
208{ 220{
221 int cpu;
222
209 local_irq_disable(); 223 local_irq_disable();
224
225 cpu = smp_processor_id();
226 per_cpu(trace_active, cpu)++;
227
210 arch_spin_lock(&max_stack_lock); 228 arch_spin_lock(&max_stack_lock);
211 229
212 if (*pos == 0) 230 if (*pos == 0)
@@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
217 235
218static void t_stop(struct seq_file *m, void *p) 236static void t_stop(struct seq_file *m, void *p)
219{ 237{
238 int cpu;
239
220 arch_spin_unlock(&max_stack_lock); 240 arch_spin_unlock(&max_stack_lock);
241
242 cpu = smp_processor_id();
243 per_cpu(trace_active, cpu)--;
244
221 local_irq_enable(); 245 local_irq_enable();
222} 246}
223 247
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 75289f372dd2..cba47d7935cc 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -143,70 +143,65 @@ extern char *__bad_type_size(void);
143 #type, #name, offsetof(typeof(trace), name), \ 143 #type, #name, offsetof(typeof(trace), name), \
144 sizeof(trace.name), is_signed_type(type) 144 sizeof(trace.name), is_signed_type(type)
145 145
146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 146static
147int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
147{ 148{
148 int i; 149 int i;
149 int ret; 150 int pos = 0;
150 struct syscall_metadata *entry = call->data;
151 struct syscall_trace_enter trace;
152 int offset = offsetof(struct syscall_trace_enter, args);
153 151
154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 152 /* When len=0, we just calculate the needed length */
155 "\tsigned:%u;\n", 153#define LEN_OR_ZERO (len ? len - pos : 0)
156 SYSCALL_FIELD(int, nr));
157 if (!ret)
158 return 0;
159 154
155 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
160 for (i = 0; i < entry->nb_args; i++) { 156 for (i = 0; i < entry->nb_args; i++) {
161 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], 157 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
162 entry->args[i]); 158 entry->args[i], sizeof(unsigned long),
163 if (!ret) 159 i == entry->nb_args - 1 ? "" : ", ");
164 return 0;
165 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
166 "\tsigned:%u;\n", offset,
167 sizeof(unsigned long),
168 is_signed_type(unsigned long));
169 if (!ret)
170 return 0;
171 offset += sizeof(unsigned long);
172 } 160 }
161 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
173 162
174 trace_seq_puts(s, "\nprint fmt: \"");
175 for (i = 0; i < entry->nb_args; i++) { 163 for (i = 0; i < entry->nb_args; i++) {
176 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], 164 pos += snprintf(buf + pos, LEN_OR_ZERO,
177 sizeof(unsigned long), 165 ", ((unsigned long)(REC->%s))", entry->args[i]);
178 i == entry->nb_args - 1 ? "" : ", ");
179 if (!ret)
180 return 0;
181 } 166 }
182 trace_seq_putc(s, '"');
183 167
184 for (i = 0; i < entry->nb_args; i++) { 168#undef LEN_OR_ZERO
185 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
186 entry->args[i]);
187 if (!ret)
188 return 0;
189 }
190 169
191 return trace_seq_putc(s, '\n'); 170 /* return the length of print_fmt */
171 return pos;
192} 172}
193 173
194int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) 174static int set_syscall_print_fmt(struct ftrace_event_call *call)
195{ 175{
196 int ret; 176 char *print_fmt;
197 struct syscall_trace_exit trace; 177 int len;
178 struct syscall_metadata *entry = call->data;
198 179
199 ret = trace_seq_printf(s, 180 if (entry->enter_event != call) {
200 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 181 call->print_fmt = "\"0x%lx\", REC->ret";
201 "\tsigned:%u;\n"
202 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
203 "\tsigned:%u;\n",
204 SYSCALL_FIELD(int, nr),
205 SYSCALL_FIELD(long, ret));
206 if (!ret)
207 return 0; 182 return 0;
183 }
184
185 /* First: called with 0 length to calculate the needed length */
186 len = __set_enter_print_fmt(entry, NULL, 0);
187
188 print_fmt = kmalloc(len + 1, GFP_KERNEL);
189 if (!print_fmt)
190 return -ENOMEM;
191
192 /* Second: actually write the @print_fmt */
193 __set_enter_print_fmt(entry, print_fmt, len + 1);
194 call->print_fmt = print_fmt;
208 195
209 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); 196 return 0;
197}
198
199static void free_syscall_print_fmt(struct ftrace_event_call *call)
200{
201 struct syscall_metadata *entry = call->data;
202
203 if (entry->enter_event == call)
204 kfree(call->print_fmt);
210} 205}
211 206
212int syscall_enter_define_fields(struct ftrace_event_call *call) 207int syscall_enter_define_fields(struct ftrace_event_call *call)
@@ -386,12 +381,22 @@ int init_syscall_trace(struct ftrace_event_call *call)
386{ 381{
387 int id; 382 int id;
388 383
389 id = register_ftrace_event(call->event); 384 if (set_syscall_print_fmt(call) < 0)
390 if (!id) 385 return -ENOMEM;
391 return -ENODEV; 386
392 call->id = id; 387 id = trace_event_raw_init(call);
393 INIT_LIST_HEAD(&call->fields); 388
394 return 0; 389 if (id < 0) {
390 free_syscall_print_fmt(call);
391 return id;
392 }
393
394 return id;
395}
396
397unsigned long __init arch_syscall_addr(int nr)
398{
399 return (unsigned long)sys_call_table[nr];
395} 400}
396 401
397int __init init_ftrace_syscalls(void) 402int __init init_ftrace_syscalls(void)
@@ -421,7 +426,7 @@ int __init init_ftrace_syscalls(void)
421} 426}
422core_initcall(init_ftrace_syscalls); 427core_initcall(init_ftrace_syscalls);
423 428
424#ifdef CONFIG_EVENT_PROFILE 429#ifdef CONFIG_PERF_EVENTS
425 430
426static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); 431static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
427static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); 432static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
@@ -433,12 +438,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
433 struct syscall_metadata *sys_data; 438 struct syscall_metadata *sys_data;
434 struct syscall_trace_enter *rec; 439 struct syscall_trace_enter *rec;
435 unsigned long flags; 440 unsigned long flags;
436 char *trace_buf;
437 char *raw_data;
438 int syscall_nr; 441 int syscall_nr;
439 int rctx; 442 int rctx;
440 int size; 443 int size;
441 int cpu;
442 444
443 syscall_nr = syscall_get_nr(current, regs); 445 syscall_nr = syscall_get_nr(current, regs);
444 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 446 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -457,37 +459,15 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
457 "profile buffer not large enough")) 459 "profile buffer not large enough"))
458 return; 460 return;
459 461
460 /* Protect the per cpu buffer, begin the rcu read side */ 462 rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size,
461 local_irq_save(flags); 463 sys_data->enter_event->id, &rctx, &flags);
462 464 if (!rec)
463 rctx = perf_swevent_get_recursion_context(); 465 return;
464 if (rctx < 0)
465 goto end_recursion;
466
467 cpu = smp_processor_id();
468
469 trace_buf = rcu_dereference(perf_trace_buf);
470
471 if (!trace_buf)
472 goto end;
473
474 raw_data = per_cpu_ptr(trace_buf, cpu);
475
476 /* zero the dead bytes from align to not leak stack to user */
477 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
478 466
479 rec = (struct syscall_trace_enter *) raw_data;
480 tracing_generic_entry_update(&rec->ent, 0, 0);
481 rec->ent.type = sys_data->enter_event->id;
482 rec->nr = syscall_nr; 467 rec->nr = syscall_nr;
483 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 468 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
484 (unsigned long *)&rec->args); 469 (unsigned long *)&rec->args);
485 perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size); 470 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
486
487end:
488 perf_swevent_put_recursion_context(rctx);
489end_recursion:
490 local_irq_restore(flags);
491} 471}
492 472
493int prof_sysenter_enable(struct ftrace_event_call *call) 473int prof_sysenter_enable(struct ftrace_event_call *call)
@@ -531,11 +511,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
531 struct syscall_trace_exit *rec; 511 struct syscall_trace_exit *rec;
532 unsigned long flags; 512 unsigned long flags;
533 int syscall_nr; 513 int syscall_nr;
534 char *trace_buf;
535 char *raw_data;
536 int rctx; 514 int rctx;
537 int size; 515 int size;
538 int cpu;
539 516
540 syscall_nr = syscall_get_nr(current, regs); 517 syscall_nr = syscall_get_nr(current, regs);
541 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 518 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -557,38 +534,15 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
557 "exit event has grown above profile buffer size")) 534 "exit event has grown above profile buffer size"))
558 return; 535 return;
559 536
560 /* Protect the per cpu buffer, begin the rcu read side */ 537 rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size,
561 local_irq_save(flags); 538 sys_data->exit_event->id, &rctx, &flags);
562 539 if (!rec)
563 rctx = perf_swevent_get_recursion_context(); 540 return;
564 if (rctx < 0)
565 goto end_recursion;
566
567 cpu = smp_processor_id();
568
569 trace_buf = rcu_dereference(perf_trace_buf);
570
571 if (!trace_buf)
572 goto end;
573
574 raw_data = per_cpu_ptr(trace_buf, cpu);
575
576 /* zero the dead bytes from align to not leak stack to user */
577 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
578
579 rec = (struct syscall_trace_exit *)raw_data;
580 541
581 tracing_generic_entry_update(&rec->ent, 0, 0);
582 rec->ent.type = sys_data->exit_event->id;
583 rec->nr = syscall_nr; 542 rec->nr = syscall_nr;
584 rec->ret = syscall_get_return_value(current, regs); 543 rec->ret = syscall_get_return_value(current, regs);
585 544
586 perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); 545 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
587
588end:
589 perf_swevent_put_recursion_context(rctx);
590end_recursion:
591 local_irq_restore(flags);
592} 546}
593 547
594int prof_sysexit_enable(struct ftrace_event_call *call) 548int prof_sysexit_enable(struct ftrace_event_call *call)
@@ -603,7 +557,7 @@ int prof_sysexit_enable(struct ftrace_event_call *call)
603 ret = register_trace_sys_exit(prof_syscall_exit); 557 ret = register_trace_sys_exit(prof_syscall_exit);
604 if (ret) { 558 if (ret) {
605 pr_info("event trace: Could not activate" 559 pr_info("event trace: Could not activate"
606 "syscall entry trace point"); 560 "syscall exit trace point");
607 } else { 561 } else {
608 set_bit(num, enabled_prof_exit_syscalls); 562 set_bit(num, enabled_prof_exit_syscalls);
609 sys_prof_refcount_exit++; 563 sys_prof_refcount_exit++;
@@ -626,6 +580,5 @@ void prof_sysexit_disable(struct ftrace_event_call *call)
626 mutex_unlock(&syscall_trace_lock); 580 mutex_unlock(&syscall_trace_lock);
627} 581}
628 582
629#endif 583#endif /* CONFIG_PERF_EVENTS */
630
631 584
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index f6693969287d..a7974a552ca9 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -93,6 +93,7 @@ static const struct stacktrace_ops backtrace_ops = {
93 .warning_symbol = backtrace_warning_symbol, 93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack, 94 .stack = backtrace_stack,
95 .address = backtrace_address, 95 .address = backtrace_address,
96 .walk_stack = print_context_stack,
96}; 97};
97 98
98static int 99static int
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 00d59d048edf..0a67e041edf8 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -21,6 +21,7 @@
21#include <linux/tsacct_kern.h> 21#include <linux/tsacct_kern.h>
22#include <linux/acct.h> 22#include <linux/acct.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/mm.h>
24 25
25/* 26/*
26 * fill in basic accounting fields 27 * fill in basic accounting fields
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165ca70c..766467b3bcb7 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -56,9 +56,6 @@ struct user_struct root_user = {
56 .sigpending = ATOMIC_INIT(0), 56 .sigpending = ATOMIC_INIT(0),
57 .locked_shm = 0, 57 .locked_shm = 0,
58 .user_ns = &init_user_ns, 58 .user_ns = &init_user_ns,
59#ifdef CONFIG_USER_SCHED
60 .tg = &init_task_group,
61#endif
62}; 59};
63 60
64/* 61/*
@@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 72 put_user_ns(up->user_ns);
76} 73}
77 74
78#ifdef CONFIG_USER_SCHED
79
80static void sched_destroy_user(struct user_struct *up)
81{
82 sched_destroy_group(up->tg);
83}
84
85static int sched_create_user(struct user_struct *up)
86{
87 int rc = 0;
88
89 up->tg = sched_create_group(&root_task_group);
90 if (IS_ERR(up->tg))
91 rc = -ENOMEM;
92
93 set_tg_uid(up);
94
95 return rc;
96}
97
98#else /* CONFIG_USER_SCHED */
99
100static void sched_destroy_user(struct user_struct *up) { }
101static int sched_create_user(struct user_struct *up) { return 0; }
102
103#endif /* CONFIG_USER_SCHED */
104
105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
125static DEFINE_MUTEX(uids_mutex);
126
127static inline void uids_mutex_lock(void)
128{
129 mutex_lock(&uids_mutex);
130}
131
132static inline void uids_mutex_unlock(void)
133{
134 mutex_unlock(&uids_mutex);
135}
136
137/* uid directory attributes */
138#ifdef CONFIG_FAIR_GROUP_SCHED
139static ssize_t cpu_shares_show(struct kobject *kobj,
140 struct kobj_attribute *attr,
141 char *buf)
142{
143 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144
145 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
146}
147
148static ssize_t cpu_shares_store(struct kobject *kobj,
149 struct kobj_attribute *attr,
150 const char *buf, size_t size)
151{
152 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
153 unsigned long shares;
154 int rc;
155
156 sscanf(buf, "%lu", &shares);
157
158 rc = sched_group_set_shares(up->tg, shares);
159
160 return (rc ? rc : size);
161}
162
163static struct kobj_attribute cpu_share_attr =
164 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
165#endif
166
167#ifdef CONFIG_RT_GROUP_SCHED
168static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169 struct kobj_attribute *attr,
170 char *buf)
171{
172 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
173
174 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
175}
176
177static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
178 struct kobj_attribute *attr,
179 const char *buf, size_t size)
180{
181 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
182 unsigned long rt_runtime;
183 int rc;
184
185 sscanf(buf, "%ld", &rt_runtime);
186
187 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
188
189 return (rc ? rc : size);
190}
191
192static struct kobj_attribute cpu_rt_runtime_attr =
193 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
194
195static ssize_t cpu_rt_period_show(struct kobject *kobj,
196 struct kobj_attribute *attr,
197 char *buf)
198{
199 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
200
201 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
202}
203
204static ssize_t cpu_rt_period_store(struct kobject *kobj,
205 struct kobj_attribute *attr,
206 const char *buf, size_t size)
207{
208 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
209 unsigned long rt_period;
210 int rc;
211
212 sscanf(buf, "%lu", &rt_period);
213
214 rc = sched_group_set_rt_period(up->tg, rt_period);
215
216 return (rc ? rc : size);
217}
218
219static struct kobj_attribute cpu_rt_period_attr =
220 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
221#endif
222
223/* default attributes per uid directory */
224static struct attribute *uids_attributes[] = {
225#ifdef CONFIG_FAIR_GROUP_SCHED
226 &cpu_share_attr.attr,
227#endif
228#ifdef CONFIG_RT_GROUP_SCHED
229 &cpu_rt_runtime_attr.attr,
230 &cpu_rt_period_attr.attr,
231#endif
232 NULL
233};
234
235/* the lifetime of user_struct is not managed by the core (now) */
236static void uids_release(struct kobject *kobj)
237{
238 return;
239}
240
241static struct kobj_type uids_ktype = {
242 .sysfs_ops = &kobj_sysfs_ops,
243 .default_attrs = uids_attributes,
244 .release = uids_release,
245};
246
247/*
248 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
249 * We do not create this file for users in a user namespace (until
250 * sysfs tagging is implemented).
251 *
252 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
253 */
254static int uids_user_create(struct user_struct *up)
255{
256 struct kobject *kobj = &up->kobj;
257 int error;
258
259 memset(kobj, 0, sizeof(struct kobject));
260 if (up->user_ns != &init_user_ns)
261 return 0;
262 kobj->kset = uids_kset;
263 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
264 if (error) {
265 kobject_put(kobj);
266 goto done;
267 }
268
269 kobject_uevent(kobj, KOBJ_ADD);
270done:
271 return error;
272}
273
274/* create these entries in sysfs:
275 * "/sys/kernel/uids" directory
276 * "/sys/kernel/uids/0" directory (for root user)
277 * "/sys/kernel/uids/0/cpu_share" file (for root user)
278 */
279int __init uids_sysfs_init(void)
280{
281 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
282 if (!uids_kset)
283 return -ENOMEM;
284
285 return uids_user_create(&root_user);
286}
287
288/* delayed work function to remove sysfs directory for a user and free up
289 * corresponding structures.
290 */
291static void cleanup_user_struct(struct work_struct *w)
292{
293 struct user_struct *up = container_of(w, struct user_struct, work.work);
294 unsigned long flags;
295 int remove_user = 0;
296
297 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
298 * atomic.
299 */
300 uids_mutex_lock();
301
302 spin_lock_irqsave(&uidhash_lock, flags);
303 if (atomic_read(&up->__count) == 0) {
304 uid_hash_remove(up);
305 remove_user = 1;
306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
308
309 if (!remove_user)
310 goto done;
311
312 if (up->user_ns == &init_user_ns) {
313 kobject_uevent(&up->kobj, KOBJ_REMOVE);
314 kobject_del(&up->kobj);
315 kobject_put(&up->kobj);
316 }
317
318 sched_destroy_user(up);
319 key_put(up->uid_keyring);
320 key_put(up->session_keyring);
321 kmem_cache_free(uid_cachep, up);
322
323done:
324 uids_mutex_unlock();
325}
326
327/* IRQs are disabled and uidhash_lock is held upon function entry.
328 * IRQ state (as stored in flags) is restored and uidhash_lock released
329 * upon function exit.
330 */
331static void free_user(struct user_struct *up, unsigned long flags)
332{
333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336}
337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 75static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{ 76{
342 struct user_struct *user; 77 struct user_struct *user;
@@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
352 return NULL; 87 return NULL;
353} 88}
354 89
355int uids_sysfs_init(void) { return 0; }
356static inline int uids_user_create(struct user_struct *up) { return 0; }
357static inline void uids_mutex_lock(void) { }
358static inline void uids_mutex_unlock(void) { }
359
360/* IRQs are disabled and uidhash_lock is held upon function entry. 90/* IRQs are disabled and uidhash_lock is held upon function entry.
361 * IRQ state (as stored in flags) is restored and uidhash_lock released 91 * IRQ state (as stored in flags) is restored and uidhash_lock released
362 * upon function exit. 92 * upon function exit.
@@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags)
365{ 95{
366 uid_hash_remove(up); 96 uid_hash_remove(up);
367 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
368 sched_destroy_user(up);
369 key_put(up->uid_keyring); 98 key_put(up->uid_keyring);
370 key_put(up->session_keyring); 99 key_put(up->session_keyring);
371 kmem_cache_free(uid_cachep, up); 100 kmem_cache_free(uid_cachep, up);
372} 101}
373 102
374#endif
375
376#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
377/*
378 * We need to check if a setuid can take place. This function should be called
379 * before successfully completing the setuid.
380 */
381int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
382{
383
384 return sched_rt_can_attach(up->tg, tsk);
385
386}
387#else
388int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
389{
390 return 1;
391}
392#endif
393
394/* 103/*
395 * Locate the user_struct for the passed UID. If found, take a ref on it. The 104 * Locate the user_struct for the passed UID. If found, take a ref on it. The
396 * caller must undo that ref with free_uid(). 105 * caller must undo that ref with free_uid().
@@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
431 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() 140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
432 * atomic. 141 * atomic.
433 */ 142 */
434 uids_mutex_lock();
435
436 spin_lock_irq(&uidhash_lock); 143 spin_lock_irq(&uidhash_lock);
437 up = uid_hash_find(uid, hashent); 144 up = uid_hash_find(uid, hashent);
438 spin_unlock_irq(&uidhash_lock); 145 spin_unlock_irq(&uidhash_lock);
@@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
445 new->uid = uid; 152 new->uid = uid;
446 atomic_set(&new->__count, 1); 153 atomic_set(&new->__count, 1);
447 154
448 if (sched_create_user(new) < 0)
449 goto out_free_user;
450
451 new->user_ns = get_user_ns(ns); 155 new->user_ns = get_user_ns(ns);
452 156
453 if (uids_user_create(new))
454 goto out_destoy_sched;
455
456 /* 157 /*
457 * Before adding this, check whether we raced 158 * Before adding this, check whether we raced
458 * on adding the same user already.. 159 * on adding the same user already..
@@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
475 spin_unlock_irq(&uidhash_lock); 176 spin_unlock_irq(&uidhash_lock);
476 } 177 }
477 178
478 uids_mutex_unlock();
479
480 return up; 179 return up;
481 180
482out_destoy_sched:
483 sched_destroy_user(new);
484 put_user_ns(new->user_ns); 181 put_user_ns(new->user_ns);
485out_free_user:
486 kmem_cache_free(uid_cachep, new); 182 kmem_cache_free(uid_cachep, new);
487out_unlock: 183out_unlock:
488 uids_mutex_unlock();
489 return NULL; 184 return NULL;
490} 185}
491 186