about summary refs log tree commit diff stats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile7
-rw-r--r--kernel/acct.c10
-rw-r--r--kernel/audit.c2
-rw-r--r--kernel/audit_tree.c100
-rw-r--r--kernel/auditsc.c7
-rw-r--r--kernel/capability.c4
-rw-r--r--kernel/cgroup.c716
-rw-r--r--kernel/cpu.c12
-rw-r--r--kernel/cred.c2
-rw-r--r--kernel/early_res.c578
-rw-r--r--kernel/elfcore.c28
-rw-r--r--kernel/exit.c19
-rw-r--r--kernel/fork.c80
-rw-r--r--kernel/futex.c57
-rw-r--r--kernel/futex_compat.c6
-rw-r--r--kernel/hw_breakpoint.c79
-rw-r--r--kernel/irq/chip.c54
-rw-r--r--kernel/irq/devres.c4
-rw-r--r--kernel/irq/handle.c58
-rw-r--r--kernel/irq/internals.h6
-rw-r--r--kernel/irq/numa_migrate.c4
-rw-r--r--kernel/kexec.c6
-rw-r--r--kernel/kfifo.c111
-rw-r--r--kernel/kgdb.c9
-rw-r--r--kernel/kmod.c12
-rw-r--r--kernel/kprobes.c683
-rw-r--r--kernel/ksysfs.c10
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/lockdep.c21
-rw-r--r--kernel/module.c49
-rw-r--r--kernel/notifier.c6
-rw-r--r--kernel/nsproxy.c13
-rw-r--r--kernel/padata.c696
-rw-r--r--kernel/panic.c49
-rw-r--r--kernel/params.c12
-rw-r--r--kernel/perf_event.c697
-rw-r--r--kernel/pid.c6
-rw-r--r--kernel/pid_namespace.c7
-rw-r--r--kernel/posix-cpu-timers.c36
-rw-r--r--kernel/posix-timers.c2
-rw-r--r--kernel/power/Kconfig19
-rw-r--r--kernel/power/hibernate.c9
-rw-r--r--kernel/power/main.c31
-rw-r--r--kernel/power/snapshot.c4
-rw-r--r--kernel/power/suspend.c3
-rw-r--r--kernel/power/swap.c4
-rw-r--r--kernel/power/swsusp.c58
-rw-r--r--kernel/power/user.c23
-rw-r--r--kernel/printk.c56
-rw-r--r--kernel/ptrace.c88
-rw-r--r--kernel/range.c163
-rw-r--r--kernel/rcupdate.c29
-rw-r--r--kernel/rcutorture.c102
-rw-r--r--kernel/rcutree.c268
-rw-r--r--kernel/rcutree.h82
-rw-r--r--kernel/rcutree_plugin.h231
-rw-r--r--kernel/rcutree_trace.c14
-rw-r--r--kernel/relay.c5
-rw-r--r--kernel/resource.c66
-rw-r--r--kernel/sched.c2254
-rw-r--r--kernel/sched_cpupri.c6
-rw-r--r--kernel/sched_fair.c1701
-rw-r--r--kernel/sched_idletask.c23
-rw-r--r--kernel/sched_rt.c66
-rw-r--r--kernel/signal.c48
-rw-r--r--kernel/smp.c10
-rw-r--r--kernel/softirq.c15
-rw-r--r--kernel/softlockup.c15
-rw-r--r--kernel/srcu.c52
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/sys.c77
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c51
-rw-r--r--kernel/sysctl_binary.c38
-rw-r--r--kernel/taskstats.c6
-rw-r--r--kernel/time/clockevents.c3
-rw-r--r--kernel/time/clocksource.c36
-rw-r--r--kernel/time/ntp.c10
-rw-r--r--kernel/time/timekeeping.c3
-rw-r--r--kernel/timer.c3
-rw-r--r--kernel/trace/Kconfig125
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/blktrace.c5
-rw-r--r--kernel/trace/ftrace.c141
-rw-r--r--kernel/trace/ring_buffer.c45
-rw-r--r--kernel/trace/ring_buffer_benchmark.c1
-rw-r--r--kernel/trace/trace.c206
-rw-r--r--kernel/trace/trace.h11
-rw-r--r--kernel/trace/trace_branch.c19
-rw-r--r--kernel/trace/trace_clock.c1
-rw-r--r--kernel/trace/trace_event_profile.c52
-rw-r--r--kernel/trace/trace_events.c81
-rw-r--r--kernel/trace/trace_events_filter.c33
-rw-r--r--kernel/trace/trace_export.c94
-rw-r--r--kernel/trace/trace_functions_graph.c107
-rw-r--r--kernel/trace/trace_kprobe.c305
-rw-r--r--kernel/trace/trace_ksym.c140
-rw-r--r--kernel/trace/trace_stack.c24
-rw-r--r--kernel/trace/trace_syscalls.c189
-rw-r--r--kernel/tsacct.c1
-rw-r--r--kernel/user.c305
101 files changed, 7373 insertions, 4451 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 864ff75d65f2..a987aa1676b5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o 13 async.o range.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
14obj-y += groups.o 15obj-y += groups.o
15 16
16ifdef CONFIG_FUNCTION_TRACER 17ifdef CONFIG_FUNCTION_TRACER
@@ -90,6 +91,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
90obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 91obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
91obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 92obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
92obj-$(CONFIG_LATENCYTOP) += latencytop.o 93obj-$(CONFIG_LATENCYTOP) += latencytop.o
94obj-$(CONFIG_BINFMT_ELF) += elfcore.o
95obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
96obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
93obj-$(CONFIG_FUNCTION_TRACER) += trace/ 97obj-$(CONFIG_FUNCTION_TRACER) += trace/
94obj-$(CONFIG_TRACING) += trace/ 98obj-$(CONFIG_TRACING) += trace/
95obj-$(CONFIG_X86_DS) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
@@ -100,6 +104,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
100obj-$(CONFIG_PERF_EVENTS) += perf_event.o 104obj-$(CONFIG_PERF_EVENTS) += perf_event.o
101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 105obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
102obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 106obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
107obj-$(CONFIG_PADATA) += padata.o
103 108
104ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 109ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
105# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 110# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index a6605ca921b6..24f8c81fc48d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -588,16 +588,6 @@ out:
588} 588}
589 589
590/** 590/**
591 * acct_init_pacct - initialize a new pacct_struct
592 * @pacct: per-process accounting info struct to initialize
593 */
594void acct_init_pacct(struct pacct_struct *pacct)
595{
596 memset(pacct, 0, sizeof(struct pacct_struct));
597 pacct->ac_utime = pacct->ac_stime = cputime_zero;
598}
599
600/**
601 * acct_collect - collect accounting information into pacct_struct 591 * acct_collect - collect accounting information into pacct_struct
602 * @exitcode: task exit code 592 * @exitcode: task exit code
603 * @group_dead: not 0, if this thread is the last one in the process. 593 * @group_dead: not 0, if this thread is the last one in the process.
diff --git a/kernel/audit.c b/kernel/audit.c
index 5feed232be9d..78f7f86aa238 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -398,7 +398,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
398 skb_get(skb); 398 skb_get(skb);
399 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); 399 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
400 if (err < 0) { 400 if (err < 0) {
401 BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ 401 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
403 audit_log_lost("auditd dissapeared\n"); 403 audit_log_lost("auditd dissapeared\n");
404 audit_pid = 0; 404 audit_pid = 0;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 4b05bd9479db..028e85663f27 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -548,6 +548,11 @@ int audit_remove_tree_rule(struct audit_krule *rule)
548 return 0; 548 return 0;
549} 549}
550 550
551static int compare_root(struct vfsmount *mnt, void *arg)
552{
553 return mnt->mnt_root->d_inode == arg;
554}
555
551void audit_trim_trees(void) 556void audit_trim_trees(void)
552{ 557{
553 struct list_head cursor; 558 struct list_head cursor;
@@ -559,7 +564,6 @@ void audit_trim_trees(void)
559 struct path path; 564 struct path path;
560 struct vfsmount *root_mnt; 565 struct vfsmount *root_mnt;
561 struct node *node; 566 struct node *node;
562 struct list_head list;
563 int err; 567 int err;
564 568
565 tree = container_of(cursor.next, struct audit_tree, list); 569 tree = container_of(cursor.next, struct audit_tree, list);
@@ -577,24 +581,16 @@ void audit_trim_trees(void)
577 if (!root_mnt) 581 if (!root_mnt)
578 goto skip_it; 582 goto skip_it;
579 583
580 list_add_tail(&list, &root_mnt->mnt_list);
581 spin_lock(&hash_lock); 584 spin_lock(&hash_lock);
582 list_for_each_entry(node, &tree->chunks, list) { 585 list_for_each_entry(node, &tree->chunks, list) {
583 struct audit_chunk *chunk = find_chunk(node); 586 struct inode *inode = find_chunk(node)->watch.inode;
584 struct inode *inode = chunk->watch.inode;
585 struct vfsmount *mnt;
586 node->index |= 1U<<31; 587 node->index |= 1U<<31;
587 list_for_each_entry(mnt, &list, mnt_list) { 588 if (iterate_mounts(compare_root, inode, root_mnt))
588 if (mnt->mnt_root->d_inode == inode) { 589 node->index &= ~(1U<<31);
589 node->index &= ~(1U<<31);
590 break;
591 }
592 }
593 } 590 }
594 spin_unlock(&hash_lock); 591 spin_unlock(&hash_lock);
595 trim_marked(tree); 592 trim_marked(tree);
596 put_tree(tree); 593 put_tree(tree);
597 list_del_init(&list);
598 drop_collected_mounts(root_mnt); 594 drop_collected_mounts(root_mnt);
599skip_it: 595skip_it:
600 mutex_lock(&audit_filter_mutex); 596 mutex_lock(&audit_filter_mutex);
@@ -603,22 +599,6 @@ skip_it:
603 mutex_unlock(&audit_filter_mutex); 599 mutex_unlock(&audit_filter_mutex);
604} 600}
605 601
606static int is_under(struct vfsmount *mnt, struct dentry *dentry,
607 struct path *path)
608{
609 if (mnt != path->mnt) {
610 for (;;) {
611 if (mnt->mnt_parent == mnt)
612 return 0;
613 if (mnt->mnt_parent == path->mnt)
614 break;
615 mnt = mnt->mnt_parent;
616 }
617 dentry = mnt->mnt_mountpoint;
618 }
619 return is_subdir(dentry, path->dentry);
620}
621
622int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) 602int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
623{ 603{
624 604
@@ -638,13 +618,17 @@ void audit_put_tree(struct audit_tree *tree)
638 put_tree(tree); 618 put_tree(tree);
639} 619}
640 620
621static int tag_mount(struct vfsmount *mnt, void *arg)
622{
623 return tag_chunk(mnt->mnt_root->d_inode, arg);
624}
625
641/* called with audit_filter_mutex */ 626/* called with audit_filter_mutex */
642int audit_add_tree_rule(struct audit_krule *rule) 627int audit_add_tree_rule(struct audit_krule *rule)
643{ 628{
644 struct audit_tree *seed = rule->tree, *tree; 629 struct audit_tree *seed = rule->tree, *tree;
645 struct path path; 630 struct path path;
646 struct vfsmount *mnt, *p; 631 struct vfsmount *mnt;
647 struct list_head list;
648 int err; 632 int err;
649 633
650 list_for_each_entry(tree, &tree_list, list) { 634 list_for_each_entry(tree, &tree_list, list) {
@@ -670,16 +654,9 @@ int audit_add_tree_rule(struct audit_krule *rule)
670 err = -ENOMEM; 654 err = -ENOMEM;
671 goto Err; 655 goto Err;
672 } 656 }
673 list_add_tail(&list, &mnt->mnt_list);
674 657
675 get_tree(tree); 658 get_tree(tree);
676 list_for_each_entry(p, &list, mnt_list) { 659 err = iterate_mounts(tag_mount, tree, mnt);
677 err = tag_chunk(p->mnt_root->d_inode, tree);
678 if (err)
679 break;
680 }
681
682 list_del(&list);
683 drop_collected_mounts(mnt); 660 drop_collected_mounts(mnt);
684 661
685 if (!err) { 662 if (!err) {
@@ -714,31 +691,23 @@ int audit_tag_tree(char *old, char *new)
714{ 691{
715 struct list_head cursor, barrier; 692 struct list_head cursor, barrier;
716 int failed = 0; 693 int failed = 0;
717 struct path path; 694 struct path path1, path2;
718 struct vfsmount *tagged; 695 struct vfsmount *tagged;
719 struct list_head list;
720 struct vfsmount *mnt;
721 struct dentry *dentry;
722 int err; 696 int err;
723 697
724 err = kern_path(new, 0, &path); 698 err = kern_path(new, 0, &path2);
725 if (err) 699 if (err)
726 return err; 700 return err;
727 tagged = collect_mounts(&path); 701 tagged = collect_mounts(&path2);
728 path_put(&path); 702 path_put(&path2);
729 if (!tagged) 703 if (!tagged)
730 return -ENOMEM; 704 return -ENOMEM;
731 705
732 err = kern_path(old, 0, &path); 706 err = kern_path(old, 0, &path1);
733 if (err) { 707 if (err) {
734 drop_collected_mounts(tagged); 708 drop_collected_mounts(tagged);
735 return err; 709 return err;
736 } 710 }
737 mnt = mntget(path.mnt);
738 dentry = dget(path.dentry);
739 path_put(&path);
740
741 list_add_tail(&list, &tagged->mnt_list);
742 711
743 mutex_lock(&audit_filter_mutex); 712 mutex_lock(&audit_filter_mutex);
744 list_add(&barrier, &tree_list); 713 list_add(&barrier, &tree_list);
@@ -746,7 +715,7 @@ int audit_tag_tree(char *old, char *new)
746 715
747 while (cursor.next != &tree_list) { 716 while (cursor.next != &tree_list) {
748 struct audit_tree *tree; 717 struct audit_tree *tree;
749 struct vfsmount *p; 718 int good_one = 0;
750 719
751 tree = container_of(cursor.next, struct audit_tree, list); 720 tree = container_of(cursor.next, struct audit_tree, list);
752 get_tree(tree); 721 get_tree(tree);
@@ -754,30 +723,19 @@ int audit_tag_tree(char *old, char *new)
754 list_add(&cursor, &tree->list); 723 list_add(&cursor, &tree->list);
755 mutex_unlock(&audit_filter_mutex); 724 mutex_unlock(&audit_filter_mutex);
756 725
757 err = kern_path(tree->pathname, 0, &path); 726 err = kern_path(tree->pathname, 0, &path2);
758 if (err) { 727 if (!err) {
759 put_tree(tree); 728 good_one = path_is_under(&path1, &path2);
760 mutex_lock(&audit_filter_mutex); 729 path_put(&path2);
761 continue;
762 } 730 }
763 731
764 spin_lock(&vfsmount_lock); 732 if (!good_one) {
765 if (!is_under(mnt, dentry, &path)) {
766 spin_unlock(&vfsmount_lock);
767 path_put(&path);
768 put_tree(tree); 733 put_tree(tree);
769 mutex_lock(&audit_filter_mutex); 734 mutex_lock(&audit_filter_mutex);
770 continue; 735 continue;
771 } 736 }
772 spin_unlock(&vfsmount_lock);
773 path_put(&path);
774
775 list_for_each_entry(p, &list, mnt_list) {
776 failed = tag_chunk(p->mnt_root->d_inode, tree);
777 if (failed)
778 break;
779 }
780 737
738 failed = iterate_mounts(tag_mount, tree, tagged);
781 if (failed) { 739 if (failed) {
782 put_tree(tree); 740 put_tree(tree);
783 mutex_lock(&audit_filter_mutex); 741 mutex_lock(&audit_filter_mutex);
@@ -818,10 +776,8 @@ int audit_tag_tree(char *old, char *new)
818 } 776 }
819 list_del(&barrier); 777 list_del(&barrier);
820 list_del(&cursor); 778 list_del(&cursor);
821 list_del(&list);
822 mutex_unlock(&audit_filter_mutex); 779 mutex_unlock(&audit_filter_mutex);
823 dput(dentry); 780 path_put(&path1);
824 mntput(mnt);
825 drop_collected_mounts(tagged); 781 drop_collected_mounts(tagged);
826 return failed; 782 return failed;
827} 783}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fc0f928167e7..f3a461c0970a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1988,7 +1988,6 @@ void __audit_inode(const char *name, const struct dentry *dentry)
1988 1988
1989/** 1989/**
1990 * audit_inode_child - collect inode info for created/removed objects 1990 * audit_inode_child - collect inode info for created/removed objects
1991 * @dname: inode's dentry name
1992 * @dentry: dentry being audited 1991 * @dentry: dentry being audited
1993 * @parent: inode of dentry parent 1992 * @parent: inode of dentry parent
1994 * 1993 *
@@ -2000,13 +1999,14 @@ void __audit_inode(const char *name, const struct dentry *dentry)
2000 * must be hooked prior, in order to capture the target inode during 1999 * must be hooked prior, in order to capture the target inode during
2001 * unsuccessful attempts. 2000 * unsuccessful attempts.
2002 */ 2001 */
2003void __audit_inode_child(const char *dname, const struct dentry *dentry, 2002void __audit_inode_child(const struct dentry *dentry,
2004 const struct inode *parent) 2003 const struct inode *parent)
2005{ 2004{
2006 int idx; 2005 int idx;
2007 struct audit_context *context = current->audit_context; 2006 struct audit_context *context = current->audit_context;
2008 const char *found_parent = NULL, *found_child = NULL; 2007 const char *found_parent = NULL, *found_child = NULL;
2009 const struct inode *inode = dentry->d_inode; 2008 const struct inode *inode = dentry->d_inode;
2009 const char *dname = dentry->d_name.name;
2010 int dirlen = 0; 2010 int dirlen = 0;
2011 2011
2012 if (!context->in_syscall) 2012 if (!context->in_syscall)
@@ -2014,9 +2014,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
2014 2014
2015 if (inode) 2015 if (inode)
2016 handle_one(inode); 2016 handle_one(inode);
2017 /* determine matching parent */
2018 if (!dname)
2019 goto add_names;
2020 2017
2021 /* parent is more likely, look for it first */ 2018 /* parent is more likely, look for it first */
2022 for (idx = 0; idx < context->name_count; idx++) { 2019 for (idx = 0; idx < context->name_count; idx++) {
diff --git a/kernel/capability.c b/kernel/capability.c
index 7f876e60521f..9e4697e9b276 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -135,7 +135,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
135 if (pid && (pid != task_pid_vnr(current))) { 135 if (pid && (pid != task_pid_vnr(current))) {
136 struct task_struct *target; 136 struct task_struct *target;
137 137
138 read_lock(&tasklist_lock); 138 rcu_read_lock();
139 139
140 target = find_task_by_vpid(pid); 140 target = find_task_by_vpid(pid);
141 if (!target) 141 if (!target)
@@ -143,7 +143,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
143 else 143 else
144 ret = security_capget(target, pEp, pIp, pPp); 144 ret = security_capget(target, pEp, pIp, pPp);
145 145
146 read_unlock(&tasklist_lock); 146 rcu_read_unlock();
147 } else 147 } else
148 ret = security_capget(current, pEp, pIp, pPp); 148 ret = security_capget(current, pEp, pIp, pPp);
149 149
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0249f4be9b5c..ef909a329750 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Notifications support
8 * Copyright (C) 2009 Nokia Corporation
9 * Author: Kirill A. Shutemov
10 *
7 * Copyright notices from the original cpuset code: 11 * Copyright notices from the original cpuset code:
8 * -------------------------------------------------- 12 * --------------------------------------------------
9 * Copyright (C) 2003 BULL SA. 13 * Copyright (C) 2003 BULL SA.
@@ -23,6 +27,7 @@
23 */ 27 */
24 28
25#include <linux/cgroup.h> 29#include <linux/cgroup.h>
30#include <linux/module.h>
26#include <linux/ctype.h> 31#include <linux/ctype.h>
27#include <linux/errno.h> 32#include <linux/errno.h>
28#include <linux/fs.h> 33#include <linux/fs.h>
@@ -43,6 +48,7 @@
43#include <linux/string.h> 48#include <linux/string.h>
44#include <linux/sort.h> 49#include <linux/sort.h>
45#include <linux/kmod.h> 50#include <linux/kmod.h>
51#include <linux/module.h>
46#include <linux/delayacct.h> 52#include <linux/delayacct.h>
47#include <linux/cgroupstats.h> 53#include <linux/cgroupstats.h>
48#include <linux/hash.h> 54#include <linux/hash.h>
@@ -51,15 +57,21 @@
51#include <linux/pid_namespace.h> 57#include <linux/pid_namespace.h>
52#include <linux/idr.h> 58#include <linux/idr.h>
53#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 59#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
60#include <linux/eventfd.h>
61#include <linux/poll.h>
54 62
55#include <asm/atomic.h> 63#include <asm/atomic.h>
56 64
57static DEFINE_MUTEX(cgroup_mutex); 65static DEFINE_MUTEX(cgroup_mutex);
58 66
59/* Generate an array of cgroup subsystem pointers */ 67/*
68 * Generate an array of cgroup subsystem pointers. At boot time, this is
69 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
70 * registered after that. The mutable section of this array is protected by
71 * cgroup_mutex.
72 */
60#define SUBSYS(_x) &_x ## _subsys, 73#define SUBSYS(_x) &_x ## _subsys,
61 74static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
62static struct cgroup_subsys *subsys[] = {
63#include <linux/cgroup_subsys.h> 75#include <linux/cgroup_subsys.h>
64}; 76};
65 77
@@ -146,6 +158,35 @@ struct css_id {
146 unsigned short stack[0]; /* Array of Length (depth+1) */ 158 unsigned short stack[0]; /* Array of Length (depth+1) */
147}; 159};
148 160
161/*
162 * cgroup_event represents events which userspace wants to receive.
163 */
164struct cgroup_event {
165 /*
166 * Cgroup which the event belongs to.
167 */
168 struct cgroup *cgrp;
169 /*
170 * Control file with which the event is associated.
171 */
172 struct cftype *cft;
173 /*
174 * eventfd to signal userspace about the event.
175 */
176 struct eventfd_ctx *eventfd;
177 /*
178 * Each of these is stored in a list by the cgroup.
179 */
180 struct list_head list;
181 /*
182 * All fields below are needed to unregister the event when
183 * userspace closes the eventfd.
184 */
185 poll_table pt;
186 wait_queue_head_t *wqh;
187 wait_queue_t wait;
188 struct work_struct remove;
189};
149 190
150/* The list of hierarchy roots */ 191/* The list of hierarchy roots */
151 192
@@ -166,6 +207,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
166 */ 207 */
167static int need_forkexit_callback __read_mostly; 208static int need_forkexit_callback __read_mostly;
168 209
210#ifdef CONFIG_PROVE_LOCKING
211int cgroup_lock_is_held(void)
212{
213 return lockdep_is_held(&cgroup_mutex);
214}
215#else /* #ifdef CONFIG_PROVE_LOCKING */
216int cgroup_lock_is_held(void)
217{
218 return mutex_is_locked(&cgroup_mutex);
219}
220#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
221
222EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
223
169/* convenient tests for these bits */ 224/* convenient tests for these bits */
170inline int cgroup_is_removed(const struct cgroup *cgrp) 225inline int cgroup_is_removed(const struct cgroup *cgrp)
171{ 226{
@@ -235,7 +290,8 @@ struct cg_cgroup_link {
235static struct css_set init_css_set; 290static struct css_set init_css_set;
236static struct cg_cgroup_link init_css_set_link; 291static struct cg_cgroup_link init_css_set_link;
237 292
238static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); 293static int cgroup_init_idr(struct cgroup_subsys *ss,
294 struct cgroup_subsys_state *css);
239 295
240/* css_set_lock protects the list of css_set objects, and the 296/* css_set_lock protects the list of css_set objects, and the
241 * chain of tasks off each css_set. Nests outside task->alloc_lock 297 * chain of tasks off each css_set. Nests outside task->alloc_lock
@@ -433,8 +489,11 @@ static struct css_set *find_existing_css_set(
433 struct hlist_node *node; 489 struct hlist_node *node;
434 struct css_set *cg; 490 struct css_set *cg;
435 491
436 /* Built the set of subsystem state objects that we want to 492 /*
437 * see in the new css_set */ 493 * Build the set of subsystem state objects that we want to see in the
494 * new css_set. while subsystems can change globally, the entries here
495 * won't change, so no need for locking.
496 */
438 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 497 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
439 if (root->subsys_bits & (1UL << i)) { 498 if (root->subsys_bits & (1UL << i)) {
440 /* Subsystem is in this hierarchy. So we want 499 /* Subsystem is in this hierarchy. So we want
@@ -681,6 +740,7 @@ void cgroup_lock(void)
681{ 740{
682 mutex_lock(&cgroup_mutex); 741 mutex_lock(&cgroup_mutex);
683} 742}
743EXPORT_SYMBOL_GPL(cgroup_lock);
684 744
685/** 745/**
686 * cgroup_unlock - release lock on cgroup changes 746 * cgroup_unlock - release lock on cgroup changes
@@ -691,6 +751,7 @@ void cgroup_unlock(void)
691{ 751{
692 mutex_unlock(&cgroup_mutex); 752 mutex_unlock(&cgroup_mutex);
693} 753}
754EXPORT_SYMBOL_GPL(cgroup_unlock);
694 755
695/* 756/*
696 * A couple of forward declarations required, due to cyclic reference loop: 757 * A couple of forward declarations required, due to cyclic reference loop:
@@ -742,6 +803,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
742 if (ret) 803 if (ret)
743 break; 804 break;
744 } 805 }
806
745 return ret; 807 return ret;
746} 808}
747 809
@@ -869,7 +931,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
869 css_put(css); 931 css_put(css);
870} 932}
871 933
872 934/*
935 * Call with cgroup_mutex held. Drops reference counts on modules, including
936 * any duplicate ones that parse_cgroupfs_options took. If this function
937 * returns an error, no reference counts are touched.
938 */
873static int rebind_subsystems(struct cgroupfs_root *root, 939static int rebind_subsystems(struct cgroupfs_root *root,
874 unsigned long final_bits) 940 unsigned long final_bits)
875{ 941{
@@ -877,6 +943,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
877 struct cgroup *cgrp = &root->top_cgroup; 943 struct cgroup *cgrp = &root->top_cgroup;
878 int i; 944 int i;
879 945
946 BUG_ON(!mutex_is_locked(&cgroup_mutex));
947
880 removed_bits = root->actual_subsys_bits & ~final_bits; 948 removed_bits = root->actual_subsys_bits & ~final_bits;
881 added_bits = final_bits & ~root->actual_subsys_bits; 949 added_bits = final_bits & ~root->actual_subsys_bits;
882 /* Check that any added subsystems are currently free */ 950 /* Check that any added subsystems are currently free */
@@ -885,6 +953,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
885 struct cgroup_subsys *ss = subsys[i]; 953 struct cgroup_subsys *ss = subsys[i];
886 if (!(bit & added_bits)) 954 if (!(bit & added_bits))
887 continue; 955 continue;
956 /*
957 * Nobody should tell us to do a subsys that doesn't exist:
958 * parse_cgroupfs_options should catch that case and refcounts
959 * ensure that subsystems won't disappear once selected.
960 */
961 BUG_ON(ss == NULL);
888 if (ss->root != &rootnode) { 962 if (ss->root != &rootnode) {
889 /* Subsystem isn't free */ 963 /* Subsystem isn't free */
890 return -EBUSY; 964 return -EBUSY;
@@ -904,6 +978,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
904 unsigned long bit = 1UL << i; 978 unsigned long bit = 1UL << i;
905 if (bit & added_bits) { 979 if (bit & added_bits) {
906 /* We're binding this subsystem to this hierarchy */ 980 /* We're binding this subsystem to this hierarchy */
981 BUG_ON(ss == NULL);
907 BUG_ON(cgrp->subsys[i]); 982 BUG_ON(cgrp->subsys[i]);
908 BUG_ON(!dummytop->subsys[i]); 983 BUG_ON(!dummytop->subsys[i]);
909 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 984 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -915,8 +990,10 @@ static int rebind_subsystems(struct cgroupfs_root *root,
915 if (ss->bind) 990 if (ss->bind)
916 ss->bind(ss, cgrp); 991 ss->bind(ss, cgrp);
917 mutex_unlock(&ss->hierarchy_mutex); 992 mutex_unlock(&ss->hierarchy_mutex);
993 /* refcount was already taken, and we're keeping it */
918 } else if (bit & removed_bits) { 994 } else if (bit & removed_bits) {
919 /* We're removing this subsystem */ 995 /* We're removing this subsystem */
996 BUG_ON(ss == NULL);
920 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 997 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
921 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 998 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
922 mutex_lock(&ss->hierarchy_mutex); 999 mutex_lock(&ss->hierarchy_mutex);
@@ -927,9 +1004,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
927 subsys[i]->root = &rootnode; 1004 subsys[i]->root = &rootnode;
928 list_move(&ss->sibling, &rootnode.subsys_list); 1005 list_move(&ss->sibling, &rootnode.subsys_list);
929 mutex_unlock(&ss->hierarchy_mutex); 1006 mutex_unlock(&ss->hierarchy_mutex);
1007 /* subsystem is now free - drop reference on module */
1008 module_put(ss->module);
930 } else if (bit & final_bits) { 1009 } else if (bit & final_bits) {
931 /* Subsystem state should already exist */ 1010 /* Subsystem state should already exist */
1011 BUG_ON(ss == NULL);
932 BUG_ON(!cgrp->subsys[i]); 1012 BUG_ON(!cgrp->subsys[i]);
1013 /*
1014 * a refcount was taken, but we already had one, so
1015 * drop the extra reference.
1016 */
1017 module_put(ss->module);
1018#ifdef CONFIG_MODULE_UNLOAD
1019 BUG_ON(ss->module && !module_refcount(ss->module));
1020#endif
933 } else { 1021 } else {
934 /* Subsystem state shouldn't exist */ 1022 /* Subsystem state shouldn't exist */
935 BUG_ON(cgrp->subsys[i]); 1023 BUG_ON(cgrp->subsys[i]);
@@ -971,13 +1059,20 @@ struct cgroup_sb_opts {
971 1059
972}; 1060};
973 1061
974/* Convert a hierarchy specifier into a bitmask of subsystems and 1062/*
975 * flags. */ 1063 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
976static int parse_cgroupfs_options(char *data, 1064 * with cgroup_mutex held to protect the subsys[] array. This function takes
977 struct cgroup_sb_opts *opts) 1065 * refcounts on subsystems to be used, unless it returns error, in which case
1066 * no refcounts are taken.
1067 */
1068static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
978{ 1069{
979 char *token, *o = data ?: "all"; 1070 char *token, *o = data ?: "all";
980 unsigned long mask = (unsigned long)-1; 1071 unsigned long mask = (unsigned long)-1;
1072 int i;
1073 bool module_pin_failed = false;
1074
1075 BUG_ON(!mutex_is_locked(&cgroup_mutex));
981 1076
982#ifdef CONFIG_CPUSETS 1077#ifdef CONFIG_CPUSETS
983 mask = ~(1UL << cpuset_subsys_id); 1078 mask = ~(1UL << cpuset_subsys_id);
@@ -990,10 +1085,11 @@ static int parse_cgroupfs_options(char *data,
990 return -EINVAL; 1085 return -EINVAL;
991 if (!strcmp(token, "all")) { 1086 if (!strcmp(token, "all")) {
992 /* Add all non-disabled subsystems */ 1087 /* Add all non-disabled subsystems */
993 int i;
994 opts->subsys_bits = 0; 1088 opts->subsys_bits = 0;
995 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1089 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
996 struct cgroup_subsys *ss = subsys[i]; 1090 struct cgroup_subsys *ss = subsys[i];
1091 if (ss == NULL)
1092 continue;
997 if (!ss->disabled) 1093 if (!ss->disabled)
998 opts->subsys_bits |= 1ul << i; 1094 opts->subsys_bits |= 1ul << i;
999 } 1095 }
@@ -1011,7 +1107,6 @@ static int parse_cgroupfs_options(char *data,
1011 if (!opts->release_agent) 1107 if (!opts->release_agent)
1012 return -ENOMEM; 1108 return -ENOMEM;
1013 } else if (!strncmp(token, "name=", 5)) { 1109 } else if (!strncmp(token, "name=", 5)) {
1014 int i;
1015 const char *name = token + 5; 1110 const char *name = token + 5;
1016 /* Can't specify an empty name */ 1111 /* Can't specify an empty name */
1017 if (!strlen(name)) 1112 if (!strlen(name))
@@ -1035,9 +1130,10 @@ static int parse_cgroupfs_options(char *data,
1035 return -ENOMEM; 1130 return -ENOMEM;
1036 } else { 1131 } else {
1037 struct cgroup_subsys *ss; 1132 struct cgroup_subsys *ss;
1038 int i;
1039 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1133 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1040 ss = subsys[i]; 1134 ss = subsys[i];
1135 if (ss == NULL)
1136 continue;
1041 if (!strcmp(token, ss->name)) { 1137 if (!strcmp(token, ss->name)) {
1042 if (!ss->disabled) 1138 if (!ss->disabled)
1043 set_bit(i, &opts->subsys_bits); 1139 set_bit(i, &opts->subsys_bits);
@@ -1072,9 +1168,54 @@ static int parse_cgroupfs_options(char *data,
1072 if (!opts->subsys_bits && !opts->name) 1168 if (!opts->subsys_bits && !opts->name)
1073 return -EINVAL; 1169 return -EINVAL;
1074 1170
1171 /*
1172 * Grab references on all the modules we'll need, so the subsystems
1173 * don't dance around before rebind_subsystems attaches them. This may
1174 * take duplicate reference counts on a subsystem that's already used,
1175 * but rebind_subsystems handles this case.
1176 */
1177 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1178 unsigned long bit = 1UL << i;
1179
1180 if (!(bit & opts->subsys_bits))
1181 continue;
1182 if (!try_module_get(subsys[i]->module)) {
1183 module_pin_failed = true;
1184 break;
1185 }
1186 }
1187 if (module_pin_failed) {
1188 /*
1189 * oops, one of the modules was going away. this means that we
1190 * raced with a module_delete call, and to the user this is
1191 * essentially a "subsystem doesn't exist" case.
1192 */
1193 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
1194 /* drop refcounts only on the ones we took */
1195 unsigned long bit = 1UL << i;
1196
1197 if (!(bit & opts->subsys_bits))
1198 continue;
1199 module_put(subsys[i]->module);
1200 }
1201 return -ENOENT;
1202 }
1203
1075 return 0; 1204 return 0;
1076} 1205}
1077 1206
1207static void drop_parsed_module_refcounts(unsigned long subsys_bits)
1208{
1209 int i;
1210 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1211 unsigned long bit = 1UL << i;
1212
1213 if (!(bit & subsys_bits))
1214 continue;
1215 module_put(subsys[i]->module);
1216 }
1217}
1218
1078static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1219static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1079{ 1220{
1080 int ret = 0; 1221 int ret = 0;
@@ -1091,21 +1232,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1091 if (ret) 1232 if (ret)
1092 goto out_unlock; 1233 goto out_unlock;
1093 1234
1094 /* Don't allow flags to change at remount */ 1235 /* Don't allow flags or name to change at remount */
1095 if (opts.flags != root->flags) { 1236 if (opts.flags != root->flags ||
1096 ret = -EINVAL; 1237 (opts.name && strcmp(opts.name, root->name))) {
1097 goto out_unlock;
1098 }
1099
1100 /* Don't allow name to change at remount */
1101 if (opts.name && strcmp(opts.name, root->name)) {
1102 ret = -EINVAL; 1238 ret = -EINVAL;
1239 drop_parsed_module_refcounts(opts.subsys_bits);
1103 goto out_unlock; 1240 goto out_unlock;
1104 } 1241 }
1105 1242
1106 ret = rebind_subsystems(root, opts.subsys_bits); 1243 ret = rebind_subsystems(root, opts.subsys_bits);
1107 if (ret) 1244 if (ret) {
1245 drop_parsed_module_refcounts(opts.subsys_bits);
1108 goto out_unlock; 1246 goto out_unlock;
1247 }
1109 1248
1110 /* (re)populate subsystem files */ 1249 /* (re)populate subsystem files */
1111 cgroup_populate_dir(cgrp); 1250 cgroup_populate_dir(cgrp);
@@ -1136,6 +1275,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1136 INIT_LIST_HEAD(&cgrp->release_list); 1275 INIT_LIST_HEAD(&cgrp->release_list);
1137 INIT_LIST_HEAD(&cgrp->pidlists); 1276 INIT_LIST_HEAD(&cgrp->pidlists);
1138 mutex_init(&cgrp->pidlist_mutex); 1277 mutex_init(&cgrp->pidlist_mutex);
1278 INIT_LIST_HEAD(&cgrp->event_list);
1279 spin_lock_init(&cgrp->event_list_lock);
1139} 1280}
1140 1281
1141static void init_cgroup_root(struct cgroupfs_root *root) 1282static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1291,7 +1432,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1291 struct cgroupfs_root *new_root; 1432 struct cgroupfs_root *new_root;
1292 1433
1293 /* First find the desired set of subsystems */ 1434 /* First find the desired set of subsystems */
1435 mutex_lock(&cgroup_mutex);
1294 ret = parse_cgroupfs_options(data, &opts); 1436 ret = parse_cgroupfs_options(data, &opts);
1437 mutex_unlock(&cgroup_mutex);
1295 if (ret) 1438 if (ret)
1296 goto out_err; 1439 goto out_err;
1297 1440
@@ -1302,7 +1445,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1302 new_root = cgroup_root_from_opts(&opts); 1445 new_root = cgroup_root_from_opts(&opts);
1303 if (IS_ERR(new_root)) { 1446 if (IS_ERR(new_root)) {
1304 ret = PTR_ERR(new_root); 1447 ret = PTR_ERR(new_root);
1305 goto out_err; 1448 goto drop_modules;
1306 } 1449 }
1307 opts.new_root = new_root; 1450 opts.new_root = new_root;
1308 1451
@@ -1311,7 +1454,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1311 if (IS_ERR(sb)) { 1454 if (IS_ERR(sb)) {
1312 ret = PTR_ERR(sb); 1455 ret = PTR_ERR(sb);
1313 cgroup_drop_root(opts.new_root); 1456 cgroup_drop_root(opts.new_root);
1314 goto out_err; 1457 goto drop_modules;
1315 } 1458 }
1316 1459
1317 root = sb->s_fs_info; 1460 root = sb->s_fs_info;
@@ -1367,6 +1510,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1367 free_cg_links(&tmp_cg_links); 1510 free_cg_links(&tmp_cg_links);
1368 goto drop_new_super; 1511 goto drop_new_super;
1369 } 1512 }
1513 /*
1514 * There must be no failure case after here, since rebinding
1515 * takes care of subsystems' refcounts, which are explicitly
1516 * dropped in the failure exit path.
1517 */
1370 1518
1371 /* EBUSY should be the only error here */ 1519 /* EBUSY should be the only error here */
1372 BUG_ON(ret); 1520 BUG_ON(ret);
@@ -1405,6 +1553,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1405 * any) is not needed 1553 * any) is not needed
1406 */ 1554 */
1407 cgroup_drop_root(opts.new_root); 1555 cgroup_drop_root(opts.new_root);
1556 /* no subsys rebinding, so refcounts don't change */
1557 drop_parsed_module_refcounts(opts.subsys_bits);
1408 } 1558 }
1409 1559
1410 simple_set_mnt(mnt, sb); 1560 simple_set_mnt(mnt, sb);
@@ -1414,6 +1564,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1414 1564
1415 drop_new_super: 1565 drop_new_super:
1416 deactivate_locked_super(sb); 1566 deactivate_locked_super(sb);
1567 drop_modules:
1568 drop_parsed_module_refcounts(opts.subsys_bits);
1417 out_err: 1569 out_err:
1418 kfree(opts.release_agent); 1570 kfree(opts.release_agent);
1419 kfree(opts.name); 1571 kfree(opts.name);
@@ -1527,6 +1679,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1527 memmove(buf, start, buf + buflen - start); 1679 memmove(buf, start, buf + buflen - start);
1528 return 0; 1680 return 0;
1529} 1681}
1682EXPORT_SYMBOL_GPL(cgroup_path);
1530 1683
1531/** 1684/**
1532 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1685 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
@@ -1539,7 +1692,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1539int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1692int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1540{ 1693{
1541 int retval = 0; 1694 int retval = 0;
1542 struct cgroup_subsys *ss; 1695 struct cgroup_subsys *ss, *failed_ss = NULL;
1543 struct cgroup *oldcgrp; 1696 struct cgroup *oldcgrp;
1544 struct css_set *cg; 1697 struct css_set *cg;
1545 struct css_set *newcg; 1698 struct css_set *newcg;
@@ -1553,8 +1706,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1553 for_each_subsys(root, ss) { 1706 for_each_subsys(root, ss) {
1554 if (ss->can_attach) { 1707 if (ss->can_attach) {
1555 retval = ss->can_attach(ss, cgrp, tsk, false); 1708 retval = ss->can_attach(ss, cgrp, tsk, false);
1556 if (retval) 1709 if (retval) {
1557 return retval; 1710 /*
1711 * Remember on which subsystem the can_attach()
1712 * failed, so that we only call cancel_attach()
1713 * against the subsystems whose can_attach()
1714 * succeeded. (See below)
1715 */
1716 failed_ss = ss;
1717 goto out;
1718 }
1558 } 1719 }
1559 } 1720 }
1560 1721
@@ -1568,14 +1729,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1568 */ 1729 */
1569 newcg = find_css_set(cg, cgrp); 1730 newcg = find_css_set(cg, cgrp);
1570 put_css_set(cg); 1731 put_css_set(cg);
1571 if (!newcg) 1732 if (!newcg) {
1572 return -ENOMEM; 1733 retval = -ENOMEM;
1734 goto out;
1735 }
1573 1736
1574 task_lock(tsk); 1737 task_lock(tsk);
1575 if (tsk->flags & PF_EXITING) { 1738 if (tsk->flags & PF_EXITING) {
1576 task_unlock(tsk); 1739 task_unlock(tsk);
1577 put_css_set(newcg); 1740 put_css_set(newcg);
1578 return -ESRCH; 1741 retval = -ESRCH;
1742 goto out;
1579 } 1743 }
1580 rcu_assign_pointer(tsk->cgroups, newcg); 1744 rcu_assign_pointer(tsk->cgroups, newcg);
1581 task_unlock(tsk); 1745 task_unlock(tsk);
@@ -1601,7 +1765,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1601 * is no longer empty. 1765 * is no longer empty.
1602 */ 1766 */
1603 cgroup_wakeup_rmdir_waiter(cgrp); 1767 cgroup_wakeup_rmdir_waiter(cgrp);
1604 return 0; 1768out:
1769 if (retval) {
1770 for_each_subsys(root, ss) {
1771 if (ss == failed_ss)
1772 /*
1773 * This subsystem was the one that failed the
1774 * can_attach() check earlier, so we don't need
1775 * to call cancel_attach() against it or any
1776 * remaining subsystems.
1777 */
1778 break;
1779 if (ss->cancel_attach)
1780 ss->cancel_attach(ss, cgrp, tsk, false);
1781 }
1782 }
1783 return retval;
1605} 1784}
1606 1785
1607/* 1786/*
@@ -1667,6 +1846,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp)
1667 } 1846 }
1668 return true; 1847 return true;
1669} 1848}
1849EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
1670 1850
1671static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 1851static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1672 const char *buffer) 1852 const char *buffer)
@@ -1935,6 +2115,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
1935 .rename = cgroup_rename, 2115 .rename = cgroup_rename,
1936}; 2116};
1937 2117
2118/*
2119 * Check if a file is a control file
2120 */
2121static inline struct cftype *__file_cft(struct file *file)
2122{
2123 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
2124 return ERR_PTR(-EINVAL);
2125 return __d_cft(file->f_dentry);
2126}
2127
1938static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2128static int cgroup_create_file(struct dentry *dentry, mode_t mode,
1939 struct super_block *sb) 2129 struct super_block *sb)
1940{ 2130{
@@ -2054,6 +2244,7 @@ int cgroup_add_file(struct cgroup *cgrp,
2054 error = PTR_ERR(dentry); 2244 error = PTR_ERR(dentry);
2055 return error; 2245 return error;
2056} 2246}
2247EXPORT_SYMBOL_GPL(cgroup_add_file);
2057 2248
2058int cgroup_add_files(struct cgroup *cgrp, 2249int cgroup_add_files(struct cgroup *cgrp,
2059 struct cgroup_subsys *subsys, 2250 struct cgroup_subsys *subsys,
@@ -2068,6 +2259,7 @@ int cgroup_add_files(struct cgroup *cgrp,
2068 } 2259 }
2069 return 0; 2260 return 0;
2070} 2261}
2262EXPORT_SYMBOL_GPL(cgroup_add_files);
2071 2263
2072/** 2264/**
2073 * cgroup_task_count - count the number of tasks in a cgroup. 2265 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2453,7 +2645,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2453{ 2645{
2454 struct cgroup_pidlist *l; 2646 struct cgroup_pidlist *l;
2455 /* don't need task_nsproxy() if we're looking at ourself */ 2647 /* don't need task_nsproxy() if we're looking at ourself */
2456 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); 2648 struct pid_namespace *ns = current->nsproxy->pid_ns;
2649
2457 /* 2650 /*
2458 * We can't drop the pidlist_mutex before taking the l->mutex in case 2651 * We can't drop the pidlist_mutex before taking the l->mutex in case
2459 * the last ref-holder is trying to remove l from the list at the same 2652 * the last ref-holder is trying to remove l from the list at the same
@@ -2463,12 +2656,9 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2463 mutex_lock(&cgrp->pidlist_mutex); 2656 mutex_lock(&cgrp->pidlist_mutex);
2464 list_for_each_entry(l, &cgrp->pidlists, links) { 2657 list_for_each_entry(l, &cgrp->pidlists, links) {
2465 if (l->key.type == type && l->key.ns == ns) { 2658 if (l->key.type == type && l->key.ns == ns) {
2466 /* found a matching list - drop the extra refcount */
2467 put_pid_ns(ns);
2468 /* make sure l doesn't vanish out from under us */ 2659 /* make sure l doesn't vanish out from under us */
2469 down_write(&l->mutex); 2660 down_write(&l->mutex);
2470 mutex_unlock(&cgrp->pidlist_mutex); 2661 mutex_unlock(&cgrp->pidlist_mutex);
2471 l->use_count++;
2472 return l; 2662 return l;
2473 } 2663 }
2474 } 2664 }
@@ -2476,13 +2666,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2476 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 2666 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2477 if (!l) { 2667 if (!l) {
2478 mutex_unlock(&cgrp->pidlist_mutex); 2668 mutex_unlock(&cgrp->pidlist_mutex);
2479 put_pid_ns(ns);
2480 return l; 2669 return l;
2481 } 2670 }
2482 init_rwsem(&l->mutex); 2671 init_rwsem(&l->mutex);
2483 down_write(&l->mutex); 2672 down_write(&l->mutex);
2484 l->key.type = type; 2673 l->key.type = type;
2485 l->key.ns = ns; 2674 l->key.ns = get_pid_ns(ns);
2486 l->use_count = 0; /* don't increment here */ 2675 l->use_count = 0; /* don't increment here */
2487 l->list = NULL; 2676 l->list = NULL;
2488 l->owner = cgrp; 2677 l->owner = cgrp;
@@ -2790,6 +2979,174 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2790} 2979}
2791 2980
2792/* 2981/*
2982 * Unregister event and free resources.
2983 *
2984 * Gets called from workqueue.
2985 */
2986static void cgroup_event_remove(struct work_struct *work)
2987{
2988 struct cgroup_event *event = container_of(work, struct cgroup_event,
2989 remove);
2990 struct cgroup *cgrp = event->cgrp;
2991
2992 /* TODO: check return code */
2993 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
2994
2995 eventfd_ctx_put(event->eventfd);
2996 kfree(event);
2997 dput(cgrp->dentry);
2998}
2999
3000/*
3001 * Gets called on POLLHUP on eventfd when user closes it.
3002 *
3003 * Called with wqh->lock held and interrupts disabled.
3004 */
3005static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3006 int sync, void *key)
3007{
3008 struct cgroup_event *event = container_of(wait,
3009 struct cgroup_event, wait);
3010 struct cgroup *cgrp = event->cgrp;
3011 unsigned long flags = (unsigned long)key;
3012
3013 if (flags & POLLHUP) {
3014 remove_wait_queue_locked(event->wqh, &event->wait);
3015 spin_lock(&cgrp->event_list_lock);
3016 list_del(&event->list);
3017 spin_unlock(&cgrp->event_list_lock);
3018 /*
3019 * We are in atomic context, but cgroup_event_remove() may
3020 * sleep, so we have to call it in workqueue.
3021 */
3022 schedule_work(&event->remove);
3023 }
3024
3025 return 0;
3026}
3027
3028static void cgroup_event_ptable_queue_proc(struct file *file,
3029 wait_queue_head_t *wqh, poll_table *pt)
3030{
3031 struct cgroup_event *event = container_of(pt,
3032 struct cgroup_event, pt);
3033
3034 event->wqh = wqh;
3035 add_wait_queue(wqh, &event->wait);
3036}
3037
3038/*
3039 * Parse input and register new cgroup event handler.
3040 *
3041 * Input must be in format '<event_fd> <control_fd> <args>'.
3042 * Interpretation of args is defined by control file implementation.
3043 */
3044static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3045 const char *buffer)
3046{
3047 struct cgroup_event *event = NULL;
3048 unsigned int efd, cfd;
3049 struct file *efile = NULL;
3050 struct file *cfile = NULL;
3051 char *endp;
3052 int ret;
3053
3054 efd = simple_strtoul(buffer, &endp, 10);
3055 if (*endp != ' ')
3056 return -EINVAL;
3057 buffer = endp + 1;
3058
3059 cfd = simple_strtoul(buffer, &endp, 10);
3060 if ((*endp != ' ') && (*endp != '\0'))
3061 return -EINVAL;
3062 buffer = endp + 1;
3063
3064 event = kzalloc(sizeof(*event), GFP_KERNEL);
3065 if (!event)
3066 return -ENOMEM;
3067 event->cgrp = cgrp;
3068 INIT_LIST_HEAD(&event->list);
3069 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3070 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3071 INIT_WORK(&event->remove, cgroup_event_remove);
3072
3073 efile = eventfd_fget(efd);
3074 if (IS_ERR(efile)) {
3075 ret = PTR_ERR(efile);
3076 goto fail;
3077 }
3078
3079 event->eventfd = eventfd_ctx_fileget(efile);
3080 if (IS_ERR(event->eventfd)) {
3081 ret = PTR_ERR(event->eventfd);
3082 goto fail;
3083 }
3084
3085 cfile = fget(cfd);
3086 if (!cfile) {
3087 ret = -EBADF;
3088 goto fail;
3089 }
3090
3091 /* the process need read permission on control file */
3092 ret = file_permission(cfile, MAY_READ);
3093 if (ret < 0)
3094 goto fail;
3095
3096 event->cft = __file_cft(cfile);
3097 if (IS_ERR(event->cft)) {
3098 ret = PTR_ERR(event->cft);
3099 goto fail;
3100 }
3101
3102 if (!event->cft->register_event || !event->cft->unregister_event) {
3103 ret = -EINVAL;
3104 goto fail;
3105 }
3106
3107 ret = event->cft->register_event(cgrp, event->cft,
3108 event->eventfd, buffer);
3109 if (ret)
3110 goto fail;
3111
3112 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
3113 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3114 ret = 0;
3115 goto fail;
3116 }
3117
3118 /*
3119 * Events should be removed after rmdir of cgroup directory, but before
3120 * destroying subsystem state objects. Let's take reference to cgroup
3121 * directory dentry to do that.
3122 */
3123 dget(cgrp->dentry);
3124
3125 spin_lock(&cgrp->event_list_lock);
3126 list_add(&event->list, &cgrp->event_list);
3127 spin_unlock(&cgrp->event_list_lock);
3128
3129 fput(cfile);
3130 fput(efile);
3131
3132 return 0;
3133
3134fail:
3135 if (cfile)
3136 fput(cfile);
3137
3138 if (event && event->eventfd && !IS_ERR(event->eventfd))
3139 eventfd_ctx_put(event->eventfd);
3140
3141 if (!IS_ERR_OR_NULL(efile))
3142 fput(efile);
3143
3144 kfree(event);
3145
3146 return ret;
3147}
3148
3149/*
2793 * for the common functions, 'private' gives the type of file 3150 * for the common functions, 'private' gives the type of file
2794 */ 3151 */
2795/* for hysterical raisins, we can't put this on the older files */ 3152/* for hysterical raisins, we can't put this on the older files */
@@ -2814,6 +3171,11 @@ static struct cftype files[] = {
2814 .read_u64 = cgroup_read_notify_on_release, 3171 .read_u64 = cgroup_read_notify_on_release,
2815 .write_u64 = cgroup_write_notify_on_release, 3172 .write_u64 = cgroup_write_notify_on_release,
2816 }, 3173 },
3174 {
3175 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3176 .write_string = cgroup_write_event_control,
3177 .mode = S_IWUGO,
3178 },
2817}; 3179};
2818 3180
2819static struct cftype cft_release_agent = { 3181static struct cftype cft_release_agent = {
@@ -2878,8 +3240,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2878 /* We need to take each hierarchy_mutex in a consistent order */ 3240 /* We need to take each hierarchy_mutex in a consistent order */
2879 int i; 3241 int i;
2880 3242
3243 /*
3244 * No worry about a race with rebind_subsystems that might mess up the
3245 * locking order, since both parties are under cgroup_mutex.
3246 */
2881 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3247 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2882 struct cgroup_subsys *ss = subsys[i]; 3248 struct cgroup_subsys *ss = subsys[i];
3249 if (ss == NULL)
3250 continue;
2883 if (ss->root == root) 3251 if (ss->root == root)
2884 mutex_lock(&ss->hierarchy_mutex); 3252 mutex_lock(&ss->hierarchy_mutex);
2885 } 3253 }
@@ -2891,6 +3259,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2891 3259
2892 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3260 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2893 struct cgroup_subsys *ss = subsys[i]; 3261 struct cgroup_subsys *ss = subsys[i];
3262 if (ss == NULL)
3263 continue;
2894 if (ss->root == root) 3264 if (ss->root == root)
2895 mutex_unlock(&ss->hierarchy_mutex); 3265 mutex_unlock(&ss->hierarchy_mutex);
2896 } 3266 }
@@ -2937,14 +3307,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2937 3307
2938 for_each_subsys(root, ss) { 3308 for_each_subsys(root, ss) {
2939 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3309 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3310
2940 if (IS_ERR(css)) { 3311 if (IS_ERR(css)) {
2941 err = PTR_ERR(css); 3312 err = PTR_ERR(css);
2942 goto err_destroy; 3313 goto err_destroy;
2943 } 3314 }
2944 init_cgroup_css(css, ss, cgrp); 3315 init_cgroup_css(css, ss, cgrp);
2945 if (ss->use_id) 3316 if (ss->use_id) {
2946 if (alloc_css_id(ss, parent, cgrp)) 3317 err = alloc_css_id(ss, parent, cgrp);
3318 if (err)
2947 goto err_destroy; 3319 goto err_destroy;
3320 }
2948 /* At error, ->destroy() callback has to free assigned ID. */ 3321 /* At error, ->destroy() callback has to free assigned ID. */
2949 } 3322 }
2950 3323
@@ -3011,11 +3384,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3011 * synchronization other than RCU, and the subsystem linked 3384 * synchronization other than RCU, and the subsystem linked
3012 * list isn't RCU-safe */ 3385 * list isn't RCU-safe */
3013 int i; 3386 int i;
3387 /*
3388 * We won't need to lock the subsys array, because the subsystems
3389 * we're concerned about aren't going anywhere since our cgroup root
3390 * has a reference on them.
3391 */
3014 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3392 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3015 struct cgroup_subsys *ss = subsys[i]; 3393 struct cgroup_subsys *ss = subsys[i];
3016 struct cgroup_subsys_state *css; 3394 struct cgroup_subsys_state *css;
3017 /* Skip subsystems not in this hierarchy */ 3395 /* Skip subsystems not present or not in this hierarchy */
3018 if (ss->root != cgrp->root) 3396 if (ss == NULL || ss->root != cgrp->root)
3019 continue; 3397 continue;
3020 css = cgrp->subsys[ss->subsys_id]; 3398 css = cgrp->subsys[ss->subsys_id];
3021 /* When called from check_for_release() it's possible 3399 /* When called from check_for_release() it's possible
@@ -3089,6 +3467,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
3089 struct dentry *d; 3467 struct dentry *d;
3090 struct cgroup *parent; 3468 struct cgroup *parent;
3091 DEFINE_WAIT(wait); 3469 DEFINE_WAIT(wait);
3470 struct cgroup_event *event, *tmp;
3092 int ret; 3471 int ret;
3093 3472
3094 /* the vfs holds both inode->i_mutex already */ 3473 /* the vfs holds both inode->i_mutex already */
@@ -3172,6 +3551,20 @@ again:
3172 set_bit(CGRP_RELEASABLE, &parent->flags); 3551 set_bit(CGRP_RELEASABLE, &parent->flags);
3173 check_for_release(parent); 3552 check_for_release(parent);
3174 3553
3554 /*
3555 * Unregister events and notify userspace.
3556 * Notify userspace about cgroup removing only after rmdir of cgroup
3557 * directory to avoid race between userspace and kernelspace
3558 */
3559 spin_lock(&cgrp->event_list_lock);
3560 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
3561 list_del(&event->list);
3562 remove_wait_queue(event->wqh, &event->wait);
3563 eventfd_signal(event->eventfd, 1);
3564 schedule_work(&event->remove);
3565 }
3566 spin_unlock(&cgrp->event_list_lock);
3567
3175 mutex_unlock(&cgroup_mutex); 3568 mutex_unlock(&cgroup_mutex);
3176 return 0; 3569 return 0;
3177} 3570}
@@ -3206,7 +3599,196 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
3206 mutex_init(&ss->hierarchy_mutex); 3599 mutex_init(&ss->hierarchy_mutex);
3207 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); 3600 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3208 ss->active = 1; 3601 ss->active = 1;
3602
3603 /* this function shouldn't be used with modular subsystems, since they
3604 * need to register a subsys_id, among other things */
3605 BUG_ON(ss->module);
3606}
3607
3608/**
3609 * cgroup_load_subsys: load and register a modular subsystem at runtime
3610 * @ss: the subsystem to load
3611 *
3612 * This function should be called in a modular subsystem's initcall. If the
3613 * subsytem is built as a module, it will be assigned a new subsys_id and set
3614 * up for use. If the subsystem is built-in anyway, work is delegated to the
3615 * simpler cgroup_init_subsys.
3616 */
3617int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
3618{
3619 int i;
3620 struct cgroup_subsys_state *css;
3621
3622 /* check name and function validity */
3623 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
3624 ss->create == NULL || ss->destroy == NULL)
3625 return -EINVAL;
3626
3627 /*
3628 * we don't support callbacks in modular subsystems. this check is
3629 * before the ss->module check for consistency; a subsystem that could
3630 * be a module should still have no callbacks even if the user isn't
3631 * compiling it as one.
3632 */
3633 if (ss->fork || ss->exit)
3634 return -EINVAL;
3635
3636 /*
3637 * an optionally modular subsystem is built-in: we want to do nothing,
3638 * since cgroup_init_subsys will have already taken care of it.
3639 */
3640 if (ss->module == NULL) {
3641 /* a few sanity checks */
3642 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
3643 BUG_ON(subsys[ss->subsys_id] != ss);
3644 return 0;
3645 }
3646
3647 /*
3648 * need to register a subsys id before anything else - for example,
3649 * init_cgroup_css needs it.
3650 */
3651 mutex_lock(&cgroup_mutex);
3652 /* find the first empty slot in the array */
3653 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
3654 if (subsys[i] == NULL)
3655 break;
3656 }
3657 if (i == CGROUP_SUBSYS_COUNT) {
3658 /* maximum number of subsystems already registered! */
3659 mutex_unlock(&cgroup_mutex);
3660 return -EBUSY;
3661 }
3662 /* assign ourselves the subsys_id */
3663 ss->subsys_id = i;
3664 subsys[i] = ss;
3665
3666 /*
3667 * no ss->create seems to need anything important in the ss struct, so
3668 * this can happen first (i.e. before the rootnode attachment).
3669 */
3670 css = ss->create(ss, dummytop);
3671 if (IS_ERR(css)) {
3672 /* failure case - need to deassign the subsys[] slot. */
3673 subsys[i] = NULL;
3674 mutex_unlock(&cgroup_mutex);
3675 return PTR_ERR(css);
3676 }
3677
3678 list_add(&ss->sibling, &rootnode.subsys_list);
3679 ss->root = &rootnode;
3680
3681 /* our new subsystem will be attached to the dummy hierarchy. */
3682 init_cgroup_css(css, ss, dummytop);
3683 /* init_idr must be after init_cgroup_css because it sets css->id. */
3684 if (ss->use_id) {
3685 int ret = cgroup_init_idr(ss, css);
3686 if (ret) {
3687 dummytop->subsys[ss->subsys_id] = NULL;
3688 ss->destroy(ss, dummytop);
3689 subsys[i] = NULL;
3690 mutex_unlock(&cgroup_mutex);
3691 return ret;
3692 }
3693 }
3694
3695 /*
3696 * Now we need to entangle the css into the existing css_sets. unlike
3697 * in cgroup_init_subsys, there are now multiple css_sets, so each one
3698 * will need a new pointer to it; done by iterating the css_set_table.
3699 * furthermore, modifying the existing css_sets will corrupt the hash
3700 * table state, so each changed css_set will need its hash recomputed.
3701 * this is all done under the css_set_lock.
3702 */
3703 write_lock(&css_set_lock);
3704 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
3705 struct css_set *cg;
3706 struct hlist_node *node, *tmp;
3707 struct hlist_head *bucket = &css_set_table[i], *new_bucket;
3708
3709 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
3710 /* skip entries that we already rehashed */
3711 if (cg->subsys[ss->subsys_id])
3712 continue;
3713 /* remove existing entry */
3714 hlist_del(&cg->hlist);
3715 /* set new value */
3716 cg->subsys[ss->subsys_id] = css;
3717 /* recompute hash and restore entry */
3718 new_bucket = css_set_hash(cg->subsys);
3719 hlist_add_head(&cg->hlist, new_bucket);
3720 }
3721 }
3722 write_unlock(&css_set_lock);
3723
3724 mutex_init(&ss->hierarchy_mutex);
3725 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3726 ss->active = 1;
3727
3728 /* success! */
3729 mutex_unlock(&cgroup_mutex);
3730 return 0;
3209} 3731}
3732EXPORT_SYMBOL_GPL(cgroup_load_subsys);
3733
3734/**
3735 * cgroup_unload_subsys: unload a modular subsystem
3736 * @ss: the subsystem to unload
3737 *
3738 * This function should be called in a modular subsystem's exitcall. When this
3739 * function is invoked, the refcount on the subsystem's module will be 0, so
3740 * the subsystem will not be attached to any hierarchy.
3741 */
3742void cgroup_unload_subsys(struct cgroup_subsys *ss)
3743{
3744 struct cg_cgroup_link *link;
3745 struct hlist_head *hhead;
3746
3747 BUG_ON(ss->module == NULL);
3748
3749 /*
3750 * we shouldn't be called if the subsystem is in use, and the use of
3751 * try_module_get in parse_cgroupfs_options should ensure that it
3752 * doesn't start being used while we're killing it off.
3753 */
3754 BUG_ON(ss->root != &rootnode);
3755
3756 mutex_lock(&cgroup_mutex);
3757 /* deassign the subsys_id */
3758 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
3759 subsys[ss->subsys_id] = NULL;
3760
3761 /* remove subsystem from rootnode's list of subsystems */
3762 list_del(&ss->sibling);
3763
3764 /*
3765 * disentangle the css from all css_sets attached to the dummytop. as
3766 * in loading, we need to pay our respects to the hashtable gods.
3767 */
3768 write_lock(&css_set_lock);
3769 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
3770 struct css_set *cg = link->cg;
3771
3772 hlist_del(&cg->hlist);
3773 BUG_ON(!cg->subsys[ss->subsys_id]);
3774 cg->subsys[ss->subsys_id] = NULL;
3775 hhead = css_set_hash(cg->subsys);
3776 hlist_add_head(&cg->hlist, hhead);
3777 }
3778 write_unlock(&css_set_lock);
3779
3780 /*
3781 * remove subsystem's css from the dummytop and free it - need to free
3782 * before marking as null because ss->destroy needs the cgrp->subsys
3783 * pointer to find their state. note that this also takes care of
3784 * freeing the css_id.
3785 */
3786 ss->destroy(ss, dummytop);
3787 dummytop->subsys[ss->subsys_id] = NULL;
3788
3789 mutex_unlock(&cgroup_mutex);
3790}
3791EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
3210 3792
3211/** 3793/**
3212 * cgroup_init_early - cgroup initialization at system boot 3794 * cgroup_init_early - cgroup initialization at system boot
@@ -3236,7 +3818,8 @@ int __init cgroup_init_early(void)
3236 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 3818 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
3237 INIT_HLIST_HEAD(&css_set_table[i]); 3819 INIT_HLIST_HEAD(&css_set_table[i]);
3238 3820
3239 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3821 /* at bootup time, we don't worry about modular subsystems */
3822 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3240 struct cgroup_subsys *ss = subsys[i]; 3823 struct cgroup_subsys *ss = subsys[i];
3241 3824
3242 BUG_ON(!ss->name); 3825 BUG_ON(!ss->name);
@@ -3271,12 +3854,13 @@ int __init cgroup_init(void)
3271 if (err) 3854 if (err)
3272 return err; 3855 return err;
3273 3856
3274 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3857 /* at bootup time, we don't worry about modular subsystems */
3858 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3275 struct cgroup_subsys *ss = subsys[i]; 3859 struct cgroup_subsys *ss = subsys[i];
3276 if (!ss->early_init) 3860 if (!ss->early_init)
3277 cgroup_init_subsys(ss); 3861 cgroup_init_subsys(ss);
3278 if (ss->use_id) 3862 if (ss->use_id)
3279 cgroup_subsys_init_idr(ss); 3863 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
3280 } 3864 }
3281 3865
3282 /* Add init_css_set to the hash table */ 3866 /* Add init_css_set to the hash table */
@@ -3380,9 +3964,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3380 int i; 3964 int i;
3381 3965
3382 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 3966 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
3967 /*
3968 * ideally we don't want subsystems moving around while we do this.
3969 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
3970 * subsys/hierarchy state.
3971 */
3383 mutex_lock(&cgroup_mutex); 3972 mutex_lock(&cgroup_mutex);
3384 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3973 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3385 struct cgroup_subsys *ss = subsys[i]; 3974 struct cgroup_subsys *ss = subsys[i];
3975 if (ss == NULL)
3976 continue;
3386 seq_printf(m, "%s\t%d\t%d\t%d\n", 3977 seq_printf(m, "%s\t%d\t%d\t%d\n",
3387 ss->name, ss->root->hierarchy_id, 3978 ss->name, ss->root->hierarchy_id,
3388 ss->root->number_of_cgroups, !ss->disabled); 3979 ss->root->number_of_cgroups, !ss->disabled);
@@ -3440,7 +4031,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
3440{ 4031{
3441 if (need_forkexit_callback) { 4032 if (need_forkexit_callback) {
3442 int i; 4033 int i;
3443 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4034 /*
4035 * forkexit callbacks are only supported for builtin
4036 * subsystems, and the builtin section of the subsys array is
4037 * immutable, so we don't need to lock the subsys array here.
4038 */
4039 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3444 struct cgroup_subsys *ss = subsys[i]; 4040 struct cgroup_subsys *ss = subsys[i];
3445 if (ss->fork) 4041 if (ss->fork)
3446 ss->fork(ss, child); 4042 ss->fork(ss, child);
@@ -3509,7 +4105,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
3509 struct css_set *cg; 4105 struct css_set *cg;
3510 4106
3511 if (run_callbacks && need_forkexit_callback) { 4107 if (run_callbacks && need_forkexit_callback) {
3512 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4108 /*
4109 * modular subsystems can't use callbacks, so no need to lock
4110 * the subsys array
4111 */
4112 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3513 struct cgroup_subsys *ss = subsys[i]; 4113 struct cgroup_subsys *ss = subsys[i];
3514 if (ss->exit) 4114 if (ss->exit)
3515 ss->exit(ss, tsk); 4115 ss->exit(ss, tsk);
@@ -3703,12 +4303,13 @@ static void check_for_release(struct cgroup *cgrp)
3703 } 4303 }
3704} 4304}
3705 4305
3706void __css_put(struct cgroup_subsys_state *css) 4306/* Caller must verify that the css is not for root cgroup */
4307void __css_put(struct cgroup_subsys_state *css, int count)
3707{ 4308{
3708 struct cgroup *cgrp = css->cgroup; 4309 struct cgroup *cgrp = css->cgroup;
3709 int val; 4310 int val;
3710 rcu_read_lock(); 4311 rcu_read_lock();
3711 val = atomic_dec_return(&css->refcnt); 4312 val = atomic_sub_return(count, &css->refcnt);
3712 if (val == 1) { 4313 if (val == 1) {
3713 if (notify_on_release(cgrp)) { 4314 if (notify_on_release(cgrp)) {
3714 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4315 set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3719,6 +4320,7 @@ void __css_put(struct cgroup_subsys_state *css)
3719 rcu_read_unlock(); 4320 rcu_read_unlock();
3720 WARN_ON_ONCE(val < 1); 4321 WARN_ON_ONCE(val < 1);
3721} 4322}
4323EXPORT_SYMBOL_GPL(__css_put);
3722 4324
3723/* 4325/*
3724 * Notify userspace when a cgroup is released, by running the 4326 * Notify userspace when a cgroup is released, by running the
@@ -3800,8 +4402,11 @@ static int __init cgroup_disable(char *str)
3800 while ((token = strsep(&str, ",")) != NULL) { 4402 while ((token = strsep(&str, ",")) != NULL) {
3801 if (!*token) 4403 if (!*token)
3802 continue; 4404 continue;
3803 4405 /*
3804 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4406 * cgroup_disable, being at boot time, can't know about module
4407 * subsystems, so we don't worry about them.
4408 */
4409 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3805 struct cgroup_subsys *ss = subsys[i]; 4410 struct cgroup_subsys *ss = subsys[i];
3806 4411
3807 if (!strcmp(token, ss->name)) { 4412 if (!strcmp(token, ss->name)) {
@@ -3831,6 +4436,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
3831 return cssid->id; 4436 return cssid->id;
3832 return 0; 4437 return 0;
3833} 4438}
4439EXPORT_SYMBOL_GPL(css_id);
3834 4440
3835unsigned short css_depth(struct cgroup_subsys_state *css) 4441unsigned short css_depth(struct cgroup_subsys_state *css)
3836{ 4442{
@@ -3840,6 +4446,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
3840 return cssid->depth; 4446 return cssid->depth;
3841 return 0; 4447 return 0;
3842} 4448}
4449EXPORT_SYMBOL_GPL(css_depth);
3843 4450
3844bool css_is_ancestor(struct cgroup_subsys_state *child, 4451bool css_is_ancestor(struct cgroup_subsys_state *child,
3845 const struct cgroup_subsys_state *root) 4452 const struct cgroup_subsys_state *root)
@@ -3876,6 +4483,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
3876 spin_unlock(&ss->id_lock); 4483 spin_unlock(&ss->id_lock);
3877 call_rcu(&id->rcu_head, __free_css_id_cb); 4484 call_rcu(&id->rcu_head, __free_css_id_cb);
3878} 4485}
4486EXPORT_SYMBOL_GPL(free_css_id);
3879 4487
3880/* 4488/*
3881 * This is called by init or create(). Then, calls to this function are 4489 * This is called by init or create(). Then, calls to this function are
@@ -3925,15 +4533,14 @@ err_out:
3925 4533
3926} 4534}
3927 4535
3928static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) 4536static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4537 struct cgroup_subsys_state *rootcss)
3929{ 4538{
3930 struct css_id *newid; 4539 struct css_id *newid;
3931 struct cgroup_subsys_state *rootcss;
3932 4540
3933 spin_lock_init(&ss->id_lock); 4541 spin_lock_init(&ss->id_lock);
3934 idr_init(&ss->idr); 4542 idr_init(&ss->idr);
3935 4543
3936 rootcss = init_css_set.subsys[ss->subsys_id];
3937 newid = get_new_cssid(ss, 0); 4544 newid = get_new_cssid(ss, 0);
3938 if (IS_ERR(newid)) 4545 if (IS_ERR(newid))
3939 return PTR_ERR(newid); 4546 return PTR_ERR(newid);
@@ -3993,6 +4600,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
3993 4600
3994 return rcu_dereference(cssid->css); 4601 return rcu_dereference(cssid->css);
3995} 4602}
4603EXPORT_SYMBOL_GPL(css_lookup);
3996 4604
3997/** 4605/**
3998 * css_get_next - lookup next cgroup under specified hierarchy. 4606 * css_get_next - lookup next cgroup under specified hierarchy.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1c8ddd6ee940..f8cced2692b3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -151,13 +151,13 @@ static inline void check_for_tasks(int cpu)
151 151
152 write_lock_irq(&tasklist_lock); 152 write_lock_irq(&tasklist_lock);
153 for_each_process(p) { 153 for_each_process(p) {
154 if (task_cpu(p) == cpu && 154 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
155 (!cputime_eq(p->utime, cputime_zero) || 155 (!cputime_eq(p->utime, cputime_zero) ||
156 !cputime_eq(p->stime, cputime_zero))) 156 !cputime_eq(p->stime, cputime_zero)))
157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
158 (state = %ld, flags = %x) \n", 158 "(state = %ld, flags = %x)\n",
159 p->comm, task_pid_nr(p), cpu, 159 p->comm, task_pid_nr(p), cpu,
160 p->state, p->flags); 160 p->state, p->flags);
161 } 161 }
162 write_unlock_irq(&tasklist_lock); 162 write_unlock_irq(&tasklist_lock);
163} 163}
@@ -338,7 +338,7 @@ int __cpuinit cpu_up(unsigned int cpu)
338 if (!cpu_possible(cpu)) { 338 if (!cpu_possible(cpu)) {
339 printk(KERN_ERR "can't online cpu %d because it is not " 339 printk(KERN_ERR "can't online cpu %d because it is not "
340 "configured as may-hotadd at boot time\n", cpu); 340 "configured as may-hotadd at boot time\n", cpu);
341#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 341#if defined(CONFIG_IA64)
342 printk(KERN_ERR "please check additional_cpus= boot " 342 printk(KERN_ERR "please check additional_cpus= boot "
343 "parameter\n"); 343 "parameter\n");
344#endif 344#endif
diff --git a/kernel/cred.c b/kernel/cred.c
index dd76cfe5f5b0..1ed8ca18790c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -224,7 +224,7 @@ struct cred *cred_alloc_blank(void)
224#ifdef CONFIG_KEYS 224#ifdef CONFIG_KEYS
225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); 225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
226 if (!new->tgcred) { 226 if (!new->tgcred) {
227 kfree(new); 227 kmem_cache_free(cred_jar, new);
228 return NULL; 228 return NULL;
229 } 229 }
230 atomic_set(&new->tgcred->usage, 1); 230 atomic_set(&new->tgcred->usage, 1);
diff --git a/kernel/early_res.c b/kernel/early_res.c
new file mode 100644
index 000000000000..3cb2c661bb78
--- /dev/null
+++ b/kernel/early_res.c
@@ -0,0 +1,578 @@
1/*
2 * early_res, could be used to replace bootmem
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/bootmem.h>
8#include <linux/mm.h>
9#include <linux/early_res.h>
10
11/*
12 * Early reserved memory areas.
13 */
14/*
15 * this needs to be big enough to hold every reservation made before
16 * find_fw_memmap_area could be used
17 */
18#define MAX_EARLY_RES_X 32
19
/*
 * One entry of the early reservation table.
 *
 * The routines in this file treat an entry as the half-open range
 * [start, end) and treat end == 0 as "slot unused": every table scan
 * stops at the first entry whose end is zero.
 */
20struct early_res {
21 u64 start, end;
/* label for the range; writers copy at most sizeof(name) - 1 bytes */
22 char name[15];
/* non-zero: a later reservation may overlap (and carve up) this one */
23 char overlap_ok;
24};
/* static bootstrap table, used until the table is relocated and doubled */
25static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
26
/* number of slots in the table currently pointed to by early_res */
27static int max_early_res __initdata = MAX_EARLY_RES_X;
/* current table; starts out as early_res_x */
28static struct early_res *early_res __initdata = &early_res_x[0];
/* number of slots in use; also used as the index of the next free slot */
29static int early_res_count __initdata;
30
31static int __init find_overlapped_early(u64 start, u64 end)
32{
33 int i;
34 struct early_res *r;
35
36 for (i = 0; i < max_early_res && early_res[i].end; i++) {
37 r = &early_res[i];
38 if (end > r->start && start < r->end)
39 break;
40 }
41
42 return i;
43}
44
45/*
46 * Drop the i-th range from the early reservation map,
47 * by copying any higher ranges down one over it, and
48 * clearing what had been the last slot.
49 */
50static void __init drop_range(int i)
51{
52 int j;
53
54 for (j = i + 1; j < max_early_res && early_res[j].end; j++)
55 ;
56
57 memmove(&early_res[i], &early_res[i + 1],
58 (j - 1 - i) * sizeof(struct early_res));
59
60 early_res[j - 1].end = 0;
61 early_res_count--;
62}
63
64static void __init drop_range_partial(int i, u64 start, u64 end)
65{
66 u64 common_start, common_end;
67 u64 old_start, old_end;
68
69 old_start = early_res[i].start;
70 old_end = early_res[i].end;
71 common_start = max(old_start, start);
72 common_end = min(old_end, end);
73
74 /* no overlap ? */
75 if (common_start >= common_end)
76 return;
77
78 if (old_start < common_start) {
79 /* make head segment */
80 early_res[i].end = common_start;
81 if (old_end > common_end) {
82 char name[15];
83
84 /*
85 * Save a local copy of the name, since the
86 * early_res array could get resized inside
87 * reserve_early_without_check() ->
88 * __check_and_double_early_res(), which would
89 * make the current name pointer invalid.
90 */
91 strncpy(name, early_res[i].name,
92 sizeof(early_res[i].name) - 1);
93 /* add another for left over on tail */
94 reserve_early_without_check(common_end, old_end, name);
95 }
96 return;
97 } else {
98 if (old_end > common_end) {
99 /* reuse the entry for tail left */
100 early_res[i].start = common_end;
101 return;
102 }
103 /* all covered */
104 drop_range(i);
105 }
106}
107
108/*
109 * Split any existing ranges that:
110 * 1) are marked 'overlap_ok', and
111 * 2) overlap with the stated range [start, end)
112 * into whatever portion (if any) of the existing range is entirely
113 * below or entirely above the stated range. Drop the portion
114 * of the existing range that overlaps with the stated range,
115 * which will allow the caller of this routine to then add that
116 * stated range without conflicting with any existing range.
117 */
118static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
119{
120 int i;
121 struct early_res *r;
122 u64 lower_start, lower_end;
123 u64 upper_start, upper_end;
124 char name[15];
125
126 for (i = 0; i < max_early_res && early_res[i].end; i++) {
127 r = &early_res[i];
128
129 /* Continue past non-overlapping ranges */
130 if (end <= r->start || start >= r->end)
131 continue;
132
133 /*
134 * Leave non-ok overlaps as is; let caller
135 * panic "Overlapping early reservations"
136 * when it hits this overlap.
137 */
138 if (!r->overlap_ok)
139 return;
140
141 /*
142 * We have an ok overlap. We will drop it from the early
143 * reservation map, and add back in any non-overlapping
144 * portions (lower or upper) as separate, overlap_ok,
145 * non-overlapping ranges.
146 */
147
148 /* 1. Note any non-overlapping (lower or upper) ranges. */
149 strncpy(name, r->name, sizeof(name) - 1);
150
151 lower_start = lower_end = 0;
152 upper_start = upper_end = 0;
153 if (r->start < start) {
154 lower_start = r->start;
155 lower_end = start;
156 }
157 if (r->end > end) {
158 upper_start = end;
159 upper_end = r->end;
160 }
161
162 /* 2. Drop the original ok overlapping range */
163 drop_range(i);
164
165 i--; /* resume for-loop on copied down entry */
166
167 /* 3. Add back in any non-overlapping ranges. */
168 if (lower_end)
169 reserve_early_overlap_ok(lower_start, lower_end, name);
170 if (upper_end)
171 reserve_early_overlap_ok(upper_start, upper_end, name);
172 }
173}
174
175static void __init __reserve_early(u64 start, u64 end, char *name,
176 int overlap_ok)
177{
178 int i;
179 struct early_res *r;
180
181 i = find_overlapped_early(start, end);
182 if (i >= max_early_res)
183 panic("Too many early reservations");
184 r = &early_res[i];
185 if (r->end)
186 panic("Overlapping early reservations "
187 "%llx-%llx %s to %llx-%llx %s\n",
188 start, end - 1, name ? name : "", r->start,
189 r->end - 1, r->name);
190 r->start = start;
191 r->end = end;
192 r->overlap_ok = overlap_ok;
193 if (name)
194 strncpy(r->name, name, sizeof(r->name) - 1);
195 early_res_count++;
196}
197
198/*
199 * A few early reservtations come here.
200 *
201 * The 'overlap_ok' in the name of this routine does -not- mean it
202 * is ok for these reservations to overlap an earlier reservation.
203 * Rather it means that it is ok for subsequent reservations to
204 * overlap this one.
205 *
206 * Use this entry point to reserve early ranges when you are doing
207 * so out of "Paranoia", reserving perhaps more memory than you need,
208 * just in case, and don't mind a subsequent overlapping reservation
209 * that is known to be needed.
210 *
211 * The drop_overlaps_that_are_ok() call here isn't really needed.
212 * It would be needed if we had two colliding 'overlap_ok'
213 * reservations, so that the second such would not panic on the
214 * overlap with the first. We don't have any such as of this
215 * writing, but might as well tolerate such if it happens in
216 * the future.
217 */
218void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
219{
220 drop_overlaps_that_are_ok(start, end);
221 __reserve_early(start, end, name, 1);
222}
223
224static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
225{
226 u64 start, end, size, mem;
227 struct early_res *new;
228
229 /* do we have enough slots left ? */
230 if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
231 return;
232
233 /* double it */
234 mem = -1ULL;
235 size = sizeof(struct early_res) * max_early_res * 2;
236 if (early_res == early_res_x)
237 start = 0;
238 else
239 start = early_res[0].end;
240 end = ex_start;
241 if (start + size < end)
242 mem = find_fw_memmap_area(start, end, size,
243 sizeof(struct early_res));
244 if (mem == -1ULL) {
245 start = ex_end;
246 end = get_max_mapped();
247 if (start + size < end)
248 mem = find_fw_memmap_area(start, end, size,
249 sizeof(struct early_res));
250 }
251 if (mem == -1ULL)
252 panic("can not find more space for early_res array");
253
254 new = __va(mem);
255 /* save the first one for own */
256 new[0].start = mem;
257 new[0].end = mem + size;
258 new[0].overlap_ok = 0;
259 /* copy old to new */
260 if (early_res == early_res_x) {
261 memcpy(&new[1], &early_res[0],
262 sizeof(struct early_res) * max_early_res);
263 memset(&new[max_early_res+1], 0,
264 sizeof(struct early_res) * (max_early_res - 1));
265 early_res_count++;
266 } else {
267 memcpy(&new[1], &early_res[1],
268 sizeof(struct early_res) * (max_early_res - 1));
269 memset(&new[max_early_res], 0,
270 sizeof(struct early_res) * max_early_res);
271 }
272 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
273 early_res = new;
274 max_early_res *= 2;
275 printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
276 max_early_res, mem, mem + size - 1);
277}
278
279/*
280 * Most early reservations come here.
281 *
282 * We first have drop_overlaps_that_are_ok() drop any pre-existing
283 * 'overlap_ok' ranges, so that we can then reserve this memory
284 * range without risk of panic'ing on an overlapping overlap_ok
285 * early reservation.
286 */
287void __init reserve_early(u64 start, u64 end, char *name)
288{
289 if (start >= end)
290 return;
291
292 __check_and_double_early_res(start, end);
293
294 drop_overlaps_that_are_ok(start, end);
295 __reserve_early(start, end, name, 0);
296}
297
298void __init reserve_early_without_check(u64 start, u64 end, char *name)
299{
300 struct early_res *r;
301
302 if (start >= end)
303 return;
304
305 __check_and_double_early_res(start, end);
306
307 r = &early_res[early_res_count];
308
309 r->start = start;
310 r->end = end;
311 r->overlap_ok = 0;
312 if (name)
313 strncpy(r->name, name, sizeof(r->name) - 1);
314 early_res_count++;
315}
316
317void __init free_early(u64 start, u64 end)
318{
319 struct early_res *r;
320 int i;
321
322 i = find_overlapped_early(start, end);
323 r = &early_res[i];
324 if (i >= max_early_res || r->end != end || r->start != start)
325 panic("free_early on not reserved area: %llx-%llx!",
326 start, end - 1);
327
328 drop_range(i);
329}
330
331void __init free_early_partial(u64 start, u64 end)
332{
333 struct early_res *r;
334 int i;
335
336try_next:
337 i = find_overlapped_early(start, end);
338 if (i >= max_early_res)
339 return;
340
341 r = &early_res[i];
342 /* hole ? */
343 if (r->end >= end && r->start <= start) {
344 drop_range_partial(i, start, end);
345 return;
346 }
347
348 drop_range_partial(i, start, end);
349 goto try_next;
350}
351
352#ifdef CONFIG_NO_BOOTMEM
353static void __init subtract_early_res(struct range *range, int az)
354{
355 int i, count;
356 u64 final_start, final_end;
357 int idx = 0;
358
359 count = 0;
360 for (i = 0; i < max_early_res && early_res[i].end; i++)
361 count++;
362
363 /* need to skip first one ?*/
364 if (early_res != early_res_x)
365 idx = 1;
366
367#define DEBUG_PRINT_EARLY_RES 1
368
369#if DEBUG_PRINT_EARLY_RES
370 printk(KERN_INFO "Subtract (%d early reservations)\n", count);
371#endif
372 for (i = idx; i < count; i++) {
373 struct early_res *r = &early_res[i];
374#if DEBUG_PRINT_EARLY_RES
375 printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
376 r->start, r->end, r->name);
377#endif
378 final_start = PFN_DOWN(r->start);
379 final_end = PFN_UP(r->end);
380 if (final_start >= final_end)
381 continue;
382 subtract_range(range, az, final_start, final_end);
383 }
384
385}
386
387int __init get_free_all_memory_range(struct range **rangep, int nodeid)
388{
389 int i, count;
390 u64 start = 0, end;
391 u64 size;
392 u64 mem;
393 struct range *range;
394 int nr_range;
395
396 count = 0;
397 for (i = 0; i < max_early_res && early_res[i].end; i++)
398 count++;
399
400 count *= 2;
401
402 size = sizeof(struct range) * count;
403 end = get_max_mapped();
404#ifdef MAX_DMA32_PFN
405 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
406 start = MAX_DMA32_PFN << PAGE_SHIFT;
407#endif
408 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
409 if (mem == -1ULL)
410 panic("can not find more space for range free");
411
412 range = __va(mem);
413 /* use early_node_map[] and early_res to get range array at first */
414 memset(range, 0, size);
415 nr_range = 0;
416
417 /* need to go over early_node_map to find out good range for node */
418 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
419#ifdef CONFIG_X86_32
420 subtract_range(range, count, max_low_pfn, -1ULL);
421#endif
422 subtract_early_res(range, count);
423 nr_range = clean_sort_range(range, count);
424
425 /* need to clear it ? */
426 if (nodeid == MAX_NUMNODES) {
427 memset(&early_res[0], 0,
428 sizeof(struct early_res) * max_early_res);
429 early_res = NULL;
430 max_early_res = 0;
431 }
432
433 *rangep = range;
434 return nr_range;
435}
436#else
437void __init early_res_to_bootmem(u64 start, u64 end)
438{
439 int i, count;
440 u64 final_start, final_end;
441 int idx = 0;
442
443 count = 0;
444 for (i = 0; i < max_early_res && early_res[i].end; i++)
445 count++;
446
447 /* need to skip first one ?*/
448 if (early_res != early_res_x)
449 idx = 1;
450
451 printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
452 count - idx, max_early_res, start, end);
453 for (i = idx; i < count; i++) {
454 struct early_res *r = &early_res[i];
455 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
456 r->start, r->end, r->name);
457 final_start = max(start, r->start);
458 final_end = min(end, r->end);
459 if (final_start >= final_end) {
460 printk(KERN_CONT "\n");
461 continue;
462 }
463 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
464 final_start, final_end);
465 reserve_bootmem_generic(final_start, final_end - final_start,
466 BOOTMEM_DEFAULT);
467 }
468 /* clear them */
469 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
470 early_res = NULL;
471 max_early_res = 0;
472 early_res_count = 0;
473}
474#endif
475
476/* Check for already reserved areas */
477static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
478{
479 int i;
480 u64 addr = *addrp;
481 int changed = 0;
482 struct early_res *r;
483again:
484 i = find_overlapped_early(addr, addr + size);
485 r = &early_res[i];
486 if (i < max_early_res && r->end) {
487 *addrp = addr = round_up(r->end, align);
488 changed = 1;
489 goto again;
490 }
491 return changed;
492}
493
494/* Check for already reserved areas */
495static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
496{
497 int i;
498 u64 addr = *addrp, last;
499 u64 size = *sizep;
500 int changed = 0;
501again:
502 last = addr + size;
503 for (i = 0; i < max_early_res && early_res[i].end; i++) {
504 struct early_res *r = &early_res[i];
505 if (last > r->start && addr < r->start) {
506 size = r->start - addr;
507 changed = 1;
508 goto again;
509 }
510 if (last > r->end && addr < r->end) {
511 addr = round_up(r->end, align);
512 size = last - addr;
513 changed = 1;
514 goto again;
515 }
516 if (last <= r->end && addr >= r->start) {
517 (*sizep)++;
518 return 0;
519 }
520 }
521 if (changed) {
522 *addrp = addr;
523 *sizep = size;
524 }
525 return changed;
526}
527
528/*
529 * Find a free area with specified alignment in a specific range.
530 * only with the area.between start to end is active range from early_node_map
531 * so they are good as RAM
532 */
533u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
534 u64 size, u64 align)
535{
536 u64 addr, last;
537
538 addr = round_up(ei_start, align);
539 if (addr < start)
540 addr = round_up(start, align);
541 if (addr >= ei_last)
542 goto out;
543 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
544 ;
545 last = addr + size;
546 if (last > ei_last)
547 goto out;
548 if (last > end)
549 goto out;
550
551 return addr;
552
553out:
554 return -1ULL;
555}
556
557u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
558 u64 *sizep, u64 align)
559{
560 u64 addr, last;
561
562 addr = round_up(ei_start, align);
563 if (addr < start)
564 addr = round_up(start, align);
565 if (addr >= ei_last)
566 goto out;
567 *sizep = ei_last - addr;
568 while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
569 ;
570 last = addr + *sizep;
571 if (last > ei_last)
572 goto out;
573
574 return addr;
575
576out:
577 return -1ULL;
578}
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
new file mode 100644
index 000000000000..ff915efef66d
--- /dev/null
+++ b/kernel/elfcore.c
@@ -0,0 +1,28 @@
1#include <linux/elf.h>
2#include <linux/fs.h>
3#include <linux/mm.h>
4
5#include <asm/elf.h>
6
7
/*
 * Default elf_core_extra_phdrs(): always returns 0.
 * NOTE(review): going by the name this is the number of extra ELF
 * program headers for a core dump, and __weak presumably lets
 * architecture code provide its own version - confirm against callers.
 */
8Elf_Half __weak elf_core_extra_phdrs(void)
9{
10 return 0;
11}
12
/*
 * Default elf_core_write_extra_phdrs(): returns 1 unconditionally and
 * does not touch @file, @offset, @size or @limit.
 * NOTE(review): 1 presumably means "success" to the core dump code and
 * __weak presumably lets architecture code override this - confirm.
 */
13int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
14 unsigned long limit)
15{
16 return 1;
17}
18
/*
 * Default elf_core_write_extra_data(): returns 1 unconditionally and
 * does not touch @file, @size or @limit.
 * NOTE(review): 1 presumably means "success" to the core dump code and
 * __weak presumably lets architecture code override this - confirm.
 */
19int __weak elf_core_write_extra_data(struct file *file, size_t *size,
20 unsigned long limit)
21{
22 return 1;
23}
24
/*
 * Default elf_core_extra_data_size(): always returns 0.
 * NOTE(review): going by the name this is the size of extra core dump
 * data, and __weak presumably lets architecture code provide its own
 * version - confirm against callers.
 */
25size_t __weak elf_core_extra_data_size(void)
26{
27 return 0;
28}
diff --git a/kernel/exit.c b/kernel/exit.c
index 546774a31a66..cce59cb5ee6a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -85,7 +85,9 @@ static void __exit_signal(struct task_struct *tsk)
85 BUG_ON(!sig); 85 BUG_ON(!sig);
86 BUG_ON(!atomic_read(&sig->count)); 86 BUG_ON(!atomic_read(&sig->count));
87 87
88 sighand = rcu_dereference(tsk->sighand); 88 sighand = rcu_dereference_check(tsk->sighand,
89 rcu_read_lock_held() ||
90 lockdep_tasklist_lock_is_held());
89 spin_lock(&sighand->siglock); 91 spin_lock(&sighand->siglock);
90 92
91 posix_cpu_timers_exit(tsk); 93 posix_cpu_timers_exit(tsk);
@@ -170,8 +172,10 @@ void release_task(struct task_struct * p)
170repeat: 172repeat:
171 tracehook_prepare_release_task(p); 173 tracehook_prepare_release_task(p);
172 /* don't need to get the RCU readlock here - the process is dead and 174 /* don't need to get the RCU readlock here - the process is dead and
173 * can't be modifying its own credentials */ 175 * can't be modifying its own credentials. But shut RCU-lockdep up */
176 rcu_read_lock();
174 atomic_dec(&__task_cred(p)->user->processes); 177 atomic_dec(&__task_cred(p)->user->processes);
178 rcu_read_unlock();
175 179
176 proc_flush_task(p); 180 proc_flush_task(p);
177 181
@@ -473,9 +477,11 @@ static void close_files(struct files_struct * files)
473 /* 477 /*
474 * It is safe to dereference the fd table without RCU or 478 * It is safe to dereference the fd table without RCU or
475 * ->file_lock because this is the last reference to the 479 * ->file_lock because this is the last reference to the
476 * files structure. 480 * files structure. But use RCU to shut RCU-lockdep up.
477 */ 481 */
482 rcu_read_lock();
478 fdt = files_fdtable(files); 483 fdt = files_fdtable(files);
484 rcu_read_unlock();
479 for (;;) { 485 for (;;) {
480 unsigned long set; 486 unsigned long set;
481 i = j * __NFDBITS; 487 i = j * __NFDBITS;
@@ -521,10 +527,12 @@ void put_files_struct(struct files_struct *files)
521 * at the end of the RCU grace period. Otherwise, 527 * at the end of the RCU grace period. Otherwise,
522 * you can free files immediately. 528 * you can free files immediately.
523 */ 529 */
530 rcu_read_lock();
524 fdt = files_fdtable(files); 531 fdt = files_fdtable(files);
525 if (fdt != &files->fdtab) 532 if (fdt != &files->fdtab)
526 kmem_cache_free(files_cachep, files); 533 kmem_cache_free(files_cachep, files);
527 free_fdtable(fdt); 534 free_fdtable(fdt);
535 rcu_read_unlock();
528 } 536 }
529} 537}
530 538
@@ -944,7 +952,8 @@ NORET_TYPE void do_exit(long code)
944 preempt_count()); 952 preempt_count());
945 953
946 acct_update_integrals(tsk); 954 acct_update_integrals(tsk);
947 955 /* sync mm's RSS info before statistics gathering */
956 sync_mm_rss(tsk, tsk->mm);
948 group_dead = atomic_dec_and_test(&tsk->signal->live); 957 group_dead = atomic_dec_and_test(&tsk->signal->live);
949 if (group_dead) { 958 if (group_dead) {
950 hrtimer_cancel(&tsk->signal->real_timer); 959 hrtimer_cancel(&tsk->signal->real_timer);
@@ -1180,7 +1189,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1180 1189
1181 if (unlikely(wo->wo_flags & WNOWAIT)) { 1190 if (unlikely(wo->wo_flags & WNOWAIT)) {
1182 int exit_code = p->exit_code; 1191 int exit_code = p->exit_code;
1183 int why, status; 1192 int why;
1184 1193
1185 get_task_struct(p); 1194 get_task_struct(p);
1186 read_unlock(&tasklist_lock); 1195 read_unlock(&tasklist_lock);
diff --git a/kernel/fork.c b/kernel/fork.c
index 5b2959b3ffc2..4799c5f0e6d0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -87,6 +87,14 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
87 87
88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
89 89
90#ifdef CONFIG_PROVE_RCU
/*
 * Return the result of lockdep_is_held() for tasklist_lock.
 * Only built with CONFIG_PROVE_RCU; exported (GPL-only) so that code
 * outside this file can use it without referencing tasklist_lock
 * directly.
 */
91int lockdep_tasklist_lock_is_held(void)
92{
93 return lockdep_is_held(&tasklist_lock);
94}
95EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
96#endif /* #ifdef CONFIG_PROVE_RCU */
97
90int nr_processes(void) 98int nr_processes(void)
91{ 99{
92 int cpu; 100 int cpu;
@@ -328,15 +336,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
328 if (!tmp) 336 if (!tmp)
329 goto fail_nomem; 337 goto fail_nomem;
330 *tmp = *mpnt; 338 *tmp = *mpnt;
339 INIT_LIST_HEAD(&tmp->anon_vma_chain);
331 pol = mpol_dup(vma_policy(mpnt)); 340 pol = mpol_dup(vma_policy(mpnt));
332 retval = PTR_ERR(pol); 341 retval = PTR_ERR(pol);
333 if (IS_ERR(pol)) 342 if (IS_ERR(pol))
334 goto fail_nomem_policy; 343 goto fail_nomem_policy;
335 vma_set_policy(tmp, pol); 344 vma_set_policy(tmp, pol);
345 if (anon_vma_fork(tmp, mpnt))
346 goto fail_nomem_anon_vma_fork;
336 tmp->vm_flags &= ~VM_LOCKED; 347 tmp->vm_flags &= ~VM_LOCKED;
337 tmp->vm_mm = mm; 348 tmp->vm_mm = mm;
338 tmp->vm_next = NULL; 349 tmp->vm_next = NULL;
339 anon_vma_link(tmp);
340 file = tmp->vm_file; 350 file = tmp->vm_file;
341 if (file) { 351 if (file) {
342 struct inode *inode = file->f_path.dentry->d_inode; 352 struct inode *inode = file->f_path.dentry->d_inode;
@@ -391,6 +401,8 @@ out:
391 flush_tlb_mm(oldmm); 401 flush_tlb_mm(oldmm);
392 up_write(&oldmm->mmap_sem); 402 up_write(&oldmm->mmap_sem);
393 return retval; 403 return retval;
404fail_nomem_anon_vma_fork:
405 mpol_put(pol);
394fail_nomem_policy: 406fail_nomem_policy:
395 kmem_cache_free(vm_area_cachep, tmp); 407 kmem_cache_free(vm_area_cachep, tmp);
396fail_nomem: 408fail_nomem:
@@ -454,8 +466,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
454 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; 466 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
455 mm->core_state = NULL; 467 mm->core_state = NULL;
456 mm->nr_ptes = 0; 468 mm->nr_ptes = 0;
457 set_mm_counter(mm, file_rss, 0); 469 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
458 set_mm_counter(mm, anon_rss, 0);
459 spin_lock_init(&mm->page_table_lock); 470 spin_lock_init(&mm->page_table_lock);
460 mm->free_area_cache = TASK_UNMAPPED_BASE; 471 mm->free_area_cache = TASK_UNMAPPED_BASE;
461 mm->cached_hole_size = ~0UL; 472 mm->cached_hole_size = ~0UL;
@@ -824,23 +835,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
824 */ 835 */
825static void posix_cpu_timers_init_group(struct signal_struct *sig) 836static void posix_cpu_timers_init_group(struct signal_struct *sig)
826{ 837{
838 unsigned long cpu_limit;
839
827 /* Thread group counters. */ 840 /* Thread group counters. */
828 thread_group_cputime_init(sig); 841 thread_group_cputime_init(sig);
829 842
830 /* Expiration times and increments. */ 843 cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
831 sig->it[CPUCLOCK_PROF].expires = cputime_zero; 844 if (cpu_limit != RLIM_INFINITY) {
832 sig->it[CPUCLOCK_PROF].incr = cputime_zero; 845 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
833 sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
834 sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
835
836 /* Cached expiration times. */
837 sig->cputime_expires.prof_exp = cputime_zero;
838 sig->cputime_expires.virt_exp = cputime_zero;
839 sig->cputime_expires.sched_exp = 0;
840
841 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
842 sig->cputime_expires.prof_exp =
843 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
844 sig->cputimer.running = 1; 846 sig->cputimer.running = 1;
845 } 847 }
846 848
@@ -857,7 +859,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
857 if (clone_flags & CLONE_THREAD) 859 if (clone_flags & CLONE_THREAD)
858 return 0; 860 return 0;
859 861
860 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 862 sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
861 tsk->signal = sig; 863 tsk->signal = sig;
862 if (!sig) 864 if (!sig)
863 return -ENOMEM; 865 return -ENOMEM;
@@ -865,46 +867,21 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
865 atomic_set(&sig->count, 1); 867 atomic_set(&sig->count, 1);
866 atomic_set(&sig->live, 1); 868 atomic_set(&sig->live, 1);
867 init_waitqueue_head(&sig->wait_chldexit); 869 init_waitqueue_head(&sig->wait_chldexit);
868 sig->flags = 0;
869 if (clone_flags & CLONE_NEWPID) 870 if (clone_flags & CLONE_NEWPID)
870 sig->flags |= SIGNAL_UNKILLABLE; 871 sig->flags |= SIGNAL_UNKILLABLE;
871 sig->group_exit_code = 0;
872 sig->group_exit_task = NULL;
873 sig->group_stop_count = 0;
874 sig->curr_target = tsk; 872 sig->curr_target = tsk;
875 init_sigpending(&sig->shared_pending); 873 init_sigpending(&sig->shared_pending);
876 INIT_LIST_HEAD(&sig->posix_timers); 874 INIT_LIST_HEAD(&sig->posix_timers);
877 875
878 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 876 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
879 sig->it_real_incr.tv64 = 0;
880 sig->real_timer.function = it_real_fn; 877 sig->real_timer.function = it_real_fn;
881 878
882 sig->leader = 0; /* session leadership doesn't inherit */
883 sig->tty_old_pgrp = NULL;
884 sig->tty = NULL;
885
886 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
887 sig->gtime = cputime_zero;
888 sig->cgtime = cputime_zero;
889#ifndef CONFIG_VIRT_CPU_ACCOUNTING
890 sig->prev_utime = sig->prev_stime = cputime_zero;
891#endif
892 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
893 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
894 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
895 sig->maxrss = sig->cmaxrss = 0;
896 task_io_accounting_init(&sig->ioac);
897 sig->sum_sched_runtime = 0;
898 taskstats_tgid_init(sig);
899
900 task_lock(current->group_leader); 879 task_lock(current->group_leader);
901 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 880 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
902 task_unlock(current->group_leader); 881 task_unlock(current->group_leader);
903 882
904 posix_cpu_timers_init_group(sig); 883 posix_cpu_timers_init_group(sig);
905 884
906 acct_init_pacct(&sig->pacct);
907
908 tty_audit_fork(sig); 885 tty_audit_fork(sig);
909 886
910 sig->oom_adj = current->signal->oom_adj; 887 sig->oom_adj = current->signal->oom_adj;
@@ -1033,7 +1010,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1033#endif 1010#endif
1034 retval = -EAGAIN; 1011 retval = -EAGAIN;
1035 if (atomic_read(&p->real_cred->user->processes) >= 1012 if (atomic_read(&p->real_cred->user->processes) >=
1036 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 1013 task_rlimit(p, RLIMIT_NPROC)) {
1037 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1014 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
1038 p->real_cred->user != INIT_USER) 1015 p->real_cred->user != INIT_USER)
1039 goto bad_fork_free; 1016 goto bad_fork_free;
@@ -1241,21 +1218,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1241 /* Need tasklist lock for parent etc handling! */ 1218 /* Need tasklist lock for parent etc handling! */
1242 write_lock_irq(&tasklist_lock); 1219 write_lock_irq(&tasklist_lock);
1243 1220
1244 /*
1245 * The task hasn't been attached yet, so its cpus_allowed mask will
1246 * not be changed, nor will its assigned CPU.
1247 *
1248 * The cpus_allowed mask of the parent may have changed after it was
1249 * copied first time - so re-copy it here, then check the child's CPU
1250 * to ensure it is on a valid CPU (and if not, just force it back to
1251 * parent's CPU). This avoids alot of nasty races.
1252 */
1253 p->cpus_allowed = current->cpus_allowed;
1254 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1255 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1256 !cpu_online(task_cpu(p))))
1257 set_task_cpu(p, smp_processor_id());
1258
1259 /* CLONE_PARENT re-uses the old parent */ 1221 /* CLONE_PARENT re-uses the old parent */
1260 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { 1222 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
1261 p->real_parent = current->real_parent; 1223 p->real_parent = current->real_parent;
diff --git a/kernel/futex.c b/kernel/futex.c
index 8e3c3ffe1b9a..e7a35f1039e7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key)
203 * @uaddr: virtual address of the futex 203 * @uaddr: virtual address of the futex
204 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 204 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
205 * @key: address where result is stored. 205 * @key: address where result is stored.
206 * @rw: mapping needs to be read/write (values: VERIFY_READ,
207 * VERIFY_WRITE)
208 * 206 *
209 * Returns a negative error code or 0 207 * Returns a negative error code or 0
210 * The key words are stored in *key on success. 208 * The key words are stored in *key on success.
@@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key)
216 * lock_page() might sleep, the caller should not hold a spinlock. 214 * lock_page() might sleep, the caller should not hold a spinlock.
217 */ 215 */
218static int 216static int
219get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) 217get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
220{ 218{
221 unsigned long address = (unsigned long)uaddr; 219 unsigned long address = (unsigned long)uaddr;
222 struct mm_struct *mm = current->mm; 220 struct mm_struct *mm = current->mm;
@@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
239 * but access_ok() should be faster than find_vma() 237 * but access_ok() should be faster than find_vma()
240 */ 238 */
241 if (!fshared) { 239 if (!fshared) {
242 if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) 240 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
243 return -EFAULT; 241 return -EFAULT;
244 key->private.mm = mm; 242 key->private.mm = mm;
245 key->private.address = address; 243 key->private.address = address;
@@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
248 } 246 }
249 247
250again: 248again:
251 err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page); 249 err = get_user_pages_fast(address, 1, 1, &page);
252 if (err < 0) 250 if (err < 0)
253 return err; 251 return err;
254 252
@@ -532,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
532 return -EINVAL; 530 return -EINVAL;
533 531
534 WARN_ON(!atomic_read(&pi_state->refcount)); 532 WARN_ON(!atomic_read(&pi_state->refcount));
535 WARN_ON(pid && pi_state->owner && 533
536 pi_state->owner->pid != pid); 534 /*
535 * When pi_state->owner is NULL then the owner died
536 * and another waiter is on the fly. pi_state->owner
537 * is fixed up by the task which acquires
538 * pi_state->rt_mutex.
539 *
540 * We do not check for pid == 0 which can happen when
541 * the owner died and robust_list_exit() cleared the
542 * TID.
543 */
544 if (pid && pi_state->owner) {
545 /*
546 * Bail out if user space manipulated the
547 * futex value.
548 */
549 if (pid != task_pid_vnr(pi_state->owner))
550 return -EINVAL;
551 }
537 552
538 atomic_inc(&pi_state->refcount); 553 atomic_inc(&pi_state->refcount);
539 *ps = pi_state; 554 *ps = pi_state;
@@ -760,6 +775,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
760 if (!pi_state) 775 if (!pi_state)
761 return -EINVAL; 776 return -EINVAL;
762 777
778 /*
779 * If current does not own the pi_state then the futex is
780 * inconsistent and user space fiddled with the futex value.
781 */
782 if (pi_state->owner != current)
783 return -EINVAL;
784
763 raw_spin_lock(&pi_state->pi_mutex.wait_lock); 785 raw_spin_lock(&pi_state->pi_mutex.wait_lock);
764 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 786 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
765 787
@@ -867,7 +889,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
867 if (!bitset) 889 if (!bitset)
868 return -EINVAL; 890 return -EINVAL;
869 891
870 ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ); 892 ret = get_futex_key(uaddr, fshared, &key);
871 if (unlikely(ret != 0)) 893 if (unlikely(ret != 0))
872 goto out; 894 goto out;
873 895
@@ -913,10 +935,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
913 int ret, op_ret; 935 int ret, op_ret;
914 936
915retry: 937retry:
916 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 938 ret = get_futex_key(uaddr1, fshared, &key1);
917 if (unlikely(ret != 0)) 939 if (unlikely(ret != 0))
918 goto out; 940 goto out;
919 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 941 ret = get_futex_key(uaddr2, fshared, &key2);
920 if (unlikely(ret != 0)) 942 if (unlikely(ret != 0))
921 goto out_put_key1; 943 goto out_put_key1;
922 944
@@ -1175,11 +1197,10 @@ retry:
1175 pi_state = NULL; 1197 pi_state = NULL;
1176 } 1198 }
1177 1199
1178 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 1200 ret = get_futex_key(uaddr1, fshared, &key1);
1179 if (unlikely(ret != 0)) 1201 if (unlikely(ret != 0))
1180 goto out; 1202 goto out;
1181 ret = get_futex_key(uaddr2, fshared, &key2, 1203 ret = get_futex_key(uaddr2, fshared, &key2);
1182 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1183 if (unlikely(ret != 0)) 1204 if (unlikely(ret != 0))
1184 goto out_put_key1; 1205 goto out_put_key1;
1185 1206
@@ -1738,7 +1759,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1738 */ 1759 */
1739retry: 1760retry:
1740 q->key = FUTEX_KEY_INIT; 1761 q->key = FUTEX_KEY_INIT;
1741 ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); 1762 ret = get_futex_key(uaddr, fshared, &q->key);
1742 if (unlikely(ret != 0)) 1763 if (unlikely(ret != 0))
1743 return ret; 1764 return ret;
1744 1765
@@ -1904,7 +1925,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1904 q.requeue_pi_key = NULL; 1925 q.requeue_pi_key = NULL;
1905retry: 1926retry:
1906 q.key = FUTEX_KEY_INIT; 1927 q.key = FUTEX_KEY_INIT;
1907 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); 1928 ret = get_futex_key(uaddr, fshared, &q.key);
1908 if (unlikely(ret != 0)) 1929 if (unlikely(ret != 0))
1909 goto out; 1930 goto out;
1910 1931
@@ -1974,7 +1995,7 @@ retry_private:
1974 /* Unqueue and drop the lock */ 1995 /* Unqueue and drop the lock */
1975 unqueue_me_pi(&q); 1996 unqueue_me_pi(&q);
1976 1997
1977 goto out; 1998 goto out_put_key;
1978 1999
1979out_unlock_put_key: 2000out_unlock_put_key:
1980 queue_unlock(&q, hb); 2001 queue_unlock(&q, hb);
@@ -2023,7 +2044,7 @@ retry:
2023 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2044 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
2024 return -EPERM; 2045 return -EPERM;
2025 2046
2026 ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE); 2047 ret = get_futex_key(uaddr, fshared, &key);
2027 if (unlikely(ret != 0)) 2048 if (unlikely(ret != 0))
2028 goto out; 2049 goto out;
2029 2050
@@ -2215,7 +2236,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2215 rt_waiter.task = NULL; 2236 rt_waiter.task = NULL;
2216 2237
2217 key2 = FUTEX_KEY_INIT; 2238 key2 = FUTEX_KEY_INIT;
2218 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 2239 ret = get_futex_key(uaddr2, fshared, &key2);
2219 if (unlikely(ret != 0)) 2240 if (unlikely(ret != 0))
2220 goto out; 2241 goto out;
2221 2242
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 235716556bf1..d49afb2395e5 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -146,7 +146,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
146 struct task_struct *p; 146 struct task_struct *p;
147 147
148 ret = -ESRCH; 148 ret = -ESRCH;
149 read_lock(&tasklist_lock); 149 rcu_read_lock();
150 p = find_task_by_vpid(pid); 150 p = find_task_by_vpid(pid);
151 if (!p) 151 if (!p)
152 goto err_unlock; 152 goto err_unlock;
@@ -157,7 +157,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
157 !capable(CAP_SYS_PTRACE)) 157 !capable(CAP_SYS_PTRACE))
158 goto err_unlock; 158 goto err_unlock;
159 head = p->compat_robust_list; 159 head = p->compat_robust_list;
160 read_unlock(&tasklist_lock); 160 rcu_read_unlock();
161 } 161 }
162 162
163 if (put_user(sizeof(*head), len_ptr)) 163 if (put_user(sizeof(*head), len_ptr))
@@ -165,7 +165,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
165 return put_user(ptr_to_compat(head), head_ptr); 165 return put_user(ptr_to_compat(head), head_ptr);
166 166
167err_unlock: 167err_unlock:
168 read_unlock(&tasklist_lock); 168 rcu_read_unlock();
169 169
170 return ret; 170 return ret;
171} 171}
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index dbcbf6a33a08..03808ed342a6 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,6 +40,7 @@
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/cpu.h>
43#include <linux/smp.h> 44#include <linux/smp.h>
44 45
45#include <linux/hw_breakpoint.h> 46#include <linux/hw_breakpoint.h>
@@ -242,38 +243,70 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
242 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) 243 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
243 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM 244 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
244 */ 245 */
245int reserve_bp_slot(struct perf_event *bp) 246static int __reserve_bp_slot(struct perf_event *bp)
246{ 247{
247 struct bp_busy_slots slots = {0}; 248 struct bp_busy_slots slots = {0};
248 int ret = 0;
249
250 mutex_lock(&nr_bp_mutex);
251 249
252 fetch_bp_busy_slots(&slots, bp); 250 fetch_bp_busy_slots(&slots, bp);
253 251
254 /* Flexible counters need to keep at least one slot */ 252 /* Flexible counters need to keep at least one slot */
255 if (slots.pinned + (!!slots.flexible) == HBP_NUM) { 253 if (slots.pinned + (!!slots.flexible) == HBP_NUM)
256 ret = -ENOSPC; 254 return -ENOSPC;
257 goto end;
258 }
259 255
260 toggle_bp_slot(bp, true); 256 toggle_bp_slot(bp, true);
261 257
262end: 258 return 0;
259}
260
261int reserve_bp_slot(struct perf_event *bp)
262{
263 int ret;
264
265 mutex_lock(&nr_bp_mutex);
266
267 ret = __reserve_bp_slot(bp);
268
263 mutex_unlock(&nr_bp_mutex); 269 mutex_unlock(&nr_bp_mutex);
264 270
265 return ret; 271 return ret;
266} 272}
267 273
274static void __release_bp_slot(struct perf_event *bp)
275{
276 toggle_bp_slot(bp, false);
277}
278
268void release_bp_slot(struct perf_event *bp) 279void release_bp_slot(struct perf_event *bp)
269{ 280{
270 mutex_lock(&nr_bp_mutex); 281 mutex_lock(&nr_bp_mutex);
271 282
272 toggle_bp_slot(bp, false); 283 __release_bp_slot(bp);
273 284
274 mutex_unlock(&nr_bp_mutex); 285 mutex_unlock(&nr_bp_mutex);
275} 286}
276 287
288/*
289 * Allow the kernel debugger to reserve breakpoint slots without
290 * taking a lock using the dbg_* variant of for the reserve and
291 * release breakpoint slots.
292 */
293int dbg_reserve_bp_slot(struct perf_event *bp)
294{
295 if (mutex_is_locked(&nr_bp_mutex))
296 return -1;
297
298 return __reserve_bp_slot(bp);
299}
300
301int dbg_release_bp_slot(struct perf_event *bp)
302{
303 if (mutex_is_locked(&nr_bp_mutex))
304 return -1;
305
306 __release_bp_slot(bp);
307
308 return 0;
309}
277 310
278int register_perf_hw_breakpoint(struct perf_event *bp) 311int register_perf_hw_breakpoint(struct perf_event *bp)
279{ 312{
@@ -295,6 +328,10 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
295 if (!bp->attr.disabled || !bp->overflow_handler) 328 if (!bp->attr.disabled || !bp->overflow_handler)
296 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); 329 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
297 330
331 /* if arch_validate_hwbkpt_settings() fails then release bp slot */
332 if (ret)
333 release_bp_slot(bp);
334
298 return ret; 335 return ret;
299} 336}
300 337
@@ -323,8 +360,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
323int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) 360int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
324{ 361{
325 u64 old_addr = bp->attr.bp_addr; 362 u64 old_addr = bp->attr.bp_addr;
363 u64 old_len = bp->attr.bp_len;
326 int old_type = bp->attr.bp_type; 364 int old_type = bp->attr.bp_type;
327 int old_len = bp->attr.bp_len;
328 int err = 0; 365 int err = 0;
329 366
330 perf_event_disable(bp); 367 perf_event_disable(bp);
@@ -376,19 +413,20 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
376 * 413 *
377 * @return a set of per_cpu pointers to perf events 414 * @return a set of per_cpu pointers to perf events
378 */ 415 */
379struct perf_event ** 416struct perf_event * __percpu *
380register_wide_hw_breakpoint(struct perf_event_attr *attr, 417register_wide_hw_breakpoint(struct perf_event_attr *attr,
381 perf_overflow_handler_t triggered) 418 perf_overflow_handler_t triggered)
382{ 419{
383 struct perf_event **cpu_events, **pevent, *bp; 420 struct perf_event * __percpu *cpu_events, **pevent, *bp;
384 long err; 421 long err;
385 int cpu; 422 int cpu;
386 423
387 cpu_events = alloc_percpu(typeof(*cpu_events)); 424 cpu_events = alloc_percpu(typeof(*cpu_events));
388 if (!cpu_events) 425 if (!cpu_events)
389 return ERR_PTR(-ENOMEM); 426 return (void __percpu __force *)ERR_PTR(-ENOMEM);
390 427
391 for_each_possible_cpu(cpu) { 428 get_online_cpus();
429 for_each_online_cpu(cpu) {
392 pevent = per_cpu_ptr(cpu_events, cpu); 430 pevent = per_cpu_ptr(cpu_events, cpu);
393 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); 431 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
394 432
@@ -399,19 +437,21 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
399 goto fail; 437 goto fail;
400 } 438 }
401 } 439 }
440 put_online_cpus();
402 441
403 return cpu_events; 442 return cpu_events;
404 443
405fail: 444fail:
406 for_each_possible_cpu(cpu) { 445 for_each_online_cpu(cpu) {
407 pevent = per_cpu_ptr(cpu_events, cpu); 446 pevent = per_cpu_ptr(cpu_events, cpu);
408 if (IS_ERR(*pevent)) 447 if (IS_ERR(*pevent))
409 break; 448 break;
410 unregister_hw_breakpoint(*pevent); 449 unregister_hw_breakpoint(*pevent);
411 } 450 }
451 put_online_cpus();
452
412 free_percpu(cpu_events); 453 free_percpu(cpu_events);
413 /* return the error if any */ 454 return (void __percpu __force *)ERR_PTR(err);
414 return ERR_PTR(err);
415} 455}
416EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); 456EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
417 457
@@ -419,7 +459,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
419 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel 459 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
420 * @cpu_events: the per cpu set of events to unregister 460 * @cpu_events: the per cpu set of events to unregister
421 */ 461 */
422void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) 462void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
423{ 463{
424 int cpu; 464 int cpu;
425 struct perf_event **pevent; 465 struct perf_event **pevent;
@@ -449,5 +489,4 @@ struct pmu perf_ops_bp = {
449 .enable = arch_install_hw_breakpoint, 489 .enable = arch_install_hw_breakpoint,
450 .disable = arch_uninstall_hw_breakpoint, 490 .disable = arch_uninstall_hw_breakpoint,
451 .read = hw_breakpoint_pmu_read, 491 .read = hw_breakpoint_pmu_read,
452 .unthrottle = hw_breakpoint_pmu_unthrottle
453}; 492};
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ecc3fa28f666..42ec11b2af8a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,11 +18,7 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22 * dynamic_irq_init - initialize a dynamically allocated irq
23 * @irq: irq number to initialize
24 */
25void dynamic_irq_init(unsigned int irq)
26{ 22{
27 struct irq_desc *desc; 23 struct irq_desc *desc;
28 unsigned long flags; 24 unsigned long flags;
@@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq)
41 desc->depth = 1; 37 desc->depth = 1;
42 desc->msi_desc = NULL; 38 desc->msi_desc = NULL;
43 desc->handler_data = NULL; 39 desc->handler_data = NULL;
44 desc->chip_data = NULL; 40 if (!keep_chip_data)
41 desc->chip_data = NULL;
45 desc->action = NULL; 42 desc->action = NULL;
46 desc->irq_count = 0; 43 desc->irq_count = 0;
47 desc->irqs_unhandled = 0; 44 desc->irqs_unhandled = 0;
@@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq)
55} 52}
56 53
57/** 54/**
58 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 55 * dynamic_irq_init - initialize a dynamically allocated irq
59 * @irq: irq number to initialize 56 * @irq: irq number to initialize
60 */ 57 */
61void dynamic_irq_cleanup(unsigned int irq) 58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
62{ 75{
63 struct irq_desc *desc = irq_to_desc(irq); 76 struct irq_desc *desc = irq_to_desc(irq);
64 unsigned long flags; 77 unsigned long flags;
@@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq)
77 } 90 }
78 desc->msi_desc = NULL; 91 desc->msi_desc = NULL;
79 desc->handler_data = NULL; 92 desc->handler_data = NULL;
80 desc->chip_data = NULL; 93 if (!keep_chip_data)
94 desc->chip_data = NULL;
81 desc->handle_irq = handle_bad_irq; 95 desc->handle_irq = handle_bad_irq;
82 desc->chip = &no_irq_chip; 96 desc->chip = &no_irq_chip;
83 desc->name = NULL; 97 desc->name = NULL;
@@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq)
85 raw_spin_unlock_irqrestore(&desc->lock, flags); 99 raw_spin_unlock_irqrestore(&desc->lock, flags);
86} 100}
87 101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to initialize
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to initialize
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
120}
121
88 122
89/** 123/**
90 * set_irq_chip - set the irq chip for an irq 124 * set_irq_chip - set the irq chip for an irq
@@ -520,7 +554,7 @@ out:
520 * signal. The occurence is latched into the irq controller hardware 554 * signal. The occurence is latched into the irq controller hardware
521 * and must be acked in order to be reenabled. After the ack another 555 * and must be acked in order to be reenabled. After the ack another
522 * interrupt can happen on the same source even before the first one 556 * interrupt can happen on the same source even before the first one
523 * is handled by the assosiacted event handler. If this happens it 557 * is handled by the associated event handler. If this happens it
524 * might be necessary to disable (mask) the interrupt depending on the 558 * might be necessary to disable (mask) the interrupt depending on the
525 * controller hardware. This requires to reenable the interrupt inside 559 * controller hardware. This requires to reenable the interrupt inside
526 * of the loop which handles the interrupts which have arrived while 560 * of the loop which handles the interrupts which have arrived while
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index d06df9c41cba..1ef4ffcdfa55 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
42 * automatically freed on driver detach. 42 * automatically freed on driver detach.
43 * 43 *
44 * If an IRQ allocated with this function needs to be freed 44 * If an IRQ allocated with this function needs to be freed
45 * separately, dev_free_irq() must be used. 45 * separately, devm_free_irq() must be used.
46 */ 46 */
47int devm_request_threaded_irq(struct device *dev, unsigned int irq, 47int devm_request_threaded_irq(struct device *dev, unsigned int irq,
48 irq_handler_t handler, irq_handler_t thread_fn, 48 irq_handler_t handler, irq_handler_t thread_fn,
@@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
81 * Except for the extra @dev argument, this function takes the 81 * Except for the extra @dev argument, this function takes the
82 * same arguments and performs the same function as free_irq(). 82 * same arguments and performs the same function as free_irq().
83 * This function instead of free_irq() should be used to manually 83 * This function instead of free_irq() should be used to manually
84 * free IRQs allocated with dev_request_irq(). 84 * free IRQs allocated with devm_request_irq().
85 */ 85 */
86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) 86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
87{ 87{
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 814940e7f485..76d5a671bfe1 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -19,7 +19,7 @@
19#include <linux/kernel_stat.h> 19#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 20#include <linux/rculist.h>
21#include <linux/hash.h> 21#include <linux/hash.h>
22#include <linux/bootmem.h> 22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 23#include <trace/events/irq.h>
24 24
25#include "internals.h" 25#include "internals.h"
@@ -87,12 +87,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{ 87{
88 void *ptr; 88 void *ptr;
89 89
90 if (slab_is_available()) 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), 91 GFP_ATOMIC, node);
92 GFP_ATOMIC, node);
93 else
94 ptr = alloc_bootmem_node(NODE_DATA(node),
95 nr * sizeof(*desc->kstat_irqs));
96 92
97 /* 93 /*
98 * don't overwite if can not get new one 94 * don't overwite if can not get new one
@@ -132,7 +128,26 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
132 */ 128 */
133DEFINE_RAW_SPINLOCK(sparse_irq_lock); 129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
134 130
135struct irq_desc **irq_desc_ptrs __read_mostly; 131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
136 151
137static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { 152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
138 [0 ... NR_IRQS_LEGACY-1] = { 153 [0 ... NR_IRQS_LEGACY-1] = {
@@ -164,9 +179,6 @@ int __init early_irq_init(void)
164 legacy_count = ARRAY_SIZE(irq_desc_legacy); 179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
165 node = first_online_node; 180 node = first_online_node;
166 181
167 /* allocate irq_desc_ptrs array based on nr_irqs */
168 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
169
170 /* allocate based on nr_cpu_ids */ 182 /* allocate based on nr_cpu_ids */
171 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * 183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
172 sizeof(int), GFP_NOWAIT, node); 184 sizeof(int), GFP_NOWAIT, node);
@@ -180,23 +192,12 @@ int __init early_irq_init(void)
180 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
181 alloc_desc_masks(&desc[i], node, true); 193 alloc_desc_masks(&desc[i], node, true);
182 init_desc_masks(&desc[i]); 194 init_desc_masks(&desc[i]);
183 irq_desc_ptrs[i] = desc + i; 195 set_irq_desc(i, &desc[i]);
184 } 196 }
185 197
186 for (i = legacy_count; i < nr_irqs; i++)
187 irq_desc_ptrs[i] = NULL;
188
189 return arch_early_irq_init(); 198 return arch_early_irq_init();
190} 199}
191 200
192struct irq_desc *irq_to_desc(unsigned int irq)
193{
194 if (irq_desc_ptrs && irq < nr_irqs)
195 return irq_desc_ptrs[irq];
196
197 return NULL;
198}
199
200struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) 201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
201{ 202{
202 struct irq_desc *desc; 203 struct irq_desc *desc;
@@ -208,21 +209,18 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
208 return NULL; 209 return NULL;
209 } 210 }
210 211
211 desc = irq_desc_ptrs[irq]; 212 desc = irq_to_desc(irq);
212 if (desc) 213 if (desc)
213 return desc; 214 return desc;
214 215
215 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
216 217
217 /* We have to check it to avoid races with another CPU */ 218 /* We have to check it to avoid races with another CPU */
218 desc = irq_desc_ptrs[irq]; 219 desc = irq_to_desc(irq);
219 if (desc) 220 if (desc)
220 goto out_unlock; 221 goto out_unlock;
221 222
222 if (slab_is_available()) 223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224 else
225 desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
226 224
227 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); 225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
228 if (!desc) { 226 if (!desc) {
@@ -231,7 +229,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
231 } 229 }
232 init_one_irq_desc(irq, desc, node); 230 init_one_irq_desc(irq, desc, node);
233 231
234 irq_desc_ptrs[irq] = desc; 232 set_irq_desc(irq, desc);
235 233
236out_unlock: 234out_unlock:
237 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b2821f070a3d..c63f3bc88f0b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -21,11 +21,7 @@ extern void clear_kstat_irqs(struct irq_desc *desc);
21extern raw_spinlock_t sparse_irq_lock; 21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */ 24void replace_irq_desc(unsigned int irq, struct irq_desc *desc);
25extern struct irq_desc **irq_desc_ptrs;
26#else
27/* irq_desc_ptrs is a fixed size array */
28extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
29#endif 25#endif
30 26
31#ifdef CONFIG_PROC_FS 27#ifdef CONFIG_PROC_FS
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 26bac9d8f860..963559dbd858 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -70,7 +70,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
70 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 70 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
71 71
72 /* We have to check it to avoid races with another CPU */ 72 /* We have to check it to avoid races with another CPU */
73 desc = irq_desc_ptrs[irq]; 73 desc = irq_to_desc(irq);
74 74
75 if (desc && old_desc != desc) 75 if (desc && old_desc != desc)
76 goto out_unlock; 76 goto out_unlock;
@@ -90,7 +90,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
90 goto out_unlock; 90 goto out_unlock;
91 } 91 }
92 92
93 irq_desc_ptrs[irq] = desc; 93 replace_irq_desc(irq, desc);
94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
95 95
96 /* free the old one */ 96 /* free the old one */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a9a93d9ee7a7..87ebe8adc474 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -32,6 +32,7 @@
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h>
35 36
36#include <asm/page.h> 37#include <asm/page.h>
37#include <asm/uaccess.h> 38#include <asm/uaccess.h>
@@ -40,7 +41,7 @@
40#include <asm/sections.h> 41#include <asm/sections.h>
41 42
42/* Per cpu memory for storing cpu states in case of system crash. */ 43/* Per cpu memory for storing cpu states in case of system crash. */
43note_buf_t* crash_notes; 44note_buf_t __percpu *crash_notes;
44 45
45/* vmcoreinfo stuff */ 46/* vmcoreinfo stuff */
46static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
@@ -1074,6 +1075,9 @@ void crash_kexec(struct pt_regs *regs)
1074 if (mutex_trylock(&kexec_mutex)) { 1075 if (mutex_trylock(&kexec_mutex)) {
1075 if (kexec_crash_image) { 1076 if (kexec_crash_image) {
1076 struct pt_regs fixed_regs; 1077 struct pt_regs fixed_regs;
1078
1079 kmsg_dump(KMSG_DUMP_KEXEC);
1080
1077 crash_setup_regs(&fixed_regs, regs); 1081 crash_setup_regs(&fixed_regs, regs);
1078 crash_save_vmcoreinfo(); 1082 crash_save_vmcoreinfo();
1079 machine_crash_shutdown(&fixed_regs); 1083 machine_crash_shutdown(&fixed_regs);
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index e92d519f93b1..35edbe22e9a9 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -28,7 +28,7 @@
28#include <linux/log2.h> 28#include <linux/log2.h>
29#include <linux/uaccess.h> 29#include <linux/uaccess.h>
30 30
31static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer, 31static void _kfifo_init(struct kfifo *fifo, void *buffer,
32 unsigned int size) 32 unsigned int size)
33{ 33{
34 fifo->buffer = buffer; 34 fifo->buffer = buffer;
@@ -41,10 +41,10 @@ static void _kfifo_init(struct kfifo *fifo, unsigned char *buffer,
41 * kfifo_init - initialize a FIFO using a preallocated buffer 41 * kfifo_init - initialize a FIFO using a preallocated buffer
42 * @fifo: the fifo to assign the buffer 42 * @fifo: the fifo to assign the buffer
43 * @buffer: the preallocated buffer to be used. 43 * @buffer: the preallocated buffer to be used.
44 * @size: the size of the internal buffer, this have to be a power of 2. 44 * @size: the size of the internal buffer, this has to be a power of 2.
45 * 45 *
46 */ 46 */
47void kfifo_init(struct kfifo *fifo, unsigned char *buffer, unsigned int size) 47void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size)
48{ 48{
49 /* size must be a power of 2 */ 49 /* size must be a power of 2 */
50 BUG_ON(!is_power_of_2(size)); 50 BUG_ON(!is_power_of_2(size));
@@ -80,7 +80,7 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
80 80
81 buffer = kmalloc(size, gfp_mask); 81 buffer = kmalloc(size, gfp_mask);
82 if (!buffer) { 82 if (!buffer) {
83 _kfifo_init(fifo, 0, 0); 83 _kfifo_init(fifo, NULL, 0);
84 return -ENOMEM; 84 return -ENOMEM;
85 } 85 }
86 86
@@ -97,6 +97,7 @@ EXPORT_SYMBOL(kfifo_alloc);
97void kfifo_free(struct kfifo *fifo) 97void kfifo_free(struct kfifo *fifo)
98{ 98{
99 kfree(fifo->buffer); 99 kfree(fifo->buffer);
100 _kfifo_init(fifo, NULL, 0);
100} 101}
101EXPORT_SYMBOL(kfifo_free); 102EXPORT_SYMBOL(kfifo_free);
102 103
@@ -159,8 +160,9 @@ static inline void __kfifo_out_data(struct kfifo *fifo,
159 memcpy(to + l, fifo->buffer, len - l); 160 memcpy(to + l, fifo->buffer, len - l);
160} 161}
161 162
162static inline unsigned int __kfifo_from_user_data(struct kfifo *fifo, 163static inline int __kfifo_from_user_data(struct kfifo *fifo,
163 const void __user *from, unsigned int len, unsigned int off) 164 const void __user *from, unsigned int len, unsigned int off,
165 unsigned *lenout)
164{ 166{
165 unsigned int l; 167 unsigned int l;
166 int ret; 168 int ret;
@@ -177,16 +179,20 @@ static inline unsigned int __kfifo_from_user_data(struct kfifo *fifo,
177 /* first put the data starting from fifo->in to buffer end */ 179 /* first put the data starting from fifo->in to buffer end */
178 l = min(len, fifo->size - off); 180 l = min(len, fifo->size - off);
179 ret = copy_from_user(fifo->buffer + off, from, l); 181 ret = copy_from_user(fifo->buffer + off, from, l);
180 182 if (unlikely(ret)) {
181 if (unlikely(ret)) 183 *lenout = ret;
182 return ret + len - l; 184 return -EFAULT;
185 }
186 *lenout = l;
183 187
184 /* then put the rest (if any) at the beginning of the buffer */ 188 /* then put the rest (if any) at the beginning of the buffer */
185 return copy_from_user(fifo->buffer, from + l, len - l); 189 ret = copy_from_user(fifo->buffer, from + l, len - l);
190 *lenout += ret ? ret : len - l;
191 return ret ? -EFAULT : 0;
186} 192}
187 193
188static inline unsigned int __kfifo_to_user_data(struct kfifo *fifo, 194static inline int __kfifo_to_user_data(struct kfifo *fifo,
189 void __user *to, unsigned int len, unsigned int off) 195 void __user *to, unsigned int len, unsigned int off, unsigned *lenout)
190{ 196{
191 unsigned int l; 197 unsigned int l;
192 int ret; 198 int ret;
@@ -203,12 +209,21 @@ static inline unsigned int __kfifo_to_user_data(struct kfifo *fifo,
203 /* first get the data from fifo->out until the end of the buffer */ 209 /* first get the data from fifo->out until the end of the buffer */
204 l = min(len, fifo->size - off); 210 l = min(len, fifo->size - off);
205 ret = copy_to_user(to, fifo->buffer + off, l); 211 ret = copy_to_user(to, fifo->buffer + off, l);
206 212 *lenout = l;
207 if (unlikely(ret)) 213 if (unlikely(ret)) {
208 return ret + len - l; 214 *lenout -= ret;
215 return -EFAULT;
216 }
209 217
210 /* then get the rest (if any) from the beginning of the buffer */ 218 /* then get the rest (if any) from the beginning of the buffer */
211 return copy_to_user(to + l, fifo->buffer, len - l); 219 len -= l;
220 ret = copy_to_user(to + l, fifo->buffer, len);
221 if (unlikely(ret)) {
222 *lenout += len - ret;
223 return -EFAULT;
224 }
225 *lenout += len;
226 return 0;
212} 227}
213 228
214unsigned int __kfifo_in_n(struct kfifo *fifo, 229unsigned int __kfifo_in_n(struct kfifo *fifo,
@@ -235,7 +250,7 @@ EXPORT_SYMBOL(__kfifo_in_n);
235 * Note that with only one concurrent reader and one concurrent 250 * Note that with only one concurrent reader and one concurrent
236 * writer, you don't need extra locking to use these functions. 251 * writer, you don't need extra locking to use these functions.
237 */ 252 */
238unsigned int kfifo_in(struct kfifo *fifo, const unsigned char *from, 253unsigned int kfifo_in(struct kfifo *fifo, const void *from,
239 unsigned int len) 254 unsigned int len)
240{ 255{
241 len = min(kfifo_avail(fifo), len); 256 len = min(kfifo_avail(fifo), len);
@@ -277,7 +292,7 @@ EXPORT_SYMBOL(__kfifo_out_n);
277 * Note that with only one concurrent reader and one concurrent 292 * Note that with only one concurrent reader and one concurrent
278 * writer, you don't need extra locking to use these functions. 293 * writer, you don't need extra locking to use these functions.
279 */ 294 */
280unsigned int kfifo_out(struct kfifo *fifo, unsigned char *to, unsigned int len) 295unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len)
281{ 296{
282 len = min(kfifo_len(fifo), len); 297 len = min(kfifo_len(fifo), len);
283 298
@@ -288,6 +303,27 @@ unsigned int kfifo_out(struct kfifo *fifo, unsigned char *to, unsigned int len)
288} 303}
289EXPORT_SYMBOL(kfifo_out); 304EXPORT_SYMBOL(kfifo_out);
290 305
306/**
307 * kfifo_out_peek - copy some data from the FIFO, but do not remove it
308 * @fifo: the fifo to be used.
309 * @to: where the data must be copied.
310 * @len: the size of the destination buffer.
311 * @offset: offset into the fifo
312 *
313 * This function copies at most @len bytes at @offset from the FIFO
314 * into the @to buffer and returns the number of copied bytes.
315 * The data is not removed from the FIFO.
316 */
317unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len,
318 unsigned offset)
319{
320 len = min(kfifo_len(fifo), len + offset);
321
322 __kfifo_out_data(fifo, to, len, offset);
323 return len;
324}
325EXPORT_SYMBOL(kfifo_out_peek);
326
291unsigned int __kfifo_out_generic(struct kfifo *fifo, 327unsigned int __kfifo_out_generic(struct kfifo *fifo,
292 void *to, unsigned int len, unsigned int recsize, 328 void *to, unsigned int len, unsigned int recsize,
293 unsigned int *total) 329 unsigned int *total)
@@ -299,10 +335,13 @@ EXPORT_SYMBOL(__kfifo_out_generic);
299unsigned int __kfifo_from_user_n(struct kfifo *fifo, 335unsigned int __kfifo_from_user_n(struct kfifo *fifo,
300 const void __user *from, unsigned int len, unsigned int recsize) 336 const void __user *from, unsigned int len, unsigned int recsize)
301{ 337{
338 unsigned total;
339
302 if (kfifo_avail(fifo) < len + recsize) 340 if (kfifo_avail(fifo) < len + recsize)
303 return len + 1; 341 return len + 1;
304 342
305 return __kfifo_from_user_data(fifo, from, len, recsize); 343 __kfifo_from_user_data(fifo, from, len, recsize, &total);
344 return total;
306} 345}
307EXPORT_SYMBOL(__kfifo_from_user_n); 346EXPORT_SYMBOL(__kfifo_from_user_n);
308 347
@@ -311,20 +350,24 @@ EXPORT_SYMBOL(__kfifo_from_user_n);
311 * @fifo: the fifo to be used. 350 * @fifo: the fifo to be used.
312 * @from: pointer to the data to be added. 351 * @from: pointer to the data to be added.
313 * @len: the length of the data to be added. 352 * @len: the length of the data to be added.
353 * @total: the actual returned data length.
314 * 354 *
315 * This function copies at most @len bytes from the @from into the 355 * This function copies at most @len bytes from the @from into the
316 * FIFO depending and returns the number of copied bytes. 356 * FIFO depending and returns -EFAULT/0.
317 * 357 *
318 * Note that with only one concurrent reader and one concurrent 358 * Note that with only one concurrent reader and one concurrent
319 * writer, you don't need extra locking to use these functions. 359 * writer, you don't need extra locking to use these functions.
320 */ 360 */
321unsigned int kfifo_from_user(struct kfifo *fifo, 361int kfifo_from_user(struct kfifo *fifo,
322 const void __user *from, unsigned int len) 362 const void __user *from, unsigned int len, unsigned *total)
323{ 363{
364 int ret;
324 len = min(kfifo_avail(fifo), len); 365 len = min(kfifo_avail(fifo), len);
325 len -= __kfifo_from_user_data(fifo, from, len, 0); 366 ret = __kfifo_from_user_data(fifo, from, len, 0, total);
367 if (ret)
368 return ret;
326 __kfifo_add_in(fifo, len); 369 __kfifo_add_in(fifo, len);
327 return len; 370 return 0;
328} 371}
329EXPORT_SYMBOL(kfifo_from_user); 372EXPORT_SYMBOL(kfifo_from_user);
330 373
@@ -339,17 +382,17 @@ unsigned int __kfifo_to_user_n(struct kfifo *fifo,
339 void __user *to, unsigned int len, unsigned int reclen, 382 void __user *to, unsigned int len, unsigned int reclen,
340 unsigned int recsize) 383 unsigned int recsize)
341{ 384{
342 unsigned int ret; 385 unsigned int ret, total;
343 386
344 if (kfifo_len(fifo) < reclen + recsize) 387 if (kfifo_len(fifo) < reclen + recsize)
345 return len; 388 return len;
346 389
347 ret = __kfifo_to_user_data(fifo, to, reclen, recsize); 390 ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total);
348 391
349 if (likely(ret == 0)) 392 if (likely(ret == 0))
350 __kfifo_add_out(fifo, reclen + recsize); 393 __kfifo_add_out(fifo, reclen + recsize);
351 394
352 return ret; 395 return total;
353} 396}
354EXPORT_SYMBOL(__kfifo_to_user_n); 397EXPORT_SYMBOL(__kfifo_to_user_n);
355 398
@@ -358,20 +401,22 @@ EXPORT_SYMBOL(__kfifo_to_user_n);
358 * @fifo: the fifo to be used. 401 * @fifo: the fifo to be used.
359 * @to: where the data must be copied. 402 * @to: where the data must be copied.
360 * @len: the size of the destination buffer. 403 * @len: the size of the destination buffer.
404 * @lenout: pointer to output variable with copied data
361 * 405 *
362 * This function copies at most @len bytes from the FIFO into the 406 * This function copies at most @len bytes from the FIFO into the
363 * @to buffer and returns the number of copied bytes. 407 * @to buffer and 0 or -EFAULT.
364 * 408 *
365 * Note that with only one concurrent reader and one concurrent 409 * Note that with only one concurrent reader and one concurrent
366 * writer, you don't need extra locking to use these functions. 410 * writer, you don't need extra locking to use these functions.
367 */ 411 */
368unsigned int kfifo_to_user(struct kfifo *fifo, 412int kfifo_to_user(struct kfifo *fifo,
369 void __user *to, unsigned int len) 413 void __user *to, unsigned int len, unsigned *lenout)
370{ 414{
415 int ret;
371 len = min(kfifo_len(fifo), len); 416 len = min(kfifo_len(fifo), len);
372 len -= __kfifo_to_user_data(fifo, to, len, 0); 417 ret = __kfifo_to_user_data(fifo, to, len, 0, lenout);
373 __kfifo_add_out(fifo, len); 418 __kfifo_add_out(fifo, *lenout);
374 return len; 419 return ret;
375} 420}
376EXPORT_SYMBOL(kfifo_to_user); 421EXPORT_SYMBOL(kfifo_to_user);
377 422
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 2eb517e23514..761fdd2b3034 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -583,6 +583,9 @@ static void kgdb_wait(struct pt_regs *regs)
583 smp_wmb(); 583 smp_wmb();
584 atomic_set(&cpu_in_kgdb[cpu], 1); 584 atomic_set(&cpu_in_kgdb[cpu], 1);
585 585
586 /* Disable any cpu specific hw breakpoints */
587 kgdb_disable_hw_debug(regs);
588
586 /* Wait till primary CPU is done with debugging */ 589 /* Wait till primary CPU is done with debugging */
587 while (atomic_read(&passive_cpu_wait[cpu])) 590 while (atomic_read(&passive_cpu_wait[cpu]))
588 cpu_relax(); 591 cpu_relax();
@@ -596,7 +599,7 @@ static void kgdb_wait(struct pt_regs *regs)
596 599
597 /* Signal the primary CPU that we are done: */ 600 /* Signal the primary CPU that we are done: */
598 atomic_set(&cpu_in_kgdb[cpu], 0); 601 atomic_set(&cpu_in_kgdb[cpu], 0);
599 touch_softlockup_watchdog(); 602 touch_softlockup_watchdog_sync();
600 clocksource_touch_watchdog(); 603 clocksource_touch_watchdog();
601 local_irq_restore(flags); 604 local_irq_restore(flags);
602} 605}
@@ -1450,7 +1453,7 @@ acquirelock:
1450 (kgdb_info[cpu].task && 1453 (kgdb_info[cpu].task &&
1451 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { 1454 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1452 atomic_set(&kgdb_active, -1); 1455 atomic_set(&kgdb_active, -1);
1453 touch_softlockup_watchdog(); 1456 touch_softlockup_watchdog_sync();
1454 clocksource_touch_watchdog(); 1457 clocksource_touch_watchdog();
1455 local_irq_restore(flags); 1458 local_irq_restore(flags);
1456 1459
@@ -1550,7 +1553,7 @@ kgdb_restore:
1550 } 1553 }
1551 /* Free kgdb_active */ 1554 /* Free kgdb_active */
1552 atomic_set(&kgdb_active, -1); 1555 atomic_set(&kgdb_active, -1);
1553 touch_softlockup_watchdog(); 1556 touch_softlockup_watchdog_sync();
1554 clocksource_touch_watchdog(); 1557 clocksource_touch_watchdog();
1555 local_irq_restore(flags); 1558 local_irq_restore(flags);
1556 1559
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 25b103190364..bf0e231d9702 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -520,13 +520,15 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
520 return -ENOMEM; 520 return -ENOMEM;
521 521
522 ret = call_usermodehelper_stdinpipe(sub_info, filp); 522 ret = call_usermodehelper_stdinpipe(sub_info, filp);
523 if (ret < 0) 523 if (ret < 0) {
524 goto out; 524 call_usermodehelper_freeinfo(sub_info);
525 return ret;
526 }
525 527
526 return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); 528 ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
529 if (ret < 0) /* Failed to execute helper, close pipe */
530 filp_close(*filp, NULL);
527 531
528 out:
529 call_usermodehelper_freeinfo(sub_info);
530 return ret; 532 return ret;
531} 533}
532EXPORT_SYMBOL(call_usermodehelper_pipe); 534EXPORT_SYMBOL(call_usermodehelper_pipe);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e5342a344c43..fa034d29cf73 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -42,8 +42,11 @@
42#include <linux/freezer.h> 42#include <linux/freezer.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/sysctl.h>
45#include <linux/kdebug.h> 46#include <linux/kdebug.h>
46#include <linux/memory.h> 47#include <linux/memory.h>
48#include <linux/ftrace.h>
49#include <linux/cpu.h>
47 50
48#include <asm-generic/sections.h> 51#include <asm-generic/sections.h>
49#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
@@ -93,6 +96,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
93 {"native_get_debugreg",}, 96 {"native_get_debugreg",},
94 {"irq_entries_start",}, 97 {"irq_entries_start",},
95 {"common_interrupt",}, 98 {"common_interrupt",},
99 {"mcount",}, /* mcount can be called from everywhere */
96 {NULL} /* Terminator */ 100 {NULL} /* Terminator */
97}; 101};
98 102
@@ -103,81 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
103 * stepping on the instruction on a vmalloced/kmalloced/data page 107 * stepping on the instruction on a vmalloced/kmalloced/data page
104 * is a recipe for disaster 108 * is a recipe for disaster
105 */ 109 */
106#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
107
108struct kprobe_insn_page { 110struct kprobe_insn_page {
109 struct list_head list; 111 struct list_head list;
110 kprobe_opcode_t *insns; /* Page of instruction slots */ 112 kprobe_opcode_t *insns; /* Page of instruction slots */
111 char slot_used[INSNS_PER_PAGE];
112 int nused; 113 int nused;
113 int ngarbage; 114 int ngarbage;
115 char slot_used[];
114}; 116};
115 117
118#define KPROBE_INSN_PAGE_SIZE(slots) \
119 (offsetof(struct kprobe_insn_page, slot_used) + \
120 (sizeof(char) * (slots)))
121
122struct kprobe_insn_cache {
123 struct list_head pages; /* list of kprobe_insn_page */
124 size_t insn_size; /* size of instruction slot */
125 int nr_garbage;
126};
127
128static int slots_per_page(struct kprobe_insn_cache *c)
129{
130 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
131}
132
116enum kprobe_slot_state { 133enum kprobe_slot_state {
117 SLOT_CLEAN = 0, 134 SLOT_CLEAN = 0,
118 SLOT_DIRTY = 1, 135 SLOT_DIRTY = 1,
119 SLOT_USED = 2, 136 SLOT_USED = 2,
120}; 137};
121 138
122static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 139static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */
123static LIST_HEAD(kprobe_insn_pages); 140static struct kprobe_insn_cache kprobe_insn_slots = {
124static int kprobe_garbage_slots; 141 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
125static int collect_garbage_slots(void); 142 .insn_size = MAX_INSN_SIZE,
126 143 .nr_garbage = 0,
127static int __kprobes check_safety(void) 144};
128{ 145static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
129 int ret = 0;
130#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER)
131 ret = freeze_processes();
132 if (ret == 0) {
133 struct task_struct *p, *q;
134 do_each_thread(p, q) {
135 if (p != current && p->state == TASK_RUNNING &&
136 p->pid != 0) {
137 printk("Check failed: %s is running\n",p->comm);
138 ret = -1;
139 goto loop_end;
140 }
141 } while_each_thread(p, q);
142 }
143loop_end:
144 thaw_processes();
145#else
146 synchronize_sched();
147#endif
148 return ret;
149}
150 146
151/** 147/**
152 * __get_insn_slot() - Find a slot on an executable page for an instruction. 148 * __get_insn_slot() - Find a slot on an executable page for an instruction.
153 * We allocate an executable page if there's no room on existing ones. 149 * We allocate an executable page if there's no room on existing ones.
154 */ 150 */
155static kprobe_opcode_t __kprobes *__get_insn_slot(void) 151static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
156{ 152{
157 struct kprobe_insn_page *kip; 153 struct kprobe_insn_page *kip;
158 154
159 retry: 155 retry:
160 list_for_each_entry(kip, &kprobe_insn_pages, list) { 156 list_for_each_entry(kip, &c->pages, list) {
161 if (kip->nused < INSNS_PER_PAGE) { 157 if (kip->nused < slots_per_page(c)) {
162 int i; 158 int i;
163 for (i = 0; i < INSNS_PER_PAGE; i++) { 159 for (i = 0; i < slots_per_page(c); i++) {
164 if (kip->slot_used[i] == SLOT_CLEAN) { 160 if (kip->slot_used[i] == SLOT_CLEAN) {
165 kip->slot_used[i] = SLOT_USED; 161 kip->slot_used[i] = SLOT_USED;
166 kip->nused++; 162 kip->nused++;
167 return kip->insns + (i * MAX_INSN_SIZE); 163 return kip->insns + (i * c->insn_size);
168 } 164 }
169 } 165 }
170 /* Surprise! No unused slots. Fix kip->nused. */ 166 /* kip->nused is broken. Fix it. */
171 kip->nused = INSNS_PER_PAGE; 167 kip->nused = slots_per_page(c);
168 WARN_ON(1);
172 } 169 }
173 } 170 }
174 171
175 /* If there are any garbage slots, collect it and try again. */ 172 /* If there are any garbage slots, collect it and try again. */
176 if (kprobe_garbage_slots && collect_garbage_slots() == 0) { 173 if (c->nr_garbage && collect_garbage_slots(c) == 0)
177 goto retry; 174 goto retry;
178 } 175
179 /* All out of space. Need to allocate a new page. Use slot 0. */ 176 /* All out of space. Need to allocate a new page. */
180 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 177 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
181 if (!kip) 178 if (!kip)
182 return NULL; 179 return NULL;
183 180
@@ -192,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
192 return NULL; 189 return NULL;
193 } 190 }
194 INIT_LIST_HEAD(&kip->list); 191 INIT_LIST_HEAD(&kip->list);
195 list_add(&kip->list, &kprobe_insn_pages); 192 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
196 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
197 kip->slot_used[0] = SLOT_USED; 193 kip->slot_used[0] = SLOT_USED;
198 kip->nused = 1; 194 kip->nused = 1;
199 kip->ngarbage = 0; 195 kip->ngarbage = 0;
196 list_add(&kip->list, &c->pages);
200 return kip->insns; 197 return kip->insns;
201} 198}
202 199
200
203kprobe_opcode_t __kprobes *get_insn_slot(void) 201kprobe_opcode_t __kprobes *get_insn_slot(void)
204{ 202{
205 kprobe_opcode_t *ret; 203 kprobe_opcode_t *ret = NULL;
204
206 mutex_lock(&kprobe_insn_mutex); 205 mutex_lock(&kprobe_insn_mutex);
207 ret = __get_insn_slot(); 206 ret = __get_insn_slot(&kprobe_insn_slots);
208 mutex_unlock(&kprobe_insn_mutex); 207 mutex_unlock(&kprobe_insn_mutex);
208
209 return ret; 209 return ret;
210} 210}
211 211
@@ -221,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
221 * so as not to have to set it up again the 221 * so as not to have to set it up again the
222 * next time somebody inserts a probe. 222 * next time somebody inserts a probe.
223 */ 223 */
224 if (!list_is_singular(&kprobe_insn_pages)) { 224 if (!list_is_singular(&kip->list)) {
225 list_del(&kip->list); 225 list_del(&kip->list);
226 module_free(NULL, kip->insns); 226 module_free(NULL, kip->insns);
227 kfree(kip); 227 kfree(kip);
@@ -231,52 +231,84 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
231 return 0; 231 return 0;
232} 232}
233 233
234static int __kprobes collect_garbage_slots(void) 234static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
235{ 235{
236 struct kprobe_insn_page *kip, *next; 236 struct kprobe_insn_page *kip, *next;
237 237
238 /* Ensure no-one is preepmted on the garbages */ 238 /* Ensure no-one is interrupted on the garbages */
239 if (check_safety()) 239 synchronize_sched();
240 return -EAGAIN;
241 240
242 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { 241 list_for_each_entry_safe(kip, next, &c->pages, list) {
243 int i; 242 int i;
244 if (kip->ngarbage == 0) 243 if (kip->ngarbage == 0)
245 continue; 244 continue;
246 kip->ngarbage = 0; /* we will collect all garbages */ 245 kip->ngarbage = 0; /* we will collect all garbages */
247 for (i = 0; i < INSNS_PER_PAGE; i++) { 246 for (i = 0; i < slots_per_page(c); i++) {
248 if (kip->slot_used[i] == SLOT_DIRTY && 247 if (kip->slot_used[i] == SLOT_DIRTY &&
249 collect_one_slot(kip, i)) 248 collect_one_slot(kip, i))
250 break; 249 break;
251 } 250 }
252 } 251 }
253 kprobe_garbage_slots = 0; 252 c->nr_garbage = 0;
254 return 0; 253 return 0;
255} 254}
256 255
257void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 256static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
257 kprobe_opcode_t *slot, int dirty)
258{ 258{
259 struct kprobe_insn_page *kip; 259 struct kprobe_insn_page *kip;
260 260
261 mutex_lock(&kprobe_insn_mutex); 261 list_for_each_entry(kip, &c->pages, list) {
262 list_for_each_entry(kip, &kprobe_insn_pages, list) { 262 long idx = ((long)slot - (long)kip->insns) / c->insn_size;
263 if (kip->insns <= slot && 263 if (idx >= 0 && idx < slots_per_page(c)) {
264 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 264 WARN_ON(kip->slot_used[idx] != SLOT_USED);
265 int i = (slot - kip->insns) / MAX_INSN_SIZE;
266 if (dirty) { 265 if (dirty) {
267 kip->slot_used[i] = SLOT_DIRTY; 266 kip->slot_used[idx] = SLOT_DIRTY;
268 kip->ngarbage++; 267 kip->ngarbage++;
268 if (++c->nr_garbage > slots_per_page(c))
269 collect_garbage_slots(c);
269 } else 270 } else
270 collect_one_slot(kip, i); 271 collect_one_slot(kip, idx);
271 break; 272 return;
272 } 273 }
273 } 274 }
275 /* Could not free this slot. */
276 WARN_ON(1);
277}
274 278
275 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) 279void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
276 collect_garbage_slots(); 280{
277 281 mutex_lock(&kprobe_insn_mutex);
282 __free_insn_slot(&kprobe_insn_slots, slot, dirty);
278 mutex_unlock(&kprobe_insn_mutex); 283 mutex_unlock(&kprobe_insn_mutex);
279} 284}
285#ifdef CONFIG_OPTPROBES
286/* For optimized_kprobe buffer */
287static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */
288static struct kprobe_insn_cache kprobe_optinsn_slots = {
289 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
290 /* .insn_size is initialized later */
291 .nr_garbage = 0,
292};
293/* Get a slot for optimized_kprobe buffer */
294kprobe_opcode_t __kprobes *get_optinsn_slot(void)
295{
296 kprobe_opcode_t *ret = NULL;
297
298 mutex_lock(&kprobe_optinsn_mutex);
299 ret = __get_insn_slot(&kprobe_optinsn_slots);
300 mutex_unlock(&kprobe_optinsn_mutex);
301
302 return ret;
303}
304
305void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
306{
307 mutex_lock(&kprobe_optinsn_mutex);
308 __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
309 mutex_unlock(&kprobe_optinsn_mutex);
310}
311#endif
280#endif 312#endif
281 313
282/* We have preemption disabled.. so it is safe to use __ versions */ 314/* We have preemption disabled.. so it is safe to use __ versions */
@@ -307,23 +339,401 @@ struct kprobe __kprobes *get_kprobe(void *addr)
307 if (p->addr == addr) 339 if (p->addr == addr)
308 return p; 340 return p;
309 } 341 }
342
310 return NULL; 343 return NULL;
311} 344}
312 345
346static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
347
348/* Return true if the kprobe is an aggregator */
349static inline int kprobe_aggrprobe(struct kprobe *p)
350{
351 return p->pre_handler == aggr_pre_handler;
352}
353
354/*
355 * Keep all fields in the kprobe consistent
356 */
357static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
358{
359 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
360 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
361}
362
363#ifdef CONFIG_OPTPROBES
364/* NOTE: change this value only with kprobe_mutex held */
365static bool kprobes_allow_optimization;
366
367/*
368 * Call all pre_handler on the list, but ignores its return value.
369 * This must be called from arch-dep optimized caller.
370 */
371void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
372{
373 struct kprobe *kp;
374
375 list_for_each_entry_rcu(kp, &p->list, list) {
376 if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
377 set_kprobe_instance(kp);
378 kp->pre_handler(kp, regs);
379 }
380 reset_kprobe_instance();
381 }
382}
383
384/* Return true(!0) if the kprobe is ready for optimization. */
385static inline int kprobe_optready(struct kprobe *p)
386{
387 struct optimized_kprobe *op;
388
389 if (kprobe_aggrprobe(p)) {
390 op = container_of(p, struct optimized_kprobe, kp);
391 return arch_prepared_optinsn(&op->optinsn);
392 }
393
394 return 0;
395}
396
397/*
398 * Return an optimized kprobe whose optimizing code replaces
399 * instructions including addr (exclude breakpoint).
400 */
401struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
402{
403 int i;
404 struct kprobe *p = NULL;
405 struct optimized_kprobe *op;
406
407 /* Don't check i == 0, since that is a breakpoint case. */
408 for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
409 p = get_kprobe((void *)(addr - i));
410
411 if (p && kprobe_optready(p)) {
412 op = container_of(p, struct optimized_kprobe, kp);
413 if (arch_within_optimized_kprobe(op, addr))
414 return p;
415 }
416
417 return NULL;
418}
419
420/* Optimization staging list, protected by kprobe_mutex */
421static LIST_HEAD(optimizing_list);
422
423static void kprobe_optimizer(struct work_struct *work);
424static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
425#define OPTIMIZE_DELAY 5
426
427/* Kprobe jump optimizer */
428static __kprobes void kprobe_optimizer(struct work_struct *work)
429{
430 struct optimized_kprobe *op, *tmp;
431
432 /* Lock modules while optimizing kprobes */
433 mutex_lock(&module_mutex);
434 mutex_lock(&kprobe_mutex);
435 if (kprobes_all_disarmed || !kprobes_allow_optimization)
436 goto end;
437
438 /*
439 * Wait for quiesence period to ensure all running interrupts
440 * are done. Because optprobe may modify multiple instructions
441 * there is a chance that Nth instruction is interrupted. In that
442 * case, running interrupt can return to 2nd-Nth byte of jump
443 * instruction. This wait is for avoiding it.
444 */
445 synchronize_sched();
446
447 /*
448 * The optimization/unoptimization refers online_cpus via
449 * stop_machine() and cpu-hotplug modifies online_cpus.
450 * And same time, text_mutex will be held in cpu-hotplug and here.
451 * This combination can cause a deadlock (cpu-hotplug try to lock
452 * text_mutex but stop_machine can not be done because online_cpus
453 * has been changed)
454 * To avoid this deadlock, we need to call get_online_cpus()
455 * for preventing cpu-hotplug outside of text_mutex locking.
456 */
457 get_online_cpus();
458 mutex_lock(&text_mutex);
459 list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
460 WARN_ON(kprobe_disabled(&op->kp));
461 if (arch_optimize_kprobe(op) < 0)
462 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
463 list_del_init(&op->list);
464 }
465 mutex_unlock(&text_mutex);
466 put_online_cpus();
467end:
468 mutex_unlock(&kprobe_mutex);
469 mutex_unlock(&module_mutex);
470}
471
472/* Optimize kprobe if p is ready to be optimized */
473static __kprobes void optimize_kprobe(struct kprobe *p)
474{
475 struct optimized_kprobe *op;
476
477 /* Check if the kprobe is disabled or not ready for optimization. */
478 if (!kprobe_optready(p) || !kprobes_allow_optimization ||
479 (kprobe_disabled(p) || kprobes_all_disarmed))
480 return;
481
482 /* Both of break_handler and post_handler are not supported. */
483 if (p->break_handler || p->post_handler)
484 return;
485
486 op = container_of(p, struct optimized_kprobe, kp);
487
488 /* Check there is no other kprobes at the optimized instructions */
489 if (arch_check_optimized_kprobe(op) < 0)
490 return;
491
492 /* Check if it is already optimized. */
493 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
494 return;
495
496 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
497 list_add(&op->list, &optimizing_list);
498 if (!delayed_work_pending(&optimizing_work))
499 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
500}
501
502/* Unoptimize a kprobe if p is optimized */
503static __kprobes void unoptimize_kprobe(struct kprobe *p)
504{
505 struct optimized_kprobe *op;
506
507 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
508 op = container_of(p, struct optimized_kprobe, kp);
509 if (!list_empty(&op->list))
510 /* Dequeue from the optimization queue */
511 list_del_init(&op->list);
512 else
513 /* Replace jump with break */
514 arch_unoptimize_kprobe(op);
515 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
516 }
517}
518
519/* Remove optimized instructions */
520static void __kprobes kill_optimized_kprobe(struct kprobe *p)
521{
522 struct optimized_kprobe *op;
523
524 op = container_of(p, struct optimized_kprobe, kp);
525 if (!list_empty(&op->list)) {
526 /* Dequeue from the optimization queue */
527 list_del_init(&op->list);
528 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
529 }
530 /* Don't unoptimize, because the target code will be freed. */
531 arch_remove_optimized_kprobe(op);
532}
533
534/* Try to prepare optimized instructions */
535static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
536{
537 struct optimized_kprobe *op;
538
539 op = container_of(p, struct optimized_kprobe, kp);
540 arch_prepare_optimized_kprobe(op);
541}
542
543/* Free optimized instructions and optimized_kprobe */
544static __kprobes void free_aggr_kprobe(struct kprobe *p)
545{
546 struct optimized_kprobe *op;
547
548 op = container_of(p, struct optimized_kprobe, kp);
549 arch_remove_optimized_kprobe(op);
550 kfree(op);
551}
552
553/* Allocate new optimized_kprobe and try to prepare optimized instructions */
554static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
555{
556 struct optimized_kprobe *op;
557
558 op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
559 if (!op)
560 return NULL;
561
562 INIT_LIST_HEAD(&op->list);
563 op->kp.addr = p->addr;
564 arch_prepare_optimized_kprobe(op);
565
566 return &op->kp;
567}
568
569static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
570
571/*
572 * Prepare an optimized_kprobe and optimize it
573 * NOTE: p must be a normal registered kprobe
574 */
575static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
576{
577 struct kprobe *ap;
578 struct optimized_kprobe *op;
579
580 ap = alloc_aggr_kprobe(p);
581 if (!ap)
582 return;
583
584 op = container_of(ap, struct optimized_kprobe, kp);
585 if (!arch_prepared_optinsn(&op->optinsn)) {
586 /* If failed to setup optimizing, fallback to kprobe */
587 free_aggr_kprobe(ap);
588 return;
589 }
590
591 init_aggr_kprobe(ap, p);
592 optimize_kprobe(ap);
593}
594
595#ifdef CONFIG_SYSCTL
596static void __kprobes optimize_all_kprobes(void)
597{
598 struct hlist_head *head;
599 struct hlist_node *node;
600 struct kprobe *p;
601 unsigned int i;
602
603 /* If optimization is already allowed, just return */
604 if (kprobes_allow_optimization)
605 return;
606
607 kprobes_allow_optimization = true;
608 mutex_lock(&text_mutex);
609 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
610 head = &kprobe_table[i];
611 hlist_for_each_entry_rcu(p, node, head, hlist)
612 if (!kprobe_disabled(p))
613 optimize_kprobe(p);
614 }
615 mutex_unlock(&text_mutex);
616 printk(KERN_INFO "Kprobes globally optimized\n");
617}
618
619static void __kprobes unoptimize_all_kprobes(void)
620{
621 struct hlist_head *head;
622 struct hlist_node *node;
623 struct kprobe *p;
624 unsigned int i;
625
626 /* If optimization is already prohibited, just return */
627 if (!kprobes_allow_optimization)
628 return;
629
630 kprobes_allow_optimization = false;
631 printk(KERN_INFO "Kprobes globally unoptimized\n");
632 get_online_cpus(); /* For avoiding text_mutex deadlock */
633 mutex_lock(&text_mutex);
634 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
635 head = &kprobe_table[i];
636 hlist_for_each_entry_rcu(p, node, head, hlist) {
637 if (!kprobe_disabled(p))
638 unoptimize_kprobe(p);
639 }
640 }
641
642 mutex_unlock(&text_mutex);
643 put_online_cpus();
644 /* Allow all currently running kprobes to complete */
645 synchronize_sched();
646}
647
648int sysctl_kprobes_optimization;
649int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
650 void __user *buffer, size_t *length,
651 loff_t *ppos)
652{
653 int ret;
654
655 mutex_lock(&kprobe_mutex);
656 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
657 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
658
659 if (sysctl_kprobes_optimization)
660 optimize_all_kprobes();
661 else
662 unoptimize_all_kprobes();
663 mutex_unlock(&kprobe_mutex);
664
665 return ret;
666}
667#endif /* CONFIG_SYSCTL */
668
669static void __kprobes __arm_kprobe(struct kprobe *p)
670{
671 struct kprobe *old_p;
672
673 /* Check collision with other optimized kprobes */
674 old_p = get_optimized_kprobe((unsigned long)p->addr);
675 if (unlikely(old_p))
676 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
677
678 arch_arm_kprobe(p);
679 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
680}
681
682static void __kprobes __disarm_kprobe(struct kprobe *p)
683{
684 struct kprobe *old_p;
685
686 unoptimize_kprobe(p); /* Try to unoptimize */
687 arch_disarm_kprobe(p);
688
689 /* If another kprobe was blocked, optimize it. */
690 old_p = get_optimized_kprobe((unsigned long)p->addr);
691 if (unlikely(old_p))
692 optimize_kprobe(old_p);
693}
694
695#else /* !CONFIG_OPTPROBES */
696
697#define optimize_kprobe(p) do {} while (0)
698#define unoptimize_kprobe(p) do {} while (0)
699#define kill_optimized_kprobe(p) do {} while (0)
700#define prepare_optimized_kprobe(p) do {} while (0)
701#define try_to_optimize_kprobe(p) do {} while (0)
702#define __arm_kprobe(p) arch_arm_kprobe(p)
703#define __disarm_kprobe(p) arch_disarm_kprobe(p)
704
705static __kprobes void free_aggr_kprobe(struct kprobe *p)
706{
707 kfree(p);
708}
709
710static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
711{
712 return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
713}
714#endif /* CONFIG_OPTPROBES */
715
313/* Arm a kprobe with text_mutex */ 716/* Arm a kprobe with text_mutex */
314static void __kprobes arm_kprobe(struct kprobe *kp) 717static void __kprobes arm_kprobe(struct kprobe *kp)
315{ 718{
719 /*
720 * Here, since __arm_kprobe() doesn't use stop_machine(),
721 * this doesn't cause deadlock on text_mutex. So, we don't
722 * need get_online_cpus().
723 */
316 mutex_lock(&text_mutex); 724 mutex_lock(&text_mutex);
317 arch_arm_kprobe(kp); 725 __arm_kprobe(kp);
318 mutex_unlock(&text_mutex); 726 mutex_unlock(&text_mutex);
319} 727}
320 728
321/* Disarm a kprobe with text_mutex */ 729/* Disarm a kprobe with text_mutex */
322static void __kprobes disarm_kprobe(struct kprobe *kp) 730static void __kprobes disarm_kprobe(struct kprobe *kp)
323{ 731{
732 get_online_cpus(); /* For avoiding text_mutex deadlock */
324 mutex_lock(&text_mutex); 733 mutex_lock(&text_mutex);
325 arch_disarm_kprobe(kp); 734 __disarm_kprobe(kp);
326 mutex_unlock(&text_mutex); 735 mutex_unlock(&text_mutex);
736 put_online_cpus();
327} 737}
328 738
329/* 739/*
@@ -392,7 +802,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
392void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) 802void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
393{ 803{
394 struct kprobe *kp; 804 struct kprobe *kp;
395 if (p->pre_handler != aggr_pre_handler) { 805 if (!kprobe_aggrprobe(p)) {
396 p->nmissed++; 806 p->nmissed++;
397 } else { 807 } else {
398 list_for_each_entry_rcu(kp, &p->list, list) 808 list_for_each_entry_rcu(kp, &p->list, list)
@@ -516,21 +926,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
516} 926}
517 927
518/* 928/*
519 * Keep all fields in the kprobe consistent
520 */
521static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
522{
523 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
524 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
525}
526
527/*
528* Add the new probe to ap->list. Fail if this is the 929* Add the new probe to ap->list. Fail if this is the
529* second jprobe at the address - two jprobes can't coexist 930* second jprobe at the address - two jprobes can't coexist
530*/ 931*/
531static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) 932static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
532{ 933{
533 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 934 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
935
936 if (p->break_handler || p->post_handler)
937 unoptimize_kprobe(ap); /* Fall back to normal kprobe */
938
534 if (p->break_handler) { 939 if (p->break_handler) {
535 if (ap->break_handler) 940 if (ap->break_handler)
536 return -EEXIST; 941 return -EEXIST;
@@ -545,7 +950,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
545 ap->flags &= ~KPROBE_FLAG_DISABLED; 950 ap->flags &= ~KPROBE_FLAG_DISABLED;
546 if (!kprobes_all_disarmed) 951 if (!kprobes_all_disarmed)
547 /* Arm the breakpoint again. */ 952 /* Arm the breakpoint again. */
548 arm_kprobe(ap); 953 __arm_kprobe(ap);
549 } 954 }
550 return 0; 955 return 0;
551} 956}
@@ -554,12 +959,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
554 * Fill in the required fields of the "manager kprobe". Replace the 959 * Fill in the required fields of the "manager kprobe". Replace the
555 * earlier kprobe in the hlist with the manager kprobe 960 * earlier kprobe in the hlist with the manager kprobe
556 */ 961 */
557static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 962static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
558{ 963{
964 /* Copy p's insn slot to ap */
559 copy_kprobe(p, ap); 965 copy_kprobe(p, ap);
560 flush_insn_slot(ap); 966 flush_insn_slot(ap);
561 ap->addr = p->addr; 967 ap->addr = p->addr;
562 ap->flags = p->flags; 968 ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
563 ap->pre_handler = aggr_pre_handler; 969 ap->pre_handler = aggr_pre_handler;
564 ap->fault_handler = aggr_fault_handler; 970 ap->fault_handler = aggr_fault_handler;
565 /* We don't care the kprobe which has gone. */ 971 /* We don't care the kprobe which has gone. */
@@ -569,8 +975,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
569 ap->break_handler = aggr_break_handler; 975 ap->break_handler = aggr_break_handler;
570 976
571 INIT_LIST_HEAD(&ap->list); 977 INIT_LIST_HEAD(&ap->list);
572 list_add_rcu(&p->list, &ap->list); 978 INIT_HLIST_NODE(&ap->hlist);
573 979
980 list_add_rcu(&p->list, &ap->list);
574 hlist_replace_rcu(&p->hlist, &ap->hlist); 981 hlist_replace_rcu(&p->hlist, &ap->hlist);
575} 982}
576 983
@@ -584,12 +991,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
584 int ret = 0; 991 int ret = 0;
585 struct kprobe *ap = old_p; 992 struct kprobe *ap = old_p;
586 993
587 if (old_p->pre_handler != aggr_pre_handler) { 994 if (!kprobe_aggrprobe(old_p)) {
588 /* If old_p is not an aggr_probe, create new aggr_kprobe. */ 995 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
589 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 996 ap = alloc_aggr_kprobe(old_p);
590 if (!ap) 997 if (!ap)
591 return -ENOMEM; 998 return -ENOMEM;
592 add_aggr_kprobe(ap, old_p); 999 init_aggr_kprobe(ap, old_p);
593 } 1000 }
594 1001
595 if (kprobe_gone(ap)) { 1002 if (kprobe_gone(ap)) {
@@ -608,6 +1015,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
608 */ 1015 */
609 return ret; 1016 return ret;
610 1017
1018 /* Prepare optimized instructions if possible. */
1019 prepare_optimized_kprobe(ap);
1020
611 /* 1021 /*
612 * Clear gone flag to prevent allocating new slot again, and 1022 * Clear gone flag to prevent allocating new slot again, and
613 * set disabled flag because it is not armed yet. 1023 * set disabled flag because it is not armed yet.
@@ -616,6 +1026,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
616 | KPROBE_FLAG_DISABLED; 1026 | KPROBE_FLAG_DISABLED;
617 } 1027 }
618 1028
1029 /* Copy ap's insn slot to p */
619 copy_kprobe(ap, p); 1030 copy_kprobe(ap, p);
620 return add_new_kprobe(ap, p); 1031 return add_new_kprobe(ap, p);
621} 1032}
@@ -728,7 +1139,8 @@ int __kprobes register_kprobe(struct kprobe *p)
728 1139
729 preempt_disable(); 1140 preempt_disable();
730 if (!kernel_text_address((unsigned long) p->addr) || 1141 if (!kernel_text_address((unsigned long) p->addr) ||
731 in_kprobes_functions((unsigned long) p->addr)) { 1142 in_kprobes_functions((unsigned long) p->addr) ||
1143 ftrace_text_reserved(p->addr, p->addr)) {
732 preempt_enable(); 1144 preempt_enable();
733 return -EINVAL; 1145 return -EINVAL;
734 } 1146 }
@@ -765,27 +1177,34 @@ int __kprobes register_kprobe(struct kprobe *p)
765 p->nmissed = 0; 1177 p->nmissed = 0;
766 INIT_LIST_HEAD(&p->list); 1178 INIT_LIST_HEAD(&p->list);
767 mutex_lock(&kprobe_mutex); 1179 mutex_lock(&kprobe_mutex);
1180
1181 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1182 mutex_lock(&text_mutex);
1183
768 old_p = get_kprobe(p->addr); 1184 old_p = get_kprobe(p->addr);
769 if (old_p) { 1185 if (old_p) {
1186 /* Since this may unoptimize old_p, locking text_mutex. */
770 ret = register_aggr_kprobe(old_p, p); 1187 ret = register_aggr_kprobe(old_p, p);
771 goto out; 1188 goto out;
772 } 1189 }
773 1190
774 mutex_lock(&text_mutex);
775 ret = arch_prepare_kprobe(p); 1191 ret = arch_prepare_kprobe(p);
776 if (ret) 1192 if (ret)
777 goto out_unlock_text; 1193 goto out;
778 1194
779 INIT_HLIST_NODE(&p->hlist); 1195 INIT_HLIST_NODE(&p->hlist);
780 hlist_add_head_rcu(&p->hlist, 1196 hlist_add_head_rcu(&p->hlist,
781 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 1197 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
782 1198
783 if (!kprobes_all_disarmed && !kprobe_disabled(p)) 1199 if (!kprobes_all_disarmed && !kprobe_disabled(p))
784 arch_arm_kprobe(p); 1200 __arm_kprobe(p);
1201
1202 /* Try to optimize kprobe */
1203 try_to_optimize_kprobe(p);
785 1204
786out_unlock_text:
787 mutex_unlock(&text_mutex);
788out: 1205out:
1206 mutex_unlock(&text_mutex);
1207 put_online_cpus();
789 mutex_unlock(&kprobe_mutex); 1208 mutex_unlock(&kprobe_mutex);
790 1209
791 if (probed_mod) 1210 if (probed_mod)
@@ -807,7 +1226,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
807 return -EINVAL; 1226 return -EINVAL;
808 1227
809 if (old_p == p || 1228 if (old_p == p ||
810 (old_p->pre_handler == aggr_pre_handler && 1229 (kprobe_aggrprobe(old_p) &&
811 list_is_singular(&old_p->list))) { 1230 list_is_singular(&old_p->list))) {
812 /* 1231 /*
813 * Only probe on the hash list. Disarm only if kprobes are 1232 * Only probe on the hash list. Disarm only if kprobes are
@@ -815,7 +1234,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
815 * already have been removed. We save on flushing icache. 1234 * already have been removed. We save on flushing icache.
816 */ 1235 */
817 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1236 if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
818 disarm_kprobe(p); 1237 disarm_kprobe(old_p);
819 hlist_del_rcu(&old_p->hlist); 1238 hlist_del_rcu(&old_p->hlist);
820 } else { 1239 } else {
821 if (p->break_handler && !kprobe_gone(p)) 1240 if (p->break_handler && !kprobe_gone(p))
@@ -831,8 +1250,13 @@ noclean:
831 list_del_rcu(&p->list); 1250 list_del_rcu(&p->list);
832 if (!kprobe_disabled(old_p)) { 1251 if (!kprobe_disabled(old_p)) {
833 try_to_disable_aggr_kprobe(old_p); 1252 try_to_disable_aggr_kprobe(old_p);
834 if (!kprobes_all_disarmed && kprobe_disabled(old_p)) 1253 if (!kprobes_all_disarmed) {
835 disarm_kprobe(old_p); 1254 if (kprobe_disabled(old_p))
1255 disarm_kprobe(old_p);
1256 else
1257 /* Try to optimize this probe again */
1258 optimize_kprobe(old_p);
1259 }
836 } 1260 }
837 } 1261 }
838 return 0; 1262 return 0;
@@ -849,7 +1273,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
849 old_p = list_entry(p->list.next, struct kprobe, list); 1273 old_p = list_entry(p->list.next, struct kprobe, list);
850 list_del(&p->list); 1274 list_del(&p->list);
851 arch_remove_kprobe(old_p); 1275 arch_remove_kprobe(old_p);
852 kfree(old_p); 1276 free_aggr_kprobe(old_p);
853 } 1277 }
854} 1278}
855 1279
@@ -1035,7 +1459,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1035 /* Pre-allocate memory for max kretprobe instances */ 1459 /* Pre-allocate memory for max kretprobe instances */
1036 if (rp->maxactive <= 0) { 1460 if (rp->maxactive <= 0) {
1037#ifdef CONFIG_PREEMPT 1461#ifdef CONFIG_PREEMPT
1038 rp->maxactive = max(10, 2 * num_possible_cpus()); 1462 rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
1039#else 1463#else
1040 rp->maxactive = num_possible_cpus(); 1464 rp->maxactive = num_possible_cpus();
1041#endif 1465#endif
@@ -1145,7 +1569,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1145 struct kprobe *kp; 1569 struct kprobe *kp;
1146 1570
1147 p->flags |= KPROBE_FLAG_GONE; 1571 p->flags |= KPROBE_FLAG_GONE;
1148 if (p->pre_handler == aggr_pre_handler) { 1572 if (kprobe_aggrprobe(p)) {
1149 /* 1573 /*
1150 * If this is an aggr_kprobe, we have to list all the 1574 * If this is an aggr_kprobe, we have to list all the
1151 * chained probes and mark them GONE. 1575 * chained probes and mark them GONE.
@@ -1154,6 +1578,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1154 kp->flags |= KPROBE_FLAG_GONE; 1578 kp->flags |= KPROBE_FLAG_GONE;
1155 p->post_handler = NULL; 1579 p->post_handler = NULL;
1156 p->break_handler = NULL; 1580 p->break_handler = NULL;
1581 kill_optimized_kprobe(p);
1157 } 1582 }
1158 /* 1583 /*
1159 * Here, we can remove insn_slot safely, because no thread calls 1584 * Here, we can remove insn_slot safely, because no thread calls
@@ -1263,6 +1688,15 @@ static int __init init_kprobes(void)
1263 } 1688 }
1264 } 1689 }
1265 1690
1691#if defined(CONFIG_OPTPROBES)
1692#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
1693 /* Init kprobe_optinsn_slots */
1694 kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
1695#endif
1696 /* By default, kprobes can be optimized */
1697 kprobes_allow_optimization = true;
1698#endif
1699
1266 /* By default, kprobes are armed */ 1700 /* By default, kprobes are armed */
1267 kprobes_all_disarmed = false; 1701 kprobes_all_disarmed = false;
1268 1702
@@ -1281,7 +1715,7 @@ static int __init init_kprobes(void)
1281 1715
1282#ifdef CONFIG_DEBUG_FS 1716#ifdef CONFIG_DEBUG_FS
1283static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, 1717static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1284 const char *sym, int offset,char *modname) 1718 const char *sym, int offset, char *modname, struct kprobe *pp)
1285{ 1719{
1286 char *kprobe_type; 1720 char *kprobe_type;
1287 1721
@@ -1291,19 +1725,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1291 kprobe_type = "j"; 1725 kprobe_type = "j";
1292 else 1726 else
1293 kprobe_type = "k"; 1727 kprobe_type = "k";
1728
1294 if (sym) 1729 if (sym)
1295 seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", 1730 seq_printf(pi, "%p %s %s+0x%x %s ",
1296 p->addr, kprobe_type, sym, offset, 1731 p->addr, kprobe_type, sym, offset,
1297 (modname ? modname : " "), 1732 (modname ? modname : " "));
1298 (kprobe_gone(p) ? "[GONE]" : ""),
1299 ((kprobe_disabled(p) && !kprobe_gone(p)) ?
1300 "[DISABLED]" : ""));
1301 else 1733 else
1302 seq_printf(pi, "%p %s %p %s%s\n", 1734 seq_printf(pi, "%p %s %p ",
1303 p->addr, kprobe_type, p->addr, 1735 p->addr, kprobe_type, p->addr);
1304 (kprobe_gone(p) ? "[GONE]" : ""), 1736
1305 ((kprobe_disabled(p) && !kprobe_gone(p)) ? 1737 if (!pp)
1306 "[DISABLED]" : "")); 1738 pp = p;
1739 seq_printf(pi, "%s%s%s\n",
1740 (kprobe_gone(p) ? "[GONE]" : ""),
1741 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
1742 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
1307} 1743}
1308 1744
1309static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1745static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1339,11 +1775,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
1339 hlist_for_each_entry_rcu(p, node, head, hlist) { 1775 hlist_for_each_entry_rcu(p, node, head, hlist) {
1340 sym = kallsyms_lookup((unsigned long)p->addr, NULL, 1776 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
1341 &offset, &modname, namebuf); 1777 &offset, &modname, namebuf);
1342 if (p->pre_handler == aggr_pre_handler) { 1778 if (kprobe_aggrprobe(p)) {
1343 list_for_each_entry_rcu(kp, &p->list, list) 1779 list_for_each_entry_rcu(kp, &p->list, list)
1344 report_probe(pi, kp, sym, offset, modname); 1780 report_probe(pi, kp, sym, offset, modname, p);
1345 } else 1781 } else
1346 report_probe(pi, p, sym, offset, modname); 1782 report_probe(pi, p, sym, offset, modname, NULL);
1347 } 1783 }
1348 preempt_enable(); 1784 preempt_enable();
1349 return 0; 1785 return 0;
@@ -1421,12 +1857,13 @@ int __kprobes enable_kprobe(struct kprobe *kp)
1421 goto out; 1857 goto out;
1422 } 1858 }
1423 1859
1424 if (!kprobes_all_disarmed && kprobe_disabled(p))
1425 arm_kprobe(p);
1426
1427 p->flags &= ~KPROBE_FLAG_DISABLED;
1428 if (p != kp) 1860 if (p != kp)
1429 kp->flags &= ~KPROBE_FLAG_DISABLED; 1861 kp->flags &= ~KPROBE_FLAG_DISABLED;
1862
1863 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1864 p->flags &= ~KPROBE_FLAG_DISABLED;
1865 arm_kprobe(p);
1866 }
1430out: 1867out:
1431 mutex_unlock(&kprobe_mutex); 1868 mutex_unlock(&kprobe_mutex);
1432 return ret; 1869 return ret;
@@ -1446,12 +1883,13 @@ static void __kprobes arm_all_kprobes(void)
1446 if (!kprobes_all_disarmed) 1883 if (!kprobes_all_disarmed)
1447 goto already_enabled; 1884 goto already_enabled;
1448 1885
1886 /* Arming kprobes doesn't optimize kprobe itself */
1449 mutex_lock(&text_mutex); 1887 mutex_lock(&text_mutex);
1450 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1888 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1451 head = &kprobe_table[i]; 1889 head = &kprobe_table[i];
1452 hlist_for_each_entry_rcu(p, node, head, hlist) 1890 hlist_for_each_entry_rcu(p, node, head, hlist)
1453 if (!kprobe_disabled(p)) 1891 if (!kprobe_disabled(p))
1454 arch_arm_kprobe(p); 1892 __arm_kprobe(p);
1455 } 1893 }
1456 mutex_unlock(&text_mutex); 1894 mutex_unlock(&text_mutex);
1457 1895
@@ -1478,16 +1916,23 @@ static void __kprobes disarm_all_kprobes(void)
1478 1916
1479 kprobes_all_disarmed = true; 1917 kprobes_all_disarmed = true;
1480 printk(KERN_INFO "Kprobes globally disabled\n"); 1918 printk(KERN_INFO "Kprobes globally disabled\n");
1919
1920 /*
1921 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1922 * because disarming may also unoptimize kprobes.
1923 */
1924 get_online_cpus();
1481 mutex_lock(&text_mutex); 1925 mutex_lock(&text_mutex);
1482 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1926 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1483 head = &kprobe_table[i]; 1927 head = &kprobe_table[i];
1484 hlist_for_each_entry_rcu(p, node, head, hlist) { 1928 hlist_for_each_entry_rcu(p, node, head, hlist) {
1485 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 1929 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1486 arch_disarm_kprobe(p); 1930 __disarm_kprobe(p);
1487 } 1931 }
1488 } 1932 }
1489 1933
1490 mutex_unlock(&text_mutex); 1934 mutex_unlock(&text_mutex);
1935 put_online_cpus();
1491 mutex_unlock(&kprobe_mutex); 1936 mutex_unlock(&kprobe_mutex);
1492 /* Allow all currently running kprobes to complete */ 1937 /* Allow all currently running kprobes to complete */
1493 synchronize_sched(); 1938 synchronize_sched();
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3feaf5a74514..21fe3c426948 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
33} 33}
34KERNEL_ATTR_RO(uevent_seqnum); 34KERNEL_ATTR_RO(uevent_seqnum);
35 35
36/* uevent helper program, used during early boo */ 36/* uevent helper program, used during early boot */
37static ssize_t uevent_helper_show(struct kobject *kobj, 37static ssize_t uevent_helper_show(struct kobject *kobj,
38 struct kobj_attribute *attr, char *buf) 38 struct kobj_attribute *attr, char *buf)
39{ 39{
@@ -197,16 +197,8 @@ static int __init ksysfs_init(void)
197 goto group_exit; 197 goto group_exit;
198 } 198 }
199 199
200 /* create the /sys/kernel/uids/ directory */
201 error = uids_sysfs_init();
202 if (error)
203 goto notes_exit;
204
205 return 0; 200 return 0;
206 201
207notes_exit:
208 if (notes_size > 0)
209 sysfs_remove_bin_file(kernel_kobj, &notes_attr);
210group_exit: 202group_exit:
211 sysfs_remove_group(kernel_kobj, &kernel_attr_group); 203 sysfs_remove_group(kernel_kobj, &kernel_attr_group);
212kset_exit: 204kset_exit:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fbb6222fe7e0..82ed0ea15194 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create)
101 * 101 *
102 * Description: This helper function creates and names a kernel 102 * Description: This helper function creates and names a kernel
103 * thread. The thread will be stopped: use wake_up_process() to start 103 * thread. The thread will be stopped: use wake_up_process() to start
104 * it. See also kthread_run(), kthread_create_on_cpu(). 104 * it. See also kthread_run().
105 * 105 *
106 * When woken, the thread will run @threadfn() with @data as its 106 * When woken, the thread will run @threadfn() with @data as its
107 * argument. @threadfn() can either call do_exit() directly if it is a 107 * argument. @threadfn() can either call do_exit() directly if it is a
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 5feaddcdbe49..681bc2e1e187 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2147,7 +2147,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
2147 return ret; 2147 return ret;
2148 2148
2149 return print_irq_inversion_bug(curr, &root, target_entry, 2149 return print_irq_inversion_bug(curr, &root, target_entry,
2150 this, 1, irqclass); 2150 this, 0, irqclass);
2151} 2151}
2152 2152
2153void print_irqtrace_events(struct task_struct *curr) 2153void print_irqtrace_events(struct task_struct *curr)
@@ -3809,3 +3809,22 @@ void lockdep_sys_exit(void)
3809 lockdep_print_held_locks(curr); 3809 lockdep_print_held_locks(curr);
3810 } 3810 }
3811} 3811}
3812
3813void lockdep_rcu_dereference(const char *file, const int line)
3814{
3815 struct task_struct *curr = current;
3816
3817 if (!debug_locks_off())
3818 return;
3819 printk("\n===================================================\n");
3820 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
3821 printk( "---------------------------------------------------\n");
3822 printk("%s:%d invoked rcu_dereference_check() without protection!\n",
3823 file, line);
3824 printk("\nother info that might help us debug this:\n\n");
3825 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
3826 lockdep_print_held_locks(curr);
3827 printk("\nstack backtrace:\n");
3828 dump_stack();
3829}
3830EXPORT_SYMBOL_GPL(lockdep_rcu_dereference);
diff --git a/kernel/module.c b/kernel/module.c
index e96b8ed1cb6a..c968d3606dca 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -474,9 +474,10 @@ static void module_unload_init(struct module *mod)
474 474
475 INIT_LIST_HEAD(&mod->modules_which_use_me); 475 INIT_LIST_HEAD(&mod->modules_which_use_me);
476 for_each_possible_cpu(cpu) 476 for_each_possible_cpu(cpu)
477 local_set(__module_ref_addr(mod, cpu), 0); 477 per_cpu_ptr(mod->refptr, cpu)->count = 0;
478
478 /* Hold reference count during initialization. */ 479 /* Hold reference count during initialization. */
479 local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); 480 __this_cpu_write(mod->refptr->count, 1);
480 /* Backwards compatibility macros put refcount during init. */ 481 /* Backwards compatibility macros put refcount during init. */
481 mod->waiter = current; 482 mod->waiter = current;
482} 483}
@@ -619,7 +620,7 @@ unsigned int module_refcount(struct module *mod)
619 int cpu; 620 int cpu;
620 621
621 for_each_possible_cpu(cpu) 622 for_each_possible_cpu(cpu)
622 total += local_read(__module_ref_addr(mod, cpu)); 623 total += per_cpu_ptr(mod->refptr, cpu)->count;
623 return total; 624 return total;
624} 625}
625EXPORT_SYMBOL(module_refcount); 626EXPORT_SYMBOL(module_refcount);
@@ -796,14 +797,15 @@ static struct module_attribute refcnt = {
796void module_put(struct module *module) 797void module_put(struct module *module)
797{ 798{
798 if (module) { 799 if (module) {
799 unsigned int cpu = get_cpu(); 800 preempt_disable();
800 local_dec(__module_ref_addr(module, cpu)); 801 __this_cpu_dec(module->refptr->count);
802
801 trace_module_put(module, _RET_IP_, 803 trace_module_put(module, _RET_IP_,
802 local_read(__module_ref_addr(module, cpu))); 804 __this_cpu_read(module->refptr->count));
803 /* Maybe they're waiting for us to drop reference? */ 805 /* Maybe they're waiting for us to drop reference? */
804 if (unlikely(!module_is_live(module))) 806 if (unlikely(!module_is_live(module)))
805 wake_up_process(module->waiter); 807 wake_up_process(module->waiter);
806 put_cpu(); 808 preempt_enable();
807 } 809 }
808} 810}
809EXPORT_SYMBOL(module_put); 811EXPORT_SYMBOL(module_put);
@@ -1010,6 +1012,12 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1010 * J. Corbet <corbet@lwn.net> 1012 * J. Corbet <corbet@lwn.net>
1011 */ 1013 */
1012#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 1014#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
1015
1016static inline bool sect_empty(const Elf_Shdr *sect)
1017{
1018 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
1019}
1020
1013struct module_sect_attr 1021struct module_sect_attr
1014{ 1022{
1015 struct module_attribute mattr; 1023 struct module_attribute mattr;
@@ -1051,8 +1059,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1051 1059
1052 /* Count loaded sections and allocate structures */ 1060 /* Count loaded sections and allocate structures */
1053 for (i = 0; i < nsect; i++) 1061 for (i = 0; i < nsect; i++)
1054 if (sechdrs[i].sh_flags & SHF_ALLOC 1062 if (!sect_empty(&sechdrs[i]))
1055 && sechdrs[i].sh_size)
1056 nloaded++; 1063 nloaded++;
1057 size[0] = ALIGN(sizeof(*sect_attrs) 1064 size[0] = ALIGN(sizeof(*sect_attrs)
1058 + nloaded * sizeof(sect_attrs->attrs[0]), 1065 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1070,9 +1077,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1070 sattr = &sect_attrs->attrs[0]; 1077 sattr = &sect_attrs->attrs[0];
1071 gattr = &sect_attrs->grp.attrs[0]; 1078 gattr = &sect_attrs->grp.attrs[0];
1072 for (i = 0; i < nsect; i++) { 1079 for (i = 0; i < nsect; i++) {
1073 if (! (sechdrs[i].sh_flags & SHF_ALLOC)) 1080 if (sect_empty(&sechdrs[i]))
1074 continue;
1075 if (!sechdrs[i].sh_size)
1076 continue; 1081 continue;
1077 sattr->address = sechdrs[i].sh_addr; 1082 sattr->address = sechdrs[i].sh_addr;
1078 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1083 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
@@ -1080,6 +1085,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1080 if (sattr->name == NULL) 1085 if (sattr->name == NULL)
1081 goto out; 1086 goto out;
1082 sect_attrs->nsections++; 1087 sect_attrs->nsections++;
1088 sysfs_attr_init(&sattr->mattr.attr);
1083 sattr->mattr.show = module_sect_show; 1089 sattr->mattr.show = module_sect_show;
1084 sattr->mattr.store = NULL; 1090 sattr->mattr.store = NULL;
1085 sattr->mattr.attr.name = sattr->name; 1091 sattr->mattr.attr.name = sattr->name;
@@ -1156,7 +1162,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1156 /* Count notes sections and allocate structures. */ 1162 /* Count notes sections and allocate structures. */
1157 notes = 0; 1163 notes = 0;
1158 for (i = 0; i < nsect; i++) 1164 for (i = 0; i < nsect; i++)
1159 if ((sechdrs[i].sh_flags & SHF_ALLOC) && 1165 if (!sect_empty(&sechdrs[i]) &&
1160 (sechdrs[i].sh_type == SHT_NOTE)) 1166 (sechdrs[i].sh_type == SHT_NOTE))
1161 ++notes; 1167 ++notes;
1162 1168
@@ -1172,9 +1178,10 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1172 notes_attrs->notes = notes; 1178 notes_attrs->notes = notes;
1173 nattr = &notes_attrs->attrs[0]; 1179 nattr = &notes_attrs->attrs[0];
1174 for (loaded = i = 0; i < nsect; ++i) { 1180 for (loaded = i = 0; i < nsect; ++i) {
1175 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 1181 if (sect_empty(&sechdrs[i]))
1176 continue; 1182 continue;
1177 if (sechdrs[i].sh_type == SHT_NOTE) { 1183 if (sechdrs[i].sh_type == SHT_NOTE) {
1184 sysfs_bin_attr_init(nattr);
1178 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1185 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
1179 nattr->attr.mode = S_IRUGO; 1186 nattr->attr.mode = S_IRUGO;
1180 nattr->size = sechdrs[i].sh_size; 1187 nattr->size = sechdrs[i].sh_size;
@@ -1247,6 +1254,7 @@ int module_add_modinfo_attrs(struct module *mod)
1247 if (!attr->test || 1254 if (!attr->test ||
1248 (attr->test && attr->test(mod))) { 1255 (attr->test && attr->test(mod))) {
1249 memcpy(temp_attr, attr, sizeof(*temp_attr)); 1256 memcpy(temp_attr, attr, sizeof(*temp_attr));
1257 sysfs_attr_init(&temp_attr->attr);
1250 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); 1258 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
1251 ++temp_attr; 1259 ++temp_attr;
1252 } 1260 }
@@ -1394,9 +1402,9 @@ static void free_module(struct module *mod)
1394 kfree(mod->args); 1402 kfree(mod->args);
1395 if (mod->percpu) 1403 if (mod->percpu)
1396 percpu_modfree(mod->percpu); 1404 percpu_modfree(mod->percpu);
1397#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 1405#if defined(CONFIG_MODULE_UNLOAD)
1398 if (mod->refptr) 1406 if (mod->refptr)
1399 percpu_modfree(mod->refptr); 1407 free_percpu(mod->refptr);
1400#endif 1408#endif
1401 /* Free lock-classes: */ 1409 /* Free lock-classes: */
1402 lockdep_free_key_range(mod->module_core, mod->core_size); 1410 lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -2159,9 +2167,8 @@ static noinline struct module *load_module(void __user *umod,
2159 mod = (void *)sechdrs[modindex].sh_addr; 2167 mod = (void *)sechdrs[modindex].sh_addr;
2160 kmemleak_load_module(mod, hdr, sechdrs, secstrings); 2168 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2161 2169
2162#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2170#if defined(CONFIG_MODULE_UNLOAD)
2163 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), 2171 mod->refptr = alloc_percpu(struct module_ref);
2164 mod->name);
2165 if (!mod->refptr) { 2172 if (!mod->refptr) {
2166 err = -ENOMEM; 2173 err = -ENOMEM;
2167 goto free_init; 2174 goto free_init;
@@ -2393,8 +2400,8 @@ static noinline struct module *load_module(void __user *umod,
2393 kobject_put(&mod->mkobj.kobj); 2400 kobject_put(&mod->mkobj.kobj);
2394 free_unload: 2401 free_unload:
2395 module_unload_free(mod); 2402 module_unload_free(mod);
2396#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2403#if defined(CONFIG_MODULE_UNLOAD)
2397 percpu_modfree(mod->refptr); 2404 free_percpu(mod->refptr);
2398 free_init: 2405 free_init:
2399#endif 2406#endif
2400 module_free(mod, mod->module_init); 2407 module_free(mod, mod->module_init);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index acd24e7643eb..2488ba7eb568 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -78,10 +78,10 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
78 int ret = NOTIFY_DONE; 78 int ret = NOTIFY_DONE;
79 struct notifier_block *nb, *next_nb; 79 struct notifier_block *nb, *next_nb;
80 80
81 nb = rcu_dereference(*nl); 81 nb = rcu_dereference_raw(*nl);
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference_raw(nb->next);
85 85
86#ifdef CONFIG_DEBUG_NOTIFIERS 86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { 87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
@@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
309 * racy then it does not matter what the result of the test 309 * racy then it does not matter what the result of the test
310 * is, we re-check the list after having taken the lock anyway: 310 * is, we re-check the list after having taken the lock anyway:
311 */ 311 */
312 if (rcu_dereference(nh->head)) { 312 if (rcu_dereference_raw(nh->head)) {
313 down_read(&nh->rwsem); 313 down_read(&nh->rwsem);
314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, 314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
315 nr_calls); 315 nr_calls);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 09b4ff9711b2..2ab67233ee8f 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -24,7 +24,18 @@
24 24
25static struct kmem_cache *nsproxy_cachep; 25static struct kmem_cache *nsproxy_cachep;
26 26
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 27struct nsproxy init_nsproxy = {
28 .count = ATOMIC_INIT(1),
29 .uts_ns = &init_uts_ns,
30#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
31 .ipc_ns = &init_ipc_ns,
32#endif
33 .mnt_ns = NULL,
34 .pid_ns = &init_pid_ns,
35#ifdef CONFIG_NET
36 .net_ns = &init_net,
37#endif
38};
28 39
29static inline struct nsproxy *create_nsproxy(void) 40static inline struct nsproxy *create_nsproxy(void)
30{ 41{
diff --git a/kernel/padata.c b/kernel/padata.c
new file mode 100644
index 000000000000..93caf65ff57c
--- /dev/null
+++ b/kernel/padata.c
@@ -0,0 +1,696 @@
1/*
2 * padata.c - generic interface to process data streams in parallel
3 *
4 * Copyright (C) 2008, 2009 secunet Security Networks AG
5 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 */
20
21#include <linux/module.h>
22#include <linux/cpumask.h>
23#include <linux/err.h>
24#include <linux/cpu.h>
25#include <linux/padata.h>
26#include <linux/mutex.h>
27#include <linux/sched.h>
28#include <linux/rcupdate.h>
29
30#define MAX_SEQ_NR INT_MAX - NR_CPUS
31#define MAX_OBJ_NUM 10000 * NR_CPUS
32
33static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
34{
35 int cpu, target_cpu;
36
37 target_cpu = cpumask_first(pd->cpumask);
38 for (cpu = 0; cpu < cpu_index; cpu++)
39 target_cpu = cpumask_next(target_cpu, pd->cpumask);
40
41 return target_cpu;
42}
43
44static int padata_cpu_hash(struct padata_priv *padata)
45{
46 int cpu_index;
47 struct parallel_data *pd;
48
49 pd = padata->pd;
50
51 /*
52 * Hash the sequence numbers to the cpus by taking
53 * seq_nr mod. number of cpus in use.
54 */
55 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask);
56
57 return padata_index_to_cpu(pd, cpu_index);
58}
59
60static void padata_parallel_worker(struct work_struct *work)
61{
62 struct padata_queue *queue;
63 struct parallel_data *pd;
64 struct padata_instance *pinst;
65 LIST_HEAD(local_list);
66
67 local_bh_disable();
68 queue = container_of(work, struct padata_queue, pwork);
69 pd = queue->pd;
70 pinst = pd->pinst;
71
72 spin_lock(&queue->parallel.lock);
73 list_replace_init(&queue->parallel.list, &local_list);
74 spin_unlock(&queue->parallel.lock);
75
76 while (!list_empty(&local_list)) {
77 struct padata_priv *padata;
78
79 padata = list_entry(local_list.next,
80 struct padata_priv, list);
81
82 list_del_init(&padata->list);
83
84 padata->parallel(padata);
85 }
86
87 local_bh_enable();
88}
89
90/*
91 * padata_do_parallel - padata parallelization function
92 *
93 * @pinst: padata instance
94 * @padata: object to be parallelized
95 * @cb_cpu: cpu the serialization callback function will run on,
96 * must be in the cpumask of padata.
97 *
98 * The parallelization callback function will run with BHs off.
99 * Note: Every object which is parallelized by padata_do_parallel
100 * must be seen by padata_do_serial.
101 */
102int padata_do_parallel(struct padata_instance *pinst,
103 struct padata_priv *padata, int cb_cpu)
104{
105 int target_cpu, err;
106 struct padata_queue *queue;
107 struct parallel_data *pd;
108
109 rcu_read_lock_bh();
110
111 pd = rcu_dereference(pinst->pd);
112
113 err = 0;
114 if (!(pinst->flags & PADATA_INIT))
115 goto out;
116
117 err = -EBUSY;
118 if ((pinst->flags & PADATA_RESET))
119 goto out;
120
121 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
122 goto out;
123
124 err = -EINVAL;
125 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
126 goto out;
127
128 err = -EINPROGRESS;
129 atomic_inc(&pd->refcnt);
130 padata->pd = pd;
131 padata->cb_cpu = cb_cpu;
132
133 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))
134 atomic_set(&pd->seq_nr, -1);
135
136 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
137
138 target_cpu = padata_cpu_hash(padata);
139 queue = per_cpu_ptr(pd->queue, target_cpu);
140
141 spin_lock(&queue->parallel.lock);
142 list_add_tail(&padata->list, &queue->parallel.list);
143 spin_unlock(&queue->parallel.lock);
144
145 queue_work_on(target_cpu, pinst->wq, &queue->pwork);
146
147out:
148 rcu_read_unlock_bh();
149
150 return err;
151}
152EXPORT_SYMBOL(padata_do_parallel);
153
154static struct padata_priv *padata_get_next(struct parallel_data *pd)
155{
156 int cpu, num_cpus, empty, calc_seq_nr;
157 int seq_nr, next_nr, overrun, next_overrun;
158 struct padata_queue *queue, *next_queue;
159 struct padata_priv *padata;
160 struct padata_list *reorder;
161
162 empty = 0;
163 next_nr = -1;
164 next_overrun = 0;
165 next_queue = NULL;
166
167 num_cpus = cpumask_weight(pd->cpumask);
168
169 for_each_cpu(cpu, pd->cpumask) {
170 queue = per_cpu_ptr(pd->queue, cpu);
171 reorder = &queue->reorder;
172
173 /*
174 * Calculate the seq_nr of the object that should be
175 * next in this queue.
176 */
177 overrun = 0;
178 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
179 + queue->cpu_index;
180
181 if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
182 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
183 overrun = 1;
184 }
185
186 if (!list_empty(&reorder->list)) {
187 padata = list_entry(reorder->list.next,
188 struct padata_priv, list);
189
190 seq_nr = padata->seq_nr;
191 BUG_ON(calc_seq_nr != seq_nr);
192 } else {
193 seq_nr = calc_seq_nr;
194 empty++;
195 }
196
197 if (next_nr < 0 || seq_nr < next_nr
198 || (next_overrun && !overrun)) {
199 next_nr = seq_nr;
200 next_overrun = overrun;
201 next_queue = queue;
202 }
203 }
204
205 padata = NULL;
206
207 if (empty == num_cpus)
208 goto out;
209
210 reorder = &next_queue->reorder;
211
212 if (!list_empty(&reorder->list)) {
213 padata = list_entry(reorder->list.next,
214 struct padata_priv, list);
215
216 if (unlikely(next_overrun)) {
217 for_each_cpu(cpu, pd->cpumask) {
218 queue = per_cpu_ptr(pd->queue, cpu);
219 atomic_set(&queue->num_obj, 0);
220 }
221 }
222
223 spin_lock(&reorder->lock);
224 list_del_init(&padata->list);
225 atomic_dec(&pd->reorder_objects);
226 spin_unlock(&reorder->lock);
227
228 atomic_inc(&next_queue->num_obj);
229
230 goto out;
231 }
232
233 if (next_nr % num_cpus == next_queue->cpu_index) {
234 padata = ERR_PTR(-ENODATA);
235 goto out;
236 }
237
238 padata = ERR_PTR(-EINPROGRESS);
239out:
240 return padata;
241}
242
243static void padata_reorder(struct parallel_data *pd)
244{
245 struct padata_priv *padata;
246 struct padata_queue *queue;
247 struct padata_instance *pinst = pd->pinst;
248
249try_again:
250 if (!spin_trylock_bh(&pd->lock))
251 goto out;
252
253 while (1) {
254 padata = padata_get_next(pd);
255
256 if (!padata || PTR_ERR(padata) == -EINPROGRESS)
257 break;
258
259 if (PTR_ERR(padata) == -ENODATA) {
260 spin_unlock_bh(&pd->lock);
261 goto out;
262 }
263
264 queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
265
266 spin_lock(&queue->serial.lock);
267 list_add_tail(&padata->list, &queue->serial.list);
268 spin_unlock(&queue->serial.lock);
269
270 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork);
271 }
272
273 spin_unlock_bh(&pd->lock);
274
275 if (atomic_read(&pd->reorder_objects))
276 goto try_again;
277
278out:
279 return;
280}
281
282static void padata_serial_worker(struct work_struct *work)
283{
284 struct padata_queue *queue;
285 struct parallel_data *pd;
286 LIST_HEAD(local_list);
287
288 local_bh_disable();
289 queue = container_of(work, struct padata_queue, swork);
290 pd = queue->pd;
291
292 spin_lock(&queue->serial.lock);
293 list_replace_init(&queue->serial.list, &local_list);
294 spin_unlock(&queue->serial.lock);
295
296 while (!list_empty(&local_list)) {
297 struct padata_priv *padata;
298
299 padata = list_entry(local_list.next,
300 struct padata_priv, list);
301
302 list_del_init(&padata->list);
303
304 padata->serial(padata);
305 atomic_dec(&pd->refcnt);
306 }
307 local_bh_enable();
308}
309
310/*
311 * padata_do_serial - padata serialization function
312 *
313 * @padata: object to be serialized.
314 *
315 * padata_do_serial must be called for every parallelized object.
316 * The serialization callback function will run with BHs off.
317 */
318void padata_do_serial(struct padata_priv *padata)
319{
320 int cpu;
321 struct padata_queue *queue;
322 struct parallel_data *pd;
323
324 pd = padata->pd;
325
326 cpu = get_cpu();
327 queue = per_cpu_ptr(pd->queue, cpu);
328
329 spin_lock(&queue->reorder.lock);
330 atomic_inc(&pd->reorder_objects);
331 list_add_tail(&padata->list, &queue->reorder.list);
332 spin_unlock(&queue->reorder.lock);
333
334 put_cpu();
335
336 padata_reorder(pd);
337}
338EXPORT_SYMBOL(padata_do_serial);
339
340static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
341 const struct cpumask *cpumask)
342{
343 int cpu, cpu_index, num_cpus;
344 struct padata_queue *queue;
345 struct parallel_data *pd;
346
347 cpu_index = 0;
348
349 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
350 if (!pd)
351 goto err;
352
353 pd->queue = alloc_percpu(struct padata_queue);
354 if (!pd->queue)
355 goto err_free_pd;
356
357 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
358 goto err_free_queue;
359
360 for_each_possible_cpu(cpu) {
361 queue = per_cpu_ptr(pd->queue, cpu);
362
363 queue->pd = pd;
364
365 if (cpumask_test_cpu(cpu, cpumask)
366 && cpumask_test_cpu(cpu, cpu_active_mask)) {
367 queue->cpu_index = cpu_index;
368 cpu_index++;
369 } else
370 queue->cpu_index = -1;
371
372 INIT_LIST_HEAD(&queue->reorder.list);
373 INIT_LIST_HEAD(&queue->parallel.list);
374 INIT_LIST_HEAD(&queue->serial.list);
375 spin_lock_init(&queue->reorder.lock);
376 spin_lock_init(&queue->parallel.lock);
377 spin_lock_init(&queue->serial.lock);
378
379 INIT_WORK(&queue->pwork, padata_parallel_worker);
380 INIT_WORK(&queue->swork, padata_serial_worker);
381 atomic_set(&queue->num_obj, 0);
382 }
383
384 cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
385
386 num_cpus = cpumask_weight(pd->cpumask);
387 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
388
389 atomic_set(&pd->seq_nr, -1);
390 atomic_set(&pd->reorder_objects, 0);
391 atomic_set(&pd->refcnt, 0);
392 pd->pinst = pinst;
393 spin_lock_init(&pd->lock);
394
395 return pd;
396
397err_free_queue:
398 free_percpu(pd->queue);
399err_free_pd:
400 kfree(pd);
401err:
402 return NULL;
403}
404
405static void padata_free_pd(struct parallel_data *pd)
406{
407 free_cpumask_var(pd->cpumask);
408 free_percpu(pd->queue);
409 kfree(pd);
410}
411
412static void padata_replace(struct padata_instance *pinst,
413 struct parallel_data *pd_new)
414{
415 struct parallel_data *pd_old = pinst->pd;
416
417 pinst->flags |= PADATA_RESET;
418
419 rcu_assign_pointer(pinst->pd, pd_new);
420
421 synchronize_rcu();
422
423 while (atomic_read(&pd_old->refcnt) != 0)
424 yield();
425
426 flush_workqueue(pinst->wq);
427
428 padata_free_pd(pd_old);
429
430 pinst->flags &= ~PADATA_RESET;
431}
432
433/*
434 * padata_set_cpumask - set the cpumask that padata should use
435 *
436 * @pinst: padata instance
437 * @cpumask: the cpumask to use
438 */
439int padata_set_cpumask(struct padata_instance *pinst,
440 cpumask_var_t cpumask)
441{
442 struct parallel_data *pd;
443 int err = 0;
444
445 might_sleep();
446
447 mutex_lock(&pinst->lock);
448
449 pd = padata_alloc_pd(pinst, cpumask);
450 if (!pd) {
451 err = -ENOMEM;
452 goto out;
453 }
454
455 cpumask_copy(pinst->cpumask, cpumask);
456
457 padata_replace(pinst, pd);
458
459out:
460 mutex_unlock(&pinst->lock);
461
462 return err;
463}
464EXPORT_SYMBOL(padata_set_cpumask);
465
466static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
467{
468 struct parallel_data *pd;
469
470 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
471 pd = padata_alloc_pd(pinst, pinst->cpumask);
472 if (!pd)
473 return -ENOMEM;
474
475 padata_replace(pinst, pd);
476 }
477
478 return 0;
479}
480
481/*
482 * padata_add_cpu - add a cpu to the padata cpumask
483 *
484 * @pinst: padata instance
485 * @cpu: cpu to add
486 */
487int padata_add_cpu(struct padata_instance *pinst, int cpu)
488{
489 int err;
490
491 might_sleep();
492
493 mutex_lock(&pinst->lock);
494
495 cpumask_set_cpu(cpu, pinst->cpumask);
496 err = __padata_add_cpu(pinst, cpu);
497
498 mutex_unlock(&pinst->lock);
499
500 return err;
501}
502EXPORT_SYMBOL(padata_add_cpu);
503
504static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
505{
506 struct parallel_data *pd;
507
508 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
509 pd = padata_alloc_pd(pinst, pinst->cpumask);
510 if (!pd)
511 return -ENOMEM;
512
513 padata_replace(pinst, pd);
514 }
515
516 return 0;
517}
518
519/*
520 * padata_remove_cpu - remove a cpu from the padata cpumask
521 *
522 * @pinst: padata instance
523 * @cpu: cpu to remove
524 */
525int padata_remove_cpu(struct padata_instance *pinst, int cpu)
526{
527 int err;
528
529 might_sleep();
530
531 mutex_lock(&pinst->lock);
532
533 cpumask_clear_cpu(cpu, pinst->cpumask);
534 err = __padata_remove_cpu(pinst, cpu);
535
536 mutex_unlock(&pinst->lock);
537
538 return err;
539}
540EXPORT_SYMBOL(padata_remove_cpu);
541
542/*
543 * padata_start - start the parallel processing
544 *
545 * @pinst: padata instance to start
546 */
547void padata_start(struct padata_instance *pinst)
548{
549 might_sleep();
550
551 mutex_lock(&pinst->lock);
552 pinst->flags |= PADATA_INIT;
553 mutex_unlock(&pinst->lock);
554}
555EXPORT_SYMBOL(padata_start);
556
557/*
558 * padata_stop - stop the parallel processing
559 *
560 * @pinst: padata instance to stop
561 */
562void padata_stop(struct padata_instance *pinst)
563{
564 might_sleep();
565
566 mutex_lock(&pinst->lock);
567 pinst->flags &= ~PADATA_INIT;
568 mutex_unlock(&pinst->lock);
569}
570EXPORT_SYMBOL(padata_stop);
571
572static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
573 unsigned long action, void *hcpu)
574{
575 int err;
576 struct padata_instance *pinst;
577 int cpu = (unsigned long)hcpu;
578
579 pinst = container_of(nfb, struct padata_instance, cpu_notifier);
580
581 switch (action) {
582 case CPU_ONLINE:
583 case CPU_ONLINE_FROZEN:
584 if (!cpumask_test_cpu(cpu, pinst->cpumask))
585 break;
586 mutex_lock(&pinst->lock);
587 err = __padata_add_cpu(pinst, cpu);
588 mutex_unlock(&pinst->lock);
589 if (err)
590 return NOTIFY_BAD;
591 break;
592
593 case CPU_DOWN_PREPARE:
594 case CPU_DOWN_PREPARE_FROZEN:
595 if (!cpumask_test_cpu(cpu, pinst->cpumask))
596 break;
597 mutex_lock(&pinst->lock);
598 err = __padata_remove_cpu(pinst, cpu);
599 mutex_unlock(&pinst->lock);
600 if (err)
601 return NOTIFY_BAD;
602 break;
603
604 case CPU_UP_CANCELED:
605 case CPU_UP_CANCELED_FROZEN:
606 if (!cpumask_test_cpu(cpu, pinst->cpumask))
607 break;
608 mutex_lock(&pinst->lock);
609 __padata_remove_cpu(pinst, cpu);
610 mutex_unlock(&pinst->lock);
611
612 case CPU_DOWN_FAILED:
613 case CPU_DOWN_FAILED_FROZEN:
614 if (!cpumask_test_cpu(cpu, pinst->cpumask))
615 break;
616 mutex_lock(&pinst->lock);
617 __padata_add_cpu(pinst, cpu);
618 mutex_unlock(&pinst->lock);
619 }
620
621 return NOTIFY_OK;
622}
623
624/*
625 * padata_alloc - allocate and initialize a padata instance
626 *
627 * @cpumask: cpumask that padata uses for parallelization
628 * @wq: workqueue to use for the allocated padata instance
629 */
630struct padata_instance *padata_alloc(const struct cpumask *cpumask,
631 struct workqueue_struct *wq)
632{
633 int err;
634 struct padata_instance *pinst;
635 struct parallel_data *pd;
636
637 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
638 if (!pinst)
639 goto err;
640
641 pd = padata_alloc_pd(pinst, cpumask);
642 if (!pd)
643 goto err_free_inst;
644
645 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
646 goto err_free_pd;
647
648 rcu_assign_pointer(pinst->pd, pd);
649
650 pinst->wq = wq;
651
652 cpumask_copy(pinst->cpumask, cpumask);
653
654 pinst->flags = 0;
655
656 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
657 pinst->cpu_notifier.priority = 0;
658 err = register_hotcpu_notifier(&pinst->cpu_notifier);
659 if (err)
660 goto err_free_cpumask;
661
662 mutex_init(&pinst->lock);
663
664 return pinst;
665
666err_free_cpumask:
667 free_cpumask_var(pinst->cpumask);
668err_free_pd:
669 padata_free_pd(pd);
670err_free_inst:
671 kfree(pinst);
672err:
673 return NULL;
674}
675EXPORT_SYMBOL(padata_alloc);
676
677/*
678 * padata_free - free a padata instance
679 *
680 * @ padata_inst: padata instance to free
681 */
682void padata_free(struct padata_instance *pinst)
683{
684 padata_stop(pinst);
685
686 synchronize_rcu();
687
688 while (atomic_read(&pinst->pd->refcnt) != 0)
689 yield();
690
691 unregister_hotcpu_notifier(&pinst->cpu_notifier);
692 padata_free_pd(pinst->pd);
693 free_cpumask_var(pinst->cpumask);
694 kfree(pinst);
695}
696EXPORT_SYMBOL(padata_free);
diff --git a/kernel/panic.c b/kernel/panic.c
index 5827f7b97254..13d966b4c14a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -36,15 +36,36 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
36 36
37EXPORT_SYMBOL(panic_notifier_list); 37EXPORT_SYMBOL(panic_notifier_list);
38 38
39static long no_blink(long time)
40{
41 return 0;
42}
43
44/* Returns how long it waited in ms */ 39/* Returns how long it waited in ms */
45long (*panic_blink)(long time); 40long (*panic_blink)(long time);
46EXPORT_SYMBOL(panic_blink); 41EXPORT_SYMBOL(panic_blink);
47 42
43static void panic_blink_one_second(void)
44{
45 static long i = 0, end;
46
47 if (panic_blink) {
48 end = i + MSEC_PER_SEC;
49
50 while (i < end) {
51 i += panic_blink(i);
52 mdelay(1);
53 i++;
54 }
55 } else {
56 /*
57 * When running under a hypervisor a small mdelay may get
58 * rounded up to the hypervisor timeslice. For example, with
59 * a 1ms in 10ms hypervisor timeslice we might inflate a
60 * mdelay(1) loop by 10x.
61 *
62 * If we have nothing to blink, spin on 1 second calls to
63 * mdelay to avoid this.
64 */
65 mdelay(MSEC_PER_SEC);
66 }
67}
68
48/** 69/**
49 * panic - halt the system 70 * panic - halt the system
50 * @fmt: The text string to print 71 * @fmt: The text string to print
@@ -75,7 +96,6 @@ NORET_TYPE void panic(const char * fmt, ...)
75 dump_stack(); 96 dump_stack();
76#endif 97#endif
77 98
78 kmsg_dump(KMSG_DUMP_PANIC);
79 /* 99 /*
80 * If we have crashed and we have a crash kernel loaded let it handle 100 * If we have crashed and we have a crash kernel loaded let it handle
81 * everything else. 101 * everything else.
@@ -83,6 +103,8 @@ NORET_TYPE void panic(const char * fmt, ...)
83 */ 103 */
84 crash_kexec(NULL); 104 crash_kexec(NULL);
85 105
106 kmsg_dump(KMSG_DUMP_PANIC);
107
86 /* 108 /*
87 * Note smp_send_stop is the usual smp shutdown function, which 109 * Note smp_send_stop is the usual smp shutdown function, which
88 * unfortunately means it may not be hardened to work in a panic 110 * unfortunately means it may not be hardened to work in a panic
@@ -94,9 +116,6 @@ NORET_TYPE void panic(const char * fmt, ...)
94 116
95 bust_spinlocks(0); 117 bust_spinlocks(0);
96 118
97 if (!panic_blink)
98 panic_blink = no_blink;
99
100 if (panic_timeout > 0) { 119 if (panic_timeout > 0) {
101 /* 120 /*
102 * Delay timeout seconds before rebooting the machine. 121 * Delay timeout seconds before rebooting the machine.
@@ -104,11 +123,9 @@ NORET_TYPE void panic(const char * fmt, ...)
104 */ 123 */
105 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); 124 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
106 125
107 for (i = 0; i < panic_timeout*1000; ) { 126 for (i = 0; i < panic_timeout; i++) {
108 touch_nmi_watchdog(); 127 touch_nmi_watchdog();
109 i += panic_blink(i); 128 panic_blink_one_second();
110 mdelay(1);
111 i++;
112 } 129 }
113 /* 130 /*
114 * This will not be a clean reboot, with everything 131 * This will not be a clean reboot, with everything
@@ -134,11 +151,9 @@ NORET_TYPE void panic(const char * fmt, ...)
134 } 151 }
135#endif 152#endif
136 local_irq_enable(); 153 local_irq_enable();
137 for (i = 0; ; ) { 154 while (1) {
138 touch_softlockup_watchdog(); 155 touch_softlockup_watchdog();
139 i += panic_blink(i); 156 panic_blink_one_second();
140 mdelay(1);
141 i++;
142 } 157 }
143} 158}
144 159
diff --git a/kernel/params.c b/kernel/params.c
index cf1b69183127..0b30ecd53a52 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,7 +24,6 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27#include <linux/string.h>
28 27
29#if 0 28#if 0
30#define DEBUGP printk 29#define DEBUGP printk
@@ -402,8 +401,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
402} 401}
403 402
404/* sysfs output in /sys/modules/XYZ/parameters/ */ 403/* sysfs output in /sys/modules/XYZ/parameters/ */
405#define to_module_attr(n) container_of(n, struct module_attribute, attr); 404#define to_module_attr(n) container_of(n, struct module_attribute, attr)
406#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); 405#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
407 406
408extern struct kernel_param __start___param[], __stop___param[]; 407extern struct kernel_param __start___param[], __stop___param[];
409 408
@@ -421,7 +420,7 @@ struct module_param_attrs
421}; 420};
422 421
423#ifdef CONFIG_SYSFS 422#ifdef CONFIG_SYSFS
424#define to_param_attr(n) container_of(n, struct param_attribute, mattr); 423#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
425 424
426static ssize_t param_attr_show(struct module_attribute *mattr, 425static ssize_t param_attr_show(struct module_attribute *mattr,
427 struct module *mod, char *buf) 426 struct module *mod, char *buf)
@@ -517,6 +516,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
517 new->grp.attrs = attrs; 516 new->grp.attrs = attrs;
518 517
519 /* Tack new one on the end. */ 518 /* Tack new one on the end. */
519 sysfs_attr_init(&new->attrs[num].mattr.attr);
520 new->attrs[num].param = kp; 520 new->attrs[num].param = kp;
521 new->attrs[num].mattr.show = param_attr_show; 521 new->attrs[num].mattr.show = param_attr_show;
522 new->attrs[num].mattr.store = param_attr_store; 522 new->attrs[num].mattr.store = param_attr_store;
@@ -723,7 +723,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
723 return ret; 723 return ret;
724} 724}
725 725
726static struct sysfs_ops module_sysfs_ops = { 726static const struct sysfs_ops module_sysfs_ops = {
727 .show = module_attr_show, 727 .show = module_attr_show,
728 .store = module_attr_store, 728 .store = module_attr_store,
729}; 729};
@@ -737,7 +737,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj)
737 return 0; 737 return 0;
738} 738}
739 739
740static struct kset_uevent_ops module_uevent_ops = { 740static const struct kset_uevent_ops module_uevent_ops = {
741 .filter = uevent_filter, 741 .filter = uevent_filter,
742}; 742};
743 743
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 1f38270f08c7..4393b9e73740 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -56,21 +56,6 @@ static atomic_t nr_task_events __read_mostly;
56 */ 56 */
57int sysctl_perf_event_paranoid __read_mostly = 1; 57int sysctl_perf_event_paranoid __read_mostly = 1;
58 58
59static inline bool perf_paranoid_tracepoint_raw(void)
60{
61 return sysctl_perf_event_paranoid > -1;
62}
63
64static inline bool perf_paranoid_cpu(void)
65{
66 return sysctl_perf_event_paranoid > 0;
67}
68
69static inline bool perf_paranoid_kernel(void)
70{
71 return sysctl_perf_event_paranoid > 1;
72}
73
74int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 59int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
75 60
76/* 61/*
@@ -98,11 +83,12 @@ void __weak hw_perf_enable(void) { barrier(); }
98 83
99void __weak hw_perf_event_setup(int cpu) { barrier(); } 84void __weak hw_perf_event_setup(int cpu) { barrier(); }
100void __weak hw_perf_event_setup_online(int cpu) { barrier(); } 85void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
86void __weak hw_perf_event_setup_offline(int cpu) { barrier(); }
101 87
102int __weak 88int __weak
103hw_perf_group_sched_in(struct perf_event *group_leader, 89hw_perf_group_sched_in(struct perf_event *group_leader,
104 struct perf_cpu_context *cpuctx, 90 struct perf_cpu_context *cpuctx,
105 struct perf_event_context *ctx, int cpu) 91 struct perf_event_context *ctx)
106{ 92{
107 return 0; 93 return 0;
108} 94}
@@ -248,7 +234,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
248 234
249static inline u64 perf_clock(void) 235static inline u64 perf_clock(void)
250{ 236{
251 return cpu_clock(smp_processor_id()); 237 return cpu_clock(raw_smp_processor_id());
252} 238}
253 239
254/* 240/*
@@ -289,6 +275,15 @@ static void update_event_times(struct perf_event *event)
289 event->total_time_running = run_end - event->tstamp_running; 275 event->total_time_running = run_end - event->tstamp_running;
290} 276}
291 277
278static struct list_head *
279ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
280{
281 if (event->attr.pinned)
282 return &ctx->pinned_groups;
283 else
284 return &ctx->flexible_groups;
285}
286
292/* 287/*
293 * Add a event from the lists for its context. 288 * Add a event from the lists for its context.
294 * Must be called with ctx->mutex and ctx->lock held. 289 * Must be called with ctx->mutex and ctx->lock held.
@@ -303,9 +298,19 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
303 * add it straight to the context's event list, or to the group 298 * add it straight to the context's event list, or to the group
304 * leader's sibling list: 299 * leader's sibling list:
305 */ 300 */
306 if (group_leader == event) 301 if (group_leader == event) {
307 list_add_tail(&event->group_entry, &ctx->group_list); 302 struct list_head *list;
308 else { 303
304 if (is_software_event(event))
305 event->group_flags |= PERF_GROUP_SOFTWARE;
306
307 list = ctx_group_list(event, ctx);
308 list_add_tail(&event->group_entry, list);
309 } else {
310 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
311 !is_software_event(event))
312 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
313
309 list_add_tail(&event->group_entry, &group_leader->sibling_list); 314 list_add_tail(&event->group_entry, &group_leader->sibling_list);
310 group_leader->nr_siblings++; 315 group_leader->nr_siblings++;
311 } 316 }
@@ -355,9 +360,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
355 * to the context list directly: 360 * to the context list directly:
356 */ 361 */
357 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 362 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
363 struct list_head *list;
358 364
359 list_move_tail(&sibling->group_entry, &ctx->group_list); 365 list = ctx_group_list(event, ctx);
366 list_move_tail(&sibling->group_entry, list);
360 sibling->group_leader = sibling; 367 sibling->group_leader = sibling;
368
369 /* Inherit group flags from the previous leader */
370 sibling->group_flags = event->group_flags;
361 } 371 }
362} 372}
363 373
@@ -608,14 +618,13 @@ void perf_event_disable(struct perf_event *event)
608static int 618static int
609event_sched_in(struct perf_event *event, 619event_sched_in(struct perf_event *event,
610 struct perf_cpu_context *cpuctx, 620 struct perf_cpu_context *cpuctx,
611 struct perf_event_context *ctx, 621 struct perf_event_context *ctx)
612 int cpu)
613{ 622{
614 if (event->state <= PERF_EVENT_STATE_OFF) 623 if (event->state <= PERF_EVENT_STATE_OFF)
615 return 0; 624 return 0;
616 625
617 event->state = PERF_EVENT_STATE_ACTIVE; 626 event->state = PERF_EVENT_STATE_ACTIVE;
618 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ 627 event->oncpu = smp_processor_id();
619 /* 628 /*
620 * The new state must be visible before we turn it on in the hardware: 629 * The new state must be visible before we turn it on in the hardware:
621 */ 630 */
@@ -642,8 +651,7 @@ event_sched_in(struct perf_event *event,
642static int 651static int
643group_sched_in(struct perf_event *group_event, 652group_sched_in(struct perf_event *group_event,
644 struct perf_cpu_context *cpuctx, 653 struct perf_cpu_context *cpuctx,
645 struct perf_event_context *ctx, 654 struct perf_event_context *ctx)
646 int cpu)
647{ 655{
648 struct perf_event *event, *partial_group; 656 struct perf_event *event, *partial_group;
649 int ret; 657 int ret;
@@ -651,18 +659,18 @@ group_sched_in(struct perf_event *group_event,
651 if (group_event->state == PERF_EVENT_STATE_OFF) 659 if (group_event->state == PERF_EVENT_STATE_OFF)
652 return 0; 660 return 0;
653 661
654 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); 662 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
655 if (ret) 663 if (ret)
656 return ret < 0 ? ret : 0; 664 return ret < 0 ? ret : 0;
657 665
658 if (event_sched_in(group_event, cpuctx, ctx, cpu)) 666 if (event_sched_in(group_event, cpuctx, ctx))
659 return -EAGAIN; 667 return -EAGAIN;
660 668
661 /* 669 /*
662 * Schedule in siblings as one group (if any): 670 * Schedule in siblings as one group (if any):
663 */ 671 */
664 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 672 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
665 if (event_sched_in(event, cpuctx, ctx, cpu)) { 673 if (event_sched_in(event, cpuctx, ctx)) {
666 partial_group = event; 674 partial_group = event;
667 goto group_error; 675 goto group_error;
668 } 676 }
@@ -686,24 +694,6 @@ group_error:
686} 694}
687 695
688/* 696/*
689 * Return 1 for a group consisting entirely of software events,
690 * 0 if the group contains any hardware events.
691 */
692static int is_software_only_group(struct perf_event *leader)
693{
694 struct perf_event *event;
695
696 if (!is_software_event(leader))
697 return 0;
698
699 list_for_each_entry(event, &leader->sibling_list, group_entry)
700 if (!is_software_event(event))
701 return 0;
702
703 return 1;
704}
705
706/*
707 * Work out whether we can put this event group on the CPU now. 697 * Work out whether we can put this event group on the CPU now.
708 */ 698 */
709static int group_can_go_on(struct perf_event *event, 699static int group_can_go_on(struct perf_event *event,
@@ -713,7 +703,7 @@ static int group_can_go_on(struct perf_event *event,
713 /* 703 /*
714 * Groups consisting entirely of software events can always go on. 704 * Groups consisting entirely of software events can always go on.
715 */ 705 */
716 if (is_software_only_group(event)) 706 if (event->group_flags & PERF_GROUP_SOFTWARE)
717 return 1; 707 return 1;
718 /* 708 /*
719 * If an exclusive group is already on, no other hardware 709 * If an exclusive group is already on, no other hardware
@@ -754,7 +744,6 @@ static void __perf_install_in_context(void *info)
754 struct perf_event *event = info; 744 struct perf_event *event = info;
755 struct perf_event_context *ctx = event->ctx; 745 struct perf_event_context *ctx = event->ctx;
756 struct perf_event *leader = event->group_leader; 746 struct perf_event *leader = event->group_leader;
757 int cpu = smp_processor_id();
758 int err; 747 int err;
759 748
760 /* 749 /*
@@ -801,7 +790,7 @@ static void __perf_install_in_context(void *info)
801 if (!group_can_go_on(event, cpuctx, 1)) 790 if (!group_can_go_on(event, cpuctx, 1))
802 err = -EEXIST; 791 err = -EEXIST;
803 else 792 else
804 err = event_sched_in(event, cpuctx, ctx, cpu); 793 err = event_sched_in(event, cpuctx, ctx);
805 794
806 if (err) { 795 if (err) {
807 /* 796 /*
@@ -943,11 +932,9 @@ static void __perf_event_enable(void *info)
943 } else { 932 } else {
944 perf_disable(); 933 perf_disable();
945 if (event == leader) 934 if (event == leader)
946 err = group_sched_in(event, cpuctx, ctx, 935 err = group_sched_in(event, cpuctx, ctx);
947 smp_processor_id());
948 else 936 else
949 err = event_sched_in(event, cpuctx, ctx, 937 err = event_sched_in(event, cpuctx, ctx);
950 smp_processor_id());
951 perf_enable(); 938 perf_enable();
952 } 939 }
953 940
@@ -1043,8 +1030,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1043 return 0; 1030 return 0;
1044} 1031}
1045 1032
1046void __perf_event_sched_out(struct perf_event_context *ctx, 1033enum event_type_t {
1047 struct perf_cpu_context *cpuctx) 1034 EVENT_FLEXIBLE = 0x1,
1035 EVENT_PINNED = 0x2,
1036 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1037};
1038
1039static void ctx_sched_out(struct perf_event_context *ctx,
1040 struct perf_cpu_context *cpuctx,
1041 enum event_type_t event_type)
1048{ 1042{
1049 struct perf_event *event; 1043 struct perf_event *event;
1050 1044
@@ -1055,10 +1049,18 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1055 update_context_time(ctx); 1049 update_context_time(ctx);
1056 1050
1057 perf_disable(); 1051 perf_disable();
1058 if (ctx->nr_active) { 1052 if (!ctx->nr_active)
1059 list_for_each_entry(event, &ctx->group_list, group_entry) 1053 goto out_enable;
1054
1055 if (event_type & EVENT_PINNED)
1056 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1060 group_sched_out(event, cpuctx, ctx); 1057 group_sched_out(event, cpuctx, ctx);
1061 } 1058
1059 if (event_type & EVENT_FLEXIBLE)
1060 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1061 group_sched_out(event, cpuctx, ctx);
1062
1063 out_enable:
1062 perf_enable(); 1064 perf_enable();
1063 out: 1065 out:
1064 raw_spin_unlock(&ctx->lock); 1066 raw_spin_unlock(&ctx->lock);
@@ -1170,9 +1172,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1170 * not restart the event. 1172 * not restart the event.
1171 */ 1173 */
1172void perf_event_task_sched_out(struct task_struct *task, 1174void perf_event_task_sched_out(struct task_struct *task,
1173 struct task_struct *next, int cpu) 1175 struct task_struct *next)
1174{ 1176{
1175 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1177 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1176 struct perf_event_context *ctx = task->perf_event_ctxp; 1178 struct perf_event_context *ctx = task->perf_event_ctxp;
1177 struct perf_event_context *next_ctx; 1179 struct perf_event_context *next_ctx;
1178 struct perf_event_context *parent; 1180 struct perf_event_context *parent;
@@ -1220,15 +1222,13 @@ void perf_event_task_sched_out(struct task_struct *task,
1220 rcu_read_unlock(); 1222 rcu_read_unlock();
1221 1223
1222 if (do_switch) { 1224 if (do_switch) {
1223 __perf_event_sched_out(ctx, cpuctx); 1225 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1224 cpuctx->task_ctx = NULL; 1226 cpuctx->task_ctx = NULL;
1225 } 1227 }
1226} 1228}
1227 1229
1228/* 1230static void task_ctx_sched_out(struct perf_event_context *ctx,
1229 * Called with IRQs disabled 1231 enum event_type_t event_type)
1230 */
1231static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1232{ 1232{
1233 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1233 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1234 1234
@@ -1238,47 +1238,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1238 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 1238 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1239 return; 1239 return;
1240 1240
1241 __perf_event_sched_out(ctx, cpuctx); 1241 ctx_sched_out(ctx, cpuctx, event_type);
1242 cpuctx->task_ctx = NULL; 1242 cpuctx->task_ctx = NULL;
1243} 1243}
1244 1244
1245/* 1245/*
1246 * Called with IRQs disabled 1246 * Called with IRQs disabled
1247 */ 1247 */
1248static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) 1248static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1249{
1250 task_ctx_sched_out(ctx, EVENT_ALL);
1251}
1252
1253/*
1254 * Called with IRQs disabled
1255 */
1256static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1257 enum event_type_t event_type)
1249{ 1258{
1250 __perf_event_sched_out(&cpuctx->ctx, cpuctx); 1259 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1251} 1260}
1252 1261
1253static void 1262static void
1254__perf_event_sched_in(struct perf_event_context *ctx, 1263ctx_pinned_sched_in(struct perf_event_context *ctx,
1255 struct perf_cpu_context *cpuctx, int cpu) 1264 struct perf_cpu_context *cpuctx)
1256{ 1265{
1257 struct perf_event *event; 1266 struct perf_event *event;
1258 int can_add_hw = 1;
1259 1267
1260 raw_spin_lock(&ctx->lock); 1268 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1261 ctx->is_active = 1; 1269 if (event->state <= PERF_EVENT_STATE_OFF)
1262 if (likely(!ctx->nr_events))
1263 goto out;
1264
1265 ctx->timestamp = perf_clock();
1266
1267 perf_disable();
1268
1269 /*
1270 * First go through the list and put on any pinned groups
1271 * in order to give them the best chance of going on.
1272 */
1273 list_for_each_entry(event, &ctx->group_list, group_entry) {
1274 if (event->state <= PERF_EVENT_STATE_OFF ||
1275 !event->attr.pinned)
1276 continue; 1270 continue;
1277 if (event->cpu != -1 && event->cpu != cpu) 1271 if (event->cpu != -1 && event->cpu != smp_processor_id())
1278 continue; 1272 continue;
1279 1273
1280 if (group_can_go_on(event, cpuctx, 1)) 1274 if (group_can_go_on(event, cpuctx, 1))
1281 group_sched_in(event, cpuctx, ctx, cpu); 1275 group_sched_in(event, cpuctx, ctx);
1282 1276
1283 /* 1277 /*
1284 * If this pinned group hasn't been scheduled, 1278 * If this pinned group hasn't been scheduled,
@@ -1289,32 +1283,83 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1289 event->state = PERF_EVENT_STATE_ERROR; 1283 event->state = PERF_EVENT_STATE_ERROR;
1290 } 1284 }
1291 } 1285 }
1286}
1292 1287
1293 list_for_each_entry(event, &ctx->group_list, group_entry) { 1288static void
1294 /* 1289ctx_flexible_sched_in(struct perf_event_context *ctx,
1295 * Ignore events in OFF or ERROR state, and 1290 struct perf_cpu_context *cpuctx)
1296 * ignore pinned events since we did them already. 1291{
1297 */ 1292 struct perf_event *event;
1298 if (event->state <= PERF_EVENT_STATE_OFF || 1293 int can_add_hw = 1;
1299 event->attr.pinned)
1300 continue;
1301 1294
1295 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1296 /* Ignore events in OFF or ERROR state */
1297 if (event->state <= PERF_EVENT_STATE_OFF)
1298 continue;
1302 /* 1299 /*
1303 * Listen to the 'cpu' scheduling filter constraint 1300 * Listen to the 'cpu' scheduling filter constraint
1304 * of events: 1301 * of events:
1305 */ 1302 */
1306 if (event->cpu != -1 && event->cpu != cpu) 1303 if (event->cpu != -1 && event->cpu != smp_processor_id())
1307 continue; 1304 continue;
1308 1305
1309 if (group_can_go_on(event, cpuctx, can_add_hw)) 1306 if (group_can_go_on(event, cpuctx, can_add_hw))
1310 if (group_sched_in(event, cpuctx, ctx, cpu)) 1307 if (group_sched_in(event, cpuctx, ctx))
1311 can_add_hw = 0; 1308 can_add_hw = 0;
1312 } 1309 }
1310}
1311
1312static void
1313ctx_sched_in(struct perf_event_context *ctx,
1314 struct perf_cpu_context *cpuctx,
1315 enum event_type_t event_type)
1316{
1317 raw_spin_lock(&ctx->lock);
1318 ctx->is_active = 1;
1319 if (likely(!ctx->nr_events))
1320 goto out;
1321
1322 ctx->timestamp = perf_clock();
1323
1324 perf_disable();
1325
1326 /*
1327 * First go through the list and put on any pinned groups
1328 * in order to give them the best chance of going on.
1329 */
1330 if (event_type & EVENT_PINNED)
1331 ctx_pinned_sched_in(ctx, cpuctx);
1332
1333 /* Then walk through the lower prio flexible groups */
1334 if (event_type & EVENT_FLEXIBLE)
1335 ctx_flexible_sched_in(ctx, cpuctx);
1336
1313 perf_enable(); 1337 perf_enable();
1314 out: 1338 out:
1315 raw_spin_unlock(&ctx->lock); 1339 raw_spin_unlock(&ctx->lock);
1316} 1340}
1317 1341
1342static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1343 enum event_type_t event_type)
1344{
1345 struct perf_event_context *ctx = &cpuctx->ctx;
1346
1347 ctx_sched_in(ctx, cpuctx, event_type);
1348}
1349
1350static void task_ctx_sched_in(struct task_struct *task,
1351 enum event_type_t event_type)
1352{
1353 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1354 struct perf_event_context *ctx = task->perf_event_ctxp;
1355
1356 if (likely(!ctx))
1357 return;
1358 if (cpuctx->task_ctx == ctx)
1359 return;
1360 ctx_sched_in(ctx, cpuctx, event_type);
1361 cpuctx->task_ctx = ctx;
1362}
1318/* 1363/*
1319 * Called from scheduler to add the events of the current task 1364 * Called from scheduler to add the events of the current task
1320 * with interrupts disabled. 1365 * with interrupts disabled.
@@ -1326,38 +1371,128 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1326 * accessing the event control register. If a NMI hits, then it will 1371 * accessing the event control register. If a NMI hits, then it will
1327 * keep the event running. 1372 * keep the event running.
1328 */ 1373 */
1329void perf_event_task_sched_in(struct task_struct *task, int cpu) 1374void perf_event_task_sched_in(struct task_struct *task)
1330{ 1375{
1331 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1376 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1332 struct perf_event_context *ctx = task->perf_event_ctxp; 1377 struct perf_event_context *ctx = task->perf_event_ctxp;
1333 1378
1334 if (likely(!ctx)) 1379 if (likely(!ctx))
1335 return; 1380 return;
1381
1336 if (cpuctx->task_ctx == ctx) 1382 if (cpuctx->task_ctx == ctx)
1337 return; 1383 return;
1338 __perf_event_sched_in(ctx, cpuctx, cpu); 1384
1385 /*
1386 * We want to keep the following priority order:
1387 * cpu pinned (that don't need to move), task pinned,
1388 * cpu flexible, task flexible.
1389 */
1390 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1391
1392 ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
1393 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1394 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1395
1339 cpuctx->task_ctx = ctx; 1396 cpuctx->task_ctx = ctx;
1340} 1397}
1341 1398
1342static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) 1399#define MAX_INTERRUPTS (~0ULL)
1400
1401static void perf_log_throttle(struct perf_event *event, int enable);
1402
1403static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1343{ 1404{
1344 struct perf_event_context *ctx = &cpuctx->ctx; 1405 u64 frequency = event->attr.sample_freq;
1406 u64 sec = NSEC_PER_SEC;
1407 u64 divisor, dividend;
1408
1409 int count_fls, nsec_fls, frequency_fls, sec_fls;
1410
1411 count_fls = fls64(count);
1412 nsec_fls = fls64(nsec);
1413 frequency_fls = fls64(frequency);
1414 sec_fls = 30;
1415
1416 /*
1417 * We got @count in @nsec, with a target of sample_freq HZ
1418 * the target period becomes:
1419 *
1420 * @count * 10^9
1421 * period = -------------------
1422 * @nsec * sample_freq
1423 *
1424 */
1425
1426 /*
1427 * Reduce accuracy by one bit such that @a and @b converge
1428 * to a similar magnitude.
1429 */
1430#define REDUCE_FLS(a, b) \
1431do { \
1432 if (a##_fls > b##_fls) { \
1433 a >>= 1; \
1434 a##_fls--; \
1435 } else { \
1436 b >>= 1; \
1437 b##_fls--; \
1438 } \
1439} while (0)
1440
1441 /*
1442 * Reduce accuracy until either term fits in a u64, then proceed with
1443 * the other, so that finally we can do a u64/u64 division.
1444 */
1445 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1446 REDUCE_FLS(nsec, frequency);
1447 REDUCE_FLS(sec, count);
1448 }
1345 1449
1346 __perf_event_sched_in(ctx, cpuctx, cpu); 1450 if (count_fls + sec_fls > 64) {
1451 divisor = nsec * frequency;
1452
1453 while (count_fls + sec_fls > 64) {
1454 REDUCE_FLS(count, sec);
1455 divisor >>= 1;
1456 }
1457
1458 dividend = count * sec;
1459 } else {
1460 dividend = count * sec;
1461
1462 while (nsec_fls + frequency_fls > 64) {
1463 REDUCE_FLS(nsec, frequency);
1464 dividend >>= 1;
1465 }
1466
1467 divisor = nsec * frequency;
1468 }
1469
1470 return div64_u64(dividend, divisor);
1347} 1471}
1348 1472
1349#define MAX_INTERRUPTS (~0ULL) 1473static void perf_event_stop(struct perf_event *event)
1474{
1475 if (!event->pmu->stop)
1476 return event->pmu->disable(event);
1350 1477
1351static void perf_log_throttle(struct perf_event *event, int enable); 1478 return event->pmu->stop(event);
1479}
1480
1481static int perf_event_start(struct perf_event *event)
1482{
1483 if (!event->pmu->start)
1484 return event->pmu->enable(event);
1485
1486 return event->pmu->start(event);
1487}
1352 1488
1353static void perf_adjust_period(struct perf_event *event, u64 events) 1489static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1354{ 1490{
1355 struct hw_perf_event *hwc = &event->hw; 1491 struct hw_perf_event *hwc = &event->hw;
1356 u64 period, sample_period; 1492 u64 period, sample_period;
1357 s64 delta; 1493 s64 delta;
1358 1494
1359 events *= hwc->sample_period; 1495 period = perf_calculate_period(event, nsec, count);
1360 period = div64_u64(events, event->attr.sample_freq);
1361 1496
1362 delta = (s64)(period - hwc->sample_period); 1497 delta = (s64)(period - hwc->sample_period);
1363 delta = (delta + 7) / 8; /* low pass filter */ 1498 delta = (delta + 7) / 8; /* low pass filter */
@@ -1368,13 +1503,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
1368 sample_period = 1; 1503 sample_period = 1;
1369 1504
1370 hwc->sample_period = sample_period; 1505 hwc->sample_period = sample_period;
1506
1507 if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1508 perf_disable();
1509 perf_event_stop(event);
1510 atomic64_set(&hwc->period_left, 0);
1511 perf_event_start(event);
1512 perf_enable();
1513 }
1371} 1514}
1372 1515
1373static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1516static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1374{ 1517{
1375 struct perf_event *event; 1518 struct perf_event *event;
1376 struct hw_perf_event *hwc; 1519 struct hw_perf_event *hwc;
1377 u64 interrupts, freq; 1520 u64 interrupts, now;
1521 s64 delta;
1378 1522
1379 raw_spin_lock(&ctx->lock); 1523 raw_spin_lock(&ctx->lock);
1380 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 1524 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
@@ -1395,44 +1539,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1395 if (interrupts == MAX_INTERRUPTS) { 1539 if (interrupts == MAX_INTERRUPTS) {
1396 perf_log_throttle(event, 1); 1540 perf_log_throttle(event, 1);
1397 event->pmu->unthrottle(event); 1541 event->pmu->unthrottle(event);
1398 interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1399 } 1542 }
1400 1543
1401 if (!event->attr.freq || !event->attr.sample_freq) 1544 if (!event->attr.freq || !event->attr.sample_freq)
1402 continue; 1545 continue;
1403 1546
1404 /* 1547 event->pmu->read(event);
1405 * if the specified freq < HZ then we need to skip ticks 1548 now = atomic64_read(&event->count);
1406 */ 1549 delta = now - hwc->freq_count_stamp;
1407 if (event->attr.sample_freq < HZ) { 1550 hwc->freq_count_stamp = now;
1408 freq = event->attr.sample_freq;
1409
1410 hwc->freq_count += freq;
1411 hwc->freq_interrupts += interrupts;
1412
1413 if (hwc->freq_count < HZ)
1414 continue;
1415
1416 interrupts = hwc->freq_interrupts;
1417 hwc->freq_interrupts = 0;
1418 hwc->freq_count -= HZ;
1419 } else
1420 freq = HZ;
1421
1422 perf_adjust_period(event, freq * interrupts);
1423 1551
1424 /* 1552 if (delta > 0)
1425 * In order to avoid being stalled by an (accidental) huge 1553 perf_adjust_period(event, TICK_NSEC, delta);
1426 * sample period, force reset the sample period if we didn't
1427 * get any events in this freq period.
1428 */
1429 if (!interrupts) {
1430 perf_disable();
1431 event->pmu->disable(event);
1432 atomic64_set(&hwc->period_left, 0);
1433 event->pmu->enable(event);
1434 perf_enable();
1435 }
1436 } 1554 }
1437 raw_spin_unlock(&ctx->lock); 1555 raw_spin_unlock(&ctx->lock);
1438} 1556}
@@ -1442,26 +1560,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1442 */ 1560 */
1443static void rotate_ctx(struct perf_event_context *ctx) 1561static void rotate_ctx(struct perf_event_context *ctx)
1444{ 1562{
1445 struct perf_event *event;
1446
1447 if (!ctx->nr_events) 1563 if (!ctx->nr_events)
1448 return; 1564 return;
1449 1565
1450 raw_spin_lock(&ctx->lock); 1566 raw_spin_lock(&ctx->lock);
1451 /* 1567
1452 * Rotate the first entry last (works just fine for group events too): 1568 /* Rotate the first entry last of non-pinned groups */
1453 */ 1569 list_rotate_left(&ctx->flexible_groups);
1454 perf_disable();
1455 list_for_each_entry(event, &ctx->group_list, group_entry) {
1456 list_move_tail(&event->group_entry, &ctx->group_list);
1457 break;
1458 }
1459 perf_enable();
1460 1570
1461 raw_spin_unlock(&ctx->lock); 1571 raw_spin_unlock(&ctx->lock);
1462} 1572}
1463 1573
1464void perf_event_task_tick(struct task_struct *curr, int cpu) 1574void perf_event_task_tick(struct task_struct *curr)
1465{ 1575{
1466 struct perf_cpu_context *cpuctx; 1576 struct perf_cpu_context *cpuctx;
1467 struct perf_event_context *ctx; 1577 struct perf_event_context *ctx;
@@ -1469,24 +1579,43 @@ void perf_event_task_tick(struct task_struct *curr, int cpu)
1469 if (!atomic_read(&nr_events)) 1579 if (!atomic_read(&nr_events))
1470 return; 1580 return;
1471 1581
1472 cpuctx = &per_cpu(perf_cpu_context, cpu); 1582 cpuctx = &__get_cpu_var(perf_cpu_context);
1473 ctx = curr->perf_event_ctxp; 1583 ctx = curr->perf_event_ctxp;
1474 1584
1585 perf_disable();
1586
1475 perf_ctx_adjust_freq(&cpuctx->ctx); 1587 perf_ctx_adjust_freq(&cpuctx->ctx);
1476 if (ctx) 1588 if (ctx)
1477 perf_ctx_adjust_freq(ctx); 1589 perf_ctx_adjust_freq(ctx);
1478 1590
1479 perf_event_cpu_sched_out(cpuctx); 1591 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1480 if (ctx) 1592 if (ctx)
1481 __perf_event_task_sched_out(ctx); 1593 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
1482 1594
1483 rotate_ctx(&cpuctx->ctx); 1595 rotate_ctx(&cpuctx->ctx);
1484 if (ctx) 1596 if (ctx)
1485 rotate_ctx(ctx); 1597 rotate_ctx(ctx);
1486 1598
1487 perf_event_cpu_sched_in(cpuctx, cpu); 1599 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1488 if (ctx) 1600 if (ctx)
1489 perf_event_task_sched_in(curr, cpu); 1601 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1602
1603 perf_enable();
1604}
1605
1606static int event_enable_on_exec(struct perf_event *event,
1607 struct perf_event_context *ctx)
1608{
1609 if (!event->attr.enable_on_exec)
1610 return 0;
1611
1612 event->attr.enable_on_exec = 0;
1613 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1614 return 0;
1615
1616 __perf_event_mark_enabled(event, ctx);
1617
1618 return 1;
1490} 1619}
1491 1620
1492/* 1621/*
@@ -1499,6 +1628,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1499 struct perf_event *event; 1628 struct perf_event *event;
1500 unsigned long flags; 1629 unsigned long flags;
1501 int enabled = 0; 1630 int enabled = 0;
1631 int ret;
1502 1632
1503 local_irq_save(flags); 1633 local_irq_save(flags);
1504 ctx = task->perf_event_ctxp; 1634 ctx = task->perf_event_ctxp;
@@ -1509,14 +1639,16 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1509 1639
1510 raw_spin_lock(&ctx->lock); 1640 raw_spin_lock(&ctx->lock);
1511 1641
1512 list_for_each_entry(event, &ctx->group_list, group_entry) { 1642 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1513 if (!event->attr.enable_on_exec) 1643 ret = event_enable_on_exec(event, ctx);
1514 continue; 1644 if (ret)
1515 event->attr.enable_on_exec = 0; 1645 enabled = 1;
1516 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1646 }
1517 continue; 1647
1518 __perf_event_mark_enabled(event, ctx); 1648 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1519 enabled = 1; 1649 ret = event_enable_on_exec(event, ctx);
1650 if (ret)
1651 enabled = 1;
1520 } 1652 }
1521 1653
1522 /* 1654 /*
@@ -1527,7 +1659,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1527 1659
1528 raw_spin_unlock(&ctx->lock); 1660 raw_spin_unlock(&ctx->lock);
1529 1661
1530 perf_event_task_sched_in(task, smp_processor_id()); 1662 perf_event_task_sched_in(task);
1531 out: 1663 out:
1532 local_irq_restore(flags); 1664 local_irq_restore(flags);
1533} 1665}
@@ -1590,7 +1722,8 @@ __perf_event_init_context(struct perf_event_context *ctx,
1590{ 1722{
1591 raw_spin_lock_init(&ctx->lock); 1723 raw_spin_lock_init(&ctx->lock);
1592 mutex_init(&ctx->mutex); 1724 mutex_init(&ctx->mutex);
1593 INIT_LIST_HEAD(&ctx->group_list); 1725 INIT_LIST_HEAD(&ctx->pinned_groups);
1726 INIT_LIST_HEAD(&ctx->flexible_groups);
1594 INIT_LIST_HEAD(&ctx->event_list); 1727 INIT_LIST_HEAD(&ctx->event_list);
1595 atomic_set(&ctx->refcount, 1); 1728 atomic_set(&ctx->refcount, 1);
1596 ctx->task = task; 1729 ctx->task = task;
@@ -2462,7 +2595,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2462 if (user_locked > user_lock_limit) 2595 if (user_locked > user_lock_limit)
2463 extra = user_locked - user_lock_limit; 2596 extra = user_locked - user_lock_limit;
2464 2597
2465 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 2598 lock_limit = rlimit(RLIMIT_MEMLOCK);
2466 lock_limit >>= PAGE_SHIFT; 2599 lock_limit >>= PAGE_SHIFT;
2467 locked = vma->vm_mm->locked_vm + extra; 2600 locked = vma->vm_mm->locked_vm + extra;
2468 2601
@@ -3259,8 +3392,6 @@ static void perf_event_task_output(struct perf_event *event,
3259 task_event->event_id.tid = perf_event_tid(event, task); 3392 task_event->event_id.tid = perf_event_tid(event, task);
3260 task_event->event_id.ptid = perf_event_tid(event, current); 3393 task_event->event_id.ptid = perf_event_tid(event, current);
3261 3394
3262 task_event->event_id.time = perf_clock();
3263
3264 perf_output_put(&handle, task_event->event_id); 3395 perf_output_put(&handle, task_event->event_id);
3265 3396
3266 perf_output_end(&handle); 3397 perf_output_end(&handle);
@@ -3268,6 +3399,9 @@ static void perf_event_task_output(struct perf_event *event,
3268 3399
3269static int perf_event_task_match(struct perf_event *event) 3400static int perf_event_task_match(struct perf_event *event)
3270{ 3401{
3402 if (event->state < PERF_EVENT_STATE_INACTIVE)
3403 return 0;
3404
3271 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3405 if (event->cpu != -1 && event->cpu != smp_processor_id())
3272 return 0; 3406 return 0;
3273 3407
@@ -3297,7 +3431,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3297 cpuctx = &get_cpu_var(perf_cpu_context); 3431 cpuctx = &get_cpu_var(perf_cpu_context);
3298 perf_event_task_ctx(&cpuctx->ctx, task_event); 3432 perf_event_task_ctx(&cpuctx->ctx, task_event);
3299 if (!ctx) 3433 if (!ctx)
3300 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3434 ctx = rcu_dereference(current->perf_event_ctxp);
3301 if (ctx) 3435 if (ctx)
3302 perf_event_task_ctx(ctx, task_event); 3436 perf_event_task_ctx(ctx, task_event);
3303 put_cpu_var(perf_cpu_context); 3437 put_cpu_var(perf_cpu_context);
@@ -3328,6 +3462,7 @@ static void perf_event_task(struct task_struct *task,
3328 /* .ppid */ 3462 /* .ppid */
3329 /* .tid */ 3463 /* .tid */
3330 /* .ptid */ 3464 /* .ptid */
3465 .time = perf_clock(),
3331 }, 3466 },
3332 }; 3467 };
3333 3468
@@ -3377,6 +3512,9 @@ static void perf_event_comm_output(struct perf_event *event,
3377 3512
3378static int perf_event_comm_match(struct perf_event *event) 3513static int perf_event_comm_match(struct perf_event *event)
3379{ 3514{
3515 if (event->state < PERF_EVENT_STATE_INACTIVE)
3516 return 0;
3517
3380 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3518 if (event->cpu != -1 && event->cpu != smp_processor_id())
3381 return 0; 3519 return 0;
3382 3520
@@ -3494,6 +3632,9 @@ static void perf_event_mmap_output(struct perf_event *event,
3494static int perf_event_mmap_match(struct perf_event *event, 3632static int perf_event_mmap_match(struct perf_event *event,
3495 struct perf_mmap_event *mmap_event) 3633 struct perf_mmap_event *mmap_event)
3496{ 3634{
3635 if (event->state < PERF_EVENT_STATE_INACTIVE)
3636 return 0;
3637
3497 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3638 if (event->cpu != -1 && event->cpu != smp_processor_id())
3498 return 0; 3639 return 0;
3499 3640
@@ -3600,7 +3741,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3600 /* .tid */ 3741 /* .tid */
3601 .start = vma->vm_start, 3742 .start = vma->vm_start,
3602 .len = vma->vm_end - vma->vm_start, 3743 .len = vma->vm_end - vma->vm_start,
3603 .pgoff = vma->vm_pgoff, 3744 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
3604 }, 3745 },
3605 }; 3746 };
3606 3747
@@ -3680,12 +3821,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3680 3821
3681 if (event->attr.freq) { 3822 if (event->attr.freq) {
3682 u64 now = perf_clock(); 3823 u64 now = perf_clock();
3683 s64 delta = now - hwc->freq_stamp; 3824 s64 delta = now - hwc->freq_time_stamp;
3684 3825
3685 hwc->freq_stamp = now; 3826 hwc->freq_time_stamp = now;
3686 3827
3687 if (delta > 0 && delta < TICK_NSEC) 3828 if (delta > 0 && delta < 2*TICK_NSEC)
3688 perf_adjust_period(event, NSEC_PER_SEC / (int)delta); 3829 perf_adjust_period(event, delta, hwc->last_period);
3689 } 3830 }
3690 3831
3691 /* 3832 /*
@@ -3967,8 +4108,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3967 if (rctx < 0) 4108 if (rctx < 0)
3968 return; 4109 return;
3969 4110
3970 data.addr = addr; 4111 perf_sample_data_init(&data, addr);
3971 data.raw = NULL;
3972 4112
3973 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4113 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
3974 4114
@@ -4013,11 +4153,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4013 struct perf_event *event; 4153 struct perf_event *event;
4014 u64 period; 4154 u64 period;
4015 4155
4016 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 4156 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4017 event->pmu->read(event); 4157 event->pmu->read(event);
4018 4158
4019 data.addr = 0; 4159 perf_sample_data_init(&data, 0);
4020 data.raw = NULL;
4021 data.period = event->hw.last_period; 4160 data.period = event->hw.last_period;
4022 regs = get_irq_regs(); 4161 regs = get_irq_regs();
4023 /* 4162 /*
@@ -4176,22 +4315,20 @@ static const struct pmu perf_ops_task_clock = {
4176 .read = task_clock_perf_event_read, 4315 .read = task_clock_perf_event_read,
4177}; 4316};
4178 4317
4179#ifdef CONFIG_EVENT_PROFILE 4318#ifdef CONFIG_EVENT_TRACING
4180 4319
4181void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4320void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4182 int entry_size) 4321 int entry_size)
4183{ 4322{
4323 struct pt_regs *regs = get_irq_regs();
4324 struct perf_sample_data data;
4184 struct perf_raw_record raw = { 4325 struct perf_raw_record raw = {
4185 .size = entry_size, 4326 .size = entry_size,
4186 .data = record, 4327 .data = record,
4187 }; 4328 };
4188 4329
4189 struct perf_sample_data data = { 4330 perf_sample_data_init(&data, addr);
4190 .addr = addr, 4331 data.raw = &raw;
4191 .raw = &raw,
4192 };
4193
4194 struct pt_regs *regs = get_irq_regs();
4195 4332
4196 if (!regs) 4333 if (!regs)
4197 regs = task_pt_regs(current); 4334 regs = task_pt_regs(current);
@@ -4281,7 +4418,7 @@ static void perf_event_free_filter(struct perf_event *event)
4281{ 4418{
4282} 4419}
4283 4420
4284#endif /* CONFIG_EVENT_PROFILE */ 4421#endif /* CONFIG_EVENT_TRACING */
4285 4422
4286#ifdef CONFIG_HAVE_HW_BREAKPOINT 4423#ifdef CONFIG_HAVE_HW_BREAKPOINT
4287static void bp_perf_event_destroy(struct perf_event *event) 4424static void bp_perf_event_destroy(struct perf_event *event)
@@ -4307,8 +4444,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
4307 struct perf_sample_data sample; 4444 struct perf_sample_data sample;
4308 struct pt_regs *regs = data; 4445 struct pt_regs *regs = data;
4309 4446
4310 sample.raw = NULL; 4447 perf_sample_data_init(&sample, bp->attr.bp_addr);
4311 sample.addr = bp->attr.bp_addr;
4312 4448
4313 if (!perf_exclude_event(bp, regs)) 4449 if (!perf_exclude_event(bp, regs))
4314 perf_swevent_add(bp, 1, 1, &sample, regs); 4450 perf_swevent_add(bp, 1, 1, &sample, regs);
@@ -4571,7 +4707,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
4571 if (attr->type >= PERF_TYPE_MAX) 4707 if (attr->type >= PERF_TYPE_MAX)
4572 return -EINVAL; 4708 return -EINVAL;
4573 4709
4574 if (attr->__reserved_1 || attr->__reserved_2) 4710 if (attr->__reserved_1)
4575 return -EINVAL; 4711 return -EINVAL;
4576 4712
4577 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 4713 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -4862,8 +4998,15 @@ inherit_event(struct perf_event *parent_event,
4862 else 4998 else
4863 child_event->state = PERF_EVENT_STATE_OFF; 4999 child_event->state = PERF_EVENT_STATE_OFF;
4864 5000
4865 if (parent_event->attr.freq) 5001 if (parent_event->attr.freq) {
4866 child_event->hw.sample_period = parent_event->hw.sample_period; 5002 u64 sample_period = parent_event->hw.sample_period;
5003 struct hw_perf_event *hwc = &child_event->hw;
5004
5005 hwc->sample_period = sample_period;
5006 hwc->last_period = sample_period;
5007
5008 atomic64_set(&hwc->period_left, sample_period);
5009 }
4867 5010
4868 child_event->overflow_handler = parent_event->overflow_handler; 5011 child_event->overflow_handler = parent_event->overflow_handler;
4869 5012
@@ -5031,7 +5174,11 @@ void perf_event_exit_task(struct task_struct *child)
5031 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); 5174 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
5032 5175
5033again: 5176again:
5034 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, 5177 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
5178 group_entry)
5179 __perf_event_exit_task(child_event, child_ctx, child);
5180
5181 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
5035 group_entry) 5182 group_entry)
5036 __perf_event_exit_task(child_event, child_ctx, child); 5183 __perf_event_exit_task(child_event, child_ctx, child);
5037 5184
@@ -5040,7 +5187,8 @@ again:
5040 * its siblings to the list, but we obtained 'tmp' before that which 5187 * its siblings to the list, but we obtained 'tmp' before that which
5041 * will still point to the list head terminating the iteration. 5188 * will still point to the list head terminating the iteration.
5042 */ 5189 */
5043 if (!list_empty(&child_ctx->group_list)) 5190 if (!list_empty(&child_ctx->pinned_groups) ||
5191 !list_empty(&child_ctx->flexible_groups))
5044 goto again; 5192 goto again;
5045 5193
5046 mutex_unlock(&child_ctx->mutex); 5194 mutex_unlock(&child_ctx->mutex);
@@ -5048,6 +5196,24 @@ again:
5048 put_ctx(child_ctx); 5196 put_ctx(child_ctx);
5049} 5197}
5050 5198
5199static void perf_free_event(struct perf_event *event,
5200 struct perf_event_context *ctx)
5201{
5202 struct perf_event *parent = event->parent;
5203
5204 if (WARN_ON_ONCE(!parent))
5205 return;
5206
5207 mutex_lock(&parent->child_mutex);
5208 list_del_init(&event->child_list);
5209 mutex_unlock(&parent->child_mutex);
5210
5211 fput(parent->filp);
5212
5213 list_del_event(event, ctx);
5214 free_event(event);
5215}
5216
5051/* 5217/*
5052 * free an unexposed, unused context as created by inheritance by 5218 * free an unexposed, unused context as created by inheritance by
5053 * init_task below, used by fork() in case of fail. 5219 * init_task below, used by fork() in case of fail.
@@ -5062,36 +5228,70 @@ void perf_event_free_task(struct task_struct *task)
5062 5228
5063 mutex_lock(&ctx->mutex); 5229 mutex_lock(&ctx->mutex);
5064again: 5230again:
5065 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { 5231 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5066 struct perf_event *parent = event->parent; 5232 perf_free_event(event, ctx);
5067 5233
5068 if (WARN_ON_ONCE(!parent)) 5234 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5069 continue; 5235 group_entry)
5236 perf_free_event(event, ctx);
5070 5237
5071 mutex_lock(&parent->child_mutex); 5238 if (!list_empty(&ctx->pinned_groups) ||
5072 list_del_init(&event->child_list); 5239 !list_empty(&ctx->flexible_groups))
5073 mutex_unlock(&parent->child_mutex); 5240 goto again;
5074 5241
5075 fput(parent->filp); 5242 mutex_unlock(&ctx->mutex);
5076 5243
5077 list_del_event(event, ctx); 5244 put_ctx(ctx);
5078 free_event(event); 5245}
5246
5247static int
5248inherit_task_group(struct perf_event *event, struct task_struct *parent,
5249 struct perf_event_context *parent_ctx,
5250 struct task_struct *child,
5251 int *inherited_all)
5252{
5253 int ret;
5254 struct perf_event_context *child_ctx = child->perf_event_ctxp;
5255
5256 if (!event->attr.inherit) {
5257 *inherited_all = 0;
5258 return 0;
5079 } 5259 }
5080 5260
5081 if (!list_empty(&ctx->group_list)) 5261 if (!child_ctx) {
5082 goto again; 5262 /*
5263 * This is executed from the parent task context, so
5264 * inherit events that have been marked for cloning.
5265 * First allocate and initialize a context for the
5266 * child.
5267 */
5083 5268
5084 mutex_unlock(&ctx->mutex); 5269 child_ctx = kzalloc(sizeof(struct perf_event_context),
5270 GFP_KERNEL);
5271 if (!child_ctx)
5272 return -ENOMEM;
5085 5273
5086 put_ctx(ctx); 5274 __perf_event_init_context(child_ctx, child);
5275 child->perf_event_ctxp = child_ctx;
5276 get_task_struct(child);
5277 }
5278
5279 ret = inherit_group(event, parent, parent_ctx,
5280 child, child_ctx);
5281
5282 if (ret)
5283 *inherited_all = 0;
5284
5285 return ret;
5087} 5286}
5088 5287
5288
5089/* 5289/*
5090 * Initialize the perf_event context in task_struct 5290 * Initialize the perf_event context in task_struct
5091 */ 5291 */
5092int perf_event_init_task(struct task_struct *child) 5292int perf_event_init_task(struct task_struct *child)
5093{ 5293{
5094 struct perf_event_context *child_ctx = NULL, *parent_ctx; 5294 struct perf_event_context *child_ctx, *parent_ctx;
5095 struct perf_event_context *cloned_ctx; 5295 struct perf_event_context *cloned_ctx;
5096 struct perf_event *event; 5296 struct perf_event *event;
5097 struct task_struct *parent = current; 5297 struct task_struct *parent = current;
@@ -5129,42 +5329,23 @@ int perf_event_init_task(struct task_struct *child)
5129 * We dont have to disable NMIs - we are only looking at 5329 * We dont have to disable NMIs - we are only looking at
5130 * the list, not manipulating it: 5330 * the list, not manipulating it:
5131 */ 5331 */
5132 list_for_each_entry(event, &parent_ctx->group_list, group_entry) { 5332 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5133 5333 ret = inherit_task_group(event, parent, parent_ctx, child,
5134 if (!event->attr.inherit) { 5334 &inherited_all);
5135 inherited_all = 0; 5335 if (ret)
5136 continue; 5336 break;
5137 } 5337 }
5138
5139 if (!child->perf_event_ctxp) {
5140 /*
5141 * This is executed from the parent task context, so
5142 * inherit events that have been marked for cloning.
5143 * First allocate and initialize a context for the
5144 * child.
5145 */
5146
5147 child_ctx = kzalloc(sizeof(struct perf_event_context),
5148 GFP_KERNEL);
5149 if (!child_ctx) {
5150 ret = -ENOMEM;
5151 goto exit;
5152 }
5153
5154 __perf_event_init_context(child_ctx, child);
5155 child->perf_event_ctxp = child_ctx;
5156 get_task_struct(child);
5157 }
5158 5338
5159 ret = inherit_group(event, parent, parent_ctx, 5339 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5160 child, child_ctx); 5340 ret = inherit_task_group(event, parent, parent_ctx, child,
5161 if (ret) { 5341 &inherited_all);
5162 inherited_all = 0; 5342 if (ret)
5163 break; 5343 break;
5164 }
5165 } 5344 }
5166 5345
5167 if (inherited_all) { 5346 child_ctx = child->perf_event_ctxp;
5347
5348 if (child_ctx && inherited_all) {
5168 /* 5349 /*
5169 * Mark the child context as a clone of the parent 5350 * Mark the child context as a clone of the parent
5170 * context, or of whatever the parent is a clone of. 5351 * context, or of whatever the parent is a clone of.
@@ -5184,7 +5365,6 @@ int perf_event_init_task(struct task_struct *child)
5184 get_ctx(child_ctx->parent_ctx); 5365 get_ctx(child_ctx->parent_ctx);
5185 } 5366 }
5186 5367
5187exit:
5188 mutex_unlock(&parent_ctx->mutex); 5368 mutex_unlock(&parent_ctx->mutex);
5189 5369
5190 perf_unpin_context(parent_ctx); 5370 perf_unpin_context(parent_ctx);
@@ -5213,7 +5393,9 @@ static void __perf_event_exit_cpu(void *info)
5213 struct perf_event_context *ctx = &cpuctx->ctx; 5393 struct perf_event_context *ctx = &cpuctx->ctx;
5214 struct perf_event *event, *tmp; 5394 struct perf_event *event, *tmp;
5215 5395
5216 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) 5396 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5397 __perf_event_remove_from_context(event);
5398 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5217 __perf_event_remove_from_context(event); 5399 __perf_event_remove_from_context(event);
5218} 5400}
5219static void perf_event_exit_cpu(int cpu) 5401static void perf_event_exit_cpu(int cpu)
@@ -5251,6 +5433,10 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5251 perf_event_exit_cpu(cpu); 5433 perf_event_exit_cpu(cpu);
5252 break; 5434 break;
5253 5435
5436 case CPU_DEAD:
5437 hw_perf_event_setup_offline(cpu);
5438 break;
5439
5254 default: 5440 default:
5255 break; 5441 break;
5256 } 5442 }
@@ -5275,13 +5461,16 @@ void __init perf_event_init(void)
5275 register_cpu_notifier(&perf_cpu_nb); 5461 register_cpu_notifier(&perf_cpu_nb);
5276} 5462}
5277 5463
5278static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) 5464static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5465 struct sysdev_class_attribute *attr,
5466 char *buf)
5279{ 5467{
5280 return sprintf(buf, "%d\n", perf_reserved_percpu); 5468 return sprintf(buf, "%d\n", perf_reserved_percpu);
5281} 5469}
5282 5470
5283static ssize_t 5471static ssize_t
5284perf_set_reserve_percpu(struct sysdev_class *class, 5472perf_set_reserve_percpu(struct sysdev_class *class,
5473 struct sysdev_class_attribute *attr,
5285 const char *buf, 5474 const char *buf,
5286 size_t count) 5475 size_t count)
5287{ 5476{
@@ -5310,13 +5499,17 @@ perf_set_reserve_percpu(struct sysdev_class *class,
5310 return count; 5499 return count;
5311} 5500}
5312 5501
5313static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) 5502static ssize_t perf_show_overcommit(struct sysdev_class *class,
5503 struct sysdev_class_attribute *attr,
5504 char *buf)
5314{ 5505{
5315 return sprintf(buf, "%d\n", perf_overcommit); 5506 return sprintf(buf, "%d\n", perf_overcommit);
5316} 5507}
5317 5508
5318static ssize_t 5509static ssize_t
5319perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) 5510perf_set_overcommit(struct sysdev_class *class,
5511 struct sysdev_class_attribute *attr,
5512 const char *buf, size_t count)
5320{ 5513{
5321 unsigned long val; 5514 unsigned long val;
5322 int err; 5515 int err;
diff --git a/kernel/pid.c b/kernel/pid.c
index 2e17c9c92cbe..aebb30d9c233 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -367,7 +367,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
367 struct task_struct *result = NULL; 367 struct task_struct *result = NULL;
368 if (pid) { 368 if (pid) {
369 struct hlist_node *first; 369 struct hlist_node *first;
370 first = rcu_dereference(pid->tasks[type].first); 370 first = rcu_dereference_check(pid->tasks[type].first,
371 rcu_read_lock_held() ||
372 lockdep_tasklist_lock_is_held());
371 if (first) 373 if (first)
372 result = hlist_entry(first, struct task_struct, pids[(type)].node); 374 result = hlist_entry(first, struct task_struct, pids[(type)].node);
373 } 375 }
@@ -376,7 +378,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
376EXPORT_SYMBOL(pid_task); 378EXPORT_SYMBOL(pid_task);
377 379
378/* 380/*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 381 * Must be called under rcu_read_lock().
380 */ 382 */
381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 383struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382{ 384{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 86b3796b0436..79aac93acf99 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -161,13 +161,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
161 rcu_read_lock(); 161 rcu_read_lock();
162 162
163 /* 163 /*
164 * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring 164 * Any nested-container's init processes won't ignore the
165 * any nested-container's init processes don't ignore the 165 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
166 * signal
167 */ 166 */
168 task = pid_task(find_vpid(nr), PIDTYPE_PID); 167 task = pid_task(find_vpid(nr), PIDTYPE_PID);
169 if (task) 168 if (task)
170 force_sig(SIGKILL, task); 169 send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
171 170
172 rcu_read_unlock(); 171 rcu_read_unlock();
173 172
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 438ff4523513..1a22dfd42df9 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -982,6 +982,7 @@ static void check_thread_timers(struct task_struct *tsk,
982 int maxfire; 982 int maxfire;
983 struct list_head *timers = tsk->cpu_timers; 983 struct list_head *timers = tsk->cpu_timers;
984 struct signal_struct *const sig = tsk->signal; 984 struct signal_struct *const sig = tsk->signal;
985 unsigned long soft;
985 986
986 maxfire = 20; 987 maxfire = 20;
987 tsk->cputime_expires.prof_exp = cputime_zero; 988 tsk->cputime_expires.prof_exp = cputime_zero;
@@ -1030,9 +1031,10 @@ static void check_thread_timers(struct task_struct *tsk,
1030 /* 1031 /*
1031 * Check for the special case thread timers. 1032 * Check for the special case thread timers.
1032 */ 1033 */
1033 if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { 1034 soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
1034 unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; 1035 if (soft != RLIM_INFINITY) {
1035 unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; 1036 unsigned long hard =
1037 ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
1036 1038
1037 if (hard != RLIM_INFINITY && 1039 if (hard != RLIM_INFINITY &&
1038 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { 1040 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -1043,14 +1045,13 @@ static void check_thread_timers(struct task_struct *tsk,
1043 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1045 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1044 return; 1046 return;
1045 } 1047 }
1046 if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { 1048 if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
1047 /* 1049 /*
1048 * At the soft limit, send a SIGXCPU every second. 1050 * At the soft limit, send a SIGXCPU every second.
1049 */ 1051 */
1050 if (sig->rlim[RLIMIT_RTTIME].rlim_cur 1052 if (soft < hard) {
1051 < sig->rlim[RLIMIT_RTTIME].rlim_max) { 1053 soft += USEC_PER_SEC;
1052 sig->rlim[RLIMIT_RTTIME].rlim_cur += 1054 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
1053 USEC_PER_SEC;
1054 } 1055 }
1055 printk(KERN_INFO 1056 printk(KERN_INFO
1056 "RT Watchdog Timeout: %s[%d]\n", 1057 "RT Watchdog Timeout: %s[%d]\n",
@@ -1121,6 +1122,7 @@ static void check_process_timers(struct task_struct *tsk,
1121 unsigned long long sum_sched_runtime, sched_expires; 1122 unsigned long long sum_sched_runtime, sched_expires;
1122 struct list_head *timers = sig->cpu_timers; 1123 struct list_head *timers = sig->cpu_timers;
1123 struct task_cputime cputime; 1124 struct task_cputime cputime;
1125 unsigned long soft;
1124 1126
1125 /* 1127 /*
1126 * Don't sample the current process CPU clocks if there are no timers. 1128 * Don't sample the current process CPU clocks if there are no timers.
@@ -1193,11 +1195,13 @@ static void check_process_timers(struct task_struct *tsk,
1193 SIGPROF); 1195 SIGPROF);
1194 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, 1196 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1195 SIGVTALRM); 1197 SIGVTALRM);
1196 1198 soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1197 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 1199 if (soft != RLIM_INFINITY) {
1198 unsigned long psecs = cputime_to_secs(ptime); 1200 unsigned long psecs = cputime_to_secs(ptime);
1201 unsigned long hard =
1202 ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
1199 cputime_t x; 1203 cputime_t x;
1200 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { 1204 if (psecs >= hard) {
1201 /* 1205 /*
1202 * At the hard limit, we just die. 1206 * At the hard limit, we just die.
1203 * No need to calculate anything else now. 1207 * No need to calculate anything else now.
@@ -1205,17 +1209,17 @@ static void check_process_timers(struct task_struct *tsk,
1205 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1209 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1206 return; 1210 return;
1207 } 1211 }
1208 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { 1212 if (psecs >= soft) {
1209 /* 1213 /*
1210 * At the soft limit, send a SIGXCPU every second. 1214 * At the soft limit, send a SIGXCPU every second.
1211 */ 1215 */
1212 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 1216 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1213 if (sig->rlim[RLIMIT_CPU].rlim_cur 1217 if (soft < hard) {
1214 < sig->rlim[RLIMIT_CPU].rlim_max) { 1218 soft++;
1215 sig->rlim[RLIMIT_CPU].rlim_cur++; 1219 sig->rlim[RLIMIT_CPU].rlim_cur = soft;
1216 } 1220 }
1217 } 1221 }
1218 x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 1222 x = secs_to_cputime(soft);
1219 if (cputime_eq(prof_expires, cputime_zero) || 1223 if (cputime_eq(prof_expires, cputime_zero) ||
1220 cputime_lt(x, prof_expires)) { 1224 cputime_lt(x, prof_expires)) {
1221 prof_expires = x; 1225 prof_expires = x;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 495440779ce3..00d1fda58ab6 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -256,7 +256,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
256 return 0; 256 return 0;
257} 257}
258 258
259int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) 259static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
260{ 260{
261 *tp = ktime_to_timespec(KTIME_LOW_RES); 261 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0; 262 return 0;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 91e09d3b2eb2..5c36ea9d55d2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,15 @@ config PM_DEBUG
27 code. This is helpful when debugging and reporting PM bugs, like 27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support. 28 suspend support.
29 29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
30config PM_VERBOSE 39config PM_VERBOSE
31 bool "Verbose Power Management debugging" 40 bool "Verbose Power Management debugging"
32 depends on PM_DEBUG 41 depends on PM_DEBUG
@@ -85,6 +94,11 @@ config PM_SLEEP
85 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE 94 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
86 default y 95 default y
87 96
97config PM_SLEEP_ADVANCED_DEBUG
98 bool
99 depends on PM_ADVANCED_DEBUG
100 default n
101
88config SUSPEND 102config SUSPEND
89 bool "Suspend to RAM and standby" 103 bool "Suspend to RAM and standby"
90 depends on PM && ARCH_SUSPEND_POSSIBLE 104 depends on PM && ARCH_SUSPEND_POSSIBLE
@@ -222,3 +236,8 @@ config PM_RUNTIME
222 and the bus type drivers of the buses the devices are on are 236 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and 237 responsible for the actual handling of the autosuspend requests and
224 wake-up events. 238 wake-up events.
239
240config PM_OPS
241 bool
242 depends on PM_SLEEP || PM_RUNTIME
243 default y
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index bbfe472d7524..da5288ec2392 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -323,6 +323,7 @@ static int create_image(int platform_mode)
323int hibernation_snapshot(int platform_mode) 323int hibernation_snapshot(int platform_mode)
324{ 324{
325 int error; 325 int error;
326 gfp_t saved_mask;
326 327
327 error = platform_begin(platform_mode); 328 error = platform_begin(platform_mode);
328 if (error) 329 if (error)
@@ -334,6 +335,7 @@ int hibernation_snapshot(int platform_mode)
334 goto Close; 335 goto Close;
335 336
336 suspend_console(); 337 suspend_console();
338 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
337 error = dpm_suspend_start(PMSG_FREEZE); 339 error = dpm_suspend_start(PMSG_FREEZE);
338 if (error) 340 if (error)
339 goto Recover_platform; 341 goto Recover_platform;
@@ -351,6 +353,7 @@ int hibernation_snapshot(int platform_mode)
351 353
352 dpm_resume_end(in_suspend ? 354 dpm_resume_end(in_suspend ?
353 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 355 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
356 set_gfp_allowed_mask(saved_mask);
354 resume_console(); 357 resume_console();
355 Close: 358 Close:
356 platform_end(platform_mode); 359 platform_end(platform_mode);
@@ -445,14 +448,17 @@ static int resume_target_kernel(bool platform_mode)
445int hibernation_restore(int platform_mode) 448int hibernation_restore(int platform_mode)
446{ 449{
447 int error; 450 int error;
451 gfp_t saved_mask;
448 452
449 pm_prepare_console(); 453 pm_prepare_console();
450 suspend_console(); 454 suspend_console();
455 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
451 error = dpm_suspend_start(PMSG_QUIESCE); 456 error = dpm_suspend_start(PMSG_QUIESCE);
452 if (!error) { 457 if (!error) {
453 error = resume_target_kernel(platform_mode); 458 error = resume_target_kernel(platform_mode);
454 dpm_resume_end(PMSG_RECOVER); 459 dpm_resume_end(PMSG_RECOVER);
455 } 460 }
461 set_gfp_allowed_mask(saved_mask);
456 resume_console(); 462 resume_console();
457 pm_restore_console(); 463 pm_restore_console();
458 return error; 464 return error;
@@ -466,6 +472,7 @@ int hibernation_restore(int platform_mode)
466int hibernation_platform_enter(void) 472int hibernation_platform_enter(void)
467{ 473{
468 int error; 474 int error;
475 gfp_t saved_mask;
469 476
470 if (!hibernation_ops) 477 if (!hibernation_ops)
471 return -ENOSYS; 478 return -ENOSYS;
@@ -481,6 +488,7 @@ int hibernation_platform_enter(void)
481 488
482 entering_platform_hibernation = true; 489 entering_platform_hibernation = true;
483 suspend_console(); 490 suspend_console();
491 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
484 error = dpm_suspend_start(PMSG_HIBERNATE); 492 error = dpm_suspend_start(PMSG_HIBERNATE);
485 if (error) { 493 if (error) {
486 if (hibernation_ops->recover) 494 if (hibernation_ops->recover)
@@ -518,6 +526,7 @@ int hibernation_platform_enter(void)
518 Resume_devices: 526 Resume_devices:
519 entering_platform_hibernation = false; 527 entering_platform_hibernation = false;
520 dpm_resume_end(PMSG_RESTORE); 528 dpm_resume_end(PMSG_RESTORE);
529 set_gfp_allowed_mask(saved_mask);
521 resume_console(); 530 resume_console();
522 531
523 Close: 532 Close:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0998c7139053..b58800b21fc0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val)
44 == NOTIFY_BAD) ? -EINVAL : 0; 44 == NOTIFY_BAD) ? -EINVAL : 0;
45} 45}
46 46
47/* If set, devices may be suspended and resumed asynchronously. */
48int pm_async_enabled = 1;
49
50static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
51 char *buf)
52{
53 return sprintf(buf, "%d\n", pm_async_enabled);
54}
55
56static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
57 const char *buf, size_t n)
58{
59 unsigned long val;
60
61 if (strict_strtoul(buf, 10, &val))
62 return -EINVAL;
63
64 if (val > 1)
65 return -EINVAL;
66
67 pm_async_enabled = val;
68 return n;
69}
70
71power_attr(pm_async);
72
47#ifdef CONFIG_PM_DEBUG 73#ifdef CONFIG_PM_DEBUG
48int pm_test_level = TEST_NONE; 74int pm_test_level = TEST_NONE;
49 75
@@ -208,9 +234,12 @@ static struct attribute * g[] = {
208#ifdef CONFIG_PM_TRACE 234#ifdef CONFIG_PM_TRACE
209 &pm_trace_attr.attr, 235 &pm_trace_attr.attr,
210#endif 236#endif
211#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) 237#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr,
239#ifdef CONFIG_PM_DEBUG
212 &pm_test_attr.attr, 240 &pm_test_attr.attr,
213#endif 241#endif
242#endif
214 NULL, 243 NULL,
215}; 244};
216 245
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 36cb168e4330..830cadecbdfc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1181,7 +1181,7 @@ static void free_unnecessary_pages(void)
1181 1181
1182 memory_bm_position_reset(&copy_bm); 1182 memory_bm_position_reset(&copy_bm);
1183 1183
1184 while (to_free_normal > 0 && to_free_highmem > 0) { 1184 while (to_free_normal > 0 || to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn); 1186 struct page *page = pfn_to_page(pfn);
1187 1187
@@ -1500,7 +1500,7 @@ asmlinkage int swsusp_save(void)
1500{ 1500{
1501 unsigned int nr_pages, nr_highmem; 1501 unsigned int nr_pages, nr_highmem;
1502 1502
1503 printk(KERN_INFO "PM: Creating hibernation image: \n"); 1503 printk(KERN_INFO "PM: Creating hibernation image:\n");
1504 1504
1505 drain_local_pages(NULL); 1505 drain_local_pages(NULL);
1506 nr_pages = count_data_pages(); 1506 nr_pages = count_data_pages();
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6f10dfc2d3e9..44cce10b582d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -189,6 +189,7 @@ static int suspend_enter(suspend_state_t state)
189int suspend_devices_and_enter(suspend_state_t state) 189int suspend_devices_and_enter(suspend_state_t state)
190{ 190{
191 int error; 191 int error;
192 gfp_t saved_mask;
192 193
193 if (!suspend_ops) 194 if (!suspend_ops)
194 return -ENOSYS; 195 return -ENOSYS;
@@ -199,6 +200,7 @@ int suspend_devices_and_enter(suspend_state_t state)
199 goto Close; 200 goto Close;
200 } 201 }
201 suspend_console(); 202 suspend_console();
203 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
202 suspend_test_start(); 204 suspend_test_start();
203 error = dpm_suspend_start(PMSG_SUSPEND); 205 error = dpm_suspend_start(PMSG_SUSPEND);
204 if (error) { 206 if (error) {
@@ -215,6 +217,7 @@ int suspend_devices_and_enter(suspend_state_t state)
215 suspend_test_start(); 217 suspend_test_start();
216 dpm_resume_end(PMSG_RESUME); 218 dpm_resume_end(PMSG_RESUME);
217 suspend_test_finish("resume devices"); 219 suspend_test_finish("resume devices");
220 set_gfp_allowed_mask(saved_mask);
218 resume_console(); 221 resume_console();
219 Close: 222 Close:
220 if (suspend_ops->end) 223 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 09b2b0ae9e9d..1d575733d4e1 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -657,10 +657,6 @@ int swsusp_read(unsigned int *flags_p)
657 struct swsusp_info *header; 657 struct swsusp_info *header;
658 658
659 *flags_p = swsusp_header->flags; 659 *flags_p = swsusp_header->flags;
660 if (IS_ERR(resume_bdev)) {
661 pr_debug("PM: Image device not initialised\n");
662 return PTR_ERR(resume_bdev);
663 }
664 660
665 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 661 memset(&snapshot, 0, sizeof(struct snapshot_handle));
666 error = snapshot_write_next(&snapshot, PAGE_SIZE); 662 error = snapshot_write_next(&snapshot, PAGE_SIZE);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
deleted file mode 100644
index 5b3601bd1893..000000000000
--- a/kernel/power/swsusp.c
+++ /dev/null
@@ -1,58 +0,0 @@
1/*
2 * linux/kernel/power/swsusp.c
3 *
4 * This file provides code to write suspend image to swap and read it back.
5 *
6 * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8 *
9 * This file is released under the GPLv2.
10 *
11 * I'd like to thank the following people for their work:
12 *
13 * Pavel Machek <pavel@ucw.cz>:
14 * Modifications, defectiveness pointing, being with me at the very beginning,
15 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
16 *
17 * Steve Doddi <dirk@loth.demon.co.uk>:
18 * Support the possibility of hardware state restoring.
19 *
20 * Raph <grey.havens@earthling.net>:
21 * Support for preserving states of network devices and virtual console
22 * (including X and svgatextmode)
23 *
24 * Kurt Garloff <garloff@suse.de>:
25 * Straightened the critical function in order to prevent compilers from
26 * playing tricks with local variables.
27 *
28 * Andreas Mohr <a.mohr@mailto.de>
29 *
30 * Alex Badea <vampire@go.ro>:
31 * Fixed runaway init
32 *
33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Reworked the freeing of memory and the handling of swap
35 *
36 * More state savers are welcome. Especially for the scsi layer...
37 *
38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
39 */
40
41#include <linux/mm.h>
42#include <linux/suspend.h>
43#include <linux/spinlock.h>
44#include <linux/kernel.h>
45#include <linux/major.h>
46#include <linux/swap.h>
47#include <linux/pm.h>
48#include <linux/swapops.h>
49#include <linux/bootmem.h>
50#include <linux/syscalls.h>
51#include <linux/highmem.h>
52#include <linux/time.h>
53#include <linux/rbtree.h>
54#include <linux/io.h>
55
56#include "power.h"
57
58int in_suspend __nosavedata = 0;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bf0014d6a5f0..4d2289626a84 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -195,6 +195,15 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
195 return res; 195 return res;
196} 196}
197 197
198static void snapshot_deprecated_ioctl(unsigned int cmd)
199{
200 if (printk_ratelimit())
201 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
202 "be removed soon, update your suspend-to-disk "
203 "utilities\n",
204 __builtin_return_address(0), cmd);
205}
206
198static long snapshot_ioctl(struct file *filp, unsigned int cmd, 207static long snapshot_ioctl(struct file *filp, unsigned int cmd,
199 unsigned long arg) 208 unsigned long arg)
200{ 209{
@@ -246,8 +255,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
246 data->frozen = 0; 255 data->frozen = 0;
247 break; 256 break;
248 257
249 case SNAPSHOT_CREATE_IMAGE:
250 case SNAPSHOT_ATOMIC_SNAPSHOT: 258 case SNAPSHOT_ATOMIC_SNAPSHOT:
259 snapshot_deprecated_ioctl(cmd);
260 case SNAPSHOT_CREATE_IMAGE:
251 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 261 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
252 error = -EPERM; 262 error = -EPERM;
253 break; 263 break;
@@ -275,8 +285,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 data->ready = 0; 285 data->ready = 0;
276 break; 286 break;
277 287
278 case SNAPSHOT_PREF_IMAGE_SIZE:
279 case SNAPSHOT_SET_IMAGE_SIZE: 288 case SNAPSHOT_SET_IMAGE_SIZE:
289 snapshot_deprecated_ioctl(cmd);
290 case SNAPSHOT_PREF_IMAGE_SIZE:
280 image_size = arg; 291 image_size = arg;
281 break; 292 break;
282 293
@@ -290,15 +301,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
290 error = put_user(size, (loff_t __user *)arg); 301 error = put_user(size, (loff_t __user *)arg);
291 break; 302 break;
292 303
293 case SNAPSHOT_AVAIL_SWAP_SIZE:
294 case SNAPSHOT_AVAIL_SWAP: 304 case SNAPSHOT_AVAIL_SWAP:
305 snapshot_deprecated_ioctl(cmd);
306 case SNAPSHOT_AVAIL_SWAP_SIZE:
295 size = count_swap_pages(data->swap, 1); 307 size = count_swap_pages(data->swap, 1);
296 size <<= PAGE_SHIFT; 308 size <<= PAGE_SHIFT;
297 error = put_user(size, (loff_t __user *)arg); 309 error = put_user(size, (loff_t __user *)arg);
298 break; 310 break;
299 311
300 case SNAPSHOT_ALLOC_SWAP_PAGE:
301 case SNAPSHOT_GET_SWAP_PAGE: 312 case SNAPSHOT_GET_SWAP_PAGE:
313 snapshot_deprecated_ioctl(cmd);
314 case SNAPSHOT_ALLOC_SWAP_PAGE:
302 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 315 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
303 error = -ENODEV; 316 error = -ENODEV;
304 break; 317 break;
@@ -321,6 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
321 break; 334 break;
322 335
323 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ 336 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
337 snapshot_deprecated_ioctl(cmd);
324 if (!swsusp_swap_in_use()) { 338 if (!swsusp_swap_in_use()) {
325 /* 339 /*
326 * User space encodes device types as two-byte values, 340 * User space encodes device types as two-byte values,
@@ -362,6 +376,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
362 break; 376 break;
363 377
364 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ 378 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
379 snapshot_deprecated_ioctl(cmd);
365 error = -EINVAL; 380 error = -EINVAL;
366 381
367 switch (arg) { 382 switch (arg) {
diff --git a/kernel/printk.c b/kernel/printk.c
index 17463ca2e229..75077ad0b537 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -35,6 +35,7 @@
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h> 36#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h> 37#include <linux/kmsg_dump.h>
38#include <linux/syslog.h>
38 39
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40 41
@@ -69,8 +70,6 @@ int console_printk[4] = {
69 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 70 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
70}; 71};
71 72
72static int saved_console_loglevel = -1;
73
74/* 73/*
75 * Low level drivers may need that to know if they can schedule in 74 * Low level drivers may need that to know if they can schedule in
76 * their unblank() callback or not. So let's export it. 75 * their unblank() callback or not. So let's export it.
@@ -145,6 +144,7 @@ static char __log_buf[__LOG_BUF_LEN];
145static char *log_buf = __log_buf; 144static char *log_buf = __log_buf;
146static int log_buf_len = __LOG_BUF_LEN; 145static int log_buf_len = __LOG_BUF_LEN;
147static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 146static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
147static int saved_console_loglevel = -1;
148 148
149#ifdef CONFIG_KEXEC 149#ifdef CONFIG_KEXEC
150/* 150/*
@@ -258,38 +258,23 @@ static inline void boot_delay_msec(void)
258} 258}
259#endif 259#endif
260 260
261/* 261int do_syslog(int type, char __user *buf, int len, bool from_file)
262 * Commands to do_syslog:
263 *
264 * 0 -- Close the log. Currently a NOP.
265 * 1 -- Open the log. Currently a NOP.
266 * 2 -- Read from the log.
267 * 3 -- Read all messages remaining in the ring buffer.
268 * 4 -- Read and clear all messages remaining in the ring buffer
269 * 5 -- Clear ring buffer.
270 * 6 -- Disable printk's to console
271 * 7 -- Enable printk's to console
272 * 8 -- Set level of messages printed to console
273 * 9 -- Return number of unread characters in the log buffer
274 * 10 -- Return size of the log buffer
275 */
276int do_syslog(int type, char __user *buf, int len)
277{ 262{
278 unsigned i, j, limit, count; 263 unsigned i, j, limit, count;
279 int do_clear = 0; 264 int do_clear = 0;
280 char c; 265 char c;
281 int error = 0; 266 int error = 0;
282 267
283 error = security_syslog(type); 268 error = security_syslog(type, from_file);
284 if (error) 269 if (error)
285 return error; 270 return error;
286 271
287 switch (type) { 272 switch (type) {
288 case 0: /* Close log */ 273 case SYSLOG_ACTION_CLOSE: /* Close log */
289 break; 274 break;
290 case 1: /* Open log */ 275 case SYSLOG_ACTION_OPEN: /* Open log */
291 break; 276 break;
292 case 2: /* Read from log */ 277 case SYSLOG_ACTION_READ: /* Read from log */
293 error = -EINVAL; 278 error = -EINVAL;
294 if (!buf || len < 0) 279 if (!buf || len < 0)
295 goto out; 280 goto out;
@@ -320,10 +305,12 @@ int do_syslog(int type, char __user *buf, int len)
320 if (!error) 305 if (!error)
321 error = i; 306 error = i;
322 break; 307 break;
323 case 4: /* Read/clear last kernel messages */ 308 /* Read/clear last kernel messages */
309 case SYSLOG_ACTION_READ_CLEAR:
324 do_clear = 1; 310 do_clear = 1;
325 /* FALL THRU */ 311 /* FALL THRU */
326 case 3: /* Read last kernel messages */ 312 /* Read last kernel messages */
313 case SYSLOG_ACTION_READ_ALL:
327 error = -EINVAL; 314 error = -EINVAL;
328 if (!buf || len < 0) 315 if (!buf || len < 0)
329 goto out; 316 goto out;
@@ -376,21 +363,25 @@ int do_syslog(int type, char __user *buf, int len)
376 } 363 }
377 } 364 }
378 break; 365 break;
379 case 5: /* Clear ring buffer */ 366 /* Clear ring buffer */
367 case SYSLOG_ACTION_CLEAR:
380 logged_chars = 0; 368 logged_chars = 0;
381 break; 369 break;
382 case 6: /* Disable logging to console */ 370 /* Disable logging to console */
371 case SYSLOG_ACTION_CONSOLE_OFF:
383 if (saved_console_loglevel == -1) 372 if (saved_console_loglevel == -1)
384 saved_console_loglevel = console_loglevel; 373 saved_console_loglevel = console_loglevel;
385 console_loglevel = minimum_console_loglevel; 374 console_loglevel = minimum_console_loglevel;
386 break; 375 break;
387 case 7: /* Enable logging to console */ 376 /* Enable logging to console */
377 case SYSLOG_ACTION_CONSOLE_ON:
388 if (saved_console_loglevel != -1) { 378 if (saved_console_loglevel != -1) {
389 console_loglevel = saved_console_loglevel; 379 console_loglevel = saved_console_loglevel;
390 saved_console_loglevel = -1; 380 saved_console_loglevel = -1;
391 } 381 }
392 break; 382 break;
393 case 8: /* Set level of messages printed to console */ 383 /* Set level of messages printed to console */
384 case SYSLOG_ACTION_CONSOLE_LEVEL:
394 error = -EINVAL; 385 error = -EINVAL;
395 if (len < 1 || len > 8) 386 if (len < 1 || len > 8)
396 goto out; 387 goto out;
@@ -401,10 +392,12 @@ int do_syslog(int type, char __user *buf, int len)
401 saved_console_loglevel = -1; 392 saved_console_loglevel = -1;
402 error = 0; 393 error = 0;
403 break; 394 break;
404 case 9: /* Number of chars in the log buffer */ 395 /* Number of chars in the log buffer */
396 case SYSLOG_ACTION_SIZE_UNREAD:
405 error = log_end - log_start; 397 error = log_end - log_start;
406 break; 398 break;
407 case 10: /* Size of the log buffer */ 399 /* Size of the log buffer */
400 case SYSLOG_ACTION_SIZE_BUFFER:
408 error = log_buf_len; 401 error = log_buf_len;
409 break; 402 break;
410 default: 403 default:
@@ -417,7 +410,7 @@ out:
417 410
418SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 411SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
419{ 412{
420 return do_syslog(type, buf, len); 413 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
421} 414}
422 415
423/* 416/*
@@ -1467,6 +1460,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1467static const char const *kmsg_reasons[] = { 1460static const char const *kmsg_reasons[] = {
1468 [KMSG_DUMP_OOPS] = "oops", 1461 [KMSG_DUMP_OOPS] = "oops",
1469 [KMSG_DUMP_PANIC] = "panic", 1462 [KMSG_DUMP_PANIC] = "panic",
1463 [KMSG_DUMP_KEXEC] = "kexec",
1470}; 1464};
1471 1465
1472static const char *kmsg_to_str(enum kmsg_dump_reason reason) 1466static const char *kmsg_to_str(enum kmsg_dump_reason reason)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 23bd09cd042e..42ad8ae729a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/regset.h>
25 26
26 27
27/* 28/*
@@ -511,6 +512,47 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
511 return 0; 512 return 0;
512} 513}
513 514
515#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
516
517static const struct user_regset *
518find_regset(const struct user_regset_view *view, unsigned int type)
519{
520 const struct user_regset *regset;
521 int n;
522
523 for (n = 0; n < view->n; ++n) {
524 regset = view->regsets + n;
525 if (regset->core_note_type == type)
526 return regset;
527 }
528
529 return NULL;
530}
531
532static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
533 struct iovec *kiov)
534{
535 const struct user_regset_view *view = task_user_regset_view(task);
536 const struct user_regset *regset = find_regset(view, type);
537 int regset_no;
538
539 if (!regset || (kiov->iov_len % regset->size) != 0)
540 return -EINVAL;
541
542 regset_no = regset - view->regsets;
543 kiov->iov_len = min(kiov->iov_len,
544 (__kernel_size_t) (regset->n * regset->size));
545
546 if (req == PTRACE_GETREGSET)
547 return copy_regset_to_user(task, view, regset_no, 0,
548 kiov->iov_len, kiov->iov_base);
549 else
550 return copy_regset_from_user(task, view, regset_no, 0,
551 kiov->iov_len, kiov->iov_base);
552}
553
554#endif
555
514int ptrace_request(struct task_struct *child, long request, 556int ptrace_request(struct task_struct *child, long request,
515 long addr, long data) 557 long addr, long data)
516{ 558{
@@ -573,6 +615,26 @@ int ptrace_request(struct task_struct *child, long request,
573 return 0; 615 return 0;
574 return ptrace_resume(child, request, SIGKILL); 616 return ptrace_resume(child, request, SIGKILL);
575 617
618#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
619 case PTRACE_GETREGSET:
620 case PTRACE_SETREGSET:
621 {
622 struct iovec kiov;
623 struct iovec __user *uiov = (struct iovec __user *) data;
624
625 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
626 return -EFAULT;
627
628 if (__get_user(kiov.iov_base, &uiov->iov_base) ||
629 __get_user(kiov.iov_len, &uiov->iov_len))
630 return -EFAULT;
631
632 ret = ptrace_regset(child, request, addr, &kiov);
633 if (!ret)
634 ret = __put_user(kiov.iov_len, &uiov->iov_len);
635 break;
636 }
637#endif
576 default: 638 default:
577 break; 639 break;
578 } 640 }
@@ -711,6 +773,32 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
711 else 773 else
712 ret = ptrace_setsiginfo(child, &siginfo); 774 ret = ptrace_setsiginfo(child, &siginfo);
713 break; 775 break;
776#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
777 case PTRACE_GETREGSET:
778 case PTRACE_SETREGSET:
779 {
780 struct iovec kiov;
781 struct compat_iovec __user *uiov =
782 (struct compat_iovec __user *) datap;
783 compat_uptr_t ptr;
784 compat_size_t len;
785
786 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
787 return -EFAULT;
788
789 if (__get_user(ptr, &uiov->iov_base) ||
790 __get_user(len, &uiov->iov_len))
791 return -EFAULT;
792
793 kiov.iov_base = compat_ptr(ptr);
794 kiov.iov_len = len;
795
796 ret = ptrace_regset(child, request, addr, &kiov);
797 if (!ret)
798 ret = __put_user(kiov.iov_len, &uiov->iov_len);
799 break;
800 }
801#endif
714 802
715 default: 803 default:
716 ret = ptrace_request(child, request, addr, data); 804 ret = ptrace_request(child, request, addr, data);
diff --git a/kernel/range.c b/kernel/range.c
new file mode 100644
index 000000000000..74e2e6114927
--- /dev/null
+++ b/kernel/range.c
@@ -0,0 +1,163 @@
1/*
2 * Range add and subtract
3 */
4#include <linux/module.h>
5#include <linux/init.h>
6#include <linux/sort.h>
7
8#include <linux/range.h>
9
10#ifndef ARRAY_SIZE
11#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
12#endif
13
14int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
15{
16 if (start >= end)
17 return nr_range;
18
19 /* Out of slots: */
20 if (nr_range >= az)
21 return nr_range;
22
23 range[nr_range].start = start;
24 range[nr_range].end = end;
25
26 nr_range++;
27
28 return nr_range;
29}
30
31int add_range_with_merge(struct range *range, int az, int nr_range,
32 u64 start, u64 end)
33{
34 int i;
35
36 if (start >= end)
37 return nr_range;
38
39 /* Try to merge it with old one: */
40 for (i = 0; i < nr_range; i++) {
41 u64 final_start, final_end;
42 u64 common_start, common_end;
43
44 if (!range[i].end)
45 continue;
46
47 common_start = max(range[i].start, start);
48 common_end = min(range[i].end, end);
49 if (common_start > common_end)
50 continue;
51
52 final_start = min(range[i].start, start);
53 final_end = max(range[i].end, end);
54
55 range[i].start = final_start;
56 range[i].end = final_end;
57 return nr_range;
58 }
59
60 /* Need to add it: */
61 return add_range(range, az, nr_range, start, end);
62}
63
64void subtract_range(struct range *range, int az, u64 start, u64 end)
65{
66 int i, j;
67
68 if (start >= end)
69 return;
70
71 for (j = 0; j < az; j++) {
72 if (!range[j].end)
73 continue;
74
75 if (start <= range[j].start && end >= range[j].end) {
76 range[j].start = 0;
77 range[j].end = 0;
78 continue;
79 }
80
81 if (start <= range[j].start && end < range[j].end &&
82 range[j].start < end) {
83 range[j].start = end;
84 continue;
85 }
86
87
88 if (start > range[j].start && end >= range[j].end &&
89 range[j].end > start) {
90 range[j].end = start;
91 continue;
92 }
93
94 if (start > range[j].start && end < range[j].end) {
95 /* Find the new spare: */
96 for (i = 0; i < az; i++) {
97 if (range[i].end == 0)
98 break;
99 }
100 if (i < az) {
101 range[i].end = range[j].end;
102 range[i].start = end;
103 } else {
104 printk(KERN_ERR "run of slot in ranges\n");
105 }
106 range[j].end = start;
107 continue;
108 }
109 }
110}
111
112static int cmp_range(const void *x1, const void *x2)
113{
114 const struct range *r1 = x1;
115 const struct range *r2 = x2;
116 s64 start1, start2;
117
118 start1 = r1->start;
119 start2 = r2->start;
120
121 return start1 - start2;
122}
123
124int clean_sort_range(struct range *range, int az)
125{
126 int i, j, k = az - 1, nr_range = 0;
127
128 for (i = 0; i < k; i++) {
129 if (range[i].end)
130 continue;
131 for (j = k; j > i; j--) {
132 if (range[j].end) {
133 k = j;
134 break;
135 }
136 }
137 if (j == i)
138 break;
139 range[i].start = range[k].start;
140 range[i].end = range[k].end;
141 range[k].start = 0;
142 range[k].end = 0;
143 k--;
144 }
145 /* count it */
146 for (i = 0; i < az; i++) {
147 if (!range[i].end) {
148 nr_range = i;
149 break;
150 }
151 }
152
153 /* sort them */
154 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
155
156 return nr_range;
157}
158
159void sort_range(struct range *range, int nr_range)
160{
161 /* sort them */
162 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
163}
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 9b7fd4723878..f1125c1a6321 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,14 +44,43 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
47 48
48#ifdef CONFIG_DEBUG_LOCK_ALLOC 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
49static struct lock_class_key rcu_lock_key; 50static struct lock_class_key rcu_lock_key;
50struct lockdep_map rcu_lock_map = 51struct lockdep_map rcu_lock_map =
51 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); 52 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
52EXPORT_SYMBOL_GPL(rcu_lock_map); 53EXPORT_SYMBOL_GPL(rcu_lock_map);
54
55static struct lock_class_key rcu_bh_lock_key;
56struct lockdep_map rcu_bh_lock_map =
57 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key);
58EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
59
60static struct lock_class_key rcu_sched_lock_key;
61struct lockdep_map rcu_sched_lock_map =
62 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
63EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
53#endif 64#endif
54 65
66int rcu_scheduler_active __read_mostly;
67EXPORT_SYMBOL_GPL(rcu_scheduler_active);
68
69/*
70 * This function is invoked towards the end of the scheduler's initialization
71 * process. Before this is called, the idle task might contain
72 * RCU read-side critical sections (during which time, this idle
73 * task is booting the system). After this function is called, the
74 * idle tasks are prohibited from containing RCU read-side critical
75 * sections.
76 */
77void rcu_scheduler_starting(void)
78{
79 WARN_ON(num_online_cpus() != 1);
80 WARN_ON(nr_context_switches() > 0);
81 rcu_scheduler_active = 1;
82}
83
55/* 84/*
56 * Awaken the corresponding synchronize_rcu() instance now that a 85 * Awaken the corresponding synchronize_rcu() instance now that a
57 * grace period has elapsed. 86 * grace period has elapsed.
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9bb52177af02..58df55bf83ed 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -61,6 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */
64static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 67static char *torture_type = "rcu"; /* What RCU implementation to torture. */
65 68
66module_param(nreaders, int, 0444); 69module_param(nreaders, int, 0444);
@@ -79,6 +82,12 @@ module_param(stutter, int, 0444);
79MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); 82MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
80module_param(irqreader, int, 0444); 83module_param(irqreader, int, 0444);
81MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); 84MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
85module_param(fqs_duration, int, 0444);
86MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)");
87module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
82module_param(torture_type, charp, 0444); 91module_param(torture_type, charp, 0444);
83MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
84 93
@@ -99,6 +108,7 @@ static struct task_struct **reader_tasks;
99static struct task_struct *stats_task; 108static struct task_struct *stats_task;
100static struct task_struct *shuffler_task; 109static struct task_struct *shuffler_task;
101static struct task_struct *stutter_task; 110static struct task_struct *stutter_task;
111static struct task_struct *fqs_task;
102 112
103#define RCU_TORTURE_PIPE_LEN 10 113#define RCU_TORTURE_PIPE_LEN 10
104 114
@@ -263,6 +273,7 @@ struct rcu_torture_ops {
263 void (*deferred_free)(struct rcu_torture *p); 273 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 274 void (*sync)(void);
265 void (*cb_barrier)(void); 275 void (*cb_barrier)(void);
276 void (*fqs)(void);
266 int (*stats)(char *page); 277 int (*stats)(char *page);
267 int irq_capable; 278 int irq_capable;
268 char *name; 279 char *name;
@@ -347,6 +358,7 @@ static struct rcu_torture_ops rcu_ops = {
347 .deferred_free = rcu_torture_deferred_free, 358 .deferred_free = rcu_torture_deferred_free,
348 .sync = synchronize_rcu, 359 .sync = synchronize_rcu,
349 .cb_barrier = rcu_barrier, 360 .cb_barrier = rcu_barrier,
361 .fqs = rcu_force_quiescent_state,
350 .stats = NULL, 362 .stats = NULL,
351 .irq_capable = 1, 363 .irq_capable = 1,
352 .name = "rcu" 364 .name = "rcu"
@@ -388,6 +400,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
388 .deferred_free = rcu_sync_torture_deferred_free, 400 .deferred_free = rcu_sync_torture_deferred_free,
389 .sync = synchronize_rcu, 401 .sync = synchronize_rcu,
390 .cb_barrier = NULL, 402 .cb_barrier = NULL,
403 .fqs = rcu_force_quiescent_state,
391 .stats = NULL, 404 .stats = NULL,
392 .irq_capable = 1, 405 .irq_capable = 1,
393 .name = "rcu_sync" 406 .name = "rcu_sync"
@@ -403,6 +416,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
403 .deferred_free = rcu_sync_torture_deferred_free, 416 .deferred_free = rcu_sync_torture_deferred_free,
404 .sync = synchronize_rcu_expedited, 417 .sync = synchronize_rcu_expedited,
405 .cb_barrier = NULL, 418 .cb_barrier = NULL,
419 .fqs = rcu_force_quiescent_state,
406 .stats = NULL, 420 .stats = NULL,
407 .irq_capable = 1, 421 .irq_capable = 1,
408 .name = "rcu_expedited" 422 .name = "rcu_expedited"
@@ -465,6 +479,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
465 .deferred_free = rcu_bh_torture_deferred_free, 479 .deferred_free = rcu_bh_torture_deferred_free,
466 .sync = rcu_bh_torture_synchronize, 480 .sync = rcu_bh_torture_synchronize,
467 .cb_barrier = rcu_barrier_bh, 481 .cb_barrier = rcu_barrier_bh,
482 .fqs = rcu_bh_force_quiescent_state,
468 .stats = NULL, 483 .stats = NULL,
469 .irq_capable = 1, 484 .irq_capable = 1,
470 .name = "rcu_bh" 485 .name = "rcu_bh"
@@ -480,6 +495,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
480 .deferred_free = rcu_sync_torture_deferred_free, 495 .deferred_free = rcu_sync_torture_deferred_free,
481 .sync = rcu_bh_torture_synchronize, 496 .sync = rcu_bh_torture_synchronize,
482 .cb_barrier = NULL, 497 .cb_barrier = NULL,
498 .fqs = rcu_bh_force_quiescent_state,
483 .stats = NULL, 499 .stats = NULL,
484 .irq_capable = 1, 500 .irq_capable = 1,
485 .name = "rcu_bh_sync" 501 .name = "rcu_bh_sync"
@@ -621,6 +637,7 @@ static struct rcu_torture_ops sched_ops = {
621 .deferred_free = rcu_sched_torture_deferred_free, 637 .deferred_free = rcu_sched_torture_deferred_free,
622 .sync = sched_torture_synchronize, 638 .sync = sched_torture_synchronize,
623 .cb_barrier = rcu_barrier_sched, 639 .cb_barrier = rcu_barrier_sched,
640 .fqs = rcu_sched_force_quiescent_state,
624 .stats = NULL, 641 .stats = NULL,
625 .irq_capable = 1, 642 .irq_capable = 1,
626 .name = "sched" 643 .name = "sched"
@@ -636,6 +653,7 @@ static struct rcu_torture_ops sched_sync_ops = {
636 .deferred_free = rcu_sync_torture_deferred_free, 653 .deferred_free = rcu_sync_torture_deferred_free,
637 .sync = sched_torture_synchronize, 654 .sync = sched_torture_synchronize,
638 .cb_barrier = NULL, 655 .cb_barrier = NULL,
656 .fqs = rcu_sched_force_quiescent_state,
639 .stats = NULL, 657 .stats = NULL,
640 .name = "sched_sync" 658 .name = "sched_sync"
641}; 659};
@@ -650,12 +668,45 @@ static struct rcu_torture_ops sched_expedited_ops = {
650 .deferred_free = rcu_sync_torture_deferred_free, 668 .deferred_free = rcu_sync_torture_deferred_free,
651 .sync = synchronize_sched_expedited, 669 .sync = synchronize_sched_expedited,
652 .cb_barrier = NULL, 670 .cb_barrier = NULL,
671 .fqs = rcu_sched_force_quiescent_state,
653 .stats = rcu_expedited_torture_stats, 672 .stats = rcu_expedited_torture_stats,
654 .irq_capable = 1, 673 .irq_capable = 1,
655 .name = "sched_expedited" 674 .name = "sched_expedited"
656}; 675};
657 676
658/* 677/*
678 * RCU torture force-quiescent-state kthread. Repeatedly induces
679 * bursts of calls to force_quiescent_state(), increasing the probability
680 * of occurrence of some important types of race conditions.
681 */
682static int
683rcu_torture_fqs(void *arg)
684{
685 unsigned long fqs_resume_time;
686 int fqs_burst_remaining;
687
688 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
689 do {
690 fqs_resume_time = jiffies + fqs_stutter * HZ;
691 while (jiffies - fqs_resume_time > LONG_MAX) {
692 schedule_timeout_interruptible(1);
693 }
694 fqs_burst_remaining = fqs_duration;
695 while (fqs_burst_remaining > 0) {
696 cur_ops->fqs();
697 udelay(fqs_holdoff);
698 fqs_burst_remaining -= fqs_holdoff;
699 }
700 rcu_stutter_wait("rcu_torture_fqs");
701 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
702 VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping");
703 rcutorture_shutdown_absorb("rcu_torture_fqs");
704 while (!kthread_should_stop())
705 schedule_timeout_uninterruptible(1);
706 return 0;
707}
708
709/*
659 * RCU torture writer kthread. Repeatedly substitutes a new structure 710 * RCU torture writer kthread. Repeatedly substitutes a new structure
660 * for that pointed to by rcu_torture_current, freeing the old structure 711 * for that pointed to by rcu_torture_current, freeing the old structure
661 * after a series of grace periods (the "pipeline"). 712 * after a series of grace periods (the "pipeline").
@@ -745,7 +796,11 @@ static void rcu_torture_timer(unsigned long unused)
745 796
746 idx = cur_ops->readlock(); 797 idx = cur_ops->readlock();
747 completed = cur_ops->completed(); 798 completed = cur_ops->completed();
748 p = rcu_dereference(rcu_torture_current); 799 p = rcu_dereference_check(rcu_torture_current,
800 rcu_read_lock_held() ||
801 rcu_read_lock_bh_held() ||
802 rcu_read_lock_sched_held() ||
803 srcu_read_lock_held(&srcu_ctl));
749 if (p == NULL) { 804 if (p == NULL) {
750 /* Leave because rcu_torture_writer is not yet underway */ 805 /* Leave because rcu_torture_writer is not yet underway */
751 cur_ops->readunlock(idx); 806 cur_ops->readunlock(idx);
@@ -763,13 +818,13 @@ static void rcu_torture_timer(unsigned long unused)
763 /* Should not happen, but... */ 818 /* Should not happen, but... */
764 pipe_count = RCU_TORTURE_PIPE_LEN; 819 pipe_count = RCU_TORTURE_PIPE_LEN;
765 } 820 }
766 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 821 __this_cpu_inc(rcu_torture_count[pipe_count]);
767 completed = cur_ops->completed() - completed; 822 completed = cur_ops->completed() - completed;
768 if (completed > RCU_TORTURE_PIPE_LEN) { 823 if (completed > RCU_TORTURE_PIPE_LEN) {
769 /* Should not happen, but... */ 824 /* Should not happen, but... */
770 completed = RCU_TORTURE_PIPE_LEN; 825 completed = RCU_TORTURE_PIPE_LEN;
771 } 826 }
772 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 827 __this_cpu_inc(rcu_torture_batch[completed]);
773 preempt_enable(); 828 preempt_enable();
774 cur_ops->readunlock(idx); 829 cur_ops->readunlock(idx);
775} 830}
@@ -798,11 +853,15 @@ rcu_torture_reader(void *arg)
798 do { 853 do {
799 if (irqreader && cur_ops->irq_capable) { 854 if (irqreader && cur_ops->irq_capable) {
800 if (!timer_pending(&t)) 855 if (!timer_pending(&t))
801 mod_timer(&t, 1); 856 mod_timer(&t, jiffies + 1);
802 } 857 }
803 idx = cur_ops->readlock(); 858 idx = cur_ops->readlock();
804 completed = cur_ops->completed(); 859 completed = cur_ops->completed();
805 p = rcu_dereference(rcu_torture_current); 860 p = rcu_dereference_check(rcu_torture_current,
861 rcu_read_lock_held() ||
862 rcu_read_lock_bh_held() ||
863 rcu_read_lock_sched_held() ||
864 srcu_read_lock_held(&srcu_ctl));
806 if (p == NULL) { 865 if (p == NULL) {
807 /* Wait for rcu_torture_writer to get underway */ 866 /* Wait for rcu_torture_writer to get underway */
808 cur_ops->readunlock(idx); 867 cur_ops->readunlock(idx);
@@ -818,13 +877,13 @@ rcu_torture_reader(void *arg)
818 /* Should not happen, but... */ 877 /* Should not happen, but... */
819 pipe_count = RCU_TORTURE_PIPE_LEN; 878 pipe_count = RCU_TORTURE_PIPE_LEN;
820 } 879 }
821 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 880 __this_cpu_inc(rcu_torture_count[pipe_count]);
822 completed = cur_ops->completed() - completed; 881 completed = cur_ops->completed() - completed;
823 if (completed > RCU_TORTURE_PIPE_LEN) { 882 if (completed > RCU_TORTURE_PIPE_LEN) {
824 /* Should not happen, but... */ 883 /* Should not happen, but... */
825 completed = RCU_TORTURE_PIPE_LEN; 884 completed = RCU_TORTURE_PIPE_LEN;
826 } 885 }
827 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 886 __this_cpu_inc(rcu_torture_batch[completed]);
828 preempt_enable(); 887 preempt_enable();
829 cur_ops->readunlock(idx); 888 cur_ops->readunlock(idx);
830 schedule(); 889 schedule();
@@ -1030,10 +1089,11 @@ rcu_torture_print_module_parms(char *tag)
1030 printk(KERN_ALERT "%s" TORTURE_FLAG 1089 printk(KERN_ALERT "%s" TORTURE_FLAG
1031 "--- %s: nreaders=%d nfakewriters=%d " 1090 "--- %s: nreaders=%d nfakewriters=%d "
1032 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1091 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1033 "shuffle_interval=%d stutter=%d irqreader=%d\n", 1092 "shuffle_interval=%d stutter=%d irqreader=%d "
1093 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
1034 torture_type, tag, nrealreaders, nfakewriters, 1094 torture_type, tag, nrealreaders, nfakewriters,
1035 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1095 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1036 stutter, irqreader); 1096 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
1037} 1097}
1038 1098
1039static struct notifier_block rcutorture_nb = { 1099static struct notifier_block rcutorture_nb = {
@@ -1109,6 +1169,12 @@ rcu_torture_cleanup(void)
1109 } 1169 }
1110 stats_task = NULL; 1170 stats_task = NULL;
1111 1171
1172 if (fqs_task) {
1173 VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
1174 kthread_stop(fqs_task);
1175 }
1176 fqs_task = NULL;
1177
1112 /* Wait for all RCU callbacks to fire. */ 1178 /* Wait for all RCU callbacks to fire. */
1113 1179
1114 if (cur_ops->cb_barrier != NULL) 1180 if (cur_ops->cb_barrier != NULL)
@@ -1154,6 +1220,11 @@ rcu_torture_init(void)
1154 mutex_unlock(&fullstop_mutex); 1220 mutex_unlock(&fullstop_mutex);
1155 return -EINVAL; 1221 return -EINVAL;
1156 } 1222 }
1223 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1224 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
1225 "fqs_duration, fqs disabled.\n");
1226 fqs_duration = 0;
1227 }
1157 if (cur_ops->init) 1228 if (cur_ops->init)
1158 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1229 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
1159 1230
@@ -1282,6 +1353,19 @@ rcu_torture_init(void)
1282 goto unwind; 1353 goto unwind;
1283 } 1354 }
1284 } 1355 }
1356 if (fqs_duration < 0)
1357 fqs_duration = 0;
1358 if (fqs_duration) {
1359 /* Create the stutter thread */
1360 fqs_task = kthread_run(rcu_torture_fqs, NULL,
1361 "rcu_torture_fqs");
1362 if (IS_ERR(fqs_task)) {
1363 firsterr = PTR_ERR(fqs_task);
1364 VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
1365 fqs_task = NULL;
1366 goto unwind;
1367 }
1368 }
1285 register_reboot_notifier(&rcutorture_nb); 1369 register_reboot_notifier(&rcutorture_nb);
1286 mutex_unlock(&fullstop_mutex); 1370 mutex_unlock(&fullstop_mutex);
1287 return 0; 1371 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 53ae9598f798..3ec8160fc75f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,7 +46,6 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
50 49
51#include "rcutree.h" 50#include "rcutree.h"
52 51
@@ -66,11 +65,11 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
66 .signaled = RCU_GP_IDLE, \ 65 .signaled = RCU_GP_IDLE, \
67 .gpnum = -300, \ 66 .gpnum = -300, \
68 .completed = -300, \ 67 .completed = -300, \
69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ 68 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \
70 .orphan_cbs_list = NULL, \ 69 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &name.orphan_cbs_list, \ 70 .orphan_cbs_tail = &name.orphan_cbs_list, \
72 .orphan_qlen = 0, \ 71 .orphan_qlen = 0, \
73 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ 72 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \
74 .n_force_qs = 0, \ 73 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 74 .n_force_qs_ngp = 0, \
76} 75}
@@ -81,9 +80,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 80struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 81DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
83 82
84static int rcu_scheduler_active __read_mostly;
85
86
87/* 83/*
88 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 84 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
89 * permit this function to be invoked without holding the root rcu_node 85 * permit this function to be invoked without holding the root rcu_node
@@ -157,6 +153,24 @@ long rcu_batches_completed_bh(void)
157EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); 153EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
158 154
159/* 155/*
156 * Force a quiescent state for RCU BH.
157 */
158void rcu_bh_force_quiescent_state(void)
159{
160 force_quiescent_state(&rcu_bh_state, 0);
161}
162EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
163
164/*
165 * Force a quiescent state for RCU-sched.
166 */
167void rcu_sched_force_quiescent_state(void)
168{
169 force_quiescent_state(&rcu_sched_state, 0);
170}
171EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
172
173/*
160 * Does the CPU have callbacks ready to be invoked? 174 * Does the CPU have callbacks ready to be invoked?
161 */ 175 */
162static int 176static int
@@ -439,10 +453,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
439 453
440 /* Only let one CPU complain about others per time interval. */ 454 /* Only let one CPU complain about others per time interval. */
441 455
442 spin_lock_irqsave(&rnp->lock, flags); 456 raw_spin_lock_irqsave(&rnp->lock, flags);
443 delta = jiffies - rsp->jiffies_stall; 457 delta = jiffies - rsp->jiffies_stall;
444 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 458 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
445 spin_unlock_irqrestore(&rnp->lock, flags); 459 raw_spin_unlock_irqrestore(&rnp->lock, flags);
446 return; 460 return;
447 } 461 }
448 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 462 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
@@ -452,13 +466,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
452 * due to CPU offlining. 466 * due to CPU offlining.
453 */ 467 */
454 rcu_print_task_stall(rnp); 468 rcu_print_task_stall(rnp);
455 spin_unlock_irqrestore(&rnp->lock, flags); 469 raw_spin_unlock_irqrestore(&rnp->lock, flags);
456 470
457 /* OK, time to rat on our buddy... */ 471 /* OK, time to rat on our buddy... */
458 472
459 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 473 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
460 rcu_for_each_leaf_node(rsp, rnp) { 474 rcu_for_each_leaf_node(rsp, rnp) {
475 raw_spin_lock_irqsave(&rnp->lock, flags);
461 rcu_print_task_stall(rnp); 476 rcu_print_task_stall(rnp);
477 raw_spin_unlock_irqrestore(&rnp->lock, flags);
462 if (rnp->qsmask == 0) 478 if (rnp->qsmask == 0)
463 continue; 479 continue;
464 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 480 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
@@ -469,6 +485,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
469 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 485 smp_processor_id(), (long)(jiffies - rsp->gp_start));
470 trigger_all_cpu_backtrace(); 486 trigger_all_cpu_backtrace();
471 487
488 /* If so configured, complain about tasks blocking the grace period. */
489
490 rcu_print_detail_task_stall(rsp);
491
472 force_quiescent_state(rsp, 0); /* Kick them all. */ 492 force_quiescent_state(rsp, 0); /* Kick them all. */
473} 493}
474 494
@@ -481,11 +501,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
481 smp_processor_id(), jiffies - rsp->gp_start); 501 smp_processor_id(), jiffies - rsp->gp_start);
482 trigger_all_cpu_backtrace(); 502 trigger_all_cpu_backtrace();
483 503
484 spin_lock_irqsave(&rnp->lock, flags); 504 raw_spin_lock_irqsave(&rnp->lock, flags);
485 if ((long)(jiffies - rsp->jiffies_stall) >= 0) 505 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
486 rsp->jiffies_stall = 506 rsp->jiffies_stall =
487 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 507 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
488 spin_unlock_irqrestore(&rnp->lock, flags); 508 raw_spin_unlock_irqrestore(&rnp->lock, flags);
489 509
490 set_need_resched(); /* kick ourselves to get things going. */ 510 set_need_resched(); /* kick ourselves to get things going. */
491} 511}
@@ -545,12 +565,12 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
545 local_irq_save(flags); 565 local_irq_save(flags);
546 rnp = rdp->mynode; 566 rnp = rdp->mynode;
547 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ 567 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
548 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 568 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
549 local_irq_restore(flags); 569 local_irq_restore(flags);
550 return; 570 return;
551 } 571 }
552 __note_new_gpnum(rsp, rnp, rdp); 572 __note_new_gpnum(rsp, rnp, rdp);
553 spin_unlock_irqrestore(&rnp->lock, flags); 573 raw_spin_unlock_irqrestore(&rnp->lock, flags);
554} 574}
555 575
556/* 576/*
@@ -609,12 +629,12 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
609 local_irq_save(flags); 629 local_irq_save(flags);
610 rnp = rdp->mynode; 630 rnp = rdp->mynode;
611 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ 631 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
612 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 632 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
613 local_irq_restore(flags); 633 local_irq_restore(flags);
614 return; 634 return;
615 } 635 }
616 __rcu_process_gp_end(rsp, rnp, rdp); 636 __rcu_process_gp_end(rsp, rnp, rdp);
617 spin_unlock_irqrestore(&rnp->lock, flags); 637 raw_spin_unlock_irqrestore(&rnp->lock, flags);
618} 638}
619 639
620/* 640/*
@@ -659,12 +679,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
659 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 679 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
660 struct rcu_node *rnp = rcu_get_root(rsp); 680 struct rcu_node *rnp = rcu_get_root(rsp);
661 681
662 if (!cpu_needs_another_gp(rsp, rdp)) { 682 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
683 if (cpu_needs_another_gp(rsp, rdp))
684 rsp->fqs_need_gp = 1;
663 if (rnp->completed == rsp->completed) { 685 if (rnp->completed == rsp->completed) {
664 spin_unlock_irqrestore(&rnp->lock, flags); 686 raw_spin_unlock_irqrestore(&rnp->lock, flags);
665 return; 687 return;
666 } 688 }
667 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 689 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
668 690
669 /* 691 /*
670 * Propagate new ->completed value to rcu_node structures 692 * Propagate new ->completed value to rcu_node structures
@@ -672,9 +694,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
672 * of the next grace period to process their callbacks. 694 * of the next grace period to process their callbacks.
673 */ 695 */
674 rcu_for_each_node_breadth_first(rsp, rnp) { 696 rcu_for_each_node_breadth_first(rsp, rnp) {
675 spin_lock(&rnp->lock); /* irqs already disabled. */ 697 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
676 rnp->completed = rsp->completed; 698 rnp->completed = rsp->completed;
677 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 699 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
678 } 700 }
679 local_irq_restore(flags); 701 local_irq_restore(flags);
680 return; 702 return;
@@ -695,15 +717,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
695 rnp->completed = rsp->completed; 717 rnp->completed = rsp->completed;
696 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 718 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
697 rcu_start_gp_per_cpu(rsp, rnp, rdp); 719 rcu_start_gp_per_cpu(rsp, rnp, rdp);
698 spin_unlock_irqrestore(&rnp->lock, flags); 720 raw_spin_unlock_irqrestore(&rnp->lock, flags);
699 return; 721 return;
700 } 722 }
701 723
702 spin_unlock(&rnp->lock); /* leave irqs disabled. */ 724 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
703 725
704 726
705 /* Exclude any concurrent CPU-hotplug operations. */ 727 /* Exclude any concurrent CPU-hotplug operations. */
706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 728 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
707 729
708 /* 730 /*
709 * Set the quiescent-state-needed bits in all the rcu_node 731 * Set the quiescent-state-needed bits in all the rcu_node
@@ -723,21 +745,21 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
723 * irqs disabled. 745 * irqs disabled.
724 */ 746 */
725 rcu_for_each_node_breadth_first(rsp, rnp) { 747 rcu_for_each_node_breadth_first(rsp, rnp) {
726 spin_lock(&rnp->lock); /* irqs already disabled. */ 748 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
727 rcu_preempt_check_blocked_tasks(rnp); 749 rcu_preempt_check_blocked_tasks(rnp);
728 rnp->qsmask = rnp->qsmaskinit; 750 rnp->qsmask = rnp->qsmaskinit;
729 rnp->gpnum = rsp->gpnum; 751 rnp->gpnum = rsp->gpnum;
730 rnp->completed = rsp->completed; 752 rnp->completed = rsp->completed;
731 if (rnp == rdp->mynode) 753 if (rnp == rdp->mynode)
732 rcu_start_gp_per_cpu(rsp, rnp, rdp); 754 rcu_start_gp_per_cpu(rsp, rnp, rdp);
733 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 755 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
734 } 756 }
735 757
736 rnp = rcu_get_root(rsp); 758 rnp = rcu_get_root(rsp);
737 spin_lock(&rnp->lock); /* irqs already disabled. */ 759 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
738 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 760 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
739 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 761 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
740 spin_unlock_irqrestore(&rsp->onofflock, flags); 762 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
741} 763}
742 764
743/* 765/*
@@ -776,14 +798,14 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
776 if (!(rnp->qsmask & mask)) { 798 if (!(rnp->qsmask & mask)) {
777 799
778 /* Our bit has already been cleared, so done. */ 800 /* Our bit has already been cleared, so done. */
779 spin_unlock_irqrestore(&rnp->lock, flags); 801 raw_spin_unlock_irqrestore(&rnp->lock, flags);
780 return; 802 return;
781 } 803 }
782 rnp->qsmask &= ~mask; 804 rnp->qsmask &= ~mask;
783 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 805 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
784 806
785 /* Other bits still set at this level, so done. */ 807 /* Other bits still set at this level, so done. */
786 spin_unlock_irqrestore(&rnp->lock, flags); 808 raw_spin_unlock_irqrestore(&rnp->lock, flags);
787 return; 809 return;
788 } 810 }
789 mask = rnp->grpmask; 811 mask = rnp->grpmask;
@@ -793,10 +815,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
793 815
794 break; 816 break;
795 } 817 }
796 spin_unlock_irqrestore(&rnp->lock, flags); 818 raw_spin_unlock_irqrestore(&rnp->lock, flags);
797 rnp_c = rnp; 819 rnp_c = rnp;
798 rnp = rnp->parent; 820 rnp = rnp->parent;
799 spin_lock_irqsave(&rnp->lock, flags); 821 raw_spin_lock_irqsave(&rnp->lock, flags);
800 WARN_ON_ONCE(rnp_c->qsmask); 822 WARN_ON_ONCE(rnp_c->qsmask);
801 } 823 }
802 824
@@ -825,7 +847,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
825 struct rcu_node *rnp; 847 struct rcu_node *rnp;
826 848
827 rnp = rdp->mynode; 849 rnp = rdp->mynode;
828 spin_lock_irqsave(&rnp->lock, flags); 850 raw_spin_lock_irqsave(&rnp->lock, flags);
829 if (lastcomp != rnp->completed) { 851 if (lastcomp != rnp->completed) {
830 852
831 /* 853 /*
@@ -837,12 +859,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
837 * race occurred. 859 * race occurred.
838 */ 860 */
839 rdp->passed_quiesc = 0; /* try again later! */ 861 rdp->passed_quiesc = 0; /* try again later! */
840 spin_unlock_irqrestore(&rnp->lock, flags); 862 raw_spin_unlock_irqrestore(&rnp->lock, flags);
841 return; 863 return;
842 } 864 }
843 mask = rdp->grpmask; 865 mask = rdp->grpmask;
844 if ((rnp->qsmask & mask) == 0) { 866 if ((rnp->qsmask & mask) == 0) {
845 spin_unlock_irqrestore(&rnp->lock, flags); 867 raw_spin_unlock_irqrestore(&rnp->lock, flags);
846 } else { 868 } else {
847 rdp->qs_pending = 0; 869 rdp->qs_pending = 0;
848 870
@@ -906,7 +928,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
906 928
907 if (rdp->nxtlist == NULL) 929 if (rdp->nxtlist == NULL)
908 return; /* irqs disabled, so comparison is stable. */ 930 return; /* irqs disabled, so comparison is stable. */
909 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 931 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
910 *rsp->orphan_cbs_tail = rdp->nxtlist; 932 *rsp->orphan_cbs_tail = rdp->nxtlist;
911 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 933 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
912 rdp->nxtlist = NULL; 934 rdp->nxtlist = NULL;
@@ -914,7 +936,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
914 rdp->nxttail[i] = &rdp->nxtlist; 936 rdp->nxttail[i] = &rdp->nxtlist;
915 rsp->orphan_qlen += rdp->qlen; 937 rsp->orphan_qlen += rdp->qlen;
916 rdp->qlen = 0; 938 rdp->qlen = 0;
917 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 939 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
918} 940}
919 941
920/* 942/*
@@ -925,10 +947,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
925 unsigned long flags; 947 unsigned long flags;
926 struct rcu_data *rdp; 948 struct rcu_data *rdp;
927 949
928 spin_lock_irqsave(&rsp->onofflock, flags); 950 raw_spin_lock_irqsave(&rsp->onofflock, flags);
929 rdp = rsp->rda[smp_processor_id()]; 951 rdp = rsp->rda[smp_processor_id()];
930 if (rsp->orphan_cbs_list == NULL) { 952 if (rsp->orphan_cbs_list == NULL) {
931 spin_unlock_irqrestore(&rsp->onofflock, flags); 953 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
932 return; 954 return;
933 } 955 }
934 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; 956 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
@@ -937,7 +959,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
937 rsp->orphan_cbs_list = NULL; 959 rsp->orphan_cbs_list = NULL;
938 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; 960 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
939 rsp->orphan_qlen = 0; 961 rsp->orphan_qlen = 0;
940 spin_unlock_irqrestore(&rsp->onofflock, flags); 962 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
941} 963}
942 964
943/* 965/*
@@ -953,23 +975,23 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
953 struct rcu_node *rnp; 975 struct rcu_node *rnp;
954 976
955 /* Exclude any attempts to start a new grace period. */ 977 /* Exclude any attempts to start a new grace period. */
956 spin_lock_irqsave(&rsp->onofflock, flags); 978 raw_spin_lock_irqsave(&rsp->onofflock, flags);
957 979
958 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 980 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
959 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ 981 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
960 mask = rdp->grpmask; /* rnp->grplo is constant. */ 982 mask = rdp->grpmask; /* rnp->grplo is constant. */
961 do { 983 do {
962 spin_lock(&rnp->lock); /* irqs already disabled. */ 984 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
963 rnp->qsmaskinit &= ~mask; 985 rnp->qsmaskinit &= ~mask;
964 if (rnp->qsmaskinit != 0) { 986 if (rnp->qsmaskinit != 0) {
965 if (rnp != rdp->mynode) 987 if (rnp != rdp->mynode)
966 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 988 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
967 break; 989 break;
968 } 990 }
969 if (rnp == rdp->mynode) 991 if (rnp == rdp->mynode)
970 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 992 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
971 else 993 else
972 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 994 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
973 mask = rnp->grpmask; 995 mask = rnp->grpmask;
974 rnp = rnp->parent; 996 rnp = rnp->parent;
975 } while (rnp != NULL); 997 } while (rnp != NULL);
@@ -980,12 +1002,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
980 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock 1002 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
981 * held leads to deadlock. 1003 * held leads to deadlock.
982 */ 1004 */
983 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1005 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
984 rnp = rdp->mynode; 1006 rnp = rdp->mynode;
985 if (need_report & RCU_OFL_TASKS_NORM_GP) 1007 if (need_report & RCU_OFL_TASKS_NORM_GP)
986 rcu_report_unblock_qs_rnp(rnp, flags); 1008 rcu_report_unblock_qs_rnp(rnp, flags);
987 else 1009 else
988 spin_unlock_irqrestore(&rnp->lock, flags); 1010 raw_spin_unlock_irqrestore(&rnp->lock, flags);
989 if (need_report & RCU_OFL_TASKS_EXP_GP) 1011 if (need_report & RCU_OFL_TASKS_EXP_GP)
990 rcu_report_exp_rnp(rsp, rnp); 1012 rcu_report_exp_rnp(rsp, rnp);
991 1013
@@ -1144,11 +1166,9 @@ void rcu_check_callbacks(int cpu, int user)
1144/* 1166/*
1145 * Scan the leaf rcu_node structures, processing dyntick state for any that 1167 * Scan the leaf rcu_node structures, processing dyntick state for any that
1146 * have not yet encountered a quiescent state, using the function specified. 1168 * have not yet encountered a quiescent state, using the function specified.
1147 * Returns 1 if the current grace period ends while scanning (possibly 1169 * The caller must have suppressed start of new grace periods.
1148 * because we made it end).
1149 */ 1170 */
1150static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, 1171static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1151 int (*f)(struct rcu_data *))
1152{ 1172{
1153 unsigned long bit; 1173 unsigned long bit;
1154 int cpu; 1174 int cpu;
@@ -1158,13 +1178,13 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1158 1178
1159 rcu_for_each_leaf_node(rsp, rnp) { 1179 rcu_for_each_leaf_node(rsp, rnp) {
1160 mask = 0; 1180 mask = 0;
1161 spin_lock_irqsave(&rnp->lock, flags); 1181 raw_spin_lock_irqsave(&rnp->lock, flags);
1162 if (rnp->completed != lastcomp) { 1182 if (!rcu_gp_in_progress(rsp)) {
1163 spin_unlock_irqrestore(&rnp->lock, flags); 1183 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1164 return 1; 1184 return;
1165 } 1185 }
1166 if (rnp->qsmask == 0) { 1186 if (rnp->qsmask == 0) {
1167 spin_unlock_irqrestore(&rnp->lock, flags); 1187 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1168 continue; 1188 continue;
1169 } 1189 }
1170 cpu = rnp->grplo; 1190 cpu = rnp->grplo;
@@ -1173,15 +1193,14 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1173 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1193 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1174 mask |= bit; 1194 mask |= bit;
1175 } 1195 }
1176 if (mask != 0 && rnp->completed == lastcomp) { 1196 if (mask != 0) {
1177 1197
1178 /* rcu_report_qs_rnp() releases rnp->lock. */ 1198 /* rcu_report_qs_rnp() releases rnp->lock. */
1179 rcu_report_qs_rnp(mask, rsp, rnp, flags); 1199 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1180 continue; 1200 continue;
1181 } 1201 }
1182 spin_unlock_irqrestore(&rnp->lock, flags); 1202 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1183 } 1203 }
1184 return 0;
1185} 1204}
1186 1205
1187/* 1206/*
@@ -1191,32 +1210,26 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1191static void force_quiescent_state(struct rcu_state *rsp, int relaxed) 1210static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1192{ 1211{
1193 unsigned long flags; 1212 unsigned long flags;
1194 long lastcomp;
1195 struct rcu_node *rnp = rcu_get_root(rsp); 1213 struct rcu_node *rnp = rcu_get_root(rsp);
1196 u8 signaled;
1197 u8 forcenow;
1198 1214
1199 if (!rcu_gp_in_progress(rsp)) 1215 if (!rcu_gp_in_progress(rsp))
1200 return; /* No grace period in progress, nothing to force. */ 1216 return; /* No grace period in progress, nothing to force. */
1201 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { 1217 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1202 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1218 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1203 return; /* Someone else is already on the job. */ 1219 return; /* Someone else is already on the job. */
1204 } 1220 }
1205 if (relaxed && 1221 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
1206 (long)(rsp->jiffies_force_qs - jiffies) >= 0) 1222 goto unlock_fqs_ret; /* no emergency and done recently. */
1207 goto unlock_ret; /* no emergency and done recently. */
1208 rsp->n_force_qs++; 1223 rsp->n_force_qs++;
1209 spin_lock(&rnp->lock); 1224 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1210 lastcomp = rsp->gpnum - 1;
1211 signaled = rsp->signaled;
1212 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1225 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1213 if(!rcu_gp_in_progress(rsp)) { 1226 if(!rcu_gp_in_progress(rsp)) {
1214 rsp->n_force_qs_ngp++; 1227 rsp->n_force_qs_ngp++;
1215 spin_unlock(&rnp->lock); 1228 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1216 goto unlock_ret; /* no GP in progress, time updated. */ 1229 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1217 } 1230 }
1218 spin_unlock(&rnp->lock); 1231 rsp->fqs_active = 1;
1219 switch (signaled) { 1232 switch (rsp->signaled) {
1220 case RCU_GP_IDLE: 1233 case RCU_GP_IDLE:
1221 case RCU_GP_INIT: 1234 case RCU_GP_INIT:
1222 1235
@@ -1224,45 +1237,38 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1224 1237
1225 case RCU_SAVE_DYNTICK: 1238 case RCU_SAVE_DYNTICK:
1226 1239
1240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1227 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1241 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1228 break; /* So gcc recognizes the dead code. */ 1242 break; /* So gcc recognizes the dead code. */
1229 1243
1230 /* Record dyntick-idle state. */ 1244 /* Record dyntick-idle state. */
1231 if (rcu_process_dyntick(rsp, lastcomp, 1245 force_qs_rnp(rsp, dyntick_save_progress_counter);
1232 dyntick_save_progress_counter)) 1246 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1233 goto unlock_ret; 1247 if (rcu_gp_in_progress(rsp))
1234 /* fall into next case. */
1235
1236 case RCU_SAVE_COMPLETED:
1237
1238 /* Update state, record completion counter. */
1239 forcenow = 0;
1240 spin_lock(&rnp->lock);
1241 if (lastcomp + 1 == rsp->gpnum &&
1242 lastcomp == rsp->completed &&
1243 rsp->signaled == signaled) {
1244 rsp->signaled = RCU_FORCE_QS; 1248 rsp->signaled = RCU_FORCE_QS;
1245 rsp->completed_fqs = lastcomp; 1249 break;
1246 forcenow = signaled == RCU_SAVE_COMPLETED;
1247 }
1248 spin_unlock(&rnp->lock);
1249 if (!forcenow)
1250 break;
1251 /* fall into next case. */
1252 1250
1253 case RCU_FORCE_QS: 1251 case RCU_FORCE_QS:
1254 1252
1255 /* Check dyntick-idle state, send IPI to laggarts. */ 1253 /* Check dyntick-idle state, send IPI to laggarts. */
1256 if (rcu_process_dyntick(rsp, rsp->completed_fqs, 1254 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1257 rcu_implicit_dynticks_qs)) 1255 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1258 goto unlock_ret;
1259 1256
1260 /* Leave state in case more forcing is required. */ 1257 /* Leave state in case more forcing is required. */
1261 1258
1259 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1262 break; 1260 break;
1263 } 1261 }
1264unlock_ret: 1262 rsp->fqs_active = 0;
1265 spin_unlock_irqrestore(&rsp->fqslock, flags); 1263 if (rsp->fqs_need_gp) {
1264 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1265 rsp->fqs_need_gp = 0;
1266 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1267 return;
1268 }
1269 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1270unlock_fqs_ret:
1271 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1266} 1272}
1267 1273
1268#else /* #ifdef CONFIG_SMP */ 1274#else /* #ifdef CONFIG_SMP */
@@ -1290,7 +1296,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1290 * If an RCU GP has gone long enough, go check for dyntick 1296 * If an RCU GP has gone long enough, go check for dyntick
1291 * idle CPUs and, if needed, send resched IPIs. 1297 * idle CPUs and, if needed, send resched IPIs.
1292 */ 1298 */
1293 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1299 if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1294 force_quiescent_state(rsp, 1); 1300 force_quiescent_state(rsp, 1);
1295 1301
1296 /* 1302 /*
@@ -1304,7 +1310,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1304 1310
1305 /* Does this CPU require a not-yet-started grace period? */ 1311 /* Does this CPU require a not-yet-started grace period? */
1306 if (cpu_needs_another_gp(rsp, rdp)) { 1312 if (cpu_needs_another_gp(rsp, rdp)) {
1307 spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); 1313 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1308 rcu_start_gp(rsp, flags); /* releases above lock */ 1314 rcu_start_gp(rsp, flags); /* releases above lock */
1309 } 1315 }
1310 1316
@@ -1335,6 +1341,9 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1335 * grace-period manipulations above. 1341 * grace-period manipulations above.
1336 */ 1342 */
1337 smp_mb(); /* See above block comment. */ 1343 smp_mb(); /* See above block comment. */
1344
1345 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1346 rcu_needs_cpu_flush();
1338} 1347}
1339 1348
1340static void 1349static void
@@ -1369,7 +1378,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1369 unsigned long nestflag; 1378 unsigned long nestflag;
1370 struct rcu_node *rnp_root = rcu_get_root(rsp); 1379 struct rcu_node *rnp_root = rcu_get_root(rsp);
1371 1380
1372 spin_lock_irqsave(&rnp_root->lock, nestflag); 1381 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1373 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ 1382 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1374 } 1383 }
1375 1384
@@ -1387,7 +1396,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1387 force_quiescent_state(rsp, 0); 1396 force_quiescent_state(rsp, 0);
1388 rdp->n_force_qs_snap = rsp->n_force_qs; 1397 rdp->n_force_qs_snap = rsp->n_force_qs;
1389 rdp->qlen_last_fqs_check = rdp->qlen; 1398 rdp->qlen_last_fqs_check = rdp->qlen;
1390 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1399 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1391 force_quiescent_state(rsp, 1); 1400 force_quiescent_state(rsp, 1);
1392 local_irq_restore(flags); 1401 local_irq_restore(flags);
1393} 1402}
@@ -1520,7 +1529,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1520 1529
1521 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1530 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1522 if (rcu_gp_in_progress(rsp) && 1531 if (rcu_gp_in_progress(rsp) &&
1523 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { 1532 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
1524 rdp->n_rp_need_fqs++; 1533 rdp->n_rp_need_fqs++;
1525 return 1; 1534 return 1;
1526 } 1535 }
@@ -1545,10 +1554,9 @@ static int rcu_pending(int cpu)
1545/* 1554/*
1546 * Check to see if any future RCU-related work will need to be done 1555 * Check to see if any future RCU-related work will need to be done
1547 * by the current CPU, even if none need be done immediately, returning 1556 * by the current CPU, even if none need be done immediately, returning
1548 * 1 if so. This function is part of the RCU implementation; it is -not- 1557 * 1 if so.
1549 * an exported member of the RCU API.
1550 */ 1558 */
1551int rcu_needs_cpu(int cpu) 1559static int rcu_needs_cpu_quick_check(int cpu)
1552{ 1560{
1553 /* RCU callbacks either ready or pending? */ 1561 /* RCU callbacks either ready or pending? */
1554 return per_cpu(rcu_sched_data, cpu).nxtlist || 1562 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1556,21 +1564,6 @@ int rcu_needs_cpu(int cpu)
1556 rcu_preempt_needs_cpu(cpu); 1564 rcu_preempt_needs_cpu(cpu);
1557} 1565}
1558 1566
1559/*
1560 * This function is invoked towards the end of the scheduler's initialization
1561 * process. Before this is called, the idle task might contain
1562 * RCU read-side critical sections (during which time, this idle
1563 * task is booting the system). After this function is called, the
1564 * idle tasks are prohibited from containing RCU read-side critical
1565 * sections.
1566 */
1567void rcu_scheduler_starting(void)
1568{
1569 WARN_ON(num_online_cpus() != 1);
1570 WARN_ON(nr_context_switches() > 0);
1571 rcu_scheduler_active = 1;
1572}
1573
1574static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 1567static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1575static atomic_t rcu_barrier_cpu_count; 1568static atomic_t rcu_barrier_cpu_count;
1576static DEFINE_MUTEX(rcu_barrier_mutex); 1569static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -1659,7 +1652,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1659 struct rcu_node *rnp = rcu_get_root(rsp); 1652 struct rcu_node *rnp = rcu_get_root(rsp);
1660 1653
1661 /* Set up local state, ensuring consistent view of global state. */ 1654 /* Set up local state, ensuring consistent view of global state. */
1662 spin_lock_irqsave(&rnp->lock, flags); 1655 raw_spin_lock_irqsave(&rnp->lock, flags);
1663 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 1656 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1664 rdp->nxtlist = NULL; 1657 rdp->nxtlist = NULL;
1665 for (i = 0; i < RCU_NEXT_SIZE; i++) 1658 for (i = 0; i < RCU_NEXT_SIZE; i++)
@@ -1669,7 +1662,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1669 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 1662 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1670#endif /* #ifdef CONFIG_NO_HZ */ 1663#endif /* #ifdef CONFIG_NO_HZ */
1671 rdp->cpu = cpu; 1664 rdp->cpu = cpu;
1672 spin_unlock_irqrestore(&rnp->lock, flags); 1665 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1673} 1666}
1674 1667
1675/* 1668/*
@@ -1687,7 +1680,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1687 struct rcu_node *rnp = rcu_get_root(rsp); 1680 struct rcu_node *rnp = rcu_get_root(rsp);
1688 1681
1689 /* Set up local state, ensuring consistent view of global state. */ 1682 /* Set up local state, ensuring consistent view of global state. */
1690 spin_lock_irqsave(&rnp->lock, flags); 1683 raw_spin_lock_irqsave(&rnp->lock, flags);
1691 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1684 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1692 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1685 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1693 rdp->beenonline = 1; /* We have now been online. */ 1686 rdp->beenonline = 1; /* We have now been online. */
@@ -1695,7 +1688,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1695 rdp->qlen_last_fqs_check = 0; 1688 rdp->qlen_last_fqs_check = 0;
1696 rdp->n_force_qs_snap = rsp->n_force_qs; 1689 rdp->n_force_qs_snap = rsp->n_force_qs;
1697 rdp->blimit = blimit; 1690 rdp->blimit = blimit;
1698 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1691 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1699 1692
1700 /* 1693 /*
1701 * A new grace period might start here. If so, we won't be part 1694 * A new grace period might start here. If so, we won't be part
@@ -1703,14 +1696,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1703 */ 1696 */
1704 1697
1705 /* Exclude any attempts to start a new GP on large systems. */ 1698 /* Exclude any attempts to start a new GP on large systems. */
1706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1699 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1707 1700
1708 /* Add CPU to rcu_node bitmasks. */ 1701 /* Add CPU to rcu_node bitmasks. */
1709 rnp = rdp->mynode; 1702 rnp = rdp->mynode;
1710 mask = rdp->grpmask; 1703 mask = rdp->grpmask;
1711 do { 1704 do {
1712 /* Exclude any attempts to start a new GP on small systems. */ 1705 /* Exclude any attempts to start a new GP on small systems. */
1713 spin_lock(&rnp->lock); /* irqs already disabled. */ 1706 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1714 rnp->qsmaskinit |= mask; 1707 rnp->qsmaskinit |= mask;
1715 mask = rnp->grpmask; 1708 mask = rnp->grpmask;
1716 if (rnp == rdp->mynode) { 1709 if (rnp == rdp->mynode) {
@@ -1718,11 +1711,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1718 rdp->completed = rnp->completed; 1711 rdp->completed = rnp->completed;
1719 rdp->passed_quiesc_completed = rnp->completed - 1; 1712 rdp->passed_quiesc_completed = rnp->completed - 1;
1720 } 1713 }
1721 spin_unlock(&rnp->lock); /* irqs already disabled. */ 1714 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
1722 rnp = rnp->parent; 1715 rnp = rnp->parent;
1723 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1716 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1724 1717
1725 spin_unlock_irqrestore(&rsp->onofflock, flags); 1718 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1726} 1719}
1727 1720
1728static void __cpuinit rcu_online_cpu(int cpu) 1721static void __cpuinit rcu_online_cpu(int cpu)
@@ -1806,11 +1799,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1806 */ 1799 */
1807static void __init rcu_init_one(struct rcu_state *rsp) 1800static void __init rcu_init_one(struct rcu_state *rsp)
1808{ 1801{
1802 static char *buf[] = { "rcu_node_level_0",
1803 "rcu_node_level_1",
1804 "rcu_node_level_2",
1805 "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */
1809 int cpustride = 1; 1806 int cpustride = 1;
1810 int i; 1807 int i;
1811 int j; 1808 int j;
1812 struct rcu_node *rnp; 1809 struct rcu_node *rnp;
1813 1810
1811 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
1812
1814 /* Initialize the level-tracking arrays. */ 1813 /* Initialize the level-tracking arrays. */
1815 1814
1816 for (i = 1; i < NUM_RCU_LVLS; i++) 1815 for (i = 1; i < NUM_RCU_LVLS; i++)
@@ -1823,8 +1822,9 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1823 cpustride *= rsp->levelspread[i]; 1822 cpustride *= rsp->levelspread[i];
1824 rnp = rsp->level[i]; 1823 rnp = rsp->level[i];
1825 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1824 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1826 spin_lock_init(&rnp->lock); 1825 raw_spin_lock_init(&rnp->lock);
1827 lockdep_set_class(&rnp->lock, &rcu_node_class[i]); 1826 lockdep_set_class_and_name(&rnp->lock,
1827 &rcu_node_class[i], buf[i]);
1828 rnp->gpnum = 0; 1828 rnp->gpnum = 0;
1829 rnp->qsmask = 0; 1829 rnp->qsmask = 0;
1830 rnp->qsmaskinit = 0; 1830 rnp->qsmaskinit = 0;
@@ -1876,7 +1876,7 @@ do { \
1876 1876
1877void __init rcu_init(void) 1877void __init rcu_init(void)
1878{ 1878{
1879 int i; 1879 int cpu;
1880 1880
1881 rcu_bootup_announce(); 1881 rcu_bootup_announce();
1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
@@ -1896,8 +1896,8 @@ void __init rcu_init(void)
1896 * or the scheduler are operational. 1896 * or the scheduler are operational.
1897 */ 1897 */
1898 cpu_notifier(rcu_cpu_notify, 0); 1898 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(i) 1899 for_each_online_cpu(cpu)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); 1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1901} 1901}
1902 1902
1903#include "rcutree_plugin.h" 1903#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index d2a0046f63b2..4a525a30e08e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -90,12 +90,12 @@ struct rcu_dynticks {
90 * Definition for node within the RCU grace-period-detection hierarchy. 90 * Definition for node within the RCU grace-period-detection hierarchy.
91 */ 91 */
92struct rcu_node { 92struct rcu_node {
93 spinlock_t lock; /* Root rcu_node's lock protects some */ 93 raw_spinlock_t lock; /* Root rcu_node's lock protects some */
94 /* rcu_state fields as well as following. */ 94 /* rcu_state fields as well as following. */
95 long gpnum; /* Current grace period for this node. */ 95 unsigned long gpnum; /* Current grace period for this node. */
96 /* This will either be equal to or one */ 96 /* This will either be equal to or one */
97 /* behind the root rcu_node's gpnum. */ 97 /* behind the root rcu_node's gpnum. */
98 long completed; /* Last grace period completed for this node. */ 98 unsigned long completed; /* Last GP completed for this node. */
99 /* This will either be equal to or one */ 99 /* This will either be equal to or one */
100 /* behind the root rcu_node's gpnum. */ 100 /* behind the root rcu_node's gpnum. */
101 unsigned long qsmask; /* CPUs or groups that need to switch in */ 101 unsigned long qsmask; /* CPUs or groups that need to switch in */
@@ -161,11 +161,11 @@ struct rcu_node {
161/* Per-CPU data for read-copy update. */ 161/* Per-CPU data for read-copy update. */
162struct rcu_data { 162struct rcu_data {
163 /* 1) quiescent-state and grace-period handling : */ 163 /* 1) quiescent-state and grace-period handling : */
164 long completed; /* Track rsp->completed gp number */ 164 unsigned long completed; /* Track rsp->completed gp number */
165 /* in order to detect GP end. */ 165 /* in order to detect GP end. */
166 long gpnum; /* Highest gp number that this CPU */ 166 unsigned long gpnum; /* Highest gp number that this CPU */
167 /* is aware of having started. */ 167 /* is aware of having started. */
168 long passed_quiesc_completed; 168 unsigned long passed_quiesc_completed;
169 /* Value of completed at time of qs. */ 169 /* Value of completed at time of qs. */
170 bool passed_quiesc; /* User-mode/idle loop etc. */ 170 bool passed_quiesc; /* User-mode/idle loop etc. */
171 bool qs_pending; /* Core waits for quiesc state. */ 171 bool qs_pending; /* Core waits for quiesc state. */
@@ -221,14 +221,14 @@ struct rcu_data {
221 unsigned long resched_ipi; /* Sent a resched IPI. */ 221 unsigned long resched_ipi; /* Sent a resched IPI. */
222 222
223 /* 5) __rcu_pending() statistics. */ 223 /* 5) __rcu_pending() statistics. */
224 long n_rcu_pending; /* rcu_pending() calls since boot. */ 224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
225 long n_rp_qs_pending; 225 unsigned long n_rp_qs_pending;
226 long n_rp_cb_ready; 226 unsigned long n_rp_cb_ready;
227 long n_rp_cpu_needs_gp; 227 unsigned long n_rp_cpu_needs_gp;
228 long n_rp_gp_completed; 228 unsigned long n_rp_gp_completed;
229 long n_rp_gp_started; 229 unsigned long n_rp_gp_started;
230 long n_rp_need_fqs; 230 unsigned long n_rp_need_fqs;
231 long n_rp_need_nothing; 231 unsigned long n_rp_need_nothing;
232 232
233 int cpu; 233 int cpu;
234}; 234};
@@ -237,25 +237,36 @@ struct rcu_data {
237#define RCU_GP_IDLE 0 /* No grace period in progress. */ 237#define RCU_GP_IDLE 0 /* No grace period in progress. */
238#define RCU_GP_INIT 1 /* Grace period being initialized. */ 238#define RCU_GP_INIT 1 /* Grace period being initialized. */
239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
240#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */ 240#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
241#define RCU_FORCE_QS 4 /* Need to force quiescent state. */
242#ifdef CONFIG_NO_HZ 241#ifdef CONFIG_NO_HZ
243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 242#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
244#else /* #ifdef CONFIG_NO_HZ */ 243#else /* #ifdef CONFIG_NO_HZ */
245#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED 244#define RCU_SIGNAL_INIT RCU_FORCE_QS
246#endif /* #else #ifdef CONFIG_NO_HZ */ 245#endif /* #else #ifdef CONFIG_NO_HZ */
247 246
248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 247#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
249#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 248#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
250#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ 249
251#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ 250#ifdef CONFIG_PROVE_RCU
252#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 251#define RCU_STALL_DELAY_DELTA (5 * HZ)
253 /* to take at least one */ 252#else
254 /* scheduling clock irq */ 253#define RCU_STALL_DELAY_DELTA 0
255 /* before ratting on them. */ 254#endif
255
256#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA)
257 /* for rsp->jiffies_stall */
258#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
259 /* for rsp->jiffies_stall */
260#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
261 /* to take at least one */
262 /* scheduling clock irq */
263 /* before ratting on them. */
256 264
257#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 265#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
258 266
267#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
268#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
269
259/* 270/*
260 * RCU global state, including node hierarchy. This hierarchy is 271 * RCU global state, including node hierarchy. This hierarchy is
261 * represented in "heap" form in a dense array. The root (first level) 272 * represented in "heap" form in a dense array. The root (first level)
@@ -277,12 +288,19 @@ struct rcu_state {
277 288
278 u8 signaled ____cacheline_internodealigned_in_smp; 289 u8 signaled ____cacheline_internodealigned_in_smp;
279 /* Force QS state. */ 290 /* Force QS state. */
280 long gpnum; /* Current gp number. */ 291 u8 fqs_active; /* force_quiescent_state() */
281 long completed; /* # of last completed gp. */ 292 /* is running. */
293 u8 fqs_need_gp; /* A CPU was prevented from */
294 /* starting a new grace */
295 /* period because */
296 /* force_quiescent_state() */
297 /* was running. */
298 unsigned long gpnum; /* Current gp number. */
299 unsigned long completed; /* # of last completed gp. */
282 300
283 /* End of fields guarded by root rcu_node's lock. */ 301 /* End of fields guarded by root rcu_node's lock. */
284 302
285 spinlock_t onofflock; /* exclude on/offline and */ 303 raw_spinlock_t onofflock; /* exclude on/offline and */
286 /* starting new GP. Also */ 304 /* starting new GP. Also */
287 /* protects the following */ 305 /* protects the following */
288 /* orphan_cbs fields. */ 306 /* orphan_cbs fields. */
@@ -292,10 +310,8 @@ struct rcu_state {
292 /* going offline. */ 310 /* going offline. */
293 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ 311 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
294 long orphan_qlen; /* Number of orphaned cbs. */ 312 long orphan_qlen; /* Number of orphaned cbs. */
295 spinlock_t fqslock; /* Only one task forcing */ 313 raw_spinlock_t fqslock; /* Only one task forcing */
296 /* quiescent states. */ 314 /* quiescent states. */
297 long completed_fqs; /* Value of completed @ snap. */
298 /* Protected by fqslock. */
299 unsigned long jiffies_force_qs; /* Time at which to invoke */ 315 unsigned long jiffies_force_qs; /* Time at which to invoke */
300 /* force_quiescent_state(). */ 316 /* force_quiescent_state(). */
301 unsigned long n_force_qs; /* Number of calls to */ 317 unsigned long n_force_qs; /* Number of calls to */
@@ -319,8 +335,6 @@ struct rcu_state {
319#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ 335#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
320 /* GP were moved to root. */ 336 /* GP were moved to root. */
321 337
322#ifdef RCU_TREE_NONCORE
323
324/* 338/*
325 * RCU implementation internal declarations: 339 * RCU implementation internal declarations:
326 */ 340 */
@@ -335,7 +349,7 @@ extern struct rcu_state rcu_preempt_state;
335DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 349DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
336#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 350#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
337 351
338#else /* #ifdef RCU_TREE_NONCORE */ 352#ifndef RCU_TREE_NONCORE
339 353
340/* Forward declarations for rcutree_plugin.h */ 354/* Forward declarations for rcutree_plugin.h */
341static void rcu_bootup_announce(void); 355static void rcu_bootup_announce(void);
@@ -347,6 +361,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
347 unsigned long flags); 361 unsigned long flags);
348#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 362#endif /* #ifdef CONFIG_HOTPLUG_CPU */
349#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 363#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
364static void rcu_print_detail_task_stall(struct rcu_state *rsp);
350static void rcu_print_task_stall(struct rcu_node *rnp); 365static void rcu_print_task_stall(struct rcu_node *rnp);
351#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 366#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
352static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 367static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
@@ -367,5 +382,6 @@ static int rcu_preempt_needs_cpu(int cpu);
367static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 382static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
368static void rcu_preempt_send_cbs_to_orphanage(void); 383static void rcu_preempt_send_cbs_to_orphanage(void);
369static void __init __rcu_init_preempt(void); 384static void __init __rcu_init_preempt(void);
385static void rcu_needs_cpu_flush(void);
370 386
371#endif /* #else #ifdef RCU_TREE_NONCORE */ 387#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 37fbccdf41d5..79b53bda8943 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -62,6 +62,15 @@ long rcu_batches_completed(void)
62EXPORT_SYMBOL_GPL(rcu_batches_completed); 62EXPORT_SYMBOL_GPL(rcu_batches_completed);
63 63
64/* 64/*
65 * Force a quiescent state for preemptible RCU.
66 */
67void rcu_force_quiescent_state(void)
68{
69 force_quiescent_state(&rcu_preempt_state, 0);
70}
71EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
72
73/*
65 * Record a preemptable-RCU quiescent state for the specified CPU. Note 74 * Record a preemptable-RCU quiescent state for the specified CPU. Note
66 * that this just means that the task currently running on the CPU is 75 * that this just means that the task currently running on the CPU is
67 * not in a quiescent state. There might be any number of tasks blocked 76 * not in a quiescent state. There might be any number of tasks blocked
@@ -102,7 +111,7 @@ static void rcu_preempt_note_context_switch(int cpu)
102 /* Possibly blocking in an RCU read-side critical section. */ 111 /* Possibly blocking in an RCU read-side critical section. */
103 rdp = rcu_preempt_state.rda[cpu]; 112 rdp = rcu_preempt_state.rda[cpu];
104 rnp = rdp->mynode; 113 rnp = rdp->mynode;
105 spin_lock_irqsave(&rnp->lock, flags); 114 raw_spin_lock_irqsave(&rnp->lock, flags);
106 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 115 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
107 t->rcu_blocked_node = rnp; 116 t->rcu_blocked_node = rnp;
108 117
@@ -123,7 +132,7 @@ static void rcu_preempt_note_context_switch(int cpu)
123 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 132 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
124 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 133 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
125 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 134 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
126 spin_unlock_irqrestore(&rnp->lock, flags); 135 raw_spin_unlock_irqrestore(&rnp->lock, flags);
127 } 136 }
128 137
129 /* 138 /*
@@ -180,7 +189,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
180 struct rcu_node *rnp_p; 189 struct rcu_node *rnp_p;
181 190
182 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 191 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
183 spin_unlock_irqrestore(&rnp->lock, flags); 192 raw_spin_unlock_irqrestore(&rnp->lock, flags);
184 return; /* Still need more quiescent states! */ 193 return; /* Still need more quiescent states! */
185 } 194 }
186 195
@@ -197,8 +206,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
197 206
198 /* Report up the rest of the hierarchy. */ 207 /* Report up the rest of the hierarchy. */
199 mask = rnp->grpmask; 208 mask = rnp->grpmask;
200 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 209 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
201 spin_lock(&rnp_p->lock); /* irqs already disabled. */ 210 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
202 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); 211 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
203} 212}
204 213
@@ -248,10 +257,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
248 */ 257 */
249 for (;;) { 258 for (;;) {
250 rnp = t->rcu_blocked_node; 259 rnp = t->rcu_blocked_node;
251 spin_lock(&rnp->lock); /* irqs already disabled. */ 260 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
252 if (rnp == t->rcu_blocked_node) 261 if (rnp == t->rcu_blocked_node)
253 break; 262 break;
254 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 263 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
255 } 264 }
256 empty = !rcu_preempted_readers(rnp); 265 empty = !rcu_preempted_readers(rnp);
257 empty_exp = !rcu_preempted_readers_exp(rnp); 266 empty_exp = !rcu_preempted_readers_exp(rnp);
@@ -265,7 +274,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
265 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 274 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
266 */ 275 */
267 if (empty) 276 if (empty)
268 spin_unlock_irqrestore(&rnp->lock, flags); 277 raw_spin_unlock_irqrestore(&rnp->lock, flags);
269 else 278 else
270 rcu_report_unblock_qs_rnp(rnp, flags); 279 rcu_report_unblock_qs_rnp(rnp, flags);
271 280
@@ -295,29 +304,73 @@ void __rcu_read_unlock(void)
295 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 304 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
296 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 305 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
297 rcu_read_unlock_special(t); 306 rcu_read_unlock_special(t);
307#ifdef CONFIG_PROVE_LOCKING
308 WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0);
309#endif /* #ifdef CONFIG_PROVE_LOCKING */
298} 310}
299EXPORT_SYMBOL_GPL(__rcu_read_unlock); 311EXPORT_SYMBOL_GPL(__rcu_read_unlock);
300 312
301#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 313#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
302 314
315#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
316
317/*
318 * Dump detailed information for all tasks blocking the current RCU
319 * grace period on the specified rcu_node structure.
320 */
321static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
322{
323 unsigned long flags;
324 struct list_head *lp;
325 int phase;
326 struct task_struct *t;
327
328 if (rcu_preempted_readers(rnp)) {
329 raw_spin_lock_irqsave(&rnp->lock, flags);
330 phase = rnp->gpnum & 0x1;
331 lp = &rnp->blocked_tasks[phase];
332 list_for_each_entry(t, lp, rcu_node_entry)
333 sched_show_task(t);
334 raw_spin_unlock_irqrestore(&rnp->lock, flags);
335 }
336}
337
338/*
339 * Dump detailed information for all tasks blocking the current RCU
340 * grace period.
341 */
342static void rcu_print_detail_task_stall(struct rcu_state *rsp)
343{
344 struct rcu_node *rnp = rcu_get_root(rsp);
345
346 rcu_print_detail_task_stall_rnp(rnp);
347 rcu_for_each_leaf_node(rsp, rnp)
348 rcu_print_detail_task_stall_rnp(rnp);
349}
350
351#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
352
353static void rcu_print_detail_task_stall(struct rcu_state *rsp)
354{
355}
356
357#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
358
303/* 359/*
304 * Scan the current list of tasks blocked within RCU read-side critical 360 * Scan the current list of tasks blocked within RCU read-side critical
305 * sections, printing out the tid of each. 361 * sections, printing out the tid of each.
306 */ 362 */
307static void rcu_print_task_stall(struct rcu_node *rnp) 363static void rcu_print_task_stall(struct rcu_node *rnp)
308{ 364{
309 unsigned long flags;
310 struct list_head *lp; 365 struct list_head *lp;
311 int phase; 366 int phase;
312 struct task_struct *t; 367 struct task_struct *t;
313 368
314 if (rcu_preempted_readers(rnp)) { 369 if (rcu_preempted_readers(rnp)) {
315 spin_lock_irqsave(&rnp->lock, flags);
316 phase = rnp->gpnum & 0x1; 370 phase = rnp->gpnum & 0x1;
317 lp = &rnp->blocked_tasks[phase]; 371 lp = &rnp->blocked_tasks[phase];
318 list_for_each_entry(t, lp, rcu_node_entry) 372 list_for_each_entry(t, lp, rcu_node_entry)
319 printk(" P%d", t->pid); 373 printk(" P%d", t->pid);
320 spin_unlock_irqrestore(&rnp->lock, flags);
321 } 374 }
322} 375}
323 376
@@ -388,11 +441,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
388 lp_root = &rnp_root->blocked_tasks[i]; 441 lp_root = &rnp_root->blocked_tasks[i];
389 while (!list_empty(lp)) { 442 while (!list_empty(lp)) {
390 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 443 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
391 spin_lock(&rnp_root->lock); /* irqs already disabled */ 444 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
392 list_del(&tp->rcu_node_entry); 445 list_del(&tp->rcu_node_entry);
393 tp->rcu_blocked_node = rnp_root; 446 tp->rcu_blocked_node = rnp_root;
394 list_add(&tp->rcu_node_entry, lp_root); 447 list_add(&tp->rcu_node_entry, lp_root);
395 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 448 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */
396 } 449 }
397 } 450 }
398 return retval; 451 return retval;
@@ -516,7 +569,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
516 unsigned long flags; 569 unsigned long flags;
517 unsigned long mask; 570 unsigned long mask;
518 571
519 spin_lock_irqsave(&rnp->lock, flags); 572 raw_spin_lock_irqsave(&rnp->lock, flags);
520 for (;;) { 573 for (;;) {
521 if (!sync_rcu_preempt_exp_done(rnp)) 574 if (!sync_rcu_preempt_exp_done(rnp))
522 break; 575 break;
@@ -525,12 +578,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
525 break; 578 break;
526 } 579 }
527 mask = rnp->grpmask; 580 mask = rnp->grpmask;
528 spin_unlock(&rnp->lock); /* irqs remain disabled */ 581 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
529 rnp = rnp->parent; 582 rnp = rnp->parent;
530 spin_lock(&rnp->lock); /* irqs already disabled */ 583 raw_spin_lock(&rnp->lock); /* irqs already disabled */
531 rnp->expmask &= ~mask; 584 rnp->expmask &= ~mask;
532 } 585 }
533 spin_unlock_irqrestore(&rnp->lock, flags); 586 raw_spin_unlock_irqrestore(&rnp->lock, flags);
534} 587}
535 588
536/* 589/*
@@ -545,11 +598,11 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
545{ 598{
546 int must_wait; 599 int must_wait;
547 600
548 spin_lock(&rnp->lock); /* irqs already disabled */ 601 raw_spin_lock(&rnp->lock); /* irqs already disabled */
549 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 602 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
550 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 603 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
551 must_wait = rcu_preempted_readers_exp(rnp); 604 must_wait = rcu_preempted_readers_exp(rnp);
552 spin_unlock(&rnp->lock); /* irqs remain disabled */ 605 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
553 if (!must_wait) 606 if (!must_wait)
554 rcu_report_exp_rnp(rsp, rnp); 607 rcu_report_exp_rnp(rsp, rnp);
555} 608}
@@ -594,13 +647,13 @@ void synchronize_rcu_expedited(void)
594 /* force all RCU readers onto blocked_tasks[]. */ 647 /* force all RCU readers onto blocked_tasks[]. */
595 synchronize_sched_expedited(); 648 synchronize_sched_expedited();
596 649
597 spin_lock_irqsave(&rsp->onofflock, flags); 650 raw_spin_lock_irqsave(&rsp->onofflock, flags);
598 651
599 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 652 /* Initialize ->expmask for all non-leaf rcu_node structures. */
600 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 653 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
601 spin_lock(&rnp->lock); /* irqs already disabled. */ 654 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
602 rnp->expmask = rnp->qsmaskinit; 655 rnp->expmask = rnp->qsmaskinit;
603 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 656 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
604 } 657 }
605 658
606 /* Snapshot current state of ->blocked_tasks[] lists. */ 659 /* Snapshot current state of ->blocked_tasks[] lists. */
@@ -609,7 +662,7 @@ void synchronize_rcu_expedited(void)
609 if (NUM_RCU_NODES > 1) 662 if (NUM_RCU_NODES > 1)
610 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); 663 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
611 664
612 spin_unlock_irqrestore(&rsp->onofflock, flags); 665 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
613 666
614 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 667 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
615 rnp = rcu_get_root(rsp); 668 rnp = rcu_get_root(rsp);
@@ -713,6 +766,16 @@ long rcu_batches_completed(void)
713EXPORT_SYMBOL_GPL(rcu_batches_completed); 766EXPORT_SYMBOL_GPL(rcu_batches_completed);
714 767
715/* 768/*
769 * Force a quiescent state for RCU, which, because there is no preemptible
770 * RCU, becomes the same as rcu-sched.
771 */
772void rcu_force_quiescent_state(void)
773{
774 rcu_sched_force_quiescent_state();
775}
776EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
777
778/*
716 * Because preemptible RCU does not exist, we never have to check for 779 * Because preemptible RCU does not exist, we never have to check for
717 * CPUs being in quiescent states. 780 * CPUs being in quiescent states.
718 */ 781 */
@@ -734,7 +797,7 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
734/* Because preemptible RCU does not exist, no quieting of tasks. */ 797/* Because preemptible RCU does not exist, no quieting of tasks. */
735static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 798static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
736{ 799{
737 spin_unlock_irqrestore(&rnp->lock, flags); 800 raw_spin_unlock_irqrestore(&rnp->lock, flags);
738} 801}
739 802
740#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 803#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -745,6 +808,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
745 * Because preemptible RCU does not exist, we never have to check for 808 * Because preemptible RCU does not exist, we never have to check for
746 * tasks blocked within RCU read-side critical sections. 809 * tasks blocked within RCU read-side critical sections.
747 */ 810 */
811static void rcu_print_detail_task_stall(struct rcu_state *rsp)
812{
813}
814
815/*
816 * Because preemptible RCU does not exist, we never have to check for
817 * tasks blocked within RCU read-side critical sections.
818 */
748static void rcu_print_task_stall(struct rcu_node *rnp) 819static void rcu_print_task_stall(struct rcu_node *rnp)
749{ 820{
750} 821}
@@ -884,3 +955,115 @@ static void __init __rcu_init_preempt(void)
884} 955}
885 956
886#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 957#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
958
959#if !defined(CONFIG_RCU_FAST_NO_HZ)
960
961/*
962 * Check to see if any future RCU-related work will need to be done
963 * by the current CPU, even if none need be done immediately, returning
964 * 1 if so. This function is part of the RCU implementation; it is -not-
965 * an exported member of the RCU API.
966 *
967 * Because CONFIG_RCU_FAST_NO_HZ is not set, just check whether this CPU needs
968 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption
969 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
970 */
971int rcu_needs_cpu(int cpu)
972{
973 return rcu_needs_cpu_quick_check(cpu);
974}
975
976/*
977 * Check to see if we need to continue a callback-flush operation to
978 * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle
979 * entry is not configured, so we never do need to.
980 */
981static void rcu_needs_cpu_flush(void)
982{
983}
984
985#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
986
987#define RCU_NEEDS_CPU_FLUSHES 5
988static DEFINE_PER_CPU(int, rcu_dyntick_drain);
989static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
990
991/*
992 * Check to see if any future RCU-related work will need to be done
993 * by the current CPU, even if none need be done immediately, returning
994 * 1 if so. This function is part of the RCU implementation; it is -not-
995 * an exported member of the RCU API.
996 *
997 * Because we are not supporting preemptible RCU, attempt to accelerate
998 * any current grace periods so that RCU no longer needs this CPU, but
999 * only if all other CPUs are already in dynticks-idle mode. This will
1000 * allow the CPU cores to be powered down immediately, as opposed to after
1001 * waiting many milliseconds for grace periods to elapse.
1002 *
1003 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1004 * disabled, we do one pass of force_quiescent_state(), then do a
1005 * raise_softirq() to cause rcu_process_callbacks() to be invoked later.
1006 * The per-cpu rcu_dyntick_drain variable controls the sequencing.
1007 */
1008int rcu_needs_cpu(int cpu)
1009{
1010 int c = 0;
1011 int thatcpu;
1012
1013 /* Check for being in the holdoff period. */
1014 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
1015 return rcu_needs_cpu_quick_check(cpu);
1016
1017 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1018 for_each_cpu_not(thatcpu, nohz_cpu_mask)
1019 if (thatcpu != cpu) {
1020 per_cpu(rcu_dyntick_drain, cpu) = 0;
1021 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1022 return rcu_needs_cpu_quick_check(cpu);
1023 }
1024
1025 /* Check and update the rcu_dyntick_drain sequencing. */
1026 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1027 /* First time through, initialize the counter. */
1028 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
1029 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1030 /* We have hit the limit, so time to give up. */
1031 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1032 return rcu_needs_cpu_quick_check(cpu);
1033 }
1034
1035 /* Do one step pushing remaining RCU callbacks through. */
1036 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1037 rcu_sched_qs(cpu);
1038 force_quiescent_state(&rcu_sched_state, 0);
1039 c = c || per_cpu(rcu_sched_data, cpu).nxtlist;
1040 }
1041 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1042 rcu_bh_qs(cpu);
1043 force_quiescent_state(&rcu_bh_state, 0);
1044 c = c || per_cpu(rcu_bh_data, cpu).nxtlist;
1045 }
1046
1047 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1048 if (c)
1049 raise_softirq(RCU_SOFTIRQ);
1050 return c;
1051}
1052
1053/*
1054 * Check to see if we need to continue a callback-flush operation to
1055 * allow the last CPU to enter dyntick-idle mode.
1056 */
1057static void rcu_needs_cpu_flush(void)
1058{
1059 int cpu = smp_processor_id();
1060 unsigned long flags;
1061
1062 if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
1063 return;
1064 local_irq_save(flags);
1065 (void)rcu_needs_cpu(cpu);
1066 local_irq_restore(flags);
1067}
1068
1069#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 9d2c88423b31..d45db2e35d27 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -50,7 +50,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 50{
51 if (!rdp->beenonline) 51 if (!rdp->beenonline)
52 return; 52 return;
53 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d", 53 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d",
54 rdp->cpu, 54 rdp->cpu,
55 cpu_is_offline(rdp->cpu) ? '!' : ' ', 55 cpu_is_offline(rdp->cpu) ? '!' : ' ',
56 rdp->completed, rdp->gpnum, 56 rdp->completed, rdp->gpnum,
@@ -105,7 +105,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
105{ 105{
106 if (!rdp->beenonline) 106 if (!rdp->beenonline)
107 return; 107 return;
108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d",
109 rdp->cpu, 109 rdp->cpu,
110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
111 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
@@ -155,13 +155,13 @@ static const struct file_operations rcudata_csv_fops = {
155 155
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 157{
158 long gpnum; 158 unsigned long gpnum;
159 int level = 0; 159 int level = 0;
160 int phase; 160 int phase;
161 struct rcu_node *rnp; 161 struct rcu_node *rnp;
162 162
163 gpnum = rsp->gpnum; 163 gpnum = rsp->gpnum;
164 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 164 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
166 rsp->completed, gpnum, rsp->signaled, 166 rsp->completed, gpnum, rsp->signaled,
167 (long)(rsp->jiffies_force_qs - jiffies), 167 (long)(rsp->jiffies_force_qs - jiffies),
@@ -215,12 +215,12 @@ static const struct file_operations rcuhier_fops = {
215static int show_rcugp(struct seq_file *m, void *unused) 215static int show_rcugp(struct seq_file *m, void *unused)
216{ 216{
217#ifdef CONFIG_TREE_PREEMPT_RCU 217#ifdef CONFIG_TREE_PREEMPT_RCU
218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n", 218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n",
219 rcu_preempt_state.completed, rcu_preempt_state.gpnum); 219 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", 221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n",
222 rcu_sched_state.completed, rcu_sched_state.gpnum); 222 rcu_sched_state.completed, rcu_sched_state.gpnum);
223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
224 rcu_bh_state.completed, rcu_bh_state.gpnum); 224 rcu_bh_state.completed, rcu_bh_state.gpnum);
225 return 0; 225 return 0;
226} 226}
diff --git a/kernel/relay.c b/kernel/relay.c
index c705a41b4ba3..3d97f2821611 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
1215/* 1215/*
1216 * subbuf_splice_actor - splice up to one subbuf's worth of data 1216 * subbuf_splice_actor - splice up to one subbuf's worth of data
1217 */ 1217 */
1218static int subbuf_splice_actor(struct file *in, 1218static ssize_t subbuf_splice_actor(struct file *in,
1219 loff_t *ppos, 1219 loff_t *ppos,
1220 struct pipe_inode_info *pipe, 1220 struct pipe_inode_info *pipe,
1221 size_t len, 1221 size_t len,
1222 unsigned int flags, 1222 unsigned int flags,
1223 int *nonpad_ret) 1223 int *nonpad_ret)
1224{ 1224{
1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; 1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
1226 struct rchan_buf *rbuf = in->private_data; 1226 struct rchan_buf *rbuf = in->private_data;
1227 unsigned int subbuf_size = rbuf->chan->subbuf_size; 1227 unsigned int subbuf_size = rbuf->chan->subbuf_size;
1228 uint64_t pos = (uint64_t) *ppos; 1228 uint64_t pos = (uint64_t) *ppos;
@@ -1241,6 +1241,7 @@ static int subbuf_splice_actor(struct file *in,
1241 .ops = &relay_pipe_buf_ops, 1241 .ops = &relay_pipe_buf_ops,
1242 .spd_release = relay_page_release, 1242 .spd_release = relay_page_release,
1243 }; 1243 };
1244 ssize_t ret;
1244 1245
1245 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1246 return 0; 1247 return 0;
diff --git a/kernel/resource.c b/kernel/resource.c
index af96c1e4b54b..2d5be5d9bf5f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -188,6 +188,36 @@ static int __release_resource(struct resource *old)
188 return -EINVAL; 188 return -EINVAL;
189} 189}
190 190
191static void __release_child_resources(struct resource *r)
192{
193 struct resource *tmp, *p;
194 resource_size_t size;
195
196 p = r->child;
197 r->child = NULL;
198 while (p) {
199 tmp = p;
200 p = p->sibling;
201
202 tmp->parent = NULL;
203 tmp->sibling = NULL;
204 __release_child_resources(tmp);
205
206 printk(KERN_DEBUG "release child resource %pR\n", tmp);
207 /* need to restore size, and keep flags */
208 size = resource_size(tmp);
209 tmp->start = 0;
210 tmp->end = size - 1;
211 }
212}
213
214void release_child_resources(struct resource *r)
215{
216 write_lock(&resource_lock);
217 __release_child_resources(r);
218 write_unlock(&resource_lock);
219}
220
191/** 221/**
192 * request_resource - request and reserve an I/O or memory resource 222 * request_resource - request and reserve an I/O or memory resource
193 * @root: root resource descriptor 223 * @root: root resource descriptor
@@ -274,7 +304,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
274 void *arg, int (*func)(unsigned long, unsigned long, void *)) 304 void *arg, int (*func)(unsigned long, unsigned long, void *))
275{ 305{
276 struct resource res; 306 struct resource res;
277 unsigned long pfn, len; 307 unsigned long pfn, end_pfn;
278 u64 orig_end; 308 u64 orig_end;
279 int ret = -1; 309 int ret = -1;
280 310
@@ -284,9 +314,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
284 orig_end = res.end; 314 orig_end = res.end;
285 while ((res.start < res.end) && 315 while ((res.start < res.end) &&
286 (find_next_system_ram(&res, "System RAM") >= 0)) { 316 (find_next_system_ram(&res, "System RAM") >= 0)) {
287 pfn = (unsigned long)(res.start >> PAGE_SHIFT); 317 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
288 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); 318 end_pfn = (res.end + 1) >> PAGE_SHIFT;
289 ret = (*func)(pfn, len, arg); 319 if (end_pfn > pfn)
320 ret = (*func)(pfn, end_pfn - pfn, arg);
290 if (ret) 321 if (ret)
291 break; 322 break;
292 res.start = res.end + 1; 323 res.start = res.end + 1;
@@ -297,14 +328,29 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
297 328
298#endif 329#endif
299 330
331static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
332{
333 return 1;
334}
335/*
336 * This generic page_is_ram() returns true if the specified address is
337 * registered as "System RAM" in the iomem_resource list.
338 */
339int __weak page_is_ram(unsigned long pfn)
340{
341 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
342}
343
300/* 344/*
301 * Find empty slot in the resource tree given range and alignment. 345 * Find empty slot in the resource tree given range and alignment.
302 */ 346 */
303static int find_resource(struct resource *root, struct resource *new, 347static int find_resource(struct resource *root, struct resource *new,
304 resource_size_t size, resource_size_t min, 348 resource_size_t size, resource_size_t min,
305 resource_size_t max, resource_size_t align, 349 resource_size_t max, resource_size_t align,
306 void (*alignf)(void *, struct resource *, 350 resource_size_t (*alignf)(void *,
307 resource_size_t, resource_size_t), 351 const struct resource *,
352 resource_size_t,
353 resource_size_t),
308 void *alignf_data) 354 void *alignf_data)
309{ 355{
310 struct resource *this = root->child; 356 struct resource *this = root->child;
@@ -330,7 +376,7 @@ static int find_resource(struct resource *root, struct resource *new,
330 tmp.end = max; 376 tmp.end = max;
331 tmp.start = ALIGN(tmp.start, align); 377 tmp.start = ALIGN(tmp.start, align);
332 if (alignf) 378 if (alignf)
333 alignf(alignf_data, &tmp, size, align); 379 tmp.start = alignf(alignf_data, &tmp, size, align);
334 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { 380 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
335 new->start = tmp.start; 381 new->start = tmp.start;
336 new->end = tmp.start + size - 1; 382 new->end = tmp.start + size - 1;
@@ -358,8 +404,10 @@ static int find_resource(struct resource *root, struct resource *new,
358int allocate_resource(struct resource *root, struct resource *new, 404int allocate_resource(struct resource *root, struct resource *new,
359 resource_size_t size, resource_size_t min, 405 resource_size_t size, resource_size_t min,
360 resource_size_t max, resource_size_t align, 406 resource_size_t max, resource_size_t align,
361 void (*alignf)(void *, struct resource *, 407 resource_size_t (*alignf)(void *,
362 resource_size_t, resource_size_t), 408 const struct resource *,
409 resource_size_t,
410 resource_size_t),
363 void *alignf_data) 411 void *alignf_data)
364{ 412{
365 int err; 413 int err;
diff --git a/kernel/sched.c b/kernel/sched.c
index c535cc4f6428..9ab3cd7858d3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233 */ 233 */
234static DEFINE_MUTEX(sched_domains_mutex); 234static DEFINE_MUTEX(sched_domains_mutex);
235 235
236#ifdef CONFIG_GROUP_SCHED 236#ifdef CONFIG_CGROUP_SCHED
237 237
238#include <linux/cgroup.h> 238#include <linux/cgroup.h>
239 239
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
243 243
244/* task group related information */ 244/* task group related information */
245struct task_group { 245struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css; 246 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253 247
254#ifdef CONFIG_FAIR_GROUP_SCHED 248#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */ 249 /* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
274 struct list_head children; 268 struct list_head children;
275}; 269};
276 270
277#ifdef CONFIG_USER_SCHED
278
279/* Helper function to pass uid information to create_sched_user() */
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285/*
286 * Root task group.
287 * Every UID task group (including init_task_group aka UID-0) will
288 * be a child to this group.
289 */
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293/* Default task group's sched entity on each cpu */
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295/* Default task group's cfs_rq on each cpu */
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif /* CONFIG_FAIR_GROUP_SCHED */
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 271#define root_task_group init_task_group
305#endif /* CONFIG_USER_SCHED */
306 272
307/* task_group_lock serializes add/remove of task groups and also changes to 273/* task_group_lock serializes add/remove of task groups and also changes to
308 * a task group's cpu shares. 274 * a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
318} 284}
319#endif 285#endif
320 286
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else /* !CONFIG_USER_SCHED */
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 287# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif /* CONFIG_USER_SCHED */
326 288
327/* 289/*
328 * A weight of 0 or 1 can cause arithmetic problems. 290 * A weight of 0 or 1 can cause arithmetic problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
348{ 310{
349 struct task_group *tg; 311 struct task_group *tg;
350 312
351#ifdef CONFIG_USER_SCHED 313#ifdef CONFIG_CGROUP_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 314 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css); 315 struct task_group, css);
358#else 316#else
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
383 return NULL; 341 return NULL;
384} 342}
385 343
386#endif /* CONFIG_GROUP_SCHED */ 344#endif /* CONFIG_CGROUP_SCHED */
387 345
388/* CFS-related fields in a runqueue */ 346/* CFS-related fields in a runqueue */
389struct cfs_rq { 347struct cfs_rq {
@@ -478,7 +436,6 @@ struct rt_rq {
478 struct rq *rq; 436 struct rq *rq;
479 struct list_head leaf_rt_rq_list; 437 struct list_head leaf_rt_rq_list;
480 struct task_group *tg; 438 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif 439#endif
483}; 440};
484 441
@@ -645,6 +602,11 @@ static inline int cpu_of(struct rq *rq)
645#endif 602#endif
646} 603}
647 604
605#define rcu_dereference_check_sched_domain(p) \
606 rcu_dereference_check((p), \
607 rcu_read_lock_sched_held() || \
608 lockdep_is_held(&sched_domains_mutex))
609
648/* 610/*
649 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 611 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
650 * See detach_destroy_domains: synchronize_sched for details. 612 * See detach_destroy_domains: synchronize_sched for details.
@@ -653,7 +615,7 @@ static inline int cpu_of(struct rq *rq)
653 * preempt-disabled sections. 615 * preempt-disabled sections.
654 */ 616 */
655#define for_each_domain(cpu, __sd) \ 617#define for_each_domain(cpu, __sd) \
656 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 618 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
657 619
658#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 620#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
659#define this_rq() (&__get_cpu_var(runqueues)) 621#define this_rq() (&__get_cpu_var(runqueues))
@@ -941,16 +903,33 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
941#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 903#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
942 904
943/* 905/*
906 * Check whether the task is waking, we use this to synchronize against
907 * ttwu() so that task_cpu() reports a stable number.
908 *
909 * We need to make an exception for PF_STARTING tasks because the fork
910 * path might require task_rq_lock() to work, eg. it can call
911 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
912 */
913static inline int task_is_waking(struct task_struct *p)
914{
915 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
916}
917
918/*
944 * __task_rq_lock - lock the runqueue a given task resides on. 919 * __task_rq_lock - lock the runqueue a given task resides on.
945 * Must be called interrupts disabled. 920 * Must be called interrupts disabled.
946 */ 921 */
947static inline struct rq *__task_rq_lock(struct task_struct *p) 922static inline struct rq *__task_rq_lock(struct task_struct *p)
948 __acquires(rq->lock) 923 __acquires(rq->lock)
949{ 924{
925 struct rq *rq;
926
950 for (;;) { 927 for (;;) {
951 struct rq *rq = task_rq(p); 928 while (task_is_waking(p))
929 cpu_relax();
930 rq = task_rq(p);
952 raw_spin_lock(&rq->lock); 931 raw_spin_lock(&rq->lock);
953 if (likely(rq == task_rq(p))) 932 if (likely(rq == task_rq(p) && !task_is_waking(p)))
954 return rq; 933 return rq;
955 raw_spin_unlock(&rq->lock); 934 raw_spin_unlock(&rq->lock);
956 } 935 }
@@ -967,10 +946,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
967 struct rq *rq; 946 struct rq *rq;
968 947
969 for (;;) { 948 for (;;) {
949 while (task_is_waking(p))
950 cpu_relax();
970 local_irq_save(*flags); 951 local_irq_save(*flags);
971 rq = task_rq(p); 952 rq = task_rq(p);
972 raw_spin_lock(&rq->lock); 953 raw_spin_lock(&rq->lock);
973 if (likely(rq == task_rq(p))) 954 if (likely(rq == task_rq(p) && !task_is_waking(p)))
974 return rq; 955 return rq;
975 raw_spin_unlock_irqrestore(&rq->lock, *flags); 956 raw_spin_unlock_irqrestore(&rq->lock, *flags);
976 } 957 }
@@ -1390,32 +1371,6 @@ static const u32 prio_to_wmult[40] = {
1390 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1371 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1391}; 1372};
1392 1373
1393static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1394
1395/*
1396 * runqueue iterator, to support SMP load-balancing between different
1397 * scheduling classes, without having to expose their internal data
1398 * structures to the load-balancing proper:
1399 */
1400struct rq_iterator {
1401 void *arg;
1402 struct task_struct *(*start)(void *);
1403 struct task_struct *(*next)(void *);
1404};
1405
1406#ifdef CONFIG_SMP
1407static unsigned long
1408balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1409 unsigned long max_load_move, struct sched_domain *sd,
1410 enum cpu_idle_type idle, int *all_pinned,
1411 int *this_best_prio, struct rq_iterator *iterator);
1412
1413static int
1414iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1415 struct sched_domain *sd, enum cpu_idle_type idle,
1416 struct rq_iterator *iterator);
1417#endif
1418
1419/* Time spent by the tasks of the cpu accounting group executing in ... */ 1374/* Time spent by the tasks of the cpu accounting group executing in ... */
1420enum cpuacct_stat_index { 1375enum cpuacct_stat_index {
1421 CPUACCT_STAT_USER, /* ... user mode */ 1376 CPUACCT_STAT_USER, /* ... user mode */
@@ -1531,7 +1486,7 @@ static unsigned long target_load(int cpu, int type)
1531 1486
1532static struct sched_group *group_of(int cpu) 1487static struct sched_group *group_of(int cpu)
1533{ 1488{
1534 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); 1489 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1535 1490
1536 if (!sd) 1491 if (!sd)
1537 return NULL; 1492 return NULL;
@@ -1566,7 +1521,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1566 1521
1567#ifdef CONFIG_FAIR_GROUP_SCHED 1522#ifdef CONFIG_FAIR_GROUP_SCHED
1568 1523
1569static __read_mostly unsigned long *update_shares_data; 1524static __read_mostly unsigned long __percpu *update_shares_data;
1570 1525
1571static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1526static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1572 1527
@@ -1701,16 +1656,6 @@ static void update_shares(struct sched_domain *sd)
1701 } 1656 }
1702} 1657}
1703 1658
1704static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1705{
1706 if (root_task_group_empty())
1707 return;
1708
1709 raw_spin_unlock(&rq->lock);
1710 update_shares(sd);
1711 raw_spin_lock(&rq->lock);
1712}
1713
1714static void update_h_load(long cpu) 1659static void update_h_load(long cpu)
1715{ 1660{
1716 if (root_task_group_empty()) 1661 if (root_task_group_empty())
@@ -1725,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd)
1725{ 1670{
1726} 1671}
1727 1672
1728static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1729{
1730}
1731
1732#endif 1673#endif
1733 1674
1734#ifdef CONFIG_PREEMPT 1675#ifdef CONFIG_PREEMPT
@@ -1805,6 +1746,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 raw_spin_unlock(&busiest->lock); 1746 raw_spin_unlock(&busiest->lock);
1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1747 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1807} 1748}
1749
1750/*
1751 * double_rq_lock - safely lock two runqueues
1752 *
1753 * Note this does not disable interrupts like task_rq_lock,
1754 * you need to do so manually before calling.
1755 */
1756static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1757 __acquires(rq1->lock)
1758 __acquires(rq2->lock)
1759{
1760 BUG_ON(!irqs_disabled());
1761 if (rq1 == rq2) {
1762 raw_spin_lock(&rq1->lock);
1763 __acquire(rq2->lock); /* Fake it out ;) */
1764 } else {
1765 if (rq1 < rq2) {
1766 raw_spin_lock(&rq1->lock);
1767 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1768 } else {
1769 raw_spin_lock(&rq2->lock);
1770 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1771 }
1772 }
1773 update_rq_clock(rq1);
1774 update_rq_clock(rq2);
1775}
1776
1777/*
1778 * double_rq_unlock - safely unlock two runqueues
1779 *
1780 * Note this does not restore interrupts like task_rq_unlock,
1781 * you need to do so manually after calling.
1782 */
1783static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1784 __releases(rq1->lock)
1785 __releases(rq2->lock)
1786{
1787 raw_spin_unlock(&rq1->lock);
1788 if (rq1 != rq2)
1789 raw_spin_unlock(&rq2->lock);
1790 else
1791 __release(rq2->lock);
1792}
1793
1808#endif 1794#endif
1809 1795
1810#ifdef CONFIG_FAIR_GROUP_SCHED 1796#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1834,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1834#endif 1820#endif
1835} 1821}
1836 1822
1837#include "sched_stats.h" 1823static const struct sched_class rt_sched_class;
1838#include "sched_idletask.c"
1839#include "sched_fair.c"
1840#include "sched_rt.c"
1841#ifdef CONFIG_SCHED_DEBUG
1842# include "sched_debug.c"
1843#endif
1844 1824
1845#define sched_class_highest (&rt_sched_class) 1825#define sched_class_highest (&rt_sched_class)
1846#define for_each_class(class) \ 1826#define for_each_class(class) \
1847 for (class = sched_class_highest; class; class = class->next) 1827 for (class = sched_class_highest; class; class = class->next)
1848 1828
1829#include "sched_stats.h"
1830
1849static void inc_nr_running(struct rq *rq) 1831static void inc_nr_running(struct rq *rq)
1850{ 1832{
1851 rq->nr_running++; 1833 rq->nr_running++;
@@ -1883,13 +1865,14 @@ static void update_avg(u64 *avg, u64 sample)
1883 *avg += diff >> 3; 1865 *avg += diff >> 3;
1884} 1866}
1885 1867
1886static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1868static void
1869enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1887{ 1870{
1888 if (wakeup) 1871 if (wakeup)
1889 p->se.start_runtime = p->se.sum_exec_runtime; 1872 p->se.start_runtime = p->se.sum_exec_runtime;
1890 1873
1891 sched_info_queued(p); 1874 sched_info_queued(p);
1892 p->sched_class->enqueue_task(rq, p, wakeup); 1875 p->sched_class->enqueue_task(rq, p, wakeup, head);
1893 p->se.on_rq = 1; 1876 p->se.on_rq = 1;
1894} 1877}
1895 1878
@@ -1912,6 +1895,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1912} 1895}
1913 1896
1914/* 1897/*
1898 * activate_task - move a task to the runqueue.
1899 */
1900static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1901{
1902 if (task_contributes_to_load(p))
1903 rq->nr_uninterruptible--;
1904
1905 enqueue_task(rq, p, wakeup, false);
1906 inc_nr_running(rq);
1907}
1908
1909/*
1910 * deactivate_task - remove a task from the runqueue.
1911 */
1912static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1913{
1914 if (task_contributes_to_load(p))
1915 rq->nr_uninterruptible++;
1916
1917 dequeue_task(rq, p, sleep);
1918 dec_nr_running(rq);
1919}
1920
1921#include "sched_idletask.c"
1922#include "sched_fair.c"
1923#include "sched_rt.c"
1924#ifdef CONFIG_SCHED_DEBUG
1925# include "sched_debug.c"
1926#endif
1927
1928/*
1915 * __normal_prio - return the priority that is based on the static prio 1929 * __normal_prio - return the priority that is based on the static prio
1916 */ 1930 */
1917static inline int __normal_prio(struct task_struct *p) 1931static inline int __normal_prio(struct task_struct *p)
@@ -1957,30 +1971,6 @@ static int effective_prio(struct task_struct *p)
1957 return p->prio; 1971 return p->prio;
1958} 1972}
1959 1973
1960/*
1961 * activate_task - move a task to the runqueue.
1962 */
1963static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1964{
1965 if (task_contributes_to_load(p))
1966 rq->nr_uninterruptible--;
1967
1968 enqueue_task(rq, p, wakeup);
1969 inc_nr_running(rq);
1970}
1971
1972/*
1973 * deactivate_task - remove a task from the runqueue.
1974 */
1975static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1976{
1977 if (task_contributes_to_load(p))
1978 rq->nr_uninterruptible++;
1979
1980 dequeue_task(rq, p, sleep);
1981 dec_nr_running(rq);
1982}
1983
1984/** 1974/**
1985 * task_curr - is this task currently executing on a CPU? 1975 * task_curr - is this task currently executing on a CPU?
1986 * @p: the task in question. 1976 * @p: the task in question.
@@ -2320,14 +2310,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2320} 2310}
2321 2311
2322/* 2312/*
2323 * Called from: 2313 * Gets called from 3 sites (exec, fork, wakeup), since it is called without
2324 * 2314 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2325 * - fork, @p is stable because it isn't on the tasklist yet 2315 * by:
2326 * 2316 *
2327 * - exec, @p is unstable, retry loop 2317 * exec: is unstable, retry loop
2328 * 2318 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2329 * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so
2330 * we should be good.
2331 */ 2319 */
2332static inline 2320static inline
2333int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2321int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
@@ -2371,7 +2359,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2371{ 2359{
2372 int cpu, orig_cpu, this_cpu, success = 0; 2360 int cpu, orig_cpu, this_cpu, success = 0;
2373 unsigned long flags; 2361 unsigned long flags;
2374 struct rq *rq, *orig_rq; 2362 struct rq *rq;
2375 2363
2376 if (!sched_feat(SYNC_WAKEUPS)) 2364 if (!sched_feat(SYNC_WAKEUPS))
2377 wake_flags &= ~WF_SYNC; 2365 wake_flags &= ~WF_SYNC;
@@ -2379,7 +2367,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2379 this_cpu = get_cpu(); 2367 this_cpu = get_cpu();
2380 2368
2381 smp_wmb(); 2369 smp_wmb();
2382 rq = orig_rq = task_rq_lock(p, &flags); 2370 rq = task_rq_lock(p, &flags);
2383 update_rq_clock(rq); 2371 update_rq_clock(rq);
2384 if (!(p->state & state)) 2372 if (!(p->state & state))
2385 goto out; 2373 goto out;
@@ -2410,14 +2398,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2410 __task_rq_unlock(rq); 2398 __task_rq_unlock(rq);
2411 2399
2412 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2400 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2413 if (cpu != orig_cpu) 2401 if (cpu != orig_cpu) {
2402 /*
2403 * Since we migrate the task without holding any rq->lock,
2404 * we need to be careful with task_rq_lock(), since that
2405 * might end up locking an invalid rq.
2406 */
2414 set_task_cpu(p, cpu); 2407 set_task_cpu(p, cpu);
2408 }
2415 2409
2416 rq = __task_rq_lock(p); 2410 rq = cpu_rq(cpu);
2411 raw_spin_lock(&rq->lock);
2417 update_rq_clock(rq); 2412 update_rq_clock(rq);
2418 2413
2414 /*
2415 * We migrated the task without holding either rq->lock, however
2416 * since the task is not on the task list itself, nobody else
2417 * will try and migrate the task, hence the rq should match the
2418 * cpu we just moved it to.
2419 */
2420 WARN_ON(task_cpu(p) != cpu);
2419 WARN_ON(p->state != TASK_WAKING); 2421 WARN_ON(p->state != TASK_WAKING);
2420 cpu = task_cpu(p);
2421 2422
2422#ifdef CONFIG_SCHEDSTATS 2423#ifdef CONFIG_SCHEDSTATS
2423 schedstat_inc(rq, ttwu_count); 2424 schedstat_inc(rq, ttwu_count);
@@ -2620,9 +2621,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
2620 if (p->sched_class->task_fork) 2621 if (p->sched_class->task_fork)
2621 p->sched_class->task_fork(p); 2622 p->sched_class->task_fork(p);
2622 2623
2623#ifdef CONFIG_SMP
2624 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2625#endif
2626 set_task_cpu(p, cpu); 2624 set_task_cpu(p, cpu);
2627 2625
2628#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2626#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2652,8 +2650,29 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2652{ 2650{
2653 unsigned long flags; 2651 unsigned long flags;
2654 struct rq *rq; 2652 struct rq *rq;
2653 int cpu = get_cpu();
2654
2655#ifdef CONFIG_SMP
2656 /*
2657 * Fork balancing, do it here and not earlier because:
2658 * - cpus_allowed can change in the fork path
2659 * - any previously selected cpu might disappear through hotplug
2660 *
2661 * We still have TASK_WAKING but PF_STARTING is gone now, meaning
2662 * ->cpus_allowed is stable, we have preemption disabled, meaning
2663 * cpu_online_mask is stable.
2664 */
2665 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2666 set_task_cpu(p, cpu);
2667#endif
2668
2669 /*
2670 * Since the task is not on the rq and we still have TASK_WAKING set
2671 * nobody else will migrate this task.
2672 */
2673 rq = cpu_rq(cpu);
2674 raw_spin_lock_irqsave(&rq->lock, flags);
2655 2675
2656 rq = task_rq_lock(p, &flags);
2657 BUG_ON(p->state != TASK_WAKING); 2676 BUG_ON(p->state != TASK_WAKING);
2658 p->state = TASK_RUNNING; 2677 p->state = TASK_RUNNING;
2659 update_rq_clock(rq); 2678 update_rq_clock(rq);
@@ -2665,6 +2684,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2665 p->sched_class->task_woken(rq, p); 2684 p->sched_class->task_woken(rq, p);
2666#endif 2685#endif
2667 task_rq_unlock(rq, &flags); 2686 task_rq_unlock(rq, &flags);
2687 put_cpu();
2668} 2688}
2669 2689
2670#ifdef CONFIG_PREEMPT_NOTIFIERS 2690#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2783,7 +2803,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2783 */ 2803 */
2784 prev_state = prev->state; 2804 prev_state = prev->state;
2785 finish_arch_switch(prev); 2805 finish_arch_switch(prev);
2786 perf_event_task_sched_in(current, cpu_of(rq)); 2806#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2807 local_irq_disable();
2808#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2809 perf_event_task_sched_in(current);
2810#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2811 local_irq_enable();
2812#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2787 finish_lock_switch(rq, prev); 2813 finish_lock_switch(rq, prev);
2788 2814
2789 fire_sched_in_preempt_notifiers(current); 2815 fire_sched_in_preempt_notifiers(current);
@@ -3088,50 +3114,6 @@ static void update_cpu_load(struct rq *this_rq)
3088#ifdef CONFIG_SMP 3114#ifdef CONFIG_SMP
3089 3115
3090/* 3116/*
3091 * double_rq_lock - safely lock two runqueues
3092 *
3093 * Note this does not disable interrupts like task_rq_lock,
3094 * you need to do so manually before calling.
3095 */
3096static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3097 __acquires(rq1->lock)
3098 __acquires(rq2->lock)
3099{
3100 BUG_ON(!irqs_disabled());
3101 if (rq1 == rq2) {
3102 raw_spin_lock(&rq1->lock);
3103 __acquire(rq2->lock); /* Fake it out ;) */
3104 } else {
3105 if (rq1 < rq2) {
3106 raw_spin_lock(&rq1->lock);
3107 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3108 } else {
3109 raw_spin_lock(&rq2->lock);
3110 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3111 }
3112 }
3113 update_rq_clock(rq1);
3114 update_rq_clock(rq2);
3115}
3116
3117/*
3118 * double_rq_unlock - safely unlock two runqueues
3119 *
3120 * Note this does not restore interrupts like task_rq_unlock,
3121 * you need to do so manually after calling.
3122 */
3123static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3124 __releases(rq1->lock)
3125 __releases(rq2->lock)
3126{
3127 raw_spin_unlock(&rq1->lock);
3128 if (rq1 != rq2)
3129 raw_spin_unlock(&rq2->lock);
3130 else
3131 __release(rq2->lock);
3132}
3133
3134/*
3135 * sched_exec - execve() is a valuable balancing opportunity, because at 3117 * sched_exec - execve() is a valuable balancing opportunity, because at
3136 * this point the task has the smallest effective memory and cache footprint. 3118 * this point the task has the smallest effective memory and cache footprint.
3137 */ 3119 */
@@ -3179,1771 +3161,6 @@ again:
3179 task_rq_unlock(rq, &flags); 3161 task_rq_unlock(rq, &flags);
3180} 3162}
3181 3163
3182/*
3183 * pull_task - move a task from a remote runqueue to the local runqueue.
3184 * Both runqueues must be locked.
3185 */
3186static void pull_task(struct rq *src_rq, struct task_struct *p,
3187 struct rq *this_rq, int this_cpu)
3188{
3189 deactivate_task(src_rq, p, 0);
3190 set_task_cpu(p, this_cpu);
3191 activate_task(this_rq, p, 0);
3192 check_preempt_curr(this_rq, p, 0);
3193}
3194
3195/*
3196 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3197 */
3198static
3199int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3200 struct sched_domain *sd, enum cpu_idle_type idle,
3201 int *all_pinned)
3202{
3203 int tsk_cache_hot = 0;
3204 /*
3205 * We do not migrate tasks that are:
3206 * 1) running (obviously), or
3207 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3208 * 3) are cache-hot on their current CPU.
3209 */
3210 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3211 schedstat_inc(p, se.nr_failed_migrations_affine);
3212 return 0;
3213 }
3214 *all_pinned = 0;
3215
3216 if (task_running(rq, p)) {
3217 schedstat_inc(p, se.nr_failed_migrations_running);
3218 return 0;
3219 }
3220
3221 /*
3222 * Aggressive migration if:
3223 * 1) task is cache cold, or
3224 * 2) too many balance attempts have failed.
3225 */
3226
3227 tsk_cache_hot = task_hot(p, rq->clock, sd);
3228 if (!tsk_cache_hot ||
3229 sd->nr_balance_failed > sd->cache_nice_tries) {
3230#ifdef CONFIG_SCHEDSTATS
3231 if (tsk_cache_hot) {
3232 schedstat_inc(sd, lb_hot_gained[idle]);
3233 schedstat_inc(p, se.nr_forced_migrations);
3234 }
3235#endif
3236 return 1;
3237 }
3238
3239 if (tsk_cache_hot) {
3240 schedstat_inc(p, se.nr_failed_migrations_hot);
3241 return 0;
3242 }
3243 return 1;
3244}
3245
3246static unsigned long
3247balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3248 unsigned long max_load_move, struct sched_domain *sd,
3249 enum cpu_idle_type idle, int *all_pinned,
3250 int *this_best_prio, struct rq_iterator *iterator)
3251{
3252 int loops = 0, pulled = 0, pinned = 0;
3253 struct task_struct *p;
3254 long rem_load_move = max_load_move;
3255
3256 if (max_load_move == 0)
3257 goto out;
3258
3259 pinned = 1;
3260
3261 /*
3262 * Start the load-balancing iterator:
3263 */
3264 p = iterator->start(iterator->arg);
3265next:
3266 if (!p || loops++ > sysctl_sched_nr_migrate)
3267 goto out;
3268
3269 if ((p->se.load.weight >> 1) > rem_load_move ||
3270 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3271 p = iterator->next(iterator->arg);
3272 goto next;
3273 }
3274
3275 pull_task(busiest, p, this_rq, this_cpu);
3276 pulled++;
3277 rem_load_move -= p->se.load.weight;
3278
3279#ifdef CONFIG_PREEMPT
3280 /*
3281 * NEWIDLE balancing is a source of latency, so preemptible kernels
3282 * will stop after the first task is pulled to minimize the critical
3283 * section.
3284 */
3285 if (idle == CPU_NEWLY_IDLE)
3286 goto out;
3287#endif
3288
3289 /*
3290 * We only want to steal up to the prescribed amount of weighted load.
3291 */
3292 if (rem_load_move > 0) {
3293 if (p->prio < *this_best_prio)
3294 *this_best_prio = p->prio;
3295 p = iterator->next(iterator->arg);
3296 goto next;
3297 }
3298out:
3299 /*
3300 * Right now, this is one of only two places pull_task() is called,
3301 * so we can safely collect pull_task() stats here rather than
3302 * inside pull_task().
3303 */
3304 schedstat_add(sd, lb_gained[idle], pulled);
3305
3306 if (all_pinned)
3307 *all_pinned = pinned;
3308
3309 return max_load_move - rem_load_move;
3310}
3311
3312/*
3313 * move_tasks tries to move up to max_load_move weighted load from busiest to
3314 * this_rq, as part of a balancing operation within domain "sd".
3315 * Returns 1 if successful and 0 otherwise.
3316 *
3317 * Called with both runqueues locked.
3318 */
3319static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3320 unsigned long max_load_move,
3321 struct sched_domain *sd, enum cpu_idle_type idle,
3322 int *all_pinned)
3323{
3324 const struct sched_class *class = sched_class_highest;
3325 unsigned long total_load_moved = 0;
3326 int this_best_prio = this_rq->curr->prio;
3327
3328 do {
3329 total_load_moved +=
3330 class->load_balance(this_rq, this_cpu, busiest,
3331 max_load_move - total_load_moved,
3332 sd, idle, all_pinned, &this_best_prio);
3333 class = class->next;
3334
3335#ifdef CONFIG_PREEMPT
3336 /*
3337 * NEWIDLE balancing is a source of latency, so preemptible
3338 * kernels will stop after the first task is pulled to minimize
3339 * the critical section.
3340 */
3341 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3342 break;
3343#endif
3344 } while (class && max_load_move > total_load_moved);
3345
3346 return total_load_moved > 0;
3347}
3348
3349static int
3350iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3351 struct sched_domain *sd, enum cpu_idle_type idle,
3352 struct rq_iterator *iterator)
3353{
3354 struct task_struct *p = iterator->start(iterator->arg);
3355 int pinned = 0;
3356
3357 while (p) {
3358 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3359 pull_task(busiest, p, this_rq, this_cpu);
3360 /*
3361 * Right now, this is only the second place pull_task()
3362 * is called, so we can safely collect pull_task()
3363 * stats here rather than inside pull_task().
3364 */
3365 schedstat_inc(sd, lb_gained[idle]);
3366
3367 return 1;
3368 }
3369 p = iterator->next(iterator->arg);
3370 }
3371
3372 return 0;
3373}
3374
3375/*
3376 * move_one_task tries to move exactly one task from busiest to this_rq, as
3377 * part of active balancing operations within "domain".
3378 * Returns 1 if successful and 0 otherwise.
3379 *
3380 * Called with both runqueues locked.
3381 */
3382static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3383 struct sched_domain *sd, enum cpu_idle_type idle)
3384{
3385 const struct sched_class *class;
3386
3387 for_each_class(class) {
3388 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3389 return 1;
3390 }
3391
3392 return 0;
3393}
3394/********** Helpers for find_busiest_group ************************/
3395/*
3396 * sd_lb_stats - Structure to store the statistics of a sched_domain
3397 * during load balancing.
3398 */
3399struct sd_lb_stats {
3400 struct sched_group *busiest; /* Busiest group in this sd */
3401 struct sched_group *this; /* Local group in this sd */
3402 unsigned long total_load; /* Total load of all groups in sd */
3403 unsigned long total_pwr; /* Total power of all groups in sd */
3404 unsigned long avg_load; /* Average load across all groups in sd */
3405
3406 /** Statistics of this group */
3407 unsigned long this_load;
3408 unsigned long this_load_per_task;
3409 unsigned long this_nr_running;
3410
3411 /* Statistics of the busiest group */
3412 unsigned long max_load;
3413 unsigned long busiest_load_per_task;
3414 unsigned long busiest_nr_running;
3415
3416 int group_imb; /* Is there imbalance in this sd */
3417#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3418 int power_savings_balance; /* Is powersave balance needed for this sd */
3419 struct sched_group *group_min; /* Least loaded group in sd */
3420 struct sched_group *group_leader; /* Group which relieves group_min */
3421 unsigned long min_load_per_task; /* load_per_task in group_min */
3422 unsigned long leader_nr_running; /* Nr running of group_leader */
3423 unsigned long min_nr_running; /* Nr running of group_min */
3424#endif
3425};
3426
3427/*
3428 * sg_lb_stats - stats of a sched_group required for load_balancing
3429 */
3430struct sg_lb_stats {
3431 unsigned long avg_load; /*Avg load across the CPUs of the group */
3432 unsigned long group_load; /* Total load over the CPUs of the group */
3433 unsigned long sum_nr_running; /* Nr tasks running in the group */
3434 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3435 unsigned long group_capacity;
3436 int group_imb; /* Is there an imbalance in the group ? */
3437};
3438
3439/**
3440 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3441 * @group: The group whose first cpu is to be returned.
3442 */
3443static inline unsigned int group_first_cpu(struct sched_group *group)
3444{
3445 return cpumask_first(sched_group_cpus(group));
3446}
3447
3448/**
3449 * get_sd_load_idx - Obtain the load index for a given sched domain.
3450 * @sd: The sched_domain whose load_idx is to be obtained.
3451 * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
3452 */
3453static inline int get_sd_load_idx(struct sched_domain *sd,
3454 enum cpu_idle_type idle)
3455{
3456 int load_idx;
3457
3458 switch (idle) {
3459 case CPU_NOT_IDLE:
3460 load_idx = sd->busy_idx;
3461 break;
3462
3463 case CPU_NEWLY_IDLE:
3464 load_idx = sd->newidle_idx;
3465 break;
3466 default:
3467 load_idx = sd->idle_idx;
3468 break;
3469 }
3470
3471 return load_idx;
3472}
3473
3474
3475#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3476/**
3477 * init_sd_power_savings_stats - Initialize power savings statistics for
3478 * the given sched_domain, during load balancing.
3479 *
3480 * @sd: Sched domain whose power-savings statistics are to be initialized.
3481 * @sds: Variable containing the statistics for sd.
3482 * @idle: Idle status of the CPU at which we're performing load-balancing.
3483 */
3484static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3485 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3486{
3487 /*
3488 * Busy processors will not participate in power savings
3489 * balance.
3490 */
3491 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3492 sds->power_savings_balance = 0;
3493 else {
3494 sds->power_savings_balance = 1;
3495 sds->min_nr_running = ULONG_MAX;
3496 sds->leader_nr_running = 0;
3497 }
3498}
3499
3500/**
3501 * update_sd_power_savings_stats - Update the power saving stats for a
3502 * sched_domain while performing load balancing.
3503 *
3504 * @group: sched_group belonging to the sched_domain under consideration.
3505 * @sds: Variable containing the statistics of the sched_domain
3506 * @local_group: Does group contain the CPU for which we're performing
3507 * load balancing ?
3508 * @sgs: Variable containing the statistics of the group.
3509 */
3510static inline void update_sd_power_savings_stats(struct sched_group *group,
3511 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3512{
3513
3514 if (!sds->power_savings_balance)
3515 return;
3516
3517 /*
3518 * If the local group is idle or completely loaded
3519 * no need to do power savings balance at this domain
3520 */
3521 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3522 !sds->this_nr_running))
3523 sds->power_savings_balance = 0;
3524
3525 /*
3526 * If a group is already running at full capacity or idle,
3527 * don't include that group in power savings calculations
3528 */
3529 if (!sds->power_savings_balance ||
3530 sgs->sum_nr_running >= sgs->group_capacity ||
3531 !sgs->sum_nr_running)
3532 return;
3533
3534 /*
3535 * Calculate the group which has the least non-idle load.
3536 * This is the group from where we need to pick up the load
3537 * for saving power
3538 */
3539 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3540 (sgs->sum_nr_running == sds->min_nr_running &&
3541 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3542 sds->group_min = group;
3543 sds->min_nr_running = sgs->sum_nr_running;
3544 sds->min_load_per_task = sgs->sum_weighted_load /
3545 sgs->sum_nr_running;
3546 }
3547
3548 /*
3549 * Calculate the group which is almost near its
3550 * capacity but still has some space to pick up some load
3551 * from other group and save more power
3552 */
3553 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3554 return;
3555
3556 if (sgs->sum_nr_running > sds->leader_nr_running ||
3557 (sgs->sum_nr_running == sds->leader_nr_running &&
3558 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3559 sds->group_leader = group;
3560 sds->leader_nr_running = sgs->sum_nr_running;
3561 }
3562}
3563
3564/**
3565 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3566 * @sds: Variable containing the statistics of the sched_domain
3567 * under consideration.
3568 * @this_cpu: Cpu at which we're currently performing load-balancing.
3569 * @imbalance: Variable to store the imbalance.
3570 *
3571 * Description:
3572 * Check if we have potential to perform some power-savings balance.
3573 * If yes, set the busiest group to be the least loaded group in the
3574 * sched_domain, so that its CPUs can be put to idle.
3575 *
3576 * Returns 1 if there is potential to perform power-savings balance.
3577 * Else returns 0.
3578 */
3579static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3580 int this_cpu, unsigned long *imbalance)
3581{
3582 if (!sds->power_savings_balance)
3583 return 0;
3584
3585 if (sds->this != sds->group_leader ||
3586 sds->group_leader == sds->group_min)
3587 return 0;
3588
3589 *imbalance = sds->min_load_per_task;
3590 sds->busiest = sds->group_min;
3591
3592 return 1;
3593
3594}
3595#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3596static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3597 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3598{
3599 return;
3600}
3601
3602static inline void update_sd_power_savings_stats(struct sched_group *group,
3603 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3604{
3605 return;
3606}
3607
3608static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3609 int this_cpu, unsigned long *imbalance)
3610{
3611 return 0;
3612}
3613#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3614
3615
3616unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3617{
3618 return SCHED_LOAD_SCALE;
3619}
3620
3621unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3622{
3623 return default_scale_freq_power(sd, cpu);
3624}
3625
3626unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3627{
3628 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3629 unsigned long smt_gain = sd->smt_gain;
3630
3631 smt_gain /= weight;
3632
3633 return smt_gain;
3634}
3635
3636unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3637{
3638 return default_scale_smt_power(sd, cpu);
3639}
3640
3641unsigned long scale_rt_power(int cpu)
3642{
3643 struct rq *rq = cpu_rq(cpu);
3644 u64 total, available;
3645
3646 sched_avg_update(rq);
3647
3648 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3649 available = total - rq->rt_avg;
3650
3651 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3652 total = SCHED_LOAD_SCALE;
3653
3654 total >>= SCHED_LOAD_SHIFT;
3655
3656 return div_u64(available, total);
3657}
3658
3659static void update_cpu_power(struct sched_domain *sd, int cpu)
3660{
3661 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3662 unsigned long power = SCHED_LOAD_SCALE;
3663 struct sched_group *sdg = sd->groups;
3664
3665 if (sched_feat(ARCH_POWER))
3666 power *= arch_scale_freq_power(sd, cpu);
3667 else
3668 power *= default_scale_freq_power(sd, cpu);
3669
3670 power >>= SCHED_LOAD_SHIFT;
3671
3672 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3673 if (sched_feat(ARCH_POWER))
3674 power *= arch_scale_smt_power(sd, cpu);
3675 else
3676 power *= default_scale_smt_power(sd, cpu);
3677
3678 power >>= SCHED_LOAD_SHIFT;
3679 }
3680
3681 power *= scale_rt_power(cpu);
3682 power >>= SCHED_LOAD_SHIFT;
3683
3684 if (!power)
3685 power = 1;
3686
3687 sdg->cpu_power = power;
3688}
3689
3690static void update_group_power(struct sched_domain *sd, int cpu)
3691{
3692 struct sched_domain *child = sd->child;
3693 struct sched_group *group, *sdg = sd->groups;
3694 unsigned long power;
3695
3696 if (!child) {
3697 update_cpu_power(sd, cpu);
3698 return;
3699 }
3700
3701 power = 0;
3702
3703 group = child->groups;
3704 do {
3705 power += group->cpu_power;
3706 group = group->next;
3707 } while (group != child->groups);
3708
3709 sdg->cpu_power = power;
3710}
3711
3712/**
3713 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3714 * @sd: The sched_domain whose statistics are to be updated.
3715 * @group: sched_group whose statistics are to be updated.
3716 * @this_cpu: Cpu for which load balance is currently performed.
3717 * @idle: Idle status of this_cpu
3718 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3719 * @sd_idle: Idle status of the sched_domain containing group.
3720 * @local_group: Does group contain this_cpu.
3721 * @cpus: Set of cpus considered for load balancing.
3722 * @balance: Should we balance.
3723 * @sgs: variable to hold the statistics for this group.
3724 */
3725static inline void update_sg_lb_stats(struct sched_domain *sd,
3726 struct sched_group *group, int this_cpu,
3727 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3728 int local_group, const struct cpumask *cpus,
3729 int *balance, struct sg_lb_stats *sgs)
3730{
3731 unsigned long load, max_cpu_load, min_cpu_load;
3732 int i;
3733 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3734 unsigned long sum_avg_load_per_task;
3735 unsigned long avg_load_per_task;
3736
3737 if (local_group) {
3738 balance_cpu = group_first_cpu(group);
3739 if (balance_cpu == this_cpu)
3740 update_group_power(sd, this_cpu);
3741 }
3742
3743 /* Tally up the load of all CPUs in the group */
3744 sum_avg_load_per_task = avg_load_per_task = 0;
3745 max_cpu_load = 0;
3746 min_cpu_load = ~0UL;
3747
3748 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3749 struct rq *rq = cpu_rq(i);
3750
3751 if (*sd_idle && rq->nr_running)
3752 *sd_idle = 0;
3753
3754 /* Bias balancing toward cpus of our domain */
3755 if (local_group) {
3756 if (idle_cpu(i) && !first_idle_cpu) {
3757 first_idle_cpu = 1;
3758 balance_cpu = i;
3759 }
3760
3761 load = target_load(i, load_idx);
3762 } else {
3763 load = source_load(i, load_idx);
3764 if (load > max_cpu_load)
3765 max_cpu_load = load;
3766 if (min_cpu_load > load)
3767 min_cpu_load = load;
3768 }
3769
3770 sgs->group_load += load;
3771 sgs->sum_nr_running += rq->nr_running;
3772 sgs->sum_weighted_load += weighted_cpuload(i);
3773
3774 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3775 }
3776
3777 /*
3778 * First idle cpu or the first cpu (busiest) in this sched group
3779 * is eligible for doing load balancing at this and above
3780 * domains. In the newly idle case, we will allow all the cpus
3781 * to do the newly idle load balance.
3782 */
3783 if (idle != CPU_NEWLY_IDLE && local_group &&
3784 balance_cpu != this_cpu && balance) {
3785 *balance = 0;
3786 return;
3787 }
3788
3789 /* Adjust by relative CPU power of the group */
3790 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3791
3792
3793 /*
3794 * Consider the group unbalanced when the imbalance is larger
3795 * than the average weight of two tasks.
3796 *
3797 * APZ: with cgroup the avg task weight can vary wildly and
3798 * might not be a suitable number - should we keep a
3799 * normalized nr_running number somewhere that negates
3800 * the hierarchy?
3801 */
3802 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3803 group->cpu_power;
3804
3805 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3806 sgs->group_imb = 1;
3807
3808 sgs->group_capacity =
3809 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3810}
3811
3812/**
3813 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
3814 * @sd: sched_domain whose statistics are to be updated.
3815 * @this_cpu: Cpu for which load balance is currently performed.
3816 * @idle: Idle status of this_cpu
3817 * @sd_idle: Idle status of the sched_domain containing group.
3818 * @cpus: Set of cpus considered for load balancing.
3819 * @balance: Should we balance.
3820 * @sds: variable to hold the statistics for this sched_domain.
3821 */
3822static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3823 enum cpu_idle_type idle, int *sd_idle,
3824 const struct cpumask *cpus, int *balance,
3825 struct sd_lb_stats *sds)
3826{
3827 struct sched_domain *child = sd->child;
3828 struct sched_group *group = sd->groups;
3829 struct sg_lb_stats sgs;
3830 int load_idx, prefer_sibling = 0;
3831
3832 if (child && child->flags & SD_PREFER_SIBLING)
3833 prefer_sibling = 1;
3834
3835 init_sd_power_savings_stats(sd, sds, idle);
3836 load_idx = get_sd_load_idx(sd, idle);
3837
3838 do {
3839 int local_group;
3840
3841 local_group = cpumask_test_cpu(this_cpu,
3842 sched_group_cpus(group));
3843 memset(&sgs, 0, sizeof(sgs));
3844 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3845 local_group, cpus, balance, &sgs);
3846
3847 if (local_group && balance && !(*balance))
3848 return;
3849
3850 sds->total_load += sgs.group_load;
3851 sds->total_pwr += group->cpu_power;
3852
3853 /*
3854 * In case the child domain prefers tasks go to siblings
3855 * first, lower the group capacity to one so that we'll try
3856 * and move all the excess tasks away.
3857 */
3858 if (prefer_sibling)
3859 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3860
3861 if (local_group) {
3862 sds->this_load = sgs.avg_load;
3863 sds->this = group;
3864 sds->this_nr_running = sgs.sum_nr_running;
3865 sds->this_load_per_task = sgs.sum_weighted_load;
3866 } else if (sgs.avg_load > sds->max_load &&
3867 (sgs.sum_nr_running > sgs.group_capacity ||
3868 sgs.group_imb)) {
3869 sds->max_load = sgs.avg_load;
3870 sds->busiest = group;
3871 sds->busiest_nr_running = sgs.sum_nr_running;
3872 sds->busiest_load_per_task = sgs.sum_weighted_load;
3873 sds->group_imb = sgs.group_imb;
3874 }
3875
3876 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3877 group = group->next;
3878 } while (group != sd->groups);
3879}
3880
3881/**
3882 * fix_small_imbalance - Calculate the minor imbalance that exists
3883 * amongst the groups of a sched_domain, during
3884 * load balancing.
3885 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3886 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3887 * @imbalance: Variable to store the imbalance.
3888 */
3889static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3890 int this_cpu, unsigned long *imbalance)
3891{
3892 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3893 unsigned int imbn = 2;
3894
3895 if (sds->this_nr_running) {
3896 sds->this_load_per_task /= sds->this_nr_running;
3897 if (sds->busiest_load_per_task >
3898 sds->this_load_per_task)
3899 imbn = 1;
3900 } else
3901 sds->this_load_per_task =
3902 cpu_avg_load_per_task(this_cpu);
3903
3904 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3905 sds->busiest_load_per_task * imbn) {
3906 *imbalance = sds->busiest_load_per_task;
3907 return;
3908 }
3909
3910 /*
3911 * OK, we don't have enough imbalance to justify moving tasks,
3912 * however we may be able to increase total CPU power used by
3913 * moving them.
3914 */
3915
3916 pwr_now += sds->busiest->cpu_power *
3917 min(sds->busiest_load_per_task, sds->max_load);
3918 pwr_now += sds->this->cpu_power *
3919 min(sds->this_load_per_task, sds->this_load);
3920 pwr_now /= SCHED_LOAD_SCALE;
3921
3922 /* Amount of load we'd subtract */
3923 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3924 sds->busiest->cpu_power;
3925 if (sds->max_load > tmp)
3926 pwr_move += sds->busiest->cpu_power *
3927 min(sds->busiest_load_per_task, sds->max_load - tmp);
3928
3929 /* Amount of load we'd add */
3930 if (sds->max_load * sds->busiest->cpu_power <
3931 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3932 tmp = (sds->max_load * sds->busiest->cpu_power) /
3933 sds->this->cpu_power;
3934 else
3935 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3936 sds->this->cpu_power;
3937 pwr_move += sds->this->cpu_power *
3938 min(sds->this_load_per_task, sds->this_load + tmp);
3939 pwr_move /= SCHED_LOAD_SCALE;
3940
3941 /* Move if we gain throughput */
3942 if (pwr_move > pwr_now)
3943 *imbalance = sds->busiest_load_per_task;
3944}
3945
3946/**
3947 * calculate_imbalance - Calculate the amount of imbalance present within the
3948 * groups of a given sched_domain during load balance.
3949 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3950 * @this_cpu: Cpu for which load balance is currently being performed.
3951 * @imbalance: The variable to store the imbalance.
3952 */
3953static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3954 unsigned long *imbalance)
3955{
3956 unsigned long max_pull;
3957 /*
3958 * In the presence of smp nice balancing, certain scenarios can have
3959 * max load less than avg load(as we skip the groups at or below
3960 * its cpu_power, while calculating max_load..)
3961 */
3962 if (sds->max_load < sds->avg_load) {
3963 *imbalance = 0;
3964 return fix_small_imbalance(sds, this_cpu, imbalance);
3965 }
3966
3967 /* Don't want to pull so many tasks that a group would go idle */
3968 max_pull = min(sds->max_load - sds->avg_load,
3969 sds->max_load - sds->busiest_load_per_task);
3970
3971 /* How much load to actually move to equalise the imbalance */
3972 *imbalance = min(max_pull * sds->busiest->cpu_power,
3973 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3974 / SCHED_LOAD_SCALE;
3975
3976 /*
3977 * if *imbalance is less than the average load per runnable task
3978 * there is no gaurantee that any tasks will be moved so we'll have
3979 * a think about bumping its value to force at least one task to be
3980 * moved
3981 */
3982 if (*imbalance < sds->busiest_load_per_task)
3983 return fix_small_imbalance(sds, this_cpu, imbalance);
3984
3985}
3986/******* find_busiest_group() helpers end here *********************/
3987
3988/**
3989 * find_busiest_group - Returns the busiest group within the sched_domain
3990 * if there is an imbalance. If there isn't an imbalance, and
3991 * the user has opted for power-savings, it returns a group whose
3992 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3993 * such a group exists.
3994 *
3995 * Also calculates the amount of weighted load which should be moved
3996 * to restore balance.
3997 *
3998 * @sd: The sched_domain whose busiest group is to be returned.
3999 * @this_cpu: The cpu for which load balancing is currently being performed.
4000 * @imbalance: Variable which stores amount of weighted load which should
4001 * be moved to restore balance/put a group to idle.
4002 * @idle: The idle status of this_cpu.
4003 * @sd_idle: The idleness of sd
4004 * @cpus: The set of CPUs under consideration for load-balancing.
4005 * @balance: Pointer to a variable indicating if this_cpu
4006 * is the appropriate cpu to perform load balancing at this_level.
4007 *
4008 * Returns: - the busiest group if imbalance exists.
4009 * - If no imbalance and user has opted for power-savings balance,
4010 * return the least loaded group whose CPUs can be
4011 * put to idle by rebalancing its tasks onto our group.
4012 */
4013static struct sched_group *
4014find_busiest_group(struct sched_domain *sd, int this_cpu,
4015 unsigned long *imbalance, enum cpu_idle_type idle,
4016 int *sd_idle, const struct cpumask *cpus, int *balance)
4017{
4018 struct sd_lb_stats sds;
4019
4020 memset(&sds, 0, sizeof(sds));
4021
4022 /*
4023 * Compute the various statistics relavent for load balancing at
4024 * this level.
4025 */
4026 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4027 balance, &sds);
4028
4029 /* Cases where imbalance does not exist from POV of this_cpu */
4030 /* 1) this_cpu is not the appropriate cpu to perform load balancing
4031 * at this level.
4032 * 2) There is no busy sibling group to pull from.
4033 * 3) This group is the busiest group.
4034 * 4) This group is more busy than the avg busieness at this
4035 * sched_domain.
4036 * 5) The imbalance is within the specified limit.
4037 * 6) Any rebalance would lead to ping-pong
4038 */
4039 if (balance && !(*balance))
4040 goto ret;
4041
4042 if (!sds.busiest || sds.busiest_nr_running == 0)
4043 goto out_balanced;
4044
4045 if (sds.this_load >= sds.max_load)
4046 goto out_balanced;
4047
4048 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4049
4050 if (sds.this_load >= sds.avg_load)
4051 goto out_balanced;
4052
4053 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4054 goto out_balanced;
4055
4056 sds.busiest_load_per_task /= sds.busiest_nr_running;
4057 if (sds.group_imb)
4058 sds.busiest_load_per_task =
4059 min(sds.busiest_load_per_task, sds.avg_load);
4060
4061 /*
4062 * We're trying to get all the cpus to the average_load, so we don't
4063 * want to push ourselves above the average load, nor do we wish to
4064 * reduce the max loaded cpu below the average load, as either of these
4065 * actions would just result in more rebalancing later, and ping-pong
4066 * tasks around. Thus we look for the minimum possible imbalance.
4067 * Negative imbalances (*we* are more loaded than anyone else) will
4068 * be counted as no imbalance for these purposes -- we can't fix that
4069 * by pulling tasks to us. Be careful of negative numbers as they'll
4070 * appear as very large values with unsigned longs.
4071 */
4072 if (sds.max_load <= sds.busiest_load_per_task)
4073 goto out_balanced;
4074
4075 /* Looks like there is an imbalance. Compute it */
4076 calculate_imbalance(&sds, this_cpu, imbalance);
4077 return sds.busiest;
4078
4079out_balanced:
4080 /*
4081 * There is no obvious imbalance. But check if we can do some balancing
4082 * to save power.
4083 */
4084 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4085 return sds.busiest;
4086ret:
4087 *imbalance = 0;
4088 return NULL;
4089}
4090
4091/*
4092 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4093 */
4094static struct rq *
4095find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4096 unsigned long imbalance, const struct cpumask *cpus)
4097{
4098 struct rq *busiest = NULL, *rq;
4099 unsigned long max_load = 0;
4100 int i;
4101
4102 for_each_cpu(i, sched_group_cpus(group)) {
4103 unsigned long power = power_of(i);
4104 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4105 unsigned long wl;
4106
4107 if (!cpumask_test_cpu(i, cpus))
4108 continue;
4109
4110 rq = cpu_rq(i);
4111 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4112 wl /= power;
4113
4114 if (capacity && rq->nr_running == 1 && wl > imbalance)
4115 continue;
4116
4117 if (wl > max_load) {
4118 max_load = wl;
4119 busiest = rq;
4120 }
4121 }
4122
4123 return busiest;
4124}
4125
4126/*
4127 * Max backoff if we encounter pinned tasks. The value is fairly arbitrary;
4128 * it only needs to be large enough.
4129 */
4130#define MAX_PINNED_INTERVAL 512
4131
4132/* Working cpumask for load_balance and load_balance_newidle. */
4133static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4134
4135/*
4136 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4137 * tasks if there is an imbalance.
4138 */
4139static int load_balance(int this_cpu, struct rq *this_rq,
4140 struct sched_domain *sd, enum cpu_idle_type idle,
4141 int *balance)
4142{
4143 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4144 struct sched_group *group;
4145 unsigned long imbalance;
4146 struct rq *busiest;
4147 unsigned long flags;
4148 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4149
4150 cpumask_copy(cpus, cpu_active_mask);
4151
4152 /*
4153 * When power savings policy is enabled for the parent domain, idle
4154 * sibling can pick up load irrespective of busy siblings. In this case,
4155 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4156 * portraying it as CPU_NOT_IDLE.
4157 */
4158 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4159 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4160 sd_idle = 1;
4161
4162 schedstat_inc(sd, lb_count[idle]);
4163
4164redo:
4165 update_shares(sd);
4166 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4167 cpus, balance);
4168
4169 if (*balance == 0)
4170 goto out_balanced;
4171
4172 if (!group) {
4173 schedstat_inc(sd, lb_nobusyg[idle]);
4174 goto out_balanced;
4175 }
4176
4177 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4178 if (!busiest) {
4179 schedstat_inc(sd, lb_nobusyq[idle]);
4180 goto out_balanced;
4181 }
4182
4183 BUG_ON(busiest == this_rq);
4184
4185 schedstat_add(sd, lb_imbalance[idle], imbalance);
4186
4187 ld_moved = 0;
4188 if (busiest->nr_running > 1) {
4189 /*
4190 * Attempt to move tasks. If find_busiest_group has found
4191 * an imbalance but busiest->nr_running <= 1, the group is
4192 * still unbalanced. ld_moved simply stays zero, so it is
4193 * correctly treated as an imbalance.
4194 */
4195 local_irq_save(flags);
4196 double_rq_lock(this_rq, busiest);
4197 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4198 imbalance, sd, idle, &all_pinned);
4199 double_rq_unlock(this_rq, busiest);
4200 local_irq_restore(flags);
4201
4202 /*
4203 * some other cpu did the load balance for us.
4204 */
4205 if (ld_moved && this_cpu != smp_processor_id())
4206 resched_cpu(this_cpu);
4207
4208 /* All tasks on this runqueue were pinned by CPU affinity */
4209 if (unlikely(all_pinned)) {
4210 cpumask_clear_cpu(cpu_of(busiest), cpus);
4211 if (!cpumask_empty(cpus))
4212 goto redo;
4213 goto out_balanced;
4214 }
4215 }
4216
4217 if (!ld_moved) {
4218 schedstat_inc(sd, lb_failed[idle]);
4219 sd->nr_balance_failed++;
4220
4221 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4222
4223 raw_spin_lock_irqsave(&busiest->lock, flags);
4224
4225 /* don't kick the migration_thread, if the curr
4226 * task on busiest cpu can't be moved to this_cpu
4227 */
4228 if (!cpumask_test_cpu(this_cpu,
4229 &busiest->curr->cpus_allowed)) {
4230 raw_spin_unlock_irqrestore(&busiest->lock,
4231 flags);
4232 all_pinned = 1;
4233 goto out_one_pinned;
4234 }
4235
4236 if (!busiest->active_balance) {
4237 busiest->active_balance = 1;
4238 busiest->push_cpu = this_cpu;
4239 active_balance = 1;
4240 }
4241 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4242 if (active_balance)
4243 wake_up_process(busiest->migration_thread);
4244
4245 /*
4246 * We've kicked active balancing, reset the failure
4247 * counter.
4248 */
4249 sd->nr_balance_failed = sd->cache_nice_tries+1;
4250 }
4251 } else
4252 sd->nr_balance_failed = 0;
4253
4254 if (likely(!active_balance)) {
4255 /* We were unbalanced, so reset the balancing interval */
4256 sd->balance_interval = sd->min_interval;
4257 } else {
4258 /*
4259 * If we've begun active balancing, start to back off. This
4260 * case may not be covered by the all_pinned logic if there
4261 * is only 1 task on the busy runqueue (because we don't call
4262 * move_tasks).
4263 */
4264 if (sd->balance_interval < sd->max_interval)
4265 sd->balance_interval *= 2;
4266 }
4267
4268 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4269 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4270 ld_moved = -1;
4271
4272 goto out;
4273
4274out_balanced:
4275 schedstat_inc(sd, lb_balanced[idle]);
4276
4277 sd->nr_balance_failed = 0;
4278
4279out_one_pinned:
4280 /* tune up the balancing interval */
4281 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4282 (sd->balance_interval < sd->max_interval))
4283 sd->balance_interval *= 2;
4284
4285 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4286 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4287 ld_moved = -1;
4288 else
4289 ld_moved = 0;
4290out:
4291 if (ld_moved)
4292 update_shares(sd);
4293 return ld_moved;
4294}
4295
4296/*
4297 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4298 * tasks if there is an imbalance.
4299 *
4300 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4301 * this_rq is locked.
4302 */
4303static int
4304load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4305{
4306 struct sched_group *group;
4307 struct rq *busiest = NULL;
4308 unsigned long imbalance;
4309 int ld_moved = 0;
4310 int sd_idle = 0;
4311 int all_pinned = 0;
4312 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4313
4314 cpumask_copy(cpus, cpu_active_mask);
4315
4316 /*
4317 * When power savings policy is enabled for the parent domain, idle
4318 * sibling can pick up load irrespective of busy siblings. In this case,
4319 * let the state of idle sibling percolate up as IDLE, instead of
4320 * portraying it as CPU_NOT_IDLE.
4321 */
4322 if (sd->flags & SD_SHARE_CPUPOWER &&
4323 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4324 sd_idle = 1;
4325
4326 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4327redo:
4328 update_shares_locked(this_rq, sd);
4329 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4330 &sd_idle, cpus, NULL);
4331 if (!group) {
4332 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4333 goto out_balanced;
4334 }
4335
4336 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4337 if (!busiest) {
4338 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4339 goto out_balanced;
4340 }
4341
4342 BUG_ON(busiest == this_rq);
4343
4344 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4345
4346 ld_moved = 0;
4347 if (busiest->nr_running > 1) {
4348 /* Attempt to move tasks */
4349 double_lock_balance(this_rq, busiest);
4350 /* this_rq->clock is already updated */
4351 update_rq_clock(busiest);
4352 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4353 imbalance, sd, CPU_NEWLY_IDLE,
4354 &all_pinned);
4355 double_unlock_balance(this_rq, busiest);
4356
4357 if (unlikely(all_pinned)) {
4358 cpumask_clear_cpu(cpu_of(busiest), cpus);
4359 if (!cpumask_empty(cpus))
4360 goto redo;
4361 }
4362 }
4363
4364 if (!ld_moved) {
4365 int active_balance = 0;
4366
4367 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4368 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4369 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4370 return -1;
4371
4372 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4373 return -1;
4374
4375 if (sd->nr_balance_failed++ < 2)
4376 return -1;
4377
4378 /*
4379 * The only task running in a non-idle cpu can be moved to this
4380 * cpu in an attempt to completely freeup the other CPU
4381 * package. The same method used to move task in load_balance()
4382 * have been extended for load_balance_newidle() to speedup
4383 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
4384 *
4385 * The package power saving logic comes from
4386 * find_busiest_group(). If there are no imbalance, then
4387 * f_b_g() will return NULL. However when sched_mc={1,2} then
4388 * f_b_g() will select a group from which a running task may be
4389 * pulled to this cpu in order to make the other package idle.
4390 * If there is no opportunity to make a package idle and if
4391 * there are no imbalance, then f_b_g() will return NULL and no
4392 * action will be taken in load_balance_newidle().
4393 *
4394 * Under normal task pull operation due to imbalance, there
4395 * will be more than one task in the source run queue and
4396 * move_tasks() will succeed. ld_moved will be true and this
4397 * active balance code will not be triggered.
4398 */
4399
4400 /* Lock busiest in correct order while this_rq is held */
4401 double_lock_balance(this_rq, busiest);
4402
4403 /*
4404 * don't kick the migration_thread, if the curr
4405 * task on busiest cpu can't be moved to this_cpu
4406 */
4407 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4408 double_unlock_balance(this_rq, busiest);
4409 all_pinned = 1;
4410 return ld_moved;
4411 }
4412
4413 if (!busiest->active_balance) {
4414 busiest->active_balance = 1;
4415 busiest->push_cpu = this_cpu;
4416 active_balance = 1;
4417 }
4418
4419 double_unlock_balance(this_rq, busiest);
4420 /*
4421 * Should not call ttwu while holding a rq->lock
4422 */
4423 raw_spin_unlock(&this_rq->lock);
4424 if (active_balance)
4425 wake_up_process(busiest->migration_thread);
4426 raw_spin_lock(&this_rq->lock);
4427
4428 } else
4429 sd->nr_balance_failed = 0;
4430
4431 update_shares_locked(this_rq, sd);
4432 return ld_moved;
4433
4434out_balanced:
4435 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4436 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4437 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4438 return -1;
4439 sd->nr_balance_failed = 0;
4440
4441 return 0;
4442}
4443
4444/*
4445 * idle_balance is called by schedule() if this_cpu is about to become
4446 * idle. Attempts to pull tasks from other CPUs.
4447 */
4448static void idle_balance(int this_cpu, struct rq *this_rq)
4449{
4450 struct sched_domain *sd;
4451 int pulled_task = 0;
4452 unsigned long next_balance = jiffies + HZ;
4453
4454 this_rq->idle_stamp = this_rq->clock;
4455
4456 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4457 return;
4458
4459 for_each_domain(this_cpu, sd) {
4460 unsigned long interval;
4461
4462 if (!(sd->flags & SD_LOAD_BALANCE))
4463 continue;
4464
4465 if (sd->flags & SD_BALANCE_NEWIDLE)
4466 /* If we've pulled tasks over stop searching: */
4467 pulled_task = load_balance_newidle(this_cpu, this_rq,
4468 sd);
4469
4470 interval = msecs_to_jiffies(sd->balance_interval);
4471 if (time_after(next_balance, sd->last_balance + interval))
4472 next_balance = sd->last_balance + interval;
4473 if (pulled_task) {
4474 this_rq->idle_stamp = 0;
4475 break;
4476 }
4477 }
4478 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4479 /*
4480 * We are going idle. next_balance may be set based on
4481 * a busy processor. So reset next_balance.
4482 */
4483 this_rq->next_balance = next_balance;
4484 }
4485}
4486
4487/*
4488 * active_load_balance is run by migration threads. It pushes running tasks
4489 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4490 * running on each physical CPU where possible, and avoids physical /
4491 * logical imbalances.
4492 *
4493 * Called with busiest_rq locked.
4494 */
4495static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4496{
4497 int target_cpu = busiest_rq->push_cpu;
4498 struct sched_domain *sd;
4499 struct rq *target_rq;
4500
4501 /* Is there any task to move? */
4502 if (busiest_rq->nr_running <= 1)
4503 return;
4504
4505 target_rq = cpu_rq(target_cpu);
4506
4507 /*
4508 * This condition is "impossible", if it occurs
4509 * we need to fix it. Originally reported by
4510 * Bjorn Helgaas on a 128-cpu setup.
4511 */
4512 BUG_ON(busiest_rq == target_rq);
4513
4514 /* move a task from busiest_rq to target_rq */
4515 double_lock_balance(busiest_rq, target_rq);
4516 update_rq_clock(busiest_rq);
4517 update_rq_clock(target_rq);
4518
4519 /* Search for an sd spanning us and the target CPU. */
4520 for_each_domain(target_cpu, sd) {
4521 if ((sd->flags & SD_LOAD_BALANCE) &&
4522 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4523 break;
4524 }
4525
4526 if (likely(sd)) {
4527 schedstat_inc(sd, alb_count);
4528
4529 if (move_one_task(target_rq, target_cpu, busiest_rq,
4530 sd, CPU_IDLE))
4531 schedstat_inc(sd, alb_pushed);
4532 else
4533 schedstat_inc(sd, alb_failed);
4534 }
4535 double_unlock_balance(busiest_rq, target_rq);
4536}
4537
4538#ifdef CONFIG_NO_HZ
4539static struct {
4540 atomic_t load_balancer;
4541 cpumask_var_t cpu_mask;
4542 cpumask_var_t ilb_grp_nohz_mask;
4543} nohz ____cacheline_aligned = {
4544 .load_balancer = ATOMIC_INIT(-1),
4545};
4546
4547int get_nohz_load_balancer(void)
4548{
4549 return atomic_read(&nohz.load_balancer);
4550}
4551
4552#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4553/**
4554 * lowest_flag_domain - Return lowest sched_domain containing flag.
4555 * @cpu: The cpu whose lowest level of sched domain is to
4556 * be returned.
4557 * @flag: The flag to check for the lowest sched_domain
4558 * for the given cpu.
4559 *
4560 * Returns the lowest sched_domain of a cpu which contains the given flag.
4561 */
4562static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4563{
4564 struct sched_domain *sd;
4565
4566 for_each_domain(cpu, sd)
4567 if (sd && (sd->flags & flag))
4568 break;
4569
4570 return sd;
4571}
4572
/**
 * for_each_flag_domain - Iterates over sched_domains containing the flag.
 * @cpu:	The cpu whose domains we're iterating over.
 * @sd:		variable holding the value of the power_savings_sd
 *		for cpu.
 * @flag:	The flag to filter the sched_domains to be iterated.
 *
 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
 * set, starting from the lowest sched_domain to the highest. Iteration stops
 * at the first parent that does not have the flag set.
 *
 * Every macro argument is parenthesized in the expansion so that a compound
 * @flag expression (for example "A | B") is tested as a whole instead of
 * being split by operator precedence. @flag and @sd are evaluated more than
 * once, so they must not have side effects.
 */
#define for_each_flag_domain(cpu, sd, flag) \
	for ((sd) = lowest_flag_domain((cpu), (flag)); \
	     ((sd) && ((sd)->flags & (flag))); (sd) = (sd)->parent)
4586
4587/**
4588 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4589 * @ilb_group: group to be checked for semi-idleness
4590 *
4591 * Returns: 1 if the group is semi-idle. 0 otherwise.
4592 *
4593 * We define a sched_group to be semi idle if it has atleast one idle-CPU
4594 * and atleast one non-idle CPU. This helper function checks if the given
4595 * sched_group is semi-idle or not.
4596 */
4597static inline int is_semi_idle_group(struct sched_group *ilb_group)
4598{
4599 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4600 sched_group_cpus(ilb_group));
4601
4602 /*
4603 * A sched_group is semi-idle when it has atleast one busy cpu
4604 * and atleast one idle cpu.
4605 */
4606 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4607 return 0;
4608
4609 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4610 return 0;
4611
4612 return 1;
4613}
4614/**
4615 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4616 * @cpu: The cpu which is nominating a new idle_load_balancer.
4617 *
4618 * Returns: Returns the id of the idle load balancer if it exists,
4619 * Else, returns >= nr_cpu_ids.
4620 *
4621 * This algorithm picks the idle load balancer such that it belongs to a
4622 * semi-idle powersavings sched_domain. The idea is to try and avoid
4623 * completely idle packages/cores just for the purpose of idle load balancing
4624 * when there are other idle cpu's which are better suited for that job.
4625 */
4626static int find_new_ilb(int cpu)
4627{
4628 struct sched_domain *sd;
4629 struct sched_group *ilb_group;
4630
4631 /*
4632 * Have idle load balancer selection from semi-idle packages only
4633 * when power-aware load balancing is enabled
4634 */
4635 if (!(sched_smt_power_savings || sched_mc_power_savings))
4636 goto out_done;
4637
4638 /*
4639 * Optimize for the case when we have no idle CPUs or only one
4640 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4641 */
4642 if (cpumask_weight(nohz.cpu_mask) < 2)
4643 goto out_done;
4644
4645 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4646 ilb_group = sd->groups;
4647
4648 do {
4649 if (is_semi_idle_group(ilb_group))
4650 return cpumask_first(nohz.ilb_grp_nohz_mask);
4651
4652 ilb_group = ilb_group->next;
4653
4654 } while (ilb_group != sd->groups);
4655 }
4656
4657out_done:
4658 return cpumask_first(nohz.cpu_mask);
4659}
4660#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4661static inline int find_new_ilb(int call_cpu)
4662{
4663 return cpumask_first(nohz.cpu_mask);
4664}
4665#endif
4666
4667/*
4668 * This routine will try to nominate the ilb (idle load balancing)
4669 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4670 * load balancing on behalf of all those cpus. If all the cpus in the system
4671 * go into this tickless mode, then there will be no ilb owner (as there is
4672 * no need for one) and all the cpus will sleep till the next wakeup event
4673 * arrives...
4674 *
4675 * For the ilb owner, tick is not stopped. And this tick will be used
4676 * for idle load balancing. ilb owner will still be part of
4677 * nohz.cpu_mask..
4678 *
4679 * While stopping the tick, this cpu will become the ilb owner if there
4680 * is no other owner. And will be the owner till that cpu becomes busy
4681 * or if all cpus in the system stop their ticks at which point
4682 * there is no need for ilb owner.
4683 *
4684 * When the ilb owner becomes busy, it nominates another owner, during the
4685 * next busy scheduler_tick()
4686 */
4687int select_nohz_load_balancer(int stop_tick)
4688{
4689 int cpu = smp_processor_id();
4690
4691 if (stop_tick) {
4692 cpu_rq(cpu)->in_nohz_recently = 1;
4693
4694 if (!cpu_active(cpu)) {
4695 if (atomic_read(&nohz.load_balancer) != cpu)
4696 return 0;
4697
4698 /*
4699 * If we are going offline and still the leader,
4700 * give up!
4701 */
4702 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4703 BUG();
4704
4705 return 0;
4706 }
4707
4708 cpumask_set_cpu(cpu, nohz.cpu_mask);
4709
4710 /* time for ilb owner also to sleep */
4711 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4712 if (atomic_read(&nohz.load_balancer) == cpu)
4713 atomic_set(&nohz.load_balancer, -1);
4714 return 0;
4715 }
4716
4717 if (atomic_read(&nohz.load_balancer) == -1) {
4718 /* make me the ilb owner */
4719 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4720 return 1;
4721 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4722 int new_ilb;
4723
4724 if (!(sched_smt_power_savings ||
4725 sched_mc_power_savings))
4726 return 1;
4727 /*
4728 * Check to see if there is a more power-efficient
4729 * ilb.
4730 */
4731 new_ilb = find_new_ilb(cpu);
4732 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4733 atomic_set(&nohz.load_balancer, -1);
4734 resched_cpu(new_ilb);
4735 return 0;
4736 }
4737 return 1;
4738 }
4739 } else {
4740 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4741 return 0;
4742
4743 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4744
4745 if (atomic_read(&nohz.load_balancer) == cpu)
4746 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4747 BUG();
4748 }
4749 return 0;
4750}
4751#endif
4752
4753static DEFINE_SPINLOCK(balancing);
4754
4755/*
4756 * It checks each scheduling domain to see if it is due to be balanced,
4757 * and initiates a balancing operation if so.
4758 *
4759 * Balancing parameters are set up in arch_init_sched_domains.
4760 */
4761static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4762{
4763 int balance = 1;
4764 struct rq *rq = cpu_rq(cpu);
4765 unsigned long interval;
4766 struct sched_domain *sd;
4767 /* Earliest time when we have to do rebalance again */
4768 unsigned long next_balance = jiffies + 60*HZ;
4769 int update_next_balance = 0;
4770 int need_serialize;
4771
4772 for_each_domain(cpu, sd) {
4773 if (!(sd->flags & SD_LOAD_BALANCE))
4774 continue;
4775
4776 interval = sd->balance_interval;
4777 if (idle != CPU_IDLE)
4778 interval *= sd->busy_factor;
4779
4780 /* scale ms to jiffies */
4781 interval = msecs_to_jiffies(interval);
4782 if (unlikely(!interval))
4783 interval = 1;
4784 if (interval > HZ*NR_CPUS/10)
4785 interval = HZ*NR_CPUS/10;
4786
4787 need_serialize = sd->flags & SD_SERIALIZE;
4788
4789 if (need_serialize) {
4790 if (!spin_trylock(&balancing))
4791 goto out;
4792 }
4793
4794 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4795 if (load_balance(cpu, rq, sd, idle, &balance)) {
4796 /*
4797 * We've pulled tasks over so either we're no
4798 * longer idle, or one of our SMT siblings is
4799 * not idle.
4800 */
4801 idle = CPU_NOT_IDLE;
4802 }
4803 sd->last_balance = jiffies;
4804 }
4805 if (need_serialize)
4806 spin_unlock(&balancing);
4807out:
4808 if (time_after(next_balance, sd->last_balance + interval)) {
4809 next_balance = sd->last_balance + interval;
4810 update_next_balance = 1;
4811 }
4812
4813 /*
4814 * Stop the load balance at this level. There is another
4815 * CPU in our sched group which is doing load balancing more
4816 * actively.
4817 */
4818 if (!balance)
4819 break;
4820 }
4821
4822 /*
4823 * next_balance will be updated only when there is a need.
4824 * When the cpu is attached to null domain for ex, it will not be
4825 * updated.
4826 */
4827 if (likely(update_next_balance))
4828 rq->next_balance = next_balance;
4829}
4830
4831/*
4832 * run_rebalance_domains is triggered when needed from the scheduler tick.
4833 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4834 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4835 */
4836static void run_rebalance_domains(struct softirq_action *h)
4837{
4838 int this_cpu = smp_processor_id();
4839 struct rq *this_rq = cpu_rq(this_cpu);
4840 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4841 CPU_IDLE : CPU_NOT_IDLE;
4842
4843 rebalance_domains(this_cpu, idle);
4844
4845#ifdef CONFIG_NO_HZ
4846 /*
4847 * If this cpu is the owner for idle load balancing, then do the
4848 * balancing on behalf of the other idle cpus whose ticks are
4849 * stopped.
4850 */
4851 if (this_rq->idle_at_tick &&
4852 atomic_read(&nohz.load_balancer) == this_cpu) {
4853 struct rq *rq;
4854 int balance_cpu;
4855
4856 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4857 if (balance_cpu == this_cpu)
4858 continue;
4859
4860 /*
4861 * If this cpu gets work to do, stop the load balancing
4862 * work being done for other cpus. Next load
4863 * balancing owner will pick it up.
4864 */
4865 if (need_resched())
4866 break;
4867
4868 rebalance_domains(balance_cpu, CPU_IDLE);
4869
4870 rq = cpu_rq(balance_cpu);
4871 if (time_after(this_rq->next_balance, rq->next_balance))
4872 this_rq->next_balance = rq->next_balance;
4873 }
4874 }
4875#endif
4876}
4877
4878static inline int on_null_domain(int cpu)
4879{
4880 return !rcu_dereference(cpu_rq(cpu)->sd);
4881}
4882
4883/*
4884 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4885 *
4886 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4887 * idle load balancing owner or decide to stop the periodic load balancing,
4888 * if the whole system is idle.
4889 */
4890static inline void trigger_load_balance(struct rq *rq, int cpu)
4891{
4892#ifdef CONFIG_NO_HZ
4893 /*
4894 * If we were in the nohz mode recently and busy at the current
4895 * scheduler tick, then check if we need to nominate new idle
4896 * load balancer.
4897 */
4898 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4899 rq->in_nohz_recently = 0;
4900
4901 if (atomic_read(&nohz.load_balancer) == cpu) {
4902 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4903 atomic_set(&nohz.load_balancer, -1);
4904 }
4905
4906 if (atomic_read(&nohz.load_balancer) == -1) {
4907 int ilb = find_new_ilb(cpu);
4908
4909 if (ilb < nr_cpu_ids)
4910 resched_cpu(ilb);
4911 }
4912 }
4913
4914 /*
4915 * If this cpu is idle and doing idle load balancing for all the
4916 * cpus with ticks stopped, is it time for that to stop?
4917 */
4918 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4919 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4920 resched_cpu(cpu);
4921 return;
4922 }
4923
4924 /*
4925 * If this cpu is idle and the idle load balancing is done by
4926 * someone else, then no need raise the SCHED_SOFTIRQ
4927 */
4928 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4929 cpumask_test_cpu(cpu, nohz.cpu_mask))
4930 return;
4931#endif
4932 /* Don't need to rebalance while attached to NULL domain */
4933 if (time_after_eq(jiffies, rq->next_balance) &&
4934 likely(!on_null_domain(cpu)))
4935 raise_softirq(SCHED_SOFTIRQ);
4936}
4937
4938#else /* CONFIG_SMP */
4939
/*
 * On UP there are no other CPUs to balance against, so going idle
 * requires no balancing work: this is an intentionally empty stub.
 */
static inline void idle_balance(int cpu, struct rq *rq)
{
}
4946
4947#endif 3164#endif
4948 3165
4949DEFINE_PER_CPU(struct kernel_stat, kstat); 3166DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -5298,7 +3515,7 @@ void scheduler_tick(void)
5298 curr->sched_class->task_tick(rq, curr, 0); 3515 curr->sched_class->task_tick(rq, curr, 0);
5299 raw_spin_unlock(&rq->lock); 3516 raw_spin_unlock(&rq->lock);
5300 3517
5301 perf_event_task_tick(curr, cpu); 3518 perf_event_task_tick(curr);
5302 3519
5303#ifdef CONFIG_SMP 3520#ifdef CONFIG_SMP
5304 rq->idle_at_tick = idle_cpu(cpu); 3521 rq->idle_at_tick = idle_cpu(cpu);
@@ -5512,7 +3729,7 @@ need_resched_nonpreemptible:
5512 3729
5513 if (likely(prev != next)) { 3730 if (likely(prev != next)) {
5514 sched_info_switch(prev, next); 3731 sched_info_switch(prev, next);
5515 perf_event_task_sched_out(prev, next, cpu); 3732 perf_event_task_sched_out(prev, next);
5516 3733
5517 rq->nr_switches++; 3734 rq->nr_switches++;
5518 rq->curr = next; 3735 rq->curr = next;
@@ -5530,8 +3747,11 @@ need_resched_nonpreemptible:
5530 3747
5531 post_schedule(rq); 3748 post_schedule(rq);
5532 3749
5533 if (unlikely(reacquire_kernel_lock(current) < 0)) 3750 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3751 prev = rq->curr;
3752 switch_count = &prev->nivcsw;
5534 goto need_resched_nonpreemptible; 3753 goto need_resched_nonpreemptible;
3754 }
5535 3755
5536 preempt_enable_no_resched(); 3756 preempt_enable_no_resched();
5537 if (need_resched()) 3757 if (need_resched())
@@ -6040,7 +4260,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6040 unsigned long flags; 4260 unsigned long flags;
6041 int oldprio, on_rq, running; 4261 int oldprio, on_rq, running;
6042 struct rq *rq; 4262 struct rq *rq;
6043 const struct sched_class *prev_class = p->sched_class; 4263 const struct sched_class *prev_class;
6044 4264
6045 BUG_ON(prio < 0 || prio > MAX_PRIO); 4265 BUG_ON(prio < 0 || prio > MAX_PRIO);
6046 4266
@@ -6048,6 +4268,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6048 update_rq_clock(rq); 4268 update_rq_clock(rq);
6049 4269
6050 oldprio = p->prio; 4270 oldprio = p->prio;
4271 prev_class = p->sched_class;
6051 on_rq = p->se.on_rq; 4272 on_rq = p->se.on_rq;
6052 running = task_current(rq, p); 4273 running = task_current(rq, p);
6053 if (on_rq) 4274 if (on_rq)
@@ -6065,7 +4286,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6065 if (running) 4286 if (running)
6066 p->sched_class->set_curr_task(rq); 4287 p->sched_class->set_curr_task(rq);
6067 if (on_rq) { 4288 if (on_rq) {
6068 enqueue_task(rq, p, 0); 4289 enqueue_task(rq, p, 0, oldprio < prio);
6069 4290
6070 check_class_changed(rq, p, prev_class, oldprio, running); 4291 check_class_changed(rq, p, prev_class, oldprio, running);
6071 } 4292 }
@@ -6109,7 +4330,7 @@ void set_user_nice(struct task_struct *p, long nice)
6109 delta = p->prio - old_prio; 4330 delta = p->prio - old_prio;
6110 4331
6111 if (on_rq) { 4332 if (on_rq) {
6112 enqueue_task(rq, p, 0); 4333 enqueue_task(rq, p, 0, false);
6113 /* 4334 /*
6114 * If the task increased its priority or is running and 4335 * If the task increased its priority or is running and
6115 * lowered its priority, then reschedule its CPU: 4336 * lowered its priority, then reschedule its CPU:
@@ -6132,7 +4353,7 @@ int can_nice(const struct task_struct *p, const int nice)
6132 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4353 /* convert nice value [19,-20] to rlimit style value [1,40] */
6133 int nice_rlim = 20 - nice; 4354 int nice_rlim = 20 - nice;
6134 4355
6135 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 4356 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
6136 capable(CAP_SYS_NICE)); 4357 capable(CAP_SYS_NICE));
6137} 4358}
6138 4359
@@ -6267,7 +4488,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6267{ 4488{
6268 int retval, oldprio, oldpolicy = -1, on_rq, running; 4489 int retval, oldprio, oldpolicy = -1, on_rq, running;
6269 unsigned long flags; 4490 unsigned long flags;
6270 const struct sched_class *prev_class = p->sched_class; 4491 const struct sched_class *prev_class;
6271 struct rq *rq; 4492 struct rq *rq;
6272 int reset_on_fork; 4493 int reset_on_fork;
6273 4494
@@ -6309,7 +4530,7 @@ recheck:
6309 4530
6310 if (!lock_task_sighand(p, &flags)) 4531 if (!lock_task_sighand(p, &flags))
6311 return -ESRCH; 4532 return -ESRCH;
6312 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4533 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
6313 unlock_task_sighand(p, &flags); 4534 unlock_task_sighand(p, &flags);
6314 4535
6315 /* can't set/change the rt policy */ 4536 /* can't set/change the rt policy */
@@ -6381,6 +4602,7 @@ recheck:
6381 p->sched_reset_on_fork = reset_on_fork; 4602 p->sched_reset_on_fork = reset_on_fork;
6382 4603
6383 oldprio = p->prio; 4604 oldprio = p->prio;
4605 prev_class = p->sched_class;
6384 __setscheduler(rq, p, policy, param->sched_priority); 4606 __setscheduler(rq, p, policy, param->sched_priority);
6385 4607
6386 if (running) 4608 if (running)
@@ -7131,23 +5353,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7131 struct rq *rq; 5353 struct rq *rq;
7132 int ret = 0; 5354 int ret = 0;
7133 5355
7134 /*
7135 * Since we rely on wake-ups to migrate sleeping tasks, don't change
7136 * the ->cpus_allowed mask from under waking tasks, which would be
7137 * possible when we change rq->lock in ttwu(), so synchronize against
7138 * TASK_WAKING to avoid that.
7139 */
7140again:
7141 while (p->state == TASK_WAKING)
7142 cpu_relax();
7143
7144 rq = task_rq_lock(p, &flags); 5356 rq = task_rq_lock(p, &flags);
7145 5357
7146 if (p->state == TASK_WAKING) {
7147 task_rq_unlock(rq, &flags);
7148 goto again;
7149 }
7150
7151 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5358 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7152 ret = -EINVAL; 5359 ret = -EINVAL;
7153 goto out; 5360 goto out;
@@ -9199,11 +7406,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
9199 7406
9200#ifdef CONFIG_SCHED_MC 7407#ifdef CONFIG_SCHED_MC
9201static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7408static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7409 struct sysdev_class_attribute *attr,
9202 char *page) 7410 char *page)
9203{ 7411{
9204 return sprintf(page, "%u\n", sched_mc_power_savings); 7412 return sprintf(page, "%u\n", sched_mc_power_savings);
9205} 7413}
9206static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7414static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7415 struct sysdev_class_attribute *attr,
9207 const char *buf, size_t count) 7416 const char *buf, size_t count)
9208{ 7417{
9209 return sched_power_savings_store(buf, count, 0); 7418 return sched_power_savings_store(buf, count, 0);
@@ -9215,11 +7424,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
9215 7424
9216#ifdef CONFIG_SCHED_SMT 7425#ifdef CONFIG_SCHED_SMT
9217static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7426static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7427 struct sysdev_class_attribute *attr,
9218 char *page) 7428 char *page)
9219{ 7429{
9220 return sprintf(page, "%u\n", sched_smt_power_savings); 7430 return sprintf(page, "%u\n", sched_smt_power_savings);
9221} 7431}
9222static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7432static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7433 struct sysdev_class_attribute *attr,
9223 const char *buf, size_t count) 7434 const char *buf, size_t count)
9224{ 7435{
9225 return sched_power_savings_store(buf, count, 1); 7436 return sched_power_savings_store(buf, count, 1);
@@ -9434,7 +7645,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9434 tg->rt_rq[cpu] = rt_rq; 7645 tg->rt_rq[cpu] = rt_rq;
9435 init_rt_rq(rt_rq, rq); 7646 init_rt_rq(rt_rq, rq);
9436 rt_rq->tg = tg; 7647 rt_rq->tg = tg;
9437 rt_rq->rt_se = rt_se;
9438 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7648 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9439 if (add) 7649 if (add)
9440 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7650 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9465,9 +7675,6 @@ void __init sched_init(void)
9465#ifdef CONFIG_RT_GROUP_SCHED 7675#ifdef CONFIG_RT_GROUP_SCHED
9466 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7676 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9467#endif 7677#endif
9468#ifdef CONFIG_USER_SCHED
9469 alloc_size *= 2;
9470#endif
9471#ifdef CONFIG_CPUMASK_OFFSTACK 7678#ifdef CONFIG_CPUMASK_OFFSTACK
9472 alloc_size += num_possible_cpus() * cpumask_size(); 7679 alloc_size += num_possible_cpus() * cpumask_size();
9473#endif 7680#endif
@@ -9481,13 +7688,6 @@ void __init sched_init(void)
9481 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7688 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9482 ptr += nr_cpu_ids * sizeof(void **); 7689 ptr += nr_cpu_ids * sizeof(void **);
9483 7690
9484#ifdef CONFIG_USER_SCHED
9485 root_task_group.se = (struct sched_entity **)ptr;
9486 ptr += nr_cpu_ids * sizeof(void **);
9487
9488 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9489 ptr += nr_cpu_ids * sizeof(void **);
9490#endif /* CONFIG_USER_SCHED */
9491#endif /* CONFIG_FAIR_GROUP_SCHED */ 7691#endif /* CONFIG_FAIR_GROUP_SCHED */
9492#ifdef CONFIG_RT_GROUP_SCHED 7692#ifdef CONFIG_RT_GROUP_SCHED
9493 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7693 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9496,13 +7696,6 @@ void __init sched_init(void)
9496 init_task_group.rt_rq = (struct rt_rq **)ptr; 7696 init_task_group.rt_rq = (struct rt_rq **)ptr;
9497 ptr += nr_cpu_ids * sizeof(void **); 7697 ptr += nr_cpu_ids * sizeof(void **);
9498 7698
9499#ifdef CONFIG_USER_SCHED
9500 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9501 ptr += nr_cpu_ids * sizeof(void **);
9502
9503 root_task_group.rt_rq = (struct rt_rq **)ptr;
9504 ptr += nr_cpu_ids * sizeof(void **);
9505#endif /* CONFIG_USER_SCHED */
9506#endif /* CONFIG_RT_GROUP_SCHED */ 7699#endif /* CONFIG_RT_GROUP_SCHED */
9507#ifdef CONFIG_CPUMASK_OFFSTACK 7700#ifdef CONFIG_CPUMASK_OFFSTACK
9508 for_each_possible_cpu(i) { 7701 for_each_possible_cpu(i) {
@@ -9522,22 +7715,13 @@ void __init sched_init(void)
9522#ifdef CONFIG_RT_GROUP_SCHED 7715#ifdef CONFIG_RT_GROUP_SCHED
9523 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7716 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9524 global_rt_period(), global_rt_runtime()); 7717 global_rt_period(), global_rt_runtime());
9525#ifdef CONFIG_USER_SCHED
9526 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9527 global_rt_period(), RUNTIME_INF);
9528#endif /* CONFIG_USER_SCHED */
9529#endif /* CONFIG_RT_GROUP_SCHED */ 7718#endif /* CONFIG_RT_GROUP_SCHED */
9530 7719
9531#ifdef CONFIG_GROUP_SCHED 7720#ifdef CONFIG_CGROUP_SCHED
9532 list_add(&init_task_group.list, &task_groups); 7721 list_add(&init_task_group.list, &task_groups);
9533 INIT_LIST_HEAD(&init_task_group.children); 7722 INIT_LIST_HEAD(&init_task_group.children);
9534 7723
9535#ifdef CONFIG_USER_SCHED 7724#endif /* CONFIG_CGROUP_SCHED */
9536 INIT_LIST_HEAD(&root_task_group.children);
9537 init_task_group.parent = &root_task_group;
9538 list_add(&init_task_group.siblings, &root_task_group.children);
9539#endif /* CONFIG_USER_SCHED */
9540#endif /* CONFIG_GROUP_SCHED */
9541 7725
9542#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7726#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9543 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7727 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9577,25 +7761,6 @@ void __init sched_init(void)
9577 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7761 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9578 */ 7762 */
9579 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7763 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9580#elif defined CONFIG_USER_SCHED
9581 root_task_group.shares = NICE_0_LOAD;
9582 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9583 /*
9584 * In case of task-groups formed thr' the user id of tasks,
9585 * init_task_group represents tasks belonging to root user.
9586 * Hence it forms a sibling of all subsequent groups formed.
9587 * In this case, init_task_group gets only a fraction of overall
9588 * system cpu resource, based on the weight assigned to root
9589 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9590 * by letting tasks of init_task_group sit in a separate cfs_rq
9591 * (init_tg_cfs_rq) and having one entity represent this group of
9592 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9593 */
9594 init_tg_cfs_entry(&init_task_group,
9595 &per_cpu(init_tg_cfs_rq, i),
9596 &per_cpu(init_sched_entity, i), i, 1,
9597 root_task_group.se[i]);
9598
9599#endif 7764#endif
9600#endif /* CONFIG_FAIR_GROUP_SCHED */ 7765#endif /* CONFIG_FAIR_GROUP_SCHED */
9601 7766
@@ -9604,12 +7769,6 @@ void __init sched_init(void)
9604 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7769 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9605#ifdef CONFIG_CGROUP_SCHED 7770#ifdef CONFIG_CGROUP_SCHED
9606 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7771 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9607#elif defined CONFIG_USER_SCHED
9608 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9609 init_tg_rt_entry(&init_task_group,
9610 &per_cpu(init_rt_rq_var, i),
9611 &per_cpu(init_sched_rt_entity, i), i, 1,
9612 root_task_group.rt_se[i]);
9613#endif 7772#endif
9614#endif 7773#endif
9615 7774
@@ -9694,7 +7853,7 @@ static inline int preempt_count_equals(int preempt_offset)
9694 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7853 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9695} 7854}
9696 7855
9697void __might_sleep(char *file, int line, int preempt_offset) 7856void __might_sleep(const char *file, int line, int preempt_offset)
9698{ 7857{
9699#ifdef in_atomic 7858#ifdef in_atomic
9700 static unsigned long prev_jiffy; /* ratelimiting */ 7859 static unsigned long prev_jiffy; /* ratelimiting */
@@ -10005,7 +8164,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
10005} 8164}
10006#endif /* CONFIG_RT_GROUP_SCHED */ 8165#endif /* CONFIG_RT_GROUP_SCHED */
10007 8166
10008#ifdef CONFIG_GROUP_SCHED 8167#ifdef CONFIG_CGROUP_SCHED
10009static void free_sched_group(struct task_group *tg) 8168static void free_sched_group(struct task_group *tg)
10010{ 8169{
10011 free_fair_sched_group(tg); 8170 free_fair_sched_group(tg);
@@ -10110,11 +8269,11 @@ void sched_move_task(struct task_struct *tsk)
10110 if (unlikely(running)) 8269 if (unlikely(running))
10111 tsk->sched_class->set_curr_task(rq); 8270 tsk->sched_class->set_curr_task(rq);
10112 if (on_rq) 8271 if (on_rq)
10113 enqueue_task(rq, tsk, 0); 8272 enqueue_task(rq, tsk, 0, false);
10114 8273
10115 task_rq_unlock(rq, &flags); 8274 task_rq_unlock(rq, &flags);
10116} 8275}
10117#endif /* CONFIG_GROUP_SCHED */ 8276#endif /* CONFIG_CGROUP_SCHED */
10118 8277
10119#ifdef CONFIG_FAIR_GROUP_SCHED 8278#ifdef CONFIG_FAIR_GROUP_SCHED
10120static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8279static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10256,13 +8415,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10256 runtime = d->rt_runtime; 8415 runtime = d->rt_runtime;
10257 } 8416 }
10258 8417
10259#ifdef CONFIG_USER_SCHED
10260 if (tg == &root_task_group) {
10261 period = global_rt_period();
10262 runtime = global_rt_runtime();
10263 }
10264#endif
10265
10266 /* 8418 /*
10267 * Cannot have more runtime than the period. 8419 * Cannot have more runtime than the period.
10268 */ 8420 */
@@ -10665,7 +8817,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
10665struct cpuacct { 8817struct cpuacct {
10666 struct cgroup_subsys_state css; 8818 struct cgroup_subsys_state css;
10667 /* cpuusage holds pointer to a u64-type object on every cpu */ 8819 /* cpuusage holds pointer to a u64-type object on every cpu */
10668 u64 *cpuusage; 8820 u64 __percpu *cpuusage;
10669 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8821 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
10670 struct cpuacct *parent; 8822 struct cpuacct *parent;
10671}; 8823};
@@ -10882,12 +9034,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10882} 9034}
10883 9035
10884/* 9036/*
9037 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9038 * in cputime_t units. As a result, cpuacct_update_stats calls
9039 * percpu_counter_add with values large enough to always overflow the
9040 * per cpu batch limit causing bad SMP scalability.
9041 *
9042 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9043 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9044 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9045 */
9046#ifdef CONFIG_SMP
9047#define CPUACCT_BATCH \
9048 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9049#else
9050#define CPUACCT_BATCH 0
9051#endif
9052
9053/*
10885 * Charge the system/user time to the task's accounting group. 9054 * Charge the system/user time to the task's accounting group.
10886 */ 9055 */
10887static void cpuacct_update_stats(struct task_struct *tsk, 9056static void cpuacct_update_stats(struct task_struct *tsk,
10888 enum cpuacct_stat_index idx, cputime_t val) 9057 enum cpuacct_stat_index idx, cputime_t val)
10889{ 9058{
10890 struct cpuacct *ca; 9059 struct cpuacct *ca;
9060 int batch = CPUACCT_BATCH;
10891 9061
10892 if (unlikely(!cpuacct_subsys.active)) 9062 if (unlikely(!cpuacct_subsys.active))
10893 return; 9063 return;
@@ -10896,7 +9066,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10896 ca = task_ca(tsk); 9066 ca = task_ca(tsk);
10897 9067
10898 do { 9068 do {
10899 percpu_counter_add(&ca->cpustat[idx], val); 9069 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10900 ca = ca->parent; 9070 ca = ca->parent;
10901 } while (ca); 9071 } while (ca);
10902 rcu_read_unlock(); 9072 rcu_read_unlock();
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 597b33099dfa..fccf9fbb0d7b 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,7 @@ static int convert_prio(int prio)
47} 47}
48 48
49#define for_each_cpupri_active(array, idx) \ 49#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ 50 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53 51
54/** 52/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system 53 * cpupri_find - find the best (lowest-pri) CPU in the system
@@ -58,7 +56,7 @@ static int convert_prio(int prio)
58 * @lowest_mask: A mask to fill in with selected CPUs (or NULL) 56 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
59 * 57 *
60 * Note: This function returns the recommended CPUs as calculated during the 58 * Note: This function returns the recommended CPUs as calculated during the
61 * current invokation. By the time the call returns, the CPUs may have in 59 * current invocation. By the time the call returns, the CPUs may have in
62 * fact changed priorities any number of times. While not ideal, it is not 60 * fact changed priorities any number of times. While not ideal, it is not
63 * an issue of correctness since the normal rebalancer logic will correct 61 * an issue of correctness since the normal rebalancer logic will correct
64 * any discrepancies created by racing against the uncertainty of the current 62 * any discrepancies created by racing against the uncertainty of the current
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 42ac3c9f66f6..5a5ea2cd924f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1053,7 +1053,8 @@ static inline void hrtick_update(struct rq *rq)
1053 * increased. Here we update the fair scheduling stats and 1053 * increased. Here we update the fair scheduling stats and
1054 * then put the task into the rbtree: 1054 * then put the task into the rbtree:
1055 */ 1055 */
1056static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 1056static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1057{ 1058{
1058 struct cfs_rq *cfs_rq; 1059 struct cfs_rq *cfs_rq;
1059 struct sched_entity *se = &p->se; 1060 struct sched_entity *se = &p->se;
@@ -1508,7 +1509,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1508 * If there's an idle sibling in this domain, make that 1509 * If there's an idle sibling in this domain, make that
1509 * the wake_affine target instead of the current cpu. 1510 * the wake_affine target instead of the current cpu.
1510 */ 1511 */
1511 if (tmp->flags & SD_PREFER_SIBLING) 1512 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1512 target = select_idle_sibling(p, tmp, target); 1513 target = select_idle_sibling(p, tmp, target);
1513 1514
1514 if (target >= 0) { 1515 if (target >= 0) {
@@ -1815,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1815 */ 1816 */
1816 1817
1817/* 1818/*
1818 * Load-balancing iterator. Note: while the runqueue stays locked 1819 * pull_task - move a task from a remote runqueue to the local runqueue.
1819 * during the whole iteration, the current task might be 1820 * Both runqueues must be locked.
1820 * dequeued so the iterator has to be dequeue-safe. Here we
1821 * achieve that by always pre-iterating before returning
1822 * the current task:
1823 */ 1821 */
1824static struct task_struct * 1822static void pull_task(struct rq *src_rq, struct task_struct *p,
1825__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) 1823 struct rq *this_rq, int this_cpu)
1826{ 1824{
1827 struct task_struct *p = NULL; 1825 deactivate_task(src_rq, p, 0);
1828 struct sched_entity *se; 1826 set_task_cpu(p, this_cpu);
1827 activate_task(this_rq, p, 0);
1828 check_preempt_curr(this_rq, p, 0);
1829}
1829 1830
1830 if (next == &cfs_rq->tasks) 1831/*
1831 return NULL; 1832 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1833 */
1834static
1835int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1836 struct sched_domain *sd, enum cpu_idle_type idle,
1837 int *all_pinned)
1838{
1839 int tsk_cache_hot = 0;
1840 /*
1841 * We do not migrate tasks that are:
1842 * 1) running (obviously), or
1843 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1844 * 3) are cache-hot on their current CPU.
1845 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine);
1848 return 0;
1849 }
1850 *all_pinned = 0;
1832 1851
1833 se = list_entry(next, struct sched_entity, group_node); 1852 if (task_running(rq, p)) {
1834 p = task_of(se); 1853 schedstat_inc(p, se.nr_failed_migrations_running);
1835 cfs_rq->balance_iterator = next->next; 1854 return 0;
1855 }
1836 1856
1837 return p; 1857 /*
1838} 1858 * Aggressive migration if:
1859 * 1) task is cache cold, or
1860 * 2) too many balance attempts have failed.
1861 */
1839 1862
1840static struct task_struct *load_balance_start_fair(void *arg) 1863 tsk_cache_hot = task_hot(p, rq->clock, sd);
1841{ 1864 if (!tsk_cache_hot ||
1842 struct cfs_rq *cfs_rq = arg; 1865 sd->nr_balance_failed > sd->cache_nice_tries) {
1866#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations);
1870 }
1871#endif
1872 return 1;
1873 }
1843 1874
1844 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); 1875 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot);
1877 return 0;
1878 }
1879 return 1;
1845} 1880}
1846 1881
1847static struct task_struct *load_balance_next_fair(void *arg) 1882/*
1883 * move_one_task tries to move exactly one task from busiest to this_rq, as
1884 * part of active balancing operations within "domain".
1885 * Returns 1 if successful and 0 otherwise.
1886 *
1887 * Called with both runqueues locked.
1888 */
1889static int
1890move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1891 struct sched_domain *sd, enum cpu_idle_type idle)
1848{ 1892{
1849 struct cfs_rq *cfs_rq = arg; 1893 struct task_struct *p, *n;
1894 struct cfs_rq *cfs_rq;
1895 int pinned = 0;
1896
1897 for_each_leaf_cfs_rq(busiest, cfs_rq) {
1898 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
1899
1900 if (!can_migrate_task(p, busiest, this_cpu,
1901 sd, idle, &pinned))
1902 continue;
1850 1903
1851 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1904 pull_task(busiest, p, this_rq, this_cpu);
1905 /*
1906 * Right now, this is only the second place pull_task()
1907 * is called, so we can safely collect pull_task()
1908 * stats here rather than inside pull_task().
1909 */
1910 schedstat_inc(sd, lb_gained[idle]);
1911 return 1;
1912 }
1913 }
1914
1915 return 0;
1852} 1916}
1853 1917
1854static unsigned long 1918static unsigned long
1855__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1919balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1856 unsigned long max_load_move, struct sched_domain *sd, 1920 unsigned long max_load_move, struct sched_domain *sd,
1857 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, 1921 enum cpu_idle_type idle, int *all_pinned,
1858 struct cfs_rq *cfs_rq) 1922 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
1859{ 1923{
1860 struct rq_iterator cfs_rq_iterator; 1924 int loops = 0, pulled = 0, pinned = 0;
1925 long rem_load_move = max_load_move;
1926 struct task_struct *p, *n;
1861 1927
1862 cfs_rq_iterator.start = load_balance_start_fair; 1928 if (max_load_move == 0)
1863 cfs_rq_iterator.next = load_balance_next_fair; 1929 goto out;
1864 cfs_rq_iterator.arg = cfs_rq;
1865 1930
1866 return balance_tasks(this_rq, this_cpu, busiest, 1931 pinned = 1;
1867 max_load_move, sd, idle, all_pinned, 1932
1868 this_best_prio, &cfs_rq_iterator); 1933 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1934 if (loops++ > sysctl_sched_nr_migrate)
1935 break;
1936
1937 if ((p->se.load.weight >> 1) > rem_load_move ||
1938 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
1939 continue;
1940
1941 pull_task(busiest, p, this_rq, this_cpu);
1942 pulled++;
1943 rem_load_move -= p->se.load.weight;
1944
1945#ifdef CONFIG_PREEMPT
1946 /*
1947 * NEWIDLE balancing is a source of latency, so preemptible
1948 * kernels will stop after the first task is pulled to minimize
1949 * the critical section.
1950 */
1951 if (idle == CPU_NEWLY_IDLE)
1952 break;
1953#endif
1954
1955 /*
1956 * We only want to steal up to the prescribed amount of
1957 * weighted load.
1958 */
1959 if (rem_load_move <= 0)
1960 break;
1961
1962 if (p->prio < *this_best_prio)
1963 *this_best_prio = p->prio;
1964 }
1965out:
1966 /*
1967 * Right now, this is one of only two places pull_task() is called,
1968 * so we can safely collect pull_task() stats here rather than
1969 * inside pull_task().
1970 */
1971 schedstat_add(sd, lb_gained[idle], pulled);
1972
1973 if (all_pinned)
1974 *all_pinned = pinned;
1975
1976 return max_load_move - rem_load_move;
1869} 1977}
1870 1978
1871#ifdef CONFIG_FAIR_GROUP_SCHED 1979#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1897,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1897 rem_load = (u64)rem_load_move * busiest_weight; 2005 rem_load = (u64)rem_load_move * busiest_weight;
1898 rem_load = div_u64(rem_load, busiest_h_load + 1); 2006 rem_load = div_u64(rem_load, busiest_h_load + 1);
1899 2007
1900 moved_load = __load_balance_fair(this_rq, this_cpu, busiest, 2008 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1901 rem_load, sd, idle, all_pinned, this_best_prio, 2009 rem_load, sd, idle, all_pinned, this_best_prio,
1902 tg->cfs_rq[busiest_cpu]); 2010 busiest_cfs_rq);
1903 2011
1904 if (!moved_load) 2012 if (!moved_load)
1905 continue; 2013 continue;
@@ -1922,35 +2030,1509 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1922 struct sched_domain *sd, enum cpu_idle_type idle, 2030 struct sched_domain *sd, enum cpu_idle_type idle,
1923 int *all_pinned, int *this_best_prio) 2031 int *all_pinned, int *this_best_prio)
1924{ 2032{
1925 return __load_balance_fair(this_rq, this_cpu, busiest, 2033 return balance_tasks(this_rq, this_cpu, busiest,
1926 max_load_move, sd, idle, all_pinned, 2034 max_load_move, sd, idle, all_pinned,
1927 this_best_prio, &busiest->cfs); 2035 this_best_prio, &busiest->cfs);
1928} 2036}
1929#endif 2037#endif
1930 2038
1931static int 2039/*
1932move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2040 * move_tasks tries to move up to max_load_move weighted load from busiest to
1933 struct sched_domain *sd, enum cpu_idle_type idle) 2041 * this_rq, as part of a balancing operation within domain "sd".
2042 * Returns 1 if successful and 0 otherwise.
2043 *
2044 * Called with both runqueues locked.
2045 */
2046static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2047 unsigned long max_load_move,
2048 struct sched_domain *sd, enum cpu_idle_type idle,
2049 int *all_pinned)
1934{ 2050{
1935 struct cfs_rq *busy_cfs_rq; 2051 unsigned long total_load_moved = 0, load_moved;
1936 struct rq_iterator cfs_rq_iterator; 2052 int this_best_prio = this_rq->curr->prio;
1937 2053
1938 cfs_rq_iterator.start = load_balance_start_fair; 2054 do {
1939 cfs_rq_iterator.next = load_balance_next_fair; 2055 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2056 max_load_move - total_load_moved,
2057 sd, idle, all_pinned, &this_best_prio);
1940 2058
1941 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 2059 total_load_moved += load_moved;
2060
2061#ifdef CONFIG_PREEMPT
1942 /* 2062 /*
1943 * pass busy_cfs_rq argument into 2063 * NEWIDLE balancing is a source of latency, so preemptible
1944 * load_balance_[start|next]_fair iterators 2064 * kernels will stop after the first task is pulled to minimize
2065 * the critical section.
1945 */ 2066 */
1946 cfs_rq_iterator.arg = busy_cfs_rq; 2067 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
1947 if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 2068 break;
1948 &cfs_rq_iterator)) 2069
1949 return 1; 2070 if (raw_spin_is_contended(&this_rq->lock) ||
2071 raw_spin_is_contended(&busiest->lock))
2072 break;
2073#endif
2074 } while (load_moved && max_load_move > total_load_moved);
2075
2076 return total_load_moved > 0;
2077}
2078
/********** Helpers for find_busiest_group ************************/
/*
 * sd_lb_stats - statistics collected for one sched_domain over the
 *		course of a single load-balancing pass.
 */
struct sd_lb_stats {
	/* Domain-wide results */
	struct sched_group *busiest;	/* busiest group in this sd */
	struct sched_group *this;	/* local group in this sd */
	unsigned long total_load;	/* summed load of all groups in sd */
	unsigned long total_pwr;	/* summed power of all groups in sd */
	unsigned long avg_load;		/* average load across groups in sd */

	/* Figures describing the local group */
	unsigned long this_load;
	unsigned long this_load_per_task;
	unsigned long this_nr_running;

	/* Figures describing the busiest group */
	unsigned long max_load;
	unsigned long busiest_load_per_task;
	unsigned long busiest_nr_running;
	unsigned long busiest_group_capacity;

	int group_imb;			/* imbalance seen inside this sd? */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
	/* Power-savings balancing state */
	int power_savings_balance;	/* powersave balance wanted for sd? */
	struct sched_group *group_min;	/* least loaded group in sd */
	struct sched_group *group_leader; /* group that relieves group_min */
	unsigned long min_load_per_task; /* load_per_task of group_min */
	unsigned long leader_nr_running; /* nr running of group_leader */
	unsigned long min_nr_running;	/* nr running of group_min */
#endif
};
2112
/*
 * sg_lb_stats - per-sched_group figures needed while load balancing
 */
struct sg_lb_stats {
	unsigned long avg_load;		/* average load over the group's CPUs */
	unsigned long group_load;	/* total load over the group's CPUs */
	unsigned long sum_nr_running;	/* tasks running in the group */
	unsigned long sum_weighted_load; /* weighted load of the group's tasks */
	unsigned long group_capacity;
	int group_imb;			/* is the group internally imbalanced? */
};
2124
/**
 * group_first_cpu - first cpu set in a sched_group's cpumask.
 * @group: The sched_group to look at.
 *
 * Returns whatever cpumask_first() reports for the group's cpu mask.
 */
static inline unsigned int group_first_cpu(struct sched_group *group)
{
	const struct cpumask *span = sched_group_cpus(group);

	return cpumask_first(span);
}
2133
2134/**
2135 * get_sd_load_idx - Obtain the load index for a given sched domain.
2136 * @sd: The sched_domain whose load_idx is to be obtained.
2137 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
2138 */
2139static inline int get_sd_load_idx(struct sched_domain *sd,
2140 enum cpu_idle_type idle)
2141{
2142 int load_idx;
2143
2144 switch (idle) {
2145 case CPU_NOT_IDLE:
2146 load_idx = sd->busy_idx;
2147 break;
2148
2149 case CPU_NEWLY_IDLE:
2150 load_idx = sd->newidle_idx;
2151 break;
2152 default:
2153 load_idx = sd->idle_idx;
2154 break;
1950 } 2155 }
1951 2156
2157 return load_idx;
2158}
2159
2160
2161#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2162/**
2163 * init_sd_power_savings_stats - Initialize power savings statistics for
2164 * the given sched_domain, during load balancing.
2165 *
2166 * @sd: Sched domain whose power-savings statistics are to be initialized.
2167 * @sds: Variable containing the statistics for sd.
2168 * @idle: Idle status of the CPU at which we're performing load-balancing.
2169 */
2170static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2171 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2172{
2173 /*
2174 * Busy processors will not participate in power savings
2175 * balance.
2176 */
2177 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2178 sds->power_savings_balance = 0;
2179 else {
2180 sds->power_savings_balance = 1;
2181 sds->min_nr_running = ULONG_MAX;
2182 sds->leader_nr_running = 0;
2183 }
2184}
2185
2186/**
2187 * update_sd_power_savings_stats - Update the power saving stats for a
2188 * sched_domain while performing load balancing.
2189 *
2190 * @group: sched_group belonging to the sched_domain under consideration.
2191 * @sds: Variable containing the statistics of the sched_domain
2192 * @local_group: Does group contain the CPU for which we're performing
2193 * load balancing ?
2194 * @sgs: Variable containing the statistics of the group.
2195 */
2196static inline void update_sd_power_savings_stats(struct sched_group *group,
2197 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2198{
2199
2200 if (!sds->power_savings_balance)
2201 return;
2202
2203 /*
2204 * If the local group is idle or completely loaded
2205 * no need to do power savings balance at this domain
2206 */
2207 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
2208 !sds->this_nr_running))
2209 sds->power_savings_balance = 0;
2210
2211 /*
2212 * If a group is already running at full capacity or idle,
2213 * don't include that group in power savings calculations
2214 */
2215 if (!sds->power_savings_balance ||
2216 sgs->sum_nr_running >= sgs->group_capacity ||
2217 !sgs->sum_nr_running)
2218 return;
2219
2220 /*
2221 * Calculate the group which has the least non-idle load.
2222 * This is the group from where we need to pick up the load
2223 * for saving power
2224 */
2225 if ((sgs->sum_nr_running < sds->min_nr_running) ||
2226 (sgs->sum_nr_running == sds->min_nr_running &&
2227 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
2228 sds->group_min = group;
2229 sds->min_nr_running = sgs->sum_nr_running;
2230 sds->min_load_per_task = sgs->sum_weighted_load /
2231 sgs->sum_nr_running;
2232 }
2233
2234 /*
2235 * Calculate the group which is almost near its
2236 * capacity but still has some space to pick up some load
2237 * from other group and save more power
2238 */
2239 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
2240 return;
2241
2242 if (sgs->sum_nr_running > sds->leader_nr_running ||
2243 (sgs->sum_nr_running == sds->leader_nr_running &&
2244 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
2245 sds->group_leader = group;
2246 sds->leader_nr_running = sgs->sum_nr_running;
2247 }
2248}
2249
2250/**
2251 * check_power_save_busiest_group - see if there is potential for some power-savings balance
2252 * @sds: Variable containing the statistics of the sched_domain
2253 * under consideration.
2254 * @this_cpu: Cpu at which we're currently performing load-balancing.
2255 * @imbalance: Variable to store the imbalance.
2256 *
2257 * Description:
2258 * Check if we have potential to perform some power-savings balance.
2259 * If yes, set the busiest group to be the least loaded group in the
2260 * sched_domain, so that it's CPUs can be put to idle.
2261 *
2262 * Returns 1 if there is potential to perform power-savings balance.
2263 * Else returns 0.
2264 */
2265static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2266 int this_cpu, unsigned long *imbalance)
2267{
2268 if (!sds->power_savings_balance)
2269 return 0;
2270
2271 if (sds->this != sds->group_leader ||
2272 sds->group_leader == sds->group_min)
2273 return 0;
2274
2275 *imbalance = sds->min_load_per_task;
2276 sds->busiest = sds->group_min;
2277
2278 return 1;
2279
2280}
2281#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2282static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2283 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2284{
2285 return;
2286}
2287
/* No SCHED_MC/SCHED_SMT: there are no power-savings stats to update. */
static inline void update_sd_power_savings_stats(struct sched_group *group,
	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
}
2293
/* No SCHED_MC/SCHED_SMT: a power-savings balance is never possible. */
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
					int this_cpu, unsigned long *imbalance)
{
	return 0;
}
2299#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2300
2301
2302unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2303{
2304 return SCHED_LOAD_SCALE;
2305}
2306
2307unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2308{
2309 return default_scale_freq_power(sd, cpu);
2310}
2311
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2315 unsigned long smt_gain = sd->smt_gain;
2316
2317 smt_gain /= weight;
2318
2319 return smt_gain;
2320}
2321
2322unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
2323{
2324 return default_scale_smt_power(sd, cpu);
2325}
2326
2327unsigned long scale_rt_power(int cpu)
2328{
2329 struct rq *rq = cpu_rq(cpu);
2330 u64 total, available;
2331
2332 sched_avg_update(rq);
2333
2334 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2335 available = total - rq->rt_avg;
2336
2337 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2338 total = SCHED_LOAD_SCALE;
2339
2340 total >>= SCHED_LOAD_SHIFT;
2341
2342 return div_u64(available, total);
2343}
2344
2345static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2348 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups;
2350
2351 if (sched_feat(ARCH_POWER))
2352 power *= arch_scale_freq_power(sd, cpu);
2353 else
2354 power *= default_scale_freq_power(sd, cpu);
2355
2356 power >>= SCHED_LOAD_SHIFT;
2357
2358 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2359 if (sched_feat(ARCH_POWER))
2360 power *= arch_scale_smt_power(sd, cpu);
2361 else
2362 power *= default_scale_smt_power(sd, cpu);
2363
2364 power >>= SCHED_LOAD_SHIFT;
2365 }
2366
2367 power *= scale_rt_power(cpu);
2368 power >>= SCHED_LOAD_SHIFT;
2369
2370 if (!power)
2371 power = 1;
2372
2373 sdg->cpu_power = power;
2374}
2375
2376static void update_group_power(struct sched_domain *sd, int cpu)
2377{
2378 struct sched_domain *child = sd->child;
2379 struct sched_group *group, *sdg = sd->groups;
2380 unsigned long power;
2381
2382 if (!child) {
2383 update_cpu_power(sd, cpu);
2384 return;
2385 }
2386
2387 power = 0;
2388
2389 group = child->groups;
2390 do {
2391 power += group->cpu_power;
2392 group = group->next;
2393 } while (group != child->groups);
2394
2395 sdg->cpu_power = power;
2396}
2397
2398/**
2399 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2400 * @sd: The sched_domain whose statistics are to be updated.
2401 * @group: sched_group whose statistics are to be updated.
2402 * @this_cpu: Cpu for which load balance is currently performed.
2403 * @idle: Idle status of this_cpu
2404 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2405 * @sd_idle: Idle status of the sched_domain containing group.
2406 * @local_group: Does group contain this_cpu.
2407 * @cpus: Set of cpus considered for load balancing.
2408 * @balance: Should we balance.
2409 * @sgs: variable to hold the statistics for this group.
2410 */
2411static inline void update_sg_lb_stats(struct sched_domain *sd,
2412 struct sched_group *group, int this_cpu,
2413 enum cpu_idle_type idle, int load_idx, int *sd_idle,
2414 int local_group, const struct cpumask *cpus,
2415 int *balance, struct sg_lb_stats *sgs)
2416{
2417 unsigned long load, max_cpu_load, min_cpu_load;
2418 int i;
2419 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2420 unsigned long avg_load_per_task = 0;
2421
2422 if (local_group)
2423 balance_cpu = group_first_cpu(group);
2424
2425 /* Tally up the load of all CPUs in the group */
2426 max_cpu_load = 0;
2427 min_cpu_load = ~0UL;
2428
2429 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2430 struct rq *rq = cpu_rq(i);
2431
2432 if (*sd_idle && rq->nr_running)
2433 *sd_idle = 0;
2434
2435 /* Bias balancing toward cpus of our domain */
2436 if (local_group) {
2437 if (idle_cpu(i) && !first_idle_cpu) {
2438 first_idle_cpu = 1;
2439 balance_cpu = i;
2440 }
2441
2442 load = target_load(i, load_idx);
2443 } else {
2444 load = source_load(i, load_idx);
2445 if (load > max_cpu_load)
2446 max_cpu_load = load;
2447 if (min_cpu_load > load)
2448 min_cpu_load = load;
2449 }
2450
2451 sgs->group_load += load;
2452 sgs->sum_nr_running += rq->nr_running;
2453 sgs->sum_weighted_load += weighted_cpuload(i);
2454
2455 }
2456
2457 /*
2458 * First idle cpu or the first cpu(busiest) in this sched group
2459 * is eligible for doing load balancing at this and above
2460 * domains. In the newly idle case, we will allow all the cpu's
2461 * to do the newly idle load balance.
2462 */
2463 if (idle != CPU_NEWLY_IDLE && local_group &&
2464 balance_cpu != this_cpu) {
2465 *balance = 0;
2466 return;
2467 }
2468
2469 update_group_power(sd, this_cpu);
2470
2471 /* Adjust by relative CPU power of the group */
2472 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2473
2474 /*
2475 * Consider the group unbalanced when the imbalance is larger
2476 * than the average weight of two tasks.
2477 *
2478 * APZ: with cgroup the avg task weight can vary wildly and
2479 * might not be a suitable number - should we keep a
2480 * normalized nr_running number somewhere that negates
2481 * the hierarchy?
2482 */
2483 if (sgs->sum_nr_running)
2484 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2485
2486 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
2487 sgs->group_imb = 1;
2488
2489 sgs->group_capacity =
2490 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2491}
2492
2493/**
2494 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
2495 * @sd: sched_domain whose statistics are to be updated.
2496 * @this_cpu: Cpu for which load balance is currently performed.
2497 * @idle: Idle status of this_cpu
2498 * @sd_idle: Idle status of the sched_domain containing group.
2499 * @cpus: Set of cpus considered for load balancing.
2500 * @balance: Should we balance.
2501 * @sds: variable to hold the statistics for this sched_domain.
2502 */
2503static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2504 enum cpu_idle_type idle, int *sd_idle,
2505 const struct cpumask *cpus, int *balance,
2506 struct sd_lb_stats *sds)
2507{
2508 struct sched_domain *child = sd->child;
2509 struct sched_group *group = sd->groups;
2510 struct sg_lb_stats sgs;
2511 int load_idx, prefer_sibling = 0;
2512
2513 if (child && child->flags & SD_PREFER_SIBLING)
2514 prefer_sibling = 1;
2515
2516 init_sd_power_savings_stats(sd, sds, idle);
2517 load_idx = get_sd_load_idx(sd, idle);
2518
2519 do {
2520 int local_group;
2521
2522 local_group = cpumask_test_cpu(this_cpu,
2523 sched_group_cpus(group));
2524 memset(&sgs, 0, sizeof(sgs));
2525 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
2526 local_group, cpus, balance, &sgs);
2527
2528 if (local_group && !(*balance))
2529 return;
2530
2531 sds->total_load += sgs.group_load;
2532 sds->total_pwr += group->cpu_power;
2533
2534 /*
2535 * In case the child domain prefers tasks go to siblings
2536 * first, lower the group capacity to one so that we'll try
2537 * and move all the excess tasks away.
2538 */
2539 if (prefer_sibling)
2540 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2541
2542 if (local_group) {
2543 sds->this_load = sgs.avg_load;
2544 sds->this = group;
2545 sds->this_nr_running = sgs.sum_nr_running;
2546 sds->this_load_per_task = sgs.sum_weighted_load;
2547 } else if (sgs.avg_load > sds->max_load &&
2548 (sgs.sum_nr_running > sgs.group_capacity ||
2549 sgs.group_imb)) {
2550 sds->max_load = sgs.avg_load;
2551 sds->busiest = group;
2552 sds->busiest_nr_running = sgs.sum_nr_running;
2553 sds->busiest_group_capacity = sgs.group_capacity;
2554 sds->busiest_load_per_task = sgs.sum_weighted_load;
2555 sds->group_imb = sgs.group_imb;
2556 }
2557
2558 update_sd_power_savings_stats(group, sds, local_group, &sgs);
2559 group = group->next;
2560 } while (group != sd->groups);
2561}
2562
2563/**
2564 * fix_small_imbalance - Calculate the minor imbalance that exists
2565 * amongst the groups of a sched_domain, during
2566 * load balancing.
2567 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
2568 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2569 * @imbalance: Variable to store the imbalance.
2570 */
2571static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2572 int this_cpu, unsigned long *imbalance)
2573{
2574 unsigned long tmp, pwr_now = 0, pwr_move = 0;
2575 unsigned int imbn = 2;
2576 unsigned long scaled_busy_load_per_task;
2577
2578 if (sds->this_nr_running) {
2579 sds->this_load_per_task /= sds->this_nr_running;
2580 if (sds->busiest_load_per_task >
2581 sds->this_load_per_task)
2582 imbn = 1;
2583 } else
2584 sds->this_load_per_task =
2585 cpu_avg_load_per_task(this_cpu);
2586
2587 scaled_busy_load_per_task = sds->busiest_load_per_task
2588 * SCHED_LOAD_SCALE;
2589 scaled_busy_load_per_task /= sds->busiest->cpu_power;
2590
2591 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2592 (scaled_busy_load_per_task * imbn)) {
2593 *imbalance = sds->busiest_load_per_task;
2594 return;
2595 }
2596
2597 /*
2598 * OK, we don't have enough imbalance to justify moving tasks,
2599 * however we may be able to increase total CPU power used by
2600 * moving them.
2601 */
2602
2603 pwr_now += sds->busiest->cpu_power *
2604 min(sds->busiest_load_per_task, sds->max_load);
2605 pwr_now += sds->this->cpu_power *
2606 min(sds->this_load_per_task, sds->this_load);
2607 pwr_now /= SCHED_LOAD_SCALE;
2608
2609 /* Amount of load we'd subtract */
2610 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2611 sds->busiest->cpu_power;
2612 if (sds->max_load > tmp)
2613 pwr_move += sds->busiest->cpu_power *
2614 min(sds->busiest_load_per_task, sds->max_load - tmp);
2615
2616 /* Amount of load we'd add */
2617 if (sds->max_load * sds->busiest->cpu_power <
2618 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
2619 tmp = (sds->max_load * sds->busiest->cpu_power) /
2620 sds->this->cpu_power;
2621 else
2622 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2623 sds->this->cpu_power;
2624 pwr_move += sds->this->cpu_power *
2625 min(sds->this_load_per_task, sds->this_load + tmp);
2626 pwr_move /= SCHED_LOAD_SCALE;
2627
2628 /* Move if we gain throughput */
2629 if (pwr_move > pwr_now)
2630 *imbalance = sds->busiest_load_per_task;
2631}
2632
2633/**
2634 * calculate_imbalance - Calculate the amount of imbalance present within the
2635 * groups of a given sched_domain during load balance.
2636 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
2637 * @this_cpu: Cpu for which currently load balance is being performed.
2638 * @imbalance: The variable to store the imbalance.
2639 */
2640static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2641 unsigned long *imbalance)
2642{
2643 unsigned long max_pull, load_above_capacity = ~0UL;
2644
2645 sds->busiest_load_per_task /= sds->busiest_nr_running;
2646 if (sds->group_imb) {
2647 sds->busiest_load_per_task =
2648 min(sds->busiest_load_per_task, sds->avg_load);
2649 }
2650
2651 /*
2652 * In the presence of smp nice balancing, certain scenarios can have
2653 * max load less than avg load(as we skip the groups at or below
2654 * its cpu_power, while calculating max_load..)
2655 */
2656 if (sds->max_load < sds->avg_load) {
2657 *imbalance = 0;
2658 return fix_small_imbalance(sds, this_cpu, imbalance);
2659 }
2660
2661 if (!sds->group_imb) {
2662 /*
2663 * Don't want to pull so many tasks that a group would go idle.
2664 */
2665 load_above_capacity = (sds->busiest_nr_running -
2666 sds->busiest_group_capacity);
2667
2668 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
2669
2670 load_above_capacity /= sds->busiest->cpu_power;
2671 }
2672
2673 /*
2674 * We're trying to get all the cpus to the average_load, so we don't
2675 * want to push ourselves above the average load, nor do we wish to
2676 * reduce the max loaded cpu below the average load. At the same time,
2677 * we also don't want to reduce the group load below the group capacity
2678 * (so that we can implement power-savings policies etc). Thus we look
2679 * for the minimum possible imbalance.
2680 * Be careful of negative numbers as they'll appear as very large values
2681 * with unsigned longs.
2682 */
2683 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
2684
2685 /* How much load to actually move to equalise the imbalance */
2686 *imbalance = min(max_pull * sds->busiest->cpu_power,
2687 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
2688 / SCHED_LOAD_SCALE;
2689
2690 /*
2691 * if *imbalance is less than the average load per runnable task
2692 * there is no gaurantee that any tasks will be moved so we'll have
2693 * a think about bumping its value to force at least one task to be
2694 * moved
2695 */
2696 if (*imbalance < sds->busiest_load_per_task)
2697 return fix_small_imbalance(sds, this_cpu, imbalance);
2698
2699}
2700/******* find_busiest_group() helpers end here *********************/
2701
2702/**
2703 * find_busiest_group - Returns the busiest group within the sched_domain
2704 * if there is an imbalance. If there isn't an imbalance, and
2705 * the user has opted for power-savings, it returns a group whose
2706 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
2707 * such a group exists.
2708 *
2709 * Also calculates the amount of weighted load which should be moved
2710 * to restore balance.
2711 *
2712 * @sd: The sched_domain whose busiest group is to be returned.
2713 * @this_cpu: The cpu for which load balancing is currently being performed.
2714 * @imbalance: Variable which stores amount of weighted load which should
2715 * be moved to restore balance/put a group to idle.
2716 * @idle: The idle status of this_cpu.
2717 * @sd_idle: The idleness of sd
2718 * @cpus: The set of CPUs under consideration for load-balancing.
2719 * @balance: Pointer to a variable indicating if this_cpu
2720 * is the appropriate cpu to perform load balancing at this_level.
2721 *
2722 * Returns: - the busiest group if imbalance exists.
2723 * - If no imbalance and user has opted for power-savings balance,
2724 * return the least loaded group whose CPUs can be
2725 * put to idle by rebalancing its tasks onto our group.
2726 */
2727static struct sched_group *
2728find_busiest_group(struct sched_domain *sd, int this_cpu,
2729 unsigned long *imbalance, enum cpu_idle_type idle,
2730 int *sd_idle, const struct cpumask *cpus, int *balance)
2731{
2732 struct sd_lb_stats sds;
2733
2734 memset(&sds, 0, sizeof(sds));
2735
2736 /*
2737 * Compute the various statistics relavent for load balancing at
2738 * this level.
2739 */
2740 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
2741 balance, &sds);
2742
2743 /* Cases where imbalance does not exist from POV of this_cpu */
2744 /* 1) this_cpu is not the appropriate cpu to perform load balancing
2745 * at this level.
2746 * 2) There is no busy sibling group to pull from.
2747 * 3) This group is the busiest group.
2748 * 4) This group is more busy than the avg busieness at this
2749 * sched_domain.
2750 * 5) The imbalance is within the specified limit.
2751 */
2752 if (!(*balance))
2753 goto ret;
2754
2755 if (!sds.busiest || sds.busiest_nr_running == 0)
2756 goto out_balanced;
2757
2758 if (sds.this_load >= sds.max_load)
2759 goto out_balanced;
2760
2761 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
2762
2763 if (sds.this_load >= sds.avg_load)
2764 goto out_balanced;
2765
2766 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2767 goto out_balanced;
2768
2769 /* Looks like there is an imbalance. Compute it */
2770 calculate_imbalance(&sds, this_cpu, imbalance);
2771 return sds.busiest;
2772
2773out_balanced:
2774 /*
2775 * There is no obvious imbalance. But check if we can do some balancing
2776 * to save power.
2777 */
2778 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
2779 return sds.busiest;
2780ret:
2781 *imbalance = 0;
2782 return NULL;
2783}
2784
2785/*
2786 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2787 */
2788static struct rq *
2789find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2790 unsigned long imbalance, const struct cpumask *cpus)
2791{
2792 struct rq *busiest = NULL, *rq;
2793 unsigned long max_load = 0;
2794 int i;
2795
2796 for_each_cpu(i, sched_group_cpus(group)) {
2797 unsigned long power = power_of(i);
2798 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2799 unsigned long wl;
2800
2801 if (!cpumask_test_cpu(i, cpus))
2802 continue;
2803
2804 rq = cpu_rq(i);
2805 wl = weighted_cpuload(i);
2806
2807 /*
2808 * When comparing with imbalance, use weighted_cpuload()
2809 * which is not scaled with the cpu power.
2810 */
2811 if (capacity && rq->nr_running == 1 && wl > imbalance)
2812 continue;
2813
2814 /*
2815 * For the load comparisons with the other cpu's, consider
2816 * the weighted_cpuload() scaled with the cpu power, so that
2817 * the load can be moved away from the cpu that is potentially
2818 * running at a lower capacity.
2819 */
2820 wl = (wl * SCHED_LOAD_SCALE) / power;
2821
2822 if (wl > max_load) {
2823 max_load = wl;
2824 busiest = rq;
2825 }
2826 }
2827
2828 return busiest;
2829}
2830
/*
 * Upper bound for the balance interval back-off applied when pinned
 * tasks are encountered. The exact value is fairly arbitrary; it only
 * has to be large enough.
 */
#define MAX_PINNED_INTERVAL	512

/* Per-cpu scratch cpumask used by load_balance and load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2839
2840static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2841{
2842 if (idle == CPU_NEWLY_IDLE) {
2843 /*
2844 * The only task running in a non-idle cpu can be moved to this
2845 * cpu in an attempt to completely freeup the other CPU
2846 * package.
2847 *
2848 * The package power saving logic comes from
2849 * find_busiest_group(). If there are no imbalance, then
2850 * f_b_g() will return NULL. However when sched_mc={1,2} then
2851 * f_b_g() will select a group from which a running task may be
2852 * pulled to this cpu in order to make the other package idle.
2853 * If there is no opportunity to make a package idle and if
2854 * there are no imbalance, then f_b_g() will return NULL and no
2855 * action will be taken in load_balance_newidle().
2856 *
2857 * Under normal task pull operation due to imbalance, there
2858 * will be more than one task in the source run queue and
2859 * move_tasks() will succeed. ld_moved will be true and this
2860 * active balance code will not be triggered.
2861 */
2862 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2863 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2864 return 0;
2865
2866 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
2867 return 0;
2868 }
2869
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871}
2872
2873/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance.
2876 */
2877static int load_balance(int this_cpu, struct rq *this_rq,
2878 struct sched_domain *sd, enum cpu_idle_type idle,
2879 int *balance)
2880{
2881 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2882 struct sched_group *group;
2883 unsigned long imbalance;
2884 struct rq *busiest;
2885 unsigned long flags;
2886 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
2887
2888 cpumask_copy(cpus, cpu_active_mask);
2889
2890 /*
2891 * When power savings policy is enabled for the parent domain, idle
2892 * sibling can pick up load irrespective of busy siblings. In this case,
2893 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2894 * portraying it as CPU_NOT_IDLE.
2895 */
2896 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2897 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2898 sd_idle = 1;
2899
2900 schedstat_inc(sd, lb_count[idle]);
2901
2902redo:
2903 update_shares(sd);
2904 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2905 cpus, balance);
2906
2907 if (*balance == 0)
2908 goto out_balanced;
2909
2910 if (!group) {
2911 schedstat_inc(sd, lb_nobusyg[idle]);
2912 goto out_balanced;
2913 }
2914
2915 busiest = find_busiest_queue(group, idle, imbalance, cpus);
2916 if (!busiest) {
2917 schedstat_inc(sd, lb_nobusyq[idle]);
2918 goto out_balanced;
2919 }
2920
2921 BUG_ON(busiest == this_rq);
2922
2923 schedstat_add(sd, lb_imbalance[idle], imbalance);
2924
2925 ld_moved = 0;
2926 if (busiest->nr_running > 1) {
2927 /*
2928 * Attempt to move tasks. If find_busiest_group has found
2929 * an imbalance but busiest->nr_running <= 1, the group is
2930 * still unbalanced. ld_moved simply stays zero, so it is
2931 * correctly treated as an imbalance.
2932 */
2933 local_irq_save(flags);
2934 double_rq_lock(this_rq, busiest);
2935 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2936 imbalance, sd, idle, &all_pinned);
2937 double_rq_unlock(this_rq, busiest);
2938 local_irq_restore(flags);
2939
2940 /*
2941 * some other cpu did the load balance for us.
2942 */
2943 if (ld_moved && this_cpu != smp_processor_id())
2944 resched_cpu(this_cpu);
2945
2946 /* All tasks on this runqueue were pinned by CPU affinity */
2947 if (unlikely(all_pinned)) {
2948 cpumask_clear_cpu(cpu_of(busiest), cpus);
2949 if (!cpumask_empty(cpus))
2950 goto redo;
2951 goto out_balanced;
2952 }
2953 }
2954
2955 if (!ld_moved) {
2956 schedstat_inc(sd, lb_failed[idle]);
2957 sd->nr_balance_failed++;
2958
2959 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags);
2961
2962 /* don't kick the migration_thread, if the curr
2963 * task on busiest cpu can't be moved to this_cpu
2964 */
2965 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) {
2967 raw_spin_unlock_irqrestore(&busiest->lock,
2968 flags);
2969 all_pinned = 1;
2970 goto out_one_pinned;
2971 }
2972
2973 if (!busiest->active_balance) {
2974 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu;
2976 active_balance = 1;
2977 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2979 if (active_balance)
2980 wake_up_process(busiest->migration_thread);
2981
2982 /*
2983 * We've kicked active balancing, reset the failure
2984 * counter.
2985 */
2986 sd->nr_balance_failed = sd->cache_nice_tries+1;
2987 }
2988 } else
2989 sd->nr_balance_failed = 0;
2990
2991 if (likely(!active_balance)) {
2992 /* We were unbalanced, so reset the balancing interval */
2993 sd->balance_interval = sd->min_interval;
2994 } else {
2995 /*
2996 * If we've begun active balancing, start to back off. This
2997 * case may not be covered by the all_pinned logic if there
2998 * is only 1 task on the busy runqueue (because we don't call
2999 * move_tasks).
3000 */
3001 if (sd->balance_interval < sd->max_interval)
3002 sd->balance_interval *= 2;
3003 }
3004
3005 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3006 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3007 ld_moved = -1;
3008
3009 goto out;
3010
3011out_balanced:
3012 schedstat_inc(sd, lb_balanced[idle]);
3013
3014 sd->nr_balance_failed = 0;
3015
3016out_one_pinned:
3017 /* tune up the balancing interval */
3018 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3019 (sd->balance_interval < sd->max_interval))
3020 sd->balance_interval *= 2;
3021
3022 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3023 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3024 ld_moved = -1;
3025 else
3026 ld_moved = 0;
3027out:
3028 if (ld_moved)
3029 update_shares(sd);
3030 return ld_moved;
3031}
3032
3033/*
3034 * idle_balance is called by schedule() if this_cpu is about to become
3035 * idle. Attempts to pull tasks from other CPUs.
3036 */
3037static void idle_balance(int this_cpu, struct rq *this_rq)
3038{
3039 struct sched_domain *sd;
3040 int pulled_task = 0;
3041 unsigned long next_balance = jiffies + HZ;
3042
3043 this_rq->idle_stamp = this_rq->clock;
3044
3045 if (this_rq->avg_idle < sysctl_sched_migration_cost)
3046 return;
3047
3048 /*
3049 * Drop the rq->lock, but keep IRQ/preempt disabled.
3050 */
3051 raw_spin_unlock(&this_rq->lock);
3052
3053 for_each_domain(this_cpu, sd) {
3054 unsigned long interval;
3055 int balance = 1;
3056
3057 if (!(sd->flags & SD_LOAD_BALANCE))
3058 continue;
3059
3060 if (sd->flags & SD_BALANCE_NEWIDLE) {
3061 /* If we've pulled tasks over stop searching: */
3062 pulled_task = load_balance(this_cpu, this_rq,
3063 sd, CPU_NEWLY_IDLE, &balance);
3064 }
3065
3066 interval = msecs_to_jiffies(sd->balance_interval);
3067 if (time_after(next_balance, sd->last_balance + interval))
3068 next_balance = sd->last_balance + interval;
3069 if (pulled_task) {
3070 this_rq->idle_stamp = 0;
3071 break;
3072 }
3073 }
3074
3075 raw_spin_lock(&this_rq->lock);
3076
3077 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3078 /*
3079 * We are going idle. next_balance may be set based on
3080 * a busy processor. So reset next_balance.
3081 */
3082 this_rq->next_balance = next_balance;
3083 }
3084}
3085
3086/*
3087 * active_load_balance is run by migration threads. It pushes running tasks
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
3089 * running on each physical CPU where possible, and avoids physical /
3090 * logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3095{
3096 int target_cpu = busiest_rq->push_cpu;
3097 struct sched_domain *sd;
3098 struct rq *target_rq;
3099
3100 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1)
3102 return;
3103
3104 target_rq = cpu_rq(target_cpu);
3105
3106 /*
3107 * This condition is "impossible", if it occurs
3108 * we need to fix it. Originally reported by
3109 * Bjorn Helgaas on a 128-cpu setup.
3110 */
3111 BUG_ON(busiest_rq == target_rq);
3112
3113 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117
3118 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) {
3120 if ((sd->flags & SD_LOAD_BALANCE) &&
3121 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3122 break;
3123 }
3124
3125 if (likely(sd)) {
3126 schedstat_inc(sd, alb_count);
3127
3128 if (move_one_task(target_rq, target_cpu, busiest_rq,
3129 sd, CPU_IDLE))
3130 schedstat_inc(sd, alb_pushed);
3131 else
3132 schedstat_inc(sd, alb_failed);
3133 }
3134 double_unlock_balance(busiest_rq, target_rq);
3135}
3136
3137#ifdef CONFIG_NO_HZ
3138static struct {
3139 atomic_t load_balancer;
3140 cpumask_var_t cpu_mask;
3141 cpumask_var_t ilb_grp_nohz_mask;
3142} nohz ____cacheline_aligned = {
3143 .load_balancer = ATOMIC_INIT(-1),
3144};
3145
3146int get_nohz_load_balancer(void)
3147{
3148 return atomic_read(&nohz.load_balancer);
3149}
3150
3151#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3152/**
3153 * lowest_flag_domain - Return lowest sched_domain containing flag.
3154 * @cpu: The cpu whose lowest level of sched domain is to
3155 * be returned.
3156 * @flag: The flag to check for the lowest sched_domain
3157 * for the given cpu.
3158 *
3159 * Returns the lowest sched_domain of a cpu which contains the given flag.
3160 */
3161static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3162{
3163 struct sched_domain *sd;
3164
3165 for_each_domain(cpu, sd)
3166 if (sd && (sd->flags & flag))
3167 break;
3168
3169 return sd;
3170}
3171
3172/**
3173 * for_each_flag_domain - Iterates over sched_domains containing the flag.
3174 * @cpu: The cpu whose domains we're iterating over.
3175 * @sd: variable holding the value of the power_savings_sd
3176 * for cpu.
3177 * @flag: The flag to filter the sched_domains to be iterated.
3178 *
3179 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
3180 * set, starting from the lowest sched_domain to the highest.
3181 */
3182#define for_each_flag_domain(cpu, sd, flag) \
3183 for (sd = lowest_flag_domain(cpu, flag); \
3184 (sd && (sd->flags & flag)); sd = sd->parent)
3185
3186/**
3187 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
3188 * @ilb_group: group to be checked for semi-idleness
3189 *
3190 * Returns: 1 if the group is semi-idle. 0 otherwise.
3191 *
3192 * We define a sched_group to be semi idle if it has atleast one idle-CPU
3193 * and atleast one non-idle CPU. This helper function checks if the given
3194 * sched_group is semi-idle or not.
3195 */
3196static inline int is_semi_idle_group(struct sched_group *ilb_group)
3197{
3198 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
3199 sched_group_cpus(ilb_group));
3200
3201 /*
3202 * A sched_group is semi-idle when it has atleast one busy cpu
3203 * and atleast one idle cpu.
3204 */
3205 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
3206 return 0;
3207
3208 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
3209 return 0;
3210
3211 return 1;
3212}
3213/**
3214 * find_new_ilb - Finds the optimum idle load balancer for nomination.
3215 * @cpu: The cpu which is nominating a new idle_load_balancer.
3216 *
3217 * Returns: Returns the id of the idle load balancer if it exists,
3218 * Else, returns >= nr_cpu_ids.
3219 *
3220 * This algorithm picks the idle load balancer such that it belongs to a
3221 * semi-idle powersavings sched_domain. The idea is to try and avoid
3222 * completely idle packages/cores just for the purpose of idle load balancing
3223 * when there are other idle cpu's which are better suited for that job.
3224 */
3225static int find_new_ilb(int cpu)
3226{
3227 struct sched_domain *sd;
3228 struct sched_group *ilb_group;
3229
3230 /*
3231 * Have idle load balancer selection from semi-idle packages only
3232 * when power-aware load balancing is enabled
3233 */
3234 if (!(sched_smt_power_savings || sched_mc_power_savings))
3235 goto out_done;
3236
3237 /*
3238 * Optimize for the case when we have no idle CPUs or only one
3239 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3240 */
3241 if (cpumask_weight(nohz.cpu_mask) < 2)
3242 goto out_done;
3243
3244 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3245 ilb_group = sd->groups;
3246
3247 do {
3248 if (is_semi_idle_group(ilb_group))
3249 return cpumask_first(nohz.ilb_grp_nohz_mask);
3250
3251 ilb_group = ilb_group->next;
3252
3253 } while (ilb_group != sd->groups);
3254 }
3255
3256out_done:
3257 return cpumask_first(nohz.cpu_mask);
3258}
3259#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3260static inline int find_new_ilb(int call_cpu)
3261{
3262 return cpumask_first(nohz.cpu_mask);
3263}
3264#endif
3265
3266/*
3267 * This routine will try to nominate the ilb (idle load balancing)
3268 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3269 * load balancing on behalf of all those cpus. If all the cpus in the system
3270 * go into this tickless mode, then there will be no ilb owner (as there is
3271 * no need for one) and all the cpus will sleep till the next wakeup event
3272 * arrives...
3273 *
3274 * For the ilb owner, tick is not stopped. And this tick will be used
3275 * for idle load balancing. ilb owner will still be part of
3276 * nohz.cpu_mask..
3277 *
3278 * While stopping the tick, this cpu will become the ilb owner if there
3279 * is no other owner. And will be the owner till that cpu becomes busy
3280 * or if all cpus in the system stop their ticks at which point
3281 * there is no need for ilb owner.
3282 *
3283 * When the ilb owner becomes busy, it nominates another owner, during the
3284 * next busy scheduler_tick()
3285 */
3286int select_nohz_load_balancer(int stop_tick)
3287{
3288 int cpu = smp_processor_id();
3289
3290 if (stop_tick) {
3291 cpu_rq(cpu)->in_nohz_recently = 1;
3292
3293 if (!cpu_active(cpu)) {
3294 if (atomic_read(&nohz.load_balancer) != cpu)
3295 return 0;
3296
3297 /*
3298 * If we are going offline and still the leader,
3299 * give up!
3300 */
3301 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3302 BUG();
3303
3304 return 0;
3305 }
3306
3307 cpumask_set_cpu(cpu, nohz.cpu_mask);
3308
3309 /* time for ilb owner also to sleep */
3310 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
3311 if (atomic_read(&nohz.load_balancer) == cpu)
3312 atomic_set(&nohz.load_balancer, -1);
3313 return 0;
3314 }
3315
3316 if (atomic_read(&nohz.load_balancer) == -1) {
3317 /* make me the ilb owner */
3318 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3319 return 1;
3320 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3321 int new_ilb;
3322
3323 if (!(sched_smt_power_savings ||
3324 sched_mc_power_savings))
3325 return 1;
3326 /*
3327 * Check to see if there is a more power-efficient
3328 * ilb.
3329 */
3330 new_ilb = find_new_ilb(cpu);
3331 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3332 atomic_set(&nohz.load_balancer, -1);
3333 resched_cpu(new_ilb);
3334 return 0;
3335 }
3336 return 1;
3337 }
3338 } else {
3339 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3340 return 0;
3341
3342 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3343
3344 if (atomic_read(&nohz.load_balancer) == cpu)
3345 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3346 BUG();
3347 }
3348 return 0;
3349}
3350#endif
3351
3352static DEFINE_SPINLOCK(balancing);
3353
3354/*
3355 * It checks each scheduling domain to see if it is due to be balanced,
3356 * and initiates a balancing operation if so.
3357 *
3358 * Balancing parameters are set up in arch_init_sched_domains.
3359 */
3360static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3361{
3362 int balance = 1;
3363 struct rq *rq = cpu_rq(cpu);
3364 unsigned long interval;
3365 struct sched_domain *sd;
3366 /* Earliest time when we have to do rebalance again */
3367 unsigned long next_balance = jiffies + 60*HZ;
3368 int update_next_balance = 0;
3369 int need_serialize;
3370
3371 for_each_domain(cpu, sd) {
3372 if (!(sd->flags & SD_LOAD_BALANCE))
3373 continue;
3374
3375 interval = sd->balance_interval;
3376 if (idle != CPU_IDLE)
3377 interval *= sd->busy_factor;
3378
3379 /* scale ms to jiffies */
3380 interval = msecs_to_jiffies(interval);
3381 if (unlikely(!interval))
3382 interval = 1;
3383 if (interval > HZ*NR_CPUS/10)
3384 interval = HZ*NR_CPUS/10;
3385
3386 need_serialize = sd->flags & SD_SERIALIZE;
3387
3388 if (need_serialize) {
3389 if (!spin_trylock(&balancing))
3390 goto out;
3391 }
3392
3393 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3394 if (load_balance(cpu, rq, sd, idle, &balance)) {
3395 /*
3396 * We've pulled tasks over so either we're no
3397 * longer idle, or one of our SMT siblings is
3398 * not idle.
3399 */
3400 idle = CPU_NOT_IDLE;
3401 }
3402 sd->last_balance = jiffies;
3403 }
3404 if (need_serialize)
3405 spin_unlock(&balancing);
3406out:
3407 if (time_after(next_balance, sd->last_balance + interval)) {
3408 next_balance = sd->last_balance + interval;
3409 update_next_balance = 1;
3410 }
3411
3412 /*
3413 * Stop the load balance at this level. There is another
3414 * CPU in our sched group which is doing load balancing more
3415 * actively.
3416 */
3417 if (!balance)
3418 break;
3419 }
3420
3421 /*
3422 * next_balance will be updated only when there is a need.
3423 * When the cpu is attached to null domain for ex, it will not be
3424 * updated.
3425 */
3426 if (likely(update_next_balance))
3427 rq->next_balance = next_balance;
3428}
3429
3430/*
3431 * run_rebalance_domains is triggered when needed from the scheduler tick.
3432 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3433 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3434 */
3435static void run_rebalance_domains(struct softirq_action *h)
3436{
3437 int this_cpu = smp_processor_id();
3438 struct rq *this_rq = cpu_rq(this_cpu);
3439 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3440 CPU_IDLE : CPU_NOT_IDLE;
3441
3442 rebalance_domains(this_cpu, idle);
3443
3444#ifdef CONFIG_NO_HZ
3445 /*
3446 * If this cpu is the owner for idle load balancing, then do the
3447 * balancing on behalf of the other idle cpus whose ticks are
3448 * stopped.
3449 */
3450 if (this_rq->idle_at_tick &&
3451 atomic_read(&nohz.load_balancer) == this_cpu) {
3452 struct rq *rq;
3453 int balance_cpu;
3454
3455 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3456 if (balance_cpu == this_cpu)
3457 continue;
3458
3459 /*
3460 * If this cpu gets work to do, stop the load balancing
3461 * work being done for other cpus. Next load
3462 * balancing owner will pick it up.
3463 */
3464 if (need_resched())
3465 break;
3466
3467 rebalance_domains(balance_cpu, CPU_IDLE);
3468
3469 rq = cpu_rq(balance_cpu);
3470 if (time_after(this_rq->next_balance, rq->next_balance))
3471 this_rq->next_balance = rq->next_balance;
3472 }
3473 }
3474#endif
3475}
3476
3477static inline int on_null_domain(int cpu)
3478{
3479 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
3480}
3481
3482/*
3483 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3484 *
3485 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3486 * idle load balancing owner or decide to stop the periodic load balancing,
3487 * if the whole system is idle.
3488 */
3489static inline void trigger_load_balance(struct rq *rq, int cpu)
3490{
3491#ifdef CONFIG_NO_HZ
3492 /*
3493 * If we were in the nohz mode recently and busy at the current
3494 * scheduler tick, then check if we need to nominate new idle
3495 * load balancer.
3496 */
3497 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3498 rq->in_nohz_recently = 0;
3499
3500 if (atomic_read(&nohz.load_balancer) == cpu) {
3501 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3502 atomic_set(&nohz.load_balancer, -1);
3503 }
3504
3505 if (atomic_read(&nohz.load_balancer) == -1) {
3506 int ilb = find_new_ilb(cpu);
3507
3508 if (ilb < nr_cpu_ids)
3509 resched_cpu(ilb);
3510 }
3511 }
3512
3513 /*
3514 * If this cpu is idle and doing idle load balancing for all the
3515 * cpus with ticks stopped, is it time for that to stop?
3516 */
3517 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3518 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3519 resched_cpu(cpu);
3520 return;
3521 }
3522
3523 /*
3524 * If this cpu is idle and the idle load balancing is done by
3525 * someone else, then no need raise the SCHED_SOFTIRQ
3526 */
3527 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3528 cpumask_test_cpu(cpu, nohz.cpu_mask))
3529 return;
3530#endif
3531 /* Don't need to rebalance while attached to NULL domain */
3532 if (time_after_eq(jiffies, rq->next_balance) &&
3533 likely(!on_null_domain(cpu)))
3534 raise_softirq(SCHED_SOFTIRQ);
3535}
1954 3536
1955static void rq_online_fair(struct rq *rq) 3537static void rq_online_fair(struct rq *rq)
1956{ 3538{
@@ -1962,6 +3544,15 @@ static void rq_offline_fair(struct rq *rq)
1962 update_sysctl(); 3544 update_sysctl();
1963} 3545}
1964 3546
3547#else /* CONFIG_SMP */
3548
3549/*
3550 * on UP we do not need to balance between CPUs:
3551 */
3552static inline void idle_balance(int cpu, struct rq *rq)
3553{
3554}
3555
1965#endif /* CONFIG_SMP */ 3556#endif /* CONFIG_SMP */
1966 3557
1967/* 3558/*
@@ -2076,7 +3667,7 @@ static void moved_group_fair(struct task_struct *p, int on_rq)
2076} 3667}
2077#endif 3668#endif
2078 3669
2079unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 3670static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
2080{ 3671{
2081 struct sched_entity *se = &task->se; 3672 struct sched_entity *se = &task->se;
2082 unsigned int rr_interval = 0; 3673 unsigned int rr_interval = 0;
@@ -2108,8 +3699,6 @@ static const struct sched_class fair_sched_class = {
2108#ifdef CONFIG_SMP 3699#ifdef CONFIG_SMP
2109 .select_task_rq = select_task_rq_fair, 3700 .select_task_rq = select_task_rq_fair,
2110 3701
2111 .load_balance = load_balance_fair,
2112 .move_one_task = move_one_task_fair,
2113 .rq_online = rq_online_fair, 3702 .rq_online = rq_online_fair,
2114 .rq_offline = rq_offline_fair, 3703 .rq_offline = rq_offline_fair,
2115 3704
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 5f93b570d383..a8a6d8a50947 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
44{ 44{
45} 45}
46 46
47#ifdef CONFIG_SMP
48static unsigned long
49load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
50 unsigned long max_load_move,
51 struct sched_domain *sd, enum cpu_idle_type idle,
52 int *all_pinned, int *this_best_prio)
53{
54 return 0;
55}
56
57static int
58move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
59 struct sched_domain *sd, enum cpu_idle_type idle)
60{
61 return 0;
62}
63#endif
64
65static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 47static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
66{ 48{
67} 49}
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 79 check_preempt_curr(rq, p, 0);
98} 80}
99 81
100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 83{
102 return 0; 84 return 0;
103} 85}
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = {
119 101
120#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
121 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
122
123 .load_balance = load_balance_idle,
124 .move_one_task = move_one_task_idle,
125#endif 104#endif
126 105
127 .set_curr_task = set_curr_task_idle, 106 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f48328ac216f..b5b920ae2ea7 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
194 return rt_se->my_q; 194 return rt_se->my_q;
195} 195}
196 196
197static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 197static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
198static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 198static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
199 199
200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 int this_cpu = smp_processor_id();
202 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 203 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
203 struct sched_rt_entity *rt_se = rt_rq->rt_se; 204 struct sched_rt_entity *rt_se;
205
206 rt_se = rt_rq->tg->rt_se[this_cpu];
204 207
205 if (rt_rq->rt_nr_running) { 208 if (rt_rq->rt_nr_running) {
206 if (rt_se && !on_rt_rq(rt_se)) 209 if (rt_se && !on_rt_rq(rt_se))
207 enqueue_rt_entity(rt_se); 210 enqueue_rt_entity(rt_se, false);
208 if (rt_rq->highest_prio.curr < curr->prio) 211 if (rt_rq->highest_prio.curr < curr->prio)
209 resched_task(curr); 212 resched_task(curr);
210 } 213 }
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212 215
213static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 216static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
214{ 217{
215 struct sched_rt_entity *rt_se = rt_rq->rt_se; 218 int this_cpu = smp_processor_id();
219 struct sched_rt_entity *rt_se;
220
221 rt_se = rt_rq->tg->rt_se[this_cpu];
216 222
217 if (rt_se && on_rt_rq(rt_se)) 223 if (rt_se && on_rt_rq(rt_se))
218 dequeue_rt_entity(rt_se); 224 dequeue_rt_entity(rt_se);
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
803 dec_rt_group(rt_se, rt_rq); 809 dec_rt_group(rt_se, rt_rq);
804} 810}
805 811
806static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 812static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
807{ 813{
808 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 814 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
809 struct rt_prio_array *array = &rt_rq->active; 815 struct rt_prio_array *array = &rt_rq->active;
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
819 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
820 return; 826 return;
821 827
822 list_add_tail(&rt_se->run_list, queue); 828 if (head)
829 list_add(&rt_se->run_list, queue);
830 else
831 list_add_tail(&rt_se->run_list, queue);
823 __set_bit(rt_se_prio(rt_se), array->bitmap); 832 __set_bit(rt_se_prio(rt_se), array->bitmap);
824 833
825 inc_rt_tasks(rt_se, rt_rq); 834 inc_rt_tasks(rt_se, rt_rq);
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
856 } 865 }
857} 866}
858 867
859static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 868static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
860{ 869{
861 dequeue_rt_stack(rt_se); 870 dequeue_rt_stack(rt_se);
862 for_each_sched_rt_entity(rt_se) 871 for_each_sched_rt_entity(rt_se)
863 __enqueue_rt_entity(rt_se); 872 __enqueue_rt_entity(rt_se, head);
864} 873}
865 874
866static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 875static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
@@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
871 struct rt_rq *rt_rq = group_rt_rq(rt_se); 880 struct rt_rq *rt_rq = group_rt_rq(rt_se);
872 881
873 if (rt_rq && rt_rq->rt_nr_running) 882 if (rt_rq && rt_rq->rt_nr_running)
874 __enqueue_rt_entity(rt_se); 883 __enqueue_rt_entity(rt_se, false);
875 } 884 }
876} 885}
877 886
878/* 887/*
879 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
880 */ 889 */
881static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
882{ 892{
883 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
884 894
885 if (wakeup) 895 if (wakeup)
886 rt_se->timeout = 0; 896 rt_se->timeout = 0;
887 897
888 enqueue_rt_entity(rt_se); 898 enqueue_rt_entity(rt_se, head);
889 899
890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
891 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
@@ -1136,7 +1146,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1136 if (next && next->prio < idx) 1146 if (next && next->prio < idx)
1137 continue; 1147 continue;
1138 list_for_each_entry(rt_se, array->queue + idx, run_list) { 1148 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1139 struct task_struct *p = rt_task_of(rt_se); 1149 struct task_struct *p;
1150
1151 if (!rt_entity_is_task(rt_se))
1152 continue;
1153
1154 p = rt_task_of(rt_se);
1140 if (pick_rt_task(rq, p, cpu)) { 1155 if (pick_rt_task(rq, p, cpu)) {
1141 next = p; 1156 next = p;
1142 break; 1157 break;
@@ -1481,24 +1496,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1481 push_rt_tasks(rq); 1496 push_rt_tasks(rq);
1482} 1497}
1483 1498
1484static unsigned long
1485load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1486 unsigned long max_load_move,
1487 struct sched_domain *sd, enum cpu_idle_type idle,
1488 int *all_pinned, int *this_best_prio)
1489{
1490 /* don't touch RT tasks */
1491 return 0;
1492}
1493
1494static int
1495move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1496 struct sched_domain *sd, enum cpu_idle_type idle)
1497{
1498 /* don't touch RT tasks */
1499 return 0;
1500}
1501
1502static void set_cpus_allowed_rt(struct task_struct *p, 1499static void set_cpus_allowed_rt(struct task_struct *p,
1503 const struct cpumask *new_mask) 1500 const struct cpumask *new_mask)
1504{ 1501{
@@ -1670,8 +1667,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1670 if (!p->signal) 1667 if (!p->signal)
1671 return; 1668 return;
1672 1669
1673 soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; 1670 /* max may change after cur was read, this will be fixed next tick */
1674 hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; 1671 soft = task_rlimit(p, RLIMIT_RTTIME);
1672 hard = task_rlimit_max(p, RLIMIT_RTTIME);
1675 1673
1676 if (soft != RLIM_INFINITY) { 1674 if (soft != RLIM_INFINITY) {
1677 unsigned long next; 1675 unsigned long next;
@@ -1721,7 +1719,7 @@ static void set_curr_task_rt(struct rq *rq)
1721 dequeue_pushable_task(rq, p); 1719 dequeue_pushable_task(rq, p);
1722} 1720}
1723 1721
1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 1722static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1725{ 1723{
1726 /* 1724 /*
1727 * Time slice is 0 for SCHED_FIFO tasks 1725 * Time slice is 0 for SCHED_FIFO tasks
@@ -1746,8 +1744,6 @@ static const struct sched_class rt_sched_class = {
1746#ifdef CONFIG_SMP 1744#ifdef CONFIG_SMP
1747 .select_task_rq = select_task_rq_rt, 1745 .select_task_rq = select_task_rq_rt,
1748 1746
1749 .load_balance = load_balance_rt,
1750 .move_one_task = move_one_task_rt,
1751 .set_cpus_allowed = set_cpus_allowed_rt, 1747 .set_cpus_allowed = set_cpus_allowed_rt,
1752 .rq_online = rq_online_rt, 1748 .rq_online = rq_online_rt,
1753 .rq_offline = rq_offline_rt, 1749 .rq_offline = rq_offline_rt,
diff --git a/kernel/signal.c b/kernel/signal.c
index d09692b40376..dbd7fe073c55 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -159,6 +159,10 @@ void recalc_sigpending(void)
159 159
160/* Given the mask, find the first available signal that should be serviced. */ 160/* Given the mask, find the first available signal that should be serviced. */
161 161
162#define SYNCHRONOUS_MASK \
163 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
164 sigmask(SIGTRAP) | sigmask(SIGFPE))
165
162int next_signal(struct sigpending *pending, sigset_t *mask) 166int next_signal(struct sigpending *pending, sigset_t *mask)
163{ 167{
164 unsigned long i, *s, *m, x; 168 unsigned long i, *s, *m, x;
@@ -166,26 +170,39 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
166 170
167 s = pending->signal.sig; 171 s = pending->signal.sig;
168 m = mask->sig; 172 m = mask->sig;
173
174 /*
175 * Handle the first word specially: it contains the
176 * synchronous signals that need to be dequeued first.
177 */
178 x = *s &~ *m;
179 if (x) {
180 if (x & SYNCHRONOUS_MASK)
181 x &= SYNCHRONOUS_MASK;
182 sig = ffz(~x) + 1;
183 return sig;
184 }
185
169 switch (_NSIG_WORDS) { 186 switch (_NSIG_WORDS) {
170 default: 187 default:
171 for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) 188 for (i = 1; i < _NSIG_WORDS; ++i) {
172 if ((x = *s &~ *m) != 0) { 189 x = *++s &~ *++m;
173 sig = ffz(~x) + i*_NSIG_BPW + 1; 190 if (!x)
174 break; 191 continue;
175 } 192 sig = ffz(~x) + i*_NSIG_BPW + 1;
193 break;
194 }
176 break; 195 break;
177 196
178 case 2: if ((x = s[0] &~ m[0]) != 0) 197 case 2:
179 sig = 1; 198 x = s[1] &~ m[1];
180 else if ((x = s[1] &~ m[1]) != 0) 199 if (!x)
181 sig = _NSIG_BPW + 1;
182 else
183 break; 200 break;
184 sig += ffz(~x); 201 sig = ffz(~x) + _NSIG_BPW + 1;
185 break; 202 break;
186 203
187 case 1: if ((x = *s &~ *m) != 0) 204 case 1:
188 sig = ffz(~x) + 1; 205 /* Nothing to do */
189 break; 206 break;
190 } 207 }
191 208
@@ -228,7 +245,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
228 245
229 if (override_rlimit || 246 if (override_rlimit ||
230 atomic_read(&user->sigpending) <= 247 atomic_read(&user->sigpending) <=
231 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { 248 task_rlimit(t, RLIMIT_SIGPENDING)) {
232 q = kmem_cache_alloc(sigqueue_cachep, flags); 249 q = kmem_cache_alloc(sigqueue_cachep, flags);
233 } else { 250 } else {
234 print_dropped_signal(sig); 251 print_dropped_signal(sig);
@@ -979,7 +996,8 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
979 for (i = 0; i < 16; i++) { 996 for (i = 0; i < 16; i++) {
980 unsigned char insn; 997 unsigned char insn;
981 998
982 __get_user(insn, (unsigned char *)(regs->ip + i)); 999 if (get_user(insn, (unsigned char *)(regs->ip + i)))
1000 break;
983 printk("%02x ", insn); 1001 printk("%02x ", insn);
984 } 1002 }
985 } 1003 }
diff --git a/kernel/smp.c b/kernel/smp.c
index de735a6637d0..9867b6bfefce 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -12,8 +12,6 @@
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/cpu.h> 13#include <linux/cpu.h>
14 14
15static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16
17static struct { 15static struct {
18 struct list_head queue; 16 struct list_head queue;
19 raw_spinlock_t lock; 17 raw_spinlock_t lock;
@@ -33,12 +31,14 @@ struct call_function_data {
33 cpumask_var_t cpumask; 31 cpumask_var_t cpumask;
34}; 32};
35 33
34static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
35
36struct call_single_queue { 36struct call_single_queue {
37 struct list_head list; 37 struct list_head list;
38 raw_spinlock_t lock; 38 raw_spinlock_t lock;
39}; 39};
40 40
41static DEFINE_PER_CPU(struct call_function_data, cfd_data); 41static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue);
42 42
43static int 43static int
44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -256,7 +256,7 @@ void generic_smp_call_function_single_interrupt(void)
256 } 256 }
257} 257}
258 258
259static DEFINE_PER_CPU(struct call_single_data, csd_data); 259static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
260 260
261/* 261/*
262 * smp_call_function_single - Run a function on a specific CPU 262 * smp_call_function_single - Run a function on a specific CPU
@@ -347,7 +347,7 @@ int smp_call_function_any(const struct cpumask *mask,
347 goto call; 347 goto call;
348 348
349 /* Try for same node. */ 349 /* Try for same node. */
350 nodemask = cpumask_of_node(cpu); 350 nodemask = cpumask_of_node(cpu_to_node(cpu));
351 for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; 351 for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
352 cpu = cpumask_next_and(cpu, nodemask, mask)) { 352 cpu = cpumask_next_and(cpu, nodemask, mask)) {
353 if (cpu_online(cpu)) 353 if (cpu_online(cpu))
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a09502e2ef75..7c1a67ef0274 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill);
500 */ 500 */
501 501
502/* 502/*
503 * The trampoline is called when the hrtimer expires. If this is 503 * The trampoline is called when the hrtimer expires. It schedules a tasklet
504 * called from the hrtimer interrupt then we schedule the tasklet as 504 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
505 * the timer callback function expects to run in softirq context. If 505 * hrtimer callback, but from softirq context.
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */ 506 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) 507static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{ 508{
511 struct tasklet_hrtimer *ttimer = 509 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer); 510 container_of(timer, struct tasklet_hrtimer, timer);
513 511
514 if (hrtimer_is_hres_active(timer)) { 512 tasklet_hi_schedule(&ttimer->tasklet);
515 tasklet_hi_schedule(&ttimer->tasklet); 513 return HRTIMER_NORESTART;
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519} 514}
520 515
521/* 516/*
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d22579087e27..0d4c7898ab80 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock);
25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ 25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ 26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
28 29
29static int __read_mostly did_panic; 30static int __read_mostly did_panic;
30int __read_mostly softlockup_thresh = 60; 31int __read_mostly softlockup_thresh = 60;
@@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void)
79} 80}
80EXPORT_SYMBOL(touch_softlockup_watchdog); 81EXPORT_SYMBOL(touch_softlockup_watchdog);
81 82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
82void touch_all_softlockup_watchdogs(void) 89void touch_all_softlockup_watchdogs(void)
83{ 90{
84 int cpu; 91 int cpu;
@@ -118,6 +125,14 @@ void softlockup_tick(void)
118 } 125 }
119 126
120 if (touch_ts == 0) { 127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
121 __touch_softlockup_watchdog(); 136 __touch_softlockup_watchdog();
122 return; 137 return;
123 } 138 }
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 818d7d9aa03c..bde4295774c8 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -34,6 +34,30 @@
34#include <linux/smp.h> 34#include <linux/smp.h>
35#include <linux/srcu.h> 35#include <linux/srcu.h>
36 36
37static int init_srcu_struct_fields(struct srcu_struct *sp)
38{
39 sp->completed = 0;
40 mutex_init(&sp->mutex);
41 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
42 return sp->per_cpu_ref ? 0 : -ENOMEM;
43}
44
45#ifdef CONFIG_DEBUG_LOCK_ALLOC
46
47int __init_srcu_struct(struct srcu_struct *sp, const char *name,
48 struct lock_class_key *key)
49{
50#ifdef CONFIG_DEBUG_LOCK_ALLOC
51 /* Don't re-initialize a lock while it is held. */
52 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
53 lockdep_init_map(&sp->dep_map, name, key, 0);
54#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
55 return init_srcu_struct_fields(sp);
56}
57EXPORT_SYMBOL_GPL(__init_srcu_struct);
58
59#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
60
37/** 61/**
38 * init_srcu_struct - initialize a sleep-RCU structure 62 * init_srcu_struct - initialize a sleep-RCU structure
39 * @sp: structure to initialize. 63 * @sp: structure to initialize.
@@ -44,13 +68,12 @@
44 */ 68 */
45int init_srcu_struct(struct srcu_struct *sp) 69int init_srcu_struct(struct srcu_struct *sp)
46{ 70{
47 sp->completed = 0; 71 return init_srcu_struct_fields(sp);
48 mutex_init(&sp->mutex);
49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
50 return (sp->per_cpu_ref ? 0 : -ENOMEM);
51} 72}
52EXPORT_SYMBOL_GPL(init_srcu_struct); 73EXPORT_SYMBOL_GPL(init_srcu_struct);
53 74
75#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
76
54/* 77/*
55 * srcu_readers_active_idx -- returns approximate number of readers 78 * srcu_readers_active_idx -- returns approximate number of readers
56 * active on the specified rank of per-CPU counters. 79 * active on the specified rank of per-CPU counters.
@@ -100,15 +123,12 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
100} 123}
101EXPORT_SYMBOL_GPL(cleanup_srcu_struct); 124EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
102 125
103/** 126/*
104 * srcu_read_lock - register a new reader for an SRCU-protected structure.
105 * @sp: srcu_struct in which to register the new reader.
106 *
107 * Counts the new reader in the appropriate per-CPU element of the 127 * Counts the new reader in the appropriate per-CPU element of the
108 * srcu_struct. Must be called from process context. 128 * srcu_struct. Must be called from process context.
109 * Returns an index that must be passed to the matching srcu_read_unlock(). 129 * Returns an index that must be passed to the matching srcu_read_unlock().
110 */ 130 */
111int srcu_read_lock(struct srcu_struct *sp) 131int __srcu_read_lock(struct srcu_struct *sp)
112{ 132{
113 int idx; 133 int idx;
114 134
@@ -120,31 +140,27 @@ int srcu_read_lock(struct srcu_struct *sp)
120 preempt_enable(); 140 preempt_enable();
121 return idx; 141 return idx;
122} 142}
123EXPORT_SYMBOL_GPL(srcu_read_lock); 143EXPORT_SYMBOL_GPL(__srcu_read_lock);
124 144
125/** 145/*
126 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
127 * @sp: srcu_struct in which to unregister the old reader.
128 * @idx: return value from corresponding srcu_read_lock().
129 *
130 * Removes the count for the old reader from the appropriate per-CPU 146 * Removes the count for the old reader from the appropriate per-CPU
131 * element of the srcu_struct. Note that this may well be a different 147 * element of the srcu_struct. Note that this may well be a different
132 * CPU than that which was incremented by the corresponding srcu_read_lock(). 148 * CPU than that which was incremented by the corresponding srcu_read_lock().
133 * Must be called from process context. 149 * Must be called from process context.
134 */ 150 */
135void srcu_read_unlock(struct srcu_struct *sp, int idx) 151void __srcu_read_unlock(struct srcu_struct *sp, int idx)
136{ 152{
137 preempt_disable(); 153 preempt_disable();
138 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 154 srcu_barrier(); /* ensure compiler won't misorder critical section. */
139 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 155 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
140 preempt_enable(); 156 preempt_enable();
141} 157}
142EXPORT_SYMBOL_GPL(srcu_read_unlock); 158EXPORT_SYMBOL_GPL(__srcu_read_unlock);
143 159
144/* 160/*
145 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 161 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
146 */ 162 */
147void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 163static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
148{ 164{
149 int idx; 165 int idx;
150 166
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 912823e2a11b..9bb9fb1bd79c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -45,7 +45,7 @@ static int refcount;
45static struct workqueue_struct *stop_machine_wq; 45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle; 46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus; 47static const struct cpumask *active_cpus;
48static void *stop_machine_work; 48static void __percpu *stop_machine_work;
49 49
50static void set_state(enum stopmachine_state newstate) 50static void set_state(enum stopmachine_state newstate)
51{ 51{
diff --git a/kernel/sys.c b/kernel/sys.c
index 26a6b73a6b85..8298878f4f71 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -33,6 +33,7 @@
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/personality.h>
36#include <linux/ptrace.h> 37#include <linux/ptrace.h>
37#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
38 39
@@ -222,6 +223,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
222 if (which > PRIO_USER || which < PRIO_PROCESS) 223 if (which > PRIO_USER || which < PRIO_PROCESS)
223 return -EINVAL; 224 return -EINVAL;
224 225
226 rcu_read_lock();
225 read_lock(&tasklist_lock); 227 read_lock(&tasklist_lock);
226 switch (which) { 228 switch (which) {
227 case PRIO_PROCESS: 229 case PRIO_PROCESS:
@@ -267,6 +269,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
267 } 269 }
268out_unlock: 270out_unlock:
269 read_unlock(&tasklist_lock); 271 read_unlock(&tasklist_lock);
272 rcu_read_unlock();
270 273
271 return retval; 274 return retval;
272} 275}
@@ -569,13 +572,7 @@ static int set_user(struct cred *new)
569 if (!new_user) 572 if (!new_user)
570 return -EAGAIN; 573 return -EAGAIN;
571 574
572 if (!task_can_switch_user(new_user, current)) { 575 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
573 free_uid(new_user);
574 return -EINVAL;
575 }
576
577 if (atomic_read(&new_user->processes) >=
578 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
579 new_user != INIT_USER) { 576 new_user != INIT_USER) {
580 free_uid(new_user); 577 free_uid(new_user);
581 return -EAGAIN; 578 return -EAGAIN;
@@ -1118,6 +1115,15 @@ out:
1118 1115
1119DECLARE_RWSEM(uts_sem); 1116DECLARE_RWSEM(uts_sem);
1120 1117
1118#ifdef COMPAT_UTS_MACHINE
1119#define override_architecture(name) \
1120 (current->personality == PER_LINUX32 && \
1121 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
1122 sizeof(COMPAT_UTS_MACHINE)))
1123#else
1124#define override_architecture(name) 0
1125#endif
1126
1121SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1127SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1122{ 1128{
1123 int errno = 0; 1129 int errno = 0;
@@ -1126,9 +1132,66 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1126 if (copy_to_user(name, utsname(), sizeof *name)) 1132 if (copy_to_user(name, utsname(), sizeof *name))
1127 errno = -EFAULT; 1133 errno = -EFAULT;
1128 up_read(&uts_sem); 1134 up_read(&uts_sem);
1135
1136 if (!errno && override_architecture(name))
1137 errno = -EFAULT;
1129 return errno; 1138 return errno;
1130} 1139}
1131 1140
1141#ifdef __ARCH_WANT_SYS_OLD_UNAME
1142/*
1143 * Old cruft
1144 */
1145SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
1146{
1147 int error = 0;
1148
1149 if (!name)
1150 return -EFAULT;
1151
1152 down_read(&uts_sem);
1153 if (copy_to_user(name, utsname(), sizeof(*name)))
1154 error = -EFAULT;
1155 up_read(&uts_sem);
1156
1157 if (!error && override_architecture(name))
1158 error = -EFAULT;
1159 return error;
1160}
1161
1162SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
1163{
1164 int error;
1165
1166 if (!name)
1167 return -EFAULT;
1168 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
1169 return -EFAULT;
1170
1171 down_read(&uts_sem);
1172 error = __copy_to_user(&name->sysname, &utsname()->sysname,
1173 __OLD_UTS_LEN);
1174 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
1175 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
1176 __OLD_UTS_LEN);
1177 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
1178 error |= __copy_to_user(&name->release, &utsname()->release,
1179 __OLD_UTS_LEN);
1180 error |= __put_user(0, name->release + __OLD_UTS_LEN);
1181 error |= __copy_to_user(&name->version, &utsname()->version,
1182 __OLD_UTS_LEN);
1183 error |= __put_user(0, name->version + __OLD_UTS_LEN);
1184 error |= __copy_to_user(&name->machine, &utsname()->machine,
1185 __OLD_UTS_LEN);
1186 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
1187 up_read(&uts_sem);
1188
1189 if (!error && override_architecture(name))
1190 error = -EFAULT;
1191 return error ? -EFAULT : 0;
1192}
1193#endif
1194
1132SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) 1195SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1133{ 1196{
1134 int errno; 1197 int errno;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 695384f12a7d..70f2ea758ffe 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -126,6 +126,7 @@ cond_syscall(sys_setreuid16);
126cond_syscall(sys_setuid16); 126cond_syscall(sys_setuid16);
127cond_syscall(sys_vm86old); 127cond_syscall(sys_vm86old);
128cond_syscall(sys_vm86); 128cond_syscall(sys_vm86);
129cond_syscall(sys_ipc);
129cond_syscall(compat_sys_ipc); 130cond_syscall(compat_sys_ipc);
130cond_syscall(compat_sys_sysctl); 131cond_syscall(compat_sys_sysctl);
131cond_syscall(sys_flock); 132cond_syscall(sys_flock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8a68b2448468..8686b0f5fc12 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,6 +23,7 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/signal.h>
26#include <linux/proc_fs.h> 27#include <linux/proc_fs.h>
27#include <linux/security.h> 28#include <linux/security.h>
28#include <linux/ctype.h> 29#include <linux/ctype.h>
@@ -50,6 +51,7 @@
50#include <linux/ftrace.h> 51#include <linux/ftrace.h>
51#include <linux/slow-work.h> 52#include <linux/slow-work.h>
52#include <linux/perf_event.h> 53#include <linux/perf_event.h>
54#include <linux/kprobes.h>
53 55
54#include <asm/uaccess.h> 56#include <asm/uaccess.h>
55#include <asm/processor.h> 57#include <asm/processor.h>
@@ -59,13 +61,23 @@
59#include <asm/stacktrace.h> 61#include <asm/stacktrace.h>
60#include <asm/io.h> 62#include <asm/io.h>
61#endif 63#endif
64#ifdef CONFIG_BSD_PROCESS_ACCT
65#include <linux/acct.h>
66#endif
67#ifdef CONFIG_RT_MUTEXES
68#include <linux/rtmutex.h>
69#endif
70#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
71#include <linux/lockdep.h>
72#endif
73#ifdef CONFIG_CHR_DEV_SG
74#include <scsi/sg.h>
75#endif
62 76
63 77
64#if defined(CONFIG_SYSCTL) 78#if defined(CONFIG_SYSCTL)
65 79
66/* External variables not in a header file. */ 80/* External variables not in a header file. */
67extern int C_A_D;
68extern int print_fatal_signals;
69extern int sysctl_overcommit_memory; 81extern int sysctl_overcommit_memory;
70extern int sysctl_overcommit_ratio; 82extern int sysctl_overcommit_ratio;
71extern int sysctl_panic_on_oom; 83extern int sysctl_panic_on_oom;
@@ -87,9 +99,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
87#ifndef CONFIG_MMU 99#ifndef CONFIG_MMU
88extern int sysctl_nr_trim_pages; 100extern int sysctl_nr_trim_pages;
89#endif 101#endif
90#ifdef CONFIG_RCU_TORTURE_TEST
91extern int rcutorture_runnable;
92#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
93#ifdef CONFIG_BLOCK 102#ifdef CONFIG_BLOCK
94extern int blk_iopoll_enabled; 103extern int blk_iopoll_enabled;
95#endif 104#endif
@@ -119,14 +128,6 @@ static int min_percpu_pagelist_fract = 8;
119 128
120static int ngroups_max = NGROUPS_MAX; 129static int ngroups_max = NGROUPS_MAX;
121 130
122#ifdef CONFIG_MODULES
123extern char modprobe_path[];
124extern int modules_disabled;
125#endif
126#ifdef CONFIG_CHR_DEV_SG
127extern int sg_big_buff;
128#endif
129
130#ifdef CONFIG_SPARC 131#ifdef CONFIG_SPARC
131#include <asm/system.h> 132#include <asm/system.h>
132#endif 133#endif
@@ -148,10 +149,6 @@ extern int sysctl_userprocess_debug;
148extern int spin_retry; 149extern int spin_retry;
149#endif 150#endif
150 151
151#ifdef CONFIG_BSD_PROCESS_ACCT
152extern int acct_parm[];
153#endif
154
155#ifdef CONFIG_IA64 152#ifdef CONFIG_IA64
156extern int no_unaligned_warning; 153extern int no_unaligned_warning;
157extern int unaligned_dump_stack; 154extern int unaligned_dump_stack;
@@ -159,10 +156,6 @@ extern int unaligned_dump_stack;
159 156
160extern struct ratelimit_state printk_ratelimit_state; 157extern struct ratelimit_state printk_ratelimit_state;
161 158
162#ifdef CONFIG_RT_MUTEXES
163extern int max_lock_depth;
164#endif
165
166#ifdef CONFIG_PROC_SYSCTL 159#ifdef CONFIG_PROC_SYSCTL
167static int proc_do_cad_pid(struct ctl_table *table, int write, 160static int proc_do_cad_pid(struct ctl_table *table, int write,
168 void __user *buffer, size_t *lenp, loff_t *ppos); 161 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -201,9 +194,6 @@ extern struct ctl_table epoll_table[];
201int sysctl_legacy_va_layout; 194int sysctl_legacy_va_layout;
202#endif 195#endif
203 196
204extern int prove_locking;
205extern int lock_stat;
206
207/* The default sysctl tables: */ 197/* The default sysctl tables: */
208 198
209static struct ctl_table root_table[] = { 199static struct ctl_table root_table[] = {
@@ -1441,7 +1431,7 @@ static struct ctl_table fs_table[] = {
1441}; 1431};
1442 1432
1443static struct ctl_table debug_table[] = { 1433static struct ctl_table debug_table[] = {
1444#if defined(CONFIG_X86) || defined(CONFIG_PPC) 1434#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC)
1445 { 1435 {
1446 .procname = "exception-trace", 1436 .procname = "exception-trace",
1447 .data = &show_unhandled_signals, 1437 .data = &show_unhandled_signals,
@@ -1450,6 +1440,17 @@ static struct ctl_table debug_table[] = {
1450 .proc_handler = proc_dointvec 1440 .proc_handler = proc_dointvec
1451 }, 1441 },
1452#endif 1442#endif
1443#if defined(CONFIG_OPTPROBES)
1444 {
1445 .procname = "kprobes-optimization",
1446 .data = &sysctl_kprobes_optimization,
1447 .maxlen = sizeof(int),
1448 .mode = 0644,
1449 .proc_handler = proc_kprobes_optimization_handler,
1450 .extra1 = &zero,
1451 .extra2 = &one,
1452 },
1453#endif
1453 { } 1454 { }
1454}; 1455};
1455 1456
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 112533d5fc08..8cd50d8f9bde 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1331,7 +1331,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1331 ssize_t result; 1331 ssize_t result;
1332 char *pathname; 1332 char *pathname;
1333 int flags; 1333 int flags;
1334 int acc_mode, fmode; 1334 int acc_mode;
1335 1335
1336 pathname = sysctl_getname(name, nlen, &table); 1336 pathname = sysctl_getname(name, nlen, &table);
1337 result = PTR_ERR(pathname); 1337 result = PTR_ERR(pathname);
@@ -1342,15 +1342,12 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1342 if (oldval && oldlen && newval && newlen) { 1342 if (oldval && oldlen && newval && newlen) {
1343 flags = O_RDWR; 1343 flags = O_RDWR;
1344 acc_mode = MAY_READ | MAY_WRITE; 1344 acc_mode = MAY_READ | MAY_WRITE;
1345 fmode = FMODE_READ | FMODE_WRITE;
1346 } else if (newval && newlen) { 1345 } else if (newval && newlen) {
1347 flags = O_WRONLY; 1346 flags = O_WRONLY;
1348 acc_mode = MAY_WRITE; 1347 acc_mode = MAY_WRITE;
1349 fmode = FMODE_WRITE;
1350 } else if (oldval && oldlen) { 1348 } else if (oldval && oldlen) {
1351 flags = O_RDONLY; 1349 flags = O_RDONLY;
1352 acc_mode = MAY_READ; 1350 acc_mode = MAY_READ;
1353 fmode = FMODE_READ;
1354 } else { 1351 } else {
1355 result = 0; 1352 result = 0;
1356 goto out_putname; 1353 goto out_putname;
@@ -1361,7 +1358,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1361 if (result) 1358 if (result)
1362 goto out_putname; 1359 goto out_putname;
1363 1360
1364 result = may_open(&nd.path, acc_mode, fmode); 1361 result = may_open(&nd.path, acc_mode, flags);
1365 if (result) 1362 if (result)
1366 goto out_putpath; 1363 goto out_putpath;
1367 1364
@@ -1417,6 +1414,35 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
1417 return; 1414 return;
1418} 1415}
1419 1416
1417#define WARN_ONCE_HASH_BITS 8
1418#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS)
1419
1420static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE);
1421
1422#define FNV32_OFFSET 2166136261U
1423#define FNV32_PRIME 0x01000193
1424
1425/*
1426 * Print each legacy sysctl (approximately) only once.
1427 * To avoid making the tables non-const use a external
1428 * hash-table instead.
1429 * Worst case hash collision: 6, but very rarely.
1430 * NOTE! We don't use the SMP-safe bit tests. We simply
1431 * don't care enough.
1432 */
1433static void warn_on_bintable(const int *name, int nlen)
1434{
1435 int i;
1436 u32 hash = FNV32_OFFSET;
1437
1438 for (i = 0; i < nlen; i++)
1439 hash = (hash ^ name[i]) * FNV32_PRIME;
1440 hash %= WARN_ONCE_HASH_SIZE;
1441 if (__test_and_set_bit(hash, warn_once_bitmap))
1442 return;
1443 deprecated_sysctl_warning(name, nlen);
1444}
1445
1420static ssize_t do_sysctl(int __user *args_name, int nlen, 1446static ssize_t do_sysctl(int __user *args_name, int nlen,
1421 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1447 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1422{ 1448{
@@ -1431,7 +1457,7 @@ static ssize_t do_sysctl(int __user *args_name, int nlen,
1431 if (get_user(name[i], args_name + i)) 1457 if (get_user(name[i], args_name + i))
1432 return -EFAULT; 1458 return -EFAULT;
1433 1459
1434 deprecated_sysctl_warning(name, nlen); 1460 warn_on_bintable(name, nlen);
1435 1461
1436 return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); 1462 return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
1437} 1463}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea8384d3caa7..899ca51be5e8 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -46,15 +46,13 @@ static struct genl_family family = {
46 .maxattr = TASKSTATS_CMD_ATTR_MAX, 46 .maxattr = TASKSTATS_CMD_ATTR_MAX,
47}; 47};
48 48
49static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] 49static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
50__read_mostly = {
51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 50 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 51 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 52 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 53 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
55 54
56static struct nla_policy 55static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
57cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 56 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
59}; 57};
60 58
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 6f740d9f0948..d7395fdfb9f3 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -259,7 +259,8 @@ void clockevents_notify(unsigned long reason, void *arg)
259 cpu = *((int *)arg); 259 cpu = *((int *)arg);
260 list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { 260 list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
261 if (cpumask_test_cpu(cpu, dev->cpumask) && 261 if (cpumask_test_cpu(cpu, dev->cpumask) &&
262 cpumask_weight(dev->cpumask) == 1) { 262 cpumask_weight(dev->cpumask) == 1 &&
263 !tick_is_broadcast_device(dev)) {
263 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 264 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
264 list_del(&dev->list); 265 list_del(&dev->list);
265 } 266 }
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e85c23404d34..1f5dde637457 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -343,7 +343,19 @@ static void clocksource_resume_watchdog(void)
343{ 343{
344 unsigned long flags; 344 unsigned long flags;
345 345
346 spin_lock_irqsave(&watchdog_lock, flags); 346 /*
347 * We use trylock here to avoid a potential dead lock when
348 * kgdb calls this code after the kernel has been stopped with
349 * watchdog_lock held. When watchdog_lock is held we just
350 * return and accept, that the watchdog might trigger and mark
351 * the monitored clock source (usually TSC) unstable.
352 *
353 * This does not affect the other caller clocksource_resume()
354 * because at this point the kernel is UP, interrupts are
355 * disabled and nothing can hold watchdog_lock.
356 */
357 if (!spin_trylock_irqsave(&watchdog_lock, flags))
358 return;
347 clocksource_reset_watchdog(); 359 clocksource_reset_watchdog();
348 spin_unlock_irqrestore(&watchdog_lock, flags); 360 spin_unlock_irqrestore(&watchdog_lock, flags);
349} 361}
@@ -441,6 +453,18 @@ static inline int clocksource_watchdog_kthread(void *data) { return 0; }
441#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 453#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
442 454
443/** 455/**
456 * clocksource_suspend - suspend the clocksource(s)
457 */
458void clocksource_suspend(void)
459{
460 struct clocksource *cs;
461
462 list_for_each_entry_reverse(cs, &clocksource_list, list)
463 if (cs->suspend)
464 cs->suspend(cs);
465}
466
467/**
444 * clocksource_resume - resume the clocksource(s) 468 * clocksource_resume - resume the clocksource(s)
445 */ 469 */
446void clocksource_resume(void) 470void clocksource_resume(void)
@@ -449,7 +473,7 @@ void clocksource_resume(void)
449 473
450 list_for_each_entry(cs, &clocksource_list, list) 474 list_for_each_entry(cs, &clocksource_list, list)
451 if (cs->resume) 475 if (cs->resume)
452 cs->resume(); 476 cs->resume(cs);
453 477
454 clocksource_resume_watchdog(); 478 clocksource_resume_watchdog();
455} 479}
@@ -458,8 +482,8 @@ void clocksource_resume(void)
458 * clocksource_touch_watchdog - Update watchdog 482 * clocksource_touch_watchdog - Update watchdog
459 * 483 *
460 * Update the watchdog after exception contexts such as kgdb so as not 484 * Update the watchdog after exception contexts such as kgdb so as not
461 * to incorrectly trip the watchdog. 485 * to incorrectly trip the watchdog. This might fail when the kernel
462 * 486 * was stopped in code which holds watchdog_lock.
463 */ 487 */
464void clocksource_touch_watchdog(void) 488void clocksource_touch_watchdog(void)
465{ 489{
@@ -568,6 +592,10 @@ static inline void clocksource_select(void) { }
568 */ 592 */
569static int __init clocksource_done_booting(void) 593static int __init clocksource_done_booting(void)
570{ 594{
595 mutex_lock(&clocksource_mutex);
596 curr_clocksource = clocksource_default_clock();
597 mutex_unlock(&clocksource_mutex);
598
571 finished_booting = 1; 599 finished_booting = 1;
572 600
573 /* 601 /*
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4800f933910e..7c0f180d6e9d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -58,10 +58,10 @@ static s64 time_offset;
58static long time_constant = 2; 58static long time_constant = 2;
59 59
60/* maximum error (usecs): */ 60/* maximum error (usecs): */
61long time_maxerror = NTP_PHASE_LIMIT; 61static long time_maxerror = NTP_PHASE_LIMIT;
62 62
63/* estimated error (usecs): */ 63/* estimated error (usecs): */
64long time_esterror = NTP_PHASE_LIMIT; 64static long time_esterror = NTP_PHASE_LIMIT;
65 65
66/* frequency offset (scaled nsecs/secs): */ 66/* frequency offset (scaled nsecs/secs): */
67static s64 time_freq; 67static s64 time_freq;
@@ -142,11 +142,11 @@ static void ntp_update_offset(long offset)
142 * Select how the frequency is to be controlled 142 * Select how the frequency is to be controlled
143 * and in which mode (PLL or FLL). 143 * and in which mode (PLL or FLL).
144 */ 144 */
145 secs = xtime.tv_sec - time_reftime; 145 secs = get_seconds() - time_reftime;
146 if (unlikely(time_status & STA_FREQHOLD)) 146 if (unlikely(time_status & STA_FREQHOLD))
147 secs = 0; 147 secs = 0;
148 148
149 time_reftime = xtime.tv_sec; 149 time_reftime = get_seconds();
150 150
151 offset64 = offset; 151 offset64 = offset;
152 freq_adj = (offset64 * secs) << 152 freq_adj = (offset64 * secs) <<
@@ -368,7 +368,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
368 * reference time to current time. 368 * reference time to current time.
369 */ 369 */
370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) 370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
371 time_reftime = xtime.tv_sec; 371 time_reftime = get_seconds();
372 372
373 /* only set allowed bits */ 373 /* only set allowed bits */
374 time_status &= STA_RONLY; 374 time_status &= STA_RONLY;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7faaa32fbf4f..16736379a9ca 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -622,6 +622,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
622 write_sequnlock_irqrestore(&xtime_lock, flags); 622 write_sequnlock_irqrestore(&xtime_lock, flags);
623 623
624 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 624 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
625 clocksource_suspend();
625 626
626 return 0; 627 return 0;
627} 628}
@@ -880,6 +881,7 @@ void getboottime(struct timespec *ts)
880 881
881 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 882 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
882} 883}
884EXPORT_SYMBOL_GPL(getboottime);
883 885
884/** 886/**
885 * monotonic_to_bootbased - Convert the monotonic time to boot based. 887 * monotonic_to_bootbased - Convert the monotonic time to boot based.
@@ -889,6 +891,7 @@ void monotonic_to_bootbased(struct timespec *ts)
889{ 891{
890 *ts = timespec_add_safe(*ts, total_sleep_time); 892 *ts = timespec_add_safe(*ts, total_sleep_time);
891} 893}
894EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
892 895
893unsigned long get_seconds(void) 896unsigned long get_seconds(void)
894{ 897{
diff --git a/kernel/timer.c b/kernel/timer.c
index 15533b792397..c61a7949387f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1198,6 +1198,7 @@ void update_process_times(int user_tick)
1198 run_local_timers(); 1198 run_local_timers();
1199 rcu_check_callbacks(cpu, user_tick); 1199 rcu_check_callbacks(cpu, user_tick);
1200 printk_tick(); 1200 printk_tick();
1201 perf_event_do_pending();
1201 scheduler_tick(); 1202 scheduler_tick();
1202 run_posix_cpu_timers(p); 1203 run_posix_cpu_timers(p);
1203} 1204}
@@ -1209,8 +1210,6 @@ static void run_timer_softirq(struct softirq_action *h)
1209{ 1210{
1210 struct tvec_base *base = __get_cpu_var(tvec_bases); 1211 struct tvec_base *base = __get_cpu_var(tvec_bases);
1211 1212
1212 perf_event_do_pending();
1213
1214 hrtimer_run_pending(); 1213 hrtimer_run_pending();
1215 1214
1216 if (time_after_eq(jiffies, base->timer_jiffies)) 1215 if (time_after_eq(jiffies, base->timer_jiffies))
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d006554888dc..13e13d428cd3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -12,39 +12,37 @@ config NOP_TRACER
12config HAVE_FTRACE_NMI_ENTER 12config HAVE_FTRACE_NMI_ENTER
13 bool 13 bool
14 help 14 help
15 See Documentation/trace/ftrace-implementation.txt 15 See Documentation/trace/ftrace-design.txt
16 16
17config HAVE_FUNCTION_TRACER 17config HAVE_FUNCTION_TRACER
18 bool 18 bool
19 help 19 help
20 See Documentation/trace/ftrace-implementation.txt 20 See Documentation/trace/ftrace-design.txt
21 21
22config HAVE_FUNCTION_GRAPH_TRACER 22config HAVE_FUNCTION_GRAPH_TRACER
23 bool 23 bool
24 help 24 help
25 See Documentation/trace/ftrace-implementation.txt 25 See Documentation/trace/ftrace-design.txt
26 26
27config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
28 bool 28 bool
29 help 29 help
30 An arch may pass in a unique value (frame pointer) to both the 30 See Documentation/trace/ftrace-design.txt
31 entering and exiting of a function. On exit, the value is compared
32 and if it does not match, then it will panic the kernel.
33 31
34config HAVE_FUNCTION_TRACE_MCOUNT_TEST 32config HAVE_FUNCTION_TRACE_MCOUNT_TEST
35 bool 33 bool
36 help 34 help
37 See Documentation/trace/ftrace-implementation.txt 35 See Documentation/trace/ftrace-design.txt
38 36
39config HAVE_DYNAMIC_FTRACE 37config HAVE_DYNAMIC_FTRACE
40 bool 38 bool
41 help 39 help
42 See Documentation/trace/ftrace-implementation.txt 40 See Documentation/trace/ftrace-design.txt
43 41
44config HAVE_FTRACE_MCOUNT_RECORD 42config HAVE_FTRACE_MCOUNT_RECORD
45 bool 43 bool
46 help 44 help
47 See Documentation/trace/ftrace-implementation.txt 45 See Documentation/trace/ftrace-design.txt
48 46
49config HAVE_HW_BRANCH_TRACER 47config HAVE_HW_BRANCH_TRACER
50 bool 48 bool
@@ -52,7 +50,7 @@ config HAVE_HW_BRANCH_TRACER
52config HAVE_SYSCALL_TRACEPOINTS 50config HAVE_SYSCALL_TRACEPOINTS
53 bool 51 bool
54 help 52 help
55 See Documentation/trace/ftrace-implementation.txt 53 See Documentation/trace/ftrace-design.txt
56 54
57config TRACER_MAX_TRACE 55config TRACER_MAX_TRACE
58 bool 56 bool
@@ -83,7 +81,7 @@ config RING_BUFFER_ALLOW_SWAP
83# This allows those options to appear when no other tracer is selected. But the 81# This allows those options to appear when no other tracer is selected. But the
84# options do not appear when something else selects it. We need the two options 82# options do not appear when something else selects it. We need the two options
85# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the 83# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
86# hidding of the automatic options. 84# hiding of the automatic options.
87 85
88config TRACING 86config TRACING
89 bool 87 bool
@@ -119,7 +117,7 @@ menuconfig FTRACE
119 bool "Tracers" 117 bool "Tracers"
120 default y if DEBUG_KERNEL 118 default y if DEBUG_KERNEL
121 help 119 help
122 Enable the kernel tracing infrastructure. 120 Enable the kernel tracing infrastructure.
123 121
124if FTRACE 122if FTRACE
125 123
@@ -133,7 +131,7 @@ config FUNCTION_TRACER
133 help 131 help
134 Enable the kernel to trace every kernel function. This is done 132 Enable the kernel to trace every kernel function. This is done
135 by using a compiler feature to insert a small, 5-byte No-Operation 133 by using a compiler feature to insert a small, 5-byte No-Operation
136 instruction to the beginning of every kernel function, which NOP 134 instruction at the beginning of every kernel function, which NOP
137 sequence is then dynamically patched into a tracer call when 135 sequence is then dynamically patched into a tracer call when
138 tracing is enabled by the administrator. If it's runtime disabled 136 tracing is enabled by the administrator. If it's runtime disabled
139 (the bootup default), then the overhead of the instructions is very 137 (the bootup default), then the overhead of the instructions is very
@@ -150,7 +148,7 @@ config FUNCTION_GRAPH_TRACER
150 and its entry. 148 and its entry.
151 Its first purpose is to trace the duration of functions and 149 Its first purpose is to trace the duration of functions and
152 draw a call graph for each thread with some information like 150 draw a call graph for each thread with some information like
153 the return value. This is done by setting the current return 151 the return value. This is done by setting the current return
154 address on the current task structure into a stack of calls. 152 address on the current task structure into a stack of calls.
155 153
156 154
@@ -173,7 +171,7 @@ config IRQSOFF_TRACER
173 171
174 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency 172 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
175 173
176 (Note that kernel size and overhead increases with this option 174 (Note that kernel size and overhead increase with this option
177 enabled. This option and the preempt-off timing option can be 175 enabled. This option and the preempt-off timing option can be
178 used together or separately.) 176 used together or separately.)
179 177
@@ -186,7 +184,7 @@ config PREEMPT_TRACER
186 select TRACER_MAX_TRACE 184 select TRACER_MAX_TRACE
187 select RING_BUFFER_ALLOW_SWAP 185 select RING_BUFFER_ALLOW_SWAP
188 help 186 help
189 This option measures the time spent in preemption off critical 187 This option measures the time spent in preemption-off critical
190 sections, with microsecond accuracy. 188 sections, with microsecond accuracy.
191 189
192 The default measurement method is a maximum search, which is 190 The default measurement method is a maximum search, which is
@@ -195,7 +193,7 @@ config PREEMPT_TRACER
195 193
196 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency 194 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
197 195
198 (Note that kernel size and overhead increases with this option 196 (Note that kernel size and overhead increase with this option
199 enabled. This option and the irqs-off timing option can be 197 enabled. This option and the irqs-off timing option can be
200 used together or separately.) 198 used together or separately.)
201 199
@@ -222,7 +220,7 @@ config ENABLE_DEFAULT_TRACERS
222 depends on !GENERIC_TRACER 220 depends on !GENERIC_TRACER
223 select TRACING 221 select TRACING
224 help 222 help
225 This tracer hooks to various trace points in the kernel 223 This tracer hooks to various trace points in the kernel,
226 allowing the user to pick and choose which trace point they 224 allowing the user to pick and choose which trace point they
227 want to trace. It also includes the sched_switch tracer plugin. 225 want to trace. It also includes the sched_switch tracer plugin.
228 226
@@ -265,19 +263,19 @@ choice
265 The likely/unlikely profiler only looks at the conditions that 263 The likely/unlikely profiler only looks at the conditions that
266 are annotated with a likely or unlikely macro. 264 are annotated with a likely or unlikely macro.
267 265
268 The "all branch" profiler will profile every if statement in the 266 The "all branch" profiler will profile every if-statement in the
269 kernel. This profiler will also enable the likely/unlikely 267 kernel. This profiler will also enable the likely/unlikely
270 profiler as well. 268 profiler.
271 269
272 Either of the above profilers add a bit of overhead to the system. 270 Either of the above profilers adds a bit of overhead to the system.
273 If unsure choose "No branch profiling". 271 If unsure, choose "No branch profiling".
274 272
275config BRANCH_PROFILE_NONE 273config BRANCH_PROFILE_NONE
276 bool "No branch profiling" 274 bool "No branch profiling"
277 help 275 help
278 No branch profiling. Branch profiling adds a bit of overhead. 276 No branch profiling. Branch profiling adds a bit of overhead.
279 Only enable it if you want to analyse the branching behavior. 277 Only enable it if you want to analyse the branching behavior.
280 Otherwise keep it disabled. 278 Otherwise keep it disabled.
281 279
282config PROFILE_ANNOTATED_BRANCHES 280config PROFILE_ANNOTATED_BRANCHES
283 bool "Trace likely/unlikely profiler" 281 bool "Trace likely/unlikely profiler"
@@ -288,7 +286,7 @@ config PROFILE_ANNOTATED_BRANCHES
288 286
289 /sys/kernel/debug/tracing/profile_annotated_branch 287 /sys/kernel/debug/tracing/profile_annotated_branch
290 288
291 Note: this will add a significant overhead, only turn this 289 Note: this will add a significant overhead; only turn this
292 on if you need to profile the system's use of these macros. 290 on if you need to profile the system's use of these macros.
293 291
294config PROFILE_ALL_BRANCHES 292config PROFILE_ALL_BRANCHES
@@ -305,7 +303,7 @@ config PROFILE_ALL_BRANCHES
305 303
306 This configuration, when enabled, will impose a great overhead 304 This configuration, when enabled, will impose a great overhead
307 on the system. This should only be enabled when the system 305 on the system. This should only be enabled when the system
308 is to be analyzed 306 is to be analyzed in much detail.
309endchoice 307endchoice
310 308
311config TRACING_BRANCHES 309config TRACING_BRANCHES
@@ -330,15 +328,6 @@ config BRANCH_TRACER
330 328
331 Say N if unsure. 329 Say N if unsure.
332 330
333config POWER_TRACER
334 bool "Trace power consumption behavior"
335 depends on X86
336 select GENERIC_TRACER
337 help
338 This tracer helps developers to analyze and optimize the kernels
339 power management decisions, specifically the C-state and P-state
340 behavior.
341
342config KSYM_TRACER 331config KSYM_TRACER
343 bool "Trace read and write access on kernel memory locations" 332 bool "Trace read and write access on kernel memory locations"
344 depends on HAVE_HW_BREAKPOINT 333 depends on HAVE_HW_BREAKPOINT
@@ -391,14 +380,14 @@ config HW_BRANCH_TRACER
391 select GENERIC_TRACER 380 select GENERIC_TRACER
392 help 381 help
393 This tracer records all branches on the system in a circular 382 This tracer records all branches on the system in a circular
394 buffer giving access to the last N branches for each cpu. 383 buffer, giving access to the last N branches for each cpu.
395 384
396config KMEMTRACE 385config KMEMTRACE
397 bool "Trace SLAB allocations" 386 bool "Trace SLAB allocations"
398 select GENERIC_TRACER 387 select GENERIC_TRACER
399 help 388 help
400 kmemtrace provides tracing for slab allocator functions, such as 389 kmemtrace provides tracing for slab allocator functions, such as
401 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected 390 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
402 data is then fed to the userspace application in order to analyse 391 data is then fed to the userspace application in order to analyse
403 allocation hotspots, internal fragmentation and so on, making it 392 allocation hotspots, internal fragmentation and so on, making it
404 possible to see how well an allocator performs, as well as debug 393 possible to see how well an allocator performs, as well as debug
@@ -417,15 +406,15 @@ config WORKQUEUE_TRACER
417 bool "Trace workqueues" 406 bool "Trace workqueues"
418 select GENERIC_TRACER 407 select GENERIC_TRACER
419 help 408 help
420 The workqueue tracer provides some statistical informations 409 The workqueue tracer provides some statistical information
421 about each cpu workqueue thread such as the number of the 410 about each cpu workqueue thread such as the number of the
422 works inserted and executed since their creation. It can help 411 works inserted and executed since their creation. It can help
423 to evaluate the amount of work each of them have to perform. 412 to evaluate the amount of work each of them has to perform.
424 For example it can help a developer to decide whether he should 413 For example it can help a developer to decide whether he should
425 choose a per cpu workqueue instead of a singlethreaded one. 414 choose a per-cpu workqueue instead of a singlethreaded one.
426 415
427config BLK_DEV_IO_TRACE 416config BLK_DEV_IO_TRACE
428 bool "Support for tracing block io actions" 417 bool "Support for tracing block IO actions"
429 depends on SYSFS 418 depends on SYSFS
430 depends on BLOCK 419 depends on BLOCK
431 select RELAY 420 select RELAY
@@ -451,20 +440,20 @@ config BLK_DEV_IO_TRACE
451 440
452config KPROBE_EVENT 441config KPROBE_EVENT
453 depends on KPROBES 442 depends on KPROBES
454 depends on X86 443 depends on HAVE_REGS_AND_STACK_ACCESS_API
455 bool "Enable kprobes-based dynamic events" 444 bool "Enable kprobes-based dynamic events"
456 select TRACING 445 select TRACING
457 default y 446 default y
458 help 447 help
459 This allows the user to add tracing events (similar to tracepoints) on the fly 448 This allows the user to add tracing events (similar to tracepoints)
460 via the ftrace interface. See Documentation/trace/kprobetrace.txt 449 on the fly via the ftrace interface. See
461 for more details. 450 Documentation/trace/kprobetrace.txt for more details.
462 451
463 Those events can be inserted wherever kprobes can probe, and record 452 Those events can be inserted wherever kprobes can probe, and record
464 various register and memory values. 453 various register and memory values.
465 454
466 This option is also required by perf-probe subcommand of perf tools. If 455 This option is also required by perf-probe subcommand of perf tools.
467 you want to use perf tools, this option is strongly recommended. 456 If you want to use perf tools, this option is strongly recommended.
468 457
469config DYNAMIC_FTRACE 458config DYNAMIC_FTRACE
470 bool "enable/disable ftrace tracepoints dynamically" 459 bool "enable/disable ftrace tracepoints dynamically"
@@ -472,32 +461,32 @@ config DYNAMIC_FTRACE
472 depends on HAVE_DYNAMIC_FTRACE 461 depends on HAVE_DYNAMIC_FTRACE
473 default y 462 default y
474 help 463 help
475 This option will modify all the calls to ftrace dynamically 464 This option will modify all the calls to ftrace dynamically
476 (will patch them out of the binary image and replaces them 465 (will patch them out of the binary image and replace them
477 with a No-Op instruction) as they are called. A table is 466 with a No-Op instruction) as they are called. A table is
478 created to dynamically enable them again. 467 created to dynamically enable them again.
479 468
480 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise 469 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
481 has native performance as long as no tracing is active. 470 otherwise has native performance as long as no tracing is active.
482 471
483 The changes to the code are done by a kernel thread that 472 The changes to the code are done by a kernel thread that
484 wakes up once a second and checks to see if any ftrace calls 473 wakes up once a second and checks to see if any ftrace calls
485 were made. If so, it runs stop_machine (stops all CPUS) 474 were made. If so, it runs stop_machine (stops all CPUS)
486 and modifies the code to jump over the call to ftrace. 475 and modifies the code to jump over the call to ftrace.
487 476
488config FUNCTION_PROFILER 477config FUNCTION_PROFILER
489 bool "Kernel function profiler" 478 bool "Kernel function profiler"
490 depends on FUNCTION_TRACER 479 depends on FUNCTION_TRACER
491 default n 480 default n
492 help 481 help
493 This option enables the kernel function profiler. A file is created 482 This option enables the kernel function profiler. A file is created
494 in debugfs called function_profile_enabled which defaults to zero. 483 in debugfs called function_profile_enabled which defaults to zero.
495 When a 1 is echoed into this file profiling begins, and when a 484 When a 1 is echoed into this file profiling begins, and when a
496 zero is entered, profiling stops. A file in the trace_stats 485 zero is entered, profiling stops. A "functions" file is created in
497 directory called functions, that show the list of functions that 486 the trace_stats directory; this file shows the list of functions that
498 have been hit and their counters. 487 have been hit and their counters.
499 488
500 If in doubt, say N 489 If in doubt, say N.
501 490
502config FTRACE_MCOUNT_RECORD 491config FTRACE_MCOUNT_RECORD
503 def_bool y 492 def_bool y
@@ -556,8 +545,8 @@ config RING_BUFFER_BENCHMARK
556 tristate "Ring buffer benchmark stress tester" 545 tristate "Ring buffer benchmark stress tester"
557 depends on RING_BUFFER 546 depends on RING_BUFFER
558 help 547 help
559 This option creates a test to stress the ring buffer and bench mark it. 548 This option creates a test to stress the ring buffer and benchmark it.
560 It creates its own ring buffer such that it will not interfer with 549 It creates its own ring buffer such that it will not interfere with
561 any other users of the ring buffer (such as ftrace). It then creates 550 any other users of the ring buffer (such as ftrace). It then creates
562 a producer and consumer that will run for 10 seconds and sleep for 551 a producer and consumer that will run for 10 seconds and sleep for
563 10 seconds. Each interval it will print out the number of events 552 10 seconds. Each interval it will print out the number of events
@@ -566,7 +555,7 @@ config RING_BUFFER_BENCHMARK
566 It does not disable interrupts or raise its priority, so it may be 555 It does not disable interrupts or raise its priority, so it may be
567 affected by processes that are running. 556 affected by processes that are running.
568 557
569 If unsure, say N 558 If unsure, say N.
570 559
571endif # FTRACE 560endif # FTRACE
572 561
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index cd9ecd89ec77..d00c6fe23f54 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -51,7 +51,9 @@ endif
51obj-$(CONFIG_EVENT_TRACING) += trace_events.o 51obj-$(CONFIG_EVENT_TRACING) += trace_events.o
52obj-$(CONFIG_EVENT_TRACING) += trace_export.o 52obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54ifeq ($(CONFIG_PERF_EVENTS),y)
55obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o
56endif
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 57obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 58obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o 59obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d9d6206e0b14..07f945a99430 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -540,9 +540,10 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
540 if (ret) 540 if (ret)
541 return ret; 541 return ret;
542 542
543 if (copy_to_user(arg, &buts, sizeof(buts))) 543 if (copy_to_user(arg, &buts, sizeof(buts))) {
544 blk_trace_remove(q);
544 return -EFAULT; 545 return -EFAULT;
545 546 }
546 return 0; 547 return 0;
547} 548}
548EXPORT_SYMBOL_GPL(blk_trace_setup); 549EXPORT_SYMBOL_GPL(blk_trace_setup);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7968762c8167..d9062f5cc0c0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,12 +22,12 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/kprobes.h>
26#include <linux/ftrace.h> 25#include <linux/ftrace.h>
27#include <linux/sysctl.h> 26#include <linux/sysctl.h>
28#include <linux/ctype.h> 27#include <linux/ctype.h>
29#include <linux/list.h> 28#include <linux/list.h>
30#include <linux/hash.h> 29#include <linux/hash.h>
30#include <linux/rcupdate.h>
31 31
32#include <trace/events/sched.h> 32#include <trace/events/sched.h>
33 33
@@ -85,22 +85,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
85ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 85ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
86ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 86ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
87 87
88#ifdef CONFIG_FUNCTION_GRAPH_TRACER 88/*
89static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); 89 * Traverse the ftrace_list, invoking all entries. The reason that we
90#endif 90 * can use rcu_dereference_raw() is that elements removed from this list
91 91 * are simply leaked, so there is no need to interact with a grace-period
92 * mechanism. The rcu_dereference_raw() calls are needed to handle
93 * concurrent insertions into the ftrace_list.
94 *
95 * Silly Alpha and silly pointer-speculation compiler optimizations!
96 */
92static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 97static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
93{ 98{
94 struct ftrace_ops *op = ftrace_list; 99 struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/
95
96 /* in case someone actually ports this to alpha! */
97 read_barrier_depends();
98 100
99 while (op != &ftrace_list_end) { 101 while (op != &ftrace_list_end) {
100 /* silly alpha */
101 read_barrier_depends();
102 op->func(ip, parent_ip); 102 op->func(ip, parent_ip);
103 op = op->next; 103 op = rcu_dereference_raw(op->next); /*see above*/
104 }; 104 };
105} 105}
106 106
@@ -155,8 +155,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
155 * the ops->next pointer is valid before another CPU sees 155 * the ops->next pointer is valid before another CPU sees
156 * the ops pointer included into the ftrace_list. 156 * the ops pointer included into the ftrace_list.
157 */ 157 */
158 smp_wmb(); 158 rcu_assign_pointer(ftrace_list, ops);
159 ftrace_list = ops;
160 159
161 if (ftrace_enabled) { 160 if (ftrace_enabled) {
162 ftrace_func_t func; 161 ftrace_func_t func;
@@ -898,36 +897,6 @@ static struct dyn_ftrace *ftrace_free_records;
898 } \ 897 } \
899 } 898 }
900 899
901#ifdef CONFIG_KPROBES
902
903static int frozen_record_count;
904
905static inline void freeze_record(struct dyn_ftrace *rec)
906{
907 if (!(rec->flags & FTRACE_FL_FROZEN)) {
908 rec->flags |= FTRACE_FL_FROZEN;
909 frozen_record_count++;
910 }
911}
912
913static inline void unfreeze_record(struct dyn_ftrace *rec)
914{
915 if (rec->flags & FTRACE_FL_FROZEN) {
916 rec->flags &= ~FTRACE_FL_FROZEN;
917 frozen_record_count--;
918 }
919}
920
921static inline int record_frozen(struct dyn_ftrace *rec)
922{
923 return rec->flags & FTRACE_FL_FROZEN;
924}
925#else
926# define freeze_record(rec) ({ 0; })
927# define unfreeze_record(rec) ({ 0; })
928# define record_frozen(rec) ({ 0; })
929#endif /* CONFIG_KPROBES */
930
931static void ftrace_free_rec(struct dyn_ftrace *rec) 900static void ftrace_free_rec(struct dyn_ftrace *rec)
932{ 901{
933 rec->freelist = ftrace_free_records; 902 rec->freelist = ftrace_free_records;
@@ -1025,6 +994,21 @@ static void ftrace_bug(int failed, unsigned long ip)
1025} 994}
1026 995
1027 996
997/* Return 1 if the address range is reserved for ftrace */
998int ftrace_text_reserved(void *start, void *end)
999{
1000 struct dyn_ftrace *rec;
1001 struct ftrace_page *pg;
1002
1003 do_for_each_ftrace_rec(pg, rec) {
1004 if (rec->ip <= (unsigned long)end &&
1005 rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
1006 return 1;
1007 } while_for_each_ftrace_rec();
1008 return 0;
1009}
1010
1011
1028static int 1012static int
1029__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1013__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1030{ 1014{
@@ -1076,14 +1060,6 @@ static void ftrace_replace_code(int enable)
1076 !(rec->flags & FTRACE_FL_CONVERTED)) 1060 !(rec->flags & FTRACE_FL_CONVERTED))
1077 continue; 1061 continue;
1078 1062
1079 /* ignore updates to this record's mcount site */
1080 if (get_kprobe((void *)rec->ip)) {
1081 freeze_record(rec);
1082 continue;
1083 } else {
1084 unfreeze_record(rec);
1085 }
1086
1087 failed = __ftrace_replace_code(rec, enable); 1063 failed = __ftrace_replace_code(rec, enable);
1088 if (failed) { 1064 if (failed) {
1089 rec->flags |= FTRACE_FL_FAILED; 1065 rec->flags |= FTRACE_FL_FAILED;
@@ -1690,7 +1666,7 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
1690static int ftrace_match(char *str, char *regex, int len, int type) 1666static int ftrace_match(char *str, char *regex, int len, int type)
1691{ 1667{
1692 int matched = 0; 1668 int matched = 0;
1693 char *ptr; 1669 int slen;
1694 1670
1695 switch (type) { 1671 switch (type) {
1696 case MATCH_FULL: 1672 case MATCH_FULL:
@@ -1706,8 +1682,8 @@ static int ftrace_match(char *str, char *regex, int len, int type)
1706 matched = 1; 1682 matched = 1;
1707 break; 1683 break;
1708 case MATCH_END_ONLY: 1684 case MATCH_END_ONLY:
1709 ptr = strstr(str, regex); 1685 slen = strlen(str);
1710 if (ptr && (ptr[len] == 0)) 1686 if (slen >= len && memcmp(str + slen - len, regex, len) == 0)
1711 matched = 1; 1687 matched = 1;
1712 break; 1688 break;
1713 } 1689 }
@@ -2300,6 +2276,8 @@ __setup("ftrace_filter=", set_ftrace_filter);
2300 2276
2301#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2277#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2302static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; 2278static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
2279static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
2280
2303static int __init set_graph_function(char *str) 2281static int __init set_graph_function(char *str)
2304{ 2282{
2305 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 2283 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -2426,6 +2404,7 @@ static const struct file_operations ftrace_notrace_fops = {
2426static DEFINE_MUTEX(graph_lock); 2404static DEFINE_MUTEX(graph_lock);
2427 2405
2428int ftrace_graph_count; 2406int ftrace_graph_count;
2407int ftrace_graph_filter_enabled;
2429unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2408unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2430 2409
2431static void * 2410static void *
@@ -2448,7 +2427,7 @@ static void *g_start(struct seq_file *m, loff_t *pos)
2448 mutex_lock(&graph_lock); 2427 mutex_lock(&graph_lock);
2449 2428
2450 /* Nothing, tell g_show to print all functions are enabled */ 2429 /* Nothing, tell g_show to print all functions are enabled */
2451 if (!ftrace_graph_count && !*pos) 2430 if (!ftrace_graph_filter_enabled && !*pos)
2452 return (void *)1; 2431 return (void *)1;
2453 2432
2454 return __g_next(m, pos); 2433 return __g_next(m, pos);
@@ -2494,6 +2473,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2494 mutex_lock(&graph_lock); 2473 mutex_lock(&graph_lock);
2495 if ((file->f_mode & FMODE_WRITE) && 2474 if ((file->f_mode & FMODE_WRITE) &&
2496 (file->f_flags & O_TRUNC)) { 2475 (file->f_flags & O_TRUNC)) {
2476 ftrace_graph_filter_enabled = 0;
2497 ftrace_graph_count = 0; 2477 ftrace_graph_count = 0;
2498 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2478 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2499 } 2479 }
@@ -2519,7 +2499,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2519 struct dyn_ftrace *rec; 2499 struct dyn_ftrace *rec;
2520 struct ftrace_page *pg; 2500 struct ftrace_page *pg;
2521 int search_len; 2501 int search_len;
2522 int found = 0; 2502 int fail = 1;
2523 int type, not; 2503 int type, not;
2524 char *search; 2504 char *search;
2525 bool exists; 2505 bool exists;
@@ -2530,37 +2510,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2530 2510
2531 /* decode regex */ 2511 /* decode regex */
2532 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 2512 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2533 if (not) 2513 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
2534 return -EINVAL; 2514 return -EBUSY;
2535 2515
2536 search_len = strlen(search); 2516 search_len = strlen(search);
2537 2517
2538 mutex_lock(&ftrace_lock); 2518 mutex_lock(&ftrace_lock);
2539 do_for_each_ftrace_rec(pg, rec) { 2519 do_for_each_ftrace_rec(pg, rec) {
2540 2520
2541 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2542 break;
2543
2544 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 2521 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
2545 continue; 2522 continue;
2546 2523
2547 if (ftrace_match_record(rec, search, search_len, type)) { 2524 if (ftrace_match_record(rec, search, search_len, type)) {
2548 /* ensure it is not already in the array */ 2525 /* if it is in the array */
2549 exists = false; 2526 exists = false;
2550 for (i = 0; i < *idx; i++) 2527 for (i = 0; i < *idx; i++) {
2551 if (array[i] == rec->ip) { 2528 if (array[i] == rec->ip) {
2552 exists = true; 2529 exists = true;
2553 break; 2530 break;
2554 } 2531 }
2555 if (!exists) 2532 }
2556 array[(*idx)++] = rec->ip; 2533
2557 found = 1; 2534 if (!not) {
2535 fail = 0;
2536 if (!exists) {
2537 array[(*idx)++] = rec->ip;
2538 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2539 goto out;
2540 }
2541 } else {
2542 if (exists) {
2543 array[i] = array[--(*idx)];
2544 array[*idx] = 0;
2545 fail = 0;
2546 }
2547 }
2558 } 2548 }
2559 } while_for_each_ftrace_rec(); 2549 } while_for_each_ftrace_rec();
2560 2550out:
2561 mutex_unlock(&ftrace_lock); 2551 mutex_unlock(&ftrace_lock);
2562 2552
2563 return found ? 0 : -EINVAL; 2553 if (fail)
2554 return -EINVAL;
2555
2556 ftrace_graph_filter_enabled = 1;
2557 return 0;
2564} 2558}
2565 2559
2566static ssize_t 2560static ssize_t
@@ -2570,16 +2564,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2570 struct trace_parser parser; 2564 struct trace_parser parser;
2571 ssize_t read, ret; 2565 ssize_t read, ret;
2572 2566
2573 if (!cnt || cnt < 0) 2567 if (!cnt)
2574 return 0; 2568 return 0;
2575 2569
2576 mutex_lock(&graph_lock); 2570 mutex_lock(&graph_lock);
2577 2571
2578 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
2579 ret = -EBUSY;
2580 goto out_unlock;
2581 }
2582
2583 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { 2572 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2584 ret = -ENOMEM; 2573 ret = -ENOMEM;
2585 goto out_unlock; 2574 goto out_unlock;
@@ -3364,6 +3353,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3364{ 3353{
3365 /* Make sure we do not use the parent ret_stack */ 3354 /* Make sure we do not use the parent ret_stack */
3366 t->ret_stack = NULL; 3355 t->ret_stack = NULL;
3356 t->curr_ret_stack = -1;
3367 3357
3368 if (ftrace_graph_active) { 3358 if (ftrace_graph_active) {
3369 struct ftrace_ret_stack *ret_stack; 3359 struct ftrace_ret_stack *ret_stack;
@@ -3373,7 +3363,6 @@ void ftrace_graph_init_task(struct task_struct *t)
3373 GFP_KERNEL); 3363 GFP_KERNEL);
3374 if (!ret_stack) 3364 if (!ret_stack)
3375 return; 3365 return;
3376 t->curr_ret_stack = -1;
3377 atomic_set(&t->tracing_graph_pause, 0); 3366 atomic_set(&t->tracing_graph_pause, 0);
3378 atomic_set(&t->trace_overrun, 0); 3367 atomic_set(&t->trace_overrun, 0);
3379 t->ftrace_timestamp = 0; 3368 t->ftrace_timestamp = 0;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2326b04c95c4..05a9f83b8819 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -20,6 +20,7 @@
20#include <linux/cpu.h> 20#include <linux/cpu.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22 22
23#include <asm/local.h>
23#include "trace.h" 24#include "trace.h"
24 25
25/* 26/*
@@ -464,6 +465,8 @@ struct ring_buffer_iter {
464 struct ring_buffer_per_cpu *cpu_buffer; 465 struct ring_buffer_per_cpu *cpu_buffer;
465 unsigned long head; 466 unsigned long head;
466 struct buffer_page *head_page; 467 struct buffer_page *head_page;
468 struct buffer_page *cache_reader_page;
469 unsigned long cache_read;
467 u64 read_stamp; 470 u64 read_stamp;
468}; 471};
469 472
@@ -2230,12 +2233,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2230 if (ring_buffer_flags != RB_BUFFERS_ON) 2233 if (ring_buffer_flags != RB_BUFFERS_ON)
2231 return NULL; 2234 return NULL;
2232 2235
2233 if (atomic_read(&buffer->record_disabled))
2234 return NULL;
2235
2236 /* If we are tracing schedule, we don't want to recurse */ 2236 /* If we are tracing schedule, we don't want to recurse */
2237 resched = ftrace_preempt_disable(); 2237 resched = ftrace_preempt_disable();
2238 2238
2239 if (atomic_read(&buffer->record_disabled))
2240 goto out_nocheck;
2241
2239 if (trace_recursive_lock()) 2242 if (trace_recursive_lock())
2240 goto out_nocheck; 2243 goto out_nocheck;
2241 2244
@@ -2467,11 +2470,11 @@ int ring_buffer_write(struct ring_buffer *buffer,
2467 if (ring_buffer_flags != RB_BUFFERS_ON) 2470 if (ring_buffer_flags != RB_BUFFERS_ON)
2468 return -EBUSY; 2471 return -EBUSY;
2469 2472
2470 if (atomic_read(&buffer->record_disabled))
2471 return -EBUSY;
2472
2473 resched = ftrace_preempt_disable(); 2473 resched = ftrace_preempt_disable();
2474 2474
2475 if (atomic_read(&buffer->record_disabled))
2476 goto out;
2477
2475 cpu = raw_smp_processor_id(); 2478 cpu = raw_smp_processor_id();
2476 2479
2477 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2480 if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -2539,7 +2542,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
2539 * @buffer: The ring buffer to enable writes 2542 * @buffer: The ring buffer to enable writes
2540 * 2543 *
2541 * Note, multiple disables will need the same number of enables 2544 * Note, multiple disables will need the same number of enables
2542 * to truely enable the writing (much like preempt_disable). 2545 * to truly enable the writing (much like preempt_disable).
2543 */ 2546 */
2544void ring_buffer_record_enable(struct ring_buffer *buffer) 2547void ring_buffer_record_enable(struct ring_buffer *buffer)
2545{ 2548{
@@ -2575,7 +2578,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
2575 * @cpu: The CPU to enable. 2578 * @cpu: The CPU to enable.
2576 * 2579 *
2577 * Note, multiple disables will need the same number of enables 2580 * Note, multiple disables will need the same number of enables
2578 * to truely enable the writing (much like preempt_disable). 2581 * to truly enable the writing (much like preempt_disable).
2579 */ 2582 */
2580void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) 2583void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2581{ 2584{
@@ -2716,6 +2719,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2716 iter->read_stamp = cpu_buffer->read_stamp; 2719 iter->read_stamp = cpu_buffer->read_stamp;
2717 else 2720 else
2718 iter->read_stamp = iter->head_page->page->time_stamp; 2721 iter->read_stamp = iter->head_page->page->time_stamp;
2722 iter->cache_reader_page = cpu_buffer->reader_page;
2723 iter->cache_read = cpu_buffer->read;
2719} 2724}
2720 2725
2721/** 2726/**
@@ -2869,7 +2874,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2869 * Splice the empty reader page into the list around the head. 2874 * Splice the empty reader page into the list around the head.
2870 */ 2875 */
2871 reader = rb_set_head_page(cpu_buffer); 2876 reader = rb_set_head_page(cpu_buffer);
2872 cpu_buffer->reader_page->list.next = reader->list.next; 2877 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
2873 cpu_buffer->reader_page->list.prev = reader->list.prev; 2878 cpu_buffer->reader_page->list.prev = reader->list.prev;
2874 2879
2875 /* 2880 /*
@@ -2906,7 +2911,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2906 * 2911 *
2907 * Now make the new head point back to the reader page. 2912 * Now make the new head point back to the reader page.
2908 */ 2913 */
2909 reader->list.next->prev = &cpu_buffer->reader_page->list; 2914 rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
2910 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 2915 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2911 2916
2912 /* Finally update the reader page to the new head */ 2917 /* Finally update the reader page to the new head */
@@ -3060,13 +3065,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3060 struct ring_buffer_event *event; 3065 struct ring_buffer_event *event;
3061 int nr_loops = 0; 3066 int nr_loops = 0;
3062 3067
3063 if (ring_buffer_iter_empty(iter))
3064 return NULL;
3065
3066 cpu_buffer = iter->cpu_buffer; 3068 cpu_buffer = iter->cpu_buffer;
3067 buffer = cpu_buffer->buffer; 3069 buffer = cpu_buffer->buffer;
3068 3070
3071 /*
3072 * Check if someone performed a consuming read to
3073 * the buffer. A consuming read invalidates the iterator
3074 * and we need to reset the iterator in this case.
3075 */
3076 if (unlikely(iter->cache_read != cpu_buffer->read ||
3077 iter->cache_reader_page != cpu_buffer->reader_page))
3078 rb_iter_reset(iter);
3079
3069 again: 3080 again:
3081 if (ring_buffer_iter_empty(iter))
3082 return NULL;
3083
3070 /* 3084 /*
3071 * We repeat when a timestamp is encountered. 3085 * We repeat when a timestamp is encountered.
3072 * We can get multiple timestamps by nested interrupts or also 3086 * We can get multiple timestamps by nested interrupts or also
@@ -3081,6 +3095,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3081 if (rb_per_cpu_empty(cpu_buffer)) 3095 if (rb_per_cpu_empty(cpu_buffer))
3082 return NULL; 3096 return NULL;
3083 3097
3098 if (iter->head >= local_read(&iter->head_page->page->commit)) {
3099 rb_inc_iter(iter);
3100 goto again;
3101 }
3102
3084 event = rb_iter_head_event(iter); 3103 event = rb_iter_head_event(iter);
3085 3104
3086 switch (event->type_len) { 3105 switch (event->type_len) {
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index b2477caf09c2..df74c7982255 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -8,6 +8,7 @@
8#include <linux/kthread.h> 8#include <linux/kthread.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <asm/local.h>
11 12
12struct rb_page { 13struct rb_page {
13 u64 ts; 14 u64 ts;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8b9f20ab8eed..3ec2ee6f6560 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -32,6 +32,7 @@
32#include <linux/splice.h> 32#include <linux/splice.h>
33#include <linux/kdebug.h> 33#include <linux/kdebug.h>
34#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/rwsem.h>
35#include <linux/ctype.h> 36#include <linux/ctype.h>
36#include <linux/init.h> 37#include <linux/init.h>
37#include <linux/poll.h> 38#include <linux/poll.h>
@@ -91,20 +92,17 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled);
91static inline void ftrace_disable_cpu(void) 92static inline void ftrace_disable_cpu(void)
92{ 93{
93 preempt_disable(); 94 preempt_disable();
94 __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); 95 __this_cpu_inc(ftrace_cpu_disabled);
95} 96}
96 97
97static inline void ftrace_enable_cpu(void) 98static inline void ftrace_enable_cpu(void)
98{ 99{
99 __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); 100 __this_cpu_dec(ftrace_cpu_disabled);
100 preempt_enable(); 101 preempt_enable();
101} 102}
102 103
103static cpumask_var_t __read_mostly tracing_buffer_mask; 104static cpumask_var_t __read_mostly tracing_buffer_mask;
104 105
105/* Define which cpu buffers are currently read in trace_pipe */
106static cpumask_var_t tracing_reader_cpumask;
107
108#define for_each_tracing_cpu(cpu) \ 106#define for_each_tracing_cpu(cpu) \
109 for_each_cpu(cpu, tracing_buffer_mask) 107 for_each_cpu(cpu, tracing_buffer_mask)
110 108
@@ -243,12 +241,91 @@ static struct tracer *current_trace __read_mostly;
243 241
244/* 242/*
245 * trace_types_lock is used to protect the trace_types list. 243 * trace_types_lock is used to protect the trace_types list.
246 * This lock is also used to keep user access serialized.
247 * Accesses from userspace will grab this lock while userspace
248 * activities happen inside the kernel.
249 */ 244 */
250static DEFINE_MUTEX(trace_types_lock); 245static DEFINE_MUTEX(trace_types_lock);
251 246
247/*
248 * serialize the access of the ring buffer
249 *
250 * ring buffer serializes readers, but it is low level protection.
251 * The validity of the events (which are returned by ring_buffer_peek() etc.)
252 * is not protected by the ring buffer.
253 *
254 * The content of events may become garbage if we allow other processes to
255 * consume these events concurrently:
256 * A) The page of the consumed events may become a normal page
257 * (not the reader page) in the ring buffer, and this page will be rewritten
258 * by the events producer.
259 * B) The page of the consumed events may become a page for splice_read,
260 * and this page will be returned to the system.
261 *
262 * These primitives allow multiple processes to access different cpu ring
263 * buffers concurrently.
264 *
265 * These primitives don't distinguish between read-only and read-consume access.
266 * Multiple read-only accesses are also serialized.
267 */
268
269#ifdef CONFIG_SMP
270static DECLARE_RWSEM(all_cpu_access_lock);
271static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
272
273static inline void trace_access_lock(int cpu)
274{
275 if (cpu == TRACE_PIPE_ALL_CPU) {
276 /* gain it for accessing the whole ring buffer. */
277 down_write(&all_cpu_access_lock);
278 } else {
279 /* gain it for accessing a cpu ring buffer. */
280
281 /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
282 down_read(&all_cpu_access_lock);
283
284 /* Secondly block other access to this @cpu ring buffer. */
285 mutex_lock(&per_cpu(cpu_access_lock, cpu));
286 }
287}
288
289static inline void trace_access_unlock(int cpu)
290{
291 if (cpu == TRACE_PIPE_ALL_CPU) {
292 up_write(&all_cpu_access_lock);
293 } else {
294 mutex_unlock(&per_cpu(cpu_access_lock, cpu));
295 up_read(&all_cpu_access_lock);
296 }
297}
298
299static inline void trace_access_lock_init(void)
300{
301 int cpu;
302
303 for_each_possible_cpu(cpu)
304 mutex_init(&per_cpu(cpu_access_lock, cpu));
305}
306
307#else
308
309static DEFINE_MUTEX(access_lock);
310
311static inline void trace_access_lock(int cpu)
312{
313 (void)cpu;
314 mutex_lock(&access_lock);
315}
316
317static inline void trace_access_unlock(int cpu)
318{
319 (void)cpu;
320 mutex_unlock(&access_lock);
321}
322
323static inline void trace_access_lock_init(void)
324{
325}
326
327#endif
328
252/* trace_wait is a waitqueue for tasks blocked on trace_poll */ 329/* trace_wait is a waitqueue for tasks blocked on trace_poll */
253static DECLARE_WAIT_QUEUE_HEAD(trace_wait); 330static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
254 331
@@ -297,6 +374,21 @@ static int __init set_buf_size(char *str)
297} 374}
298__setup("trace_buf_size=", set_buf_size); 375__setup("trace_buf_size=", set_buf_size);
299 376
377static int __init set_tracing_thresh(char *str)
378{
379 unsigned long threshhold;
380 int ret;
381
382 if (!str)
383 return 0;
384 ret = strict_strtoul(str, 0, &threshhold);
385 if (ret < 0)
386 return 0;
387 tracing_thresh = threshhold * 1000;
388 return 1;
389}
390__setup("tracing_thresh=", set_tracing_thresh);
391
300unsigned long nsecs_to_usecs(unsigned long nsecs) 392unsigned long nsecs_to_usecs(unsigned long nsecs)
301{ 393{
302 return nsecs / 1000; 394 return nsecs / 1000;
@@ -502,9 +594,10 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
502static arch_spinlock_t ftrace_max_lock = 594static arch_spinlock_t ftrace_max_lock =
503 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 595 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
504 596
597unsigned long __read_mostly tracing_thresh;
598
505#ifdef CONFIG_TRACER_MAX_TRACE 599#ifdef CONFIG_TRACER_MAX_TRACE
506unsigned long __read_mostly tracing_max_latency; 600unsigned long __read_mostly tracing_max_latency;
507unsigned long __read_mostly tracing_thresh;
508 601
509/* 602/*
510 * Copy the new maximum trace into the separate maximum-trace 603 * Copy the new maximum trace into the separate maximum-trace
@@ -515,7 +608,7 @@ static void
515__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 608__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
516{ 609{
517 struct trace_array_cpu *data = tr->data[cpu]; 610 struct trace_array_cpu *data = tr->data[cpu];
518 struct trace_array_cpu *max_data = tr->data[cpu]; 611 struct trace_array_cpu *max_data;
519 612
520 max_tr.cpu = cpu; 613 max_tr.cpu = cpu;
521 max_tr.time_start = data->preempt_timestamp; 614 max_tr.time_start = data->preempt_timestamp;
@@ -525,7 +618,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
525 max_data->critical_start = data->critical_start; 618 max_data->critical_start = data->critical_start;
526 max_data->critical_end = data->critical_end; 619 max_data->critical_end = data->critical_end;
527 620
528 memcpy(data->comm, tsk->comm, TASK_COMM_LEN); 621 memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
529 max_data->pid = tsk->pid; 622 max_data->pid = tsk->pid;
530 max_data->uid = task_uid(tsk); 623 max_data->uid = task_uid(tsk);
531 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; 624 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
@@ -747,10 +840,10 @@ out:
747 mutex_unlock(&trace_types_lock); 840 mutex_unlock(&trace_types_lock);
748} 841}
749 842
750static void __tracing_reset(struct trace_array *tr, int cpu) 843static void __tracing_reset(struct ring_buffer *buffer, int cpu)
751{ 844{
752 ftrace_disable_cpu(); 845 ftrace_disable_cpu();
753 ring_buffer_reset_cpu(tr->buffer, cpu); 846 ring_buffer_reset_cpu(buffer, cpu);
754 ftrace_enable_cpu(); 847 ftrace_enable_cpu();
755} 848}
756 849
@@ -762,7 +855,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
762 855
763 /* Make sure all commits have finished */ 856 /* Make sure all commits have finished */
764 synchronize_sched(); 857 synchronize_sched();
765 __tracing_reset(tr, cpu); 858 __tracing_reset(buffer, cpu);
766 859
767 ring_buffer_record_enable(buffer); 860 ring_buffer_record_enable(buffer);
768} 861}
@@ -780,7 +873,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
780 tr->time_start = ftrace_now(tr->cpu); 873 tr->time_start = ftrace_now(tr->cpu);
781 874
782 for_each_online_cpu(cpu) 875 for_each_online_cpu(cpu)
783 __tracing_reset(tr, cpu); 876 __tracing_reset(buffer, cpu);
784 877
785 ring_buffer_record_enable(buffer); 878 ring_buffer_record_enable(buffer);
786} 879}
@@ -857,6 +950,8 @@ void tracing_start(void)
857 goto out; 950 goto out;
858 } 951 }
859 952
953 /* Prevent the buffers from switching */
954 arch_spin_lock(&ftrace_max_lock);
860 955
861 buffer = global_trace.buffer; 956 buffer = global_trace.buffer;
862 if (buffer) 957 if (buffer)
@@ -866,6 +961,8 @@ void tracing_start(void)
866 if (buffer) 961 if (buffer)
867 ring_buffer_record_enable(buffer); 962 ring_buffer_record_enable(buffer);
868 963
964 arch_spin_unlock(&ftrace_max_lock);
965
869 ftrace_start(); 966 ftrace_start();
870 out: 967 out:
871 spin_unlock_irqrestore(&tracing_start_lock, flags); 968 spin_unlock_irqrestore(&tracing_start_lock, flags);
@@ -887,6 +984,9 @@ void tracing_stop(void)
887 if (trace_stop_count++) 984 if (trace_stop_count++)
888 goto out; 985 goto out;
889 986
987 /* Prevent the buffers from switching */
988 arch_spin_lock(&ftrace_max_lock);
989
890 buffer = global_trace.buffer; 990 buffer = global_trace.buffer;
891 if (buffer) 991 if (buffer)
892 ring_buffer_record_disable(buffer); 992 ring_buffer_record_disable(buffer);
@@ -895,6 +995,8 @@ void tracing_stop(void)
895 if (buffer) 995 if (buffer)
896 ring_buffer_record_disable(buffer); 996 ring_buffer_record_disable(buffer);
897 997
998 arch_spin_unlock(&ftrace_max_lock);
999
898 out: 1000 out:
899 spin_unlock_irqrestore(&tracing_start_lock, flags); 1001 spin_unlock_irqrestore(&tracing_start_lock, flags);
900} 1002}
@@ -951,6 +1053,11 @@ void trace_find_cmdline(int pid, char comm[])
951 return; 1053 return;
952 } 1054 }
953 1055
1056 if (WARN_ON_ONCE(pid < 0)) {
1057 strcpy(comm, "<XXX>");
1058 return;
1059 }
1060
954 if (pid > PID_MAX_DEFAULT) { 1061 if (pid > PID_MAX_DEFAULT) {
955 strcpy(comm, "<...>"); 1062 strcpy(comm, "<...>");
956 return; 1063 return;
@@ -1084,7 +1191,7 @@ trace_function(struct trace_array *tr,
1084 struct ftrace_entry *entry; 1191 struct ftrace_entry *entry;
1085 1192
1086 /* If we are reading the ring buffer, don't trace */ 1193 /* If we are reading the ring buffer, don't trace */
1087 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 1194 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
1088 return; 1195 return;
1089 1196
1090 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), 1197 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1177,6 +1284,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1177 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1284 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1178 return; 1285 return;
1179 1286
1287 /*
1288 * NMIs cannot handle page faults, even with fixups.
1289 * Saving the user stack can (and often does) fault.
1290 */
1291 if (unlikely(in_nmi()))
1292 return;
1293
1180 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1294 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1181 sizeof(*entry), flags, pc); 1295 sizeof(*entry), flags, pc);
1182 if (!event) 1296 if (!event)
@@ -1315,8 +1429,10 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1315 entry->fmt = fmt; 1429 entry->fmt = fmt;
1316 1430
1317 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1431 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1318 if (!filter_check_discard(call, entry, buffer, event)) 1432 if (!filter_check_discard(call, entry, buffer, event)) {
1319 ring_buffer_unlock_commit(buffer, event); 1433 ring_buffer_unlock_commit(buffer, event);
1434 ftrace_trace_stack(buffer, flags, 6, pc);
1435 }
1320 1436
1321out_unlock: 1437out_unlock:
1322 arch_spin_unlock(&trace_buf_lock); 1438 arch_spin_unlock(&trace_buf_lock);
@@ -1389,8 +1505,10 @@ int trace_array_vprintk(struct trace_array *tr,
1389 1505
1390 memcpy(&entry->buf, trace_buf, len); 1506 memcpy(&entry->buf, trace_buf, len);
1391 entry->buf[len] = '\0'; 1507 entry->buf[len] = '\0';
1392 if (!filter_check_discard(call, entry, buffer, event)) 1508 if (!filter_check_discard(call, entry, buffer, event)) {
1393 ring_buffer_unlock_commit(buffer, event); 1509 ring_buffer_unlock_commit(buffer, event);
1510 ftrace_trace_stack(buffer, irq_flags, 6, pc);
1511 }
1394 1512
1395 out_unlock: 1513 out_unlock:
1396 arch_spin_unlock(&trace_buf_lock); 1514 arch_spin_unlock(&trace_buf_lock);
@@ -1580,12 +1698,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1580} 1698}
1581 1699
1582/* 1700/*
1583 * No necessary locking here. The worst thing which can
1584 * happen is loosing events consumed at the same time
1585 * by a trace_pipe reader.
1586 * Other than that, we don't risk to crash the ring buffer
1587 * because it serializes the readers.
1588 *
1589 * The current tracer is copied to avoid a global locking 1701 * The current tracer is copied to avoid a global locking
1590 * all around. 1702 * all around.
1591 */ 1703 */
@@ -1623,6 +1735,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1623 1735
1624 ftrace_enable_cpu(); 1736 ftrace_enable_cpu();
1625 1737
1738 iter->leftover = 0;
1626 for (p = iter; p && l < *pos; p = s_next(m, p, &l)) 1739 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1627 ; 1740 ;
1628 1741
@@ -1640,12 +1753,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1640 } 1753 }
1641 1754
1642 trace_event_read_lock(); 1755 trace_event_read_lock();
1756 trace_access_lock(cpu_file);
1643 return p; 1757 return p;
1644} 1758}
1645 1759
1646static void s_stop(struct seq_file *m, void *p) 1760static void s_stop(struct seq_file *m, void *p)
1647{ 1761{
1762 struct trace_iterator *iter = m->private;
1763
1648 atomic_dec(&trace_record_cmdline_disabled); 1764 atomic_dec(&trace_record_cmdline_disabled);
1765 trace_access_unlock(iter->cpu_file);
1649 trace_event_read_unlock(); 1766 trace_event_read_unlock();
1650} 1767}
1651 1768
@@ -2836,22 +2953,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2836 2953
2837 mutex_lock(&trace_types_lock); 2954 mutex_lock(&trace_types_lock);
2838 2955
2839 /* We only allow one reader per cpu */
2840 if (cpu_file == TRACE_PIPE_ALL_CPU) {
2841 if (!cpumask_empty(tracing_reader_cpumask)) {
2842 ret = -EBUSY;
2843 goto out;
2844 }
2845 cpumask_setall(tracing_reader_cpumask);
2846 } else {
2847 if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
2848 cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
2849 else {
2850 ret = -EBUSY;
2851 goto out;
2852 }
2853 }
2854
2855 /* create a buffer to store the information to pass to userspace */ 2956 /* create a buffer to store the information to pass to userspace */
2856 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2957 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2857 if (!iter) { 2958 if (!iter) {
@@ -2907,12 +3008,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2907 3008
2908 mutex_lock(&trace_types_lock); 3009 mutex_lock(&trace_types_lock);
2909 3010
2910 if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
2911 cpumask_clear(tracing_reader_cpumask);
2912 else
2913 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2914
2915
2916 if (iter->trace->pipe_close) 3011 if (iter->trace->pipe_close)
2917 iter->trace->pipe_close(iter); 3012 iter->trace->pipe_close(iter);
2918 3013
@@ -3074,6 +3169,7 @@ waitagain:
3074 iter->pos = -1; 3169 iter->pos = -1;
3075 3170
3076 trace_event_read_lock(); 3171 trace_event_read_lock();
3172 trace_access_lock(iter->cpu_file);
3077 while (find_next_entry_inc(iter) != NULL) { 3173 while (find_next_entry_inc(iter) != NULL) {
3078 enum print_line_t ret; 3174 enum print_line_t ret;
3079 int len = iter->seq.len; 3175 int len = iter->seq.len;
@@ -3090,6 +3186,7 @@ waitagain:
3090 if (iter->seq.len >= cnt) 3186 if (iter->seq.len >= cnt)
3091 break; 3187 break;
3092 } 3188 }
3189 trace_access_unlock(iter->cpu_file);
3093 trace_event_read_unlock(); 3190 trace_event_read_unlock();
3094 3191
3095 /* Now copy what we have to the user */ 3192 /* Now copy what we have to the user */
@@ -3215,6 +3312,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3215 } 3312 }
3216 3313
3217 trace_event_read_lock(); 3314 trace_event_read_lock();
3315 trace_access_lock(iter->cpu_file);
3218 3316
3219 /* Fill as many pages as possible. */ 3317 /* Fill as many pages as possible. */
3220 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3318 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
@@ -3238,6 +3336,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3238 trace_seq_init(&iter->seq); 3336 trace_seq_init(&iter->seq);
3239 } 3337 }
3240 3338
3339 trace_access_unlock(iter->cpu_file);
3241 trace_event_read_unlock(); 3340 trace_event_read_unlock();
3242 mutex_unlock(&iter->mutex); 3341 mutex_unlock(&iter->mutex);
3243 3342
@@ -3539,10 +3638,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3539 3638
3540 info->read = 0; 3639 info->read = 0;
3541 3640
3641 trace_access_lock(info->cpu);
3542 ret = ring_buffer_read_page(info->tr->buffer, 3642 ret = ring_buffer_read_page(info->tr->buffer,
3543 &info->spare, 3643 &info->spare,
3544 count, 3644 count,
3545 info->cpu, 0); 3645 info->cpu, 0);
3646 trace_access_unlock(info->cpu);
3546 if (ret < 0) 3647 if (ret < 0)
3547 return 0; 3648 return 0;
3548 3649
@@ -3670,6 +3771,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3670 len &= PAGE_MASK; 3771 len &= PAGE_MASK;
3671 } 3772 }
3672 3773
3774 trace_access_lock(info->cpu);
3673 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3775 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3674 3776
3675 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { 3777 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
@@ -3717,6 +3819,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3717 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3819 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3718 } 3820 }
3719 3821
3822 trace_access_unlock(info->cpu);
3720 spd.nr_pages = i; 3823 spd.nr_pages = i;
3721 3824
3722 /* did we read anything? */ 3825 /* did we read anything? */
@@ -3949,7 +4052,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
3949 if (!!(topt->flags->val & topt->opt->bit) != val) { 4052 if (!!(topt->flags->val & topt->opt->bit) != val) {
3950 mutex_lock(&trace_types_lock); 4053 mutex_lock(&trace_types_lock);
3951 ret = __set_tracer_option(current_trace, topt->flags, 4054 ret = __set_tracer_option(current_trace, topt->flags,
3952 topt->opt, val); 4055 topt->opt, !val);
3953 mutex_unlock(&trace_types_lock); 4056 mutex_unlock(&trace_types_lock);
3954 if (ret) 4057 if (ret)
3955 return ret; 4058 return ret;
@@ -4153,6 +4256,8 @@ static __init int tracer_init_debugfs(void)
4153 struct dentry *d_tracer; 4256 struct dentry *d_tracer;
4154 int cpu; 4257 int cpu;
4155 4258
4259 trace_access_lock_init();
4260
4156 d_tracer = tracing_init_dentry(); 4261 d_tracer = tracing_init_dentry();
4157 4262
4158 trace_create_file("tracing_enabled", 0644, d_tracer, 4263 trace_create_file("tracing_enabled", 0644, d_tracer,
@@ -4176,10 +4281,10 @@ static __init int tracer_init_debugfs(void)
4176#ifdef CONFIG_TRACER_MAX_TRACE 4281#ifdef CONFIG_TRACER_MAX_TRACE
4177 trace_create_file("tracing_max_latency", 0644, d_tracer, 4282 trace_create_file("tracing_max_latency", 0644, d_tracer,
4178 &tracing_max_latency, &tracing_max_lat_fops); 4283 &tracing_max_latency, &tracing_max_lat_fops);
4284#endif
4179 4285
4180 trace_create_file("tracing_thresh", 0644, d_tracer, 4286 trace_create_file("tracing_thresh", 0644, d_tracer,
4181 &tracing_thresh, &tracing_max_lat_fops); 4287 &tracing_thresh, &tracing_max_lat_fops);
4182#endif
4183 4288
4184 trace_create_file("README", 0444, d_tracer, 4289 trace_create_file("README", 0444, d_tracer,
4185 NULL, &tracing_readme_fops); 4290 NULL, &tracing_readme_fops);
@@ -4387,9 +4492,6 @@ __init static int tracer_alloc_buffers(void)
4387 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4492 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4388 goto out_free_buffer_mask; 4493 goto out_free_buffer_mask;
4389 4494
4390 if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4391 goto out_free_tracing_cpumask;
4392
4393 /* To save memory, keep the ring buffer size to its minimum */ 4495 /* To save memory, keep the ring buffer size to its minimum */
4394 if (ring_buffer_expanded) 4496 if (ring_buffer_expanded)
4395 ring_buf_size = trace_buf_size; 4497 ring_buf_size = trace_buf_size;
@@ -4447,8 +4549,6 @@ __init static int tracer_alloc_buffers(void)
4447 return 0; 4549 return 0;
4448 4550
4449out_free_cpumask: 4551out_free_cpumask:
4450 free_cpumask_var(tracing_reader_cpumask);
4451out_free_tracing_cpumask:
4452 free_cpumask_var(tracing_cpumask); 4552 free_cpumask_var(tracing_cpumask);
4453out_free_buffer_mask: 4553out_free_buffer_mask:
4454 free_cpumask_var(tracing_buffer_mask); 4554 free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4df6a77eb196..2825ef2c0b15 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -396,9 +396,10 @@ extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
396 396
397extern unsigned long nsecs_to_usecs(unsigned long nsecs); 397extern unsigned long nsecs_to_usecs(unsigned long nsecs);
398 398
399extern unsigned long tracing_thresh;
400
399#ifdef CONFIG_TRACER_MAX_TRACE 401#ifdef CONFIG_TRACER_MAX_TRACE
400extern unsigned long tracing_max_latency; 402extern unsigned long tracing_max_latency;
401extern unsigned long tracing_thresh;
402 403
403void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 404void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
404void update_max_tr_single(struct trace_array *tr, 405void update_max_tr_single(struct trace_array *tr,
@@ -497,6 +498,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
497#ifdef CONFIG_DYNAMIC_FTRACE 498#ifdef CONFIG_DYNAMIC_FTRACE
498/* TODO: make this variable */ 499/* TODO: make this variable */
499#define FTRACE_GRAPH_MAX_FUNCS 32 500#define FTRACE_GRAPH_MAX_FUNCS 32
501extern int ftrace_graph_filter_enabled;
500extern int ftrace_graph_count; 502extern int ftrace_graph_count;
501extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; 503extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
502 504
@@ -504,7 +506,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
504{ 506{
505 int i; 507 int i;
506 508
507 if (!ftrace_graph_count || test_tsk_trace_graph(current)) 509 if (!ftrace_graph_filter_enabled)
508 return 1; 510 return 1;
509 511
510 for (i = 0; i < ftrace_graph_count; i++) { 512 for (i = 0; i < ftrace_graph_count; i++) {
@@ -549,7 +551,7 @@ static inline int ftrace_trace_task(struct task_struct *task)
549 * struct trace_parser - servers for reading the user input separated by spaces 551 * struct trace_parser - servers for reading the user input separated by spaces
550 * @cont: set if the input is not complete - no final space char was found 552 * @cont: set if the input is not complete - no final space char was found
551 * @buffer: holds the parsed user input 553 * @buffer: holds the parsed user input
552 * @idx: user input lenght 554 * @idx: user input length
553 * @size: buffer size 555 * @size: buffer size
554 */ 556 */
555struct trace_parser { 557struct trace_parser {
@@ -791,7 +793,8 @@ extern const char *__stop___trace_bprintk_fmt[];
791 793
792#undef FTRACE_ENTRY 794#undef FTRACE_ENTRY
793#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ 795#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
794 extern struct ftrace_event_call event_##call; 796 extern struct ftrace_event_call \
797 __attribute__((__aligned__(4))) event_##call;
795#undef FTRACE_ENTRY_DUP 798#undef FTRACE_ENTRY_DUP
796#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ 799#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
797 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 800 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4a194f08f88c..b9bc4d470177 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -307,8 +307,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
307 return -1; 307 return -1;
308 if (percent_a > percent_b) 308 if (percent_a > percent_b)
309 return 1; 309 return 1;
310 else 310
311 return 0; 311 if (a->incorrect < b->incorrect)
312 return -1;
313 if (a->incorrect > b->incorrect)
314 return 1;
315
316 /*
317 * Since the above shows worse (incorrect) cases
318 * first, we continue that by showing best (correct)
319 * cases last.
320 */
321 if (a->correct > b->correct)
322 return -1;
323 if (a->correct < b->correct)
324 return 1;
325
326 return 0;
312} 327}
313 328
314static struct tracer_stat annotated_branch_stats = { 329static struct tracer_stat annotated_branch_stats = {
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 84a3a7ba072a..6fbfb8f417b9 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -13,6 +13,7 @@
13 * Tracer plugins will chose a default from these clocks. 13 * Tracer plugins will chose a default from these clocks.
14 */ 14 */
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/irqflags.h>
16#include <linux/hardirq.h> 17#include <linux/hardirq.h>
17#include <linux/module.h> 18#include <linux/module.h>
18#include <linux/percpu.h> 19#include <linux/percpu.h>
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 9e25573242cf..c1cc3ab633de 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -6,14 +6,12 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/kprobes.h>
9#include "trace.h" 10#include "trace.h"
10 11
11 12
12char *perf_trace_buf; 13static char *perf_trace_buf;
13EXPORT_SYMBOL_GPL(perf_trace_buf); 14static char *perf_trace_buf_nmi;
14
15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
17 15
18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; 16typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
19 17
@@ -120,3 +118,47 @@ void ftrace_profile_disable(int event_id)
120 } 118 }
121 mutex_unlock(&event_mutex); 119 mutex_unlock(&event_mutex);
122} 120}
121
122__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
123 int *rctxp, unsigned long *irq_flags)
124{
125 struct trace_entry *entry;
126 char *trace_buf, *raw_data;
127 int pc, cpu;
128
129 pc = preempt_count();
130
131 /* Protect the per cpu buffer, begin the rcu read side */
132 local_irq_save(*irq_flags);
133
134 *rctxp = perf_swevent_get_recursion_context();
135 if (*rctxp < 0)
136 goto err_recursion;
137
138 cpu = smp_processor_id();
139
140 if (in_nmi())
141 trace_buf = rcu_dereference_sched(perf_trace_buf_nmi);
142 else
143 trace_buf = rcu_dereference_sched(perf_trace_buf);
144
145 if (!trace_buf)
146 goto err;
147
148 raw_data = per_cpu_ptr(trace_buf, cpu);
149
150 /* zero the dead bytes from align to not leak stack to user */
151 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
152
153 entry = (struct trace_entry *)raw_data;
154 tracing_generic_entry_update(entry, *irq_flags, pc);
155 entry->type = type;
156
157 return raw_data;
158err:
159 perf_swevent_put_recursion_context(*rctxp);
160err_recursion:
161 local_irq_restore(*irq_flags);
162 return NULL;
163}
164EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 189b09baf4fb..3f972ad98d04 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -60,10 +60,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
60 return 0; 60 return 0;
61 61
62err: 62err:
63 if (field) { 63 if (field)
64 kfree(field->name); 64 kfree(field->name);
65 kfree(field->type);
66 }
67 kfree(field); 65 kfree(field);
68 66
69 return -ENOMEM; 67 return -ENOMEM;
@@ -520,41 +518,16 @@ out:
520 return ret; 518 return ret;
521} 519}
522 520
523extern char *__bad_type_size(void);
524
525#undef FIELD
526#define FIELD(type, name) \
527 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
528 #type, "common_" #name, offsetof(typeof(field), name), \
529 sizeof(field.name), is_signed_type(type)
530
531static int trace_write_header(struct trace_seq *s)
532{
533 struct trace_entry field;
534
535 /* struct trace_entry */
536 return trace_seq_printf(s,
537 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
538 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
539 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
540 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
541 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
542 "\n",
543 FIELD(unsigned short, type),
544 FIELD(unsigned char, flags),
545 FIELD(unsigned char, preempt_count),
546 FIELD(int, pid),
547 FIELD(int, lock_depth));
548}
549
550static ssize_t 521static ssize_t
551event_format_read(struct file *filp, char __user *ubuf, size_t cnt, 522event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
552 loff_t *ppos) 523 loff_t *ppos)
553{ 524{
554 struct ftrace_event_call *call = filp->private_data; 525 struct ftrace_event_call *call = filp->private_data;
526 struct ftrace_event_field *field;
555 struct trace_seq *s; 527 struct trace_seq *s;
528 int common_field_count = 5;
556 char *buf; 529 char *buf;
557 int r; 530 int r = 0;
558 531
559 if (*ppos) 532 if (*ppos)
560 return 0; 533 return 0;
@@ -565,14 +538,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
565 538
566 trace_seq_init(s); 539 trace_seq_init(s);
567 540
568 /* If any of the first writes fail, so will the show_format. */
569
570 trace_seq_printf(s, "name: %s\n", call->name); 541 trace_seq_printf(s, "name: %s\n", call->name);
571 trace_seq_printf(s, "ID: %d\n", call->id); 542 trace_seq_printf(s, "ID: %d\n", call->id);
572 trace_seq_printf(s, "format:\n"); 543 trace_seq_printf(s, "format:\n");
573 trace_write_header(s);
574 544
575 r = call->show_format(call, s); 545 list_for_each_entry_reverse(field, &call->fields, link) {
546 /*
547 * Smartly shows the array type(except dynamic array).
548 * Normal:
549 * field:TYPE VAR
550 * If TYPE := TYPE[LEN], it is shown:
551 * field:TYPE VAR[LEN]
552 */
553 const char *array_descriptor = strchr(field->type, '[');
554
555 if (!strncmp(field->type, "__data_loc", 10))
556 array_descriptor = NULL;
557
558 if (!array_descriptor) {
559 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
560 "\tsize:%u;\tsigned:%d;\n",
561 field->type, field->name, field->offset,
562 field->size, !!field->is_signed);
563 } else {
564 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
565 "\tsize:%u;\tsigned:%d;\n",
566 (int)(array_descriptor - field->type),
567 field->type, field->name,
568 array_descriptor, field->offset,
569 field->size, !!field->is_signed);
570 }
571
572 if (--common_field_count == 0)
573 r = trace_seq_printf(s, "\n");
574
575 if (!r)
576 break;
577 }
578
579 if (r)
580 r = trace_seq_printf(s, "\nprint fmt: %s\n",
581 call->print_fmt);
582
576 if (!r) { 583 if (!r) {
577 /* 584 /*
578 * ug! The format output is bigger than a PAGE!! 585 * ug! The format output is bigger than a PAGE!!
@@ -948,10 +955,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
948 filter); 955 filter);
949 } 956 }
950 957
951 /* A trace may not want to export its format */
952 if (!call->show_format)
953 return 0;
954
955 trace_create_file("format", 0444, call->dir, call, 958 trace_create_file("format", 0444, call->dir, call,
956 format); 959 format);
957 960
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 50504cb228de..4615f62a04f1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -211,8 +211,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
211{ 211{
212 char **addr = (char **)(event + pred->offset); 212 char **addr = (char **)(event + pred->offset);
213 int cmp, match; 213 int cmp, match;
214 int len = strlen(*addr) + 1; /* including tailing '\0' */
214 215
215 cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len); 216 cmp = pred->regex.match(*addr, &pred->regex, len);
216 217
217 match = cmp ^ pred->not; 218 match = cmp ^ pred->not;
218 219
@@ -251,7 +252,18 @@ static int filter_pred_none(struct filter_pred *pred, void *event,
251 return 0; 252 return 0;
252} 253}
253 254
254/* Basic regex callbacks */ 255/*
256 * regex_match_foo - Basic regex callbacks
257 *
258 * @str: the string to be searched
259 * @r: the regex structure containing the pattern string
260 * @len: the length of the string to be searched (including '\0')
261 *
262 * Note:
263 * - @str might not be NULL-terminated if it's of type DYN_STRING
264 * or STATIC_STRING
265 */
266
255static int regex_match_full(char *str, struct regex *r, int len) 267static int regex_match_full(char *str, struct regex *r, int len)
256{ 268{
257 if (strncmp(str, r->pattern, len) == 0) 269 if (strncmp(str, r->pattern, len) == 0)
@@ -261,23 +273,24 @@ static int regex_match_full(char *str, struct regex *r, int len)
261 273
262static int regex_match_front(char *str, struct regex *r, int len) 274static int regex_match_front(char *str, struct regex *r, int len)
263{ 275{
264 if (strncmp(str, r->pattern, len) == 0) 276 if (strncmp(str, r->pattern, r->len) == 0)
265 return 1; 277 return 1;
266 return 0; 278 return 0;
267} 279}
268 280
269static int regex_match_middle(char *str, struct regex *r, int len) 281static int regex_match_middle(char *str, struct regex *r, int len)
270{ 282{
271 if (strstr(str, r->pattern)) 283 if (strnstr(str, r->pattern, len))
272 return 1; 284 return 1;
273 return 0; 285 return 0;
274} 286}
275 287
276static int regex_match_end(char *str, struct regex *r, int len) 288static int regex_match_end(char *str, struct regex *r, int len)
277{ 289{
278 char *ptr = strstr(str, r->pattern); 290 int strlen = len - 1;
279 291
280 if (ptr && (ptr[r->len] == 0)) 292 if (strlen >= r->len &&
293 memcmp(str + strlen - r->len, r->pattern, r->len) == 0)
281 return 1; 294 return 1;
282 return 0; 295 return 0;
283} 296}
@@ -781,10 +794,8 @@ static int filter_add_pred(struct filter_parse_state *ps,
781 pred->regex.field_len = field->size; 794 pred->regex.field_len = field->size;
782 } else if (field->filter_type == FILTER_DYN_STRING) 795 } else if (field->filter_type == FILTER_DYN_STRING)
783 fn = filter_pred_strloc; 796 fn = filter_pred_strloc;
784 else { 797 else
785 fn = filter_pred_pchar; 798 fn = filter_pred_pchar;
786 pred->regex.field_len = strlen(pred->regex.pattern);
787 }
788 } else { 799 } else {
789 if (field->is_signed) 800 if (field->is_signed)
790 ret = strict_strtoll(pred->regex.pattern, 0, &val); 801 ret = strict_strtoll(pred->regex.pattern, 0, &val);
@@ -1360,7 +1371,7 @@ out_unlock:
1360 return err; 1371 return err;
1361} 1372}
1362 1373
1363#ifdef CONFIG_EVENT_PROFILE 1374#ifdef CONFIG_PERF_EVENTS
1364 1375
1365void ftrace_profile_free_filter(struct perf_event *event) 1376void ftrace_profile_free_filter(struct perf_event *event)
1366{ 1377{
@@ -1428,5 +1439,5 @@ out_unlock:
1428 return err; 1439 return err;
1429} 1440}
1430 1441
1431#endif /* CONFIG_EVENT_PROFILE */ 1442#endif /* CONFIG_PERF_EVENTS */
1432 1443
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 458e5bfe26d0..e091f64ba6ce 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \
62 62
63#include "trace_entries.h" 63#include "trace_entries.h"
64 64
65
66#undef __field
67#define __field(type, item) \
68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
70 offsetof(typeof(field), item), \
71 sizeof(field.item), is_signed_type(type)); \
72 if (!ret) \
73 return 0;
74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item), \
81 is_signed_type(type)); \
82 if (!ret) \
83 return 0;
84
85#undef __array
86#define __array(type, item, len) \
87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
89 offsetof(typeof(field), item), \
90 sizeof(field.item), is_signed_type(type)); \
91 if (!ret) \
92 return 0;
93
94#undef __array_desc
95#define __array_desc(type, container, item, len) \
96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
98 offsetof(typeof(field), container.item), \
99 sizeof(field.container.item), \
100 is_signed_type(type)); \
101 if (!ret) \
102 return 0;
103
104#undef __dynamic_array
105#define __dynamic_array(type, item) \
106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
110 if (!ret) \
111 return 0;
112
113#undef F_printk
114#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
115
116#undef __entry
117#define __entry REC
118
119#undef FTRACE_ENTRY
120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
121static int \
122ftrace_format_##name(struct ftrace_event_call *unused, \
123 struct trace_seq *s) \
124{ \
125 struct struct_name field __attribute__((unused)); \
126 int ret = 0; \
127 \
128 tstruct; \
129 \
130 trace_seq_printf(s, "\nprint fmt: " print); \
131 \
132 return ret; \
133}
134
135#include "trace_entries.h"
136
137#undef __field 65#undef __field
138#define __field(type, item) \ 66#define __field(type, item) \
139 ret = trace_define_field(event_call, #type, #item, \ 67 ret = trace_define_field(event_call, #type, #item, \
@@ -158,7 +86,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
158 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 86 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
159 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 87 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
160 offsetof(typeof(field), item), \ 88 offsetof(typeof(field), item), \
161 sizeof(field.item), 0, FILTER_OTHER); \ 89 sizeof(field.item), \
90 is_signed_type(type), FILTER_OTHER); \
162 if (ret) \ 91 if (ret) \
163 return ret; 92 return ret;
164 93
@@ -168,13 +97,18 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
168 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 97 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
169 offsetof(typeof(field), \ 98 offsetof(typeof(field), \
170 container.item), \ 99 container.item), \
171 sizeof(field.container.item), 0, \ 100 sizeof(field.container.item), \
172 FILTER_OTHER); \ 101 is_signed_type(type), FILTER_OTHER); \
173 if (ret) \ 102 if (ret) \
174 return ret; 103 return ret;
175 104
176#undef __dynamic_array 105#undef __dynamic_array
177#define __dynamic_array(type, item) 106#define __dynamic_array(type, item) \
107 ret = trace_define_field(event_call, #type, #item, \
108 offsetof(typeof(field), item), \
109 0, is_signed_type(type), FILTER_OTHER);\
110 if (ret) \
111 return ret;
178 112
179#undef FTRACE_ENTRY 113#undef FTRACE_ENTRY
180#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 114#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
@@ -197,6 +131,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
197 return 0; 131 return 0;
198} 132}
199 133
134#undef __entry
135#define __entry REC
136
200#undef __field 137#undef __field
201#define __field(type, item) 138#define __field(type, item)
202 139
@@ -212,6 +149,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
212#undef __dynamic_array 149#undef __dynamic_array
213#define __dynamic_array(type, item) 150#define __dynamic_array(type, item)
214 151
152#undef F_printk
153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154
215#undef FTRACE_ENTRY 155#undef FTRACE_ENTRY
216#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 156#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
217 \ 157 \
@@ -222,7 +162,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
222 .id = type, \ 162 .id = type, \
223 .system = __stringify(TRACE_SYSTEM), \ 163 .system = __stringify(TRACE_SYSTEM), \
224 .raw_init = ftrace_raw_init_event, \ 164 .raw_init = ftrace_raw_init_event, \
225 .show_format = ftrace_format_##call, \ 165 .print_fmt = print, \
226 .define_fields = ftrace_define_fields_##call, \ 166 .define_fields = ftrace_define_fields_##call, \
227}; \ 167}; \
228 168
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b1342c5d37cf..e6989d9b44da 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -18,6 +18,7 @@ struct fgraph_cpu_data {
18 pid_t last_pid; 18 pid_t last_pid;
19 int depth; 19 int depth;
20 int ignore; 20 int ignore;
21 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
21}; 22};
22 23
23struct fgraph_data { 24struct fgraph_data {
@@ -187,7 +188,7 @@ static int __trace_graph_entry(struct trace_array *tr,
187 struct ring_buffer *buffer = tr->buffer; 188 struct ring_buffer *buffer = tr->buffer;
188 struct ftrace_graph_ent_entry *entry; 189 struct ftrace_graph_ent_entry *entry;
189 190
190 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 191 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
191 return 0; 192 return 0;
192 193
193 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, 194 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -212,13 +213,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
212 int cpu; 213 int cpu;
213 int pc; 214 int pc;
214 215
215 if (unlikely(!tr))
216 return 0;
217
218 if (!ftrace_trace_task(current)) 216 if (!ftrace_trace_task(current))
219 return 0; 217 return 0;
220 218
221 if (!ftrace_graph_addr(trace->func)) 219 /* trace it when it is-nested-in or is a function enabled. */
220 if (!(trace->depth || ftrace_graph_addr(trace->func)))
222 return 0; 221 return 0;
223 222
224 local_irq_save(flags); 223 local_irq_save(flags);
@@ -231,9 +230,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
231 } else { 230 } else {
232 ret = 0; 231 ret = 0;
233 } 232 }
234 /* Only do the atomic if it is not already set */
235 if (!test_tsk_trace_graph(current))
236 set_tsk_trace_graph(current);
237 233
238 atomic_dec(&data->disabled); 234 atomic_dec(&data->disabled);
239 local_irq_restore(flags); 235 local_irq_restore(flags);
@@ -241,6 +237,14 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
241 return ret; 237 return ret;
242} 238}
243 239
240int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
241{
242 if (tracing_thresh)
243 return 1;
244 else
245 return trace_graph_entry(trace);
246}
247
244static void __trace_graph_return(struct trace_array *tr, 248static void __trace_graph_return(struct trace_array *tr,
245 struct ftrace_graph_ret *trace, 249 struct ftrace_graph_ret *trace,
246 unsigned long flags, 250 unsigned long flags,
@@ -251,7 +255,7 @@ static void __trace_graph_return(struct trace_array *tr,
251 struct ring_buffer *buffer = tr->buffer; 255 struct ring_buffer *buffer = tr->buffer;
252 struct ftrace_graph_ret_entry *entry; 256 struct ftrace_graph_ret_entry *entry;
253 257
254 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 258 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
255 return; 259 return;
256 260
257 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, 261 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -281,19 +285,39 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
281 pc = preempt_count(); 285 pc = preempt_count();
282 __trace_graph_return(tr, trace, flags, pc); 286 __trace_graph_return(tr, trace, flags, pc);
283 } 287 }
284 if (!trace->depth)
285 clear_tsk_trace_graph(current);
286 atomic_dec(&data->disabled); 288 atomic_dec(&data->disabled);
287 local_irq_restore(flags); 289 local_irq_restore(flags);
288} 290}
289 291
292void set_graph_array(struct trace_array *tr)
293{
294 graph_array = tr;
295
296 /* Make graph_array visible before we start tracing */
297
298 smp_mb();
299}
300
301void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
302{
303 if (tracing_thresh &&
304 (trace->rettime - trace->calltime < tracing_thresh))
305 return;
306 else
307 trace_graph_return(trace);
308}
309
290static int graph_trace_init(struct trace_array *tr) 310static int graph_trace_init(struct trace_array *tr)
291{ 311{
292 int ret; 312 int ret;
293 313
294 graph_array = tr; 314 set_graph_array(tr);
295 ret = register_ftrace_graph(&trace_graph_return, 315 if (tracing_thresh)
296 &trace_graph_entry); 316 ret = register_ftrace_graph(&trace_graph_thresh_return,
317 &trace_graph_thresh_entry);
318 else
319 ret = register_ftrace_graph(&trace_graph_return,
320 &trace_graph_entry);
297 if (ret) 321 if (ret)
298 return ret; 322 return ret;
299 tracing_start_cmdline_record(); 323 tracing_start_cmdline_record();
@@ -301,11 +325,6 @@ static int graph_trace_init(struct trace_array *tr)
301 return 0; 325 return 0;
302} 326}
303 327
304void set_graph_array(struct trace_array *tr)
305{
306 graph_array = tr;
307}
308
309static void graph_trace_reset(struct trace_array *tr) 328static void graph_trace_reset(struct trace_array *tr)
310{ 329{
311 tracing_stop_cmdline_record(); 330 tracing_stop_cmdline_record();
@@ -673,15 +692,21 @@ print_graph_entry_leaf(struct trace_iterator *iter,
673 duration = graph_ret->rettime - graph_ret->calltime; 692 duration = graph_ret->rettime - graph_ret->calltime;
674 693
675 if (data) { 694 if (data) {
695 struct fgraph_cpu_data *cpu_data;
676 int cpu = iter->cpu; 696 int cpu = iter->cpu;
677 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 697
698 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
678 699
679 /* 700 /*
680 * Comments display at + 1 to depth. Since 701 * Comments display at + 1 to depth. Since
681 * this is a leaf function, keep the comments 702 * this is a leaf function, keep the comments
682 * equal to this depth. 703 * equal to this depth.
683 */ 704 */
684 *depth = call->depth - 1; 705 cpu_data->depth = call->depth - 1;
706
707 /* No need to keep this function around for this depth */
708 if (call->depth < FTRACE_RETFUNC_DEPTH)
709 cpu_data->enter_funcs[call->depth] = 0;
685 } 710 }
686 711
687 /* Overhead */ 712 /* Overhead */
@@ -721,10 +746,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
721 int i; 746 int i;
722 747
723 if (data) { 748 if (data) {
749 struct fgraph_cpu_data *cpu_data;
724 int cpu = iter->cpu; 750 int cpu = iter->cpu;
725 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
726 751
727 *depth = call->depth; 752 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
753 cpu_data->depth = call->depth;
754
755 /* Save this function pointer to see if the exit matches */
756 if (call->depth < FTRACE_RETFUNC_DEPTH)
757 cpu_data->enter_funcs[call->depth] = call->func;
728 } 758 }
729 759
730 /* No overhead */ 760 /* No overhead */
@@ -854,19 +884,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
854 struct fgraph_data *data = iter->private; 884 struct fgraph_data *data = iter->private;
855 pid_t pid = ent->pid; 885 pid_t pid = ent->pid;
856 int cpu = iter->cpu; 886 int cpu = iter->cpu;
887 int func_match = 1;
857 int ret; 888 int ret;
858 int i; 889 int i;
859 890
860 if (data) { 891 if (data) {
892 struct fgraph_cpu_data *cpu_data;
861 int cpu = iter->cpu; 893 int cpu = iter->cpu;
862 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 894
895 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
863 896
864 /* 897 /*
865 * Comments display at + 1 to depth. This is the 898 * Comments display at + 1 to depth. This is the
866 * return from a function, we now want the comments 899 * return from a function, we now want the comments
867 * to display at the same level of the bracket. 900 * to display at the same level of the bracket.
868 */ 901 */
869 *depth = trace->depth - 1; 902 cpu_data->depth = trace->depth - 1;
903
904 if (trace->depth < FTRACE_RETFUNC_DEPTH) {
905 if (cpu_data->enter_funcs[trace->depth] != trace->func)
906 func_match = 0;
907 cpu_data->enter_funcs[trace->depth] = 0;
908 }
870 } 909 }
871 910
872 if (print_graph_prologue(iter, s, 0, 0)) 911 if (print_graph_prologue(iter, s, 0, 0))
@@ -891,9 +930,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
891 return TRACE_TYPE_PARTIAL_LINE; 930 return TRACE_TYPE_PARTIAL_LINE;
892 } 931 }
893 932
894 ret = trace_seq_printf(s, "}\n"); 933 /*
895 if (!ret) 934 * If the return function does not have a matching entry,
896 return TRACE_TYPE_PARTIAL_LINE; 935 * then the entry was lost. Instead of just printing
936 * the '}' and letting the user guess what function this
937 * belongs to, write out the function name.
938 */
939 if (func_match) {
940 ret = trace_seq_printf(s, "}\n");
941 if (!ret)
942 return TRACE_TYPE_PARTIAL_LINE;
943 } else {
944 ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
945 if (!ret)
946 return TRACE_TYPE_PARTIAL_LINE;
947 }
897 948
898 /* Overrun */ 949 /* Overrun */
899 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 950 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 375f81a568dc..505c92273b1a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -91,11 +91,6 @@ static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
91 return retval; 91 return retval;
92} 92}
93 93
94static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
95{
96 return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
97}
98
99static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, 94static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
100 void *dummy) 95 void *dummy)
101{ 96{
@@ -231,9 +226,7 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
231{ 226{
232 int ret = -EINVAL; 227 int ret = -EINVAL;
233 228
234 if (ff->func == fetch_argument) 229 if (ff->func == fetch_register) {
235 ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
236 else if (ff->func == fetch_register) {
237 const char *name; 230 const char *name;
238 name = regs_query_register_name((unsigned int)((long)ff->data)); 231 name = regs_query_register_name((unsigned int)((long)ff->data));
239 ret = snprintf(buf, n, "%%%s", name); 232 ret = snprintf(buf, n, "%%%s", name);
@@ -489,14 +482,6 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
489 } 482 }
490 } else 483 } else
491 ret = -EINVAL; 484 ret = -EINVAL;
492 } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
493 ret = strict_strtoul(arg + 3, 10, &param);
494 if (ret || param > PARAM_MAX_ARGS)
495 ret = -EINVAL;
496 else {
497 ff->func = fetch_argument;
498 ff->data = (void *)param;
499 }
500 } else 485 } else
501 ret = -EINVAL; 486 ret = -EINVAL;
502 return ret; 487 return ret;
@@ -611,7 +596,6 @@ static int create_trace_probe(int argc, char **argv)
611 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] 596 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
612 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] 597 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
613 * Fetch args: 598 * Fetch args:
614 * $argN : fetch Nth of function argument. (N:0-)
615 * $retval : fetch return value 599 * $retval : fetch return value
616 * $stack : fetch stack address 600 * $stack : fetch stack address
617 * $stackN : fetch Nth of stack (N:0-) 601 * $stackN : fetch Nth of stack (N:0-)
@@ -651,12 +635,12 @@ static int create_trace_probe(int argc, char **argv)
651 event = strchr(group, '/') + 1; 635 event = strchr(group, '/') + 1;
652 event[-1] = '\0'; 636 event[-1] = '\0';
653 if (strlen(group) == 0) { 637 if (strlen(group) == 0) {
654 pr_info("Group name is not specifiled\n"); 638 pr_info("Group name is not specified\n");
655 return -EINVAL; 639 return -EINVAL;
656 } 640 }
657 } 641 }
658 if (strlen(event) == 0) { 642 if (strlen(event) == 0) {
659 pr_info("Event name is not specifiled\n"); 643 pr_info("Event name is not specified\n");
660 return -EINVAL; 644 return -EINVAL;
661 } 645 }
662 } 646 }
@@ -689,7 +673,7 @@ static int create_trace_probe(int argc, char **argv)
689 return -EINVAL; 673 return -EINVAL;
690 } 674 }
691 /* an address specified */ 675 /* an address specified */
692 ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); 676 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
693 if (ret) { 677 if (ret) {
694 pr_info("Failed to parse address.\n"); 678 pr_info("Failed to parse address.\n");
695 return ret; 679 return ret;
@@ -958,7 +942,7 @@ static const struct file_operations kprobe_profile_ops = {
958}; 942};
959 943
960/* Kprobe handler */ 944/* Kprobe handler */
961static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 945static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
962{ 946{
963 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 947 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
964 struct kprobe_trace_entry *entry; 948 struct kprobe_trace_entry *entry;
@@ -978,7 +962,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
978 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 962 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
979 irq_flags, pc); 963 irq_flags, pc);
980 if (!event) 964 if (!event)
981 return 0; 965 return;
982 966
983 entry = ring_buffer_event_data(event); 967 entry = ring_buffer_event_data(event);
984 entry->nargs = tp->nr_args; 968 entry->nargs = tp->nr_args;
@@ -988,11 +972,10 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
988 972
989 if (!filter_current_check_discard(buffer, call, entry, event)) 973 if (!filter_current_check_discard(buffer, call, entry, event))
990 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 974 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
991 return 0;
992} 975}
993 976
994/* Kretprobe handler */ 977/* Kretprobe handler */
995static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, 978static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
996 struct pt_regs *regs) 979 struct pt_regs *regs)
997{ 980{
998 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 981 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1011,7 +994,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
1011 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 994 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
1012 irq_flags, pc); 995 irq_flags, pc);
1013 if (!event) 996 if (!event)
1014 return 0; 997 return;
1015 998
1016 entry = ring_buffer_event_data(event); 999 entry = ring_buffer_event_data(event);
1017 entry->nargs = tp->nr_args; 1000 entry->nargs = tp->nr_args;
@@ -1022,8 +1005,6 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
1022 1005
1023 if (!filter_current_check_discard(buffer, call, entry, event)) 1006 if (!filter_current_check_discard(buffer, call, entry, event))
1024 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1007 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
1025
1026 return 0;
1027} 1008}
1028 1009
1029/* Event entry printers */ 1010/* Event entry printers */
@@ -1174,212 +1155,123 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1174 return 0; 1155 return 0;
1175} 1156}
1176 1157
1177static int __probe_event_show_format(struct trace_seq *s, 1158static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1178 struct trace_probe *tp, const char *fmt,
1179 const char *arg)
1180{ 1159{
1181 int i; 1160 int i;
1161 int pos = 0;
1182 1162
1183 /* Show format */ 1163 const char *fmt, *arg;
1184 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1185 return 0;
1186 1164
1187 for (i = 0; i < tp->nr_args; i++) 1165 if (!probe_is_return(tp)) {
1188 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) 1166 fmt = "(%lx)";
1189 return 0; 1167 arg = "REC->" FIELD_STRING_IP;
1168 } else {
1169 fmt = "(%lx <- %lx)";
1170 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
1171 }
1190 1172
1191 if (!trace_seq_printf(s, "\", %s", arg)) 1173 /* When len=0, we just calculate the needed length */
1192 return 0; 1174#define LEN_OR_ZERO (len ? len - pos : 0)
1193 1175
1194 for (i = 0; i < tp->nr_args; i++) 1176 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1195 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
1196 return 0;
1197
1198 return trace_seq_puts(s, "\n");
1199}
1200 1177
1201#undef SHOW_FIELD 1178 for (i = 0; i < tp->nr_args; i++) {
1202#define SHOW_FIELD(type, item, name) \ 1179 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx",
1203 do { \ 1180 tp->args[i].name);
1204 ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \ 1181 }
1205 "offset:%u;\tsize:%u;\n", name, \
1206 (unsigned int)offsetof(typeof(field), item),\
1207 (unsigned int)sizeof(type)); \
1208 if (!ret) \
1209 return 0; \
1210 } while (0)
1211 1182
1212static int kprobe_event_show_format(struct ftrace_event_call *call, 1183 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1213 struct trace_seq *s)
1214{
1215 struct kprobe_trace_entry field __attribute__((unused));
1216 int ret, i;
1217 struct trace_probe *tp = (struct trace_probe *)call->data;
1218 1184
1219 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); 1185 for (i = 0; i < tp->nr_args; i++) {
1220 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); 1186 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1187 tp->args[i].name);
1188 }
1221 1189
1222 /* Show fields */ 1190#undef LEN_OR_ZERO
1223 for (i = 0; i < tp->nr_args; i++)
1224 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1225 trace_seq_puts(s, "\n");
1226 1191
1227 return __probe_event_show_format(s, tp, "(%lx)", 1192 /* return the length of print_fmt */
1228 "REC->" FIELD_STRING_IP); 1193 return pos;
1229} 1194}
1230 1195
1231static int kretprobe_event_show_format(struct ftrace_event_call *call, 1196static int set_print_fmt(struct trace_probe *tp)
1232 struct trace_seq *s)
1233{ 1197{
1234 struct kretprobe_trace_entry field __attribute__((unused)); 1198 int len;
1235 int ret, i; 1199 char *print_fmt;
1236 struct trace_probe *tp = (struct trace_probe *)call->data;
1237 1200
1238 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); 1201 /* First: called with 0 length to calculate the needed length */
1239 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); 1202 len = __set_print_fmt(tp, NULL, 0);
1240 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); 1203 print_fmt = kmalloc(len + 1, GFP_KERNEL);
1204 if (!print_fmt)
1205 return -ENOMEM;
1241 1206
1242 /* Show fields */ 1207 /* Second: actually write the @print_fmt */
1243 for (i = 0; i < tp->nr_args; i++) 1208 __set_print_fmt(tp, print_fmt, len + 1);
1244 SHOW_FIELD(unsigned long, args[i], tp->args[i].name); 1209 tp->call.print_fmt = print_fmt;
1245 trace_seq_puts(s, "\n");
1246 1210
1247 return __probe_event_show_format(s, tp, "(%lx <- %lx)", 1211 return 0;
1248 "REC->" FIELD_STRING_FUNC
1249 ", REC->" FIELD_STRING_RETIP);
1250} 1212}
1251 1213
1252#ifdef CONFIG_EVENT_PROFILE 1214#ifdef CONFIG_PERF_EVENTS
1253 1215
1254/* Kprobe profile handler */ 1216/* Kprobe profile handler */
1255static __kprobes int kprobe_profile_func(struct kprobe *kp, 1217static __kprobes void kprobe_profile_func(struct kprobe *kp,
1256 struct pt_regs *regs) 1218 struct pt_regs *regs)
1257{ 1219{
1258 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1220 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1259 struct ftrace_event_call *call = &tp->call; 1221 struct ftrace_event_call *call = &tp->call;
1260 struct kprobe_trace_entry *entry; 1222 struct kprobe_trace_entry *entry;
1261 struct trace_entry *ent; 1223 int size, __size, i;
1262 int size, __size, i, pc, __cpu;
1263 unsigned long irq_flags; 1224 unsigned long irq_flags;
1264 char *trace_buf;
1265 char *raw_data;
1266 int rctx; 1225 int rctx;
1267 1226
1268 pc = preempt_count();
1269 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1227 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1270 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1228 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1271 size -= sizeof(u32); 1229 size -= sizeof(u32);
1272 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1230 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1273 "profile buffer not large enough")) 1231 "profile buffer not large enough"))
1274 return 0; 1232 return;
1275
1276 /*
1277 * Protect the non nmi buffer
1278 * This also protects the rcu read side
1279 */
1280 local_irq_save(irq_flags);
1281
1282 rctx = perf_swevent_get_recursion_context();
1283 if (rctx < 0)
1284 goto end_recursion;
1285
1286 __cpu = smp_processor_id();
1287
1288 if (in_nmi())
1289 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1290 else
1291 trace_buf = rcu_dereference(perf_trace_buf);
1292 1233
1293 if (!trace_buf) 1234 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
1294 goto end; 1235 if (!entry)
1295 1236 return;
1296 raw_data = per_cpu_ptr(trace_buf, __cpu);
1297
1298 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1299 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1300 entry = (struct kprobe_trace_entry *)raw_data;
1301 ent = &entry->ent;
1302 1237
1303 tracing_generic_entry_update(ent, irq_flags, pc);
1304 ent->type = call->id;
1305 entry->nargs = tp->nr_args; 1238 entry->nargs = tp->nr_args;
1306 entry->ip = (unsigned long)kp->addr; 1239 entry->ip = (unsigned long)kp->addr;
1307 for (i = 0; i < tp->nr_args; i++) 1240 for (i = 0; i < tp->nr_args; i++)
1308 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1241 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1309 perf_tp_event(call->id, entry->ip, 1, entry, size);
1310
1311end:
1312 perf_swevent_put_recursion_context(rctx);
1313end_recursion:
1314 local_irq_restore(irq_flags);
1315 1242
1316 return 0; 1243 ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags);
1317} 1244}
1318 1245
1319/* Kretprobe profile handler */ 1246/* Kretprobe profile handler */
1320static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, 1247static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
1321 struct pt_regs *regs) 1248 struct pt_regs *regs)
1322{ 1249{
1323 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1250 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1324 struct ftrace_event_call *call = &tp->call; 1251 struct ftrace_event_call *call = &tp->call;
1325 struct kretprobe_trace_entry *entry; 1252 struct kretprobe_trace_entry *entry;
1326 struct trace_entry *ent; 1253 int size, __size, i;
1327 int size, __size, i, pc, __cpu;
1328 unsigned long irq_flags; 1254 unsigned long irq_flags;
1329 char *trace_buf;
1330 char *raw_data;
1331 int rctx; 1255 int rctx;
1332 1256
1333 pc = preempt_count();
1334 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1257 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1335 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1258 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1336 size -= sizeof(u32); 1259 size -= sizeof(u32);
1337 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1260 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1338 "profile buffer not large enough")) 1261 "profile buffer not large enough"))
1339 return 0; 1262 return;
1340
1341 /*
1342 * Protect the non nmi buffer
1343 * This also protects the rcu read side
1344 */
1345 local_irq_save(irq_flags);
1346
1347 rctx = perf_swevent_get_recursion_context();
1348 if (rctx < 0)
1349 goto end_recursion;
1350
1351 __cpu = smp_processor_id();
1352 1263
1353 if (in_nmi()) 1264 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
1354 trace_buf = rcu_dereference(perf_trace_buf_nmi); 1265 if (!entry)
1355 else 1266 return;
1356 trace_buf = rcu_dereference(perf_trace_buf);
1357
1358 if (!trace_buf)
1359 goto end;
1360
1361 raw_data = per_cpu_ptr(trace_buf, __cpu);
1362
1363 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1364 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1365 entry = (struct kretprobe_trace_entry *)raw_data;
1366 ent = &entry->ent;
1367 1267
1368 tracing_generic_entry_update(ent, irq_flags, pc);
1369 ent->type = call->id;
1370 entry->nargs = tp->nr_args; 1268 entry->nargs = tp->nr_args;
1371 entry->func = (unsigned long)tp->rp.kp.addr; 1269 entry->func = (unsigned long)tp->rp.kp.addr;
1372 entry->ret_ip = (unsigned long)ri->ret_addr; 1270 entry->ret_ip = (unsigned long)ri->ret_addr;
1373 for (i = 0; i < tp->nr_args; i++) 1271 for (i = 0; i < tp->nr_args; i++)
1374 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1272 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1375 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1376
1377end:
1378 perf_swevent_put_recursion_context(rctx);
1379end_recursion:
1380 local_irq_restore(irq_flags);
1381 1273
1382 return 0; 1274 ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags);
1383} 1275}
1384 1276
1385static int probe_profile_enable(struct ftrace_event_call *call) 1277static int probe_profile_enable(struct ftrace_event_call *call)
@@ -1407,7 +1299,7 @@ static void probe_profile_disable(struct ftrace_event_call *call)
1407 disable_kprobe(&tp->rp.kp); 1299 disable_kprobe(&tp->rp.kp);
1408 } 1300 }
1409} 1301}
1410#endif /* CONFIG_EVENT_PROFILE */ 1302#endif /* CONFIG_PERF_EVENTS */
1411 1303
1412 1304
1413static __kprobes 1305static __kprobes
@@ -1417,10 +1309,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1417 1309
1418 if (tp->flags & TP_FLAG_TRACE) 1310 if (tp->flags & TP_FLAG_TRACE)
1419 kprobe_trace_func(kp, regs); 1311 kprobe_trace_func(kp, regs);
1420#ifdef CONFIG_EVENT_PROFILE 1312#ifdef CONFIG_PERF_EVENTS
1421 if (tp->flags & TP_FLAG_PROFILE) 1313 if (tp->flags & TP_FLAG_PROFILE)
1422 kprobe_profile_func(kp, regs); 1314 kprobe_profile_func(kp, regs);
1423#endif /* CONFIG_EVENT_PROFILE */ 1315#endif
1424 return 0; /* We don't tweek kernel, so just return 0 */ 1316 return 0; /* We don't tweek kernel, so just return 0 */
1425} 1317}
1426 1318
@@ -1431,10 +1323,10 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1431 1323
1432 if (tp->flags & TP_FLAG_TRACE) 1324 if (tp->flags & TP_FLAG_TRACE)
1433 kretprobe_trace_func(ri, regs); 1325 kretprobe_trace_func(ri, regs);
1434#ifdef CONFIG_EVENT_PROFILE 1326#ifdef CONFIG_PERF_EVENTS
1435 if (tp->flags & TP_FLAG_PROFILE) 1327 if (tp->flags & TP_FLAG_PROFILE)
1436 kretprobe_profile_func(ri, regs); 1328 kretprobe_profile_func(ri, regs);
1437#endif /* CONFIG_EVENT_PROFILE */ 1329#endif
1438 return 0; /* We don't tweek kernel, so just return 0 */ 1330 return 0; /* We don't tweek kernel, so just return 0 */
1439} 1331}
1440 1332
@@ -1447,23 +1339,25 @@ static int register_probe_event(struct trace_probe *tp)
1447 if (probe_is_return(tp)) { 1339 if (probe_is_return(tp)) {
1448 tp->event.trace = print_kretprobe_event; 1340 tp->event.trace = print_kretprobe_event;
1449 call->raw_init = probe_event_raw_init; 1341 call->raw_init = probe_event_raw_init;
1450 call->show_format = kretprobe_event_show_format;
1451 call->define_fields = kretprobe_event_define_fields; 1342 call->define_fields = kretprobe_event_define_fields;
1452 } else { 1343 } else {
1453 tp->event.trace = print_kprobe_event; 1344 tp->event.trace = print_kprobe_event;
1454 call->raw_init = probe_event_raw_init; 1345 call->raw_init = probe_event_raw_init;
1455 call->show_format = kprobe_event_show_format;
1456 call->define_fields = kprobe_event_define_fields; 1346 call->define_fields = kprobe_event_define_fields;
1457 } 1347 }
1348 if (set_print_fmt(tp) < 0)
1349 return -ENOMEM;
1458 call->event = &tp->event; 1350 call->event = &tp->event;
1459 call->id = register_ftrace_event(&tp->event); 1351 call->id = register_ftrace_event(&tp->event);
1460 if (!call->id) 1352 if (!call->id) {
1353 kfree(call->print_fmt);
1461 return -ENODEV; 1354 return -ENODEV;
1355 }
1462 call->enabled = 0; 1356 call->enabled = 0;
1463 call->regfunc = probe_event_enable; 1357 call->regfunc = probe_event_enable;
1464 call->unregfunc = probe_event_disable; 1358 call->unregfunc = probe_event_disable;
1465 1359
1466#ifdef CONFIG_EVENT_PROFILE 1360#ifdef CONFIG_PERF_EVENTS
1467 call->profile_enable = probe_profile_enable; 1361 call->profile_enable = probe_profile_enable;
1468 call->profile_disable = probe_profile_disable; 1362 call->profile_disable = probe_profile_disable;
1469#endif 1363#endif
@@ -1471,6 +1365,7 @@ static int register_probe_event(struct trace_probe *tp)
1471 ret = trace_add_event_call(call); 1365 ret = trace_add_event_call(call);
1472 if (ret) { 1366 if (ret) {
1473 pr_info("Failed to register kprobe event: %s\n", call->name); 1367 pr_info("Failed to register kprobe event: %s\n", call->name);
1368 kfree(call->print_fmt);
1474 unregister_ftrace_event(&tp->event); 1369 unregister_ftrace_event(&tp->event);
1475 } 1370 }
1476 return ret; 1371 return ret;
@@ -1480,6 +1375,7 @@ static void unregister_probe_event(struct trace_probe *tp)
1480{ 1375{
1481 /* tp->event is unregistered in trace_remove_event_call() */ 1376 /* tp->event is unregistered in trace_remove_event_call() */
1482 trace_remove_event_call(&tp->call); 1377 trace_remove_event_call(&tp->call);
1378 kfree(tp->call.print_fmt);
1483} 1379}
1484 1380
1485/* Make a debugfs interface for controling probe points */ 1381/* Make a debugfs interface for controling probe points */
@@ -1522,28 +1418,67 @@ static int kprobe_trace_selftest_target(int a1, int a2, int a3,
1522 1418
1523static __init int kprobe_trace_self_tests_init(void) 1419static __init int kprobe_trace_self_tests_init(void)
1524{ 1420{
1525 int ret; 1421 int ret, warn = 0;
1526 int (*target)(int, int, int, int, int, int); 1422 int (*target)(int, int, int, int, int, int);
1423 struct trace_probe *tp;
1527 1424
1528 target = kprobe_trace_selftest_target; 1425 target = kprobe_trace_selftest_target;
1529 1426
1530 pr_info("Testing kprobe tracing: "); 1427 pr_info("Testing kprobe tracing: ");
1531 1428
1532 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " 1429 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1533 "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); 1430 "$stack $stack0 +0($stack)");
1534 if (WARN_ON_ONCE(ret)) 1431 if (WARN_ON_ONCE(ret)) {
1535 pr_warning("error enabling function entry\n"); 1432 pr_warning("error on probing function entry.\n");
1433 warn++;
1434 } else {
1435 /* Enable trace point */
1436 tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM);
1437 if (WARN_ON_ONCE(tp == NULL)) {
1438 pr_warning("error on getting new probe.\n");
1439 warn++;
1440 } else
1441 probe_event_enable(&tp->call);
1442 }
1536 1443
1537 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 1444 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1538 "$retval"); 1445 "$retval");
1539 if (WARN_ON_ONCE(ret)) 1446 if (WARN_ON_ONCE(ret)) {
1540 pr_warning("error enabling function return\n"); 1447 pr_warning("error on probing function return.\n");
1448 warn++;
1449 } else {
1450 /* Enable trace point */
1451 tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM);
1452 if (WARN_ON_ONCE(tp == NULL)) {
1453 pr_warning("error on getting new probe.\n");
1454 warn++;
1455 } else
1456 probe_event_enable(&tp->call);
1457 }
1458
1459 if (warn)
1460 goto end;
1541 1461
1542 ret = target(1, 2, 3, 4, 5, 6); 1462 ret = target(1, 2, 3, 4, 5, 6);
1543 1463
1544 cleanup_all_probes(); 1464 ret = command_trace_probe("-:testprobe");
1465 if (WARN_ON_ONCE(ret)) {
1466 pr_warning("error on deleting a probe.\n");
1467 warn++;
1468 }
1545 1469
1546 pr_cont("OK\n"); 1470 ret = command_trace_probe("-:testprobe2");
1471 if (WARN_ON_ONCE(ret)) {
1472 pr_warning("error on deleting a probe.\n");
1473 warn++;
1474 }
1475
1476end:
1477 cleanup_all_probes();
1478 if (warn)
1479 pr_cont("NG: Some tests are failed. Please check them.\n");
1480 else
1481 pr_cont("OK\n");
1547 return 0; 1482 return 0;
1548} 1483}
1549 1484
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index faf37fa4408c..94103cdcf9d8 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -26,12 +26,13 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27 27
28#include "trace_output.h" 28#include "trace_output.h"
29#include "trace_stat.h"
30#include "trace.h" 29#include "trace.h"
31 30
32#include <linux/hw_breakpoint.h> 31#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h> 32#include <asm/hw_breakpoint.h>
34 33
34#include <asm/atomic.h>
35
35/* 36/*
36 * For now, let us restrict the no. of symbols traced simultaneously to number 37 * For now, let us restrict the no. of symbols traced simultaneously to number
37 * of available hardware breakpoint registers. 38 * of available hardware breakpoint registers.
@@ -44,7 +45,7 @@ struct trace_ksym {
44 struct perf_event **ksym_hbp; 45 struct perf_event **ksym_hbp;
45 struct perf_event_attr attr; 46 struct perf_event_attr attr;
46#ifdef CONFIG_PROFILE_KSYM_TRACER 47#ifdef CONFIG_PROFILE_KSYM_TRACER
47 unsigned long counter; 48 atomic64_t counter;
48#endif 49#endif
49 struct hlist_node ksym_hlist; 50 struct hlist_node ksym_hlist;
50}; 51};
@@ -69,9 +70,8 @@ void ksym_collect_stats(unsigned long hbp_hit_addr)
69 70
70 rcu_read_lock(); 71 rcu_read_lock();
71 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { 72 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
72 if ((entry->attr.bp_addr == hbp_hit_addr) && 73 if (entry->attr.bp_addr == hbp_hit_addr) {
73 (entry->counter <= MAX_UL_INT)) { 74 atomic64_inc(&entry->counter);
74 entry->counter++;
75 break; 75 break;
76 } 76 }
77 } 77 }
@@ -197,7 +197,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
197 entry->attr.bp_addr = addr; 197 entry->attr.bp_addr = addr;
198 entry->attr.bp_len = HW_BREAKPOINT_LEN_4; 198 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
199 199
200 ret = -EAGAIN;
201 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, 200 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
202 ksym_hbp_handler); 201 ksym_hbp_handler);
203 202
@@ -300,8 +299,8 @@ static ssize_t ksym_trace_filter_write(struct file *file,
300 * 2: echo 0 > ksym_trace_filter 299 * 2: echo 0 > ksym_trace_filter
301 * 3: echo "*:---" > ksym_trace_filter 300 * 3: echo "*:---" > ksym_trace_filter
302 */ 301 */
303 if (!buf[0] || !strcmp(buf, "0") || 302 if (!input_string[0] || !strcmp(input_string, "0") ||
304 !strcmp(buf, "*:---")) { 303 !strcmp(input_string, "*:---")) {
305 __ksym_trace_reset(); 304 __ksym_trace_reset();
306 ret = 0; 305 ret = 0;
307 goto out; 306 goto out;
@@ -444,102 +443,77 @@ struct tracer ksym_tracer __read_mostly =
444 .print_line = ksym_trace_output 443 .print_line = ksym_trace_output
445}; 444};
446 445
447__init static int init_ksym_trace(void)
448{
449 struct dentry *d_tracer;
450 struct dentry *entry;
451
452 d_tracer = tracing_init_dentry();
453 ksym_filter_entry_count = 0;
454
455 entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
456 NULL, &ksym_tracing_fops);
457 if (!entry)
458 pr_warning("Could not create debugfs "
459 "'ksym_trace_filter' file\n");
460
461 return register_tracer(&ksym_tracer);
462}
463device_initcall(init_ksym_trace);
464
465
466#ifdef CONFIG_PROFILE_KSYM_TRACER 446#ifdef CONFIG_PROFILE_KSYM_TRACER
467static int ksym_tracer_stat_headers(struct seq_file *m) 447static int ksym_profile_show(struct seq_file *m, void *v)
468{ 448{
449 struct hlist_node *node;
450 struct trace_ksym *entry;
451 int access_type = 0;
452 char fn_name[KSYM_NAME_LEN];
453
469 seq_puts(m, " Access Type "); 454 seq_puts(m, " Access Type ");
470 seq_puts(m, " Symbol Counter\n"); 455 seq_puts(m, " Symbol Counter\n");
471 seq_puts(m, " ----------- "); 456 seq_puts(m, " ----------- ");
472 seq_puts(m, " ------ -------\n"); 457 seq_puts(m, " ------ -------\n");
473 return 0;
474}
475 458
476static int ksym_tracer_stat_show(struct seq_file *m, void *v) 459 rcu_read_lock();
477{ 460 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
478 struct hlist_node *stat = v;
479 struct trace_ksym *entry;
480 int access_type = 0;
481 char fn_name[KSYM_NAME_LEN];
482 461
483 entry = hlist_entry(stat, struct trace_ksym, ksym_hlist); 462 access_type = entry->attr.bp_type;
484 463
485 access_type = entry->attr.bp_type; 464 switch (access_type) {
465 case HW_BREAKPOINT_R:
466 seq_puts(m, " R ");
467 break;
468 case HW_BREAKPOINT_W:
469 seq_puts(m, " W ");
470 break;
471 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
472 seq_puts(m, " RW ");
473 break;
474 default:
475 seq_puts(m, " NA ");
476 }
486 477
487 switch (access_type) { 478 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
488 case HW_BREAKPOINT_R: 479 seq_printf(m, " %-36s", fn_name);
489 seq_puts(m, " R "); 480 else
490 break; 481 seq_printf(m, " %-36s", "<NA>");
491 case HW_BREAKPOINT_W: 482 seq_printf(m, " %15llu\n",
492 seq_puts(m, " W "); 483 (unsigned long long)atomic64_read(&entry->counter));
493 break;
494 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
495 seq_puts(m, " RW ");
496 break;
497 default:
498 seq_puts(m, " NA ");
499 } 484 }
500 485 rcu_read_unlock();
501 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
502 seq_printf(m, " %-36s", fn_name);
503 else
504 seq_printf(m, " %-36s", "<NA>");
505 seq_printf(m, " %15lu\n", entry->counter);
506 486
507 return 0; 487 return 0;
508} 488}
509 489
510static void *ksym_tracer_stat_start(struct tracer_stat *trace) 490static int ksym_profile_open(struct inode *node, struct file *file)
511{ 491{
512 return ksym_filter_head.first; 492 return single_open(file, ksym_profile_show, NULL);
513}
514
515static void *
516ksym_tracer_stat_next(void *v, int idx)
517{
518 struct hlist_node *stat = v;
519
520 return stat->next;
521} 493}
522 494
523static struct tracer_stat ksym_tracer_stats = { 495static const struct file_operations ksym_profile_fops = {
524 .name = "ksym_tracer", 496 .open = ksym_profile_open,
525 .stat_start = ksym_tracer_stat_start, 497 .read = seq_read,
526 .stat_next = ksym_tracer_stat_next, 498 .llseek = seq_lseek,
527 .stat_headers = ksym_tracer_stat_headers, 499 .release = single_release,
528 .stat_show = ksym_tracer_stat_show
529}; 500};
501#endif /* CONFIG_PROFILE_KSYM_TRACER */
530 502
531__init static int ksym_tracer_stat_init(void) 503__init static int init_ksym_trace(void)
532{ 504{
533 int ret; 505 struct dentry *d_tracer;
534 506
535 ret = register_stat_tracer(&ksym_tracer_stats); 507 d_tracer = tracing_init_dentry();
536 if (ret) {
537 printk(KERN_WARNING "Warning: could not register "
538 "ksym tracer stats\n");
539 return 1;
540 }
541 508
542 return 0; 509 trace_create_file("ksym_trace_filter", 0644, d_tracer,
510 NULL, &ksym_tracing_fops);
511
512#ifdef CONFIG_PROFILE_KSYM_TRACER
513 trace_create_file("ksym_profile", 0444, d_tracer,
514 NULL, &ksym_profile_fops);
515#endif
516
517 return register_tracer(&ksym_tracer);
543} 518}
544fs_initcall(ksym_tracer_stat_init); 519device_initcall(init_ksym_trace);
545#endif /* CONFIG_PROFILE_KSYM_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 678a5120ee30..f4bc9b27de5f 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
157 unsigned long val, flags; 157 unsigned long val, flags;
158 char buf[64]; 158 char buf[64];
159 int ret; 159 int ret;
160 int cpu;
160 161
161 if (count >= sizeof(buf)) 162 if (count >= sizeof(buf))
162 return -EINVAL; 163 return -EINVAL;
@@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
171 return ret; 172 return ret;
172 173
173 local_irq_save(flags); 174 local_irq_save(flags);
175
176 /*
177 * In case we trace inside arch_spin_lock() or after (NMI),
178 * we will cause circular lock, so we also need to increase
179 * the percpu trace_active here.
180 */
181 cpu = smp_processor_id();
182 per_cpu(trace_active, cpu)++;
183
174 arch_spin_lock(&max_stack_lock); 184 arch_spin_lock(&max_stack_lock);
175 *ptr = val; 185 *ptr = val;
176 arch_spin_unlock(&max_stack_lock); 186 arch_spin_unlock(&max_stack_lock);
187
188 per_cpu(trace_active, cpu)--;
177 local_irq_restore(flags); 189 local_irq_restore(flags);
178 190
179 return count; 191 return count;
@@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
206 218
207static void *t_start(struct seq_file *m, loff_t *pos) 219static void *t_start(struct seq_file *m, loff_t *pos)
208{ 220{
221 int cpu;
222
209 local_irq_disable(); 223 local_irq_disable();
224
225 cpu = smp_processor_id();
226 per_cpu(trace_active, cpu)++;
227
210 arch_spin_lock(&max_stack_lock); 228 arch_spin_lock(&max_stack_lock);
211 229
212 if (*pos == 0) 230 if (*pos == 0)
@@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
217 235
218static void t_stop(struct seq_file *m, void *p) 236static void t_stop(struct seq_file *m, void *p)
219{ 237{
238 int cpu;
239
220 arch_spin_unlock(&max_stack_lock); 240 arch_spin_unlock(&max_stack_lock);
241
242 cpu = smp_processor_id();
243 per_cpu(trace_active, cpu)--;
244
221 local_irq_enable(); 245 local_irq_enable();
222} 246}
223 247
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 75289f372dd2..cba47d7935cc 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -143,70 +143,65 @@ extern char *__bad_type_size(void);
143 #type, #name, offsetof(typeof(trace), name), \ 143 #type, #name, offsetof(typeof(trace), name), \
144 sizeof(trace.name), is_signed_type(type) 144 sizeof(trace.name), is_signed_type(type)
145 145
146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 146static
147int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
147{ 148{
148 int i; 149 int i;
149 int ret; 150 int pos = 0;
150 struct syscall_metadata *entry = call->data;
151 struct syscall_trace_enter trace;
152 int offset = offsetof(struct syscall_trace_enter, args);
153 151
154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 152 /* When len=0, we just calculate the needed length */
155 "\tsigned:%u;\n", 153#define LEN_OR_ZERO (len ? len - pos : 0)
156 SYSCALL_FIELD(int, nr));
157 if (!ret)
158 return 0;
159 154
155 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
160 for (i = 0; i < entry->nb_args; i++) { 156 for (i = 0; i < entry->nb_args; i++) {
161 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], 157 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
162 entry->args[i]); 158 entry->args[i], sizeof(unsigned long),
163 if (!ret) 159 i == entry->nb_args - 1 ? "" : ", ");
164 return 0;
165 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
166 "\tsigned:%u;\n", offset,
167 sizeof(unsigned long),
168 is_signed_type(unsigned long));
169 if (!ret)
170 return 0;
171 offset += sizeof(unsigned long);
172 } 160 }
161 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
173 162
174 trace_seq_puts(s, "\nprint fmt: \"");
175 for (i = 0; i < entry->nb_args; i++) { 163 for (i = 0; i < entry->nb_args; i++) {
176 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], 164 pos += snprintf(buf + pos, LEN_OR_ZERO,
177 sizeof(unsigned long), 165 ", ((unsigned long)(REC->%s))", entry->args[i]);
178 i == entry->nb_args - 1 ? "" : ", ");
179 if (!ret)
180 return 0;
181 } 166 }
182 trace_seq_putc(s, '"');
183 167
184 for (i = 0; i < entry->nb_args; i++) { 168#undef LEN_OR_ZERO
185 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
186 entry->args[i]);
187 if (!ret)
188 return 0;
189 }
190 169
191 return trace_seq_putc(s, '\n'); 170 /* return the length of print_fmt */
171 return pos;
192} 172}
193 173
194int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) 174static int set_syscall_print_fmt(struct ftrace_event_call *call)
195{ 175{
196 int ret; 176 char *print_fmt;
197 struct syscall_trace_exit trace; 177 int len;
178 struct syscall_metadata *entry = call->data;
198 179
199 ret = trace_seq_printf(s, 180 if (entry->enter_event != call) {
200 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 181 call->print_fmt = "\"0x%lx\", REC->ret";
201 "\tsigned:%u;\n"
202 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
203 "\tsigned:%u;\n",
204 SYSCALL_FIELD(int, nr),
205 SYSCALL_FIELD(long, ret));
206 if (!ret)
207 return 0; 182 return 0;
183 }
184
185 /* First: called with 0 length to calculate the needed length */
186 len = __set_enter_print_fmt(entry, NULL, 0);
187
188 print_fmt = kmalloc(len + 1, GFP_KERNEL);
189 if (!print_fmt)
190 return -ENOMEM;
191
192 /* Second: actually write the @print_fmt */
193 __set_enter_print_fmt(entry, print_fmt, len + 1);
194 call->print_fmt = print_fmt;
208 195
209 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); 196 return 0;
197}
198
199static void free_syscall_print_fmt(struct ftrace_event_call *call)
200{
201 struct syscall_metadata *entry = call->data;
202
203 if (entry->enter_event == call)
204 kfree(call->print_fmt);
210} 205}
211 206
212int syscall_enter_define_fields(struct ftrace_event_call *call) 207int syscall_enter_define_fields(struct ftrace_event_call *call)
@@ -386,12 +381,22 @@ int init_syscall_trace(struct ftrace_event_call *call)
386{ 381{
387 int id; 382 int id;
388 383
389 id = register_ftrace_event(call->event); 384 if (set_syscall_print_fmt(call) < 0)
390 if (!id) 385 return -ENOMEM;
391 return -ENODEV; 386
392 call->id = id; 387 id = trace_event_raw_init(call);
393 INIT_LIST_HEAD(&call->fields); 388
394 return 0; 389 if (id < 0) {
390 free_syscall_print_fmt(call);
391 return id;
392 }
393
394 return id;
395}
396
397unsigned long __init arch_syscall_addr(int nr)
398{
399 return (unsigned long)sys_call_table[nr];
395} 400}
396 401
397int __init init_ftrace_syscalls(void) 402int __init init_ftrace_syscalls(void)
@@ -421,7 +426,7 @@ int __init init_ftrace_syscalls(void)
421} 426}
422core_initcall(init_ftrace_syscalls); 427core_initcall(init_ftrace_syscalls);
423 428
424#ifdef CONFIG_EVENT_PROFILE 429#ifdef CONFIG_PERF_EVENTS
425 430
426static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); 431static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
427static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); 432static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
@@ -433,12 +438,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
433 struct syscall_metadata *sys_data; 438 struct syscall_metadata *sys_data;
434 struct syscall_trace_enter *rec; 439 struct syscall_trace_enter *rec;
435 unsigned long flags; 440 unsigned long flags;
436 char *trace_buf;
437 char *raw_data;
438 int syscall_nr; 441 int syscall_nr;
439 int rctx; 442 int rctx;
440 int size; 443 int size;
441 int cpu;
442 444
443 syscall_nr = syscall_get_nr(current, regs); 445 syscall_nr = syscall_get_nr(current, regs);
444 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 446 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -457,37 +459,15 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
457 "profile buffer not large enough")) 459 "profile buffer not large enough"))
458 return; 460 return;
459 461
460 /* Protect the per cpu buffer, begin the rcu read side */ 462 rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size,
461 local_irq_save(flags); 463 sys_data->enter_event->id, &rctx, &flags);
462 464 if (!rec)
463 rctx = perf_swevent_get_recursion_context(); 465 return;
464 if (rctx < 0)
465 goto end_recursion;
466
467 cpu = smp_processor_id();
468
469 trace_buf = rcu_dereference(perf_trace_buf);
470
471 if (!trace_buf)
472 goto end;
473
474 raw_data = per_cpu_ptr(trace_buf, cpu);
475
476 /* zero the dead bytes from align to not leak stack to user */
477 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
478 466
479 rec = (struct syscall_trace_enter *) raw_data;
480 tracing_generic_entry_update(&rec->ent, 0, 0);
481 rec->ent.type = sys_data->enter_event->id;
482 rec->nr = syscall_nr; 467 rec->nr = syscall_nr;
483 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 468 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
484 (unsigned long *)&rec->args); 469 (unsigned long *)&rec->args);
485 perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size); 470 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
486
487end:
488 perf_swevent_put_recursion_context(rctx);
489end_recursion:
490 local_irq_restore(flags);
491} 471}
492 472
493int prof_sysenter_enable(struct ftrace_event_call *call) 473int prof_sysenter_enable(struct ftrace_event_call *call)
@@ -531,11 +511,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
531 struct syscall_trace_exit *rec; 511 struct syscall_trace_exit *rec;
532 unsigned long flags; 512 unsigned long flags;
533 int syscall_nr; 513 int syscall_nr;
534 char *trace_buf;
535 char *raw_data;
536 int rctx; 514 int rctx;
537 int size; 515 int size;
538 int cpu;
539 516
540 syscall_nr = syscall_get_nr(current, regs); 517 syscall_nr = syscall_get_nr(current, regs);
541 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 518 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -557,38 +534,15 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
557 "exit event has grown above profile buffer size")) 534 "exit event has grown above profile buffer size"))
558 return; 535 return;
559 536
560 /* Protect the per cpu buffer, begin the rcu read side */ 537 rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size,
561 local_irq_save(flags); 538 sys_data->exit_event->id, &rctx, &flags);
562 539 if (!rec)
563 rctx = perf_swevent_get_recursion_context(); 540 return;
564 if (rctx < 0)
565 goto end_recursion;
566
567 cpu = smp_processor_id();
568
569 trace_buf = rcu_dereference(perf_trace_buf);
570
571 if (!trace_buf)
572 goto end;
573
574 raw_data = per_cpu_ptr(trace_buf, cpu);
575
576 /* zero the dead bytes from align to not leak stack to user */
577 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
578
579 rec = (struct syscall_trace_exit *)raw_data;
580 541
581 tracing_generic_entry_update(&rec->ent, 0, 0);
582 rec->ent.type = sys_data->exit_event->id;
583 rec->nr = syscall_nr; 542 rec->nr = syscall_nr;
584 rec->ret = syscall_get_return_value(current, regs); 543 rec->ret = syscall_get_return_value(current, regs);
585 544
586 perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); 545 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
587
588end:
589 perf_swevent_put_recursion_context(rctx);
590end_recursion:
591 local_irq_restore(flags);
592} 546}
593 547
594int prof_sysexit_enable(struct ftrace_event_call *call) 548int prof_sysexit_enable(struct ftrace_event_call *call)
@@ -603,7 +557,7 @@ int prof_sysexit_enable(struct ftrace_event_call *call)
603 ret = register_trace_sys_exit(prof_syscall_exit); 557 ret = register_trace_sys_exit(prof_syscall_exit);
604 if (ret) { 558 if (ret) {
605 pr_info("event trace: Could not activate" 559 pr_info("event trace: Could not activate"
606 "syscall entry trace point"); 560 "syscall exit trace point");
607 } else { 561 } else {
608 set_bit(num, enabled_prof_exit_syscalls); 562 set_bit(num, enabled_prof_exit_syscalls);
609 sys_prof_refcount_exit++; 563 sys_prof_refcount_exit++;
@@ -626,6 +580,5 @@ void prof_sysexit_disable(struct ftrace_event_call *call)
626 mutex_unlock(&syscall_trace_lock); 580 mutex_unlock(&syscall_trace_lock);
627} 581}
628 582
629#endif 583#endif /* CONFIG_PERF_EVENTS */
630
631 584
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 00d59d048edf..0a67e041edf8 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -21,6 +21,7 @@
21#include <linux/tsacct_kern.h> 21#include <linux/tsacct_kern.h>
22#include <linux/acct.h> 22#include <linux/acct.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/mm.h>
24 25
25/* 26/*
26 * fill in basic accounting fields 27 * fill in basic accounting fields
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165ca70c..766467b3bcb7 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -56,9 +56,6 @@ struct user_struct root_user = {
56 .sigpending = ATOMIC_INIT(0), 56 .sigpending = ATOMIC_INIT(0),
57 .locked_shm = 0, 57 .locked_shm = 0,
58 .user_ns = &init_user_ns, 58 .user_ns = &init_user_ns,
59#ifdef CONFIG_USER_SCHED
60 .tg = &init_task_group,
61#endif
62}; 59};
63 60
64/* 61/*
@@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 72 put_user_ns(up->user_ns);
76} 73}
77 74
78#ifdef CONFIG_USER_SCHED
79
80static void sched_destroy_user(struct user_struct *up)
81{
82 sched_destroy_group(up->tg);
83}
84
85static int sched_create_user(struct user_struct *up)
86{
87 int rc = 0;
88
89 up->tg = sched_create_group(&root_task_group);
90 if (IS_ERR(up->tg))
91 rc = -ENOMEM;
92
93 set_tg_uid(up);
94
95 return rc;
96}
97
98#else /* CONFIG_USER_SCHED */
99
100static void sched_destroy_user(struct user_struct *up) { }
101static int sched_create_user(struct user_struct *up) { return 0; }
102
103#endif /* CONFIG_USER_SCHED */
104
105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
125static DEFINE_MUTEX(uids_mutex);
126
127static inline void uids_mutex_lock(void)
128{
129 mutex_lock(&uids_mutex);
130}
131
132static inline void uids_mutex_unlock(void)
133{
134 mutex_unlock(&uids_mutex);
135}
136
137/* uid directory attributes */
138#ifdef CONFIG_FAIR_GROUP_SCHED
139static ssize_t cpu_shares_show(struct kobject *kobj,
140 struct kobj_attribute *attr,
141 char *buf)
142{
143 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144
145 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
146}
147
148static ssize_t cpu_shares_store(struct kobject *kobj,
149 struct kobj_attribute *attr,
150 const char *buf, size_t size)
151{
152 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
153 unsigned long shares;
154 int rc;
155
156 sscanf(buf, "%lu", &shares);
157
158 rc = sched_group_set_shares(up->tg, shares);
159
160 return (rc ? rc : size);
161}
162
163static struct kobj_attribute cpu_share_attr =
164 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
165#endif
166
167#ifdef CONFIG_RT_GROUP_SCHED
168static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169 struct kobj_attribute *attr,
170 char *buf)
171{
172 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
173
174 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
175}
176
177static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
178 struct kobj_attribute *attr,
179 const char *buf, size_t size)
180{
181 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
182 unsigned long rt_runtime;
183 int rc;
184
185 sscanf(buf, "%ld", &rt_runtime);
186
187 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
188
189 return (rc ? rc : size);
190}
191
192static struct kobj_attribute cpu_rt_runtime_attr =
193 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
194
195static ssize_t cpu_rt_period_show(struct kobject *kobj,
196 struct kobj_attribute *attr,
197 char *buf)
198{
199 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
200
201 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
202}
203
204static ssize_t cpu_rt_period_store(struct kobject *kobj,
205 struct kobj_attribute *attr,
206 const char *buf, size_t size)
207{
208 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
209 unsigned long rt_period;
210 int rc;
211
212 sscanf(buf, "%lu", &rt_period);
213
214 rc = sched_group_set_rt_period(up->tg, rt_period);
215
216 return (rc ? rc : size);
217}
218
219static struct kobj_attribute cpu_rt_period_attr =
220 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
221#endif
222
223/* default attributes per uid directory */
224static struct attribute *uids_attributes[] = {
225#ifdef CONFIG_FAIR_GROUP_SCHED
226 &cpu_share_attr.attr,
227#endif
228#ifdef CONFIG_RT_GROUP_SCHED
229 &cpu_rt_runtime_attr.attr,
230 &cpu_rt_period_attr.attr,
231#endif
232 NULL
233};
234
235/* the lifetime of user_struct is not managed by the core (now) */
236static void uids_release(struct kobject *kobj)
237{
238 return;
239}
240
241static struct kobj_type uids_ktype = {
242 .sysfs_ops = &kobj_sysfs_ops,
243 .default_attrs = uids_attributes,
244 .release = uids_release,
245};
246
247/*
248 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
249 * We do not create this file for users in a user namespace (until
250 * sysfs tagging is implemented).
251 *
252 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
253 */
254static int uids_user_create(struct user_struct *up)
255{
256 struct kobject *kobj = &up->kobj;
257 int error;
258
259 memset(kobj, 0, sizeof(struct kobject));
260 if (up->user_ns != &init_user_ns)
261 return 0;
262 kobj->kset = uids_kset;
263 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
264 if (error) {
265 kobject_put(kobj);
266 goto done;
267 }
268
269 kobject_uevent(kobj, KOBJ_ADD);
270done:
271 return error;
272}
273
274/* create these entries in sysfs:
275 * "/sys/kernel/uids" directory
276 * "/sys/kernel/uids/0" directory (for root user)
277 * "/sys/kernel/uids/0/cpu_share" file (for root user)
278 */
279int __init uids_sysfs_init(void)
280{
281 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
282 if (!uids_kset)
283 return -ENOMEM;
284
285 return uids_user_create(&root_user);
286}
287
288/* delayed work function to remove sysfs directory for a user and free up
289 * corresponding structures.
290 */
291static void cleanup_user_struct(struct work_struct *w)
292{
293 struct user_struct *up = container_of(w, struct user_struct, work.work);
294 unsigned long flags;
295 int remove_user = 0;
296
297 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
298 * atomic.
299 */
300 uids_mutex_lock();
301
302 spin_lock_irqsave(&uidhash_lock, flags);
303 if (atomic_read(&up->__count) == 0) {
304 uid_hash_remove(up);
305 remove_user = 1;
306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
308
309 if (!remove_user)
310 goto done;
311
312 if (up->user_ns == &init_user_ns) {
313 kobject_uevent(&up->kobj, KOBJ_REMOVE);
314 kobject_del(&up->kobj);
315 kobject_put(&up->kobj);
316 }
317
318 sched_destroy_user(up);
319 key_put(up->uid_keyring);
320 key_put(up->session_keyring);
321 kmem_cache_free(uid_cachep, up);
322
323done:
324 uids_mutex_unlock();
325}
326
327/* IRQs are disabled and uidhash_lock is held upon function entry.
328 * IRQ state (as stored in flags) is restored and uidhash_lock released
329 * upon function exit.
330 */
331static void free_user(struct user_struct *up, unsigned long flags)
332{
333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336}
337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 75static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{ 76{
342 struct user_struct *user; 77 struct user_struct *user;
@@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
352 return NULL; 87 return NULL;
353} 88}
354 89
355int uids_sysfs_init(void) { return 0; }
356static inline int uids_user_create(struct user_struct *up) { return 0; }
357static inline void uids_mutex_lock(void) { }
358static inline void uids_mutex_unlock(void) { }
359
360/* IRQs are disabled and uidhash_lock is held upon function entry. 90/* IRQs are disabled and uidhash_lock is held upon function entry.
361 * IRQ state (as stored in flags) is restored and uidhash_lock released 91 * IRQ state (as stored in flags) is restored and uidhash_lock released
362 * upon function exit. 92 * upon function exit.
@@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags)
365{ 95{
366 uid_hash_remove(up); 96 uid_hash_remove(up);
367 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
368 sched_destroy_user(up);
369 key_put(up->uid_keyring); 98 key_put(up->uid_keyring);
370 key_put(up->session_keyring); 99 key_put(up->session_keyring);
371 kmem_cache_free(uid_cachep, up); 100 kmem_cache_free(uid_cachep, up);
372} 101}
373 102
374#endif
375
376#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
377/*
378 * We need to check if a setuid can take place. This function should be called
379 * before successfully completing the setuid.
380 */
381int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
382{
383
384 return sched_rt_can_attach(up->tg, tsk);
385
386}
387#else
388int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
389{
390 return 1;
391}
392#endif
393
394/* 103/*
395 * Locate the user_struct for the passed UID. If found, take a ref on it. The 104 * Locate the user_struct for the passed UID. If found, take a ref on it. The
396 * caller must undo that ref with free_uid(). 105 * caller must undo that ref with free_uid().
@@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
431 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() 140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
432 * atomic. 141 * atomic.
433 */ 142 */
434 uids_mutex_lock();
435
436 spin_lock_irq(&uidhash_lock); 143 spin_lock_irq(&uidhash_lock);
437 up = uid_hash_find(uid, hashent); 144 up = uid_hash_find(uid, hashent);
438 spin_unlock_irq(&uidhash_lock); 145 spin_unlock_irq(&uidhash_lock);
@@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
445 new->uid = uid; 152 new->uid = uid;
446 atomic_set(&new->__count, 1); 153 atomic_set(&new->__count, 1);
447 154
448 if (sched_create_user(new) < 0)
449 goto out_free_user;
450
451 new->user_ns = get_user_ns(ns); 155 new->user_ns = get_user_ns(ns);
452 156
453 if (uids_user_create(new))
454 goto out_destoy_sched;
455
456 /* 157 /*
457 * Before adding this, check whether we raced 158 * Before adding this, check whether we raced
458 * on adding the same user already.. 159 * on adding the same user already..
@@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
475 spin_unlock_irq(&uidhash_lock); 176 spin_unlock_irq(&uidhash_lock);
476 } 177 }
477 178
478 uids_mutex_unlock();
479
480 return up; 179 return up;
481 180
482out_destoy_sched:
483 sched_destroy_user(new);
484 put_user_ns(new->user_ns); 181 put_user_ns(new->user_ns);
485out_free_user:
486 kmem_cache_free(uid_cachep, new); 182 kmem_cache_free(uid_cachep, new);
487out_unlock: 183out_unlock:
488 uids_mutex_unlock();
489 return NULL; 184 return NULL;
490} 185}
491 186