author    Frederic Weisbecker <fweisbec@gmail.com>  2010-05-12 17:19:01 -0400
committer Frederic Weisbecker <fweisbec@gmail.com>  2010-05-12 17:20:33 -0400
commit    a9aa1d02de36b450990b0e25a88fc2ff1c3e6b94 (patch)
tree      1f9d19f1642d263e65906a916a48be9339accc73 /kernel
parent    5671a10e2bc7f99d9157c6044faf8be2ef302361 (diff)
parent    b57f95a38233a2e73b679bea4a5453a1cc2a1cc9 (diff)
Merge commit 'v2.6.34-rc7' into perf/nmi
Merge reason: catch up with latest softlockup detector changes.
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 7
-rw-r--r--  kernel/acct.c | 10
-rw-r--r--  kernel/async.c | 1
-rw-r--r--  kernel/audit.c | 3
-rw-r--r--  kernel/audit_tree.c | 101
-rw-r--r--  kernel/audit_watch.c | 1
-rw-r--r--  kernel/auditfilter.c | 1
-rw-r--r--  kernel/auditsc.c | 10
-rw-r--r--  kernel/capability.c | 4
-rw-r--r--  kernel/cgroup.c | 730
-rw-r--r--  kernel/cgroup_freezer.c | 15
-rw-r--r--  kernel/compat.c | 1
-rw-r--r--  kernel/cpu.c | 13
-rw-r--r--  kernel/cpuset.c | 106
-rw-r--r--  kernel/cred.c | 13
-rw-r--r--  kernel/early_res.c | 584
-rw-r--r--  kernel/elfcore.c | 28
-rw-r--r--  kernel/exit.c | 20
-rw-r--r--  kernel/fork.c | 83
-rw-r--r--  kernel/futex.c | 30
-rw-r--r--  kernel/futex_compat.c | 6
-rw-r--r--  kernel/hw_breakpoint.c | 65
-rw-r--r--  kernel/irq/chip.c | 89
-rw-r--r--  kernel/irq/devres.c | 4
-rw-r--r--  kernel/irq/handle.c | 58
-rw-r--r--  kernel/irq/internals.h | 6
-rw-r--r--  kernel/irq/manage.c | 32
-rw-r--r--  kernel/irq/numa_migrate.c | 5
-rw-r--r--  kernel/irq/proc.c | 1
-rw-r--r--  kernel/kallsyms.c | 1
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kfifo.c | 6
-rw-r--r--  kernel/kgdb.c | 206
-rw-r--r--  kernel/kprobes.c | 648
-rw-r--r--  kernel/ksysfs.c | 10
-rw-r--r--  kernel/kthread.c | 4
-rw-r--r--  kernel/latencytop.c | 1
-rw-r--r--  kernel/lockdep.c | 52
-rw-r--r--  kernel/module.c | 161
-rw-r--r--  kernel/notifier.c | 6
-rw-r--r--  kernel/nsproxy.c | 14
-rw-r--r--  kernel/padata.c | 697
-rw-r--r--  kernel/panic.c | 46
-rw-r--r--  kernel/params.c | 12
-rw-r--r--  kernel/perf_event.c | 231
-rw-r--r--  kernel/pid.c | 6
-rw-r--r--  kernel/pid_namespace.c | 8
-rw-r--r--  kernel/posix-cpu-timers.c | 46
-rw-r--r--  kernel/posix-timers.c | 2
-rw-r--r--  kernel/power/Kconfig | 19
-rw-r--r--  kernel/power/hibernate.c | 10
-rw-r--r--  kernel/power/hibernate_nvs.c | 1
-rw-r--r--  kernel/power/main.c | 31
-rw-r--r--  kernel/power/process.c | 5
-rw-r--r--  kernel/power/snapshot.c | 5
-rw-r--r--  kernel/power/suspend.c | 4
-rw-r--r--  kernel/power/swap.c | 5
-rw-r--r--  kernel/power/swsusp.c | 58
-rw-r--r--  kernel/power/user.c | 25
-rw-r--r--  kernel/printk.c | 55
-rw-r--r--  kernel/ptrace.c | 88
-rw-r--r--  kernel/range.c | 163
-rw-r--r--  kernel/rcupdate.c | 70
-rw-r--r--  kernel/rcutorture.c | 102
-rw-r--r--  kernel/rcutree.c | 268
-rw-r--r--  kernel/rcutree.h | 82
-rw-r--r--  kernel/rcutree_plugin.h | 231
-rw-r--r--  kernel/rcutree_trace.c | 14
-rw-r--r--  kernel/relay.c | 5
-rw-r--r--  kernel/res_counter.c | 1
-rw-r--r--  kernel/resource.c | 110
-rw-r--r--  kernel/sched.c | 2268
-rw-r--r--  kernel/sched_cpupri.c | 7
-rw-r--r--  kernel/sched_debug.c | 6
-rw-r--r--  kernel/sched_fair.c | 1699
-rw-r--r--  kernel/sched_idletask.c | 23
-rw-r--r--  kernel/sched_rt.c | 66
-rw-r--r--  kernel/signal.c | 45
-rw-r--r--  kernel/slow-work.c | 2
-rw-r--r--  kernel/slow-work.h | 8
-rw-r--r--  kernel/smp.c | 9
-rw-r--r--  kernel/softirq.c | 15
-rw-r--r--  kernel/softlockup.c | 19
-rw-r--r--  kernel/srcu.c | 53
-rw-r--r--  kernel/stop_machine.c | 2
-rw-r--r--  kernel/sys.c | 78
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 51
-rw-r--r--  kernel/sysctl_binary.c | 8
-rw-r--r--  kernel/taskstats.c | 7
-rw-r--r--  kernel/time.c | 1
-rw-r--r--  kernel/time/clocksource.c | 36
-rw-r--r--  kernel/time/ntp.c | 10
-rw-r--r--  kernel/time/tick-oneshot.c | 52
-rw-r--r--  kernel/time/timecompare.c | 1
-rw-r--r--  kernel/time/timekeeping.c | 6
-rw-r--r--  kernel/time/timer_list.c | 3
-rw-r--r--  kernel/timer.c | 2
-rw-r--r--  kernel/trace/Kconfig | 15
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 6
-rw-r--r--  kernel/trace/ftrace.c | 82
-rw-r--r--  kernel/trace/power-traces.c | 1
-rw-r--r--  kernel/trace/ring_buffer.c | 64
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 1
-rw-r--r--  kernel/trace/trace.c | 206
-rw-r--r--  kernel/trace/trace.h | 11
-rw-r--r--  kernel/trace/trace_branch.c | 19
-rw-r--r--  kernel/trace/trace_clock.c | 5
-rw-r--r--  kernel/trace/trace_event_perf.c (renamed from kernel/trace/trace_event_profile.c) | 63
-rw-r--r--  kernel/trace/trace_events.c | 84
-rw-r--r--  kernel/trace/trace_events_filter.c | 1
-rw-r--r--  kernel/trace/trace_export.c | 87
-rw-r--r--  kernel/trace/trace_functions_graph.c | 108
-rw-r--r--  kernel/trace/trace_kprobe.c | 139
-rw-r--r--  kernel/trace/trace_ksym.c | 1
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 1
-rw-r--r--  kernel/trace/trace_selftest.c | 1
-rw-r--r--  kernel/trace/trace_stack.c | 24
-rw-r--r--  kernel/trace/trace_stat.c | 1
-rw-r--r--  kernel/trace/trace_syscalls.c | 186
-rw-r--r--  kernel/trace/trace_workqueue.c | 1
-rw-r--r--  kernel/tsacct.c | 1
-rw-r--r--  kernel/user.c | 305
-rw-r--r--  kernel/workqueue.c | 2
125 files changed, 7261 insertions, 4172 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 8a5abe53ebad..d5c30060ac14 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o 13 async.o range.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
14obj-y += groups.o 15obj-y += groups.o
15 16
16ifdef CONFIG_FUNCTION_TRACER 17ifdef CONFIG_FUNCTION_TRACER
@@ -91,6 +92,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
91obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 92obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
92obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 93obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
93obj-$(CONFIG_LATENCYTOP) += latencytop.o 94obj-$(CONFIG_LATENCYTOP) += latencytop.o
95obj-$(CONFIG_BINFMT_ELF) += elfcore.o
96obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
97obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
94obj-$(CONFIG_FUNCTION_TRACER) += trace/ 98obj-$(CONFIG_FUNCTION_TRACER) += trace/
95obj-$(CONFIG_TRACING) += trace/ 99obj-$(CONFIG_TRACING) += trace/
96obj-$(CONFIG_X86_DS) += trace/ 100obj-$(CONFIG_X86_DS) += trace/
@@ -101,6 +105,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
101obj-$(CONFIG_PERF_EVENTS) += perf_event.o 105obj-$(CONFIG_PERF_EVENTS) += perf_event.o
102obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 106obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
103obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 107obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
108obj-$(CONFIG_PADATA) += padata.o
104 109
105ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 110ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
106# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 111# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index a6605ca921b6..24f8c81fc48d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -588,16 +588,6 @@ out:
588} 588}
589 589
590/** 590/**
591 * acct_init_pacct - initialize a new pacct_struct
592 * @pacct: per-process accounting info struct to initialize
593 */
594void acct_init_pacct(struct pacct_struct *pacct)
595{
596 memset(pacct, 0, sizeof(struct pacct_struct));
597 pacct->ac_utime = pacct->ac_stime = cputime_zero;
598}
599
600/**
601 * acct_collect - collect accounting information into pacct_struct 591 * acct_collect - collect accounting information into pacct_struct
602 * @exitcode: task exit code 592 * @exitcode: task exit code
603 * @group_dead: not 0, if this thread is the last one in the process. 593 * @group_dead: not 0, if this thread is the last one in the process.
diff --git a/kernel/async.c b/kernel/async.c
index 27235f5de198..15319d6c18fe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -56,6 +56,7 @@ asynchronous and synchronous parts of the kernel.
56#include <linux/init.h> 56#include <linux/init.h>
57#include <linux/kthread.h> 57#include <linux/kthread.h>
58#include <linux/delay.h> 58#include <linux/delay.h>
59#include <linux/slab.h>
59#include <asm/atomic.h> 60#include <asm/atomic.h>
60 61
61static async_cookie_t next_cookie = 1; 62static async_cookie_t next_cookie = 1;
diff --git a/kernel/audit.c b/kernel/audit.c
index 5feed232be9d..c71bd26631a2 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -46,6 +46,7 @@
46#include <asm/atomic.h> 46#include <asm/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/module.h> 48#include <linux/module.h>
49#include <linux/slab.h>
49#include <linux/err.h> 50#include <linux/err.h>
50#include <linux/kthread.h> 51#include <linux/kthread.h>
51 52
@@ -398,7 +399,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
398 skb_get(skb); 399 skb_get(skb);
399 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); 400 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
400 if (err < 0) { 401 if (err < 0) {
401 BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ 402 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 403 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
403 audit_log_lost("auditd dissapeared\n"); 404 audit_log_lost("auditd dissapeared\n");
404 audit_pid = 0; 405 audit_pid = 0;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 4b05bd9479db..46a57b57a335 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -3,6 +3,7 @@
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h> 5#include <linux/kthread.h>
6#include <linux/slab.h>
6 7
7struct audit_tree; 8struct audit_tree;
8struct audit_chunk; 9struct audit_chunk;
@@ -548,6 +549,11 @@ int audit_remove_tree_rule(struct audit_krule *rule)
548 return 0; 549 return 0;
549} 550}
550 551
552static int compare_root(struct vfsmount *mnt, void *arg)
553{
554 return mnt->mnt_root->d_inode == arg;
555}
556
551void audit_trim_trees(void) 557void audit_trim_trees(void)
552{ 558{
553 struct list_head cursor; 559 struct list_head cursor;
@@ -559,7 +565,6 @@ void audit_trim_trees(void)
559 struct path path; 565 struct path path;
560 struct vfsmount *root_mnt; 566 struct vfsmount *root_mnt;
561 struct node *node; 567 struct node *node;
562 struct list_head list;
563 int err; 568 int err;
564 569
565 tree = container_of(cursor.next, struct audit_tree, list); 570 tree = container_of(cursor.next, struct audit_tree, list);
@@ -577,24 +582,16 @@ void audit_trim_trees(void)
577 if (!root_mnt) 582 if (!root_mnt)
578 goto skip_it; 583 goto skip_it;
579 584
580 list_add_tail(&list, &root_mnt->mnt_list);
581 spin_lock(&hash_lock); 585 spin_lock(&hash_lock);
582 list_for_each_entry(node, &tree->chunks, list) { 586 list_for_each_entry(node, &tree->chunks, list) {
583 struct audit_chunk *chunk = find_chunk(node); 587 struct inode *inode = find_chunk(node)->watch.inode;
584 struct inode *inode = chunk->watch.inode;
585 struct vfsmount *mnt;
586 node->index |= 1U<<31; 588 node->index |= 1U<<31;
587 list_for_each_entry(mnt, &list, mnt_list) { 589 if (iterate_mounts(compare_root, inode, root_mnt))
588 if (mnt->mnt_root->d_inode == inode) { 590 node->index &= ~(1U<<31);
589 node->index &= ~(1U<<31);
590 break;
591 }
592 }
593 } 591 }
594 spin_unlock(&hash_lock); 592 spin_unlock(&hash_lock);
595 trim_marked(tree); 593 trim_marked(tree);
596 put_tree(tree); 594 put_tree(tree);
597 list_del_init(&list);
598 drop_collected_mounts(root_mnt); 595 drop_collected_mounts(root_mnt);
599skip_it: 596skip_it:
600 mutex_lock(&audit_filter_mutex); 597 mutex_lock(&audit_filter_mutex);
@@ -603,22 +600,6 @@ skip_it:
603 mutex_unlock(&audit_filter_mutex); 600 mutex_unlock(&audit_filter_mutex);
604} 601}
605 602
606static int is_under(struct vfsmount *mnt, struct dentry *dentry,
607 struct path *path)
608{
609 if (mnt != path->mnt) {
610 for (;;) {
611 if (mnt->mnt_parent == mnt)
612 return 0;
613 if (mnt->mnt_parent == path->mnt)
614 break;
615 mnt = mnt->mnt_parent;
616 }
617 dentry = mnt->mnt_mountpoint;
618 }
619 return is_subdir(dentry, path->dentry);
620}
621
622int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) 603int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
623{ 604{
624 605
@@ -638,13 +619,17 @@ void audit_put_tree(struct audit_tree *tree)
638 put_tree(tree); 619 put_tree(tree);
639} 620}
640 621
622static int tag_mount(struct vfsmount *mnt, void *arg)
623{
624 return tag_chunk(mnt->mnt_root->d_inode, arg);
625}
626
641/* called with audit_filter_mutex */ 627/* called with audit_filter_mutex */
642int audit_add_tree_rule(struct audit_krule *rule) 628int audit_add_tree_rule(struct audit_krule *rule)
643{ 629{
644 struct audit_tree *seed = rule->tree, *tree; 630 struct audit_tree *seed = rule->tree, *tree;
645 struct path path; 631 struct path path;
646 struct vfsmount *mnt, *p; 632 struct vfsmount *mnt;
647 struct list_head list;
648 int err; 633 int err;
649 634
650 list_for_each_entry(tree, &tree_list, list) { 635 list_for_each_entry(tree, &tree_list, list) {
@@ -670,16 +655,9 @@ int audit_add_tree_rule(struct audit_krule *rule)
670 err = -ENOMEM; 655 err = -ENOMEM;
671 goto Err; 656 goto Err;
672 } 657 }
673 list_add_tail(&list, &mnt->mnt_list);
674 658
675 get_tree(tree); 659 get_tree(tree);
676 list_for_each_entry(p, &list, mnt_list) { 660 err = iterate_mounts(tag_mount, tree, mnt);
677 err = tag_chunk(p->mnt_root->d_inode, tree);
678 if (err)
679 break;
680 }
681
682 list_del(&list);
683 drop_collected_mounts(mnt); 661 drop_collected_mounts(mnt);
684 662
685 if (!err) { 663 if (!err) {
@@ -714,31 +692,23 @@ int audit_tag_tree(char *old, char *new)
714{ 692{
715 struct list_head cursor, barrier; 693 struct list_head cursor, barrier;
716 int failed = 0; 694 int failed = 0;
717 struct path path; 695 struct path path1, path2;
718 struct vfsmount *tagged; 696 struct vfsmount *tagged;
719 struct list_head list;
720 struct vfsmount *mnt;
721 struct dentry *dentry;
722 int err; 697 int err;
723 698
724 err = kern_path(new, 0, &path); 699 err = kern_path(new, 0, &path2);
725 if (err) 700 if (err)
726 return err; 701 return err;
727 tagged = collect_mounts(&path); 702 tagged = collect_mounts(&path2);
728 path_put(&path); 703 path_put(&path2);
729 if (!tagged) 704 if (!tagged)
730 return -ENOMEM; 705 return -ENOMEM;
731 706
732 err = kern_path(old, 0, &path); 707 err = kern_path(old, 0, &path1);
733 if (err) { 708 if (err) {
734 drop_collected_mounts(tagged); 709 drop_collected_mounts(tagged);
735 return err; 710 return err;
736 } 711 }
737 mnt = mntget(path.mnt);
738 dentry = dget(path.dentry);
739 path_put(&path);
740
741 list_add_tail(&list, &tagged->mnt_list);
742 712
743 mutex_lock(&audit_filter_mutex); 713 mutex_lock(&audit_filter_mutex);
744 list_add(&barrier, &tree_list); 714 list_add(&barrier, &tree_list);
@@ -746,7 +716,7 @@ int audit_tag_tree(char *old, char *new)
746 716
747 while (cursor.next != &tree_list) { 717 while (cursor.next != &tree_list) {
748 struct audit_tree *tree; 718 struct audit_tree *tree;
749 struct vfsmount *p; 719 int good_one = 0;
750 720
751 tree = container_of(cursor.next, struct audit_tree, list); 721 tree = container_of(cursor.next, struct audit_tree, list);
752 get_tree(tree); 722 get_tree(tree);
@@ -754,30 +724,19 @@ int audit_tag_tree(char *old, char *new)
754 list_add(&cursor, &tree->list); 724 list_add(&cursor, &tree->list);
755 mutex_unlock(&audit_filter_mutex); 725 mutex_unlock(&audit_filter_mutex);
756 726
757 err = kern_path(tree->pathname, 0, &path); 727 err = kern_path(tree->pathname, 0, &path2);
758 if (err) { 728 if (!err) {
759 put_tree(tree); 729 good_one = path_is_under(&path1, &path2);
760 mutex_lock(&audit_filter_mutex); 730 path_put(&path2);
761 continue;
762 } 731 }
763 732
764 spin_lock(&vfsmount_lock); 733 if (!good_one) {
765 if (!is_under(mnt, dentry, &path)) {
766 spin_unlock(&vfsmount_lock);
767 path_put(&path);
768 put_tree(tree); 734 put_tree(tree);
769 mutex_lock(&audit_filter_mutex); 735 mutex_lock(&audit_filter_mutex);
770 continue; 736 continue;
771 } 737 }
772 spin_unlock(&vfsmount_lock);
773 path_put(&path);
774
775 list_for_each_entry(p, &list, mnt_list) {
776 failed = tag_chunk(p->mnt_root->d_inode, tree);
777 if (failed)
778 break;
779 }
780 738
739 failed = iterate_mounts(tag_mount, tree, tagged);
781 if (failed) { 740 if (failed) {
782 put_tree(tree); 741 put_tree(tree);
783 mutex_lock(&audit_filter_mutex); 742 mutex_lock(&audit_filter_mutex);
@@ -818,10 +777,8 @@ int audit_tag_tree(char *old, char *new)
818 } 777 }
819 list_del(&barrier); 778 list_del(&barrier);
820 list_del(&cursor); 779 list_del(&cursor);
821 list_del(&list);
822 mutex_unlock(&audit_filter_mutex); 780 mutex_unlock(&audit_filter_mutex);
823 dput(dentry); 781 path_put(&path1);
824 mntput(mnt);
825 drop_collected_mounts(tagged); 782 drop_collected_mounts(tagged);
826 return failed; 783 return failed;
827} 784}
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index cc7e87936cbc..8df43696f4ba 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -27,6 +27,7 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/slab.h>
30#include <linux/inotify.h> 31#include <linux/inotify.h>
31#include <linux/security.h> 32#include <linux/security.h>
32#include "audit.h" 33#include "audit.h"
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a70604047f3c..ce08041f578d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,6 +27,7 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/slab.h>
30#include <linux/security.h> 31#include <linux/security.h>
31#include "audit.h" 32#include "audit.h"
32 33
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fc0f928167e7..3828ad5fb8f1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -49,6 +49,7 @@
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mm.h> 50#include <linux/mm.h>
51#include <linux/module.h> 51#include <linux/module.h>
52#include <linux/slab.h>
52#include <linux/mount.h> 53#include <linux/mount.h>
53#include <linux/socket.h> 54#include <linux/socket.h>
54#include <linux/mqueue.h> 55#include <linux/mqueue.h>
@@ -1893,7 +1894,7 @@ static int audit_inc_name_count(struct audit_context *context,
1893{ 1894{
1894 if (context->name_count >= AUDIT_NAMES) { 1895 if (context->name_count >= AUDIT_NAMES) {
1895 if (inode) 1896 if (inode)
1896 printk(KERN_DEBUG "name_count maxed, losing inode data: " 1897 printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
1897 "dev=%02x:%02x, inode=%lu\n", 1898 "dev=%02x:%02x, inode=%lu\n",
1898 MAJOR(inode->i_sb->s_dev), 1899 MAJOR(inode->i_sb->s_dev),
1899 MINOR(inode->i_sb->s_dev), 1900 MINOR(inode->i_sb->s_dev),
@@ -1988,7 +1989,6 @@ void __audit_inode(const char *name, const struct dentry *dentry)
1988 1989
1989/** 1990/**
1990 * audit_inode_child - collect inode info for created/removed objects 1991 * audit_inode_child - collect inode info for created/removed objects
1991 * @dname: inode's dentry name
1992 * @dentry: dentry being audited 1992 * @dentry: dentry being audited
1993 * @parent: inode of dentry parent 1993 * @parent: inode of dentry parent
1994 * 1994 *
@@ -2000,13 +2000,14 @@ void __audit_inode(const char *name, const struct dentry *dentry)
2000 * must be hooked prior, in order to capture the target inode during 2000 * must be hooked prior, in order to capture the target inode during
2001 * unsuccessful attempts. 2001 * unsuccessful attempts.
2002 */ 2002 */
2003void __audit_inode_child(const char *dname, const struct dentry *dentry, 2003void __audit_inode_child(const struct dentry *dentry,
2004 const struct inode *parent) 2004 const struct inode *parent)
2005{ 2005{
2006 int idx; 2006 int idx;
2007 struct audit_context *context = current->audit_context; 2007 struct audit_context *context = current->audit_context;
2008 const char *found_parent = NULL, *found_child = NULL; 2008 const char *found_parent = NULL, *found_child = NULL;
2009 const struct inode *inode = dentry->d_inode; 2009 const struct inode *inode = dentry->d_inode;
2010 const char *dname = dentry->d_name.name;
2010 int dirlen = 0; 2011 int dirlen = 0;
2011 2012
2012 if (!context->in_syscall) 2013 if (!context->in_syscall)
@@ -2014,9 +2015,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
2014 2015
2015 if (inode) 2016 if (inode)
2016 handle_one(inode); 2017 handle_one(inode);
2017 /* determine matching parent */
2018 if (!dname)
2019 goto add_names;
2020 2018
2021 /* parent is more likely, look for it first */ 2019 /* parent is more likely, look for it first */
2022 for (idx = 0; idx < context->name_count; idx++) { 2020 for (idx = 0; idx < context->name_count; idx++) {
diff --git a/kernel/capability.c b/kernel/capability.c
index 7f876e60521f..9e4697e9b276 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -135,7 +135,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
135 if (pid && (pid != task_pid_vnr(current))) { 135 if (pid && (pid != task_pid_vnr(current))) {
136 struct task_struct *target; 136 struct task_struct *target;
137 137
138 read_lock(&tasklist_lock); 138 rcu_read_lock();
139 139
140 target = find_task_by_vpid(pid); 140 target = find_task_by_vpid(pid);
141 if (!target) 141 if (!target)
@@ -143,7 +143,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
143 else 143 else
144 ret = security_capget(target, pEp, pIp, pPp); 144 ret = security_capget(target, pEp, pIp, pPp);
145 145
146 read_unlock(&tasklist_lock); 146 rcu_read_unlock();
147 } else 147 } else
148 ret = security_capget(current, pEp, pIp, pPp); 148 ret = security_capget(current, pEp, pIp, pPp);
149 149
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1fbcc748044a..3a53c771e503 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Notifications support
8 * Copyright (C) 2009 Nokia Corporation
9 * Author: Kirill A. Shutemov
10 *
7 * Copyright notices from the original cpuset code: 11 * Copyright notices from the original cpuset code:
8 * -------------------------------------------------- 12 * --------------------------------------------------
9 * Copyright (C) 2003 BULL SA. 13 * Copyright (C) 2003 BULL SA.
@@ -43,6 +47,7 @@
43#include <linux/string.h> 47#include <linux/string.h>
44#include <linux/sort.h> 48#include <linux/sort.h>
45#include <linux/kmod.h> 49#include <linux/kmod.h>
50#include <linux/module.h>
46#include <linux/delayacct.h> 51#include <linux/delayacct.h>
47#include <linux/cgroupstats.h> 52#include <linux/cgroupstats.h>
48#include <linux/hash.h> 53#include <linux/hash.h>
@@ -51,15 +56,21 @@
51#include <linux/pid_namespace.h> 56#include <linux/pid_namespace.h>
52#include <linux/idr.h> 57#include <linux/idr.h>
53#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59#include <linux/eventfd.h>
60#include <linux/poll.h>
54 61
55#include <asm/atomic.h> 62#include <asm/atomic.h>
56 63
57static DEFINE_MUTEX(cgroup_mutex); 64static DEFINE_MUTEX(cgroup_mutex);
58 65
59/* Generate an array of cgroup subsystem pointers */ 66/*
67 * Generate an array of cgroup subsystem pointers. At boot time, this is
68 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
69 * registered after that. The mutable section of this array is protected by
70 * cgroup_mutex.
71 */
60#define SUBSYS(_x) &_x ## _subsys, 72#define SUBSYS(_x) &_x ## _subsys,
61 73static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
62static struct cgroup_subsys *subsys[] = {
63#include <linux/cgroup_subsys.h> 74#include <linux/cgroup_subsys.h>
64}; 75};
65 76
@@ -146,6 +157,35 @@ struct css_id {
146 unsigned short stack[0]; /* Array of Length (depth+1) */ 157 unsigned short stack[0]; /* Array of Length (depth+1) */
147}; 158};
148 159
160/*
161 * cgroup_event represents events which userspace want to recieve.
162 */
163struct cgroup_event {
164 /*
165 * Cgroup which the event belongs to.
166 */
167 struct cgroup *cgrp;
168 /*
169 * Control file which the event associated.
170 */
171 struct cftype *cft;
172 /*
173 * eventfd to signal userspace about the event.
174 */
175 struct eventfd_ctx *eventfd;
176 /*
177 * Each of these stored in a list by the cgroup.
178 */
179 struct list_head list;
180 /*
181 * All fields below needed to unregister event when
182 * userspace closes eventfd.
183 */
184 poll_table pt;
185 wait_queue_head_t *wqh;
186 wait_queue_t wait;
187 struct work_struct remove;
188};
149 189
150/* The list of hierarchy roots */ 190/* The list of hierarchy roots */
151 191
@@ -166,6 +206,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
166 */ 206 */
167static int need_forkexit_callback __read_mostly; 207static int need_forkexit_callback __read_mostly;
168 208
209#ifdef CONFIG_PROVE_LOCKING
210int cgroup_lock_is_held(void)
211{
212 return lockdep_is_held(&cgroup_mutex);
213}
214#else /* #ifdef CONFIG_PROVE_LOCKING */
215int cgroup_lock_is_held(void)
216{
217 return mutex_is_locked(&cgroup_mutex);
218}
219#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
220
221EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
222
169/* convenient tests for these bits */ 223/* convenient tests for these bits */
170inline int cgroup_is_removed(const struct cgroup *cgrp) 224inline int cgroup_is_removed(const struct cgroup *cgrp)
171{ 225{
@@ -235,7 +289,8 @@ struct cg_cgroup_link {
235static struct css_set init_css_set; 289static struct css_set init_css_set;
236static struct cg_cgroup_link init_css_set_link; 290static struct cg_cgroup_link init_css_set_link;
237 291
238static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); 292static int cgroup_init_idr(struct cgroup_subsys *ss,
293 struct cgroup_subsys_state *css);
239 294
240/* css_set_lock protects the list of css_set objects, and the 295/* css_set_lock protects the list of css_set objects, and the
241 * chain of tasks off each css_set. Nests outside task->alloc_lock 296 * chain of tasks off each css_set. Nests outside task->alloc_lock
@@ -433,8 +488,11 @@ static struct css_set *find_existing_css_set(
433 struct hlist_node *node; 488 struct hlist_node *node;
434 struct css_set *cg; 489 struct css_set *cg;
435 490
436 /* Built the set of subsystem state objects that we want to 491 /*
437 * see in the new css_set */ 492 * Build the set of subsystem state objects that we want to see in the
493 * new css_set. while subsystems can change globally, the entries here
494 * won't change, so no need for locking.
495 */
438 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 496 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
439 if (root->subsys_bits & (1UL << i)) { 497 if (root->subsys_bits & (1UL << i)) {
440 /* Subsystem is in this hierarchy. So we want 498 /* Subsystem is in this hierarchy. So we want
@@ -681,6 +739,7 @@ void cgroup_lock(void)
681{ 739{
682 mutex_lock(&cgroup_mutex); 740 mutex_lock(&cgroup_mutex);
683} 741}
742EXPORT_SYMBOL_GPL(cgroup_lock);
684 743
685/** 744/**
686 * cgroup_unlock - release lock on cgroup changes 745 * cgroup_unlock - release lock on cgroup changes
@@ -691,6 +750,7 @@ void cgroup_unlock(void)
691{ 750{
692 mutex_unlock(&cgroup_mutex); 751 mutex_unlock(&cgroup_mutex);
693} 752}
753EXPORT_SYMBOL_GPL(cgroup_unlock);
694 754
695/* 755/*
696 * A couple of forward declarations required, due to cyclic reference loop: 756 * A couple of forward declarations required, due to cyclic reference loop:
@@ -742,6 +802,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
742 if (ret) 802 if (ret)
743 break; 803 break;
744 } 804 }
805
745 return ret; 806 return ret;
746} 807}
747 808
@@ -869,7 +930,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
869 css_put(css); 930 css_put(css);
870} 931}
871 932
872 933/*
934 * Call with cgroup_mutex held. Drops reference counts on modules, including
935 * any duplicate ones that parse_cgroupfs_options took. If this function
936 * returns an error, no reference counts are touched.
937 */
873static int rebind_subsystems(struct cgroupfs_root *root, 938static int rebind_subsystems(struct cgroupfs_root *root,
874 unsigned long final_bits) 939 unsigned long final_bits)
875{ 940{
@@ -877,6 +942,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
877 struct cgroup *cgrp = &root->top_cgroup; 942 struct cgroup *cgrp = &root->top_cgroup;
878 int i; 943 int i;
879 944
945 BUG_ON(!mutex_is_locked(&cgroup_mutex));
946
880 removed_bits = root->actual_subsys_bits & ~final_bits; 947 removed_bits = root->actual_subsys_bits & ~final_bits;
881 added_bits = final_bits & ~root->actual_subsys_bits; 948 added_bits = final_bits & ~root->actual_subsys_bits;
882 /* Check that any added subsystems are currently free */ 949 /* Check that any added subsystems are currently free */
@@ -885,6 +952,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
885 struct cgroup_subsys *ss = subsys[i]; 952 struct cgroup_subsys *ss = subsys[i];
886 if (!(bit & added_bits)) 953 if (!(bit & added_bits))
887 continue; 954 continue;
955 /*
956 * Nobody should tell us to do a subsys that doesn't exist:
957 * parse_cgroupfs_options should catch that case and refcounts
958 * ensure that subsystems won't disappear once selected.
959 */
960 BUG_ON(ss == NULL);
888 if (ss->root != &rootnode) { 961 if (ss->root != &rootnode) {
889 /* Subsystem isn't free */ 962 /* Subsystem isn't free */
890 return -EBUSY; 963 return -EBUSY;
@@ -904,6 +977,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
904 unsigned long bit = 1UL << i; 977 unsigned long bit = 1UL << i;
905 if (bit & added_bits) { 978 if (bit & added_bits) {
906 /* We're binding this subsystem to this hierarchy */ 979 /* We're binding this subsystem to this hierarchy */
980 BUG_ON(ss == NULL);
907 BUG_ON(cgrp->subsys[i]); 981 BUG_ON(cgrp->subsys[i]);
908 BUG_ON(!dummytop->subsys[i]); 982 BUG_ON(!dummytop->subsys[i]);
909 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 983 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -915,8 +989,10 @@ static int rebind_subsystems(struct cgroupfs_root *root,
915 if (ss->bind) 989 if (ss->bind)
916 ss->bind(ss, cgrp); 990 ss->bind(ss, cgrp);
917 mutex_unlock(&ss->hierarchy_mutex); 991 mutex_unlock(&ss->hierarchy_mutex);
992 /* refcount was already taken, and we're keeping it */
918 } else if (bit & removed_bits) { 993 } else if (bit & removed_bits) {
919 /* We're removing this subsystem */ 994 /* We're removing this subsystem */
995 BUG_ON(ss == NULL);
920 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 996 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
921 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 997 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
922 mutex_lock(&ss->hierarchy_mutex); 998 mutex_lock(&ss->hierarchy_mutex);
@@ -927,9 +1003,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
927 subsys[i]->root = &rootnode; 1003 subsys[i]->root = &rootnode;
928 list_move(&ss->sibling, &rootnode.subsys_list); 1004 list_move(&ss->sibling, &rootnode.subsys_list);
929 mutex_unlock(&ss->hierarchy_mutex); 1005 mutex_unlock(&ss->hierarchy_mutex);
1006 /* subsystem is now free - drop reference on module */
1007 module_put(ss->module);
930 } else if (bit & final_bits) { 1008 } else if (bit & final_bits) {
931 /* Subsystem state should already exist */ 1009 /* Subsystem state should already exist */
1010 BUG_ON(ss == NULL);
932 BUG_ON(!cgrp->subsys[i]); 1011 BUG_ON(!cgrp->subsys[i]);
1012 /*
1013 * a refcount was taken, but we already had one, so
1014 * drop the extra reference.
1015 */
1016 module_put(ss->module);
1017#ifdef CONFIG_MODULE_UNLOAD
1018 BUG_ON(ss->module && !module_refcount(ss->module));
1019#endif
933 } else { 1020 } else {
934 /* Subsystem state shouldn't exist */ 1021 /* Subsystem state shouldn't exist */
935 BUG_ON(cgrp->subsys[i]); 1022 BUG_ON(cgrp->subsys[i]);
@@ -971,13 +1058,20 @@ struct cgroup_sb_opts {
971 1058
972}; 1059};
973 1060
974/* Convert a hierarchy specifier into a bitmask of subsystems and 1061/*
975 * flags. */ 1062 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
976static int parse_cgroupfs_options(char *data, 1063 * with cgroup_mutex held to protect the subsys[] array. This function takes
977 struct cgroup_sb_opts *opts) 1064 * refcounts on subsystems to be used, unless it returns error, in which case
1065 * no refcounts are taken.
1066 */
1067static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
978{ 1068{
979 char *token, *o = data ?: "all"; 1069 char *token, *o = data ?: "all";
980 unsigned long mask = (unsigned long)-1; 1070 unsigned long mask = (unsigned long)-1;
1071 int i;
1072 bool module_pin_failed = false;
1073
1074 BUG_ON(!mutex_is_locked(&cgroup_mutex));
981 1075
982#ifdef CONFIG_CPUSETS 1076#ifdef CONFIG_CPUSETS
983 mask = ~(1UL << cpuset_subsys_id); 1077 mask = ~(1UL << cpuset_subsys_id);
@@ -990,10 +1084,11 @@ static int parse_cgroupfs_options(char *data,
990 return -EINVAL; 1084 return -EINVAL;
991 if (!strcmp(token, "all")) { 1085 if (!strcmp(token, "all")) {
992 /* Add all non-disabled subsystems */ 1086 /* Add all non-disabled subsystems */
993 int i;
994 opts->subsys_bits = 0; 1087 opts->subsys_bits = 0;
995 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1088 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
996 struct cgroup_subsys *ss = subsys[i]; 1089 struct cgroup_subsys *ss = subsys[i];
1090 if (ss == NULL)
1091 continue;
997 if (!ss->disabled) 1092 if (!ss->disabled)
998 opts->subsys_bits |= 1ul << i; 1093 opts->subsys_bits |= 1ul << i;
999 } 1094 }
@@ -1011,7 +1106,6 @@ static int parse_cgroupfs_options(char *data,
1011 if (!opts->release_agent) 1106 if (!opts->release_agent)
1012 return -ENOMEM; 1107 return -ENOMEM;
1013 } else if (!strncmp(token, "name=", 5)) { 1108 } else if (!strncmp(token, "name=", 5)) {
1014 int i;
1015 const char *name = token + 5; 1109 const char *name = token + 5;
1016 /* Can't specify an empty name */ 1110 /* Can't specify an empty name */
1017 if (!strlen(name)) 1111 if (!strlen(name))
@@ -1035,9 +1129,10 @@ static int parse_cgroupfs_options(char *data,
1035 return -ENOMEM; 1129 return -ENOMEM;
1036 } else { 1130 } else {
1037 struct cgroup_subsys *ss; 1131 struct cgroup_subsys *ss;
1038 int i;
1039 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1132 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1040 ss = subsys[i]; 1133 ss = subsys[i];
1134 if (ss == NULL)
1135 continue;
1041 if (!strcmp(token, ss->name)) { 1136 if (!strcmp(token, ss->name)) {
1042 if (!ss->disabled) 1137 if (!ss->disabled)
1043 set_bit(i, &opts->subsys_bits); 1138 set_bit(i, &opts->subsys_bits);
@@ -1072,9 +1167,54 @@ static int parse_cgroupfs_options(char *data,
1072 if (!opts->subsys_bits && !opts->name) 1167 if (!opts->subsys_bits && !opts->name)
1073 return -EINVAL; 1168 return -EINVAL;
1074 1169
1170 /*
1171 * Grab references on all the modules we'll need, so the subsystems
1172 * don't dance around before rebind_subsystems attaches them. This may
1173 * take duplicate reference counts on a subsystem that's already used,
1174 * but rebind_subsystems handles this case.
1175 */
1176 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1177 unsigned long bit = 1UL << i;
1178
1179 if (!(bit & opts->subsys_bits))
1180 continue;
1181 if (!try_module_get(subsys[i]->module)) {
1182 module_pin_failed = true;
1183 break;
1184 }
1185 }
1186 if (module_pin_failed) {
1187 /*
1188 * oops, one of the modules was going away. this means that we
1189 * raced with a module_delete call, and to the user this is
1190 * essentially a "subsystem doesn't exist" case.
1191 */
1192 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
1193 /* drop refcounts only on the ones we took */
1194 unsigned long bit = 1UL << i;
1195
1196 if (!(bit & opts->subsys_bits))
1197 continue;
1198 module_put(subsys[i]->module);
1199 }
1200 return -ENOENT;
1201 }
1202
1075 return 0; 1203 return 0;
1076} 1204}
1077 1205
1206static void drop_parsed_module_refcounts(unsigned long subsys_bits)
1207{
1208 int i;
1209 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1210 unsigned long bit = 1UL << i;
1211
1212 if (!(bit & subsys_bits))
1213 continue;
1214 module_put(subsys[i]->module);
1215 }
1216}
1217
1078static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1218static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1079{ 1219{
1080 int ret = 0; 1220 int ret = 0;
@@ -1091,21 +1231,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1091 if (ret) 1231 if (ret)
1092 goto out_unlock; 1232 goto out_unlock;
1093 1233
1094 /* Don't allow flags to change at remount */ 1234 /* Don't allow flags or name to change at remount */
1095 if (opts.flags != root->flags) { 1235 if (opts.flags != root->flags ||
1096 ret = -EINVAL; 1236 (opts.name && strcmp(opts.name, root->name))) {
1097 goto out_unlock;
1098 }
1099
1100 /* Don't allow name to change at remount */
1101 if (opts.name && strcmp(opts.name, root->name)) {
1102 ret = -EINVAL; 1237 ret = -EINVAL;
1238 drop_parsed_module_refcounts(opts.subsys_bits);
1103 goto out_unlock; 1239 goto out_unlock;
1104 } 1240 }
1105 1241
1106 ret = rebind_subsystems(root, opts.subsys_bits); 1242 ret = rebind_subsystems(root, opts.subsys_bits);
1107 if (ret) 1243 if (ret) {
1244 drop_parsed_module_refcounts(opts.subsys_bits);
1108 goto out_unlock; 1245 goto out_unlock;
1246 }
1109 1247
1110 /* (re)populate subsystem files */ 1248 /* (re)populate subsystem files */
1111 cgroup_populate_dir(cgrp); 1249 cgroup_populate_dir(cgrp);
@@ -1136,6 +1274,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1136 INIT_LIST_HEAD(&cgrp->release_list); 1274 INIT_LIST_HEAD(&cgrp->release_list);
1137 INIT_LIST_HEAD(&cgrp->pidlists); 1275 INIT_LIST_HEAD(&cgrp->pidlists);
1138 mutex_init(&cgrp->pidlist_mutex); 1276 mutex_init(&cgrp->pidlist_mutex);
1277 INIT_LIST_HEAD(&cgrp->event_list);
1278 spin_lock_init(&cgrp->event_list_lock);
1139} 1279}
1140 1280
1141static void init_cgroup_root(struct cgroupfs_root *root) 1281static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1291,7 +1431,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1291 struct cgroupfs_root *new_root; 1431 struct cgroupfs_root *new_root;
1292 1432
1293 /* First find the desired set of subsystems */ 1433 /* First find the desired set of subsystems */
1434 mutex_lock(&cgroup_mutex);
1294 ret = parse_cgroupfs_options(data, &opts); 1435 ret = parse_cgroupfs_options(data, &opts);
1436 mutex_unlock(&cgroup_mutex);
1295 if (ret) 1437 if (ret)
1296 goto out_err; 1438 goto out_err;
1297 1439
@@ -1302,7 +1444,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1302 new_root = cgroup_root_from_opts(&opts); 1444 new_root = cgroup_root_from_opts(&opts);
1303 if (IS_ERR(new_root)) { 1445 if (IS_ERR(new_root)) {
1304 ret = PTR_ERR(new_root); 1446 ret = PTR_ERR(new_root);
1305 goto out_err; 1447 goto drop_modules;
1306 } 1448 }
1307 opts.new_root = new_root; 1449 opts.new_root = new_root;
1308 1450
@@ -1311,7 +1453,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1311 if (IS_ERR(sb)) { 1453 if (IS_ERR(sb)) {
1312 ret = PTR_ERR(sb); 1454 ret = PTR_ERR(sb);
1313 cgroup_drop_root(opts.new_root); 1455 cgroup_drop_root(opts.new_root);
1314 goto out_err; 1456 goto drop_modules;
1315 } 1457 }
1316 1458
1317 root = sb->s_fs_info; 1459 root = sb->s_fs_info;
@@ -1367,6 +1509,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1367 free_cg_links(&tmp_cg_links); 1509 free_cg_links(&tmp_cg_links);
1368 goto drop_new_super; 1510 goto drop_new_super;
1369 } 1511 }
1512 /*
1513 * There must be no failure case after here, since rebinding
1514 * takes care of subsystems' refcounts, which are explicitly
1515 * dropped in the failure exit path.
1516 */
1370 1517
1371 /* EBUSY should be the only error here */ 1518 /* EBUSY should be the only error here */
1372 BUG_ON(ret); 1519 BUG_ON(ret);
@@ -1405,6 +1552,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1405 * any) is not needed 1552 * any) is not needed
1406 */ 1553 */
1407 cgroup_drop_root(opts.new_root); 1554 cgroup_drop_root(opts.new_root);
1555 /* no subsys rebinding, so refcounts don't change */
1556 drop_parsed_module_refcounts(opts.subsys_bits);
1408 } 1557 }
1409 1558
1410 simple_set_mnt(mnt, sb); 1559 simple_set_mnt(mnt, sb);
@@ -1414,6 +1563,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1414 1563
1415 drop_new_super: 1564 drop_new_super:
1416 deactivate_locked_super(sb); 1565 deactivate_locked_super(sb);
1566 drop_modules:
1567 drop_parsed_module_refcounts(opts.subsys_bits);
1417 out_err: 1568 out_err:
1418 kfree(opts.release_agent); 1569 kfree(opts.release_agent);
1419 kfree(opts.name); 1570 kfree(opts.name);
@@ -1495,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
1495int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1646int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1496{ 1647{
1497 char *start; 1648 char *start;
1498 struct dentry *dentry = rcu_dereference(cgrp->dentry); 1649 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1650 rcu_read_lock_held() ||
1651 cgroup_lock_is_held());
1499 1652
1500 if (!dentry || cgrp == dummytop) { 1653 if (!dentry || cgrp == dummytop) {
1501 /* 1654 /*
@@ -1511,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1511 *--start = '\0'; 1664 *--start = '\0';
1512 for (;;) { 1665 for (;;) {
1513 int len = dentry->d_name.len; 1666 int len = dentry->d_name.len;
1667
1514 if ((start -= len) < buf) 1668 if ((start -= len) < buf)
1515 return -ENAMETOOLONG; 1669 return -ENAMETOOLONG;
1516 memcpy(start, cgrp->dentry->d_name.name, len); 1670 memcpy(start, dentry->d_name.name, len);
1517 cgrp = cgrp->parent; 1671 cgrp = cgrp->parent;
1518 if (!cgrp) 1672 if (!cgrp)
1519 break; 1673 break;
1520 dentry = rcu_dereference(cgrp->dentry); 1674
1675 dentry = rcu_dereference_check(cgrp->dentry,
1676 rcu_read_lock_held() ||
1677 cgroup_lock_is_held());
1521 if (!cgrp->parent) 1678 if (!cgrp->parent)
1522 continue; 1679 continue;
1523 if (--start < buf) 1680 if (--start < buf)
@@ -1527,6 +1684,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1527 memmove(buf, start, buf + buflen - start); 1684 memmove(buf, start, buf + buflen - start);
1528 return 0; 1685 return 0;
1529} 1686}
1687EXPORT_SYMBOL_GPL(cgroup_path);
1530 1688
1531/** 1689/**
1532 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1690 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
@@ -1539,7 +1697,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1539int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1697int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1540{ 1698{
1541 int retval = 0; 1699 int retval = 0;
1542 struct cgroup_subsys *ss; 1700 struct cgroup_subsys *ss, *failed_ss = NULL;
1543 struct cgroup *oldcgrp; 1701 struct cgroup *oldcgrp;
1544 struct css_set *cg; 1702 struct css_set *cg;
1545 struct css_set *newcg; 1703 struct css_set *newcg;
@@ -1553,8 +1711,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1553 for_each_subsys(root, ss) { 1711 for_each_subsys(root, ss) {
1554 if (ss->can_attach) { 1712 if (ss->can_attach) {
1555 retval = ss->can_attach(ss, cgrp, tsk, false); 1713 retval = ss->can_attach(ss, cgrp, tsk, false);
1556 if (retval) 1714 if (retval) {
1557 return retval; 1715 /*
1716 * Remember on which subsystem the can_attach()
1717 * failed, so that we only call cancel_attach()
1718 * against the subsystems whose can_attach()
1719 * succeeded. (See below)
1720 */
1721 failed_ss = ss;
1722 goto out;
1723 }
1558 } 1724 }
1559 } 1725 }
1560 1726
@@ -1568,14 +1734,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1568 */ 1734 */
1569 newcg = find_css_set(cg, cgrp); 1735 newcg = find_css_set(cg, cgrp);
1570 put_css_set(cg); 1736 put_css_set(cg);
1571 if (!newcg) 1737 if (!newcg) {
1572 return -ENOMEM; 1738 retval = -ENOMEM;
1739 goto out;
1740 }
1573 1741
1574 task_lock(tsk); 1742 task_lock(tsk);
1575 if (tsk->flags & PF_EXITING) { 1743 if (tsk->flags & PF_EXITING) {
1576 task_unlock(tsk); 1744 task_unlock(tsk);
1577 put_css_set(newcg); 1745 put_css_set(newcg);
1578 return -ESRCH; 1746 retval = -ESRCH;
1747 goto out;
1579 } 1748 }
1580 rcu_assign_pointer(tsk->cgroups, newcg); 1749 rcu_assign_pointer(tsk->cgroups, newcg);
1581 task_unlock(tsk); 1750 task_unlock(tsk);
@@ -1601,7 +1770,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1601 * is no longer empty. 1770 * is no longer empty.
1602 */ 1771 */
1603 cgroup_wakeup_rmdir_waiter(cgrp); 1772 cgroup_wakeup_rmdir_waiter(cgrp);
1604 return 0; 1773out:
1774 if (retval) {
1775 for_each_subsys(root, ss) {
1776 if (ss == failed_ss)
1777 /*
1778 * This subsystem was the one that failed the
1779 * can_attach() check earlier, so we don't need
1780 * to call cancel_attach() against it or any
1781 * remaining subsystems.
1782 */
1783 break;
1784 if (ss->cancel_attach)
1785 ss->cancel_attach(ss, cgrp, tsk, false);
1786 }
1787 }
1788 return retval;
1605} 1789}
1606 1790
1607/* 1791/*
@@ -1667,6 +1851,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp)
1667 } 1851 }
1668 return true; 1852 return true;
1669} 1853}
1854EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
1670 1855
1671static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 1856static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1672 const char *buffer) 1857 const char *buffer)
@@ -1935,6 +2120,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
1935 .rename = cgroup_rename, 2120 .rename = cgroup_rename,
1936}; 2121};
1937 2122
2123/*
2124 * Check if a file is a control file
2125 */
2126static inline struct cftype *__file_cft(struct file *file)
2127{
2128 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
2129 return ERR_PTR(-EINVAL);
2130 return __d_cft(file->f_dentry);
2131}
2132
1938static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2133static int cgroup_create_file(struct dentry *dentry, mode_t mode,
1939 struct super_block *sb) 2134 struct super_block *sb)
1940{ 2135{
@@ -2054,6 +2249,7 @@ int cgroup_add_file(struct cgroup *cgrp,
2054 error = PTR_ERR(dentry); 2249 error = PTR_ERR(dentry);
2055 return error; 2250 return error;
2056} 2251}
2252EXPORT_SYMBOL_GPL(cgroup_add_file);
2057 2253
2058int cgroup_add_files(struct cgroup *cgrp, 2254int cgroup_add_files(struct cgroup *cgrp,
2059 struct cgroup_subsys *subsys, 2255 struct cgroup_subsys *subsys,
@@ -2068,6 +2264,7 @@ int cgroup_add_files(struct cgroup *cgrp,
2068 } 2264 }
2069 return 0; 2265 return 0;
2070} 2266}
2267EXPORT_SYMBOL_GPL(cgroup_add_files);
2071 2268
2072/** 2269/**
2073 * cgroup_task_count - count the number of tasks in a cgroup. 2270 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2453,7 +2650,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2453{ 2650{
2454 struct cgroup_pidlist *l; 2651 struct cgroup_pidlist *l;
2455 /* don't need task_nsproxy() if we're looking at ourself */ 2652 /* don't need task_nsproxy() if we're looking at ourself */
2456 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); 2653 struct pid_namespace *ns = current->nsproxy->pid_ns;
2654
2457 /* 2655 /*
2458 * We can't drop the pidlist_mutex before taking the l->mutex in case 2656 * We can't drop the pidlist_mutex before taking the l->mutex in case
2459 * the last ref-holder is trying to remove l from the list at the same 2657 * the last ref-holder is trying to remove l from the list at the same
@@ -2463,8 +2661,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2463 mutex_lock(&cgrp->pidlist_mutex); 2661 mutex_lock(&cgrp->pidlist_mutex);
2464 list_for_each_entry(l, &cgrp->pidlists, links) { 2662 list_for_each_entry(l, &cgrp->pidlists, links) {
2465 if (l->key.type == type && l->key.ns == ns) { 2663 if (l->key.type == type && l->key.ns == ns) {
2466 /* found a matching list - drop the extra refcount */
2467 put_pid_ns(ns);
2468 /* make sure l doesn't vanish out from under us */ 2664 /* make sure l doesn't vanish out from under us */
2469 down_write(&l->mutex); 2665 down_write(&l->mutex);
2470 mutex_unlock(&cgrp->pidlist_mutex); 2666 mutex_unlock(&cgrp->pidlist_mutex);
@@ -2475,13 +2671,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2475 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 2671 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2476 if (!l) { 2672 if (!l) {
2477 mutex_unlock(&cgrp->pidlist_mutex); 2673 mutex_unlock(&cgrp->pidlist_mutex);
2478 put_pid_ns(ns);
2479 return l; 2674 return l;
2480 } 2675 }
2481 init_rwsem(&l->mutex); 2676 init_rwsem(&l->mutex);
2482 down_write(&l->mutex); 2677 down_write(&l->mutex);
2483 l->key.type = type; 2678 l->key.type = type;
2484 l->key.ns = ns; 2679 l->key.ns = get_pid_ns(ns);
2485 l->use_count = 0; /* don't increment here */ 2680 l->use_count = 0; /* don't increment here */
2486 l->list = NULL; 2681 l->list = NULL;
2487 l->owner = cgrp; 2682 l->owner = cgrp;
@@ -2789,6 +2984,174 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2789} 2984}
2790 2985
2791/* 2986/*
2987 * Unregister event and free resources.
2988 *
2989 * Gets called from workqueue.
2990 */
2991static void cgroup_event_remove(struct work_struct *work)
2992{
2993 struct cgroup_event *event = container_of(work, struct cgroup_event,
2994 remove);
2995 struct cgroup *cgrp = event->cgrp;
2996
2997 /* TODO: check return code */
2998 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
2999
3000 eventfd_ctx_put(event->eventfd);
3001 kfree(event);
3002 dput(cgrp->dentry);
3003}
3004
3005/*
3006 * Gets called on POLLHUP on eventfd when user closes it.
3007 *
3008 * Called with wqh->lock held and interrupts disabled.
3009 */
3010static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3011 int sync, void *key)
3012{
3013 struct cgroup_event *event = container_of(wait,
3014 struct cgroup_event, wait);
3015 struct cgroup *cgrp = event->cgrp;
3016 unsigned long flags = (unsigned long)key;
3017
3018 if (flags & POLLHUP) {
3019 remove_wait_queue_locked(event->wqh, &event->wait);
3020 spin_lock(&cgrp->event_list_lock);
3021 list_del(&event->list);
3022 spin_unlock(&cgrp->event_list_lock);
3023 /*
3024 * We are in atomic context, but cgroup_event_remove() may
3025 * sleep, so we have to call it in workqueue.
3026 */
3027 schedule_work(&event->remove);
3028 }
3029
3030 return 0;
3031}
3032
3033static void cgroup_event_ptable_queue_proc(struct file *file,
3034 wait_queue_head_t *wqh, poll_table *pt)
3035{
3036 struct cgroup_event *event = container_of(pt,
3037 struct cgroup_event, pt);
3038
3039 event->wqh = wqh;
3040 add_wait_queue(wqh, &event->wait);
3041}
3042
3043/*
3044 * Parse input and register new cgroup event handler.
3045 *
3046 * Input must be in format '<event_fd> <control_fd> <args>'.
3047 * Interpretation of args is defined by control file implementation.
3048 */
3049static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3050 const char *buffer)
3051{
3052 struct cgroup_event *event = NULL;
3053 unsigned int efd, cfd;
3054 struct file *efile = NULL;
3055 struct file *cfile = NULL;
3056 char *endp;
3057 int ret;
3058
3059 efd = simple_strtoul(buffer, &endp, 10);
3060 if (*endp != ' ')
3061 return -EINVAL;
3062 buffer = endp + 1;
3063
3064 cfd = simple_strtoul(buffer, &endp, 10);
3065 if ((*endp != ' ') && (*endp != '\0'))
3066 return -EINVAL;
3067 buffer = endp + 1;
3068
3069 event = kzalloc(sizeof(*event), GFP_KERNEL);
3070 if (!event)
3071 return -ENOMEM;
3072 event->cgrp = cgrp;
3073 INIT_LIST_HEAD(&event->list);
3074 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3075 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3076 INIT_WORK(&event->remove, cgroup_event_remove);
3077
3078 efile = eventfd_fget(efd);
3079 if (IS_ERR(efile)) {
3080 ret = PTR_ERR(efile);
3081 goto fail;
3082 }
3083
3084 event->eventfd = eventfd_ctx_fileget(efile);
3085 if (IS_ERR(event->eventfd)) {
3086 ret = PTR_ERR(event->eventfd);
3087 goto fail;
3088 }
3089
3090 cfile = fget(cfd);
3091 if (!cfile) {
3092 ret = -EBADF;
3093 goto fail;
3094 }
3095
3096 /* the process need read permission on control file */
3097 ret = file_permission(cfile, MAY_READ);
3098 if (ret < 0)
3099 goto fail;
3100
3101 event->cft = __file_cft(cfile);
3102 if (IS_ERR(event->cft)) {
3103 ret = PTR_ERR(event->cft);
3104 goto fail;
3105 }
3106
3107 if (!event->cft->register_event || !event->cft->unregister_event) {
3108 ret = -EINVAL;
3109 goto fail;
3110 }
3111
3112 ret = event->cft->register_event(cgrp, event->cft,
3113 event->eventfd, buffer);
3114 if (ret)
3115 goto fail;
3116
3117 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
3118 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3119 ret = 0;
3120 goto fail;
3121 }
3122
3123 /*
3124 * Events should be removed after rmdir of cgroup directory, but before
3125 * destroying subsystem state objects. Let's take reference to cgroup
3126 * directory dentry to do that.
3127 */
3128 dget(cgrp->dentry);
3129
3130 spin_lock(&cgrp->event_list_lock);
3131 list_add(&event->list, &cgrp->event_list);
3132 spin_unlock(&cgrp->event_list_lock);
3133
3134 fput(cfile);
3135 fput(efile);
3136
3137 return 0;
3138
3139fail:
3140 if (cfile)
3141 fput(cfile);
3142
3143 if (event && event->eventfd && !IS_ERR(event->eventfd))
3144 eventfd_ctx_put(event->eventfd);
3145
3146 if (!IS_ERR_OR_NULL(efile))
3147 fput(efile);
3148
3149 kfree(event);
3150
3151 return ret;
3152}
3153
3154/*
2792 * for the common functions, 'private' gives the type of file 3155 * for the common functions, 'private' gives the type of file
2793 */ 3156 */
2794/* for hysterical raisins, we can't put this on the older files */ 3157/* for hysterical raisins, we can't put this on the older files */
@@ -2813,6 +3176,11 @@ static struct cftype files[] = {
2813 .read_u64 = cgroup_read_notify_on_release, 3176 .read_u64 = cgroup_read_notify_on_release,
2814 .write_u64 = cgroup_write_notify_on_release, 3177 .write_u64 = cgroup_write_notify_on_release,
2815 }, 3178 },
3179 {
3180 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3181 .write_string = cgroup_write_event_control,
3182 .mode = S_IWUGO,
3183 },
2816}; 3184};
2817 3185
2818static struct cftype cft_release_agent = { 3186static struct cftype cft_release_agent = {
@@ -2877,8 +3245,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2877 /* We need to take each hierarchy_mutex in a consistent order */ 3245 /* We need to take each hierarchy_mutex in a consistent order */
2878 int i; 3246 int i;
2879 3247
3248 /*
3249 * No worry about a race with rebind_subsystems that might mess up the
3250 * locking order, since both parties are under cgroup_mutex.
3251 */
2880 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3252 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2881 struct cgroup_subsys *ss = subsys[i]; 3253 struct cgroup_subsys *ss = subsys[i];
3254 if (ss == NULL)
3255 continue;
2882 if (ss->root == root) 3256 if (ss->root == root)
2883 mutex_lock(&ss->hierarchy_mutex); 3257 mutex_lock(&ss->hierarchy_mutex);
2884 } 3258 }
@@ -2890,6 +3264,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2890 3264
2891 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3265 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2892 struct cgroup_subsys *ss = subsys[i]; 3266 struct cgroup_subsys *ss = subsys[i];
3267 if (ss == NULL)
3268 continue;
2893 if (ss->root == root) 3269 if (ss->root == root)
2894 mutex_unlock(&ss->hierarchy_mutex); 3270 mutex_unlock(&ss->hierarchy_mutex);
2895 } 3271 }
@@ -2936,14 +3312,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2936 3312
2937 for_each_subsys(root, ss) { 3313 for_each_subsys(root, ss) {
2938 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3314 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3315
2939 if (IS_ERR(css)) { 3316 if (IS_ERR(css)) {
2940 err = PTR_ERR(css); 3317 err = PTR_ERR(css);
2941 goto err_destroy; 3318 goto err_destroy;
2942 } 3319 }
2943 init_cgroup_css(css, ss, cgrp); 3320 init_cgroup_css(css, ss, cgrp);
2944 if (ss->use_id) 3321 if (ss->use_id) {
2945 if (alloc_css_id(ss, parent, cgrp)) 3322 err = alloc_css_id(ss, parent, cgrp);
3323 if (err)
2946 goto err_destroy; 3324 goto err_destroy;
3325 }
2947 /* At error, ->destroy() callback has to free assigned ID. */ 3326 /* At error, ->destroy() callback has to free assigned ID. */
2948 } 3327 }
2949 3328
@@ -3010,11 +3389,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3010 * synchronization other than RCU, and the subsystem linked 3389 * synchronization other than RCU, and the subsystem linked
3011 * list isn't RCU-safe */ 3390 * list isn't RCU-safe */
3012 int i; 3391 int i;
3392 /*
3393 * We won't need to lock the subsys array, because the subsystems
3394 * we're concerned about aren't going anywhere since our cgroup root
3395 * has a reference on them.
3396 */
3013 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3397 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3014 struct cgroup_subsys *ss = subsys[i]; 3398 struct cgroup_subsys *ss = subsys[i];
3015 struct cgroup_subsys_state *css; 3399 struct cgroup_subsys_state *css;
3016 /* Skip subsystems not in this hierarchy */ 3400 /* Skip subsystems not present or not in this hierarchy */
3017 if (ss->root != cgrp->root) 3401 if (ss == NULL || ss->root != cgrp->root)
3018 continue; 3402 continue;
3019 css = cgrp->subsys[ss->subsys_id]; 3403 css = cgrp->subsys[ss->subsys_id];
3020 /* When called from check_for_release() it's possible 3404 /* When called from check_for_release() it's possible
@@ -3088,6 +3472,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
3088 struct dentry *d; 3472 struct dentry *d;
3089 struct cgroup *parent; 3473 struct cgroup *parent;
3090 DEFINE_WAIT(wait); 3474 DEFINE_WAIT(wait);
3475 struct cgroup_event *event, *tmp;
3091 int ret; 3476 int ret;
3092 3477
3093 /* the vfs holds both inode->i_mutex already */ 3478 /* the vfs holds both inode->i_mutex already */
@@ -3171,6 +3556,20 @@ again:
3171 set_bit(CGRP_RELEASABLE, &parent->flags); 3556 set_bit(CGRP_RELEASABLE, &parent->flags);
3172 check_for_release(parent); 3557 check_for_release(parent);
3173 3558
3559 /*
3560 * Unregister events and notify userspace.
3561	 * Notify userspace about cgroup removal only after rmdir of the cgroup
3562	 * directory, to avoid a race between userspace and kernelspace
3563 */
3564 spin_lock(&cgrp->event_list_lock);
3565 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
3566 list_del(&event->list);
3567 remove_wait_queue(event->wqh, &event->wait);
3568 eventfd_signal(event->eventfd, 1);
3569 schedule_work(&event->remove);
3570 }
3571 spin_unlock(&cgrp->event_list_lock);
3572
3174 mutex_unlock(&cgroup_mutex); 3573 mutex_unlock(&cgroup_mutex);
3175 return 0; 3574 return 0;
3176} 3575}
@@ -3205,9 +3604,198 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
3205 mutex_init(&ss->hierarchy_mutex); 3604 mutex_init(&ss->hierarchy_mutex);
3206 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); 3605 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3207 ss->active = 1; 3606 ss->active = 1;
3607
3608 /* this function shouldn't be used with modular subsystems, since they
3609 * need to register a subsys_id, among other things */
3610 BUG_ON(ss->module);
3208} 3611}
3209 3612
3210/** 3613/**
3614 * cgroup_load_subsys: load and register a modular subsystem at runtime
3615 * @ss: the subsystem to load
3616 *
3617 * This function should be called in a modular subsystem's initcall. If the
3618 * subsystem is built as a module, it will be assigned a new subsys_id and set
3619 * up for use. If the subsystem is built-in anyway, work is delegated to the
3620 * simpler cgroup_init_subsys.
3621 */
3622int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
3623{
3624 int i;
3625 struct cgroup_subsys_state *css;
3626
3627 /* check name and function validity */
3628 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
3629 ss->create == NULL || ss->destroy == NULL)
3630 return -EINVAL;
3631
3632 /*
3633 * we don't support callbacks in modular subsystems. this check is
3634 * before the ss->module check for consistency; a subsystem that could
3635 * be a module should still have no callbacks even if the user isn't
3636 * compiling it as one.
3637 */
3638 if (ss->fork || ss->exit)
3639 return -EINVAL;
3640
3641 /*
3642 * an optionally modular subsystem is built-in: we want to do nothing,
3643 * since cgroup_init_subsys will have already taken care of it.
3644 */
3645 if (ss->module == NULL) {
3646 /* a few sanity checks */
3647 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
3648 BUG_ON(subsys[ss->subsys_id] != ss);
3649 return 0;
3650 }
3651
3652 /*
3653 * need to register a subsys id before anything else - for example,
3654 * init_cgroup_css needs it.
3655 */
3656 mutex_lock(&cgroup_mutex);
3657 /* find the first empty slot in the array */
3658 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
3659 if (subsys[i] == NULL)
3660 break;
3661 }
3662 if (i == CGROUP_SUBSYS_COUNT) {
3663 /* maximum number of subsystems already registered! */
3664 mutex_unlock(&cgroup_mutex);
3665 return -EBUSY;
3666 }
3667 /* assign ourselves the subsys_id */
3668 ss->subsys_id = i;
3669 subsys[i] = ss;
3670
3671 /*
3672 * no ss->create seems to need anything important in the ss struct, so
3673 * this can happen first (i.e. before the rootnode attachment).
3674 */
3675 css = ss->create(ss, dummytop);
3676 if (IS_ERR(css)) {
3677 /* failure case - need to deassign the subsys[] slot. */
3678 subsys[i] = NULL;
3679 mutex_unlock(&cgroup_mutex);
3680 return PTR_ERR(css);
3681 }
3682
3683 list_add(&ss->sibling, &rootnode.subsys_list);
3684 ss->root = &rootnode;
3685
3686 /* our new subsystem will be attached to the dummy hierarchy. */
3687 init_cgroup_css(css, ss, dummytop);
3688 /* init_idr must be after init_cgroup_css because it sets css->id. */
3689 if (ss->use_id) {
3690 int ret = cgroup_init_idr(ss, css);
3691 if (ret) {
3692 dummytop->subsys[ss->subsys_id] = NULL;
3693 ss->destroy(ss, dummytop);
3694 subsys[i] = NULL;
3695 mutex_unlock(&cgroup_mutex);
3696 return ret;
3697 }
3698 }
3699
3700 /*
3701 * Now we need to entangle the css into the existing css_sets. unlike
3702 * in cgroup_init_subsys, there are now multiple css_sets, so each one
3703 * will need a new pointer to it; done by iterating the css_set_table.
3704 * furthermore, modifying the existing css_sets will corrupt the hash
3705 * table state, so each changed css_set will need its hash recomputed.
3706 * this is all done under the css_set_lock.
3707 */
3708 write_lock(&css_set_lock);
3709 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
3710 struct css_set *cg;
3711 struct hlist_node *node, *tmp;
3712 struct hlist_head *bucket = &css_set_table[i], *new_bucket;
3713
3714 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
3715 /* skip entries that we already rehashed */
3716 if (cg->subsys[ss->subsys_id])
3717 continue;
3718 /* remove existing entry */
3719 hlist_del(&cg->hlist);
3720 /* set new value */
3721 cg->subsys[ss->subsys_id] = css;
3722 /* recompute hash and restore entry */
3723 new_bucket = css_set_hash(cg->subsys);
3724 hlist_add_head(&cg->hlist, new_bucket);
3725 }
3726 }
3727 write_unlock(&css_set_lock);
3728
3729 mutex_init(&ss->hierarchy_mutex);
3730 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3731 ss->active = 1;
3732
3733 /* success! */
3734 mutex_unlock(&cgroup_mutex);
3735 return 0;
3736}
3737EXPORT_SYMBOL_GPL(cgroup_load_subsys);
3738
3739/**
3740 * cgroup_unload_subsys: unload a modular subsystem
3741 * @ss: the subsystem to unload
3742 *
3743 * This function should be called in a modular subsystem's exitcall. When this
3744 * function is invoked, the refcount on the subsystem's module will be 0, so
3745 * the subsystem will not be attached to any hierarchy.
3746 */
3747void cgroup_unload_subsys(struct cgroup_subsys *ss)
3748{
3749 struct cg_cgroup_link *link;
3750 struct hlist_head *hhead;
3751
3752 BUG_ON(ss->module == NULL);
3753
3754 /*
3755 * we shouldn't be called if the subsystem is in use, and the use of
3756 * try_module_get in parse_cgroupfs_options should ensure that it
3757 * doesn't start being used while we're killing it off.
3758 */
3759 BUG_ON(ss->root != &rootnode);
3760
3761 mutex_lock(&cgroup_mutex);
3762 /* deassign the subsys_id */
3763 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
3764 subsys[ss->subsys_id] = NULL;
3765
3766 /* remove subsystem from rootnode's list of subsystems */
3767 list_del(&ss->sibling);
3768
3769 /*
3770 * disentangle the css from all css_sets attached to the dummytop. as
3771 * in loading, we need to pay our respects to the hashtable gods.
3772 */
3773 write_lock(&css_set_lock);
3774 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
3775 struct css_set *cg = link->cg;
3776
3777 hlist_del(&cg->hlist);
3778 BUG_ON(!cg->subsys[ss->subsys_id]);
3779 cg->subsys[ss->subsys_id] = NULL;
3780 hhead = css_set_hash(cg->subsys);
3781 hlist_add_head(&cg->hlist, hhead);
3782 }
3783 write_unlock(&css_set_lock);
3784
3785 /*
3786 * remove subsystem's css from the dummytop and free it - need to free
3787 * before marking as null because ss->destroy needs the cgrp->subsys
3788 * pointer to find their state. note that this also takes care of
3789 * freeing the css_id.
3790 */
3791 ss->destroy(ss, dummytop);
3792 dummytop->subsys[ss->subsys_id] = NULL;
3793
3794 mutex_unlock(&cgroup_mutex);
3795}
3796EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
3797
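A controller built as a module would use this pair from its init/exit hooks. The sketch below is illustrative only: example_subsys, example_css and their minimal create/destroy pair are hypothetical, but their shapes follow the ss->create()/ss->destroy() calls and the ss->module checks visible above.

/* Sketch of a modular cgroup subsystem (hypothetical names, 2.6.34-era API). */
#include <linux/module.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/err.h>

struct example_css {
	struct cgroup_subsys_state css;
};

static struct cgroup_subsys_state *
example_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	struct example_css *ex = kzalloc(sizeof(*ex), GFP_KERNEL);

	return ex ? &ex->css : ERR_PTR(-ENOMEM);
}

static void example_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	/* ss->subsys_id was assigned by cgroup_load_subsys() */
	kfree(container_of(cgrp->subsys[ss->subsys_id],
			   struct example_css, css));
}

struct cgroup_subsys example_subsys = {
	.name		= "example",
	.create		= example_create,
	.destroy	= example_destroy,
	.module		= THIS_MODULE,	/* non-NULL, so a dynamic slot is used */
};

static int __init example_init(void)
{
	return cgroup_load_subsys(&example_subsys);
}
module_init(example_init);

static void __exit example_exit(void)
{
	cgroup_unload_subsys(&example_subsys);
}
module_exit(example_exit);

MODULE_LICENSE("GPL");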
3798/**
3211 * cgroup_init_early - cgroup initialization at system boot 3799 * cgroup_init_early - cgroup initialization at system boot
3212 * 3800 *
3213 * Initialize cgroups at system boot, and initialize any 3801 * Initialize cgroups at system boot, and initialize any
@@ -3235,7 +3823,8 @@ int __init cgroup_init_early(void)
3235 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 3823 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
3236 INIT_HLIST_HEAD(&css_set_table[i]); 3824 INIT_HLIST_HEAD(&css_set_table[i]);
3237 3825
3238 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3826 /* at bootup time, we don't worry about modular subsystems */
3827 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3239 struct cgroup_subsys *ss = subsys[i]; 3828 struct cgroup_subsys *ss = subsys[i];
3240 3829
3241 BUG_ON(!ss->name); 3830 BUG_ON(!ss->name);
@@ -3270,12 +3859,13 @@ int __init cgroup_init(void)
3270 if (err) 3859 if (err)
3271 return err; 3860 return err;
3272 3861
3273 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3862 /* at bootup time, we don't worry about modular subsystems */
3863 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3274 struct cgroup_subsys *ss = subsys[i]; 3864 struct cgroup_subsys *ss = subsys[i];
3275 if (!ss->early_init) 3865 if (!ss->early_init)
3276 cgroup_init_subsys(ss); 3866 cgroup_init_subsys(ss);
3277 if (ss->use_id) 3867 if (ss->use_id)
3278 cgroup_subsys_init_idr(ss); 3868 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
3279 } 3869 }
3280 3870
3281 /* Add init_css_set to the hash table */ 3871 /* Add init_css_set to the hash table */
@@ -3379,9 +3969,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3379 int i; 3969 int i;
3380 3970
3381 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 3971 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
3972 /*
3973 * ideally we don't want subsystems moving around while we do this.
3974 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
3975 * subsys/hierarchy state.
3976 */
3382 mutex_lock(&cgroup_mutex); 3977 mutex_lock(&cgroup_mutex);
3383 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3978 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3384 struct cgroup_subsys *ss = subsys[i]; 3979 struct cgroup_subsys *ss = subsys[i];
3980 if (ss == NULL)
3981 continue;
3385 seq_printf(m, "%s\t%d\t%d\t%d\n", 3982 seq_printf(m, "%s\t%d\t%d\t%d\n",
3386 ss->name, ss->root->hierarchy_id, 3983 ss->name, ss->root->hierarchy_id,
3387 ss->root->number_of_cgroups, !ss->disabled); 3984 ss->root->number_of_cgroups, !ss->disabled);
@@ -3439,7 +4036,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
3439{ 4036{
3440 if (need_forkexit_callback) { 4037 if (need_forkexit_callback) {
3441 int i; 4038 int i;
3442 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4039 /*
4040 * forkexit callbacks are only supported for builtin
4041 * subsystems, and the builtin section of the subsys array is
4042 * immutable, so we don't need to lock the subsys array here.
4043 */
4044 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3443 struct cgroup_subsys *ss = subsys[i]; 4045 struct cgroup_subsys *ss = subsys[i];
3444 if (ss->fork) 4046 if (ss->fork)
3445 ss->fork(ss, child); 4047 ss->fork(ss, child);
@@ -3508,7 +4110,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
3508 struct css_set *cg; 4110 struct css_set *cg;
3509 4111
3510 if (run_callbacks && need_forkexit_callback) { 4112 if (run_callbacks && need_forkexit_callback) {
3511 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4113 /*
4114 * modular subsystems can't use callbacks, so no need to lock
4115 * the subsys array
4116 */
4117 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3512 struct cgroup_subsys *ss = subsys[i]; 4118 struct cgroup_subsys *ss = subsys[i];
3513 if (ss->exit) 4119 if (ss->exit)
3514 ss->exit(ss, tsk); 4120 ss->exit(ss, tsk);
@@ -3702,12 +4308,13 @@ static void check_for_release(struct cgroup *cgrp)
3702 } 4308 }
3703} 4309}
3704 4310
3705void __css_put(struct cgroup_subsys_state *css) 4311/* Caller must verify that the css is not for root cgroup */
4312void __css_put(struct cgroup_subsys_state *css, int count)
3706{ 4313{
3707 struct cgroup *cgrp = css->cgroup; 4314 struct cgroup *cgrp = css->cgroup;
3708 int val; 4315 int val;
3709 rcu_read_lock(); 4316 rcu_read_lock();
3710 val = atomic_dec_return(&css->refcnt); 4317 val = atomic_sub_return(count, &css->refcnt);
3711 if (val == 1) { 4318 if (val == 1) {
3712 if (notify_on_release(cgrp)) { 4319 if (notify_on_release(cgrp)) {
3713 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4320 set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3718,6 +4325,7 @@ void __css_put(struct cgroup_subsys_state *css)
3718 rcu_read_unlock(); 4325 rcu_read_unlock();
3719 WARN_ON_ONCE(val < 1); 4326 WARN_ON_ONCE(val < 1);
3720} 4327}
4328EXPORT_SYMBOL_GPL(__css_put);
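Adding the count parameter lets callers drop several references with a single atomic_sub_return(). The single-reference helper in cgroup.h presumably stays a thin wrapper along these lines (a sketch; the CSS_ROOT test is how the "not for root cgroup" rule is assumed to be enforced and is not shown in this hunk):

/* Assumed inline caller, not part of this hunk: the root cgroup's css is
 * never released, so filter it out before dropping one reference. */
static inline void css_put(struct cgroup_subsys_state *css)
{
	if (!test_bit(CSS_ROOT, &css->flags))
		__css_put(css, 1);
}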
3721 4329
3722/* 4330/*
3723 * Notify userspace when a cgroup is released, by running the 4331 * Notify userspace when a cgroup is released, by running the
@@ -3799,8 +4407,11 @@ static int __init cgroup_disable(char *str)
3799 while ((token = strsep(&str, ",")) != NULL) { 4407 while ((token = strsep(&str, ",")) != NULL) {
3800 if (!*token) 4408 if (!*token)
3801 continue; 4409 continue;
3802 4410 /*
3803 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4411 * cgroup_disable, being at boot time, can't know about module
4412 * subsystems, so we don't worry about them.
4413 */
4414 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3804 struct cgroup_subsys *ss = subsys[i]; 4415 struct cgroup_subsys *ss = subsys[i];
3805 4416
3806 if (!strcmp(token, ss->name)) { 4417 if (!strcmp(token, ss->name)) {
@@ -3830,6 +4441,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
3830 return cssid->id; 4441 return cssid->id;
3831 return 0; 4442 return 0;
3832} 4443}
4444EXPORT_SYMBOL_GPL(css_id);
3833 4445
3834unsigned short css_depth(struct cgroup_subsys_state *css) 4446unsigned short css_depth(struct cgroup_subsys_state *css)
3835{ 4447{
@@ -3839,6 +4451,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
3839 return cssid->depth; 4451 return cssid->depth;
3840 return 0; 4452 return 0;
3841} 4453}
4454EXPORT_SYMBOL_GPL(css_depth);
3842 4455
3843bool css_is_ancestor(struct cgroup_subsys_state *child, 4456bool css_is_ancestor(struct cgroup_subsys_state *child,
3844 const struct cgroup_subsys_state *root) 4457 const struct cgroup_subsys_state *root)
@@ -3875,6 +4488,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
3875 spin_unlock(&ss->id_lock); 4488 spin_unlock(&ss->id_lock);
3876 call_rcu(&id->rcu_head, __free_css_id_cb); 4489 call_rcu(&id->rcu_head, __free_css_id_cb);
3877} 4490}
4491EXPORT_SYMBOL_GPL(free_css_id);
3878 4492
3879/* 4493/*
3880 * This is called by init or create(). Then, calls to this function are 4494 * This is called by init or create(). Then, calls to this function are
@@ -3924,15 +4538,14 @@ err_out:
3924 4538
3925} 4539}
3926 4540
3927static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) 4541static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4542 struct cgroup_subsys_state *rootcss)
3928{ 4543{
3929 struct css_id *newid; 4544 struct css_id *newid;
3930 struct cgroup_subsys_state *rootcss;
3931 4545
3932 spin_lock_init(&ss->id_lock); 4546 spin_lock_init(&ss->id_lock);
3933 idr_init(&ss->idr); 4547 idr_init(&ss->idr);
3934 4548
3935 rootcss = init_css_set.subsys[ss->subsys_id];
3936 newid = get_new_cssid(ss, 0); 4549 newid = get_new_cssid(ss, 0);
3937 if (IS_ERR(newid)) 4550 if (IS_ERR(newid))
3938 return PTR_ERR(newid); 4551 return PTR_ERR(newid);
@@ -3948,13 +4561,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
3948{ 4561{
3949 int subsys_id, i, depth = 0; 4562 int subsys_id, i, depth = 0;
3950 struct cgroup_subsys_state *parent_css, *child_css; 4563 struct cgroup_subsys_state *parent_css, *child_css;
3951 struct css_id *child_id, *parent_id = NULL; 4564 struct css_id *child_id, *parent_id;
3952 4565
3953 subsys_id = ss->subsys_id; 4566 subsys_id = ss->subsys_id;
3954 parent_css = parent->subsys[subsys_id]; 4567 parent_css = parent->subsys[subsys_id];
3955 child_css = child->subsys[subsys_id]; 4568 child_css = child->subsys[subsys_id];
3956 depth = css_depth(parent_css) + 1;
3957 parent_id = parent_css->id; 4569 parent_id = parent_css->id;
4570 depth = parent_id->depth;
3958 4571
3959 child_id = get_new_cssid(ss, depth); 4572 child_id = get_new_cssid(ss, depth);
3960 if (IS_ERR(child_id)) 4573 if (IS_ERR(child_id))
@@ -3992,6 +4605,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
3992 4605
3993 return rcu_dereference(cssid->css); 4606 return rcu_dereference(cssid->css);
3994} 4607}
4608EXPORT_SYMBOL_GPL(css_lookup);
3995 4609
3996/** 4610/**
3997 * css_get_next - lookup next cgroup under specified hierarchy. 4611 * css_get_next - lookup next cgroup under specified hierarchy.
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 59e9ef6aab40..e5c0244962b0 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -15,6 +15,7 @@
15 */ 15 */
16 16
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/slab.h>
18#include <linux/cgroup.h> 19#include <linux/cgroup.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
20#include <linux/uaccess.h> 21#include <linux/uaccess.h>
@@ -47,17 +48,20 @@ static inline struct freezer *task_freezer(struct task_struct *task)
47 struct freezer, css); 48 struct freezer, css);
48} 49}
49 50
50int cgroup_frozen(struct task_struct *task) 51int cgroup_freezing_or_frozen(struct task_struct *task)
51{ 52{
52 struct freezer *freezer; 53 struct freezer *freezer;
53 enum freezer_state state; 54 enum freezer_state state;
54 55
55 task_lock(task); 56 task_lock(task);
56 freezer = task_freezer(task); 57 freezer = task_freezer(task);
57 state = freezer->state; 58 if (!freezer->css.cgroup->parent)
59 state = CGROUP_THAWED; /* root cgroup can't be frozen */
60 else
61 state = freezer->state;
58 task_unlock(task); 62 task_unlock(task);
59 63
60 return state == CGROUP_FROZEN; 64 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
61} 65}
62 66
63/* 67/*
@@ -201,9 +205,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
201 * No lock is needed, since the task isn't on tasklist yet, 205 * No lock is needed, since the task isn't on tasklist yet,
202 * so it can't be moved to another cgroup, which means the 206 * so it can't be moved to another cgroup, which means the
203 * freezer won't be removed and will be valid during this 207 * freezer won't be removed and will be valid during this
204 * function call. 208 * function call. Nevertheless, apply RCU read-side critical
209 * section to suppress RCU lockdep false positives.
205 */ 210 */
211 rcu_read_lock();
206 freezer = task_freezer(task); 212 freezer = task_freezer(task);
213 rcu_read_unlock();
207 214
208 /* 215 /*
209 * The root cgroup is non-freezable, so we can skip the 216 * The root cgroup is non-freezable, so we can skip the
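Renaming cgroup_frozen() to cgroup_freezing_or_frozen() widens the check so the suspend/resume path can also leave alone tasks whose cgroup is still mid-freeze. A caller on the thaw side would look roughly like this sketch (assumed caller, not part of this diff):

/* Sketch: skip tasks whose cgroup freezer is FREEZING or FROZEN when the
 * system freezer thaws everything else (illustrative, not in this diff). */
#include <linux/sched.h>
#include <linux/freezer.h>

static void thaw_unmanaged_tasks(void)
{
	struct task_struct *g, *p;

	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		if (cgroup_freezing_or_frozen(p))
			continue;	/* its cgroup still wants it frozen */
		thaw_process(p);
	} while_each_thread(g, p);
	read_unlock(&tasklist_lock);
}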
diff --git a/kernel/compat.c b/kernel/compat.c
index f6c204f07ea6..7f40e9275fd9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -25,6 +25,7 @@
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/times.h> 26#include <linux/times.h>
27#include <linux/ptrace.h> 27#include <linux/ptrace.h>
28#include <linux/gfp.h>
28 29
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30 31
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1c8ddd6ee940..25bba73b1be3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -14,6 +14,7 @@
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/gfp.h>
17 18
18#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
19/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 20/* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -151,13 +152,13 @@ static inline void check_for_tasks(int cpu)
151 152
152 write_lock_irq(&tasklist_lock); 153 write_lock_irq(&tasklist_lock);
153 for_each_process(p) { 154 for_each_process(p) {
154 if (task_cpu(p) == cpu && 155 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
155 (!cputime_eq(p->utime, cputime_zero) || 156 (!cputime_eq(p->utime, cputime_zero) ||
156 !cputime_eq(p->stime, cputime_zero))) 157 !cputime_eq(p->stime, cputime_zero)))
157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 158 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
158 (state = %ld, flags = %x) \n", 159 "(state = %ld, flags = %x)\n",
159 p->comm, task_pid_nr(p), cpu, 160 p->comm, task_pid_nr(p), cpu,
160 p->state, p->flags); 161 p->state, p->flags);
161 } 162 }
162 write_unlock_irq(&tasklist_lock); 163 write_unlock_irq(&tasklist_lock);
163} 164}
@@ -338,7 +339,7 @@ int __cpuinit cpu_up(unsigned int cpu)
338 if (!cpu_possible(cpu)) { 339 if (!cpu_possible(cpu)) {
339 printk(KERN_ERR "can't online cpu %d because it is not " 340 printk(KERN_ERR "can't online cpu %d because it is not "
340 "configured as may-hotadd at boot time\n", cpu); 341 "configured as may-hotadd at boot time\n", cpu);
341#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 342#if defined(CONFIG_IA64)
342 printk(KERN_ERR "please check additional_cpus= boot " 343 printk(KERN_ERR "please check additional_cpus= boot "
343 "parameter\n"); 344 "parameter\n");
344#endif 345#endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ba401fab459f..d10946748ec2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 * call to guarantee_online_mems(), as we know no one is changing 920 * call to guarantee_online_mems(), as we know no one is changing
921 * our task's cpuset. 921 * our task's cpuset.
922 * 922 *
923 * Hold callback_mutex around the two modifications of our tasks
924 * mems_allowed to synchronize with cpuset_mems_allowed().
925 *
926 * While the mm_struct we are migrating is typically from some 923 * While the mm_struct we are migrating is typically from some
927 * other task, the task_struct mems_allowed that we are hacking 924 * other task, the task_struct mems_allowed that we are hacking
928 * is for our current task, which must allocate new pages for that 925 * is for our current task, which must allocate new pages for that
@@ -973,15 +970,20 @@ static void cpuset_change_nodemask(struct task_struct *p,
973 struct cpuset *cs; 970 struct cpuset *cs;
974 int migrate; 971 int migrate;
975 const nodemask_t *oldmem = scan->data; 972 const nodemask_t *oldmem = scan->data;
976 nodemask_t newmems; 973 NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
974
975 if (!newmems)
976 return;
977 977
978 cs = cgroup_cs(scan->cg); 978 cs = cgroup_cs(scan->cg);
979 guarantee_online_mems(cs, &newmems); 979 guarantee_online_mems(cs, newmems);
980 980
981 task_lock(p); 981 task_lock(p);
982 cpuset_change_task_nodemask(p, &newmems); 982 cpuset_change_task_nodemask(p, newmems);
983 task_unlock(p); 983 task_unlock(p);
984 984
985 NODEMASK_FREE(newmems);
986
985 mm = get_task_mm(p); 987 mm = get_task_mm(p);
986 if (!mm) 988 if (!mm)
987 return; 989 return;
@@ -1051,16 +1053,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1051static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 1053static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1052 const char *buf) 1054 const char *buf)
1053{ 1055{
1054 nodemask_t oldmem; 1056 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1055 int retval; 1057 int retval;
1056 struct ptr_heap heap; 1058 struct ptr_heap heap;
1057 1059
1060 if (!oldmem)
1061 return -ENOMEM;
1062
1058 /* 1063 /*
1059 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1064 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
1060 * it's read-only 1065 * it's read-only
1061 */ 1066 */
1062 if (cs == &top_cpuset) 1067 if (cs == &top_cpuset) {
1063 return -EACCES; 1068 retval = -EACCES;
1069 goto done;
1070 }
1064 1071
1065 /* 1072 /*
1066 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 1073 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
@@ -1076,11 +1083,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1076 goto done; 1083 goto done;
1077 1084
1078 if (!nodes_subset(trialcs->mems_allowed, 1085 if (!nodes_subset(trialcs->mems_allowed,
1079 node_states[N_HIGH_MEMORY])) 1086 node_states[N_HIGH_MEMORY])) {
1080 return -EINVAL; 1087 retval = -EINVAL;
1088 goto done;
1089 }
1081 } 1090 }
1082 oldmem = cs->mems_allowed; 1091 *oldmem = cs->mems_allowed;
1083 if (nodes_equal(oldmem, trialcs->mems_allowed)) { 1092 if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
1084 retval = 0; /* Too easy - nothing to do */ 1093 retval = 0; /* Too easy - nothing to do */
1085 goto done; 1094 goto done;
1086 } 1095 }
@@ -1096,10 +1105,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1096 cs->mems_allowed = trialcs->mems_allowed; 1105 cs->mems_allowed = trialcs->mems_allowed;
1097 mutex_unlock(&callback_mutex); 1106 mutex_unlock(&callback_mutex);
1098 1107
1099 update_tasks_nodemask(cs, &oldmem, &heap); 1108 update_tasks_nodemask(cs, oldmem, &heap);
1100 1109
1101 heap_free(&heap); 1110 heap_free(&heap);
1102done: 1111done:
1112 NODEMASK_FREE(oldmem);
1103 return retval; 1113 return retval;
1104} 1114}
1105 1115
@@ -1384,40 +1394,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1384 struct cgroup *oldcont, struct task_struct *tsk, 1394 struct cgroup *oldcont, struct task_struct *tsk,
1385 bool threadgroup) 1395 bool threadgroup)
1386{ 1396{
1387 nodemask_t from, to;
1388 struct mm_struct *mm; 1397 struct mm_struct *mm;
1389 struct cpuset *cs = cgroup_cs(cont); 1398 struct cpuset *cs = cgroup_cs(cont);
1390 struct cpuset *oldcs = cgroup_cs(oldcont); 1399 struct cpuset *oldcs = cgroup_cs(oldcont);
1400 NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
1401 NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
1402
1403 if (from == NULL || to == NULL)
1404 goto alloc_fail;
1391 1405
1392 if (cs == &top_cpuset) { 1406 if (cs == &top_cpuset) {
1393 cpumask_copy(cpus_attach, cpu_possible_mask); 1407 cpumask_copy(cpus_attach, cpu_possible_mask);
1394 to = node_possible_map;
1395 } else { 1408 } else {
1396 guarantee_online_cpus(cs, cpus_attach); 1409 guarantee_online_cpus(cs, cpus_attach);
1397 guarantee_online_mems(cs, &to);
1398 } 1410 }
1411 guarantee_online_mems(cs, to);
1399 1412
1400 /* do per-task migration stuff possibly for each in the threadgroup */ 1413 /* do per-task migration stuff possibly for each in the threadgroup */
1401 cpuset_attach_task(tsk, &to, cs); 1414 cpuset_attach_task(tsk, to, cs);
1402 if (threadgroup) { 1415 if (threadgroup) {
1403 struct task_struct *c; 1416 struct task_struct *c;
1404 rcu_read_lock(); 1417 rcu_read_lock();
1405 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1418 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1406 cpuset_attach_task(c, &to, cs); 1419 cpuset_attach_task(c, to, cs);
1407 } 1420 }
1408 rcu_read_unlock(); 1421 rcu_read_unlock();
1409 } 1422 }
1410 1423
1411 /* change mm; only needs to be done once even if threadgroup */ 1424 /* change mm; only needs to be done once even if threadgroup */
1412 from = oldcs->mems_allowed; 1425 *from = oldcs->mems_allowed;
1413 to = cs->mems_allowed; 1426 *to = cs->mems_allowed;
1414 mm = get_task_mm(tsk); 1427 mm = get_task_mm(tsk);
1415 if (mm) { 1428 if (mm) {
1416 mpol_rebind_mm(mm, &to); 1429 mpol_rebind_mm(mm, to);
1417 if (is_memory_migrate(cs)) 1430 if (is_memory_migrate(cs))
1418 cpuset_migrate_mm(mm, &from, &to); 1431 cpuset_migrate_mm(mm, from, to);
1419 mmput(mm); 1432 mmput(mm);
1420 } 1433 }
1434
1435alloc_fail:
1436 NODEMASK_FREE(from);
1437 NODEMASK_FREE(to);
1421} 1438}
1422 1439
1423/* The various types of files and directories in a cpuset file system */ 1440/* The various types of files and directories in a cpuset file system */
@@ -1562,13 +1579,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1562 1579
1563static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1580static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1564{ 1581{
1565 nodemask_t mask; 1582 NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
1583 int retval;
1584
1585 if (mask == NULL)
1586 return -ENOMEM;
1566 1587
1567 mutex_lock(&callback_mutex); 1588 mutex_lock(&callback_mutex);
1568 mask = cs->mems_allowed; 1589 *mask = cs->mems_allowed;
1569 mutex_unlock(&callback_mutex); 1590 mutex_unlock(&callback_mutex);
1570 1591
1571 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1592 retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
1593
1594 NODEMASK_FREE(mask);
1595
1596 return retval;
1572} 1597}
1573 1598
1574static ssize_t cpuset_common_file_read(struct cgroup *cont, 1599static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1997,7 +2022,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1997 struct cpuset *cp; /* scans cpusets being updated */ 2022 struct cpuset *cp; /* scans cpusets being updated */
1998 struct cpuset *child; /* scans child cpusets of cp */ 2023 struct cpuset *child; /* scans child cpusets of cp */
1999 struct cgroup *cont; 2024 struct cgroup *cont;
2000 nodemask_t oldmems; 2025 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2026
2027 if (oldmems == NULL)
2028 return;
2001 2029
2002 list_add_tail((struct list_head *)&root->stack_list, &queue); 2030 list_add_tail((struct list_head *)&root->stack_list, &queue);
2003 2031
@@ -2014,7 +2042,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2042 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2015 continue; 2043 continue;
2016 2044
2017 oldmems = cp->mems_allowed; 2045 *oldmems = cp->mems_allowed;
2018 2046
2019 /* Remove offline cpus and mems from this cpuset. */ 2047 /* Remove offline cpus and mems from this cpuset. */
2020 mutex_lock(&callback_mutex); 2048 mutex_lock(&callback_mutex);
@@ -2030,9 +2058,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2030 remove_tasks_in_empty_cpuset(cp); 2058 remove_tasks_in_empty_cpuset(cp);
2031 else { 2059 else {
2032 update_tasks_cpumask(cp, NULL); 2060 update_tasks_cpumask(cp, NULL);
2033 update_tasks_nodemask(cp, &oldmems, NULL); 2061 update_tasks_nodemask(cp, oldmems, NULL);
2034 } 2062 }
2035 } 2063 }
2064 NODEMASK_FREE(oldmems);
2036} 2065}
2037 2066
2038/* 2067/*
@@ -2090,20 +2119,33 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2090static int cpuset_track_online_nodes(struct notifier_block *self, 2119static int cpuset_track_online_nodes(struct notifier_block *self,
2091 unsigned long action, void *arg) 2120 unsigned long action, void *arg)
2092{ 2121{
2122 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2123
2124 if (oldmems == NULL)
2125 return NOTIFY_DONE;
2126
2093 cgroup_lock(); 2127 cgroup_lock();
2094 switch (action) { 2128 switch (action) {
2095 case MEM_ONLINE: 2129 case MEM_ONLINE:
2096 case MEM_OFFLINE: 2130 *oldmems = top_cpuset.mems_allowed;
2097 mutex_lock(&callback_mutex); 2131 mutex_lock(&callback_mutex);
2098 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2132 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2099 mutex_unlock(&callback_mutex); 2133 mutex_unlock(&callback_mutex);
2100 if (action == MEM_OFFLINE) 2134 update_tasks_nodemask(&top_cpuset, oldmems, NULL);
2101 scan_for_empty_cpusets(&top_cpuset); 2135 break;
2136 case MEM_OFFLINE:
2137 /*
2138 * needn't update top_cpuset.mems_allowed explicitly because
2139 * scan_for_empty_cpusets() will update it.
2140 */
2141 scan_for_empty_cpusets(&top_cpuset);
2102 break; 2142 break;
2103 default: 2143 default:
2104 break; 2144 break;
2105 } 2145 }
2106 cgroup_unlock(); 2146 cgroup_unlock();
2147
2148 NODEMASK_FREE(oldmems);
2107 return NOTIFY_OK; 2149 return NOTIFY_OK;
2108} 2150}
2109#endif 2151#endif
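All of the cpuset hunks above are instances of one conversion: nodemask_t grows with MAX_NUMNODES, so on-stack copies are replaced by NODEMASK_ALLOC()/NODEMASK_FREE(), which may kmalloc the mask on large configurations. The shape of the pattern, shown on a made-up helper that mirrors cpuset_sprintf_memlist():

/* Illustrative helper (hypothetical name) showing the NODEMASK_ALLOC
 * pattern used throughout the hunks above: allocate, NULL-check, copy
 * under callback_mutex, free. */
static int example_copy_mems(struct cpuset *cs, nodemask_t *out)
{
	NODEMASK_ALLOC(nodemask_t, tmp, GFP_KERNEL);

	if (tmp == NULL)
		return -ENOMEM;

	mutex_lock(&callback_mutex);
	*tmp = cs->mems_allowed;
	mutex_unlock(&callback_mutex);

	*out = *tmp;
	NODEMASK_FREE(tmp);
	return 0;
}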
diff --git a/kernel/cred.c b/kernel/cred.c
index dd76cfe5f5b0..62af1816c235 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -10,6 +10,7 @@
10 */ 10 */
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/cred.h> 12#include <linux/cred.h>
13#include <linux/slab.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/key.h> 15#include <linux/key.h>
15#include <linux/keyctl.h> 16#include <linux/keyctl.h>
@@ -224,7 +225,7 @@ struct cred *cred_alloc_blank(void)
224#ifdef CONFIG_KEYS 225#ifdef CONFIG_KEYS
225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); 226 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
226 if (!new->tgcred) { 227 if (!new->tgcred) {
227 kfree(new); 228 kmem_cache_free(cred_jar, new);
228 return NULL; 229 return NULL;
229 } 230 }
230 atomic_set(&new->tgcred->usage, 1); 231 atomic_set(&new->tgcred->usage, 1);
@@ -364,7 +365,7 @@ struct cred *prepare_usermodehelper_creds(void)
364 365
365 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC); 366 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
366 if (!new) 367 if (!new)
367 return NULL; 368 goto free_tgcred;
368 369
369 kdebug("prepare_usermodehelper_creds() alloc %p", new); 370 kdebug("prepare_usermodehelper_creds() alloc %p", new);
370 371
@@ -398,6 +399,12 @@ struct cred *prepare_usermodehelper_creds(void)
398error: 399error:
399 put_cred(new); 400 put_cred(new);
400 return NULL; 401 return NULL;
402
403free_tgcred:
404#ifdef CONFIG_KEYS
405 kfree(tgcred);
406#endif
407 return NULL;
401} 408}
402 409
403/* 410/*
@@ -786,8 +793,6 @@ bool creds_are_invalid(const struct cred *cred)
786{ 793{
787 if (cred->magic != CRED_MAGIC) 794 if (cred->magic != CRED_MAGIC)
788 return true; 795 return true;
789 if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
790 return true;
791#ifdef CONFIG_SECURITY_SELINUX 796#ifdef CONFIG_SECURITY_SELINUX
792 if (selinux_is_enabled()) { 797 if (selinux_is_enabled()) {
793 if ((unsigned long) cred->security < PAGE_SIZE) 798 if ((unsigned long) cred->security < PAGE_SIZE)
diff --git a/kernel/early_res.c b/kernel/early_res.c
new file mode 100644
index 000000000000..31aa9332ef3f
--- /dev/null
+++ b/kernel/early_res.c
@@ -0,0 +1,584 @@
1/*
2 * early_res, could be used to replace bootmem
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/bootmem.h>
8#include <linux/mm.h>
9#include <linux/early_res.h>
10
11/*
12 * Early reserved memory areas.
13 */
14/*
15 * need to make sure this one is big enough before
16 * find_fw_memmap_area could be used
17 */
18#define MAX_EARLY_RES_X 32
19
20struct early_res {
21 u64 start, end;
22 char name[15];
23 char overlap_ok;
24};
25static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
26
27static int max_early_res __initdata = MAX_EARLY_RES_X;
28static struct early_res *early_res __initdata = &early_res_x[0];
29static int early_res_count __initdata;
30
31static int __init find_overlapped_early(u64 start, u64 end)
32{
33 int i;
34 struct early_res *r;
35
36 for (i = 0; i < max_early_res && early_res[i].end; i++) {
37 r = &early_res[i];
38 if (end > r->start && start < r->end)
39 break;
40 }
41
42 return i;
43}
44
45/*
46 * Drop the i-th range from the early reservation map,
47 * by copying any higher ranges down one over it, and
48 * clearing what had been the last slot.
49 */
50static void __init drop_range(int i)
51{
52 int j;
53
54 for (j = i + 1; j < max_early_res && early_res[j].end; j++)
55 ;
56
57 memmove(&early_res[i], &early_res[i + 1],
58 (j - 1 - i) * sizeof(struct early_res));
59
60 early_res[j - 1].end = 0;
61 early_res_count--;
62}
63
64static void __init drop_range_partial(int i, u64 start, u64 end)
65{
66 u64 common_start, common_end;
67 u64 old_start, old_end;
68
69 old_start = early_res[i].start;
70 old_end = early_res[i].end;
71 common_start = max(old_start, start);
72 common_end = min(old_end, end);
73
74 /* no overlap ? */
75 if (common_start >= common_end)
76 return;
77
78 if (old_start < common_start) {
79 /* make head segment */
80 early_res[i].end = common_start;
81 if (old_end > common_end) {
82 char name[15];
83
84 /*
85 * Save a local copy of the name, since the
86 * early_res array could get resized inside
87 * reserve_early_without_check() ->
88 * __check_and_double_early_res(), which would
89 * make the current name pointer invalid.
90 */
91 strncpy(name, early_res[i].name,
92 sizeof(early_res[i].name) - 1);
93 /* add another for left over on tail */
94 reserve_early_without_check(common_end, old_end, name);
95 }
96 return;
97 } else {
98 if (old_end > common_end) {
99 /* reuse the entry for tail left */
100 early_res[i].start = common_end;
101 return;
102 }
103 /* all covered */
104 drop_range(i);
105 }
106}
107
108/*
109 * Split any existing ranges that:
110 * 1) are marked 'overlap_ok', and
111 * 2) overlap with the stated range [start, end)
112 * into whatever portion (if any) of the existing range is entirely
113 * below or entirely above the stated range. Drop the portion
114 * of the existing range that overlaps with the stated range,
115 * which will allow the caller of this routine to then add that
116 * stated range without conflicting with any existing range.
117 */
118static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
119{
120 int i;
121 struct early_res *r;
122 u64 lower_start, lower_end;
123 u64 upper_start, upper_end;
124 char name[15];
125
126 for (i = 0; i < max_early_res && early_res[i].end; i++) {
127 r = &early_res[i];
128
129 /* Continue past non-overlapping ranges */
130 if (end <= r->start || start >= r->end)
131 continue;
132
133 /*
134 * Leave non-ok overlaps as is; let caller
135 * panic "Overlapping early reservations"
136 * when it hits this overlap.
137 */
138 if (!r->overlap_ok)
139 return;
140
141 /*
142 * We have an ok overlap. We will drop it from the early
143 * reservation map, and add back in any non-overlapping
144 * portions (lower or upper) as separate, overlap_ok,
145 * non-overlapping ranges.
146 */
147
148 /* 1. Note any non-overlapping (lower or upper) ranges. */
149 strncpy(name, r->name, sizeof(name) - 1);
150
151 lower_start = lower_end = 0;
152 upper_start = upper_end = 0;
153 if (r->start < start) {
154 lower_start = r->start;
155 lower_end = start;
156 }
157 if (r->end > end) {
158 upper_start = end;
159 upper_end = r->end;
160 }
161
162 /* 2. Drop the original ok overlapping range */
163 drop_range(i);
164
165 i--; /* resume for-loop on copied down entry */
166
167 /* 3. Add back in any non-overlapping ranges. */
168 if (lower_end)
169 reserve_early_overlap_ok(lower_start, lower_end, name);
170 if (upper_end)
171 reserve_early_overlap_ok(upper_start, upper_end, name);
172 }
173}
174
175static void __init __reserve_early(u64 start, u64 end, char *name,
176 int overlap_ok)
177{
178 int i;
179 struct early_res *r;
180
181 i = find_overlapped_early(start, end);
182 if (i >= max_early_res)
183 panic("Too many early reservations");
184 r = &early_res[i];
185 if (r->end)
186 panic("Overlapping early reservations "
187 "%llx-%llx %s to %llx-%llx %s\n",
188 start, end - 1, name ? name : "", r->start,
189 r->end - 1, r->name);
190 r->start = start;
191 r->end = end;
192 r->overlap_ok = overlap_ok;
193 if (name)
194 strncpy(r->name, name, sizeof(r->name) - 1);
195 early_res_count++;
196}
197
198/*
199 * A few early reservations come here.
200 *
201 * The 'overlap_ok' in the name of this routine does -not- mean it
202 * is ok for these reservations to overlap an earlier reservation.
203 * Rather it means that it is ok for subsequent reservations to
204 * overlap this one.
205 *
206 * Use this entry point to reserve early ranges when you are doing
207 * so out of "Paranoia", reserving perhaps more memory than you need,
208 * just in case, and don't mind a subsequent overlapping reservation
209 * that is known to be needed.
210 *
211 * The drop_overlaps_that_are_ok() call here isn't really needed.
212 * It would be needed if we had two colliding 'overlap_ok'
213 * reservations, so that the second such would not panic on the
214 * overlap with the first. We don't have any such as of this
215 * writing, but might as well tolerate such if it happens in
216 * the future.
217 */
218void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
219{
220 drop_overlaps_that_are_ok(start, end);
221 __reserve_early(start, end, name, 1);
222}
223
224static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
225{
226 u64 start, end, size, mem;
227 struct early_res *new;
228
229 /* do we have enough slots left ? */
230 if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
231 return;
232
233 /* double it */
234 mem = -1ULL;
235 size = sizeof(struct early_res) * max_early_res * 2;
236 if (early_res == early_res_x)
237 start = 0;
238 else
239 start = early_res[0].end;
240 end = ex_start;
241 if (start + size < end)
242 mem = find_fw_memmap_area(start, end, size,
243 sizeof(struct early_res));
244 if (mem == -1ULL) {
245 start = ex_end;
246 end = get_max_mapped();
247 if (start + size < end)
248 mem = find_fw_memmap_area(start, end, size,
249 sizeof(struct early_res));
250 }
251 if (mem == -1ULL)
252 panic("can not find more space for early_res array");
253
254 new = __va(mem);
255 /* save the first one for own */
256 new[0].start = mem;
257 new[0].end = mem + size;
258 new[0].overlap_ok = 0;
259 /* copy old to new */
260 if (early_res == early_res_x) {
261 memcpy(&new[1], &early_res[0],
262 sizeof(struct early_res) * max_early_res);
263 memset(&new[max_early_res+1], 0,
264 sizeof(struct early_res) * (max_early_res - 1));
265 early_res_count++;
266 } else {
267 memcpy(&new[1], &early_res[1],
268 sizeof(struct early_res) * (max_early_res - 1));
269 memset(&new[max_early_res], 0,
270 sizeof(struct early_res) * max_early_res);
271 }
272 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
273 early_res = new;
274 max_early_res *= 2;
275 printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
276 max_early_res, mem, mem + size - 1);
277}
278
279/*
280 * Most early reservations come here.
281 *
282 * We first have drop_overlaps_that_are_ok() drop any pre-existing
283 * 'overlap_ok' ranges, so that we can then reserve this memory
284 * range without risk of panic'ing on an overlapping overlap_ok
285 * early reservation.
286 */
287void __init reserve_early(u64 start, u64 end, char *name)
288{
289 if (start >= end)
290 return;
291
292 __check_and_double_early_res(start, end);
293
294 drop_overlaps_that_are_ok(start, end);
295 __reserve_early(start, end, name, 0);
296}
297
298void __init reserve_early_without_check(u64 start, u64 end, char *name)
299{
300 struct early_res *r;
301
302 if (start >= end)
303 return;
304
305 __check_and_double_early_res(start, end);
306
307 r = &early_res[early_res_count];
308
309 r->start = start;
310 r->end = end;
311 r->overlap_ok = 0;
312 if (name)
313 strncpy(r->name, name, sizeof(r->name) - 1);
314 early_res_count++;
315}
316
317void __init free_early(u64 start, u64 end)
318{
319 struct early_res *r;
320 int i;
321
322 i = find_overlapped_early(start, end);
323 r = &early_res[i];
324 if (i >= max_early_res || r->end != end || r->start != start)
325 panic("free_early on not reserved area: %llx-%llx!",
326 start, end - 1);
327
328 drop_range(i);
329}
330
331void __init free_early_partial(u64 start, u64 end)
332{
333 struct early_res *r;
334 int i;
335
336 if (start == end)
337 return;
338
339 if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end))
340 return;
341
342try_next:
343 i = find_overlapped_early(start, end);
344 if (i >= max_early_res)
345 return;
346
347 r = &early_res[i];
348 /* hole ? */
349 if (r->end >= end && r->start <= start) {
350 drop_range_partial(i, start, end);
351 return;
352 }
353
354 drop_range_partial(i, start, end);
355 goto try_next;
356}
357
358#ifdef CONFIG_NO_BOOTMEM
359static void __init subtract_early_res(struct range *range, int az)
360{
361 int i, count;
362 u64 final_start, final_end;
363 int idx = 0;
364
365 count = 0;
366 for (i = 0; i < max_early_res && early_res[i].end; i++)
367 count++;
368
369 /* need to skip first one ?*/
370 if (early_res != early_res_x)
371 idx = 1;
372
373#define DEBUG_PRINT_EARLY_RES 1
374
375#if DEBUG_PRINT_EARLY_RES
376 printk(KERN_INFO "Subtract (%d early reservations)\n", count);
377#endif
378 for (i = idx; i < count; i++) {
379 struct early_res *r = &early_res[i];
380#if DEBUG_PRINT_EARLY_RES
381 printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
382 r->start, r->end, r->name);
383#endif
384 final_start = PFN_DOWN(r->start);
385 final_end = PFN_UP(r->end);
386 if (final_start >= final_end)
387 continue;
388 subtract_range(range, az, final_start, final_end);
389 }
390
391}
392
393int __init get_free_all_memory_range(struct range **rangep, int nodeid)
394{
395 int i, count;
396 u64 start = 0, end;
397 u64 size;
398 u64 mem;
399 struct range *range;
400 int nr_range;
401
402 count = 0;
403 for (i = 0; i < max_early_res && early_res[i].end; i++)
404 count++;
405
406 count *= 2;
407
408 size = sizeof(struct range) * count;
409 end = get_max_mapped();
410#ifdef MAX_DMA32_PFN
411 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
412 start = MAX_DMA32_PFN << PAGE_SHIFT;
413#endif
414 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
415 if (mem == -1ULL)
416 panic("can not find more space for range free");
417
418 range = __va(mem);
419 /* use early_node_map[] and early_res to get range array at first */
420 memset(range, 0, size);
421 nr_range = 0;
422
423 /* need to go over early_node_map to find out good range for node */
424 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
425#ifdef CONFIG_X86_32
426 subtract_range(range, count, max_low_pfn, -1ULL);
427#endif
428 subtract_early_res(range, count);
429 nr_range = clean_sort_range(range, count);
430
431 /* need to clear it ? */
432 if (nodeid == MAX_NUMNODES) {
433 memset(&early_res[0], 0,
434 sizeof(struct early_res) * max_early_res);
435 early_res = NULL;
436 max_early_res = 0;
437 }
438
439 *rangep = range;
440 return nr_range;
441}
442#else
443void __init early_res_to_bootmem(u64 start, u64 end)
444{
445 int i, count;
446 u64 final_start, final_end;
447 int idx = 0;
448
449 count = 0;
450 for (i = 0; i < max_early_res && early_res[i].end; i++)
451 count++;
452
453 /* need to skip first one ?*/
454 if (early_res != early_res_x)
455 idx = 1;
456
457 printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
458 count - idx, max_early_res, start, end);
459 for (i = idx; i < count; i++) {
460 struct early_res *r = &early_res[i];
461 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
462 r->start, r->end, r->name);
463 final_start = max(start, r->start);
464 final_end = min(end, r->end);
465 if (final_start >= final_end) {
466 printk(KERN_CONT "\n");
467 continue;
468 }
469 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
470 final_start, final_end);
471 reserve_bootmem_generic(final_start, final_end - final_start,
472 BOOTMEM_DEFAULT);
473 }
474 /* clear them */
475 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
476 early_res = NULL;
477 max_early_res = 0;
478 early_res_count = 0;
479}
480#endif
481
482/* Check for already reserved areas */
483static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
484{
485 int i;
486 u64 addr = *addrp;
487 int changed = 0;
488 struct early_res *r;
489again:
490 i = find_overlapped_early(addr, addr + size);
491 r = &early_res[i];
492 if (i < max_early_res && r->end) {
493 *addrp = addr = round_up(r->end, align);
494 changed = 1;
495 goto again;
496 }
497 return changed;
498}
499
500/* Check for already reserved areas */
501static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
502{
503 int i;
504 u64 addr = *addrp, last;
505 u64 size = *sizep;
506 int changed = 0;
507again:
508 last = addr + size;
509 for (i = 0; i < max_early_res && early_res[i].end; i++) {
510 struct early_res *r = &early_res[i];
511 if (last > r->start && addr < r->start) {
512 size = r->start - addr;
513 changed = 1;
514 goto again;
515 }
516 if (last > r->end && addr < r->end) {
517 addr = round_up(r->end, align);
518 size = last - addr;
519 changed = 1;
520 goto again;
521 }
522 if (last <= r->end && addr >= r->start) {
523 (*sizep)++;
524 return 0;
525 }
526 }
527 if (changed) {
528 *addrp = addr;
529 *sizep = size;
530 }
531 return changed;
532}
533
534/*
535 * Find a free area with specified alignment in a specific range.
536 * only the area between start and end that is an active range from
537 * early_node_map is used, so it is known to be usable RAM
538 */
539u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
540 u64 size, u64 align)
541{
542 u64 addr, last;
543
544 addr = round_up(ei_start, align);
545 if (addr < start)
546 addr = round_up(start, align);
547 if (addr >= ei_last)
548 goto out;
549 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
550 ;
551 last = addr + size;
552 if (last > ei_last)
553 goto out;
554 if (last > end)
555 goto out;
556
557 return addr;
558
559out:
560 return -1ULL;
561}
562
563u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
564 u64 *sizep, u64 align)
565{
566 u64 addr, last;
567
568 addr = round_up(ei_start, align);
569 if (addr < start)
570 addr = round_up(start, align);
571 if (addr >= ei_last)
572 goto out;
573 *sizep = ei_last - addr;
574 while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
575 ;
576 last = addr + *sizep;
577 if (last > ei_last)
578 goto out;
579
580 return addr;
581
582out:
583 return -1ULL;
584}
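Putting the new file's API together: arch setup code typically searches for a free block with find_early_area(), claims it with reserve_early(), and may later hand it back with free_early() or let early_res_to_bootmem()/get_free_all_memory_range() dissolve the table. A sketch follows; the size, range and name string are made-up examples.

/* Sketch of typical early_res usage from early arch setup code
 * (range, size and name string are made-up examples). */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/early_res.h>

static u64 __init demo_grab_early_block(u64 limit)
{
	u64 size = 16 * 1024;
	/* look for 16KB, page aligned, anywhere in [1MB, limit) */
	u64 addr = find_early_area(1ULL << 20, limit, 1ULL << 20, limit,
				   size, PAGE_SIZE);

	if (addr == -1ULL)
		return addr;
	/* record it so later reservations and the bootmem handoff skip it */
	reserve_early(addr, addr + size, "DEMO_BLOCK");
	return addr;
}

static void __init demo_release_early_block(u64 addr)
{
	if (addr != -1ULL)
		free_early(addr, addr + 16 * 1024);
}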
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
new file mode 100644
index 000000000000..ff915efef66d
--- /dev/null
+++ b/kernel/elfcore.c
@@ -0,0 +1,28 @@
1#include <linux/elf.h>
2#include <linux/fs.h>
3#include <linux/mm.h>
4
5#include <asm/elf.h>
6
7
8Elf_Half __weak elf_core_extra_phdrs(void)
9{
10 return 0;
11}
12
13int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
14 unsigned long limit)
15{
16 return 1;
17}
18
19int __weak elf_core_write_extra_data(struct file *file, size_t *size,
20 unsigned long limit)
21{
22 return 1;
23}
24
25size_t __weak elf_core_extra_data_size(void)
26{
27 return 0;
28}
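These __weak definitions are the do-nothing defaults; note that the two write hooks return 1 (success) so the generic ELF dumper keeps going when an architecture has nothing extra to emit. An architecture that does want extra core-dump data provides strong definitions of the same symbols, e.g. (a sketch; the one-page payload is a made-up example):

/* Sketch of an arch-side override (hypothetical): advertise one extra
 * program header and one page of extra dump data. The write hooks would
 * be overridden the same way, returning nonzero on success. */
#include <linux/elf.h>
#include <linux/mm.h>

#include <asm/elf.h>

Elf_Half elf_core_extra_phdrs(void)
{
	return 1;		/* one additional program header */
}

size_t elf_core_extra_data_size(void)
{
	return PAGE_SIZE;	/* payload described by that extra header */
}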
diff --git a/kernel/exit.c b/kernel/exit.c
index 546774a31a66..7f2683a10ac4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -85,7 +85,9 @@ static void __exit_signal(struct task_struct *tsk)
85 BUG_ON(!sig); 85 BUG_ON(!sig);
86 BUG_ON(!atomic_read(&sig->count)); 86 BUG_ON(!atomic_read(&sig->count));
87 87
88 sighand = rcu_dereference(tsk->sighand); 88 sighand = rcu_dereference_check(tsk->sighand,
89 rcu_read_lock_held() ||
90 lockdep_tasklist_lock_is_held());
89 spin_lock(&sighand->siglock); 91 spin_lock(&sighand->siglock);
90 92
91 posix_cpu_timers_exit(tsk); 93 posix_cpu_timers_exit(tsk);
@@ -170,8 +172,10 @@ void release_task(struct task_struct * p)
170repeat: 172repeat:
171 tracehook_prepare_release_task(p); 173 tracehook_prepare_release_task(p);
172 /* don't need to get the RCU readlock here - the process is dead and 174 /* don't need to get the RCU readlock here - the process is dead and
173 * can't be modifying its own credentials */ 175 * can't be modifying its own credentials. But shut RCU-lockdep up */
176 rcu_read_lock();
174 atomic_dec(&__task_cred(p)->user->processes); 177 atomic_dec(&__task_cred(p)->user->processes);
178 rcu_read_unlock();
175 179
176 proc_flush_task(p); 180 proc_flush_task(p);
177 181
@@ -473,9 +477,11 @@ static void close_files(struct files_struct * files)
473 /* 477 /*
474 * It is safe to dereference the fd table without RCU or 478 * It is safe to dereference the fd table without RCU or
475 * ->file_lock because this is the last reference to the 479 * ->file_lock because this is the last reference to the
476 * files structure. 480 * files structure. But use RCU to shut RCU-lockdep up.
477 */ 481 */
482 rcu_read_lock();
478 fdt = files_fdtable(files); 483 fdt = files_fdtable(files);
484 rcu_read_unlock();
479 for (;;) { 485 for (;;) {
480 unsigned long set; 486 unsigned long set;
481 i = j * __NFDBITS; 487 i = j * __NFDBITS;
@@ -521,10 +527,12 @@ void put_files_struct(struct files_struct *files)
521 * at the end of the RCU grace period. Otherwise, 527 * at the end of the RCU grace period. Otherwise,
522 * you can free files immediately. 528 * you can free files immediately.
523 */ 529 */
530 rcu_read_lock();
524 fdt = files_fdtable(files); 531 fdt = files_fdtable(files);
525 if (fdt != &files->fdtab) 532 if (fdt != &files->fdtab)
526 kmem_cache_free(files_cachep, files); 533 kmem_cache_free(files_cachep, files);
527 free_fdtable(fdt); 534 free_fdtable(fdt);
535 rcu_read_unlock();
528 } 536 }
529} 537}
530 538
@@ -944,7 +952,9 @@ NORET_TYPE void do_exit(long code)
944 preempt_count()); 952 preempt_count());
945 953
946 acct_update_integrals(tsk); 954 acct_update_integrals(tsk);
947 955 /* sync mm's RSS info before statistics gathering */
956 if (tsk->mm)
957 sync_mm_rss(tsk, tsk->mm);
948 group_dead = atomic_dec_and_test(&tsk->signal->live); 958 group_dead = atomic_dec_and_test(&tsk->signal->live);
949 if (group_dead) { 959 if (group_dead) {
950 hrtimer_cancel(&tsk->signal->real_timer); 960 hrtimer_cancel(&tsk->signal->real_timer);
@@ -1180,7 +1190,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1180 1190
1181 if (unlikely(wo->wo_flags & WNOWAIT)) { 1191 if (unlikely(wo->wo_flags & WNOWAIT)) {
1182 int exit_code = p->exit_code; 1192 int exit_code = p->exit_code;
1183 int why, status; 1193 int why;
1184 1194
1185 get_task_struct(p); 1195 get_task_struct(p);
1186 read_unlock(&tasklist_lock); 1196 read_unlock(&tasklist_lock);
diff --git a/kernel/fork.c b/kernel/fork.c
index 5b2959b3ffc2..44b0791b0a2e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -87,6 +87,14 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
87 87
88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
89 89
90#ifdef CONFIG_PROVE_RCU
91int lockdep_tasklist_lock_is_held(void)
92{
93 return lockdep_is_held(&tasklist_lock);
94}
95EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
96#endif /* #ifdef CONFIG_PROVE_RCU */
97
90int nr_processes(void) 98int nr_processes(void)
91{ 99{
92 int cpu; 100 int cpu;
@@ -328,15 +336,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
328 if (!tmp) 336 if (!tmp)
329 goto fail_nomem; 337 goto fail_nomem;
330 *tmp = *mpnt; 338 *tmp = *mpnt;
339 INIT_LIST_HEAD(&tmp->anon_vma_chain);
331 pol = mpol_dup(vma_policy(mpnt)); 340 pol = mpol_dup(vma_policy(mpnt));
332 retval = PTR_ERR(pol); 341 retval = PTR_ERR(pol);
333 if (IS_ERR(pol)) 342 if (IS_ERR(pol))
334 goto fail_nomem_policy; 343 goto fail_nomem_policy;
335 vma_set_policy(tmp, pol); 344 vma_set_policy(tmp, pol);
345 if (anon_vma_fork(tmp, mpnt))
346 goto fail_nomem_anon_vma_fork;
336 tmp->vm_flags &= ~VM_LOCKED; 347 tmp->vm_flags &= ~VM_LOCKED;
337 tmp->vm_mm = mm; 348 tmp->vm_mm = mm;
338 tmp->vm_next = NULL; 349 tmp->vm_next = NULL;
339 anon_vma_link(tmp);
340 file = tmp->vm_file; 350 file = tmp->vm_file;
341 if (file) { 351 if (file) {
342 struct inode *inode = file->f_path.dentry->d_inode; 352 struct inode *inode = file->f_path.dentry->d_inode;
@@ -391,6 +401,8 @@ out:
391 flush_tlb_mm(oldmm); 401 flush_tlb_mm(oldmm);
392 up_write(&oldmm->mmap_sem); 402 up_write(&oldmm->mmap_sem);
393 return retval; 403 return retval;
404fail_nomem_anon_vma_fork:
405 mpol_put(pol);
394fail_nomem_policy: 406fail_nomem_policy:
395 kmem_cache_free(vm_area_cachep, tmp); 407 kmem_cache_free(vm_area_cachep, tmp);
396fail_nomem: 408fail_nomem:
@@ -454,8 +466,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
454 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; 466 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
455 mm->core_state = NULL; 467 mm->core_state = NULL;
456 mm->nr_ptes = 0; 468 mm->nr_ptes = 0;
457 set_mm_counter(mm, file_rss, 0); 469 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
458 set_mm_counter(mm, anon_rss, 0);
459 spin_lock_init(&mm->page_table_lock); 470 spin_lock_init(&mm->page_table_lock);
460 mm->free_area_cache = TASK_UNMAPPED_BASE; 471 mm->free_area_cache = TASK_UNMAPPED_BASE;
461 mm->cached_hole_size = ~0UL; 472 mm->cached_hole_size = ~0UL;
@@ -824,23 +835,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
824 */ 835 */
825static void posix_cpu_timers_init_group(struct signal_struct *sig) 836static void posix_cpu_timers_init_group(struct signal_struct *sig)
826{ 837{
838 unsigned long cpu_limit;
839
827 /* Thread group counters. */ 840 /* Thread group counters. */
828 thread_group_cputime_init(sig); 841 thread_group_cputime_init(sig);
829 842
830 /* Expiration times and increments. */ 843 cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
831 sig->it[CPUCLOCK_PROF].expires = cputime_zero; 844 if (cpu_limit != RLIM_INFINITY) {
832 sig->it[CPUCLOCK_PROF].incr = cputime_zero; 845 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
833 sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
834 sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
835
836 /* Cached expiration times. */
837 sig->cputime_expires.prof_exp = cputime_zero;
838 sig->cputime_expires.virt_exp = cputime_zero;
839 sig->cputime_expires.sched_exp = 0;
840
841 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
842 sig->cputime_expires.prof_exp =
843 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
844 sig->cputimer.running = 1; 846 sig->cputimer.running = 1;
845 } 847 }
846 848
@@ -857,7 +859,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
857 if (clone_flags & CLONE_THREAD) 859 if (clone_flags & CLONE_THREAD)
858 return 0; 860 return 0;
859 861
860 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 862 sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
861 tsk->signal = sig; 863 tsk->signal = sig;
862 if (!sig) 864 if (!sig)
863 return -ENOMEM; 865 return -ENOMEM;
@@ -865,46 +867,21 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
865 atomic_set(&sig->count, 1); 867 atomic_set(&sig->count, 1);
866 atomic_set(&sig->live, 1); 868 atomic_set(&sig->live, 1);
867 init_waitqueue_head(&sig->wait_chldexit); 869 init_waitqueue_head(&sig->wait_chldexit);
868 sig->flags = 0;
869 if (clone_flags & CLONE_NEWPID) 870 if (clone_flags & CLONE_NEWPID)
870 sig->flags |= SIGNAL_UNKILLABLE; 871 sig->flags |= SIGNAL_UNKILLABLE;
871 sig->group_exit_code = 0;
872 sig->group_exit_task = NULL;
873 sig->group_stop_count = 0;
874 sig->curr_target = tsk; 872 sig->curr_target = tsk;
875 init_sigpending(&sig->shared_pending); 873 init_sigpending(&sig->shared_pending);
876 INIT_LIST_HEAD(&sig->posix_timers); 874 INIT_LIST_HEAD(&sig->posix_timers);
877 875
878 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 876 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
879 sig->it_real_incr.tv64 = 0;
880 sig->real_timer.function = it_real_fn; 877 sig->real_timer.function = it_real_fn;
881 878
882 sig->leader = 0; /* session leadership doesn't inherit */
883 sig->tty_old_pgrp = NULL;
884 sig->tty = NULL;
885
886 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
887 sig->gtime = cputime_zero;
888 sig->cgtime = cputime_zero;
889#ifndef CONFIG_VIRT_CPU_ACCOUNTING
890 sig->prev_utime = sig->prev_stime = cputime_zero;
891#endif
892 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
893 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
894 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
895 sig->maxrss = sig->cmaxrss = 0;
896 task_io_accounting_init(&sig->ioac);
897 sig->sum_sched_runtime = 0;
898 taskstats_tgid_init(sig);
899
900 task_lock(current->group_leader); 879 task_lock(current->group_leader);
901 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 880 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
902 task_unlock(current->group_leader); 881 task_unlock(current->group_leader);
903 882
904 posix_cpu_timers_init_group(sig); 883 posix_cpu_timers_init_group(sig);
905 884
906 acct_init_pacct(&sig->pacct);
907
908 tty_audit_fork(sig); 885 tty_audit_fork(sig);
909 886
910 sig->oom_adj = current->signal->oom_adj; 887 sig->oom_adj = current->signal->oom_adj;
@@ -1033,7 +1010,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1033#endif 1010#endif
1034 retval = -EAGAIN; 1011 retval = -EAGAIN;
1035 if (atomic_read(&p->real_cred->user->processes) >= 1012 if (atomic_read(&p->real_cred->user->processes) >=
1036 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 1013 task_rlimit(p, RLIMIT_NPROC)) {
1037 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1014 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
1038 p->real_cred->user != INIT_USER) 1015 p->real_cred->user != INIT_USER)
1039 goto bad_fork_free; 1016 goto bad_fork_free;
@@ -1075,6 +1052,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1075 p->prev_utime = cputime_zero; 1052 p->prev_utime = cputime_zero;
1076 p->prev_stime = cputime_zero; 1053 p->prev_stime = cputime_zero;
1077#endif 1054#endif
1055#if defined(SPLIT_RSS_COUNTING)
1056 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
1057#endif
1078 1058
1079 p->default_timer_slack_ns = current->timer_slack_ns; 1059 p->default_timer_slack_ns = current->timer_slack_ns;
1080 1060
@@ -1241,21 +1221,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1241 /* Need tasklist lock for parent etc handling! */ 1221 /* Need tasklist lock for parent etc handling! */
1242 write_lock_irq(&tasklist_lock); 1222 write_lock_irq(&tasklist_lock);
1243 1223
1244 /*
1245 * The task hasn't been attached yet, so its cpus_allowed mask will
1246 * not be changed, nor will its assigned CPU.
1247 *
1248 * The cpus_allowed mask of the parent may have changed after it was
1249 * copied first time - so re-copy it here, then check the child's CPU
1250 * to ensure it is on a valid CPU (and if not, just force it back to
1251 * parent's CPU). This avoids alot of nasty races.
1252 */
1253 p->cpus_allowed = current->cpus_allowed;
1254 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1255 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1256 !cpu_online(task_cpu(p))))
1257 set_task_cpu(p, smp_processor_id());
1258
1259 /* CLONE_PARENT re-uses the old parent */ 1224 /* CLONE_PARENT re-uses the old parent */
1260 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { 1225 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
1261 p->real_parent = current->real_parent; 1226 p->real_parent = current->real_parent;
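
The dup_mmap() change adds a new unwind label, fail_nomem_anon_vma_fork, placed just above fail_nomem_policy so a failed anon_vma_fork() releases the mempolicy taken before it. Below is a minimal, self-contained sketch of that stacked-label pattern in plain userspace C; the struct and resource names are made up, only the shape of the error handling mirrors the hunk.

/* Each later failure jumps to a label that releases everything acquired
 * before it, in reverse order. */
#include <stdlib.h>

struct policy  { int weight; };
struct vma_copy { struct policy *pol; char *chain; };

struct vma_copy *dup_one(void)
{
        struct vma_copy *tmp;

        tmp = malloc(sizeof(*tmp));
        if (!tmp)
                goto fail_nomem;

        tmp->pol = malloc(sizeof(*tmp->pol));   /* stands in for mpol_dup()      */
        if (!tmp->pol)
                goto fail_nomem_policy;

        tmp->chain = malloc(32);                /* stands in for anon_vma_fork() */
        if (!tmp->chain)
                goto fail_nomem_chain;

        return tmp;

fail_nomem_chain:                               /* undo the policy allocation    */
        free(tmp->pol);
fail_nomem_policy:                              /* undo the vma allocation       */
        free(tmp);
fail_nomem:
        return NULL;
}
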
diff --git a/kernel/futex.c b/kernel/futex.c
index d9b3a2228f9d..e7a35f1039e7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -530,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
530 return -EINVAL; 530 return -EINVAL;
531 531
532 WARN_ON(!atomic_read(&pi_state->refcount)); 532 WARN_ON(!atomic_read(&pi_state->refcount));
533 WARN_ON(pid && pi_state->owner && 533
534 pi_state->owner->pid != pid); 534 /*
535 * When pi_state->owner is NULL then the owner died
536 * and another waiter is on the fly. pi_state->owner
537 * is fixed up by the task which acquires
538 * pi_state->rt_mutex.
539 *
540 * We do not check for pid == 0 which can happen when
541 * the owner died and robust_list_exit() cleared the
542 * TID.
543 */
544 if (pid && pi_state->owner) {
545 /*
546 * Bail out if user space manipulated the
547 * futex value.
548 */
549 if (pid != task_pid_vnr(pi_state->owner))
550 return -EINVAL;
551 }
535 552
536 atomic_inc(&pi_state->refcount); 553 atomic_inc(&pi_state->refcount);
537 *ps = pi_state; 554 *ps = pi_state;
@@ -758,6 +775,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
758 if (!pi_state) 775 if (!pi_state)
759 return -EINVAL; 776 return -EINVAL;
760 777
778 /*
779 * If current does not own the pi_state then the futex is
780 * inconsistent and user space fiddled with the futex value.
781 */
782 if (pi_state->owner != current)
783 return -EINVAL;
784
761 raw_spin_lock(&pi_state->pi_mutex.wait_lock); 785 raw_spin_lock(&pi_state->pi_mutex.wait_lock);
762 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 786 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
763 787
@@ -1971,7 +1995,7 @@ retry_private:
1971 /* Unqueue and drop the lock */ 1995 /* Unqueue and drop the lock */
1972 unqueue_me_pi(&q); 1996 unqueue_me_pi(&q);
1973 1997
1974 goto out; 1998 goto out_put_key;
1975 1999
1976out_unlock_put_key: 2000out_unlock_put_key:
1977 queue_unlock(&q, hb); 2001 queue_unlock(&q, hb);
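
The lookup_pi_state() and wake_futex_pi() hunks share one idea: a TID that user space can corrupt is validated and rejected with -EINVAL instead of tripping a WARN_ON. The fragment below only illustrates that shape in userspace C; struct pi_record and validate_owner() are invented stand-ins, not futex code.

#include <errno.h>

struct pi_record { int owner_tid; };    /* stands in for pi_state->owner */

int validate_owner(const struct pi_record *rec, int tid_from_user)
{
        /* owner may legitimately be unset while ownership is handed over */
        if (!rec->owner_tid || !tid_from_user)
                return 0;

        /* user space manipulated the value: refuse, don't warn */
        if (tid_from_user != rec->owner_tid)
                return -EINVAL;

        return 0;
}
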
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 235716556bf1..d49afb2395e5 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -146,7 +146,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
146 struct task_struct *p; 146 struct task_struct *p;
147 147
148 ret = -ESRCH; 148 ret = -ESRCH;
149 read_lock(&tasklist_lock); 149 rcu_read_lock();
150 p = find_task_by_vpid(pid); 150 p = find_task_by_vpid(pid);
151 if (!p) 151 if (!p)
152 goto err_unlock; 152 goto err_unlock;
@@ -157,7 +157,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
157 !capable(CAP_SYS_PTRACE)) 157 !capable(CAP_SYS_PTRACE))
158 goto err_unlock; 158 goto err_unlock;
159 head = p->compat_robust_list; 159 head = p->compat_robust_list;
160 read_unlock(&tasklist_lock); 160 rcu_read_unlock();
161 } 161 }
162 162
163 if (put_user(sizeof(*head), len_ptr)) 163 if (put_user(sizeof(*head), len_ptr))
@@ -165,7 +165,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
165 return put_user(ptr_to_compat(head), head_ptr); 165 return put_user(ptr_to_compat(head), head_ptr);
166 166
167err_unlock: 167err_unlock:
168 read_unlock(&tasklist_lock); 168 rcu_read_unlock();
169 169
170 return ret; 170 return ret;
171} 171}
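
compat_sys_get_robust_list() now brackets the find_task_by_vpid() lookup with rcu_read_lock()/rcu_read_unlock() instead of taking tasklist_lock for reading. The same conversion can be sketched in user space, assuming the userspace RCU library (liburcu) is installed and linked with -lurcu; the task list and its fields here are my own stand-ins, only the locking pattern mirrors the hunk.

/* Read side marks an RCU critical section around the lookup instead of
 * contending on a reader-writer lock.  The calling thread is assumed to
 * have called rcu_register_thread() beforehand. */
#include <urcu.h>
#include <stddef.h>

struct task { int pid; struct task *next; };

static struct task *task_list;          /* published with rcu_assign_pointer() */

int task_exists(int pid)
{
        struct task *t;
        int found = 0;

        rcu_read_lock();                /* was: read_lock(&tasklist_lock)   */
        for (t = rcu_dereference(task_list); t; t = rcu_dereference(t->next))
                if (t->pid == pid) {
                        found = 1;
                        break;
                }
        rcu_read_unlock();              /* was: read_unlock(&tasklist_lock) */

        return found;
}
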
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index c030ae657f20..03808ed342a6 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -243,38 +243,70 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
243 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) 243 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
244 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM 244 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
245 */ 245 */
246int reserve_bp_slot(struct perf_event *bp) 246static int __reserve_bp_slot(struct perf_event *bp)
247{ 247{
248 struct bp_busy_slots slots = {0}; 248 struct bp_busy_slots slots = {0};
249 int ret = 0;
250
251 mutex_lock(&nr_bp_mutex);
252 249
253 fetch_bp_busy_slots(&slots, bp); 250 fetch_bp_busy_slots(&slots, bp);
254 251
255 /* Flexible counters need to keep at least one slot */ 252 /* Flexible counters need to keep at least one slot */
256 if (slots.pinned + (!!slots.flexible) == HBP_NUM) { 253 if (slots.pinned + (!!slots.flexible) == HBP_NUM)
257 ret = -ENOSPC; 254 return -ENOSPC;
258 goto end;
259 }
260 255
261 toggle_bp_slot(bp, true); 256 toggle_bp_slot(bp, true);
262 257
263end: 258 return 0;
259}
260
261int reserve_bp_slot(struct perf_event *bp)
262{
263 int ret;
264
265 mutex_lock(&nr_bp_mutex);
266
267 ret = __reserve_bp_slot(bp);
268
264 mutex_unlock(&nr_bp_mutex); 269 mutex_unlock(&nr_bp_mutex);
265 270
266 return ret; 271 return ret;
267} 272}
268 273
274static void __release_bp_slot(struct perf_event *bp)
275{
276 toggle_bp_slot(bp, false);
277}
278
269void release_bp_slot(struct perf_event *bp) 279void release_bp_slot(struct perf_event *bp)
270{ 280{
271 mutex_lock(&nr_bp_mutex); 281 mutex_lock(&nr_bp_mutex);
272 282
273 toggle_bp_slot(bp, false); 283 __release_bp_slot(bp);
274 284
275 mutex_unlock(&nr_bp_mutex); 285 mutex_unlock(&nr_bp_mutex);
276} 286}
277 287
288/*
289 * Allow the kernel debugger to reserve breakpoint slots without
290 * taking a lock using the dbg_* variant of for the reserve and
291 * release breakpoint slots.
292 */
293int dbg_reserve_bp_slot(struct perf_event *bp)
294{
295 if (mutex_is_locked(&nr_bp_mutex))
296 return -1;
297
298 return __reserve_bp_slot(bp);
299}
300
301int dbg_release_bp_slot(struct perf_event *bp)
302{
303 if (mutex_is_locked(&nr_bp_mutex))
304 return -1;
305
306 __release_bp_slot(bp);
307
308 return 0;
309}
278 310
279int register_perf_hw_breakpoint(struct perf_event *bp) 311int register_perf_hw_breakpoint(struct perf_event *bp)
280{ 312{
@@ -328,8 +360,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
328int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) 360int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
329{ 361{
330 u64 old_addr = bp->attr.bp_addr; 362 u64 old_addr = bp->attr.bp_addr;
363 u64 old_len = bp->attr.bp_len;
331 int old_type = bp->attr.bp_type; 364 int old_type = bp->attr.bp_type;
332 int old_len = bp->attr.bp_len;
333 int err = 0; 365 int err = 0;
334 366
335 perf_event_disable(bp); 367 perf_event_disable(bp);
@@ -381,17 +413,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
381 * 413 *
382 * @return a set of per_cpu pointers to perf events 414 * @return a set of per_cpu pointers to perf events
383 */ 415 */
384struct perf_event ** 416struct perf_event * __percpu *
385register_wide_hw_breakpoint(struct perf_event_attr *attr, 417register_wide_hw_breakpoint(struct perf_event_attr *attr,
386 perf_overflow_handler_t triggered) 418 perf_overflow_handler_t triggered)
387{ 419{
388 struct perf_event **cpu_events, **pevent, *bp; 420 struct perf_event * __percpu *cpu_events, **pevent, *bp;
389 long err; 421 long err;
390 int cpu; 422 int cpu;
391 423
392 cpu_events = alloc_percpu(typeof(*cpu_events)); 424 cpu_events = alloc_percpu(typeof(*cpu_events));
393 if (!cpu_events) 425 if (!cpu_events)
394 return ERR_PTR(-ENOMEM); 426 return (void __percpu __force *)ERR_PTR(-ENOMEM);
395 427
396 get_online_cpus(); 428 get_online_cpus();
397 for_each_online_cpu(cpu) { 429 for_each_online_cpu(cpu) {
@@ -419,7 +451,7 @@ fail:
419 put_online_cpus(); 451 put_online_cpus();
420 452
421 free_percpu(cpu_events); 453 free_percpu(cpu_events);
422 return ERR_PTR(err); 454 return (void __percpu __force *)ERR_PTR(err);
423} 455}
424EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); 456EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
425 457
@@ -427,7 +459,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
427 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel 459 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
428 * @cpu_events: the per cpu set of events to unregister 460 * @cpu_events: the per cpu set of events to unregister
429 */ 461 */
430void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) 462void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
431{ 463{
432 int cpu; 464 int cpu;
433 struct perf_event **pevent; 465 struct perf_event **pevent;
@@ -457,5 +489,4 @@ struct pmu perf_ops_bp = {
457 .enable = arch_install_hw_breakpoint, 489 .enable = arch_install_hw_breakpoint,
458 .disable = arch_uninstall_hw_breakpoint, 490 .disable = arch_uninstall_hw_breakpoint,
459 .read = hw_breakpoint_pmu_read, 491 .read = hw_breakpoint_pmu_read,
460 .unthrottle = hw_breakpoint_pmu_unthrottle
461}; 492};
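
reserve_bp_slot()/release_bp_slot() are split into a lock-free core (__reserve_bp_slot/__release_bp_slot) plus locked wrappers, and the new dbg_* entry points refuse to touch the core when nr_bp_mutex is already held, because the debugger cannot sleep. The kernel variant only checks mutex_is_locked() since kgdb has already halted the other CPUs; the portable userspace sketch below uses trylock instead, and all names are illustrative.

#include <pthread.h>

static pthread_mutex_t slot_mutex = PTHREAD_MUTEX_INITIALIZER;
static int slots_used, slots_max = 4;

static int __reserve_slot(void)         /* caller must hold slot_mutex */
{
        if (slots_used >= slots_max)
                return -1;              /* -ENOSPC in the kernel code  */
        slots_used++;
        return 0;
}

int reserve_slot(void)                  /* normal, sleeping context    */
{
        int ret;

        pthread_mutex_lock(&slot_mutex);
        ret = __reserve_slot();
        pthread_mutex_unlock(&slot_mutex);
        return ret;
}

int dbg_reserve_slot(void)              /* non-sleeping debugger path  */
{
        int ret;

        if (pthread_mutex_trylock(&slot_mutex) != 0)
                return -1;              /* someone holds it: bail out  */
        ret = __reserve_slot();
        pthread_mutex_unlock(&slot_mutex);
        return ret;
}
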
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ecc3fa28f666..b7091d5ca2f8 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,11 +18,7 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22 * dynamic_irq_init - initialize a dynamically allocated irq
23 * @irq: irq number to initialize
24 */
25void dynamic_irq_init(unsigned int irq)
26{ 22{
27 struct irq_desc *desc; 23 struct irq_desc *desc;
28 unsigned long flags; 24 unsigned long flags;
@@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq)
41 desc->depth = 1; 37 desc->depth = 1;
42 desc->msi_desc = NULL; 38 desc->msi_desc = NULL;
43 desc->handler_data = NULL; 39 desc->handler_data = NULL;
44 desc->chip_data = NULL; 40 if (!keep_chip_data)
41 desc->chip_data = NULL;
45 desc->action = NULL; 42 desc->action = NULL;
46 desc->irq_count = 0; 43 desc->irq_count = 0;
47 desc->irqs_unhandled = 0; 44 desc->irqs_unhandled = 0;
@@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq)
55} 52}
56 53
57/** 54/**
58 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 55 * dynamic_irq_init - initialize a dynamically allocated irq
59 * @irq: irq number to initialize 56 * @irq: irq number to initialize
60 */ 57 */
61void dynamic_irq_cleanup(unsigned int irq) 58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
62{ 75{
63 struct irq_desc *desc = irq_to_desc(irq); 76 struct irq_desc *desc = irq_to_desc(irq);
64 unsigned long flags; 77 unsigned long flags;
@@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq)
77 } 90 }
78 desc->msi_desc = NULL; 91 desc->msi_desc = NULL;
79 desc->handler_data = NULL; 92 desc->handler_data = NULL;
80 desc->chip_data = NULL; 93 if (!keep_chip_data)
94 desc->chip_data = NULL;
81 desc->handle_irq = handle_bad_irq; 95 desc->handle_irq = handle_bad_irq;
82 desc->chip = &no_irq_chip; 96 desc->chip = &no_irq_chip;
83 desc->name = NULL; 97 desc->name = NULL;
@@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq)
85 raw_spin_unlock_irqrestore(&desc->lock, flags); 99 raw_spin_unlock_irqrestore(&desc->lock, flags);
86} 100}
87 101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to initialize
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to initialize
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
120}
121
88 122
89/** 123/**
90 * set_irq_chip - set the irq chip for an irq 124 * set_irq_chip - set the irq chip for an irq
@@ -325,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
325 if (desc->chip->ack) 359 if (desc->chip->ack)
326 desc->chip->ack(irq); 360 desc->chip->ack(irq);
327 } 361 }
362 desc->status |= IRQ_MASKED;
363}
364
365static inline void mask_irq(struct irq_desc *desc, int irq)
366{
367 if (desc->chip->mask) {
368 desc->chip->mask(irq);
369 desc->status |= IRQ_MASKED;
370 }
371}
372
373static inline void unmask_irq(struct irq_desc *desc, int irq)
374{
375 if (desc->chip->unmask) {
376 desc->chip->unmask(irq);
377 desc->status &= ~IRQ_MASKED;
378 }
328} 379}
329 380
330/* 381/*
@@ -450,10 +501,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
450 raw_spin_lock(&desc->lock); 501 raw_spin_lock(&desc->lock);
451 desc->status &= ~IRQ_INPROGRESS; 502 desc->status &= ~IRQ_INPROGRESS;
452 503
453 if (unlikely(desc->status & IRQ_ONESHOT)) 504 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
454 desc->status |= IRQ_MASKED; 505 unmask_irq(desc, irq);
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
456 desc->chip->unmask(irq);
457out_unlock: 506out_unlock:
458 raw_spin_unlock(&desc->lock); 507 raw_spin_unlock(&desc->lock);
459} 508}
@@ -490,8 +539,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
490 action = desc->action; 539 action = desc->action;
491 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 540 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
492 desc->status |= IRQ_PENDING; 541 desc->status |= IRQ_PENDING;
493 if (desc->chip->mask) 542 mask_irq(desc, irq);
494 desc->chip->mask(irq);
495 goto out; 543 goto out;
496 } 544 }
497 545
@@ -520,7 +568,7 @@ out:
520 * signal. The occurence is latched into the irq controller hardware 568 * signal. The occurence is latched into the irq controller hardware
521 * and must be acked in order to be reenabled. After the ack another 569 * and must be acked in order to be reenabled. After the ack another
522 * interrupt can happen on the same source even before the first one 570 * interrupt can happen on the same source even before the first one
523 * is handled by the assosiacted event handler. If this happens it 571 * is handled by the associated event handler. If this happens it
524 * might be necessary to disable (mask) the interrupt depending on the 572 * might be necessary to disable (mask) the interrupt depending on the
525 * controller hardware. This requires to reenable the interrupt inside 573 * controller hardware. This requires to reenable the interrupt inside
526 * of the loop which handles the interrupts which have arrived while 574 * of the loop which handles the interrupts which have arrived while
@@ -559,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
559 irqreturn_t action_ret; 607 irqreturn_t action_ret;
560 608
561 if (unlikely(!action)) { 609 if (unlikely(!action)) {
562 desc->chip->mask(irq); 610 mask_irq(desc, irq);
563 goto out_unlock; 611 goto out_unlock;
564 } 612 }
565 613
@@ -571,8 +619,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
571 if (unlikely((desc->status & 619 if (unlikely((desc->status &
572 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 620 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
573 (IRQ_PENDING | IRQ_MASKED))) { 621 (IRQ_PENDING | IRQ_MASKED))) {
574 desc->chip->unmask(irq); 622 unmask_irq(desc, irq);
575 desc->status &= ~IRQ_MASKED;
576 } 623 }
577 624
578 desc->status &= ~IRQ_PENDING; 625 desc->status &= ~IRQ_PENDING;
@@ -682,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
682 __set_irq_handler(irq, handle, 0, name); 729 __set_irq_handler(irq, handle, 0, name);
683} 730}
684 731
685void __init set_irq_noprobe(unsigned int irq) 732void set_irq_noprobe(unsigned int irq)
686{ 733{
687 struct irq_desc *desc = irq_to_desc(irq); 734 struct irq_desc *desc = irq_to_desc(irq);
688 unsigned long flags; 735 unsigned long flags;
@@ -697,7 +744,7 @@ void __init set_irq_noprobe(unsigned int irq)
697 raw_spin_unlock_irqrestore(&desc->lock, flags); 744 raw_spin_unlock_irqrestore(&desc->lock, flags);
698} 745}
699 746
700void __init set_irq_probe(unsigned int irq) 747void set_irq_probe(unsigned int irq)
701{ 748{
702 struct irq_desc *desc = irq_to_desc(irq); 749 struct irq_desc *desc = irq_to_desc(irq);
703 unsigned long flags; 750 unsigned long flags;
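
The point of the new mask_irq()/unmask_irq() helpers is that the software IRQ_MASKED bit and the chip callback are updated in one place, so the flow handlers can no longer call the chip and forget the flag (or vice versa). A minimal userspace illustration of that "flag follows hardware" helper pattern, with made-up types and a print standing in for the chip call:

#include <stdio.h>

#define LINE_MASKED 0x1

struct line_ops  { void (*mask)(int line); void (*unmask)(int line); };
struct line_desc { int line; unsigned int status; const struct line_ops *ops; };

static void mask_line(struct line_desc *desc)
{
        if (desc->ops->mask) {
                desc->ops->mask(desc->line);
                desc->status |= LINE_MASKED;    /* flag follows hardware */
        }
}

static void unmask_line(struct line_desc *desc)
{
        if (desc->ops->unmask) {
                desc->ops->unmask(desc->line);
                desc->status &= ~LINE_MASKED;
        }
}

static void hw_mask(int line)   { printf("mask %d\n", line); }
static void hw_unmask(int line) { printf("unmask %d\n", line); }

int main(void)
{
        static const struct line_ops ops = { hw_mask, hw_unmask };
        struct line_desc desc = { .line = 9, .status = 0, .ops = &ops };

        mask_line(&desc);
        unmask_line(&desc);
        return 0;
}
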
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index d06df9c41cba..1ef4ffcdfa55 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
42 * automatically freed on driver detach. 42 * automatically freed on driver detach.
43 * 43 *
44 * If an IRQ allocated with this function needs to be freed 44 * If an IRQ allocated with this function needs to be freed
45 * separately, dev_free_irq() must be used. 45 * separately, devm_free_irq() must be used.
46 */ 46 */
47int devm_request_threaded_irq(struct device *dev, unsigned int irq, 47int devm_request_threaded_irq(struct device *dev, unsigned int irq,
48 irq_handler_t handler, irq_handler_t thread_fn, 48 irq_handler_t handler, irq_handler_t thread_fn,
@@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
81 * Except for the extra @dev argument, this function takes the 81 * Except for the extra @dev argument, this function takes the
82 * same arguments and performs the same function as free_irq(). 82 * same arguments and performs the same function as free_irq().
83 * This function instead of free_irq() should be used to manually 83 * This function instead of free_irq() should be used to manually
84 * free IRQs allocated with dev_request_irq(). 84 * free IRQs allocated with devm_request_irq().
85 */ 85 */
86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) 86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
87{ 87{
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 814940e7f485..76d5a671bfe1 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -19,7 +19,7 @@
19#include <linux/kernel_stat.h> 19#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 20#include <linux/rculist.h>
21#include <linux/hash.h> 21#include <linux/hash.h>
22#include <linux/bootmem.h> 22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 23#include <trace/events/irq.h>
24 24
25#include "internals.h" 25#include "internals.h"
@@ -87,12 +87,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{ 87{
88 void *ptr; 88 void *ptr;
89 89
90 if (slab_is_available()) 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), 91 GFP_ATOMIC, node);
92 GFP_ATOMIC, node);
93 else
94 ptr = alloc_bootmem_node(NODE_DATA(node),
95 nr * sizeof(*desc->kstat_irqs));
96 92
97 /* 93 /*
98 * don't overwite if can not get new one 94 * don't overwite if can not get new one
@@ -132,7 +128,26 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
132 */ 128 */
133DEFINE_RAW_SPINLOCK(sparse_irq_lock); 129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
134 130
135struct irq_desc **irq_desc_ptrs __read_mostly; 131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
136 151
137static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { 152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
138 [0 ... NR_IRQS_LEGACY-1] = { 153 [0 ... NR_IRQS_LEGACY-1] = {
@@ -164,9 +179,6 @@ int __init early_irq_init(void)
164 legacy_count = ARRAY_SIZE(irq_desc_legacy); 179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
165 node = first_online_node; 180 node = first_online_node;
166 181
167 /* allocate irq_desc_ptrs array based on nr_irqs */
168 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
169
170 /* allocate based on nr_cpu_ids */ 182 /* allocate based on nr_cpu_ids */
171 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * 183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
172 sizeof(int), GFP_NOWAIT, node); 184 sizeof(int), GFP_NOWAIT, node);
@@ -180,23 +192,12 @@ int __init early_irq_init(void)
180 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
181 alloc_desc_masks(&desc[i], node, true); 193 alloc_desc_masks(&desc[i], node, true);
182 init_desc_masks(&desc[i]); 194 init_desc_masks(&desc[i]);
183 irq_desc_ptrs[i] = desc + i; 195 set_irq_desc(i, &desc[i]);
184 } 196 }
185 197
186 for (i = legacy_count; i < nr_irqs; i++)
187 irq_desc_ptrs[i] = NULL;
188
189 return arch_early_irq_init(); 198 return arch_early_irq_init();
190} 199}
191 200
192struct irq_desc *irq_to_desc(unsigned int irq)
193{
194 if (irq_desc_ptrs && irq < nr_irqs)
195 return irq_desc_ptrs[irq];
196
197 return NULL;
198}
199
200struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) 201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
201{ 202{
202 struct irq_desc *desc; 203 struct irq_desc *desc;
@@ -208,21 +209,18 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
208 return NULL; 209 return NULL;
209 } 210 }
210 211
211 desc = irq_desc_ptrs[irq]; 212 desc = irq_to_desc(irq);
212 if (desc) 213 if (desc)
213 return desc; 214 return desc;
214 215
215 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
216 217
217 /* We have to check it to avoid races with another CPU */ 218 /* We have to check it to avoid races with another CPU */
218 desc = irq_desc_ptrs[irq]; 219 desc = irq_to_desc(irq);
219 if (desc) 220 if (desc)
220 goto out_unlock; 221 goto out_unlock;
221 222
222 if (slab_is_available()) 223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224 else
225 desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
226 224
227 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); 225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
228 if (!desc) { 226 if (!desc) {
@@ -231,7 +229,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
231 } 229 }
232 init_one_irq_desc(irq, desc, node); 230 init_one_irq_desc(irq, desc, node);
233 231
234 irq_desc_ptrs[irq] = desc; 232 set_irq_desc(irq, desc);
235 233
236out_unlock: 234out_unlock:
237 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
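
The interface this hunk converges on is that every user goes through set_irq_desc()/irq_to_desc()/replace_irq_desc() and never sees the backing store, which lets the kernel swap the dense irq_desc_ptrs pointer array for a radix tree without touching callers. The sketch below keeps a tiny dense table only so it stays self-contained; the comments note which radix-tree call each helper replaces, and all names are mine.

#include <stddef.h>

struct desc { int nr; };

#define MAX_DESC 64
static struct desc *desc_table[MAX_DESC];       /* kernel: RADIX_TREE()          */

void set_desc(unsigned int nr, struct desc *d)
{
        if (nr < MAX_DESC)
                desc_table[nr] = d;             /* kernel: radix_tree_insert()   */
}

struct desc *nr_to_desc(unsigned int nr)
{
        return nr < MAX_DESC ? desc_table[nr] : NULL;  /* radix_tree_lookup()    */
}

void replace_desc(unsigned int nr, struct desc *d)
{
        if (nr < MAX_DESC && desc_table[nr])
                desc_table[nr] = d;             /* radix_tree_replace_slot()     */
}
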
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b2821f070a3d..c63f3bc88f0b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -21,11 +21,7 @@ extern void clear_kstat_irqs(struct irq_desc *desc);
21extern raw_spinlock_t sparse_irq_lock; 21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */ 24void replace_irq_desc(unsigned int irq, struct irq_desc *desc);
25extern struct irq_desc **irq_desc_ptrs;
26#else
27/* irq_desc_ptrs is a fixed size array */
28extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
29#endif 25#endif
30 26
31#ifdef CONFIG_PROC_FS 27#ifdef CONFIG_PROC_FS
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index eb6078ca60c7..704e488730a5 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -382,6 +382,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
382{ 382{
383 struct irq_desc *desc = irq_to_desc(irq); 383 struct irq_desc *desc = irq_to_desc(irq);
384 struct irqaction *action; 384 struct irqaction *action;
385 unsigned long flags;
385 386
386 if (!desc) 387 if (!desc)
387 return 0; 388 return 0;
@@ -389,11 +390,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
389 if (desc->status & IRQ_NOREQUEST) 390 if (desc->status & IRQ_NOREQUEST)
390 return 0; 391 return 0;
391 392
393 raw_spin_lock_irqsave(&desc->lock, flags);
392 action = desc->action; 394 action = desc->action;
393 if (action) 395 if (action)
394 if (irqflags & action->flags & IRQF_SHARED) 396 if (irqflags & action->flags & IRQF_SHARED)
395 action = NULL; 397 action = NULL;
396 398
399 raw_spin_unlock_irqrestore(&desc->lock, flags);
400
397 return !action; 401 return !action;
398} 402}
399 403
@@ -483,8 +487,26 @@ static int irq_wait_for_interrupt(struct irqaction *action)
483 */ 487 */
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 488static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{ 489{
490again:
486 chip_bus_lock(irq, desc); 491 chip_bus_lock(irq, desc);
487 raw_spin_lock_irq(&desc->lock); 492 raw_spin_lock_irq(&desc->lock);
493
494 /*
495 * Implausible though it may be we need to protect us against
496 * the following scenario:
497 *
498 * The thread is faster done than the hard interrupt handler
499 * on the other CPU. If we unmask the irq line then the
500 * interrupt can come in again and masks the line, leaves due
501 * to IRQ_INPROGRESS and the irq line is masked forever.
502 */
503 if (unlikely(desc->status & IRQ_INPROGRESS)) {
504 raw_spin_unlock_irq(&desc->lock);
505 chip_bus_sync_unlock(irq, desc);
506 cpu_relax();
507 goto again;
508 }
509
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 510 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED; 511 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq); 512 desc->chip->unmask(irq);
@@ -735,6 +757,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
735 if (new->flags & IRQF_ONESHOT) 757 if (new->flags & IRQF_ONESHOT)
736 desc->status |= IRQ_ONESHOT; 758 desc->status |= IRQ_ONESHOT;
737 759
760 /*
761 * Force MSI interrupts to run with interrupts
762 * disabled. The multi vector cards can cause stack
763 * overflows due to nested interrupts when enough of
764 * them are directed to a core and fire at the same
765 * time.
766 */
767 if (desc->msi_desc)
768 new->flags |= IRQF_DISABLED;
769
738 if (!(desc->status & IRQ_NOAUTOEN)) { 770 if (!(desc->status & IRQ_NOAUTOEN)) {
739 desc->depth = 0; 771 desc->depth = 0;
740 desc->status &= ~IRQ_DISABLED; 772 desc->status &= ~IRQ_DISABLED;
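
The new loop in irq_finalize_oneshot() checks whether the hard handler is still IRQ_INPROGRESS on another CPU before unmasking; if so it drops the locks, relaxes, and retries rather than unmasking underneath it. Below is a compilable userspace rendering of that back-off-and-retry shape with a pthread mutex; the flag names and sched_yield() stand-ins are my own.

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>

static pthread_mutex_t desc_lock = PTHREAD_MUTEX_INITIALIZER;
static bool in_progress;        /* set by the "hard handler" under desc_lock */
static bool masked = true;

void finalize_oneshot(void)
{
again:
        pthread_mutex_lock(&desc_lock);
        if (in_progress) {
                /* handler still running elsewhere: back off and retry */
                pthread_mutex_unlock(&desc_lock);
                sched_yield();                  /* cpu_relax() in the kernel */
                goto again;
        }
        if (masked)
                masked = false;                 /* safe to unmask now */
        pthread_mutex_unlock(&desc_lock);
}
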
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 26bac9d8f860..65d3845665ac 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -6,6 +6,7 @@
6 */ 6 */
7 7
8#include <linux/irq.h> 8#include <linux/irq.h>
9#include <linux/slab.h>
9#include <linux/module.h> 10#include <linux/module.h>
10#include <linux/random.h> 11#include <linux/random.h>
11#include <linux/interrupt.h> 12#include <linux/interrupt.h>
@@ -70,7 +71,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
70 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 71 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
71 72
72 /* We have to check it to avoid races with another CPU */ 73 /* We have to check it to avoid races with another CPU */
73 desc = irq_desc_ptrs[irq]; 74 desc = irq_to_desc(irq);
74 75
75 if (desc && old_desc != desc) 76 if (desc && old_desc != desc)
76 goto out_unlock; 77 goto out_unlock;
@@ -90,7 +91,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
90 goto out_unlock; 91 goto out_unlock;
91 } 92 }
92 93
93 irq_desc_ptrs[irq] = desc; 94 replace_irq_desc(irq, desc);
94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 95 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
95 96
96 /* free the old one */ 97 /* free the old one */
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6f50eccc79c0..7a6eb04ef6b5 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/gfp.h>
10#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
11#include <linux/seq_file.h> 12#include <linux/seq_file.h>
12#include <linux/interrupt.h> 13#include <linux/interrupt.h>
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8e5288a8a355..13aff293f4de 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -21,6 +21,7 @@
21#include <linux/sched.h> /* for cond_resched */ 21#include <linux/sched.h> /* for cond_resched */
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/ctype.h> 23#include <linux/ctype.h>
24#include <linux/slab.h>
24 25
25#include <asm/sections.h> 26#include <asm/sections.h>
26 27
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ef077fb73155..87ebe8adc474 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -41,7 +41,7 @@
41#include <asm/sections.h> 41#include <asm/sections.h>
42 42
43/* Per cpu memory for storing cpu states in case of system crash. */ 43/* Per cpu memory for storing cpu states in case of system crash. */
44note_buf_t* crash_notes; 44note_buf_t __percpu *crash_notes;
45 45
46/* vmcoreinfo stuff */ 46/* vmcoreinfo stuff */
47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 32c5c15d750d..35edbe22e9a9 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -80,7 +80,7 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
80 80
81 buffer = kmalloc(size, gfp_mask); 81 buffer = kmalloc(size, gfp_mask);
82 if (!buffer) { 82 if (!buffer) {
83 _kfifo_init(fifo, 0, 0); 83 _kfifo_init(fifo, NULL, 0);
84 return -ENOMEM; 84 return -ENOMEM;
85 } 85 }
86 86
@@ -97,6 +97,7 @@ EXPORT_SYMBOL(kfifo_alloc);
97void kfifo_free(struct kfifo *fifo) 97void kfifo_free(struct kfifo *fifo)
98{ 98{
99 kfree(fifo->buffer); 99 kfree(fifo->buffer);
100 _kfifo_init(fifo, NULL, 0);
100} 101}
101EXPORT_SYMBOL(kfifo_free); 102EXPORT_SYMBOL(kfifo_free);
102 103
@@ -349,6 +350,7 @@ EXPORT_SYMBOL(__kfifo_from_user_n);
349 * @fifo: the fifo to be used. 350 * @fifo: the fifo to be used.
350 * @from: pointer to the data to be added. 351 * @from: pointer to the data to be added.
351 * @len: the length of the data to be added. 352 * @len: the length of the data to be added.
353 * @total: the actual returned data length.
352 * 354 *
353 * This function copies at most @len bytes from the @from into the 355 * This function copies at most @len bytes from the @from into the
354 * FIFO depending and returns -EFAULT/0. 356 * FIFO depending and returns -EFAULT/0.
@@ -399,7 +401,7 @@ EXPORT_SYMBOL(__kfifo_to_user_n);
399 * @fifo: the fifo to be used. 401 * @fifo: the fifo to be used.
400 * @to: where the data must be copied. 402 * @to: where the data must be copied.
401 * @len: the size of the destination buffer. 403 * @len: the size of the destination buffer.
402 @ @lenout: pointer to output variable with copied data 404 * @lenout: pointer to output variable with copied data
403 * 405 *
404 * This function copies at most @len bytes from the FIFO into the 406 * This function copies at most @len bytes from the FIFO into the
405 * @to buffer and 0 or -EFAULT. 407 * @to buffer and 0 or -EFAULT.
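
What the kfifo_free() change buys: after freeing the backing buffer the fifo is re-initialised to an empty NULL state, so accidental use after free shows up as "fifo empty" rather than a dangling-pointer access, and the allocation failure path leaves the same well-defined state. A minimal stand-alone ring buffer showing the same reset-on-free discipline, with names of my own:

#include <stdlib.h>

struct ring {
        unsigned char *buffer;
        unsigned int size, in, out;
};

static void ring_init(struct ring *r, unsigned char *buf, unsigned int size)
{
        r->buffer = buf;
        r->size = size;
        r->in = r->out = 0;
}

int ring_alloc(struct ring *r, unsigned int size)
{
        unsigned char *buf = malloc(size);

        if (!buf) {
                ring_init(r, NULL, 0);  /* leave a well-defined empty fifo */
                return -1;
        }
        ring_init(r, buf, size);
        return 0;
}

void ring_free(struct ring *r)
{
        free(r->buffer);
        ring_init(r, NULL, 0);          /* was: buffer pointer left dangling */
}
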
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 2eb517e23514..11f3515ca83f 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -69,9 +69,16 @@ struct kgdb_state {
69 struct pt_regs *linux_regs; 69 struct pt_regs *linux_regs;
70}; 70};
71 71
72/* Exception state values */
73#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
74#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
75#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
76#define DCPU_SSTEP 0x8 /* CPU is single stepping */
77
72static struct debuggerinfo_struct { 78static struct debuggerinfo_struct {
73 void *debuggerinfo; 79 void *debuggerinfo;
74 struct task_struct *task; 80 struct task_struct *task;
81 int exception_state;
75} kgdb_info[NR_CPUS]; 82} kgdb_info[NR_CPUS];
76 83
77/** 84/**
@@ -391,27 +398,22 @@ int kgdb_mem2hex(char *mem, char *buf, int count)
391 398
392/* 399/*
393 * Copy the binary array pointed to by buf into mem. Fix $, #, and 400 * Copy the binary array pointed to by buf into mem. Fix $, #, and
394 * 0x7d escaped with 0x7d. Return a pointer to the character after 401 * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
395 * the last byte written. 402 * The input buf is overwitten with the result to write to mem.
396 */ 403 */
397static int kgdb_ebin2mem(char *buf, char *mem, int count) 404static int kgdb_ebin2mem(char *buf, char *mem, int count)
398{ 405{
399 int err = 0; 406 int size = 0;
400 char c; 407 char *c = buf;
401 408
402 while (count-- > 0) { 409 while (count-- > 0) {
403 c = *buf++; 410 c[size] = *buf++;
404 if (c == 0x7d) 411 if (c[size] == 0x7d)
405 c = *buf++ ^ 0x20; 412 c[size] = *buf++ ^ 0x20;
406 413 size++;
407 err = probe_kernel_write(mem, &c, 1);
408 if (err)
409 break;
410
411 mem++;
412 } 414 }
413 415
414 return err; 416 return probe_kernel_write(mem, c, size);
415} 417}
416 418
417/* 419/*
@@ -563,46 +565,6 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
563} 565}
564 566
565/* 567/*
566 * CPU debug state control:
567 */
568
569#ifdef CONFIG_SMP
570static void kgdb_wait(struct pt_regs *regs)
571{
572 unsigned long flags;
573 int cpu;
574
575 local_irq_save(flags);
576 cpu = raw_smp_processor_id();
577 kgdb_info[cpu].debuggerinfo = regs;
578 kgdb_info[cpu].task = current;
579 /*
580 * Make sure the above info reaches the primary CPU before
581 * our cpu_in_kgdb[] flag setting does:
582 */
583 smp_wmb();
584 atomic_set(&cpu_in_kgdb[cpu], 1);
585
586 /* Wait till primary CPU is done with debugging */
587 while (atomic_read(&passive_cpu_wait[cpu]))
588 cpu_relax();
589
590 kgdb_info[cpu].debuggerinfo = NULL;
591 kgdb_info[cpu].task = NULL;
592
593 /* fix up hardware debug registers on local cpu */
594 if (arch_kgdb_ops.correct_hw_break)
595 arch_kgdb_ops.correct_hw_break();
596
597 /* Signal the primary CPU that we are done: */
598 atomic_set(&cpu_in_kgdb[cpu], 0);
599 touch_softlockup_watchdog();
600 clocksource_touch_watchdog();
601 local_irq_restore(flags);
602}
603#endif
604
605/*
606 * Some architectures need cache flushes when we set/clear a 568 * Some architectures need cache flushes when we set/clear a
607 * breakpoint: 569 * breakpoint:
608 */ 570 */
@@ -1397,34 +1359,13 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
1397 return 1; 1359 return 1;
1398} 1360}
1399 1361
1400/* 1362static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
1401 * kgdb_handle_exception() - main entry point from a kernel exception
1402 *
1403 * Locking hierarchy:
1404 * interface locks, if any (begin_session)
1405 * kgdb lock (kgdb_active)
1406 */
1407int
1408kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1409{ 1363{
1410 struct kgdb_state kgdb_var;
1411 struct kgdb_state *ks = &kgdb_var;
1412 unsigned long flags; 1364 unsigned long flags;
1413 int sstep_tries = 100; 1365 int sstep_tries = 100;
1414 int error = 0; 1366 int error = 0;
1415 int i, cpu; 1367 int i, cpu;
1416 1368 int trace_on = 0;
1417 ks->cpu = raw_smp_processor_id();
1418 ks->ex_vector = evector;
1419 ks->signo = signo;
1420 ks->ex_vector = evector;
1421 ks->err_code = ecode;
1422 ks->kgdb_usethreadid = 0;
1423 ks->linux_regs = regs;
1424
1425 if (kgdb_reenter_check(ks))
1426 return 0; /* Ouch, double exception ! */
1427
1428acquirelock: 1369acquirelock:
1429 /* 1370 /*
1430 * Interrupts will be restored by the 'trap return' code, except when 1371 * Interrupts will be restored by the 'trap return' code, except when
@@ -1432,13 +1373,43 @@ acquirelock:
1432 */ 1373 */
1433 local_irq_save(flags); 1374 local_irq_save(flags);
1434 1375
1435 cpu = raw_smp_processor_id(); 1376 cpu = ks->cpu;
1377 kgdb_info[cpu].debuggerinfo = regs;
1378 kgdb_info[cpu].task = current;
1379 /*
1380 * Make sure the above info reaches the primary CPU before
1381 * our cpu_in_kgdb[] flag setting does:
1382 */
1383 atomic_inc(&cpu_in_kgdb[cpu]);
1436 1384
1437 /* 1385 /*
1438 * Acquire the kgdb_active lock: 1386 * CPU will loop if it is a slave or request to become a kgdb
1387 * master cpu and acquire the kgdb_active lock:
1439 */ 1388 */
1440 while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1) 1389 while (1) {
1390 if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
1391 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
1392 break;
1393 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
1394 if (!atomic_read(&passive_cpu_wait[cpu]))
1395 goto return_normal;
1396 } else {
1397return_normal:
1398 /* Return to normal operation by executing any
1399 * hw breakpoint fixup.
1400 */
1401 if (arch_kgdb_ops.correct_hw_break)
1402 arch_kgdb_ops.correct_hw_break();
1403 if (trace_on)
1404 tracing_on();
1405 atomic_dec(&cpu_in_kgdb[cpu]);
1406 touch_softlockup_watchdog_sync();
1407 clocksource_touch_watchdog();
1408 local_irq_restore(flags);
1409 return 0;
1410 }
1441 cpu_relax(); 1411 cpu_relax();
1412 }
1442 1413
1443 /* 1414 /*
1444 * For single stepping, try to only enter on the processor 1415 * For single stepping, try to only enter on the processor
@@ -1450,7 +1421,7 @@ acquirelock:
1450 (kgdb_info[cpu].task && 1421 (kgdb_info[cpu].task &&
1451 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { 1422 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1452 atomic_set(&kgdb_active, -1); 1423 atomic_set(&kgdb_active, -1);
1453 touch_softlockup_watchdog(); 1424 touch_softlockup_watchdog_sync();
1454 clocksource_touch_watchdog(); 1425 clocksource_touch_watchdog();
1455 local_irq_restore(flags); 1426 local_irq_restore(flags);
1456 1427
@@ -1472,9 +1443,6 @@ acquirelock:
1472 if (kgdb_io_ops->pre_exception) 1443 if (kgdb_io_ops->pre_exception)
1473 kgdb_io_ops->pre_exception(); 1444 kgdb_io_ops->pre_exception();
1474 1445
1475 kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs;
1476 kgdb_info[ks->cpu].task = current;
1477
1478 kgdb_disable_hw_debug(ks->linux_regs); 1446 kgdb_disable_hw_debug(ks->linux_regs);
1479 1447
1480 /* 1448 /*
@@ -1483,15 +1451,9 @@ acquirelock:
1483 */ 1451 */
1484 if (!kgdb_single_step) { 1452 if (!kgdb_single_step) {
1485 for (i = 0; i < NR_CPUS; i++) 1453 for (i = 0; i < NR_CPUS; i++)
1486 atomic_set(&passive_cpu_wait[i], 1); 1454 atomic_inc(&passive_cpu_wait[i]);
1487 } 1455 }
1488 1456
1489 /*
1490 * spin_lock code is good enough as a barrier so we don't
1491 * need one here:
1492 */
1493 atomic_set(&cpu_in_kgdb[ks->cpu], 1);
1494
1495#ifdef CONFIG_SMP 1457#ifdef CONFIG_SMP
1496 /* Signal the other CPUs to enter kgdb_wait() */ 1458 /* Signal the other CPUs to enter kgdb_wait() */
1497 if ((!kgdb_single_step) && kgdb_do_roundup) 1459 if ((!kgdb_single_step) && kgdb_do_roundup)
@@ -1515,6 +1477,9 @@ acquirelock:
1515 kgdb_single_step = 0; 1477 kgdb_single_step = 0;
1516 kgdb_contthread = current; 1478 kgdb_contthread = current;
1517 exception_level = 0; 1479 exception_level = 0;
1480 trace_on = tracing_is_on();
1481 if (trace_on)
1482 tracing_off();
1518 1483
1519 /* Talk to debugger with gdbserial protocol */ 1484 /* Talk to debugger with gdbserial protocol */
1520 error = gdb_serial_stub(ks); 1485 error = gdb_serial_stub(ks);
@@ -1523,13 +1488,11 @@ acquirelock:
1523 if (kgdb_io_ops->post_exception) 1488 if (kgdb_io_ops->post_exception)
1524 kgdb_io_ops->post_exception(); 1489 kgdb_io_ops->post_exception();
1525 1490
1526 kgdb_info[ks->cpu].debuggerinfo = NULL; 1491 atomic_dec(&cpu_in_kgdb[ks->cpu]);
1527 kgdb_info[ks->cpu].task = NULL;
1528 atomic_set(&cpu_in_kgdb[ks->cpu], 0);
1529 1492
1530 if (!kgdb_single_step) { 1493 if (!kgdb_single_step) {
1531 for (i = NR_CPUS-1; i >= 0; i--) 1494 for (i = NR_CPUS-1; i >= 0; i--)
1532 atomic_set(&passive_cpu_wait[i], 0); 1495 atomic_dec(&passive_cpu_wait[i]);
1533 /* 1496 /*
1534 * Wait till all the CPUs have quit 1497 * Wait till all the CPUs have quit
1535 * from the debugger. 1498 * from the debugger.
@@ -1548,22 +1511,63 @@ kgdb_restore:
1548 else 1511 else
1549 kgdb_sstep_pid = 0; 1512 kgdb_sstep_pid = 0;
1550 } 1513 }
1514 if (trace_on)
1515 tracing_on();
1551 /* Free kgdb_active */ 1516 /* Free kgdb_active */
1552 atomic_set(&kgdb_active, -1); 1517 atomic_set(&kgdb_active, -1);
1553 touch_softlockup_watchdog(); 1518 touch_softlockup_watchdog_sync();
1554 clocksource_touch_watchdog(); 1519 clocksource_touch_watchdog();
1555 local_irq_restore(flags); 1520 local_irq_restore(flags);
1556 1521
1557 return error; 1522 return error;
1558} 1523}
1559 1524
1525/*
1526 * kgdb_handle_exception() - main entry point from a kernel exception
1527 *
1528 * Locking hierarchy:
1529 * interface locks, if any (begin_session)
1530 * kgdb lock (kgdb_active)
1531 */
1532int
1533kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1534{
1535 struct kgdb_state kgdb_var;
1536 struct kgdb_state *ks = &kgdb_var;
1537 int ret;
1538
1539 ks->cpu = raw_smp_processor_id();
1540 ks->ex_vector = evector;
1541 ks->signo = signo;
1542 ks->ex_vector = evector;
1543 ks->err_code = ecode;
1544 ks->kgdb_usethreadid = 0;
1545 ks->linux_regs = regs;
1546
1547 if (kgdb_reenter_check(ks))
1548 return 0; /* Ouch, double exception ! */
1549 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
1550 ret = kgdb_cpu_enter(ks, regs);
1551 kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER;
1552 return ret;
1553}
1554
1560int kgdb_nmicallback(int cpu, void *regs) 1555int kgdb_nmicallback(int cpu, void *regs)
1561{ 1556{
1562#ifdef CONFIG_SMP 1557#ifdef CONFIG_SMP
1558 struct kgdb_state kgdb_var;
1559 struct kgdb_state *ks = &kgdb_var;
1560
1561 memset(ks, 0, sizeof(struct kgdb_state));
1562 ks->cpu = cpu;
1563 ks->linux_regs = regs;
1564
1563 if (!atomic_read(&cpu_in_kgdb[cpu]) && 1565 if (!atomic_read(&cpu_in_kgdb[cpu]) &&
1564 atomic_read(&kgdb_active) != cpu && 1566 atomic_read(&kgdb_active) != -1 &&
1565 atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) { 1567 atomic_read(&kgdb_active) != cpu) {
1566 kgdb_wait((struct pt_regs *)regs); 1568 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
1569 kgdb_cpu_enter(ks, regs);
1570 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
1567 return 0; 1571 return 0;
1568 } 1572 }
1569#endif 1573#endif
@@ -1739,11 +1743,11 @@ EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
1739 */ 1743 */
1740void kgdb_breakpoint(void) 1744void kgdb_breakpoint(void)
1741{ 1745{
1742 atomic_set(&kgdb_setting_breakpoint, 1); 1746 atomic_inc(&kgdb_setting_breakpoint);
1743 wmb(); /* Sync point before breakpoint */ 1747 wmb(); /* Sync point before breakpoint */
1744 arch_kgdb_breakpoint(); 1748 arch_kgdb_breakpoint();
1745 wmb(); /* Sync point after breakpoint */ 1749 wmb(); /* Sync point after breakpoint */
1746 atomic_set(&kgdb_setting_breakpoint, 0); 1750 atomic_dec(&kgdb_setting_breakpoint);
1747} 1751}
1748EXPORT_SYMBOL_GPL(kgdb_breakpoint); 1752EXPORT_SYMBOL_GPL(kgdb_breakpoint);
1749 1753
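
The core of the kgdb rework is kgdb_cpu_enter(): every CPU that wants to be master (DCPU_WANT_MASTER) tries to swing kgdb_active from -1 to its own id with atomic_cmpxchg, exactly one wins, and slaves spin until they are released. The program below is a simplified, self-contained analogue of that election with C11 atomics and pthreads, not kgdb itself; thread ids stand in for CPUs and the printf stands in for talking to the debugger.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <sched.h>

#define NCPUS 4

static atomic_int active = ATOMIC_VAR_INIT(-1); /* kgdb_active analogue */

static void *enter_debugger(void *arg)
{
        int cpu = (int)(long)arg;
        int expected = -1;

        /* Loop until we swing active from -1 to our id. */
        while (!atomic_compare_exchange_weak(&active, &expected, cpu)) {
                expected = -1;          /* cmpxchg wrote the current owner back */
                sched_yield();          /* cpu_relax() in the kernel            */
        }

        printf("cpu %d is master\n", cpu);      /* gdb_serial_stub() would run here */

        atomic_store(&active, -1);      /* free kgdb_active */
        return NULL;
}

int main(void)
{
        pthread_t t[NCPUS];

        for (long i = 0; i < NCPUS; i++)
                pthread_create(&t[i], NULL, enter_debugger, (void *)i);
        for (int i = 0; i < NCPUS; i++)
                pthread_join(t[i], NULL);
        return 0;
}
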
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ccec774c716d..0ed46f3e51e9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -42,9 +42,11 @@
42#include <linux/freezer.h> 42#include <linux/freezer.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/sysctl.h>
45#include <linux/kdebug.h> 46#include <linux/kdebug.h>
46#include <linux/memory.h> 47#include <linux/memory.h>
47#include <linux/ftrace.h> 48#include <linux/ftrace.h>
49#include <linux/cpu.h>
48 50
49#include <asm-generic/sections.h> 51#include <asm-generic/sections.h>
50#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
@@ -105,57 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
105 * stepping on the instruction on a vmalloced/kmalloced/data page 107 * stepping on the instruction on a vmalloced/kmalloced/data page
106 * is a recipe for disaster 108 * is a recipe for disaster
107 */ 109 */
108#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
109
110struct kprobe_insn_page { 110struct kprobe_insn_page {
111 struct list_head list; 111 struct list_head list;
112 kprobe_opcode_t *insns; /* Page of instruction slots */ 112 kprobe_opcode_t *insns; /* Page of instruction slots */
113 char slot_used[INSNS_PER_PAGE];
114 int nused; 113 int nused;
115 int ngarbage; 114 int ngarbage;
115 char slot_used[];
116};
117
118#define KPROBE_INSN_PAGE_SIZE(slots) \
119 (offsetof(struct kprobe_insn_page, slot_used) + \
120 (sizeof(char) * (slots)))
121
122struct kprobe_insn_cache {
123 struct list_head pages; /* list of kprobe_insn_page */
124 size_t insn_size; /* size of instruction slot */
125 int nr_garbage;
116}; 126};
117 127
128static int slots_per_page(struct kprobe_insn_cache *c)
129{
130 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
131}
132
118enum kprobe_slot_state { 133enum kprobe_slot_state {
119 SLOT_CLEAN = 0, 134 SLOT_CLEAN = 0,
120 SLOT_DIRTY = 1, 135 SLOT_DIRTY = 1,
121 SLOT_USED = 2, 136 SLOT_USED = 2,
122}; 137};
123 138
124static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 139static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */
125static LIST_HEAD(kprobe_insn_pages); 140static struct kprobe_insn_cache kprobe_insn_slots = {
126static int kprobe_garbage_slots; 141 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
127static int collect_garbage_slots(void); 142 .insn_size = MAX_INSN_SIZE,
143 .nr_garbage = 0,
144};
145static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
128 146
129/** 147/**
130 * __get_insn_slot() - Find a slot on an executable page for an instruction. 148 * __get_insn_slot() - Find a slot on an executable page for an instruction.
131 * We allocate an executable page if there's no room on existing ones. 149 * We allocate an executable page if there's no room on existing ones.
132 */ 150 */
133static kprobe_opcode_t __kprobes *__get_insn_slot(void) 151static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
134{ 152{
135 struct kprobe_insn_page *kip; 153 struct kprobe_insn_page *kip;
136 154
137 retry: 155 retry:
138 list_for_each_entry(kip, &kprobe_insn_pages, list) { 156 list_for_each_entry(kip, &c->pages, list) {
139 if (kip->nused < INSNS_PER_PAGE) { 157 if (kip->nused < slots_per_page(c)) {
140 int i; 158 int i;
141 for (i = 0; i < INSNS_PER_PAGE; i++) { 159 for (i = 0; i < slots_per_page(c); i++) {
142 if (kip->slot_used[i] == SLOT_CLEAN) { 160 if (kip->slot_used[i] == SLOT_CLEAN) {
143 kip->slot_used[i] = SLOT_USED; 161 kip->slot_used[i] = SLOT_USED;
144 kip->nused++; 162 kip->nused++;
145 return kip->insns + (i * MAX_INSN_SIZE); 163 return kip->insns + (i * c->insn_size);
146 } 164 }
147 } 165 }
148 /* Surprise! No unused slots. Fix kip->nused. */ 166 /* kip->nused is broken. Fix it. */
149 kip->nused = INSNS_PER_PAGE; 167 kip->nused = slots_per_page(c);
168 WARN_ON(1);
150 } 169 }
151 } 170 }
152 171
153 /* If there are any garbage slots, collect it and try again. */ 172 /* If there are any garbage slots, collect it and try again. */
154 if (kprobe_garbage_slots && collect_garbage_slots() == 0) { 173 if (c->nr_garbage && collect_garbage_slots(c) == 0)
155 goto retry; 174 goto retry;
156 } 175
157 /* All out of space. Need to allocate a new page. Use slot 0. */ 176 /* All out of space. Need to allocate a new page. */
158 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 177 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
159 if (!kip) 178 if (!kip)
160 return NULL; 179 return NULL;
161 180
@@ -170,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
170 return NULL; 189 return NULL;
171 } 190 }
172 INIT_LIST_HEAD(&kip->list); 191 INIT_LIST_HEAD(&kip->list);
173 list_add(&kip->list, &kprobe_insn_pages); 192 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
174 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
175 kip->slot_used[0] = SLOT_USED; 193 kip->slot_used[0] = SLOT_USED;
176 kip->nused = 1; 194 kip->nused = 1;
177 kip->ngarbage = 0; 195 kip->ngarbage = 0;
196 list_add(&kip->list, &c->pages);
178 return kip->insns; 197 return kip->insns;
179} 198}
180 199
200
181kprobe_opcode_t __kprobes *get_insn_slot(void) 201kprobe_opcode_t __kprobes *get_insn_slot(void)
182{ 202{
183 kprobe_opcode_t *ret; 203 kprobe_opcode_t *ret = NULL;
204
184 mutex_lock(&kprobe_insn_mutex); 205 mutex_lock(&kprobe_insn_mutex);
185 ret = __get_insn_slot(); 206 ret = __get_insn_slot(&kprobe_insn_slots);
186 mutex_unlock(&kprobe_insn_mutex); 207 mutex_unlock(&kprobe_insn_mutex);
208
187 return ret; 209 return ret;
188} 210}
189 211
@@ -199,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
199 * so as not to have to set it up again the 221 * so as not to have to set it up again the
200 * next time somebody inserts a probe. 222 * next time somebody inserts a probe.
201 */ 223 */
202 if (!list_is_singular(&kprobe_insn_pages)) { 224 if (!list_is_singular(&kip->list)) {
203 list_del(&kip->list); 225 list_del(&kip->list);
204 module_free(NULL, kip->insns); 226 module_free(NULL, kip->insns);
205 kfree(kip); 227 kfree(kip);
@@ -209,51 +231,85 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
209 return 0; 231 return 0;
210} 232}
211 233
212static int __kprobes collect_garbage_slots(void) 234static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
213{ 235{
214 struct kprobe_insn_page *kip, *next; 236 struct kprobe_insn_page *kip, *next;
215 237
216 /* Ensure no-one is interrupted on the garbages */ 238 /* Ensure no-one is interrupted on the garbages */
217 synchronize_sched(); 239 synchronize_sched();
218 240
219 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { 241 list_for_each_entry_safe(kip, next, &c->pages, list) {
220 int i; 242 int i;
221 if (kip->ngarbage == 0) 243 if (kip->ngarbage == 0)
222 continue; 244 continue;
223 kip->ngarbage = 0; /* we will collect all garbages */ 245 kip->ngarbage = 0; /* we will collect all garbages */
224 for (i = 0; i < INSNS_PER_PAGE; i++) { 246 for (i = 0; i < slots_per_page(c); i++) {
225 if (kip->slot_used[i] == SLOT_DIRTY && 247 if (kip->slot_used[i] == SLOT_DIRTY &&
226 collect_one_slot(kip, i)) 248 collect_one_slot(kip, i))
227 break; 249 break;
228 } 250 }
229 } 251 }
230 kprobe_garbage_slots = 0; 252 c->nr_garbage = 0;
231 return 0; 253 return 0;
232} 254}
233 255
234void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 256static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
257 kprobe_opcode_t *slot, int dirty)
235{ 258{
236 struct kprobe_insn_page *kip; 259 struct kprobe_insn_page *kip;
237 260
238 mutex_lock(&kprobe_insn_mutex); 261 list_for_each_entry(kip, &c->pages, list) {
239 list_for_each_entry(kip, &kprobe_insn_pages, list) { 262 long idx = ((long)slot - (long)kip->insns) /
240 if (kip->insns <= slot && 263 (c->insn_size * sizeof(kprobe_opcode_t));
241 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 264 if (idx >= 0 && idx < slots_per_page(c)) {
242 int i = (slot - kip->insns) / MAX_INSN_SIZE; 265 WARN_ON(kip->slot_used[idx] != SLOT_USED);
243 if (dirty) { 266 if (dirty) {
244 kip->slot_used[i] = SLOT_DIRTY; 267 kip->slot_used[idx] = SLOT_DIRTY;
245 kip->ngarbage++; 268 kip->ngarbage++;
269 if (++c->nr_garbage > slots_per_page(c))
270 collect_garbage_slots(c);
246 } else 271 } else
247 collect_one_slot(kip, i); 272 collect_one_slot(kip, idx);
248 break; 273 return;
249 } 274 }
250 } 275 }
276 /* Could not free this slot. */
277 WARN_ON(1);
278}
251 279
252 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) 280void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
253 collect_garbage_slots(); 281{
254 282 mutex_lock(&kprobe_insn_mutex);
283 __free_insn_slot(&kprobe_insn_slots, slot, dirty);
255 mutex_unlock(&kprobe_insn_mutex); 284 mutex_unlock(&kprobe_insn_mutex);
256} 285}
286#ifdef CONFIG_OPTPROBES
287/* For optimized_kprobe buffer */
288static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */
289static struct kprobe_insn_cache kprobe_optinsn_slots = {
290 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
291 /* .insn_size is initialized later */
292 .nr_garbage = 0,
293};
294/* Get a slot for optimized_kprobe buffer */
295kprobe_opcode_t __kprobes *get_optinsn_slot(void)
296{
297 kprobe_opcode_t *ret = NULL;
298
299 mutex_lock(&kprobe_optinsn_mutex);
300 ret = __get_insn_slot(&kprobe_optinsn_slots);
301 mutex_unlock(&kprobe_optinsn_mutex);
302
303 return ret;
304}
305
306void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
307{
308 mutex_lock(&kprobe_optinsn_mutex);
309 __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
310 mutex_unlock(&kprobe_optinsn_mutex);
311}
312#endif
257#endif 313#endif
258 314
259/* We have preemption disabled.. so it is safe to use __ versions */ 315/* We have preemption disabled.. so it is safe to use __ versions */
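Aside on the refactoring above: the point of struct kprobe_insn_cache is that the page list, slot size and garbage count travel together, so the same __get_insn_slot()/__free_insn_slot() code can back both the regular insn slots and, under CONFIG_OPTPROBES, the larger optimized-probe buffers. Below is a minimal user-space sketch of that slot-cache pattern, not the kernel code: slot_cache, slot_page and SLOTS_PER_PAGE are invented names, plain malloc() stands in for module_alloc(), and garbage collection is left out.

#include <stdio.h>
#include <stdlib.h>

#define SLOTS_PER_PAGE 8

struct slot_page {
    struct slot_page *next;
    int used[SLOTS_PER_PAGE];   /* 0 = clean, 1 = in use */
    unsigned char *mem;         /* backing store for the slots */
};

struct slot_cache {
    struct slot_page *pages;    /* analogous to c->pages */
    size_t slot_size;           /* analogous to c->insn_size */
};

/* Hand out a free slot, growing the cache by one page if needed. */
static void *cache_get_slot(struct slot_cache *c)
{
    struct slot_page *p;
    int i;

    for (p = c->pages; p; p = p->next)
        for (i = 0; i < SLOTS_PER_PAGE; i++)
            if (!p->used[i]) {
                p->used[i] = 1;
                return p->mem + i * c->slot_size;
            }

    /* No free slot anywhere: add a new page and use its first slot. */
    p = calloc(1, sizeof(*p));
    if (!p)
        return NULL;
    p->mem = malloc(SLOTS_PER_PAGE * c->slot_size);
    if (!p->mem) {
        free(p);
        return NULL;
    }
    p->used[0] = 1;
    p->next = c->pages;
    c->pages = p;
    return p->mem;
}

/* Return a slot to whichever page it came from. */
static void cache_put_slot(struct slot_cache *c, void *slot)
{
    struct slot_page *p;
    unsigned char *s = slot;

    for (p = c->pages; p; p = p->next) {
        if (s >= p->mem && s < p->mem + SLOTS_PER_PAGE * c->slot_size) {
            p->used[(s - p->mem) / c->slot_size] = 0;
            return;
        }
    }
}

int main(void)
{
    struct slot_cache insns = { .pages = NULL, .slot_size = 16 };
    void *a = cache_get_slot(&insns);
    void *b = cache_get_slot(&insns);

    printf("slots: %p %p\n", a, b);
    cache_put_slot(&insns, a);
    cache_put_slot(&insns, b);
    return 0;
}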
@@ -284,23 +340,401 @@ struct kprobe __kprobes *get_kprobe(void *addr)
284 if (p->addr == addr) 340 if (p->addr == addr)
285 return p; 341 return p;
286 } 342 }
343
344 return NULL;
345}
346
347static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
348
349/* Return true if the kprobe is an aggregator */
350static inline int kprobe_aggrprobe(struct kprobe *p)
351{
352 return p->pre_handler == aggr_pre_handler;
353}
354
355/*
356 * Keep all fields in the kprobe consistent
357 */
358static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
359{
360 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
361 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
362}
363
364#ifdef CONFIG_OPTPROBES
365/* NOTE: change this value only with kprobe_mutex held */
366static bool kprobes_allow_optimization;
367
368/*
369 * Call all pre_handler on the list, but ignores its return value.
370 * This must be called from arch-dep optimized caller.
371 */
372void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
373{
374 struct kprobe *kp;
375
376 list_for_each_entry_rcu(kp, &p->list, list) {
377 if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
378 set_kprobe_instance(kp);
379 kp->pre_handler(kp, regs);
380 }
381 reset_kprobe_instance();
382 }
383}
384
385/* Return true(!0) if the kprobe is ready for optimization. */
386static inline int kprobe_optready(struct kprobe *p)
387{
388 struct optimized_kprobe *op;
389
390 if (kprobe_aggrprobe(p)) {
391 op = container_of(p, struct optimized_kprobe, kp);
392 return arch_prepared_optinsn(&op->optinsn);
393 }
394
395 return 0;
396}
397
398/*
399 * Return an optimized kprobe whose optimizing code replaces
400 * instructions including addr (exclude breakpoint).
401 */
402struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
403{
404 int i;
405 struct kprobe *p = NULL;
406 struct optimized_kprobe *op;
407
408 /* Don't check i == 0, since that is a breakpoint case. */
409 for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
410 p = get_kprobe((void *)(addr - i));
411
412 if (p && kprobe_optready(p)) {
413 op = container_of(p, struct optimized_kprobe, kp);
414 if (arch_within_optimized_kprobe(op, addr))
415 return p;
416 }
417
287 return NULL; 418 return NULL;
288} 419}
289 420
421/* Optimization staging list, protected by kprobe_mutex */
422static LIST_HEAD(optimizing_list);
423
424static void kprobe_optimizer(struct work_struct *work);
425static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
426#define OPTIMIZE_DELAY 5
427
428/* Kprobe jump optimizer */
429static __kprobes void kprobe_optimizer(struct work_struct *work)
430{
431 struct optimized_kprobe *op, *tmp;
432
433 /* Lock modules while optimizing kprobes */
434 mutex_lock(&module_mutex);
435 mutex_lock(&kprobe_mutex);
436 if (kprobes_all_disarmed || !kprobes_allow_optimization)
437 goto end;
438
439 /*
 440	 * Wait for quiescence period to ensure all running interrupts
441 * are done. Because optprobe may modify multiple instructions
442 * there is a chance that Nth instruction is interrupted. In that
443 * case, running interrupt can return to 2nd-Nth byte of jump
444 * instruction. This wait is for avoiding it.
445 */
446 synchronize_sched();
447
448 /*
449 * The optimization/unoptimization refers online_cpus via
450 * stop_machine() and cpu-hotplug modifies online_cpus.
451 * And same time, text_mutex will be held in cpu-hotplug and here.
452 * This combination can cause a deadlock (cpu-hotplug try to lock
453 * text_mutex but stop_machine can not be done because online_cpus
454 * has been changed)
455 * To avoid this deadlock, we need to call get_online_cpus()
456 * for preventing cpu-hotplug outside of text_mutex locking.
457 */
458 get_online_cpus();
459 mutex_lock(&text_mutex);
460 list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
461 WARN_ON(kprobe_disabled(&op->kp));
462 if (arch_optimize_kprobe(op) < 0)
463 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
464 list_del_init(&op->list);
465 }
466 mutex_unlock(&text_mutex);
467 put_online_cpus();
468end:
469 mutex_unlock(&kprobe_mutex);
470 mutex_unlock(&module_mutex);
471}
472
473/* Optimize kprobe if p is ready to be optimized */
474static __kprobes void optimize_kprobe(struct kprobe *p)
475{
476 struct optimized_kprobe *op;
477
478 /* Check if the kprobe is disabled or not ready for optimization. */
479 if (!kprobe_optready(p) || !kprobes_allow_optimization ||
480 (kprobe_disabled(p) || kprobes_all_disarmed))
481 return;
482
483 /* Both of break_handler and post_handler are not supported. */
484 if (p->break_handler || p->post_handler)
485 return;
486
487 op = container_of(p, struct optimized_kprobe, kp);
488
489 /* Check there is no other kprobes at the optimized instructions */
490 if (arch_check_optimized_kprobe(op) < 0)
491 return;
492
493 /* Check if it is already optimized. */
494 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
495 return;
496
497 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
498 list_add(&op->list, &optimizing_list);
499 if (!delayed_work_pending(&optimizing_work))
500 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
501}
502
503/* Unoptimize a kprobe if p is optimized */
504static __kprobes void unoptimize_kprobe(struct kprobe *p)
505{
506 struct optimized_kprobe *op;
507
508 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
509 op = container_of(p, struct optimized_kprobe, kp);
510 if (!list_empty(&op->list))
511 /* Dequeue from the optimization queue */
512 list_del_init(&op->list);
513 else
514 /* Replace jump with break */
515 arch_unoptimize_kprobe(op);
516 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
517 }
518}
519
520/* Remove optimized instructions */
521static void __kprobes kill_optimized_kprobe(struct kprobe *p)
522{
523 struct optimized_kprobe *op;
524
525 op = container_of(p, struct optimized_kprobe, kp);
526 if (!list_empty(&op->list)) {
527 /* Dequeue from the optimization queue */
528 list_del_init(&op->list);
529 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
530 }
531 /* Don't unoptimize, because the target code will be freed. */
532 arch_remove_optimized_kprobe(op);
533}
534
535/* Try to prepare optimized instructions */
536static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
537{
538 struct optimized_kprobe *op;
539
540 op = container_of(p, struct optimized_kprobe, kp);
541 arch_prepare_optimized_kprobe(op);
542}
543
544/* Free optimized instructions and optimized_kprobe */
545static __kprobes void free_aggr_kprobe(struct kprobe *p)
546{
547 struct optimized_kprobe *op;
548
549 op = container_of(p, struct optimized_kprobe, kp);
550 arch_remove_optimized_kprobe(op);
551 kfree(op);
552}
553
554/* Allocate new optimized_kprobe and try to prepare optimized instructions */
555static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
556{
557 struct optimized_kprobe *op;
558
559 op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
560 if (!op)
561 return NULL;
562
563 INIT_LIST_HEAD(&op->list);
564 op->kp.addr = p->addr;
565 arch_prepare_optimized_kprobe(op);
566
567 return &op->kp;
568}
569
570static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
571
572/*
573 * Prepare an optimized_kprobe and optimize it
574 * NOTE: p must be a normal registered kprobe
575 */
576static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
577{
578 struct kprobe *ap;
579 struct optimized_kprobe *op;
580
581 ap = alloc_aggr_kprobe(p);
582 if (!ap)
583 return;
584
585 op = container_of(ap, struct optimized_kprobe, kp);
586 if (!arch_prepared_optinsn(&op->optinsn)) {
587 /* If failed to setup optimizing, fallback to kprobe */
588 free_aggr_kprobe(ap);
589 return;
590 }
591
592 init_aggr_kprobe(ap, p);
593 optimize_kprobe(ap);
594}
595
596#ifdef CONFIG_SYSCTL
597static void __kprobes optimize_all_kprobes(void)
598{
599 struct hlist_head *head;
600 struct hlist_node *node;
601 struct kprobe *p;
602 unsigned int i;
603
604 /* If optimization is already allowed, just return */
605 if (kprobes_allow_optimization)
606 return;
607
608 kprobes_allow_optimization = true;
609 mutex_lock(&text_mutex);
610 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
611 head = &kprobe_table[i];
612 hlist_for_each_entry_rcu(p, node, head, hlist)
613 if (!kprobe_disabled(p))
614 optimize_kprobe(p);
615 }
616 mutex_unlock(&text_mutex);
617 printk(KERN_INFO "Kprobes globally optimized\n");
618}
619
620static void __kprobes unoptimize_all_kprobes(void)
621{
622 struct hlist_head *head;
623 struct hlist_node *node;
624 struct kprobe *p;
625 unsigned int i;
626
627 /* If optimization is already prohibited, just return */
628 if (!kprobes_allow_optimization)
629 return;
630
631 kprobes_allow_optimization = false;
632 printk(KERN_INFO "Kprobes globally unoptimized\n");
633 get_online_cpus(); /* For avoiding text_mutex deadlock */
634 mutex_lock(&text_mutex);
635 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
636 head = &kprobe_table[i];
637 hlist_for_each_entry_rcu(p, node, head, hlist) {
638 if (!kprobe_disabled(p))
639 unoptimize_kprobe(p);
640 }
641 }
642
643 mutex_unlock(&text_mutex);
644 put_online_cpus();
645 /* Allow all currently running kprobes to complete */
646 synchronize_sched();
647}
648
649int sysctl_kprobes_optimization;
650int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
651 void __user *buffer, size_t *length,
652 loff_t *ppos)
653{
654 int ret;
655
656 mutex_lock(&kprobe_mutex);
657 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
658 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
659
660 if (sysctl_kprobes_optimization)
661 optimize_all_kprobes();
662 else
663 unoptimize_all_kprobes();
664 mutex_unlock(&kprobe_mutex);
665
666 return ret;
667}
668#endif /* CONFIG_SYSCTL */
669
670static void __kprobes __arm_kprobe(struct kprobe *p)
671{
672 struct kprobe *old_p;
673
674 /* Check collision with other optimized kprobes */
675 old_p = get_optimized_kprobe((unsigned long)p->addr);
676 if (unlikely(old_p))
677 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
678
679 arch_arm_kprobe(p);
680 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
681}
682
683static void __kprobes __disarm_kprobe(struct kprobe *p)
684{
685 struct kprobe *old_p;
686
687 unoptimize_kprobe(p); /* Try to unoptimize */
688 arch_disarm_kprobe(p);
689
690 /* If another kprobe was blocked, optimize it. */
691 old_p = get_optimized_kprobe((unsigned long)p->addr);
692 if (unlikely(old_p))
693 optimize_kprobe(old_p);
694}
695
696#else /* !CONFIG_OPTPROBES */
697
698#define optimize_kprobe(p) do {} while (0)
699#define unoptimize_kprobe(p) do {} while (0)
700#define kill_optimized_kprobe(p) do {} while (0)
701#define prepare_optimized_kprobe(p) do {} while (0)
702#define try_to_optimize_kprobe(p) do {} while (0)
703#define __arm_kprobe(p) arch_arm_kprobe(p)
704#define __disarm_kprobe(p) arch_disarm_kprobe(p)
705
706static __kprobes void free_aggr_kprobe(struct kprobe *p)
707{
708 kfree(p);
709}
710
711static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
712{
713 return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
714}
715#endif /* CONFIG_OPTPROBES */
716
290/* Arm a kprobe with text_mutex */ 717/* Arm a kprobe with text_mutex */
291static void __kprobes arm_kprobe(struct kprobe *kp) 718static void __kprobes arm_kprobe(struct kprobe *kp)
292{ 719{
720 /*
721 * Here, since __arm_kprobe() doesn't use stop_machine(),
722 * this doesn't cause deadlock on text_mutex. So, we don't
723 * need get_online_cpus().
724 */
293 mutex_lock(&text_mutex); 725 mutex_lock(&text_mutex);
294 arch_arm_kprobe(kp); 726 __arm_kprobe(kp);
295 mutex_unlock(&text_mutex); 727 mutex_unlock(&text_mutex);
296} 728}
297 729
298/* Disarm a kprobe with text_mutex */ 730/* Disarm a kprobe with text_mutex */
299static void __kprobes disarm_kprobe(struct kprobe *kp) 731static void __kprobes disarm_kprobe(struct kprobe *kp)
300{ 732{
733 get_online_cpus(); /* For avoiding text_mutex deadlock */
301 mutex_lock(&text_mutex); 734 mutex_lock(&text_mutex);
302 arch_disarm_kprobe(kp); 735 __disarm_kprobe(kp);
303 mutex_unlock(&text_mutex); 736 mutex_unlock(&text_mutex);
737 put_online_cpus();
304} 738}
305 739
306/* 740/*
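Stepping back from the hunk above: optimize_kprobe() only puts the probe on optimizing_list and schedules a single delayed work item; kprobe_optimizer() later waits out a quiescent period and patches the whole batch under text_mutex, with CPU hotplug held off. The following is a rough user-space sketch of that queue-then-batched-flush pattern; pending_list, queue_item() and flush_pending() are invented names, and the quiescence wait and locking are reduced to comments.

#include <stdio.h>
#include <stdlib.h>

struct pending {
    struct pending *next;
    int id;                 /* stands in for an optimized_kprobe */
};

static struct pending *pending_list;
static int flush_scheduled; /* stands in for delayed_work_pending() */

/* Queue one item; schedule a single deferred flush for the whole batch. */
static void queue_item(int id)
{
    struct pending *p = malloc(sizeof(*p));

    if (!p)
        return;
    p->id = id;
    p->next = pending_list;
    pending_list = p;

    if (!flush_scheduled) {
        flush_scheduled = 1;
        printf("flush scheduled (delayed)\n");
    }
}

/* The deferred worker: wait for quiescence once, then patch everything. */
static void flush_pending(void)
{
    struct pending *p, *next;

    /* In the kernel this is synchronize_sched() plus text_mutex. */
    printf("quiescent period over, patching batch:\n");

    for (p = pending_list; p; p = next) {
        next = p->next;
        printf("  patching item %d\n", p->id);
        free(p);
    }
    pending_list = NULL;
    flush_scheduled = 0;
}

int main(void)
{
    queue_item(1);
    queue_item(2);
    queue_item(3);
    flush_pending();
    return 0;
}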
@@ -369,7 +803,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
369void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) 803void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
370{ 804{
371 struct kprobe *kp; 805 struct kprobe *kp;
372 if (p->pre_handler != aggr_pre_handler) { 806 if (!kprobe_aggrprobe(p)) {
373 p->nmissed++; 807 p->nmissed++;
374 } else { 808 } else {
375 list_for_each_entry_rcu(kp, &p->list, list) 809 list_for_each_entry_rcu(kp, &p->list, list)
@@ -493,21 +927,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
493} 927}
494 928
495/* 929/*
496 * Keep all fields in the kprobe consistent
497 */
498static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
499{
500 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
501 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
502}
503
504/*
505* Add the new probe to ap->list. Fail if this is the 930* Add the new probe to ap->list. Fail if this is the
506* second jprobe at the address - two jprobes can't coexist 931* second jprobe at the address - two jprobes can't coexist
507*/ 932*/
508static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) 933static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
509{ 934{
510 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 935 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
936
937 if (p->break_handler || p->post_handler)
938 unoptimize_kprobe(ap); /* Fall back to normal kprobe */
939
511 if (p->break_handler) { 940 if (p->break_handler) {
512 if (ap->break_handler) 941 if (ap->break_handler)
513 return -EEXIST; 942 return -EEXIST;
@@ -522,7 +951,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
522 ap->flags &= ~KPROBE_FLAG_DISABLED; 951 ap->flags &= ~KPROBE_FLAG_DISABLED;
523 if (!kprobes_all_disarmed) 952 if (!kprobes_all_disarmed)
524 /* Arm the breakpoint again. */ 953 /* Arm the breakpoint again. */
525 arm_kprobe(ap); 954 __arm_kprobe(ap);
526 } 955 }
527 return 0; 956 return 0;
528} 957}
@@ -531,12 +960,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
531 * Fill in the required fields of the "manager kprobe". Replace the 960 * Fill in the required fields of the "manager kprobe". Replace the
532 * earlier kprobe in the hlist with the manager kprobe 961 * earlier kprobe in the hlist with the manager kprobe
533 */ 962 */
534static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 963static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
535{ 964{
965 /* Copy p's insn slot to ap */
536 copy_kprobe(p, ap); 966 copy_kprobe(p, ap);
537 flush_insn_slot(ap); 967 flush_insn_slot(ap);
538 ap->addr = p->addr; 968 ap->addr = p->addr;
539 ap->flags = p->flags; 969 ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
540 ap->pre_handler = aggr_pre_handler; 970 ap->pre_handler = aggr_pre_handler;
541 ap->fault_handler = aggr_fault_handler; 971 ap->fault_handler = aggr_fault_handler;
542 /* We don't care the kprobe which has gone. */ 972 /* We don't care the kprobe which has gone. */
@@ -546,8 +976,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
546 ap->break_handler = aggr_break_handler; 976 ap->break_handler = aggr_break_handler;
547 977
548 INIT_LIST_HEAD(&ap->list); 978 INIT_LIST_HEAD(&ap->list);
549 list_add_rcu(&p->list, &ap->list); 979 INIT_HLIST_NODE(&ap->hlist);
550 980
981 list_add_rcu(&p->list, &ap->list);
551 hlist_replace_rcu(&p->hlist, &ap->hlist); 982 hlist_replace_rcu(&p->hlist, &ap->hlist);
552} 983}
553 984
@@ -561,12 +992,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
561 int ret = 0; 992 int ret = 0;
562 struct kprobe *ap = old_p; 993 struct kprobe *ap = old_p;
563 994
564 if (old_p->pre_handler != aggr_pre_handler) { 995 if (!kprobe_aggrprobe(old_p)) {
565 /* If old_p is not an aggr_probe, create new aggr_kprobe. */ 996 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
566 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 997 ap = alloc_aggr_kprobe(old_p);
567 if (!ap) 998 if (!ap)
568 return -ENOMEM; 999 return -ENOMEM;
569 add_aggr_kprobe(ap, old_p); 1000 init_aggr_kprobe(ap, old_p);
570 } 1001 }
571 1002
572 if (kprobe_gone(ap)) { 1003 if (kprobe_gone(ap)) {
@@ -585,6 +1016,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
585 */ 1016 */
586 return ret; 1017 return ret;
587 1018
1019 /* Prepare optimized instructions if possible. */
1020 prepare_optimized_kprobe(ap);
1021
588 /* 1022 /*
589 * Clear gone flag to prevent allocating new slot again, and 1023 * Clear gone flag to prevent allocating new slot again, and
590 * set disabled flag because it is not armed yet. 1024 * set disabled flag because it is not armed yet.
@@ -593,6 +1027,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
593 | KPROBE_FLAG_DISABLED; 1027 | KPROBE_FLAG_DISABLED;
594 } 1028 }
595 1029
1030 /* Copy ap's insn slot to p */
596 copy_kprobe(ap, p); 1031 copy_kprobe(ap, p);
597 return add_new_kprobe(ap, p); 1032 return add_new_kprobe(ap, p);
598} 1033}
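For context on register_aggr_kprobe()/init_aggr_kprobe(): an aggregator is installed once a second probe is registered at an address that already has one, and its pre-handler simply walks the list of child probes. A compact user-space sketch of that fan-out, with invented names (struct probe, aggr_dispatch):

#include <stdio.h>

struct probe {
    void (*handler)(struct probe *);
    struct probe *next;     /* sibling on the aggregator's list */
};

/* The aggregator's handler: call every child handler in turn. */
static void aggr_dispatch(struct probe *head)
{
    struct probe *p;

    for (p = head; p; p = p->next)
        p->handler(p);
}

static void handler_a(struct probe *p) { (void)p; printf("handler A fired\n"); }
static void handler_b(struct probe *p) { (void)p; printf("handler B fired\n"); }

int main(void)
{
    struct probe b = { handler_b, NULL };
    struct probe a = { handler_a, &b };

    /* One breakpoint hit -> the aggregator fans out to both probes. */
    aggr_dispatch(&a);
    return 0;
}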
@@ -743,27 +1178,34 @@ int __kprobes register_kprobe(struct kprobe *p)
743 p->nmissed = 0; 1178 p->nmissed = 0;
744 INIT_LIST_HEAD(&p->list); 1179 INIT_LIST_HEAD(&p->list);
745 mutex_lock(&kprobe_mutex); 1180 mutex_lock(&kprobe_mutex);
1181
1182 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1183 mutex_lock(&text_mutex);
1184
746 old_p = get_kprobe(p->addr); 1185 old_p = get_kprobe(p->addr);
747 if (old_p) { 1186 if (old_p) {
1187 /* Since this may unoptimize old_p, locking text_mutex. */
748 ret = register_aggr_kprobe(old_p, p); 1188 ret = register_aggr_kprobe(old_p, p);
749 goto out; 1189 goto out;
750 } 1190 }
751 1191
752 mutex_lock(&text_mutex);
753 ret = arch_prepare_kprobe(p); 1192 ret = arch_prepare_kprobe(p);
754 if (ret) 1193 if (ret)
755 goto out_unlock_text; 1194 goto out;
756 1195
757 INIT_HLIST_NODE(&p->hlist); 1196 INIT_HLIST_NODE(&p->hlist);
758 hlist_add_head_rcu(&p->hlist, 1197 hlist_add_head_rcu(&p->hlist,
759 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 1198 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
760 1199
761 if (!kprobes_all_disarmed && !kprobe_disabled(p)) 1200 if (!kprobes_all_disarmed && !kprobe_disabled(p))
762 arch_arm_kprobe(p); 1201 __arm_kprobe(p);
1202
1203 /* Try to optimize kprobe */
1204 try_to_optimize_kprobe(p);
763 1205
764out_unlock_text:
765 mutex_unlock(&text_mutex);
766out: 1206out:
1207 mutex_unlock(&text_mutex);
1208 put_online_cpus();
767 mutex_unlock(&kprobe_mutex); 1209 mutex_unlock(&kprobe_mutex);
768 1210
769 if (probed_mod) 1211 if (probed_mod)
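The reordered locking in register_kprobe() above pins down one lock order -- kprobe_mutex, then CPU hotplug via get_online_cpus(), then text_mutex -- released in reverse, which is what keeps registration, the optimizer and hotplug from deadlocking against each other. A trivial user-space analogue of that acquire-in-order/release-in-reverse discipline (the lock names here are illustrative only; build with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ kprobe_mutex */
static pthread_mutex_t hotplug_lock  = PTHREAD_MUTEX_INITIALIZER; /* ~ cpu hotplug */
static pthread_mutex_t text_lock     = PTHREAD_MUTEX_INITIALIZER; /* ~ text_mutex */

static void register_probe(const char *name)
{
    /* Always acquire in the same order... */
    pthread_mutex_lock(&registry_lock);
    pthread_mutex_lock(&hotplug_lock);
    pthread_mutex_lock(&text_lock);

    printf("patching text for %s\n", name);

    /* ...and release in the reverse order. */
    pthread_mutex_unlock(&text_lock);
    pthread_mutex_unlock(&hotplug_lock);
    pthread_mutex_unlock(&registry_lock);
}

int main(void)
{
    register_probe("my_probe");
    return 0;
}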
@@ -785,7 +1227,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
785 return -EINVAL; 1227 return -EINVAL;
786 1228
787 if (old_p == p || 1229 if (old_p == p ||
788 (old_p->pre_handler == aggr_pre_handler && 1230 (kprobe_aggrprobe(old_p) &&
789 list_is_singular(&old_p->list))) { 1231 list_is_singular(&old_p->list))) {
790 /* 1232 /*
791 * Only probe on the hash list. Disarm only if kprobes are 1233 * Only probe on the hash list. Disarm only if kprobes are
@@ -793,7 +1235,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
793 * already have been removed. We save on flushing icache. 1235 * already have been removed. We save on flushing icache.
794 */ 1236 */
795 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1237 if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
796 disarm_kprobe(p); 1238 disarm_kprobe(old_p);
797 hlist_del_rcu(&old_p->hlist); 1239 hlist_del_rcu(&old_p->hlist);
798 } else { 1240 } else {
799 if (p->break_handler && !kprobe_gone(p)) 1241 if (p->break_handler && !kprobe_gone(p))
@@ -809,8 +1251,13 @@ noclean:
809 list_del_rcu(&p->list); 1251 list_del_rcu(&p->list);
810 if (!kprobe_disabled(old_p)) { 1252 if (!kprobe_disabled(old_p)) {
811 try_to_disable_aggr_kprobe(old_p); 1253 try_to_disable_aggr_kprobe(old_p);
812 if (!kprobes_all_disarmed && kprobe_disabled(old_p)) 1254 if (!kprobes_all_disarmed) {
813 disarm_kprobe(old_p); 1255 if (kprobe_disabled(old_p))
1256 disarm_kprobe(old_p);
1257 else
1258 /* Try to optimize this probe again */
1259 optimize_kprobe(old_p);
1260 }
814 } 1261 }
815 } 1262 }
816 return 0; 1263 return 0;
@@ -827,7 +1274,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
827 old_p = list_entry(p->list.next, struct kprobe, list); 1274 old_p = list_entry(p->list.next, struct kprobe, list);
828 list_del(&p->list); 1275 list_del(&p->list);
829 arch_remove_kprobe(old_p); 1276 arch_remove_kprobe(old_p);
830 kfree(old_p); 1277 free_aggr_kprobe(old_p);
831 } 1278 }
832} 1279}
833 1280
@@ -1123,7 +1570,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1123 struct kprobe *kp; 1570 struct kprobe *kp;
1124 1571
1125 p->flags |= KPROBE_FLAG_GONE; 1572 p->flags |= KPROBE_FLAG_GONE;
1126 if (p->pre_handler == aggr_pre_handler) { 1573 if (kprobe_aggrprobe(p)) {
1127 /* 1574 /*
1128 * If this is an aggr_kprobe, we have to list all the 1575 * If this is an aggr_kprobe, we have to list all the
1129 * chained probes and mark them GONE. 1576 * chained probes and mark them GONE.
@@ -1132,6 +1579,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1132 kp->flags |= KPROBE_FLAG_GONE; 1579 kp->flags |= KPROBE_FLAG_GONE;
1133 p->post_handler = NULL; 1580 p->post_handler = NULL;
1134 p->break_handler = NULL; 1581 p->break_handler = NULL;
1582 kill_optimized_kprobe(p);
1135 } 1583 }
1136 /* 1584 /*
1137 * Here, we can remove insn_slot safely, because no thread calls 1585 * Here, we can remove insn_slot safely, because no thread calls
@@ -1241,6 +1689,15 @@ static int __init init_kprobes(void)
1241 } 1689 }
1242 } 1690 }
1243 1691
1692#if defined(CONFIG_OPTPROBES)
1693#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
1694 /* Init kprobe_optinsn_slots */
1695 kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
1696#endif
1697 /* By default, kprobes can be optimized */
1698 kprobes_allow_optimization = true;
1699#endif
1700
1244 /* By default, kprobes are armed */ 1701 /* By default, kprobes are armed */
1245 kprobes_all_disarmed = false; 1702 kprobes_all_disarmed = false;
1246 1703
@@ -1259,7 +1716,7 @@ static int __init init_kprobes(void)
1259 1716
1260#ifdef CONFIG_DEBUG_FS 1717#ifdef CONFIG_DEBUG_FS
1261static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, 1718static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1262 const char *sym, int offset,char *modname) 1719 const char *sym, int offset, char *modname, struct kprobe *pp)
1263{ 1720{
1264 char *kprobe_type; 1721 char *kprobe_type;
1265 1722
@@ -1269,19 +1726,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1269 kprobe_type = "j"; 1726 kprobe_type = "j";
1270 else 1727 else
1271 kprobe_type = "k"; 1728 kprobe_type = "k";
1729
1272 if (sym) 1730 if (sym)
1273 seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", 1731 seq_printf(pi, "%p %s %s+0x%x %s ",
1274 p->addr, kprobe_type, sym, offset, 1732 p->addr, kprobe_type, sym, offset,
1275 (modname ? modname : " "), 1733 (modname ? modname : " "));
1276 (kprobe_gone(p) ? "[GONE]" : ""),
1277 ((kprobe_disabled(p) && !kprobe_gone(p)) ?
1278 "[DISABLED]" : ""));
1279 else 1734 else
1280 seq_printf(pi, "%p %s %p %s%s\n", 1735 seq_printf(pi, "%p %s %p ",
1281 p->addr, kprobe_type, p->addr, 1736 p->addr, kprobe_type, p->addr);
1282 (kprobe_gone(p) ? "[GONE]" : ""), 1737
1283 ((kprobe_disabled(p) && !kprobe_gone(p)) ? 1738 if (!pp)
1284 "[DISABLED]" : "")); 1739 pp = p;
1740 seq_printf(pi, "%s%s%s\n",
1741 (kprobe_gone(p) ? "[GONE]" : ""),
1742 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
1743 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
1285} 1744}
1286 1745
1287static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1746static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
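The report_probe() rework above splits the output into two seq_printf() calls so that a third tag, [OPTIMIZED], can be appended based on the parent aggregator (pp) rather than the child probe itself. A toy version of that flag-to-tag formatting; the flag bits and probe names are invented:

#include <stdio.h>

#define F_GONE      0x1
#define F_DISABLED  0x2
#define F_OPTIMIZED 0x4

static void report(const char *name, unsigned int flags)
{
    printf("%s %s%s%s\n", name,
           (flags & F_GONE) ? "[GONE]" : "",
           ((flags & F_DISABLED) && !(flags & F_GONE)) ? "[DISABLED]" : "",
           (flags & F_OPTIMIZED) ? "[OPTIMIZED]" : "");
}

int main(void)
{
    report("my_probe_site", F_OPTIMIZED);
    report("stale_probe_site", F_GONE);
    return 0;
}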
@@ -1317,11 +1776,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
1317 hlist_for_each_entry_rcu(p, node, head, hlist) { 1776 hlist_for_each_entry_rcu(p, node, head, hlist) {
1318 sym = kallsyms_lookup((unsigned long)p->addr, NULL, 1777 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
1319 &offset, &modname, namebuf); 1778 &offset, &modname, namebuf);
1320 if (p->pre_handler == aggr_pre_handler) { 1779 if (kprobe_aggrprobe(p)) {
1321 list_for_each_entry_rcu(kp, &p->list, list) 1780 list_for_each_entry_rcu(kp, &p->list, list)
1322 report_probe(pi, kp, sym, offset, modname); 1781 report_probe(pi, kp, sym, offset, modname, p);
1323 } else 1782 } else
1324 report_probe(pi, p, sym, offset, modname); 1783 report_probe(pi, p, sym, offset, modname, NULL);
1325 } 1784 }
1326 preempt_enable(); 1785 preempt_enable();
1327 return 0; 1786 return 0;
@@ -1399,12 +1858,13 @@ int __kprobes enable_kprobe(struct kprobe *kp)
1399 goto out; 1858 goto out;
1400 } 1859 }
1401 1860
1402 if (!kprobes_all_disarmed && kprobe_disabled(p))
1403 arm_kprobe(p);
1404
1405 p->flags &= ~KPROBE_FLAG_DISABLED;
1406 if (p != kp) 1861 if (p != kp)
1407 kp->flags &= ~KPROBE_FLAG_DISABLED; 1862 kp->flags &= ~KPROBE_FLAG_DISABLED;
1863
1864 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1865 p->flags &= ~KPROBE_FLAG_DISABLED;
1866 arm_kprobe(p);
1867 }
1408out: 1868out:
1409 mutex_unlock(&kprobe_mutex); 1869 mutex_unlock(&kprobe_mutex);
1410 return ret; 1870 return ret;
@@ -1424,12 +1884,13 @@ static void __kprobes arm_all_kprobes(void)
1424 if (!kprobes_all_disarmed) 1884 if (!kprobes_all_disarmed)
1425 goto already_enabled; 1885 goto already_enabled;
1426 1886
1887 /* Arming kprobes doesn't optimize kprobe itself */
1427 mutex_lock(&text_mutex); 1888 mutex_lock(&text_mutex);
1428 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1889 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1429 head = &kprobe_table[i]; 1890 head = &kprobe_table[i];
1430 hlist_for_each_entry_rcu(p, node, head, hlist) 1891 hlist_for_each_entry_rcu(p, node, head, hlist)
1431 if (!kprobe_disabled(p)) 1892 if (!kprobe_disabled(p))
1432 arch_arm_kprobe(p); 1893 __arm_kprobe(p);
1433 } 1894 }
1434 mutex_unlock(&text_mutex); 1895 mutex_unlock(&text_mutex);
1435 1896
@@ -1456,16 +1917,23 @@ static void __kprobes disarm_all_kprobes(void)
1456 1917
1457 kprobes_all_disarmed = true; 1918 kprobes_all_disarmed = true;
1458 printk(KERN_INFO "Kprobes globally disabled\n"); 1919 printk(KERN_INFO "Kprobes globally disabled\n");
1920
1921 /*
1922 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1923 * because disarming may also unoptimize kprobes.
1924 */
1925 get_online_cpus();
1459 mutex_lock(&text_mutex); 1926 mutex_lock(&text_mutex);
1460 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1927 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1461 head = &kprobe_table[i]; 1928 head = &kprobe_table[i];
1462 hlist_for_each_entry_rcu(p, node, head, hlist) { 1929 hlist_for_each_entry_rcu(p, node, head, hlist) {
1463 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 1930 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1464 arch_disarm_kprobe(p); 1931 __disarm_kprobe(p);
1465 } 1932 }
1466 } 1933 }
1467 1934
1468 mutex_unlock(&text_mutex); 1935 mutex_unlock(&text_mutex);
1936 put_online_cpus();
1469 mutex_unlock(&kprobe_mutex); 1937 mutex_unlock(&kprobe_mutex);
1470 /* Allow all currently running kprobes to complete */ 1938 /* Allow all currently running kprobes to complete */
1471 synchronize_sched(); 1939 synchronize_sched();
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3feaf5a74514..21fe3c426948 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
33} 33}
34KERNEL_ATTR_RO(uevent_seqnum); 34KERNEL_ATTR_RO(uevent_seqnum);
35 35
36/* uevent helper program, used during early boo */ 36/* uevent helper program, used during early boot */
37static ssize_t uevent_helper_show(struct kobject *kobj, 37static ssize_t uevent_helper_show(struct kobject *kobj,
38 struct kobj_attribute *attr, char *buf) 38 struct kobj_attribute *attr, char *buf)
39{ 39{
@@ -197,16 +197,8 @@ static int __init ksysfs_init(void)
197 goto group_exit; 197 goto group_exit;
198 } 198 }
199 199
200 /* create the /sys/kernel/uids/ directory */
201 error = uids_sysfs_init();
202 if (error)
203 goto notes_exit;
204
205 return 0; 200 return 0;
206 201
207notes_exit:
208 if (notes_size > 0)
209 sysfs_remove_bin_file(kernel_kobj, &notes_attr);
210group_exit: 202group_exit:
211 sysfs_remove_group(kernel_kobj, &kernel_attr_group); 203 sysfs_remove_group(kernel_kobj, &kernel_attr_group);
212kset_exit: 204kset_exit:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fbb6222fe7e0..83911c780175 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create)
101 * 101 *
102 * Description: This helper function creates and names a kernel 102 * Description: This helper function creates and names a kernel
103 * thread. The thread will be stopped: use wake_up_process() to start 103 * thread. The thread will be stopped: use wake_up_process() to start
104 * it. See also kthread_run(), kthread_create_on_cpu(). 104 * it. See also kthread_run().
105 * 105 *
106 * When woken, the thread will run @threadfn() with @data as its 106 * When woken, the thread will run @threadfn() with @data as its
107 * argument. @threadfn() can either call do_exit() directly if it is a 107 * argument. @threadfn() can either call do_exit() directly if it is a
@@ -219,7 +219,7 @@ int kthreadd(void *unused)
219 set_task_comm(tsk, "kthreadd"); 219 set_task_comm(tsk, "kthreadd");
220 ignore_signals(tsk); 220 ignore_signals(tsk);
221 set_cpus_allowed_ptr(tsk, cpu_all_mask); 221 set_cpus_allowed_ptr(tsk, cpu_all_mask);
222 set_mems_allowed(node_possible_map); 222 set_mems_allowed(node_states[N_HIGH_MEMORY]);
223 223
224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
225 225
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index ca07c5c0c914..877fb306d415 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -56,7 +56,6 @@
56#include <linux/module.h> 56#include <linux/module.h>
57#include <linux/sched.h> 57#include <linux/sched.h>
58#include <linux/list.h> 58#include <linux/list.h>
59#include <linux/slab.h>
60#include <linux/stacktrace.h> 59#include <linux/stacktrace.h>
61 60
62static DEFINE_SPINLOCK(latency_lock); 61static DEFINE_SPINLOCK(latency_lock);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 5feaddcdbe49..2594e1ce41cb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -43,6 +43,7 @@
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h> 45#include <linux/bitops.h>
46#include <linux/gfp.h>
46 47
47#include <asm/sections.h> 48#include <asm/sections.h>
48 49
@@ -582,9 +583,6 @@ static int static_obj(void *obj)
582 unsigned long start = (unsigned long) &_stext, 583 unsigned long start = (unsigned long) &_stext,
583 end = (unsigned long) &_end, 584 end = (unsigned long) &_end,
584 addr = (unsigned long) obj; 585 addr = (unsigned long) obj;
585#ifdef CONFIG_SMP
586 int i;
587#endif
588 586
589 /* 587 /*
590 * static variable? 588 * static variable?
@@ -595,24 +593,16 @@ static int static_obj(void *obj)
595 if (arch_is_kernel_data(addr)) 593 if (arch_is_kernel_data(addr))
596 return 1; 594 return 1;
597 595
598#ifdef CONFIG_SMP
599 /* 596 /*
600 * percpu var? 597 * in-kernel percpu var?
601 */ 598 */
602 for_each_possible_cpu(i) { 599 if (is_kernel_percpu_address(addr))
603 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); 600 return 1;
604 end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
605 + per_cpu_offset(i);
606
607 if ((addr >= start) && (addr < end))
608 return 1;
609 }
610#endif
611 601
612 /* 602 /*
613 * module var? 603 * module static or percpu var?
614 */ 604 */
615 return is_module_address(addr); 605 return is_module_address(addr) || is_module_percpu_address(addr);
616} 606}
617 607
618/* 608/*
@@ -2147,7 +2137,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
2147 return ret; 2137 return ret;
2148 2138
2149 return print_irq_inversion_bug(curr, &root, target_entry, 2139 return print_irq_inversion_bug(curr, &root, target_entry,
2150 this, 1, irqclass); 2140 this, 0, irqclass);
2151} 2141}
2152 2142
2153void print_irqtrace_events(struct task_struct *curr) 2143void print_irqtrace_events(struct task_struct *curr)
@@ -3211,8 +3201,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3211{ 3201{
3212 unsigned long flags; 3202 unsigned long flags;
3213 3203
3214 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
3215
3216 if (unlikely(current->lockdep_recursion)) 3204 if (unlikely(current->lockdep_recursion))
3217 return; 3205 return;
3218 3206
@@ -3220,6 +3208,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3220 check_flags(flags); 3208 check_flags(flags);
3221 3209
3222 current->lockdep_recursion = 1; 3210 current->lockdep_recursion = 1;
3211 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
3223 __lock_acquire(lock, subclass, trylock, read, check, 3212 __lock_acquire(lock, subclass, trylock, read, check,
3224 irqs_disabled_flags(flags), nest_lock, ip, 0); 3213 irqs_disabled_flags(flags), nest_lock, ip, 0);
3225 current->lockdep_recursion = 0; 3214 current->lockdep_recursion = 0;
@@ -3232,14 +3221,13 @@ void lock_release(struct lockdep_map *lock, int nested,
3232{ 3221{
3233 unsigned long flags; 3222 unsigned long flags;
3234 3223
3235 trace_lock_release(lock, nested, ip);
3236
3237 if (unlikely(current->lockdep_recursion)) 3224 if (unlikely(current->lockdep_recursion))
3238 return; 3225 return;
3239 3226
3240 raw_local_irq_save(flags); 3227 raw_local_irq_save(flags);
3241 check_flags(flags); 3228 check_flags(flags);
3242 current->lockdep_recursion = 1; 3229 current->lockdep_recursion = 1;
3230 trace_lock_release(lock, nested, ip);
3243 __lock_release(lock, nested, ip); 3231 __lock_release(lock, nested, ip);
3244 current->lockdep_recursion = 0; 3232 current->lockdep_recursion = 0;
3245 raw_local_irq_restore(flags); 3233 raw_local_irq_restore(flags);
@@ -3413,8 +3401,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3413{ 3401{
3414 unsigned long flags; 3402 unsigned long flags;
3415 3403
3416 trace_lock_contended(lock, ip);
3417
3418 if (unlikely(!lock_stat)) 3404 if (unlikely(!lock_stat))
3419 return; 3405 return;
3420 3406
@@ -3424,6 +3410,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3424 raw_local_irq_save(flags); 3410 raw_local_irq_save(flags);
3425 check_flags(flags); 3411 check_flags(flags);
3426 current->lockdep_recursion = 1; 3412 current->lockdep_recursion = 1;
3413 trace_lock_contended(lock, ip);
3427 __lock_contended(lock, ip); 3414 __lock_contended(lock, ip);
3428 current->lockdep_recursion = 0; 3415 current->lockdep_recursion = 0;
3429 raw_local_irq_restore(flags); 3416 raw_local_irq_restore(flags);
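The lockdep hunks above all make the same change: the trace_lock_* tracepoints now fire only after current->lockdep_recursion is set, so any locks taken inside the tracing machinery cannot recurse back into lockdep. A minimal sketch of such a re-entrancy guard using a thread-local flag; instrument() and lock_acquire_annotated() are invented names:

#include <stdio.h>

static _Thread_local int in_lock_code;  /* ~ current->lockdep_recursion */

static void instrument(const char *what)
{
    /* Pretend this path could itself take locks and re-enter us. */
    printf("trace: %s\n", what);
}

static void lock_acquire_annotated(const char *name)
{
    if (in_lock_code)
        return;             /* already inside: re-entry is a no-op */

    in_lock_code = 1;
    instrument(name);       /* safe: any recursion bails out above */
    /* ... real lock bookkeeping would run here ... */
    in_lock_code = 0;
}

int main(void)
{
    lock_acquire_annotated("my_lock");
    return 0;
}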
@@ -3809,3 +3796,22 @@ void lockdep_sys_exit(void)
3809 lockdep_print_held_locks(curr); 3796 lockdep_print_held_locks(curr);
3810 } 3797 }
3811} 3798}
3799
3800void lockdep_rcu_dereference(const char *file, const int line)
3801{
3802 struct task_struct *curr = current;
3803
3804 if (!debug_locks_off())
3805 return;
3806 printk("\n===================================================\n");
3807 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
3808 printk( "---------------------------------------------------\n");
3809 printk("%s:%d invoked rcu_dereference_check() without protection!\n",
3810 file, line);
3811 printk("\nother info that might help us debug this:\n\n");
3812 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
3813 lockdep_print_held_locks(curr);
3814 printk("\nstack backtrace:\n");
3815 dump_stack();
3816}
3817EXPORT_SYMBOL_GPL(lockdep_rcu_dereference);
diff --git a/kernel/module.c b/kernel/module.c
index f82386bd9ee9..1016b75b026a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -370,27 +370,33 @@ EXPORT_SYMBOL_GPL(find_module);
370 370
371#ifdef CONFIG_SMP 371#ifdef CONFIG_SMP
372 372
373static void *percpu_modalloc(unsigned long size, unsigned long align, 373static inline void __percpu *mod_percpu(struct module *mod)
374 const char *name)
375{ 374{
376 void *ptr; 375 return mod->percpu;
376}
377 377
378static int percpu_modalloc(struct module *mod,
379 unsigned long size, unsigned long align)
380{
378 if (align > PAGE_SIZE) { 381 if (align > PAGE_SIZE) {
379 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 382 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
380 name, align, PAGE_SIZE); 383 mod->name, align, PAGE_SIZE);
381 align = PAGE_SIZE; 384 align = PAGE_SIZE;
382 } 385 }
383 386
384 ptr = __alloc_reserved_percpu(size, align); 387 mod->percpu = __alloc_reserved_percpu(size, align);
385 if (!ptr) 388 if (!mod->percpu) {
386 printk(KERN_WARNING 389 printk(KERN_WARNING
387 "Could not allocate %lu bytes percpu data\n", size); 390 "Could not allocate %lu bytes percpu data\n", size);
388 return ptr; 391 return -ENOMEM;
392 }
393 mod->percpu_size = size;
394 return 0;
389} 395}
390 396
391static void percpu_modfree(void *freeme) 397static void percpu_modfree(struct module *mod)
392{ 398{
393 free_percpu(freeme); 399 free_percpu(mod->percpu);
394} 400}
395 401
396static unsigned int find_pcpusec(Elf_Ehdr *hdr, 402static unsigned int find_pcpusec(Elf_Ehdr *hdr,
@@ -400,24 +406,62 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
400 return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); 406 return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
401} 407}
402 408
403static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) 409static void percpu_modcopy(struct module *mod,
410 const void *from, unsigned long size)
404{ 411{
405 int cpu; 412 int cpu;
406 413
407 for_each_possible_cpu(cpu) 414 for_each_possible_cpu(cpu)
408 memcpy(pcpudest + per_cpu_offset(cpu), from, size); 415 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
416}
417
418/**
419 * is_module_percpu_address - test whether address is from module static percpu
420 * @addr: address to test
421 *
422 * Test whether @addr belongs to module static percpu area.
423 *
424 * RETURNS:
425 * %true if @addr is from module static percpu area
426 */
427bool is_module_percpu_address(unsigned long addr)
428{
429 struct module *mod;
430 unsigned int cpu;
431
432 preempt_disable();
433
434 list_for_each_entry_rcu(mod, &modules, list) {
435 if (!mod->percpu_size)
436 continue;
437 for_each_possible_cpu(cpu) {
438 void *start = per_cpu_ptr(mod->percpu, cpu);
439
440 if ((void *)addr >= start &&
441 (void *)addr < start + mod->percpu_size) {
442 preempt_enable();
443 return true;
444 }
445 }
446 }
447
448 preempt_enable();
449 return false;
409} 450}
410 451
411#else /* ... !CONFIG_SMP */ 452#else /* ... !CONFIG_SMP */
412 453
413static inline void *percpu_modalloc(unsigned long size, unsigned long align, 454static inline void __percpu *mod_percpu(struct module *mod)
414 const char *name)
415{ 455{
416 return NULL; 456 return NULL;
417} 457}
418static inline void percpu_modfree(void *pcpuptr) 458static inline int percpu_modalloc(struct module *mod,
459 unsigned long size, unsigned long align)
460{
461 return -ENOMEM;
462}
463static inline void percpu_modfree(struct module *mod)
419{ 464{
420 BUG();
421} 465}
422static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, 466static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
423 Elf_Shdr *sechdrs, 467 Elf_Shdr *sechdrs,
@@ -425,12 +469,16 @@ static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
425{ 469{
426 return 0; 470 return 0;
427} 471}
428static inline void percpu_modcopy(void *pcpudst, const void *src, 472static inline void percpu_modcopy(struct module *mod,
429 unsigned long size) 473 const void *from, unsigned long size)
430{ 474{
431 /* pcpusec should be 0, and size of that section should be 0. */ 475 /* pcpusec should be 0, and size of that section should be 0. */
432 BUG_ON(size != 0); 476 BUG_ON(size != 0);
433} 477}
478bool is_module_percpu_address(unsigned long addr)
479{
480 return false;
481}
434 482
435#endif /* CONFIG_SMP */ 483#endif /* CONFIG_SMP */
436 484
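The new is_module_percpu_address() above is, per possible CPU, a plain range test: does the address fall inside [per_cpu_ptr(mod->percpu, cpu), per_cpu_ptr(mod->percpu, cpu) + mod->percpu_size)? A user-space sketch of that membership test against a fake per-CPU layout; NR_CPUS, bases[] and chunk_size are invented for illustration:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

static void *bases[NR_CPUS];    /* one copy of the chunk per "CPU" */
static size_t chunk_size = 64;  /* ~ mod->percpu_size */

/* True if addr lands inside any CPU's copy of the percpu chunk. */
static bool is_percpu_address(uintptr_t addr)
{
    int cpu;

    for (cpu = 0; cpu < NR_CPUS; cpu++) {
        uintptr_t start = (uintptr_t)bases[cpu];

        if (addr >= start && addr < start + chunk_size)
            return true;
    }
    return false;
}

int main(void)
{
    int cpu;

    for (cpu = 0; cpu < NR_CPUS; cpu++)
        bases[cpu] = malloc(chunk_size);

    printf("inside:  %d\n", is_percpu_address((uintptr_t)bases[2] + 8));
    printf("outside: %d\n", is_percpu_address((uintptr_t)&chunk_size));
    return 0;
}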
@@ -473,10 +521,13 @@ static void module_unload_init(struct module *mod)
473 int cpu; 521 int cpu;
474 522
475 INIT_LIST_HEAD(&mod->modules_which_use_me); 523 INIT_LIST_HEAD(&mod->modules_which_use_me);
476 for_each_possible_cpu(cpu) 524 for_each_possible_cpu(cpu) {
477 local_set(__module_ref_addr(mod, cpu), 0); 525 per_cpu_ptr(mod->refptr, cpu)->incs = 0;
526 per_cpu_ptr(mod->refptr, cpu)->decs = 0;
527 }
528
478 /* Hold reference count during initialization. */ 529 /* Hold reference count during initialization. */
479 local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); 530 __this_cpu_write(mod->refptr->incs, 1);
480 /* Backwards compatibility macros put refcount during init. */ 531 /* Backwards compatibility macros put refcount during init. */
481 mod->waiter = current; 532 mod->waiter = current;
482} 533}
@@ -615,12 +666,28 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
615 666
616unsigned int module_refcount(struct module *mod) 667unsigned int module_refcount(struct module *mod)
617{ 668{
618 unsigned int total = 0; 669 unsigned int incs = 0, decs = 0;
619 int cpu; 670 int cpu;
620 671
621 for_each_possible_cpu(cpu) 672 for_each_possible_cpu(cpu)
622 total += local_read(__module_ref_addr(mod, cpu)); 673 decs += per_cpu_ptr(mod->refptr, cpu)->decs;
623 return total; 674 /*
675 * ensure the incs are added up after the decs.
676 * module_put ensures incs are visible before decs with smp_wmb.
677 *
678 * This 2-count scheme avoids the situation where the refcount
679 * for CPU0 is read, then CPU0 increments the module refcount,
680 * then CPU1 drops that refcount, then the refcount for CPU1 is
681 * read. We would record a decrement but not its corresponding
682 * increment so we would see a low count (disaster).
683 *
684 * Rare situation? But module_refcount can be preempted, and we
685 * might be tallying up 4096+ CPUs. So it is not impossible.
686 */
687 smp_rmb();
688 for_each_possible_cpu(cpu)
689 incs += per_cpu_ptr(mod->refptr, cpu)->incs;
690 return incs - decs;
624} 691}
625EXPORT_SYMBOL(module_refcount); 692EXPORT_SYMBOL(module_refcount);
626 693
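The comment added above is the key to the incs/decs split in module_refcount(): the decrements are summed first and the increments second, with a barrier in between, so an increment on one CPU and its matching decrement on another cannot be split across the two scans and make the total dip below the true count. A condensed user-space sketch of the same two-counter idea using C11 atomics; NCPU, get_ref() and put_ref() are invented names, and the fences only approximate the kernel's smp_wmb()/smp_rmb() pairing.

#include <stdatomic.h>
#include <stdio.h>

#define NCPU 4

static _Atomic unsigned int incs[NCPU];
static _Atomic unsigned int decs[NCPU];

static void get_ref(int cpu)
{
    atomic_fetch_add(&incs[cpu], 1);
}

static void put_ref(int cpu)
{
    /* Make the earlier inc visible before the dec, as module_put() does. */
    atomic_thread_fence(memory_order_release);
    atomic_fetch_add(&decs[cpu], 1);
}

static unsigned int refcount(void)
{
    unsigned int i = 0, d = 0;
    int cpu;

    /* Sum the decs first... */
    for (cpu = 0; cpu < NCPU; cpu++)
        d += atomic_load(&decs[cpu]);

    /* ...then the incs, so no inc/dec pair is split across the scans. */
    atomic_thread_fence(memory_order_acquire);
    for (cpu = 0; cpu < NCPU; cpu++)
        i += atomic_load(&incs[cpu]);

    return i - d;
}

int main(void)
{
    get_ref(0);
    get_ref(1);
    put_ref(3);     /* reference dropped on a different CPU than it was taken */

    printf("refcount = %u\n", refcount());
    return 0;
}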
@@ -796,14 +863,16 @@ static struct module_attribute refcnt = {
796void module_put(struct module *module) 863void module_put(struct module *module)
797{ 864{
798 if (module) { 865 if (module) {
799 unsigned int cpu = get_cpu(); 866 preempt_disable();
800 local_dec(__module_ref_addr(module, cpu)); 867 smp_wmb(); /* see comment in module_refcount */
868 __this_cpu_inc(module->refptr->decs);
869
801 trace_module_put(module, _RET_IP_, 870 trace_module_put(module, _RET_IP_,
802 local_read(__module_ref_addr(module, cpu))); 871 __this_cpu_read(module->refptr->decs));
803 /* Maybe they're waiting for us to drop reference? */ 872 /* Maybe they're waiting for us to drop reference? */
804 if (unlikely(!module_is_live(module))) 873 if (unlikely(!module_is_live(module)))
805 wake_up_process(module->waiter); 874 wake_up_process(module->waiter);
806 put_cpu(); 875 preempt_enable();
807 } 876 }
808} 877}
809EXPORT_SYMBOL(module_put); 878EXPORT_SYMBOL(module_put);
@@ -1083,6 +1152,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1083 if (sattr->name == NULL) 1152 if (sattr->name == NULL)
1084 goto out; 1153 goto out;
1085 sect_attrs->nsections++; 1154 sect_attrs->nsections++;
1155 sysfs_attr_init(&sattr->mattr.attr);
1086 sattr->mattr.show = module_sect_show; 1156 sattr->mattr.show = module_sect_show;
1087 sattr->mattr.store = NULL; 1157 sattr->mattr.store = NULL;
1088 sattr->mattr.attr.name = sattr->name; 1158 sattr->mattr.attr.name = sattr->name;
@@ -1178,6 +1248,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1178 if (sect_empty(&sechdrs[i])) 1248 if (sect_empty(&sechdrs[i]))
1179 continue; 1249 continue;
1180 if (sechdrs[i].sh_type == SHT_NOTE) { 1250 if (sechdrs[i].sh_type == SHT_NOTE) {
1251 sysfs_bin_attr_init(nattr);
1181 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1252 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
1182 nattr->attr.mode = S_IRUGO; 1253 nattr->attr.mode = S_IRUGO;
1183 nattr->size = sechdrs[i].sh_size; 1254 nattr->size = sechdrs[i].sh_size;
@@ -1250,6 +1321,7 @@ int module_add_modinfo_attrs(struct module *mod)
1250 if (!attr->test || 1321 if (!attr->test ||
1251 (attr->test && attr->test(mod))) { 1322 (attr->test && attr->test(mod))) {
1252 memcpy(temp_attr, attr, sizeof(*temp_attr)); 1323 memcpy(temp_attr, attr, sizeof(*temp_attr));
1324 sysfs_attr_init(&temp_attr->attr);
1253 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); 1325 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
1254 ++temp_attr; 1326 ++temp_attr;
1255 } 1327 }
@@ -1395,11 +1467,10 @@ static void free_module(struct module *mod)
1395 /* This may be NULL, but that's OK */ 1467 /* This may be NULL, but that's OK */
1396 module_free(mod, mod->module_init); 1468 module_free(mod, mod->module_init);
1397 kfree(mod->args); 1469 kfree(mod->args);
1398 if (mod->percpu) 1470 percpu_modfree(mod);
1399 percpu_modfree(mod->percpu); 1471#if defined(CONFIG_MODULE_UNLOAD)
1400#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
1401 if (mod->refptr) 1472 if (mod->refptr)
1402 percpu_modfree(mod->refptr); 1473 free_percpu(mod->refptr);
1403#endif 1474#endif
1404 /* Free lock-classes: */ 1475 /* Free lock-classes: */
1405 lockdep_free_key_range(mod->module_core, mod->core_size); 1476 lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -1515,7 +1586,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1515 default: 1586 default:
1516 /* Divert to percpu allocation if a percpu var. */ 1587 /* Divert to percpu allocation if a percpu var. */
1517 if (sym[i].st_shndx == pcpuindex) 1588 if (sym[i].st_shndx == pcpuindex)
1518 secbase = (unsigned long)mod->percpu; 1589 secbase = (unsigned long)mod_percpu(mod);
1519 else 1590 else
1520 secbase = sechdrs[sym[i].st_shndx].sh_addr; 1591 secbase = sechdrs[sym[i].st_shndx].sh_addr;
1521 sym[i].st_value += secbase; 1592 sym[i].st_value += secbase;
@@ -1949,7 +2020,7 @@ static noinline struct module *load_module(void __user *umod,
1949 unsigned int modindex, versindex, infoindex, pcpuindex; 2020 unsigned int modindex, versindex, infoindex, pcpuindex;
1950 struct module *mod; 2021 struct module *mod;
1951 long err = 0; 2022 long err = 0;
1952 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 2023 void *ptr = NULL; /* Stops spurious gcc warning */
1953 unsigned long symoffs, stroffs, *strmap; 2024 unsigned long symoffs, stroffs, *strmap;
1954 2025
1955 mm_segment_t old_fs; 2026 mm_segment_t old_fs;
@@ -2089,15 +2160,11 @@ static noinline struct module *load_module(void __user *umod,
2089 2160
2090 if (pcpuindex) { 2161 if (pcpuindex) {
2091 /* We have a special allocation for this section. */ 2162 /* We have a special allocation for this section. */
2092 percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, 2163 err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size,
2093 sechdrs[pcpuindex].sh_addralign, 2164 sechdrs[pcpuindex].sh_addralign);
2094 mod->name); 2165 if (err)
2095 if (!percpu) {
2096 err = -ENOMEM;
2097 goto free_mod; 2166 goto free_mod;
2098 }
2099 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2167 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2100 mod->percpu = percpu;
2101 } 2168 }
2102 2169
2103 /* Determine total sizes, and put offsets in sh_entsize. For now 2170 /* Determine total sizes, and put offsets in sh_entsize. For now
@@ -2162,9 +2229,8 @@ static noinline struct module *load_module(void __user *umod,
2162 mod = (void *)sechdrs[modindex].sh_addr; 2229 mod = (void *)sechdrs[modindex].sh_addr;
2163 kmemleak_load_module(mod, hdr, sechdrs, secstrings); 2230 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2164 2231
2165#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2232#if defined(CONFIG_MODULE_UNLOAD)
2166 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), 2233 mod->refptr = alloc_percpu(struct module_ref);
2167 mod->name);
2168 if (!mod->refptr) { 2234 if (!mod->refptr) {
2169 err = -ENOMEM; 2235 err = -ENOMEM;
2170 goto free_init; 2236 goto free_init;
@@ -2313,7 +2379,7 @@ static noinline struct module *load_module(void __user *umod,
2313 sort_extable(mod->extable, mod->extable + mod->num_exentries); 2379 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2314 2380
2315 /* Finally, copy percpu area over. */ 2381 /* Finally, copy percpu area over. */
2316 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 2382 percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
2317 sechdrs[pcpuindex].sh_size); 2383 sechdrs[pcpuindex].sh_size);
2318 2384
2319 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, 2385 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
@@ -2396,8 +2462,8 @@ static noinline struct module *load_module(void __user *umod,
2396 kobject_put(&mod->mkobj.kobj); 2462 kobject_put(&mod->mkobj.kobj);
2397 free_unload: 2463 free_unload:
2398 module_unload_free(mod); 2464 module_unload_free(mod);
2399#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2465#if defined(CONFIG_MODULE_UNLOAD)
2400 percpu_modfree(mod->refptr); 2466 free_percpu(mod->refptr);
2401 free_init: 2467 free_init:
2402#endif 2468#endif
2403 module_free(mod, mod->module_init); 2469 module_free(mod, mod->module_init);
@@ -2405,8 +2471,7 @@ static noinline struct module *load_module(void __user *umod,
2405 module_free(mod, mod->module_core); 2471 module_free(mod, mod->module_core);
2406 /* mod will be freed with core. Don't access it beyond this line! */ 2472 /* mod will be freed with core. Don't access it beyond this line! */
2407 free_percpu: 2473 free_percpu:
2408 if (percpu) 2474 percpu_modfree(mod);
2409 percpu_modfree(percpu);
2410 free_mod: 2475 free_mod:
2411 kfree(args); 2476 kfree(args);
2412 kfree(strmap); 2477 kfree(strmap);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index acd24e7643eb..2488ba7eb568 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -78,10 +78,10 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
78 int ret = NOTIFY_DONE; 78 int ret = NOTIFY_DONE;
79 struct notifier_block *nb, *next_nb; 79 struct notifier_block *nb, *next_nb;
80 80
81 nb = rcu_dereference(*nl); 81 nb = rcu_dereference_raw(*nl);
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference_raw(nb->next);
85 85
86#ifdef CONFIG_DEBUG_NOTIFIERS 86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { 87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
@@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
309 * racy then it does not matter what the result of the test 309 * racy then it does not matter what the result of the test
310 * is, we re-check the list after having taken the lock anyway: 310 * is, we re-check the list after having taken the lock anyway:
311 */ 311 */
312 if (rcu_dereference(nh->head)) { 312 if (rcu_dereference_raw(nh->head)) {
313 down_read(&nh->rwsem); 313 down_read(&nh->rwsem);
314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, 314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
315 nr_calls); 315 nr_calls);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 09b4ff9711b2..f74e6c00e26d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -13,6 +13,7 @@
13 * Pavel Emelianov <xemul@openvz.org> 13 * Pavel Emelianov <xemul@openvz.org>
14 */ 14 */
15 15
16#include <linux/slab.h>
16#include <linux/module.h> 17#include <linux/module.h>
17#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
18#include <linux/init_task.h> 19#include <linux/init_task.h>
@@ -24,7 +25,18 @@
24 25
25static struct kmem_cache *nsproxy_cachep; 26static struct kmem_cache *nsproxy_cachep;
26 27
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 28struct nsproxy init_nsproxy = {
29 .count = ATOMIC_INIT(1),
30 .uts_ns = &init_uts_ns,
31#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
32 .ipc_ns = &init_ipc_ns,
33#endif
34 .mnt_ns = NULL,
35 .pid_ns = &init_pid_ns,
36#ifdef CONFIG_NET
37 .net_ns = &init_net,
38#endif
39};
28 40
29static inline struct nsproxy *create_nsproxy(void) 41static inline struct nsproxy *create_nsproxy(void)
30{ 42{
diff --git a/kernel/padata.c b/kernel/padata.c
new file mode 100644
index 000000000000..fd03513c7327
--- /dev/null
+++ b/kernel/padata.c
@@ -0,0 +1,697 @@
1/*
2 * padata.c - generic interface to process data streams in parallel
3 *
4 * Copyright (C) 2008, 2009 secunet Security Networks AG
5 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 */
20
21#include <linux/module.h>
22#include <linux/cpumask.h>
23#include <linux/err.h>
24#include <linux/cpu.h>
25#include <linux/padata.h>
26#include <linux/mutex.h>
27#include <linux/sched.h>
28#include <linux/slab.h>
29#include <linux/rcupdate.h>
30
31#define MAX_SEQ_NR INT_MAX - NR_CPUS
32#define MAX_OBJ_NUM 10000 * NR_CPUS
33
34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
35{
36 int cpu, target_cpu;
37
38 target_cpu = cpumask_first(pd->cpumask);
39 for (cpu = 0; cpu < cpu_index; cpu++)
40 target_cpu = cpumask_next(target_cpu, pd->cpumask);
41
42 return target_cpu;
43}
44
45static int padata_cpu_hash(struct padata_priv *padata)
46{
47 int cpu_index;
48 struct parallel_data *pd;
49
50 pd = padata->pd;
51
52 /*
53 * Hash the sequence numbers to the cpus by taking
54 * seq_nr mod. number of cpus in use.
55 */
56 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask);
57
58 return padata_index_to_cpu(pd, cpu_index);
59}
60
61static void padata_parallel_worker(struct work_struct *work)
62{
63 struct padata_queue *queue;
64 struct parallel_data *pd;
65 struct padata_instance *pinst;
66 LIST_HEAD(local_list);
67
68 local_bh_disable();
69 queue = container_of(work, struct padata_queue, pwork);
70 pd = queue->pd;
71 pinst = pd->pinst;
72
73 spin_lock(&queue->parallel.lock);
74 list_replace_init(&queue->parallel.list, &local_list);
75 spin_unlock(&queue->parallel.lock);
76
77 while (!list_empty(&local_list)) {
78 struct padata_priv *padata;
79
80 padata = list_entry(local_list.next,
81 struct padata_priv, list);
82
83 list_del_init(&padata->list);
84
85 padata->parallel(padata);
86 }
87
88 local_bh_enable();
89}
90
91/*
92 * padata_do_parallel - padata parallelization function
93 *
94 * @pinst: padata instance
95 * @padata: object to be parallelized
96 * @cb_cpu: cpu the serialization callback function will run on,
97 * must be in the cpumask of padata.
98 *
99 * The parallelization callback function will run with BHs off.
100 * Note: Every object which is parallelized by padata_do_parallel
101 * must be seen by padata_do_serial.
102 */
103int padata_do_parallel(struct padata_instance *pinst,
104 struct padata_priv *padata, int cb_cpu)
105{
106 int target_cpu, err;
107 struct padata_queue *queue;
108 struct parallel_data *pd;
109
110 rcu_read_lock_bh();
111
112 pd = rcu_dereference(pinst->pd);
113
114 err = 0;
115 if (!(pinst->flags & PADATA_INIT))
116 goto out;
117
118 err = -EBUSY;
119 if ((pinst->flags & PADATA_RESET))
120 goto out;
121
122 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
123 goto out;
124
125 err = -EINVAL;
126 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
127 goto out;
128
129 err = -EINPROGRESS;
130 atomic_inc(&pd->refcnt);
131 padata->pd = pd;
132 padata->cb_cpu = cb_cpu;
133
134 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))
135 atomic_set(&pd->seq_nr, -1);
136
137 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
138
139 target_cpu = padata_cpu_hash(padata);
140 queue = per_cpu_ptr(pd->queue, target_cpu);
141
142 spin_lock(&queue->parallel.lock);
143 list_add_tail(&padata->list, &queue->parallel.list);
144 spin_unlock(&queue->parallel.lock);
145
146 queue_work_on(target_cpu, pinst->wq, &queue->pwork);
147
148out:
149 rcu_read_unlock_bh();
150
151 return err;
152}
153EXPORT_SYMBOL(padata_do_parallel);
154
155static struct padata_priv *padata_get_next(struct parallel_data *pd)
156{
157 int cpu, num_cpus, empty, calc_seq_nr;
158 int seq_nr, next_nr, overrun, next_overrun;
159 struct padata_queue *queue, *next_queue;
160 struct padata_priv *padata;
161 struct padata_list *reorder;
162
163 empty = 0;
164 next_nr = -1;
165 next_overrun = 0;
166 next_queue = NULL;
167
168 num_cpus = cpumask_weight(pd->cpumask);
169
170 for_each_cpu(cpu, pd->cpumask) {
171 queue = per_cpu_ptr(pd->queue, cpu);
172 reorder = &queue->reorder;
173
174 /*
175 * Calculate the seq_nr of the object that should be
176 * next in this queue.
177 */
178 overrun = 0;
179 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
180 + queue->cpu_index;
181
182 if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
183 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
184 overrun = 1;
185 }
186
187 if (!list_empty(&reorder->list)) {
188 padata = list_entry(reorder->list.next,
189 struct padata_priv, list);
190
191 seq_nr = padata->seq_nr;
192 BUG_ON(calc_seq_nr != seq_nr);
193 } else {
194 seq_nr = calc_seq_nr;
195 empty++;
196 }
197
198 if (next_nr < 0 || seq_nr < next_nr
199 || (next_overrun && !overrun)) {
200 next_nr = seq_nr;
201 next_overrun = overrun;
202 next_queue = queue;
203 }
204 }
205
206 padata = NULL;
207
208 if (empty == num_cpus)
209 goto out;
210
211 reorder = &next_queue->reorder;
212
213 if (!list_empty(&reorder->list)) {
214 padata = list_entry(reorder->list.next,
215 struct padata_priv, list);
216
217 if (unlikely(next_overrun)) {
218 for_each_cpu(cpu, pd->cpumask) {
219 queue = per_cpu_ptr(pd->queue, cpu);
220 atomic_set(&queue->num_obj, 0);
221 }
222 }
223
224 spin_lock(&reorder->lock);
225 list_del_init(&padata->list);
226 atomic_dec(&pd->reorder_objects);
227 spin_unlock(&reorder->lock);
228
229 atomic_inc(&next_queue->num_obj);
230
231 goto out;
232 }
233
234 if (next_nr % num_cpus == next_queue->cpu_index) {
235 padata = ERR_PTR(-ENODATA);
236 goto out;
237 }
238
239 padata = ERR_PTR(-EINPROGRESS);
240out:
241 return padata;
242}
243
244static void padata_reorder(struct parallel_data *pd)
245{
246 struct padata_priv *padata;
247 struct padata_queue *queue;
248 struct padata_instance *pinst = pd->pinst;
249
250try_again:
251 if (!spin_trylock_bh(&pd->lock))
252 goto out;
253
254 while (1) {
255 padata = padata_get_next(pd);
256
257 if (!padata || PTR_ERR(padata) == -EINPROGRESS)
258 break;
259
260 if (PTR_ERR(padata) == -ENODATA) {
261 spin_unlock_bh(&pd->lock);
262 goto out;
263 }
264
265 queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
266
267 spin_lock(&queue->serial.lock);
268 list_add_tail(&padata->list, &queue->serial.list);
269 spin_unlock(&queue->serial.lock);
270
271 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork);
272 }
273
274 spin_unlock_bh(&pd->lock);
275
276 if (atomic_read(&pd->reorder_objects))
277 goto try_again;
278
279out:
280 return;
281}
282
283static void padata_serial_worker(struct work_struct *work)
284{
285 struct padata_queue *queue;
286 struct parallel_data *pd;
287 LIST_HEAD(local_list);
288
289 local_bh_disable();
290 queue = container_of(work, struct padata_queue, swork);
291 pd = queue->pd;
292
293 spin_lock(&queue->serial.lock);
294 list_replace_init(&queue->serial.list, &local_list);
295 spin_unlock(&queue->serial.lock);
296
297 while (!list_empty(&local_list)) {
298 struct padata_priv *padata;
299
300 padata = list_entry(local_list.next,
301 struct padata_priv, list);
302
303 list_del_init(&padata->list);
304
305 padata->serial(padata);
306 atomic_dec(&pd->refcnt);
307 }
308 local_bh_enable();
309}
310
311/*
312 * padata_do_serial - padata serialization function
313 *
314 * @padata: object to be serialized.
315 *
316 * padata_do_serial must be called for every parallelized object.
317 * The serialization callback function will run with BHs off.
318 */
319void padata_do_serial(struct padata_priv *padata)
320{
321 int cpu;
322 struct padata_queue *queue;
323 struct parallel_data *pd;
324
325 pd = padata->pd;
326
327 cpu = get_cpu();
328 queue = per_cpu_ptr(pd->queue, cpu);
329
330 spin_lock(&queue->reorder.lock);
331 atomic_inc(&pd->reorder_objects);
332 list_add_tail(&padata->list, &queue->reorder.list);
333 spin_unlock(&queue->reorder.lock);
334
335 put_cpu();
336
337 padata_reorder(pd);
338}
339EXPORT_SYMBOL(padata_do_serial);
340
341static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
342 const struct cpumask *cpumask)
343{
344 int cpu, cpu_index, num_cpus;
345 struct padata_queue *queue;
346 struct parallel_data *pd;
347
348 cpu_index = 0;
349
350 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
351 if (!pd)
352 goto err;
353
354 pd->queue = alloc_percpu(struct padata_queue);
355 if (!pd->queue)
356 goto err_free_pd;
357
358 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
359 goto err_free_queue;
360
361 for_each_possible_cpu(cpu) {
362 queue = per_cpu_ptr(pd->queue, cpu);
363
364 queue->pd = pd;
365
366 if (cpumask_test_cpu(cpu, cpumask)
367 && cpumask_test_cpu(cpu, cpu_active_mask)) {
368 queue->cpu_index = cpu_index;
369 cpu_index++;
370 } else
371 queue->cpu_index = -1;
372
373 INIT_LIST_HEAD(&queue->reorder.list);
374 INIT_LIST_HEAD(&queue->parallel.list);
375 INIT_LIST_HEAD(&queue->serial.list);
376 spin_lock_init(&queue->reorder.lock);
377 spin_lock_init(&queue->parallel.lock);
378 spin_lock_init(&queue->serial.lock);
379
380 INIT_WORK(&queue->pwork, padata_parallel_worker);
381 INIT_WORK(&queue->swork, padata_serial_worker);
382 atomic_set(&queue->num_obj, 0);
383 }
384
385 cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
386
387 num_cpus = cpumask_weight(pd->cpumask);
388 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
389
390 atomic_set(&pd->seq_nr, -1);
391 atomic_set(&pd->reorder_objects, 0);
392 atomic_set(&pd->refcnt, 0);
393 pd->pinst = pinst;
394 spin_lock_init(&pd->lock);
395
396 return pd;
397
398err_free_queue:
399 free_percpu(pd->queue);
400err_free_pd:
401 kfree(pd);
402err:
403 return NULL;
404}
405
406static void padata_free_pd(struct parallel_data *pd)
407{
408 free_cpumask_var(pd->cpumask);
409 free_percpu(pd->queue);
410 kfree(pd);
411}
412
413static void padata_replace(struct padata_instance *pinst,
414 struct parallel_data *pd_new)
415{
416 struct parallel_data *pd_old = pinst->pd;
417
418 pinst->flags |= PADATA_RESET;
419
420 rcu_assign_pointer(pinst->pd, pd_new);
421
422 synchronize_rcu();
423
424 while (atomic_read(&pd_old->refcnt) != 0)
425 yield();
426
427 flush_workqueue(pinst->wq);
428
429 padata_free_pd(pd_old);
430
431 pinst->flags &= ~PADATA_RESET;
432}
433
434/*
435 * padata_set_cpumask - set the cpumask that padata should use
436 *
437 * @pinst: padata instance
438 * @cpumask: the cpumask to use
439 */
440int padata_set_cpumask(struct padata_instance *pinst,
441 cpumask_var_t cpumask)
442{
443 struct parallel_data *pd;
444 int err = 0;
445
446 might_sleep();
447
448 mutex_lock(&pinst->lock);
449
450 pd = padata_alloc_pd(pinst, cpumask);
451 if (!pd) {
452 err = -ENOMEM;
453 goto out;
454 }
455
456 cpumask_copy(pinst->cpumask, cpumask);
457
458 padata_replace(pinst, pd);
459
460out:
461 mutex_unlock(&pinst->lock);
462
463 return err;
464}
465EXPORT_SYMBOL(padata_set_cpumask);
466
467static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
468{
469 struct parallel_data *pd;
470
471 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
472 pd = padata_alloc_pd(pinst, pinst->cpumask);
473 if (!pd)
474 return -ENOMEM;
475
476 padata_replace(pinst, pd);
477 }
478
479 return 0;
480}
481
482/*
483 * padata_add_cpu - add a cpu to the padata cpumask
484 *
485 * @pinst: padata instance
486 * @cpu: cpu to add
487 */
488int padata_add_cpu(struct padata_instance *pinst, int cpu)
489{
490 int err;
491
492 might_sleep();
493
494 mutex_lock(&pinst->lock);
495
496 cpumask_set_cpu(cpu, pinst->cpumask);
497 err = __padata_add_cpu(pinst, cpu);
498
499 mutex_unlock(&pinst->lock);
500
501 return err;
502}
503EXPORT_SYMBOL(padata_add_cpu);
504
505static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
506{
507 struct parallel_data *pd;
508
509 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
510 pd = padata_alloc_pd(pinst, pinst->cpumask);
511 if (!pd)
512 return -ENOMEM;
513
514 padata_replace(pinst, pd);
515 }
516
517 return 0;
518}
519
520/*
521 * padata_remove_cpu - remove a cpu from the padata cpumask
522 *
523 * @pinst: padata instance
524 * @cpu: cpu to remove
525 */
526int padata_remove_cpu(struct padata_instance *pinst, int cpu)
527{
528 int err;
529
530 might_sleep();
531
532 mutex_lock(&pinst->lock);
533
534 cpumask_clear_cpu(cpu, pinst->cpumask);
535 err = __padata_remove_cpu(pinst, cpu);
536
537 mutex_unlock(&pinst->lock);
538
539 return err;
540}
541EXPORT_SYMBOL(padata_remove_cpu);
542
543/*
544 * padata_start - start the parallel processing
545 *
546 * @pinst: padata instance to start
547 */
548void padata_start(struct padata_instance *pinst)
549{
550 might_sleep();
551
552 mutex_lock(&pinst->lock);
553 pinst->flags |= PADATA_INIT;
554 mutex_unlock(&pinst->lock);
555}
556EXPORT_SYMBOL(padata_start);
557
558/*
559 * padata_stop - stop the parallel processing
560 *
561 * @pinst: padata instance to stop
562 */
563void padata_stop(struct padata_instance *pinst)
564{
565 might_sleep();
566
567 mutex_lock(&pinst->lock);
568 pinst->flags &= ~PADATA_INIT;
569 mutex_unlock(&pinst->lock);
570}
571EXPORT_SYMBOL(padata_stop);
572
573static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
574 unsigned long action, void *hcpu)
575{
576 int err;
577 struct padata_instance *pinst;
578 int cpu = (unsigned long)hcpu;
579
580 pinst = container_of(nfb, struct padata_instance, cpu_notifier);
581
582 switch (action) {
583 case CPU_ONLINE:
584 case CPU_ONLINE_FROZEN:
585 if (!cpumask_test_cpu(cpu, pinst->cpumask))
586 break;
587 mutex_lock(&pinst->lock);
588 err = __padata_add_cpu(pinst, cpu);
589 mutex_unlock(&pinst->lock);
590 if (err)
591 return NOTIFY_BAD;
592 break;
593
594 case CPU_DOWN_PREPARE:
595 case CPU_DOWN_PREPARE_FROZEN:
596 if (!cpumask_test_cpu(cpu, pinst->cpumask))
597 break;
598 mutex_lock(&pinst->lock);
599 err = __padata_remove_cpu(pinst, cpu);
600 mutex_unlock(&pinst->lock);
601 if (err)
602 return NOTIFY_BAD;
603 break;
604
605 case CPU_UP_CANCELED:
606 case CPU_UP_CANCELED_FROZEN:
607 if (!cpumask_test_cpu(cpu, pinst->cpumask))
608 break;
609 mutex_lock(&pinst->lock);
610 __padata_remove_cpu(pinst, cpu);
611 mutex_unlock(&pinst->lock);
612
613 case CPU_DOWN_FAILED:
614 case CPU_DOWN_FAILED_FROZEN:
615 if (!cpumask_test_cpu(cpu, pinst->cpumask))
616 break;
617 mutex_lock(&pinst->lock);
618 __padata_add_cpu(pinst, cpu);
619 mutex_unlock(&pinst->lock);
620 }
621
622 return NOTIFY_OK;
623}
624
625/*
626 * padata_alloc - allocate and initialize a padata instance
627 *
628 * @cpumask: cpumask that padata uses for parallelization
629 * @wq: workqueue to use for the allocated padata instance
630 */
631struct padata_instance *padata_alloc(const struct cpumask *cpumask,
632 struct workqueue_struct *wq)
633{
634 int err;
635 struct padata_instance *pinst;
636 struct parallel_data *pd;
637
638 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
639 if (!pinst)
640 goto err;
641
642 pd = padata_alloc_pd(pinst, cpumask);
643 if (!pd)
644 goto err_free_inst;
645
646 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
647 goto err_free_pd;
648
649 rcu_assign_pointer(pinst->pd, pd);
650
651 pinst->wq = wq;
652
653 cpumask_copy(pinst->cpumask, cpumask);
654
655 pinst->flags = 0;
656
657 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
658 pinst->cpu_notifier.priority = 0;
659 err = register_hotcpu_notifier(&pinst->cpu_notifier);
660 if (err)
661 goto err_free_cpumask;
662
663 mutex_init(&pinst->lock);
664
665 return pinst;
666
667err_free_cpumask:
668 free_cpumask_var(pinst->cpumask);
669err_free_pd:
670 padata_free_pd(pd);
671err_free_inst:
672 kfree(pinst);
673err:
674 return NULL;
675}
676EXPORT_SYMBOL(padata_alloc);
677
678/*
679 * padata_free - free a padata instance
680 *
 681 * @pinst: padata instance to free
682 */
683void padata_free(struct padata_instance *pinst)
684{
685 padata_stop(pinst);
686
687 synchronize_rcu();
688
689 while (atomic_read(&pinst->pd->refcnt) != 0)
690 yield();
691
692 unregister_hotcpu_notifier(&pinst->cpu_notifier);
693 padata_free_pd(pinst->pd);
694 free_cpumask_var(pinst->cpumask);
695 kfree(pinst);
696}
697EXPORT_SYMBOL(padata_free);
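
The exported padata entry points above (padata_alloc, padata_start, padata_do_parallel, padata_do_serial) are meant to be driven by a client such as crypto/pcrypt. The sketch below shows the calling pattern implied by the interface; the job structure, workqueue name and callbacks are hypothetical and error handling is kept minimal. Note that -EINPROGRESS from padata_do_parallel() is the success case (the object was queued), and that every queued object must eventually be handed to padata_do_serial(), here from the parallel callback once the work is finished.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/cpumask.h>
#include <linux/workqueue.h>
#include <linux/padata.h>

struct my_job {
	struct padata_priv padata;	/* must be embedded in the object */
	int input;
	int result;
};

static struct padata_instance *pinst;
static struct workqueue_struct *my_wq;

static void my_parallel(struct padata_priv *padata)
{
	struct my_job *job = container_of(padata, struct my_job, padata);

	job->result = job->input * 2;	/* the expensive part, runs with BHs off */
	padata_do_serial(padata);	/* hand back for in-order completion */
}

static void my_serial(struct padata_priv *padata)
{
	struct my_job *job = container_of(padata, struct my_job, padata);

	pr_info("job %d done: %d\n", job->input, job->result);
	kfree(job);
}

static int my_submit(int value)
{
	struct my_job *job;
	int cb_cpu, err;

	job = kzalloc(sizeof(*job), GFP_KERNEL);
	if (!job)
		return -ENOMEM;

	job->input = value;
	job->padata.parallel = my_parallel;
	job->padata.serial = my_serial;

	/* cb_cpu must be in the instance's cpumask. */
	cb_cpu = cpumask_first(cpu_online_mask);
	err = padata_do_parallel(pinst, &job->padata, cb_cpu);
	if (err != -EINPROGRESS) {	/* -EINPROGRESS means "queued" */
		kfree(job);
		return err;
	}
	return 0;
}

static int __init my_padata_init(void)
{
	my_wq = create_workqueue("my_padata");
	if (!my_wq)
		return -ENOMEM;

	pinst = padata_alloc(cpu_possible_mask, my_wq);
	if (!pinst) {
		destroy_workqueue(my_wq);
		return -ENOMEM;
	}

	padata_start(pinst);
	return my_submit(21);
}

static void __exit my_padata_exit(void)
{
	padata_free(pinst);	/* stops, drains and frees the instance */
	destroy_workqueue(my_wq);
}

module_init(my_padata_init);
module_exit(my_padata_exit);
MODULE_LICENSE("GPL");

Because each object is assigned a sequence number in padata_do_parallel() and the reorder logic above replays them in that order, my_serial() sees completions in submission order even though my_parallel() calls run concurrently on different CPUs.
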
diff --git a/kernel/panic.c b/kernel/panic.c
index c787333282b8..13d966b4c14a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -36,15 +36,36 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
36 36
37EXPORT_SYMBOL(panic_notifier_list); 37EXPORT_SYMBOL(panic_notifier_list);
38 38
39static long no_blink(long time)
40{
41 return 0;
42}
43
44/* Returns how long it waited in ms */ 39/* Returns how long it waited in ms */
45long (*panic_blink)(long time); 40long (*panic_blink)(long time);
46EXPORT_SYMBOL(panic_blink); 41EXPORT_SYMBOL(panic_blink);
47 42
43static void panic_blink_one_second(void)
44{
45 static long i = 0, end;
46
47 if (panic_blink) {
48 end = i + MSEC_PER_SEC;
49
50 while (i < end) {
51 i += panic_blink(i);
52 mdelay(1);
53 i++;
54 }
55 } else {
56 /*
57 * When running under a hypervisor a small mdelay may get
58 * rounded up to the hypervisor timeslice. For example, with
59 * a 1ms in 10ms hypervisor timeslice we might inflate a
60 * mdelay(1) loop by 10x.
61 *
62 * If we have nothing to blink, spin on 1 second calls to
63 * mdelay to avoid this.
64 */
65 mdelay(MSEC_PER_SEC);
66 }
67}
68
48/** 69/**
49 * panic - halt the system 70 * panic - halt the system
50 * @fmt: The text string to print 71 * @fmt: The text string to print
@@ -95,9 +116,6 @@ NORET_TYPE void panic(const char * fmt, ...)
95 116
96 bust_spinlocks(0); 117 bust_spinlocks(0);
97 118
98 if (!panic_blink)
99 panic_blink = no_blink;
100
101 if (panic_timeout > 0) { 119 if (panic_timeout > 0) {
102 /* 120 /*
103 * Delay timeout seconds before rebooting the machine. 121 * Delay timeout seconds before rebooting the machine.
@@ -105,11 +123,9 @@ NORET_TYPE void panic(const char * fmt, ...)
105 */ 123 */
106 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); 124 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
107 125
108 for (i = 0; i < panic_timeout*1000; ) { 126 for (i = 0; i < panic_timeout; i++) {
109 touch_nmi_watchdog(); 127 touch_nmi_watchdog();
110 i += panic_blink(i); 128 panic_blink_one_second();
111 mdelay(1);
112 i++;
113 } 129 }
114 /* 130 /*
115 * This will not be a clean reboot, with everything 131 * This will not be a clean reboot, with everything
@@ -135,11 +151,9 @@ NORET_TYPE void panic(const char * fmt, ...)
135 } 151 }
136#endif 152#endif
137 local_irq_enable(); 153 local_irq_enable();
138 for (i = 0; ; ) { 154 while (1) {
139 touch_softlockup_watchdog(); 155 touch_softlockup_watchdog();
140 i += panic_blink(i); 156 panic_blink_one_second();
141 mdelay(1);
142 i++;
143 } 157 }
144} 158}
145 159
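
The new comment makes the cost concrete: with the old per-millisecond loop a panic_timeout of, say, 30 seconds meant 30 * 1000 calls to mdelay(1); if a hypervisor rounds each of those up to a 10 ms timeslice, the nominal 30 s delay stretches to roughly 300 s. Spinning in whole-second mdelay(MSEC_PER_SEC) chunks when there is no blink callback keeps the wait close to the configured 30 s, while panic_blink_one_second() preserves the old 1 ms blink cadence for machines that do have an LED to drive.
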
diff --git a/kernel/params.c b/kernel/params.c
index cf1b69183127..0b30ecd53a52 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,7 +24,6 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27#include <linux/string.h>
28 27
29#if 0 28#if 0
30#define DEBUGP printk 29#define DEBUGP printk
@@ -402,8 +401,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
402} 401}
403 402
404/* sysfs output in /sys/modules/XYZ/parameters/ */ 403/* sysfs output in /sys/modules/XYZ/parameters/ */
405#define to_module_attr(n) container_of(n, struct module_attribute, attr); 404#define to_module_attr(n) container_of(n, struct module_attribute, attr)
406#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); 405#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
407 406
408extern struct kernel_param __start___param[], __stop___param[]; 407extern struct kernel_param __start___param[], __stop___param[];
409 408
@@ -421,7 +420,7 @@ struct module_param_attrs
421}; 420};
422 421
423#ifdef CONFIG_SYSFS 422#ifdef CONFIG_SYSFS
424#define to_param_attr(n) container_of(n, struct param_attribute, mattr); 423#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
425 424
426static ssize_t param_attr_show(struct module_attribute *mattr, 425static ssize_t param_attr_show(struct module_attribute *mattr,
427 struct module *mod, char *buf) 426 struct module *mod, char *buf)
@@ -517,6 +516,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
517 new->grp.attrs = attrs; 516 new->grp.attrs = attrs;
518 517
519 /* Tack new one on the end. */ 518 /* Tack new one on the end. */
519 sysfs_attr_init(&new->attrs[num].mattr.attr);
520 new->attrs[num].param = kp; 520 new->attrs[num].param = kp;
521 new->attrs[num].mattr.show = param_attr_show; 521 new->attrs[num].mattr.show = param_attr_show;
522 new->attrs[num].mattr.store = param_attr_store; 522 new->attrs[num].mattr.store = param_attr_store;
@@ -723,7 +723,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
723 return ret; 723 return ret;
724} 724}
725 725
726static struct sysfs_ops module_sysfs_ops = { 726static const struct sysfs_ops module_sysfs_ops = {
727 .show = module_attr_show, 727 .show = module_attr_show,
728 .store = module_attr_store, 728 .store = module_attr_store,
729}; 729};
@@ -737,7 +737,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj)
737 return 0; 737 return 0;
738} 738}
739 739
740static struct kset_uevent_ops module_uevent_ops = { 740static const struct kset_uevent_ops module_uevent_ops = {
741 .filter = uevent_filter, 741 .filter = uevent_filter,
742}; 742};
743 743
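
Dropping the trailing semicolons from to_module_attr(), to_module_kobject() and to_param_attr() fixes a classic macro pitfall: the stray ';' is invisible while the macro is only ever the entire right-hand side of an assignment, but it breaks compilation as soon as the macro is used inside a larger expression. A generic illustration in plain C, with hypothetical structures rather than the params.c ones:

#include <stddef.h>

struct attr_demo { int x; };
struct holder    { int id; struct attr_demo attr; };

#define demo_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Broken: the trailing ';' becomes part of every expansion. */
#define to_holder_bad(n)	demo_container_of(n, struct holder, attr);
/* Fixed, as in the params.c change: no trailing ';'. */
#define to_holder_good(n)	demo_container_of(n, struct holder, attr)

int holder_id(struct attr_demo *a)
{
	/* return to_holder_bad(a)->id;  would not compile: "...); ->id" */
	return to_holder_good(a)->id;
}
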
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 087025fe3ba1..3d1552d3c12b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -15,6 +15,7 @@
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/file.h> 16#include <linux/file.h>
17#include <linux/poll.h> 17#include <linux/poll.h>
18#include <linux/slab.h>
18#include <linux/sysfs.h> 19#include <linux/sysfs.h>
19#include <linux/dcache.h> 20#include <linux/dcache.h>
20#include <linux/percpu.h> 21#include <linux/percpu.h>
@@ -56,21 +57,6 @@ static atomic_t nr_task_events __read_mostly;
56 */ 57 */
57int sysctl_perf_event_paranoid __read_mostly = 1; 58int sysctl_perf_event_paranoid __read_mostly = 1;
58 59
59static inline bool perf_paranoid_tracepoint_raw(void)
60{
61 return sysctl_perf_event_paranoid > -1;
62}
63
64static inline bool perf_paranoid_cpu(void)
65{
66 return sysctl_perf_event_paranoid > 0;
67}
68
69static inline bool perf_paranoid_kernel(void)
70{
71 return sysctl_perf_event_paranoid > 1;
72}
73
74int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 60int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
75 61
76/* 62/*
@@ -96,13 +82,10 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
96void __weak hw_perf_disable(void) { barrier(); } 82void __weak hw_perf_disable(void) { barrier(); }
97void __weak hw_perf_enable(void) { barrier(); } 83void __weak hw_perf_enable(void) { barrier(); }
98 84
99void __weak hw_perf_event_setup(int cpu) { barrier(); }
100void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
101
102int __weak 85int __weak
103hw_perf_group_sched_in(struct perf_event *group_leader, 86hw_perf_group_sched_in(struct perf_event *group_leader,
104 struct perf_cpu_context *cpuctx, 87 struct perf_cpu_context *cpuctx,
105 struct perf_event_context *ctx, int cpu) 88 struct perf_event_context *ctx)
106{ 89{
107 return 0; 90 return 0;
108} 91}
@@ -111,25 +94,15 @@ void __weak perf_event_print_debug(void) { }
111 94
112static DEFINE_PER_CPU(int, perf_disable_count); 95static DEFINE_PER_CPU(int, perf_disable_count);
113 96
114void __perf_disable(void)
115{
116 __get_cpu_var(perf_disable_count)++;
117}
118
119bool __perf_enable(void)
120{
121 return !--__get_cpu_var(perf_disable_count);
122}
123
124void perf_disable(void) 97void perf_disable(void)
125{ 98{
126 __perf_disable(); 99 if (!__get_cpu_var(perf_disable_count)++)
127 hw_perf_disable(); 100 hw_perf_disable();
128} 101}
129 102
130void perf_enable(void) 103void perf_enable(void)
131{ 104{
132 if (__perf_enable()) 105 if (!--__get_cpu_var(perf_disable_count))
133 hw_perf_enable(); 106 hw_perf_enable();
134} 107}
135 108
@@ -248,7 +221,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
248 221
249static inline u64 perf_clock(void) 222static inline u64 perf_clock(void)
250{ 223{
251 return cpu_clock(smp_processor_id()); 224 return cpu_clock(raw_smp_processor_id());
252} 225}
253 226
254/* 227/*
@@ -632,14 +605,13 @@ void perf_event_disable(struct perf_event *event)
632static int 605static int
633event_sched_in(struct perf_event *event, 606event_sched_in(struct perf_event *event,
634 struct perf_cpu_context *cpuctx, 607 struct perf_cpu_context *cpuctx,
635 struct perf_event_context *ctx, 608 struct perf_event_context *ctx)
636 int cpu)
637{ 609{
638 if (event->state <= PERF_EVENT_STATE_OFF) 610 if (event->state <= PERF_EVENT_STATE_OFF)
639 return 0; 611 return 0;
640 612
641 event->state = PERF_EVENT_STATE_ACTIVE; 613 event->state = PERF_EVENT_STATE_ACTIVE;
642 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ 614 event->oncpu = smp_processor_id();
643 /* 615 /*
644 * The new state must be visible before we turn it on in the hardware: 616 * The new state must be visible before we turn it on in the hardware:
645 */ 617 */
@@ -666,8 +638,7 @@ event_sched_in(struct perf_event *event,
666static int 638static int
667group_sched_in(struct perf_event *group_event, 639group_sched_in(struct perf_event *group_event,
668 struct perf_cpu_context *cpuctx, 640 struct perf_cpu_context *cpuctx,
669 struct perf_event_context *ctx, 641 struct perf_event_context *ctx)
670 int cpu)
671{ 642{
672 struct perf_event *event, *partial_group; 643 struct perf_event *event, *partial_group;
673 int ret; 644 int ret;
@@ -675,18 +646,18 @@ group_sched_in(struct perf_event *group_event,
675 if (group_event->state == PERF_EVENT_STATE_OFF) 646 if (group_event->state == PERF_EVENT_STATE_OFF)
676 return 0; 647 return 0;
677 648
678 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); 649 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
679 if (ret) 650 if (ret)
680 return ret < 0 ? ret : 0; 651 return ret < 0 ? ret : 0;
681 652
682 if (event_sched_in(group_event, cpuctx, ctx, cpu)) 653 if (event_sched_in(group_event, cpuctx, ctx))
683 return -EAGAIN; 654 return -EAGAIN;
684 655
685 /* 656 /*
686 * Schedule in siblings as one group (if any): 657 * Schedule in siblings as one group (if any):
687 */ 658 */
688 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 659 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
689 if (event_sched_in(event, cpuctx, ctx, cpu)) { 660 if (event_sched_in(event, cpuctx, ctx)) {
690 partial_group = event; 661 partial_group = event;
691 goto group_error; 662 goto group_error;
692 } 663 }
@@ -760,7 +731,6 @@ static void __perf_install_in_context(void *info)
760 struct perf_event *event = info; 731 struct perf_event *event = info;
761 struct perf_event_context *ctx = event->ctx; 732 struct perf_event_context *ctx = event->ctx;
762 struct perf_event *leader = event->group_leader; 733 struct perf_event *leader = event->group_leader;
763 int cpu = smp_processor_id();
764 int err; 734 int err;
765 735
766 /* 736 /*
@@ -807,7 +777,7 @@ static void __perf_install_in_context(void *info)
807 if (!group_can_go_on(event, cpuctx, 1)) 777 if (!group_can_go_on(event, cpuctx, 1))
808 err = -EEXIST; 778 err = -EEXIST;
809 else 779 else
810 err = event_sched_in(event, cpuctx, ctx, cpu); 780 err = event_sched_in(event, cpuctx, ctx);
811 781
812 if (err) { 782 if (err) {
813 /* 783 /*
@@ -949,11 +919,9 @@ static void __perf_event_enable(void *info)
949 } else { 919 } else {
950 perf_disable(); 920 perf_disable();
951 if (event == leader) 921 if (event == leader)
952 err = group_sched_in(event, cpuctx, ctx, 922 err = group_sched_in(event, cpuctx, ctx);
953 smp_processor_id());
954 else 923 else
955 err = event_sched_in(event, cpuctx, ctx, 924 err = event_sched_in(event, cpuctx, ctx);
956 smp_processor_id());
957 perf_enable(); 925 perf_enable();
958 } 926 }
959 927
@@ -1197,11 +1165,9 @@ void perf_event_task_sched_out(struct task_struct *task,
1197 struct perf_event_context *ctx = task->perf_event_ctxp; 1165 struct perf_event_context *ctx = task->perf_event_ctxp;
1198 struct perf_event_context *next_ctx; 1166 struct perf_event_context *next_ctx;
1199 struct perf_event_context *parent; 1167 struct perf_event_context *parent;
1200 struct pt_regs *regs;
1201 int do_switch = 1; 1168 int do_switch = 1;
1202 1169
1203 regs = task_pt_regs(task); 1170 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1204 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1205 1171
1206 if (likely(!ctx || !cpuctx->task_ctx)) 1172 if (likely(!ctx || !cpuctx->task_ctx))
1207 return; 1173 return;
@@ -1280,19 +1246,18 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1280 1246
1281static void 1247static void
1282ctx_pinned_sched_in(struct perf_event_context *ctx, 1248ctx_pinned_sched_in(struct perf_event_context *ctx,
1283 struct perf_cpu_context *cpuctx, 1249 struct perf_cpu_context *cpuctx)
1284 int cpu)
1285{ 1250{
1286 struct perf_event *event; 1251 struct perf_event *event;
1287 1252
1288 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 1253 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1289 if (event->state <= PERF_EVENT_STATE_OFF) 1254 if (event->state <= PERF_EVENT_STATE_OFF)
1290 continue; 1255 continue;
1291 if (event->cpu != -1 && event->cpu != cpu) 1256 if (event->cpu != -1 && event->cpu != smp_processor_id())
1292 continue; 1257 continue;
1293 1258
1294 if (group_can_go_on(event, cpuctx, 1)) 1259 if (group_can_go_on(event, cpuctx, 1))
1295 group_sched_in(event, cpuctx, ctx, cpu); 1260 group_sched_in(event, cpuctx, ctx);
1296 1261
1297 /* 1262 /*
1298 * If this pinned group hasn't been scheduled, 1263 * If this pinned group hasn't been scheduled,
@@ -1307,8 +1272,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1307 1272
1308static void 1273static void
1309ctx_flexible_sched_in(struct perf_event_context *ctx, 1274ctx_flexible_sched_in(struct perf_event_context *ctx,
1310 struct perf_cpu_context *cpuctx, 1275 struct perf_cpu_context *cpuctx)
1311 int cpu)
1312{ 1276{
1313 struct perf_event *event; 1277 struct perf_event *event;
1314 int can_add_hw = 1; 1278 int can_add_hw = 1;
@@ -1321,11 +1285,11 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1321 * Listen to the 'cpu' scheduling filter constraint 1285 * Listen to the 'cpu' scheduling filter constraint
1322 * of events: 1286 * of events:
1323 */ 1287 */
1324 if (event->cpu != -1 && event->cpu != cpu) 1288 if (event->cpu != -1 && event->cpu != smp_processor_id())
1325 continue; 1289 continue;
1326 1290
1327 if (group_can_go_on(event, cpuctx, can_add_hw)) 1291 if (group_can_go_on(event, cpuctx, can_add_hw))
1328 if (group_sched_in(event, cpuctx, ctx, cpu)) 1292 if (group_sched_in(event, cpuctx, ctx))
1329 can_add_hw = 0; 1293 can_add_hw = 0;
1330 } 1294 }
1331} 1295}
@@ -1335,8 +1299,6 @@ ctx_sched_in(struct perf_event_context *ctx,
1335 struct perf_cpu_context *cpuctx, 1299 struct perf_cpu_context *cpuctx,
1336 enum event_type_t event_type) 1300 enum event_type_t event_type)
1337{ 1301{
1338 int cpu = smp_processor_id();
1339
1340 raw_spin_lock(&ctx->lock); 1302 raw_spin_lock(&ctx->lock);
1341 ctx->is_active = 1; 1303 ctx->is_active = 1;
1342 if (likely(!ctx->nr_events)) 1304 if (likely(!ctx->nr_events))
@@ -1351,11 +1313,11 @@ ctx_sched_in(struct perf_event_context *ctx,
1351 * in order to give them the best chance of going on. 1313 * in order to give them the best chance of going on.
1352 */ 1314 */
1353 if (event_type & EVENT_PINNED) 1315 if (event_type & EVENT_PINNED)
1354 ctx_pinned_sched_in(ctx, cpuctx, cpu); 1316 ctx_pinned_sched_in(ctx, cpuctx);
1355 1317
1356 /* Then walk through the lower prio flexible groups */ 1318 /* Then walk through the lower prio flexible groups */
1357 if (event_type & EVENT_FLEXIBLE) 1319 if (event_type & EVENT_FLEXIBLE)
1358 ctx_flexible_sched_in(ctx, cpuctx, cpu); 1320 ctx_flexible_sched_in(ctx, cpuctx);
1359 1321
1360 perf_enable(); 1322 perf_enable();
1361 out: 1323 out:
@@ -1493,6 +1455,22 @@ do { \
1493 return div64_u64(dividend, divisor); 1455 return div64_u64(dividend, divisor);
1494} 1456}
1495 1457
1458static void perf_event_stop(struct perf_event *event)
1459{
1460 if (!event->pmu->stop)
1461 return event->pmu->disable(event);
1462
1463 return event->pmu->stop(event);
1464}
1465
1466static int perf_event_start(struct perf_event *event)
1467{
1468 if (!event->pmu->start)
1469 return event->pmu->enable(event);
1470
1471 return event->pmu->start(event);
1472}
1473
1496static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 1474static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1497{ 1475{
1498 struct hw_perf_event *hwc = &event->hw; 1476 struct hw_perf_event *hwc = &event->hw;
@@ -1513,9 +1491,9 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1513 1491
1514 if (atomic64_read(&hwc->period_left) > 8*sample_period) { 1492 if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1515 perf_disable(); 1493 perf_disable();
1516 event->pmu->disable(event); 1494 perf_event_stop(event);
1517 atomic64_set(&hwc->period_left, 0); 1495 atomic64_set(&hwc->period_left, 0);
1518 event->pmu->enable(event); 1496 perf_event_start(event);
1519 perf_enable(); 1497 perf_enable();
1520 } 1498 }
1521} 1499}
@@ -1545,12 +1523,15 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1545 */ 1523 */
1546 if (interrupts == MAX_INTERRUPTS) { 1524 if (interrupts == MAX_INTERRUPTS) {
1547 perf_log_throttle(event, 1); 1525 perf_log_throttle(event, 1);
1526 perf_disable();
1548 event->pmu->unthrottle(event); 1527 event->pmu->unthrottle(event);
1528 perf_enable();
1549 } 1529 }
1550 1530
1551 if (!event->attr.freq || !event->attr.sample_freq) 1531 if (!event->attr.freq || !event->attr.sample_freq)
1552 continue; 1532 continue;
1553 1533
1534 perf_disable();
1554 event->pmu->read(event); 1535 event->pmu->read(event);
1555 now = atomic64_read(&event->count); 1536 now = atomic64_read(&event->count);
1556 delta = now - hwc->freq_count_stamp; 1537 delta = now - hwc->freq_count_stamp;
@@ -1558,6 +1539,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1558 1539
1559 if (delta > 0) 1540 if (delta > 0)
1560 perf_adjust_period(event, TICK_NSEC, delta); 1541 perf_adjust_period(event, TICK_NSEC, delta);
1542 perf_enable();
1561 } 1543 }
1562 raw_spin_unlock(&ctx->lock); 1544 raw_spin_unlock(&ctx->lock);
1563} 1545}
@@ -1567,9 +1549,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1567 */ 1549 */
1568static void rotate_ctx(struct perf_event_context *ctx) 1550static void rotate_ctx(struct perf_event_context *ctx)
1569{ 1551{
1570 if (!ctx->nr_events)
1571 return;
1572
1573 raw_spin_lock(&ctx->lock); 1552 raw_spin_lock(&ctx->lock);
1574 1553
1575 /* Rotate the first entry last of non-pinned groups */ 1554 /* Rotate the first entry last of non-pinned groups */
@@ -1582,19 +1561,28 @@ void perf_event_task_tick(struct task_struct *curr)
1582{ 1561{
1583 struct perf_cpu_context *cpuctx; 1562 struct perf_cpu_context *cpuctx;
1584 struct perf_event_context *ctx; 1563 struct perf_event_context *ctx;
1564 int rotate = 0;
1585 1565
1586 if (!atomic_read(&nr_events)) 1566 if (!atomic_read(&nr_events))
1587 return; 1567 return;
1588 1568
1589 cpuctx = &__get_cpu_var(perf_cpu_context); 1569 cpuctx = &__get_cpu_var(perf_cpu_context);
1590 ctx = curr->perf_event_ctxp; 1570 if (cpuctx->ctx.nr_events &&
1571 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1572 rotate = 1;
1591 1573
1592 perf_disable(); 1574 ctx = curr->perf_event_ctxp;
1575 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
1576 rotate = 1;
1593 1577
1594 perf_ctx_adjust_freq(&cpuctx->ctx); 1578 perf_ctx_adjust_freq(&cpuctx->ctx);
1595 if (ctx) 1579 if (ctx)
1596 perf_ctx_adjust_freq(ctx); 1580 perf_ctx_adjust_freq(ctx);
1597 1581
1582 if (!rotate)
1583 return;
1584
1585 perf_disable();
1598 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 1586 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1599 if (ctx) 1587 if (ctx)
1600 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 1588 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1606,7 +1594,6 @@ void perf_event_task_tick(struct task_struct *curr)
1606 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 1594 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1607 if (ctx) 1595 if (ctx)
1608 task_ctx_sched_in(curr, EVENT_FLEXIBLE); 1596 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1609
1610 perf_enable(); 1597 perf_enable();
1611} 1598}
1612 1599
@@ -2602,7 +2589,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2602 if (user_locked > user_lock_limit) 2589 if (user_locked > user_lock_limit)
2603 extra = user_locked - user_lock_limit; 2590 extra = user_locked - user_lock_limit;
2604 2591
2605 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 2592 lock_limit = rlimit(RLIMIT_MEMLOCK);
2606 lock_limit >>= PAGE_SHIFT; 2593 lock_limit >>= PAGE_SHIFT;
2607 locked = vma->vm_mm->locked_vm + extra; 2594 locked = vma->vm_mm->locked_vm + extra;
2608 2595
@@ -2798,6 +2785,12 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2798 return NULL; 2785 return NULL;
2799} 2786}
2800 2787
2788__weak
2789void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2790{
2791}
2792
2793
2801/* 2794/*
2802 * Output 2795 * Output
2803 */ 2796 */
@@ -3383,15 +3376,23 @@ static void perf_event_task_output(struct perf_event *event,
3383 struct perf_task_event *task_event) 3376 struct perf_task_event *task_event)
3384{ 3377{
3385 struct perf_output_handle handle; 3378 struct perf_output_handle handle;
3386 int size;
3387 struct task_struct *task = task_event->task; 3379 struct task_struct *task = task_event->task;
3388 int ret; 3380 unsigned long flags;
3381 int size, ret;
3382
3383 /*
3384 * If this CPU attempts to acquire an rq lock held by a CPU spinning
3385 * in perf_output_lock() from interrupt context, it's game over.
3386 */
3387 local_irq_save(flags);
3389 3388
3390 size = task_event->event_id.header.size; 3389 size = task_event->event_id.header.size;
3391 ret = perf_output_begin(&handle, event, size, 0, 0); 3390 ret = perf_output_begin(&handle, event, size, 0, 0);
3392 3391
3393 if (ret) 3392 if (ret) {
3393 local_irq_restore(flags);
3394 return; 3394 return;
3395 }
3395 3396
3396 task_event->event_id.pid = perf_event_pid(event, task); 3397 task_event->event_id.pid = perf_event_pid(event, task);
3397 task_event->event_id.ppid = perf_event_pid(event, current); 3398 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3399,16 +3400,15 @@ static void perf_event_task_output(struct perf_event *event,
3399 task_event->event_id.tid = perf_event_tid(event, task); 3400 task_event->event_id.tid = perf_event_tid(event, task);
3400 task_event->event_id.ptid = perf_event_tid(event, current); 3401 task_event->event_id.ptid = perf_event_tid(event, current);
3401 3402
3402 task_event->event_id.time = perf_clock();
3403
3404 perf_output_put(&handle, task_event->event_id); 3403 perf_output_put(&handle, task_event->event_id);
3405 3404
3406 perf_output_end(&handle); 3405 perf_output_end(&handle);
3406 local_irq_restore(flags);
3407} 3407}
3408 3408
3409static int perf_event_task_match(struct perf_event *event) 3409static int perf_event_task_match(struct perf_event *event)
3410{ 3410{
3411 if (event->state != PERF_EVENT_STATE_ACTIVE) 3411 if (event->state < PERF_EVENT_STATE_INACTIVE)
3412 return 0; 3412 return 0;
3413 3413
3414 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3414 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -3440,7 +3440,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3440 cpuctx = &get_cpu_var(perf_cpu_context); 3440 cpuctx = &get_cpu_var(perf_cpu_context);
3441 perf_event_task_ctx(&cpuctx->ctx, task_event); 3441 perf_event_task_ctx(&cpuctx->ctx, task_event);
3442 if (!ctx) 3442 if (!ctx)
3443 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3443 ctx = rcu_dereference(current->perf_event_ctxp);
3444 if (ctx) 3444 if (ctx)
3445 perf_event_task_ctx(ctx, task_event); 3445 perf_event_task_ctx(ctx, task_event);
3446 put_cpu_var(perf_cpu_context); 3446 put_cpu_var(perf_cpu_context);
@@ -3471,6 +3471,7 @@ static void perf_event_task(struct task_struct *task,
3471 /* .ppid */ 3471 /* .ppid */
3472 /* .tid */ 3472 /* .tid */
3473 /* .ptid */ 3473 /* .ptid */
3474 .time = perf_clock(),
3474 }, 3475 },
3475 }; 3476 };
3476 3477
@@ -3520,7 +3521,7 @@ static void perf_event_comm_output(struct perf_event *event,
3520 3521
3521static int perf_event_comm_match(struct perf_event *event) 3522static int perf_event_comm_match(struct perf_event *event)
3522{ 3523{
3523 if (event->state != PERF_EVENT_STATE_ACTIVE) 3524 if (event->state < PERF_EVENT_STATE_INACTIVE)
3524 return 0; 3525 return 0;
3525 3526
3526 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3527 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -3640,7 +3641,7 @@ static void perf_event_mmap_output(struct perf_event *event,
3640static int perf_event_mmap_match(struct perf_event *event, 3641static int perf_event_mmap_match(struct perf_event *event,
3641 struct perf_mmap_event *mmap_event) 3642 struct perf_mmap_event *mmap_event)
3642{ 3643{
3643 if (event->state != PERF_EVENT_STATE_ACTIVE) 3644 if (event->state < PERF_EVENT_STATE_INACTIVE)
3644 return 0; 3645 return 0;
3645 3646
3646 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3647 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -3749,7 +3750,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3749 /* .tid */ 3750 /* .tid */
3750 .start = vma->vm_start, 3751 .start = vma->vm_start,
3751 .len = vma->vm_end - vma->vm_start, 3752 .len = vma->vm_end - vma->vm_start,
3752 .pgoff = vma->vm_pgoff, 3753 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
3753 }, 3754 },
3754 }; 3755 };
3755 3756
@@ -4116,8 +4117,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4116 if (rctx < 0) 4117 if (rctx < 0)
4117 return; 4118 return;
4118 4119
4119 data.addr = addr; 4120 perf_sample_data_init(&data, addr);
4120 data.raw = NULL;
4121 4121
4122 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4122 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4123 4123
@@ -4162,11 +4162,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4162 struct perf_event *event; 4162 struct perf_event *event;
4163 u64 period; 4163 u64 period;
4164 4164
4165 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 4165 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4166 event->pmu->read(event); 4166 event->pmu->read(event);
4167 4167
4168 data.addr = 0; 4168 perf_sample_data_init(&data, 0);
4169 data.raw = NULL;
4170 data.period = event->hw.last_period; 4169 data.period = event->hw.last_period;
4171 regs = get_irq_regs(); 4170 regs = get_irq_regs();
4172 /* 4171 /*
@@ -4328,26 +4327,20 @@ static const struct pmu perf_ops_task_clock = {
4328#ifdef CONFIG_EVENT_TRACING 4327#ifdef CONFIG_EVENT_TRACING
4329 4328
4330void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4329void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4331 int entry_size) 4330 int entry_size, struct pt_regs *regs)
4332{ 4331{
4332 struct perf_sample_data data;
4333 struct perf_raw_record raw = { 4333 struct perf_raw_record raw = {
4334 .size = entry_size, 4334 .size = entry_size,
4335 .data = record, 4335 .data = record,
4336 }; 4336 };
4337 4337
4338 struct perf_sample_data data = { 4338 perf_sample_data_init(&data, addr);
4339 .addr = addr, 4339 data.raw = &raw;
4340 .raw = &raw,
4341 };
4342
4343 struct pt_regs *regs = get_irq_regs();
4344
4345 if (!regs)
4346 regs = task_pt_regs(current);
4347 4340
4348 /* Trace events already protected against recursion */ 4341 /* Trace events already protected against recursion */
4349 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4342 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4350 &data, regs); 4343 &data, regs);
4351} 4344}
4352EXPORT_SYMBOL_GPL(perf_tp_event); 4345EXPORT_SYMBOL_GPL(perf_tp_event);
4353 4346
@@ -4363,7 +4356,7 @@ static int perf_tp_event_match(struct perf_event *event,
4363 4356
4364static void tp_perf_event_destroy(struct perf_event *event) 4357static void tp_perf_event_destroy(struct perf_event *event)
4365{ 4358{
4366 ftrace_profile_disable(event->attr.config); 4359 perf_trace_disable(event->attr.config);
4367} 4360}
4368 4361
4369static const struct pmu *tp_perf_event_init(struct perf_event *event) 4362static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4377,7 +4370,7 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4377 !capable(CAP_SYS_ADMIN)) 4370 !capable(CAP_SYS_ADMIN))
4378 return ERR_PTR(-EPERM); 4371 return ERR_PTR(-EPERM);
4379 4372
4380 if (ftrace_profile_enable(event->attr.config)) 4373 if (perf_trace_enable(event->attr.config))
4381 return NULL; 4374 return NULL;
4382 4375
4383 event->destroy = tp_perf_event_destroy; 4376 event->destroy = tp_perf_event_destroy;
@@ -4456,8 +4449,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
4456 struct perf_sample_data sample; 4449 struct perf_sample_data sample;
4457 struct pt_regs *regs = data; 4450 struct pt_regs *regs = data;
4458 4451
4459 sample.raw = NULL; 4452 perf_sample_data_init(&sample, bp->attr.bp_addr);
4460 sample.addr = bp->attr.bp_addr;
4461 4453
4462 if (!perf_exclude_event(bp, regs)) 4454 if (!perf_exclude_event(bp, regs))
4463 perf_swevent_add(bp, 1, 1, &sample, regs); 4455 perf_swevent_add(bp, 1, 1, &sample, regs);
@@ -4720,7 +4712,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
4720 if (attr->type >= PERF_TYPE_MAX) 4712 if (attr->type >= PERF_TYPE_MAX)
4721 return -EINVAL; 4713 return -EINVAL;
4722 4714
4723 if (attr->__reserved_1 || attr->__reserved_2) 4715 if (attr->__reserved_1)
4724 return -EINVAL; 4716 return -EINVAL;
4725 4717
4726 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 4718 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -4905,7 +4897,7 @@ err_fput_free_put_context:
4905 4897
4906err_free_put_context: 4898err_free_put_context:
4907 if (err < 0) 4899 if (err < 0)
4908 kfree(event); 4900 free_event(event);
4909 4901
4910err_put_context: 4902err_put_context:
4911 if (err < 0) 4903 if (err < 0)
@@ -5385,18 +5377,26 @@ int perf_event_init_task(struct task_struct *child)
5385 return ret; 5377 return ret;
5386} 5378}
5387 5379
5380static void __init perf_event_init_all_cpus(void)
5381{
5382 int cpu;
5383 struct perf_cpu_context *cpuctx;
5384
5385 for_each_possible_cpu(cpu) {
5386 cpuctx = &per_cpu(perf_cpu_context, cpu);
5387 __perf_event_init_context(&cpuctx->ctx, NULL);
5388 }
5389}
5390
5388static void __cpuinit perf_event_init_cpu(int cpu) 5391static void __cpuinit perf_event_init_cpu(int cpu)
5389{ 5392{
5390 struct perf_cpu_context *cpuctx; 5393 struct perf_cpu_context *cpuctx;
5391 5394
5392 cpuctx = &per_cpu(perf_cpu_context, cpu); 5395 cpuctx = &per_cpu(perf_cpu_context, cpu);
5393 __perf_event_init_context(&cpuctx->ctx, NULL);
5394 5396
5395 spin_lock(&perf_resource_lock); 5397 spin_lock(&perf_resource_lock);
5396 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 5398 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5397 spin_unlock(&perf_resource_lock); 5399 spin_unlock(&perf_resource_lock);
5398
5399 hw_perf_event_setup(cpu);
5400} 5400}
5401 5401
5402#ifdef CONFIG_HOTPLUG_CPU 5402#ifdef CONFIG_HOTPLUG_CPU
@@ -5436,11 +5436,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5436 perf_event_init_cpu(cpu); 5436 perf_event_init_cpu(cpu);
5437 break; 5437 break;
5438 5438
5439 case CPU_ONLINE:
5440 case CPU_ONLINE_FROZEN:
5441 hw_perf_event_setup_online(cpu);
5442 break;
5443
5444 case CPU_DOWN_PREPARE: 5439 case CPU_DOWN_PREPARE:
5445 case CPU_DOWN_PREPARE_FROZEN: 5440 case CPU_DOWN_PREPARE_FROZEN:
5446 perf_event_exit_cpu(cpu); 5441 perf_event_exit_cpu(cpu);
@@ -5463,6 +5458,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = {
5463 5458
5464void __init perf_event_init(void) 5459void __init perf_event_init(void)
5465{ 5460{
5461 perf_event_init_all_cpus();
5466 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 5462 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5467 (void *)(long)smp_processor_id()); 5463 (void *)(long)smp_processor_id());
5468 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 5464 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
@@ -5470,13 +5466,16 @@ void __init perf_event_init(void)
5470 register_cpu_notifier(&perf_cpu_nb); 5466 register_cpu_notifier(&perf_cpu_nb);
5471} 5467}
5472 5468
5473static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) 5469static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5470 struct sysdev_class_attribute *attr,
5471 char *buf)
5474{ 5472{
5475 return sprintf(buf, "%d\n", perf_reserved_percpu); 5473 return sprintf(buf, "%d\n", perf_reserved_percpu);
5476} 5474}
5477 5475
5478static ssize_t 5476static ssize_t
5479perf_set_reserve_percpu(struct sysdev_class *class, 5477perf_set_reserve_percpu(struct sysdev_class *class,
5478 struct sysdev_class_attribute *attr,
5480 const char *buf, 5479 const char *buf,
5481 size_t count) 5480 size_t count)
5482{ 5481{
@@ -5505,13 +5504,17 @@ perf_set_reserve_percpu(struct sysdev_class *class,
5505 return count; 5504 return count;
5506} 5505}
5507 5506
5508static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) 5507static ssize_t perf_show_overcommit(struct sysdev_class *class,
5508 struct sysdev_class_attribute *attr,
5509 char *buf)
5509{ 5510{
5510 return sprintf(buf, "%d\n", perf_overcommit); 5511 return sprintf(buf, "%d\n", perf_overcommit);
5511} 5512}
5512 5513
5513static ssize_t 5514static ssize_t
5514perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) 5515perf_set_overcommit(struct sysdev_class *class,
5516 struct sysdev_class_attribute *attr,
5517 const char *buf, size_t count)
5515{ 5518{
5516 unsigned long val; 5519 unsigned long val;
5517 int err; 5520 int err;
diff --git a/kernel/pid.c b/kernel/pid.c
index 2e17c9c92cbe..aebb30d9c233 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -367,7 +367,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
367 struct task_struct *result = NULL; 367 struct task_struct *result = NULL;
368 if (pid) { 368 if (pid) {
369 struct hlist_node *first; 369 struct hlist_node *first;
370 first = rcu_dereference(pid->tasks[type].first); 370 first = rcu_dereference_check(pid->tasks[type].first,
371 rcu_read_lock_held() ||
372 lockdep_tasklist_lock_is_held());
371 if (first) 373 if (first)
372 result = hlist_entry(first, struct task_struct, pids[(type)].node); 374 result = hlist_entry(first, struct task_struct, pids[(type)].node);
373 } 375 }
@@ -376,7 +378,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
376EXPORT_SYMBOL(pid_task); 378EXPORT_SYMBOL(pid_task);
377 379
378/* 380/*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 381 * Must be called under rcu_read_lock().
380 */ 382 */
381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 383struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382{ 384{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 86b3796b0436..a5aff94e1f0b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -13,6 +13,7 @@
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h>
16 17
17#define BITS_PER_PAGE (PAGE_SIZE*8) 18#define BITS_PER_PAGE (PAGE_SIZE*8)
18 19
@@ -161,13 +162,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
161 rcu_read_lock(); 162 rcu_read_lock();
162 163
163 /* 164 /*
164 * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring 165 * Any nested-container's init processes won't ignore the
165 * any nested-container's init processes don't ignore the 166 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
166 * signal
167 */ 167 */
168 task = pid_task(find_vpid(nr), PIDTYPE_PID); 168 task = pid_task(find_vpid(nr), PIDTYPE_PID);
169 if (task) 169 if (task)
170 force_sig(SIGKILL, task); 170 send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
171 171
172 rcu_read_unlock(); 172 rcu_read_unlock();
173 173
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 438ff4523513..bc7704b3a443 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -982,6 +982,7 @@ static void check_thread_timers(struct task_struct *tsk,
982 int maxfire; 982 int maxfire;
983 struct list_head *timers = tsk->cpu_timers; 983 struct list_head *timers = tsk->cpu_timers;
984 struct signal_struct *const sig = tsk->signal; 984 struct signal_struct *const sig = tsk->signal;
985 unsigned long soft;
985 986
986 maxfire = 20; 987 maxfire = 20;
987 tsk->cputime_expires.prof_exp = cputime_zero; 988 tsk->cputime_expires.prof_exp = cputime_zero;
@@ -1030,9 +1031,10 @@ static void check_thread_timers(struct task_struct *tsk,
1030 /* 1031 /*
1031 * Check for the special case thread timers. 1032 * Check for the special case thread timers.
1032 */ 1033 */
1033 if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { 1034 soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
1034 unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; 1035 if (soft != RLIM_INFINITY) {
1035 unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; 1036 unsigned long hard =
1037 ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
1036 1038
1037 if (hard != RLIM_INFINITY && 1039 if (hard != RLIM_INFINITY &&
1038 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { 1040 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -1043,14 +1045,13 @@ static void check_thread_timers(struct task_struct *tsk,
1043 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1045 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1044 return; 1046 return;
1045 } 1047 }
1046 if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { 1048 if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
1047 /* 1049 /*
1048 * At the soft limit, send a SIGXCPU every second. 1050 * At the soft limit, send a SIGXCPU every second.
1049 */ 1051 */
1050 if (sig->rlim[RLIMIT_RTTIME].rlim_cur 1052 if (soft < hard) {
1051 < sig->rlim[RLIMIT_RTTIME].rlim_max) { 1053 soft += USEC_PER_SEC;
1052 sig->rlim[RLIMIT_RTTIME].rlim_cur += 1054 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
1053 USEC_PER_SEC;
1054 } 1055 }
1055 printk(KERN_INFO 1056 printk(KERN_INFO
1056 "RT Watchdog Timeout: %s[%d]\n", 1057 "RT Watchdog Timeout: %s[%d]\n",
@@ -1060,9 +1061,9 @@ static void check_thread_timers(struct task_struct *tsk,
1060 } 1061 }
1061} 1062}
1062 1063
1063static void stop_process_timers(struct task_struct *tsk) 1064static void stop_process_timers(struct signal_struct *sig)
1064{ 1065{
1065 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 1066 struct thread_group_cputimer *cputimer = &sig->cputimer;
1066 unsigned long flags; 1067 unsigned long flags;
1067 1068
1068 if (!cputimer->running) 1069 if (!cputimer->running)
@@ -1071,6 +1072,10 @@ static void stop_process_timers(struct task_struct *tsk)
1071 spin_lock_irqsave(&cputimer->lock, flags); 1072 spin_lock_irqsave(&cputimer->lock, flags);
1072 cputimer->running = 0; 1073 cputimer->running = 0;
1073 spin_unlock_irqrestore(&cputimer->lock, flags); 1074 spin_unlock_irqrestore(&cputimer->lock, flags);
1075
1076 sig->cputime_expires.prof_exp = cputime_zero;
1077 sig->cputime_expires.virt_exp = cputime_zero;
1078 sig->cputime_expires.sched_exp = 0;
1074} 1079}
1075 1080
1076static u32 onecputick; 1081static u32 onecputick;
@@ -1121,6 +1126,7 @@ static void check_process_timers(struct task_struct *tsk,
1121 unsigned long long sum_sched_runtime, sched_expires; 1126 unsigned long long sum_sched_runtime, sched_expires;
1122 struct list_head *timers = sig->cpu_timers; 1127 struct list_head *timers = sig->cpu_timers;
1123 struct task_cputime cputime; 1128 struct task_cputime cputime;
1129 unsigned long soft;
1124 1130
1125 /* 1131 /*
1126 * Don't sample the current process CPU clocks if there are no timers. 1132 * Don't sample the current process CPU clocks if there are no timers.
@@ -1131,7 +1137,7 @@ static void check_process_timers(struct task_struct *tsk,
1131 list_empty(&timers[CPUCLOCK_VIRT]) && 1137 list_empty(&timers[CPUCLOCK_VIRT]) &&
1132 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && 1138 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1133 list_empty(&timers[CPUCLOCK_SCHED])) { 1139 list_empty(&timers[CPUCLOCK_SCHED])) {
1134 stop_process_timers(tsk); 1140 stop_process_timers(sig);
1135 return; 1141 return;
1136 } 1142 }
1137 1143
@@ -1193,11 +1199,13 @@ static void check_process_timers(struct task_struct *tsk,
1193 SIGPROF); 1199 SIGPROF);
1194 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, 1200 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1195 SIGVTALRM); 1201 SIGVTALRM);
1196 1202 soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1197 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 1203 if (soft != RLIM_INFINITY) {
1198 unsigned long psecs = cputime_to_secs(ptime); 1204 unsigned long psecs = cputime_to_secs(ptime);
1205 unsigned long hard =
1206 ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
1199 cputime_t x; 1207 cputime_t x;
1200 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { 1208 if (psecs >= hard) {
1201 /* 1209 /*
1202 * At the hard limit, we just die. 1210 * At the hard limit, we just die.
1203 * No need to calculate anything else now. 1211 * No need to calculate anything else now.
@@ -1205,17 +1213,17 @@ static void check_process_timers(struct task_struct *tsk,
1205 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1213 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1206 return; 1214 return;
1207 } 1215 }
1208 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { 1216 if (psecs >= soft) {
1209 /* 1217 /*
1210 * At the soft limit, send a SIGXCPU every second. 1218 * At the soft limit, send a SIGXCPU every second.
1211 */ 1219 */
1212 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 1220 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1213 if (sig->rlim[RLIMIT_CPU].rlim_cur 1221 if (soft < hard) {
1214 < sig->rlim[RLIMIT_CPU].rlim_max) { 1222 soft++;
1215 sig->rlim[RLIMIT_CPU].rlim_cur++; 1223 sig->rlim[RLIMIT_CPU].rlim_cur = soft;
1216 } 1224 }
1217 } 1225 }
1218 x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 1226 x = secs_to_cputime(soft);
1219 if (cputime_eq(prof_expires, cputime_zero) || 1227 if (cputime_eq(prof_expires, cputime_zero) ||
1220 cputime_lt(x, prof_expires)) { 1228 cputime_lt(x, prof_expires)) {
1221 prof_expires = x; 1229 prof_expires = x;
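
The posix-cpu-timers hunks above replace repeated reads of sig->rlim[...].rlim_cur and .rlim_max with a single ACCESS_ONCE() snapshot into the new locals soft and hard, so a concurrent setrlimit() cannot change the limits between the hard-limit check and the soft-limit check. A minimal userspace sketch of the same read-once idea, built on getrlimit() rather than the kernel internals (illustrative only, not the patched code):

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;
	rlim_t soft, hard;

	if (getrlimit(RLIMIT_CPU, &rl)) {
		perror("getrlimit");
		return 1;
	}

	/* Snapshot both limits once; every later check uses the locals. */
	soft = rl.rlim_cur;
	hard = rl.rlim_max;

	if (soft == RLIM_INFINITY)
		printf("no CPU-time soft limit\n");
	else if (hard == RLIM_INFINITY)
		printf("soft limit %llu s, no hard limit\n",
		       (unsigned long long)soft);
	else
		printf("soft limit %llu s, hard limit %llu s\n",
		       (unsigned long long)soft, (unsigned long long)hard);
	return 0;
}
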
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 495440779ce3..00d1fda58ab6 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -256,7 +256,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
256 return 0; 256 return 0;
257} 257}
258 258
259int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) 259static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
260{ 260{
261 *tp = ktime_to_timespec(KTIME_LOW_RES); 261 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0; 262 return 0;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 91e09d3b2eb2..5c36ea9d55d2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,15 @@ config PM_DEBUG
27 code. This is helpful when debugging and reporting PM bugs, like 27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support. 28 suspend support.
29 29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
30config PM_VERBOSE 39config PM_VERBOSE
31 bool "Verbose Power Management debugging" 40 bool "Verbose Power Management debugging"
32 depends on PM_DEBUG 41 depends on PM_DEBUG
@@ -85,6 +94,11 @@ config PM_SLEEP
85 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE 94 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
86 default y 95 default y
87 96
97config PM_SLEEP_ADVANCED_DEBUG
98 bool
99 depends on PM_ADVANCED_DEBUG
100 default n
101
88config SUSPEND 102config SUSPEND
89 bool "Suspend to RAM and standby" 103 bool "Suspend to RAM and standby"
90 depends on PM && ARCH_SUSPEND_POSSIBLE 104 depends on PM && ARCH_SUSPEND_POSSIBLE
@@ -222,3 +236,8 @@ config PM_RUNTIME
222 and the bus type drivers of the buses the devices are on are 236 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and 237 responsible for the actual handling of the autosuspend requests and
224 wake-up events. 238 wake-up events.
239
240config PM_OPS
241 bool
242 depends on PM_SLEEP || PM_RUNTIME
243 default y
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index bbfe472d7524..aa9e916da4d5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -22,6 +22,7 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/gfp.h>
25#include <scsi/scsi_scan.h> 26#include <scsi/scsi_scan.h>
26#include <asm/suspend.h> 27#include <asm/suspend.h>
27 28
@@ -323,6 +324,7 @@ static int create_image(int platform_mode)
323int hibernation_snapshot(int platform_mode) 324int hibernation_snapshot(int platform_mode)
324{ 325{
325 int error; 326 int error;
327 gfp_t saved_mask;
326 328
327 error = platform_begin(platform_mode); 329 error = platform_begin(platform_mode);
328 if (error) 330 if (error)
@@ -334,6 +336,7 @@ int hibernation_snapshot(int platform_mode)
334 goto Close; 336 goto Close;
335 337
336 suspend_console(); 338 suspend_console();
339 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
337 error = dpm_suspend_start(PMSG_FREEZE); 340 error = dpm_suspend_start(PMSG_FREEZE);
338 if (error) 341 if (error)
339 goto Recover_platform; 342 goto Recover_platform;
@@ -351,6 +354,7 @@ int hibernation_snapshot(int platform_mode)
351 354
352 dpm_resume_end(in_suspend ? 355 dpm_resume_end(in_suspend ?
353 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 356 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
357 set_gfp_allowed_mask(saved_mask);
354 resume_console(); 358 resume_console();
355 Close: 359 Close:
356 platform_end(platform_mode); 360 platform_end(platform_mode);
@@ -445,14 +449,17 @@ static int resume_target_kernel(bool platform_mode)
445int hibernation_restore(int platform_mode) 449int hibernation_restore(int platform_mode)
446{ 450{
447 int error; 451 int error;
452 gfp_t saved_mask;
448 453
449 pm_prepare_console(); 454 pm_prepare_console();
450 suspend_console(); 455 suspend_console();
456 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
451 error = dpm_suspend_start(PMSG_QUIESCE); 457 error = dpm_suspend_start(PMSG_QUIESCE);
452 if (!error) { 458 if (!error) {
453 error = resume_target_kernel(platform_mode); 459 error = resume_target_kernel(platform_mode);
454 dpm_resume_end(PMSG_RECOVER); 460 dpm_resume_end(PMSG_RECOVER);
455 } 461 }
462 set_gfp_allowed_mask(saved_mask);
456 resume_console(); 463 resume_console();
457 pm_restore_console(); 464 pm_restore_console();
458 return error; 465 return error;
@@ -466,6 +473,7 @@ int hibernation_restore(int platform_mode)
466int hibernation_platform_enter(void) 473int hibernation_platform_enter(void)
467{ 474{
468 int error; 475 int error;
476 gfp_t saved_mask;
469 477
470 if (!hibernation_ops) 478 if (!hibernation_ops)
471 return -ENOSYS; 479 return -ENOSYS;
@@ -481,6 +489,7 @@ int hibernation_platform_enter(void)
481 489
482 entering_platform_hibernation = true; 490 entering_platform_hibernation = true;
483 suspend_console(); 491 suspend_console();
492 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
484 error = dpm_suspend_start(PMSG_HIBERNATE); 493 error = dpm_suspend_start(PMSG_HIBERNATE);
485 if (error) { 494 if (error) {
486 if (hibernation_ops->recover) 495 if (hibernation_ops->recover)
@@ -518,6 +527,7 @@ int hibernation_platform_enter(void)
518 Resume_devices: 527 Resume_devices:
519 entering_platform_hibernation = false; 528 entering_platform_hibernation = false;
520 dpm_resume_end(PMSG_RESTORE); 529 dpm_resume_end(PMSG_RESTORE);
530 set_gfp_allowed_mask(saved_mask);
521 resume_console(); 531 resume_console();
522 532
523 Close: 533 Close:
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c
index 39ac698ef836..fdcad9ed5a7b 100644
--- a/kernel/power/hibernate_nvs.c
+++ b/kernel/power/hibernate_nvs.c
@@ -10,6 +10,7 @@
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/list.h> 11#include <linux/list.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/slab.h>
13#include <linux/suspend.h> 14#include <linux/suspend.h>
14 15
15/* 16/*
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0998c7139053..b58800b21fc0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val)
44 == NOTIFY_BAD) ? -EINVAL : 0; 44 == NOTIFY_BAD) ? -EINVAL : 0;
45} 45}
46 46
47/* If set, devices may be suspended and resumed asynchronously. */
48int pm_async_enabled = 1;
49
50static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
51 char *buf)
52{
53 return sprintf(buf, "%d\n", pm_async_enabled);
54}
55
56static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
57 const char *buf, size_t n)
58{
59 unsigned long val;
60
61 if (strict_strtoul(buf, 10, &val))
62 return -EINVAL;
63
64 if (val > 1)
65 return -EINVAL;
66
67 pm_async_enabled = val;
68 return n;
69}
70
71power_attr(pm_async);
72
47#ifdef CONFIG_PM_DEBUG 73#ifdef CONFIG_PM_DEBUG
48int pm_test_level = TEST_NONE; 74int pm_test_level = TEST_NONE;
49 75
@@ -208,9 +234,12 @@ static struct attribute * g[] = {
208#ifdef CONFIG_PM_TRACE 234#ifdef CONFIG_PM_TRACE
209 &pm_trace_attr.attr, 235 &pm_trace_attr.attr,
210#endif 236#endif
211#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) 237#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr,
239#ifdef CONFIG_PM_DEBUG
212 &pm_test_attr.attr, 240 &pm_test_attr.attr,
213#endif 241#endif
242#endif
214 NULL, 243 NULL,
215}; 244};
216 245
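
power_attr(pm_async) above exposes the new flag as /sys/power/pm_async, readable and writable as 0 or 1 whenever CONFIG_PM_SLEEP is set. A small userspace sketch for inspecting or toggling it (a hypothetical helper, not part of the patch):

#include <stdio.h>

int main(int argc, char **argv)
{
	const char *path = "/sys/power/pm_async";
	char buf[4] = "";
	FILE *f = fopen(path, argc > 1 ? "w" : "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (argc > 1)				/* e.g. "./pm_async 0" */
		fprintf(f, "%s\n", argv[1]);
	else if (fgets(buf, sizeof(buf), f))
		printf("pm_async = %s", buf);
	fclose(f);
	return 0;
}
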
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5ade1bdcf366..71ae29052ab6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -88,12 +88,11 @@ static int try_to_freeze_tasks(bool sig_only)
88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " 88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
89 "(%d tasks refusing to freeze):\n", 89 "(%d tasks refusing to freeze):\n",
90 elapsed_csecs / 100, elapsed_csecs % 100, todo); 90 elapsed_csecs / 100, elapsed_csecs % 100, todo);
91 show_state();
92 read_lock(&tasklist_lock); 91 read_lock(&tasklist_lock);
93 do_each_thread(g, p) { 92 do_each_thread(g, p) {
94 task_lock(p); 93 task_lock(p);
95 if (freezing(p) && !freezer_should_skip(p)) 94 if (freezing(p) && !freezer_should_skip(p))
96 printk(KERN_ERR " %s\n", p->comm); 95 sched_show_task(p);
97 cancel_freezing(p); 96 cancel_freezing(p);
98 task_unlock(p); 97 task_unlock(p);
99 } while_each_thread(g, p); 98 } while_each_thread(g, p);
@@ -145,7 +144,7 @@ static void thaw_tasks(bool nosig_only)
145 if (nosig_only && should_send_signal(p)) 144 if (nosig_only && should_send_signal(p))
146 continue; 145 continue;
147 146
148 if (cgroup_frozen(p)) 147 if (cgroup_freezing_or_frozen(p))
149 continue; 148 continue;
150 149
151 thaw_process(p); 150 thaw_process(p);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 36cb168e4330..be861c26dda7 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -26,6 +26,7 @@
26#include <linux/console.h> 26#include <linux/console.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/slab.h>
29 30
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/mmu_context.h> 32#include <asm/mmu_context.h>
@@ -1181,7 +1182,7 @@ static void free_unnecessary_pages(void)
1181 1182
1182 memory_bm_position_reset(&copy_bm); 1183 memory_bm_position_reset(&copy_bm);
1183 1184
1184 while (to_free_normal > 0 && to_free_highmem > 0) { 1185 while (to_free_normal > 0 || to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1186 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn); 1187 struct page *page = pfn_to_page(pfn);
1187 1188
@@ -1500,7 +1501,7 @@ asmlinkage int swsusp_save(void)
1500{ 1501{
1501 unsigned int nr_pages, nr_highmem; 1502 unsigned int nr_pages, nr_highmem;
1502 1503
1503 printk(KERN_INFO "PM: Creating hibernation image: \n"); 1504 printk(KERN_INFO "PM: Creating hibernation image:\n");
1504 1505
1505 drain_local_pages(NULL); 1506 drain_local_pages(NULL);
1506 nr_pages = count_data_pages(); 1507 nr_pages = count_data_pages();
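
The one-character fix in free_unnecessary_pages() above matters because the walk frees both highmem and normal pages from the same bitmap: with &&, the loop stopped as soon as either to_free counter hit zero, leaving pages of the other kind allocated. A toy standalone illustration of the loop-bound difference (counters only, no kernel code):

#include <stdio.h>

static unsigned long walk(unsigned long normal, unsigned long high, int use_or)
{
	unsigned long freed = 0;

	/* Mimics only the loop bound: keep walking while work remains. */
	while (use_or ? (normal > 0 || high > 0)
		      : (normal > 0 && high > 0)) {
		if (high > 0)
			high--;		/* pretend this page was highmem */
		else
			normal--;	/* otherwise a normal page */
		freed++;
	}
	return freed;
}

int main(void)
{
	/* 3 normal pages and 5 highmem pages still need to be freed. */
	printf("with &&: %lu pages freed\n", walk(3, 5, 0));	/* 5: stops early */
	printf("with ||: %lu pages freed\n", walk(3, 5, 1));	/* 8: frees all  */
	return 0;
}
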
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6f10dfc2d3e9..56e7dbb8b996 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -15,6 +15,7 @@
15#include <linux/console.h> 15#include <linux/console.h>
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/gfp.h>
18 19
19#include "power.h" 20#include "power.h"
20 21
@@ -189,6 +190,7 @@ static int suspend_enter(suspend_state_t state)
189int suspend_devices_and_enter(suspend_state_t state) 190int suspend_devices_and_enter(suspend_state_t state)
190{ 191{
191 int error; 192 int error;
193 gfp_t saved_mask;
192 194
193 if (!suspend_ops) 195 if (!suspend_ops)
194 return -ENOSYS; 196 return -ENOSYS;
@@ -199,6 +201,7 @@ int suspend_devices_and_enter(suspend_state_t state)
199 goto Close; 201 goto Close;
200 } 202 }
201 suspend_console(); 203 suspend_console();
204 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
202 suspend_test_start(); 205 suspend_test_start();
203 error = dpm_suspend_start(PMSG_SUSPEND); 206 error = dpm_suspend_start(PMSG_SUSPEND);
204 if (error) { 207 if (error) {
@@ -215,6 +218,7 @@ int suspend_devices_and_enter(suspend_state_t state)
215 suspend_test_start(); 218 suspend_test_start();
216 dpm_resume_end(PMSG_RESUME); 219 dpm_resume_end(PMSG_RESUME);
217 suspend_test_finish("resume devices"); 220 suspend_test_finish("resume devices");
221 set_gfp_allowed_mask(saved_mask);
218 resume_console(); 222 resume_console();
219 Close: 223 Close:
220 if (suspend_ops->end) 224 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 09b2b0ae9e9d..66824d71983a 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -23,6 +23,7 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/swapops.h> 24#include <linux/swapops.h>
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/slab.h>
26 27
27#include "power.h" 28#include "power.h"
28 29
@@ -657,10 +658,6 @@ int swsusp_read(unsigned int *flags_p)
657 struct swsusp_info *header; 658 struct swsusp_info *header;
658 659
659 *flags_p = swsusp_header->flags; 660 *flags_p = swsusp_header->flags;
660 if (IS_ERR(resume_bdev)) {
661 pr_debug("PM: Image device not initialised\n");
662 return PTR_ERR(resume_bdev);
663 }
664 661
665 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 662 memset(&snapshot, 0, sizeof(struct snapshot_handle));
666 error = snapshot_write_next(&snapshot, PAGE_SIZE); 663 error = snapshot_write_next(&snapshot, PAGE_SIZE);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
deleted file mode 100644
index 5b3601bd1893..000000000000
--- a/kernel/power/swsusp.c
+++ /dev/null
@@ -1,58 +0,0 @@
1/*
2 * linux/kernel/power/swsusp.c
3 *
4 * This file provides code to write suspend image to swap and read it back.
5 *
6 * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8 *
9 * This file is released under the GPLv2.
10 *
11 * I'd like to thank the following people for their work:
12 *
13 * Pavel Machek <pavel@ucw.cz>:
14 * Modifications, defectiveness pointing, being with me at the very beginning,
15 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
16 *
17 * Steve Doddi <dirk@loth.demon.co.uk>:
18 * Support the possibility of hardware state restoring.
19 *
20 * Raph <grey.havens@earthling.net>:
21 * Support for preserving states of network devices and virtual console
22 * (including X and svgatextmode)
23 *
24 * Kurt Garloff <garloff@suse.de>:
25 * Straightened the critical function in order to prevent compilers from
26 * playing tricks with local variables.
27 *
28 * Andreas Mohr <a.mohr@mailto.de>
29 *
30 * Alex Badea <vampire@go.ro>:
31 * Fixed runaway init
32 *
33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Reworked the freeing of memory and the handling of swap
35 *
36 * More state savers are welcome. Especially for the scsi layer...
37 *
38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
39 */
40
41#include <linux/mm.h>
42#include <linux/suspend.h>
43#include <linux/spinlock.h>
44#include <linux/kernel.h>
45#include <linux/major.h>
46#include <linux/swap.h>
47#include <linux/pm.h>
48#include <linux/swapops.h>
49#include <linux/bootmem.h>
50#include <linux/syscalls.h>
51#include <linux/highmem.h>
52#include <linux/time.h>
53#include <linux/rbtree.h>
54#include <linux/io.h>
55
56#include "power.h"
57
58int in_suspend __nosavedata = 0;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bf0014d6a5f0..a8c96212bc1b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -195,6 +195,15 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
195 return res; 195 return res;
196} 196}
197 197
198static void snapshot_deprecated_ioctl(unsigned int cmd)
199{
200 if (printk_ratelimit())
201 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
202 "be removed soon, update your suspend-to-disk "
203 "utilities\n",
204 __builtin_return_address(0), cmd);
205}
206
198static long snapshot_ioctl(struct file *filp, unsigned int cmd, 207static long snapshot_ioctl(struct file *filp, unsigned int cmd,
199 unsigned long arg) 208 unsigned long arg)
200{ 209{
@@ -246,8 +255,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
246 data->frozen = 0; 255 data->frozen = 0;
247 break; 256 break;
248 257
249 case SNAPSHOT_CREATE_IMAGE:
250 case SNAPSHOT_ATOMIC_SNAPSHOT: 258 case SNAPSHOT_ATOMIC_SNAPSHOT:
259 snapshot_deprecated_ioctl(cmd);
260 case SNAPSHOT_CREATE_IMAGE:
251 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 261 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
252 error = -EPERM; 262 error = -EPERM;
253 break; 263 break;
@@ -275,8 +285,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 data->ready = 0; 285 data->ready = 0;
276 break; 286 break;
277 287
278 case SNAPSHOT_PREF_IMAGE_SIZE:
279 case SNAPSHOT_SET_IMAGE_SIZE: 288 case SNAPSHOT_SET_IMAGE_SIZE:
289 snapshot_deprecated_ioctl(cmd);
290 case SNAPSHOT_PREF_IMAGE_SIZE:
280 image_size = arg; 291 image_size = arg;
281 break; 292 break;
282 293
@@ -290,15 +301,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
290 error = put_user(size, (loff_t __user *)arg); 301 error = put_user(size, (loff_t __user *)arg);
291 break; 302 break;
292 303
293 case SNAPSHOT_AVAIL_SWAP_SIZE:
294 case SNAPSHOT_AVAIL_SWAP: 304 case SNAPSHOT_AVAIL_SWAP:
305 snapshot_deprecated_ioctl(cmd);
306 case SNAPSHOT_AVAIL_SWAP_SIZE:
295 size = count_swap_pages(data->swap, 1); 307 size = count_swap_pages(data->swap, 1);
296 size <<= PAGE_SHIFT; 308 size <<= PAGE_SHIFT;
297 error = put_user(size, (loff_t __user *)arg); 309 error = put_user(size, (loff_t __user *)arg);
298 break; 310 break;
299 311
300 case SNAPSHOT_ALLOC_SWAP_PAGE:
301 case SNAPSHOT_GET_SWAP_PAGE: 312 case SNAPSHOT_GET_SWAP_PAGE:
313 snapshot_deprecated_ioctl(cmd);
314 case SNAPSHOT_ALLOC_SWAP_PAGE:
302 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 315 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
303 error = -ENODEV; 316 error = -ENODEV;
304 break; 317 break;
@@ -321,6 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
321 break; 334 break;
322 335
323 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ 336 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
337 snapshot_deprecated_ioctl(cmd);
324 if (!swsusp_swap_in_use()) { 338 if (!swsusp_swap_in_use()) {
325 /* 339 /*
326 * User space encodes device types as two-byte values, 340 * User space encodes device types as two-byte values,
@@ -362,6 +376,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
362 break; 376 break;
363 377
364 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ 378 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
379 snapshot_deprecated_ioctl(cmd);
365 error = -EINVAL; 380 error = -EINVAL;
366 381
367 switch (arg) { 382 switch (arg) {
@@ -405,7 +420,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
405 * User space encodes device types as two-byte values, 420 * User space encodes device types as two-byte values,
406 * so we need to recode them 421 * so we need to recode them
407 */ 422 */
408 swdev = old_decode_dev(swap_area.dev); 423 swdev = new_decode_dev(swap_area.dev);
409 if (swdev) { 424 if (swdev) {
410 offset = swap_area.offset; 425 offset = swap_area.offset;
411 data->swap = swap_type_of(swdev, offset, NULL); 426 data->swap = swap_type_of(swdev, offset, NULL);
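
The reordered case labels above let each deprecated ioctl name log a rate-limited notice through snapshot_deprecated_ioctl() and then fall through into the handler of its preferred replacement, while the last hunk switches the swap-area device decode from old_decode_dev() to new_decode_dev(). A userspace sketch issuing the preferred SNAPSHOT_AVAIL_SWAP_SIZE request (assumes <linux/suspend_ioctls.h> and /dev/snapshot are available; illustrative only):

#define _GNU_SOURCE			/* for loff_t in userspace */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <linux/suspend_ioctls.h>

int main(void)
{
	loff_t size;
	int fd = open("/dev/snapshot", O_RDONLY);

	if (fd < 0) {
		perror("/dev/snapshot");
		return 1;
	}
	/* Preferred name; SNAPSHOT_AVAIL_SWAP now warns before doing the same. */
	if (ioctl(fd, SNAPSHOT_AVAIL_SWAP_SIZE, &size) == 0)
		printf("available swap: %lld bytes\n", (long long)size);
	else
		perror("SNAPSHOT_AVAIL_SWAP_SIZE");
	close(fd);
	return 0;
}
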
diff --git a/kernel/printk.c b/kernel/printk.c
index 1751c456b71f..75077ad0b537 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -35,6 +35,7 @@
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h> 36#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h> 37#include <linux/kmsg_dump.h>
38#include <linux/syslog.h>
38 39
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40 41
@@ -69,8 +70,6 @@ int console_printk[4] = {
69 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 70 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
70}; 71};
71 72
72static int saved_console_loglevel = -1;
73
74/* 73/*
75 * Low level drivers may need that to know if they can schedule in 74 * Low level drivers may need that to know if they can schedule in
76 * their unblank() callback or not. So let's export it. 75 * their unblank() callback or not. So let's export it.
@@ -145,6 +144,7 @@ static char __log_buf[__LOG_BUF_LEN];
145static char *log_buf = __log_buf; 144static char *log_buf = __log_buf;
146static int log_buf_len = __LOG_BUF_LEN; 145static int log_buf_len = __LOG_BUF_LEN;
147static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 146static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
147static int saved_console_loglevel = -1;
148 148
149#ifdef CONFIG_KEXEC 149#ifdef CONFIG_KEXEC
150/* 150/*
@@ -258,38 +258,23 @@ static inline void boot_delay_msec(void)
258} 258}
259#endif 259#endif
260 260
261/* 261int do_syslog(int type, char __user *buf, int len, bool from_file)
262 * Commands to do_syslog:
263 *
264 * 0 -- Close the log. Currently a NOP.
265 * 1 -- Open the log. Currently a NOP.
266 * 2 -- Read from the log.
267 * 3 -- Read all messages remaining in the ring buffer.
268 * 4 -- Read and clear all messages remaining in the ring buffer
269 * 5 -- Clear ring buffer.
270 * 6 -- Disable printk's to console
271 * 7 -- Enable printk's to console
272 * 8 -- Set level of messages printed to console
273 * 9 -- Return number of unread characters in the log buffer
274 * 10 -- Return size of the log buffer
275 */
276int do_syslog(int type, char __user *buf, int len)
277{ 262{
278 unsigned i, j, limit, count; 263 unsigned i, j, limit, count;
279 int do_clear = 0; 264 int do_clear = 0;
280 char c; 265 char c;
281 int error = 0; 266 int error = 0;
282 267
283 error = security_syslog(type); 268 error = security_syslog(type, from_file);
284 if (error) 269 if (error)
285 return error; 270 return error;
286 271
287 switch (type) { 272 switch (type) {
288 case 0: /* Close log */ 273 case SYSLOG_ACTION_CLOSE: /* Close log */
289 break; 274 break;
290 case 1: /* Open log */ 275 case SYSLOG_ACTION_OPEN: /* Open log */
291 break; 276 break;
292 case 2: /* Read from log */ 277 case SYSLOG_ACTION_READ: /* Read from log */
293 error = -EINVAL; 278 error = -EINVAL;
294 if (!buf || len < 0) 279 if (!buf || len < 0)
295 goto out; 280 goto out;
@@ -320,10 +305,12 @@ int do_syslog(int type, char __user *buf, int len)
320 if (!error) 305 if (!error)
321 error = i; 306 error = i;
322 break; 307 break;
323 case 4: /* Read/clear last kernel messages */ 308 /* Read/clear last kernel messages */
309 case SYSLOG_ACTION_READ_CLEAR:
324 do_clear = 1; 310 do_clear = 1;
325 /* FALL THRU */ 311 /* FALL THRU */
326 case 3: /* Read last kernel messages */ 312 /* Read last kernel messages */
313 case SYSLOG_ACTION_READ_ALL:
327 error = -EINVAL; 314 error = -EINVAL;
328 if (!buf || len < 0) 315 if (!buf || len < 0)
329 goto out; 316 goto out;
@@ -376,21 +363,25 @@ int do_syslog(int type, char __user *buf, int len)
376 } 363 }
377 } 364 }
378 break; 365 break;
379 case 5: /* Clear ring buffer */ 366 /* Clear ring buffer */
367 case SYSLOG_ACTION_CLEAR:
380 logged_chars = 0; 368 logged_chars = 0;
381 break; 369 break;
382 case 6: /* Disable logging to console */ 370 /* Disable logging to console */
371 case SYSLOG_ACTION_CONSOLE_OFF:
383 if (saved_console_loglevel == -1) 372 if (saved_console_loglevel == -1)
384 saved_console_loglevel = console_loglevel; 373 saved_console_loglevel = console_loglevel;
385 console_loglevel = minimum_console_loglevel; 374 console_loglevel = minimum_console_loglevel;
386 break; 375 break;
387 case 7: /* Enable logging to console */ 376 /* Enable logging to console */
377 case SYSLOG_ACTION_CONSOLE_ON:
388 if (saved_console_loglevel != -1) { 378 if (saved_console_loglevel != -1) {
389 console_loglevel = saved_console_loglevel; 379 console_loglevel = saved_console_loglevel;
390 saved_console_loglevel = -1; 380 saved_console_loglevel = -1;
391 } 381 }
392 break; 382 break;
393 case 8: /* Set level of messages printed to console */ 383 /* Set level of messages printed to console */
384 case SYSLOG_ACTION_CONSOLE_LEVEL:
394 error = -EINVAL; 385 error = -EINVAL;
395 if (len < 1 || len > 8) 386 if (len < 1 || len > 8)
396 goto out; 387 goto out;
@@ -401,10 +392,12 @@ int do_syslog(int type, char __user *buf, int len)
401 saved_console_loglevel = -1; 392 saved_console_loglevel = -1;
402 error = 0; 393 error = 0;
403 break; 394 break;
404 case 9: /* Number of chars in the log buffer */ 395 /* Number of chars in the log buffer */
396 case SYSLOG_ACTION_SIZE_UNREAD:
405 error = log_end - log_start; 397 error = log_end - log_start;
406 break; 398 break;
407 case 10: /* Size of the log buffer */ 399 /* Size of the log buffer */
400 case SYSLOG_ACTION_SIZE_BUFFER:
408 error = log_buf_len; 401 error = log_buf_len;
409 break; 402 break;
410 default: 403 default:
@@ -417,7 +410,7 @@ out:
417 410
418SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 411SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
419{ 412{
420 return do_syslog(type, buf, len); 413 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
421} 414}
422 415
423/* 416/*
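
In do_syslog() above, the bare numeric case labels become the SYSLOG_ACTION_* constants from the new <linux/syslog.h>, and the from_file argument lets security_syslog() tell the syslog(2) syscall apart from reads of /proc/kmsg. The syscall ABI itself is unchanged, so userspace still passes the same action codes, for example through glibc's klogctl() (illustrative sketch; some actions require privilege):

#include <stdio.h>
#include <stdlib.h>
#include <sys/klog.h>

int main(void)
{
	/* 10 == SYSLOG_ACTION_SIZE_BUFFER, 3 == SYSLOG_ACTION_READ_ALL */
	int len = klogctl(10, NULL, 0);
	char *buf;

	if (len <= 0) {
		perror("klogctl(SIZE_BUFFER)");
		return 1;
	}
	buf = malloc(len);
	if (!buf)
		return 1;
	len = klogctl(3, buf, len);
	if (len > 0)
		fwrite(buf, 1, len, stdout);
	free(buf);
	return 0;
}
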
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 23bd09cd042e..42ad8ae729a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/regset.h>
25 26
26 27
27/* 28/*
@@ -511,6 +512,47 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
511 return 0; 512 return 0;
512} 513}
513 514
515#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
516
517static const struct user_regset *
518find_regset(const struct user_regset_view *view, unsigned int type)
519{
520 const struct user_regset *regset;
521 int n;
522
523 for (n = 0; n < view->n; ++n) {
524 regset = view->regsets + n;
525 if (regset->core_note_type == type)
526 return regset;
527 }
528
529 return NULL;
530}
531
532static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
533 struct iovec *kiov)
534{
535 const struct user_regset_view *view = task_user_regset_view(task);
536 const struct user_regset *regset = find_regset(view, type);
537 int regset_no;
538
539 if (!regset || (kiov->iov_len % regset->size) != 0)
540 return -EINVAL;
541
542 regset_no = regset - view->regsets;
543 kiov->iov_len = min(kiov->iov_len,
544 (__kernel_size_t) (regset->n * regset->size));
545
546 if (req == PTRACE_GETREGSET)
547 return copy_regset_to_user(task, view, regset_no, 0,
548 kiov->iov_len, kiov->iov_base);
549 else
550 return copy_regset_from_user(task, view, regset_no, 0,
551 kiov->iov_len, kiov->iov_base);
552}
553
554#endif
555
514int ptrace_request(struct task_struct *child, long request, 556int ptrace_request(struct task_struct *child, long request,
515 long addr, long data) 557 long addr, long data)
516{ 558{
@@ -573,6 +615,26 @@ int ptrace_request(struct task_struct *child, long request,
573 return 0; 615 return 0;
574 return ptrace_resume(child, request, SIGKILL); 616 return ptrace_resume(child, request, SIGKILL);
575 617
618#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
619 case PTRACE_GETREGSET:
620 case PTRACE_SETREGSET:
621 {
622 struct iovec kiov;
623 struct iovec __user *uiov = (struct iovec __user *) data;
624
625 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
626 return -EFAULT;
627
628 if (__get_user(kiov.iov_base, &uiov->iov_base) ||
629 __get_user(kiov.iov_len, &uiov->iov_len))
630 return -EFAULT;
631
632 ret = ptrace_regset(child, request, addr, &kiov);
633 if (!ret)
634 ret = __put_user(kiov.iov_len, &uiov->iov_len);
635 break;
636 }
637#endif
576 default: 638 default:
577 break; 639 break;
578 } 640 }
@@ -711,6 +773,32 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
711 else 773 else
712 ret = ptrace_setsiginfo(child, &siginfo); 774 ret = ptrace_setsiginfo(child, &siginfo);
713 break; 775 break;
776#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
777 case PTRACE_GETREGSET:
778 case PTRACE_SETREGSET:
779 {
780 struct iovec kiov;
781 struct compat_iovec __user *uiov =
782 (struct compat_iovec __user *) datap;
783 compat_uptr_t ptr;
784 compat_size_t len;
785
786 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
787 return -EFAULT;
788
789 if (__get_user(ptr, &uiov->iov_base) ||
790 __get_user(len, &uiov->iov_len))
791 return -EFAULT;
792
793 kiov.iov_base = compat_ptr(ptr);
794 kiov.iov_len = len;
795
796 ret = ptrace_regset(child, request, addr, &kiov);
797 if (!ret)
798 ret = __put_user(kiov.iov_len, &uiov->iov_len);
799 break;
800 }
801#endif
714 802
715 default: 803 default:
716 ret = ptrace_request(child, request, addr, data); 804 ret = ptrace_request(child, request, addr, data);
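
PTRACE_GETREGSET and PTRACE_SETREGSET above give architectures with CONFIG_HAVE_ARCH_TRACEHOOK a generic way to transfer any register set in the task's user_regset_view: the regset type is passed in addr as an ELF note number and the data as a struct iovec, whose iov_len is clamped to the regset size and written back on success. A minimal x86-64 userspace sketch fetching the general-purpose registers via NT_PRSTATUS (the 0x4204 fallback define covers libcs that predate the request):

#include <stdio.h>
#include <elf.h>		/* NT_PRSTATUS */
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/uio.h>		/* struct iovec */
#include <sys/user.h>		/* struct user_regs_struct */
#include <sys/wait.h>

#ifndef PTRACE_GETREGSET
#define PTRACE_GETREGSET 0x4204
#endif

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {			/* child: stop and wait to be inspected */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);
		_exit(0);
	}
	waitpid(pid, NULL, 0);

	struct user_regs_struct regs;
	struct iovec iov = { .iov_base = &regs, .iov_len = sizeof(regs) };

	if (ptrace(PTRACE_GETREGSET, pid, (void *)NT_PRSTATUS, &iov) == 0)
		printf("rip = %#llx (%zu bytes copied)\n",
		       (unsigned long long)regs.rip, iov.iov_len);
	else
		perror("PTRACE_GETREGSET");

	kill(pid, SIGKILL);
	return 0;
}
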
diff --git a/kernel/range.c b/kernel/range.c
new file mode 100644
index 000000000000..74e2e6114927
--- /dev/null
+++ b/kernel/range.c
@@ -0,0 +1,163 @@
1/*
2 * Range add and subtract
3 */
4#include <linux/module.h>
5#include <linux/init.h>
6#include <linux/sort.h>
7
8#include <linux/range.h>
9
10#ifndef ARRAY_SIZE
11#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
12#endif
13
14int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
15{
16 if (start >= end)
17 return nr_range;
18
19 /* Out of slots: */
20 if (nr_range >= az)
21 return nr_range;
22
23 range[nr_range].start = start;
24 range[nr_range].end = end;
25
26 nr_range++;
27
28 return nr_range;
29}
30
31int add_range_with_merge(struct range *range, int az, int nr_range,
32 u64 start, u64 end)
33{
34 int i;
35
36 if (start >= end)
37 return nr_range;
38
39 /* Try to merge it with old one: */
40 for (i = 0; i < nr_range; i++) {
41 u64 final_start, final_end;
42 u64 common_start, common_end;
43
44 if (!range[i].end)
45 continue;
46
47 common_start = max(range[i].start, start);
48 common_end = min(range[i].end, end);
49 if (common_start > common_end)
50 continue;
51
52 final_start = min(range[i].start, start);
53 final_end = max(range[i].end, end);
54
55 range[i].start = final_start;
56 range[i].end = final_end;
57 return nr_range;
58 }
59
60 /* Need to add it: */
61 return add_range(range, az, nr_range, start, end);
62}
63
64void subtract_range(struct range *range, int az, u64 start, u64 end)
65{
66 int i, j;
67
68 if (start >= end)
69 return;
70
71 for (j = 0; j < az; j++) {
72 if (!range[j].end)
73 continue;
74
75 if (start <= range[j].start && end >= range[j].end) {
76 range[j].start = 0;
77 range[j].end = 0;
78 continue;
79 }
80
81 if (start <= range[j].start && end < range[j].end &&
82 range[j].start < end) {
83 range[j].start = end;
84 continue;
85 }
86
87
88 if (start > range[j].start && end >= range[j].end &&
89 range[j].end > start) {
90 range[j].end = start;
91 continue;
92 }
93
94 if (start > range[j].start && end < range[j].end) {
95 /* Find the new spare: */
96 for (i = 0; i < az; i++) {
97 if (range[i].end == 0)
98 break;
99 }
100 if (i < az) {
101 range[i].end = range[j].end;
102 range[i].start = end;
103 } else {
104 printk(KERN_ERR "run of slot in ranges\n");
105 }
106 range[j].end = start;
107 continue;
108 }
109 }
110}
111
112static int cmp_range(const void *x1, const void *x2)
113{
114 const struct range *r1 = x1;
115 const struct range *r2 = x2;
116 s64 start1, start2;
117
118 start1 = r1->start;
119 start2 = r2->start;
120
121 return start1 - start2;
122}
123
124int clean_sort_range(struct range *range, int az)
125{
126 int i, j, k = az - 1, nr_range = 0;
127
128 for (i = 0; i < k; i++) {
129 if (range[i].end)
130 continue;
131 for (j = k; j > i; j--) {
132 if (range[j].end) {
133 k = j;
134 break;
135 }
136 }
137 if (j == i)
138 break;
139 range[i].start = range[k].start;
140 range[i].end = range[k].end;
141 range[k].start = 0;
142 range[k].end = 0;
143 k--;
144 }
145 /* count it */
146 for (i = 0; i < az; i++) {
147 if (!range[i].end) {
148 nr_range = i;
149 break;
150 }
151 }
152
153 /* sort them */
154 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
155
156 return nr_range;
157}
158
159void sort_range(struct range *range, int nr_range)
160{
161 /* sort them */
162 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
163}
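
kernel/range.c above is a new file: add_range() appends an entry, add_range_with_merge() first tries to widen an overlapping entry, subtract_range() punches holes (splitting an entry when the hole falls in the middle), and clean_sort_range() compacts zeroed slots before sorting. To make the merge semantics concrete, here is a standalone illustration with the merge logic transcribed from the patch (a demo, not the kernel build):

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start, end; };

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

static int add_range(struct range *range, int az, int nr,
		     uint64_t start, uint64_t end)
{
	if (start >= end || nr >= az)
		return nr;
	range[nr].start = start;
	range[nr].end = end;
	return nr + 1;
}

static int add_range_with_merge(struct range *range, int az, int nr,
				uint64_t start, uint64_t end)
{
	int i;

	if (start >= end)
		return nr;
	for (i = 0; i < nr; i++) {
		if (!range[i].end)
			continue;
		if (MAX(range[i].start, start) > MIN(range[i].end, end))
			continue;	/* no overlap with this entry */
		range[i].start = MIN(range[i].start, start);
		range[i].end = MAX(range[i].end, end);
		return nr;
	}
	return add_range(range, az, nr, start, end);	/* no merge possible */
}

int main(void)
{
	struct range r[4] = { { 0, 0 } };
	int i, nr = 0;

	nr = add_range_with_merge(r, 4, nr, 0x1000, 0x2000);
	nr = add_range_with_merge(r, 4, nr, 0x1800, 0x3000);	/* widens slot 0 */
	nr = add_range_with_merge(r, 4, nr, 0x8000, 0x9000);	/* new slot */

	for (i = 0; i < nr; i++)
		printf("range %d: start=%#llx end=%#llx\n", i,
		       (unsigned long long)r[i].start,
		       (unsigned long long)r[i].end);
	/* expected: 0x1000..0x3000 and 0x8000..0x9000 */
	return 0;
}
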
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 9b7fd4723878..49d808e833b0 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,14 +44,73 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
48#include <linux/hardirq.h>
47 49
48#ifdef CONFIG_DEBUG_LOCK_ALLOC 50#ifdef CONFIG_DEBUG_LOCK_ALLOC
49static struct lock_class_key rcu_lock_key; 51static struct lock_class_key rcu_lock_key;
50struct lockdep_map rcu_lock_map = 52struct lockdep_map rcu_lock_map =
51 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); 53 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
52EXPORT_SYMBOL_GPL(rcu_lock_map); 54EXPORT_SYMBOL_GPL(rcu_lock_map);
55
56static struct lock_class_key rcu_bh_lock_key;
57struct lockdep_map rcu_bh_lock_map =
58 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key);
59EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
60
61static struct lock_class_key rcu_sched_lock_key;
62struct lockdep_map rcu_sched_lock_map =
63 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
64EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
53#endif 65#endif
54 66
67int rcu_scheduler_active __read_mostly;
68EXPORT_SYMBOL_GPL(rcu_scheduler_active);
69
70#ifdef CONFIG_DEBUG_LOCK_ALLOC
71
72int debug_lockdep_rcu_enabled(void)
73{
74 return rcu_scheduler_active && debug_locks &&
75 current->lockdep_recursion == 0;
76}
77EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
78
79/**
80 * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
81 *
82 * Check for bottom half being disabled, which covers both the
83 * CONFIG_PROVE_RCU and not cases. Note that if someone uses
84 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
85 * will show the situation.
86 *
87 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
88 */
89int rcu_read_lock_bh_held(void)
90{
91 if (!debug_lockdep_rcu_enabled())
92 return 1;
93 return in_softirq();
94}
95EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
96
97#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
98
99/*
100 * This function is invoked towards the end of the scheduler's initialization
101 * process. Before this is called, the idle task might contain
102 * RCU read-side critical sections (during which time, this idle
103 * task is booting the system). After this function is called, the
104 * idle tasks are prohibited from containing RCU read-side critical
105 * sections.
106 */
107void rcu_scheduler_starting(void)
108{
109 WARN_ON(num_online_cpus() != 1);
110 WARN_ON(nr_context_switches() > 0);
111 rcu_scheduler_active = 1;
112}
113
55/* 114/*
56 * Awaken the corresponding synchronize_rcu() instance now that a 115 * Awaken the corresponding synchronize_rcu() instance now that a
57 * grace period has elapsed. 116 * grace period has elapsed.
@@ -63,3 +122,14 @@ void wakeme_after_rcu(struct rcu_head *head)
63 rcu = container_of(head, struct rcu_synchronize, head); 122 rcu = container_of(head, struct rcu_synchronize, head);
64 complete(&rcu->completion); 123 complete(&rcu->completion);
65} 124}
125
126#ifdef CONFIG_PROVE_RCU
127/*
128 * wrapper function to avoid #include problems.
129 */
130int rcu_my_thread_group_empty(void)
131{
132 return thread_group_empty(current);
133}
134EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
135#endif /* #ifdef CONFIG_PROVE_RCU */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9bb52177af02..58df55bf83ed 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -61,6 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */
64static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 67static char *torture_type = "rcu"; /* What RCU implementation to torture. */
65 68
66module_param(nreaders, int, 0444); 69module_param(nreaders, int, 0444);
@@ -79,6 +82,12 @@ module_param(stutter, int, 0444);
79MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); 82MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
80module_param(irqreader, int, 0444); 83module_param(irqreader, int, 0444);
81MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); 84MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
85module_param(fqs_duration, int, 0444);
86MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)");
87module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
82module_param(torture_type, charp, 0444); 91module_param(torture_type, charp, 0444);
83MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
84 93
@@ -99,6 +108,7 @@ static struct task_struct **reader_tasks;
99static struct task_struct *stats_task; 108static struct task_struct *stats_task;
100static struct task_struct *shuffler_task; 109static struct task_struct *shuffler_task;
101static struct task_struct *stutter_task; 110static struct task_struct *stutter_task;
111static struct task_struct *fqs_task;
102 112
103#define RCU_TORTURE_PIPE_LEN 10 113#define RCU_TORTURE_PIPE_LEN 10
104 114
@@ -263,6 +273,7 @@ struct rcu_torture_ops {
263 void (*deferred_free)(struct rcu_torture *p); 273 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 274 void (*sync)(void);
265 void (*cb_barrier)(void); 275 void (*cb_barrier)(void);
276 void (*fqs)(void);
266 int (*stats)(char *page); 277 int (*stats)(char *page);
267 int irq_capable; 278 int irq_capable;
268 char *name; 279 char *name;
@@ -347,6 +358,7 @@ static struct rcu_torture_ops rcu_ops = {
347 .deferred_free = rcu_torture_deferred_free, 358 .deferred_free = rcu_torture_deferred_free,
348 .sync = synchronize_rcu, 359 .sync = synchronize_rcu,
349 .cb_barrier = rcu_barrier, 360 .cb_barrier = rcu_barrier,
361 .fqs = rcu_force_quiescent_state,
350 .stats = NULL, 362 .stats = NULL,
351 .irq_capable = 1, 363 .irq_capable = 1,
352 .name = "rcu" 364 .name = "rcu"
@@ -388,6 +400,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
388 .deferred_free = rcu_sync_torture_deferred_free, 400 .deferred_free = rcu_sync_torture_deferred_free,
389 .sync = synchronize_rcu, 401 .sync = synchronize_rcu,
390 .cb_barrier = NULL, 402 .cb_barrier = NULL,
403 .fqs = rcu_force_quiescent_state,
391 .stats = NULL, 404 .stats = NULL,
392 .irq_capable = 1, 405 .irq_capable = 1,
393 .name = "rcu_sync" 406 .name = "rcu_sync"
@@ -403,6 +416,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
403 .deferred_free = rcu_sync_torture_deferred_free, 416 .deferred_free = rcu_sync_torture_deferred_free,
404 .sync = synchronize_rcu_expedited, 417 .sync = synchronize_rcu_expedited,
405 .cb_barrier = NULL, 418 .cb_barrier = NULL,
419 .fqs = rcu_force_quiescent_state,
406 .stats = NULL, 420 .stats = NULL,
407 .irq_capable = 1, 421 .irq_capable = 1,
408 .name = "rcu_expedited" 422 .name = "rcu_expedited"
@@ -465,6 +479,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
465 .deferred_free = rcu_bh_torture_deferred_free, 479 .deferred_free = rcu_bh_torture_deferred_free,
466 .sync = rcu_bh_torture_synchronize, 480 .sync = rcu_bh_torture_synchronize,
467 .cb_barrier = rcu_barrier_bh, 481 .cb_barrier = rcu_barrier_bh,
482 .fqs = rcu_bh_force_quiescent_state,
468 .stats = NULL, 483 .stats = NULL,
469 .irq_capable = 1, 484 .irq_capable = 1,
470 .name = "rcu_bh" 485 .name = "rcu_bh"
@@ -480,6 +495,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
480 .deferred_free = rcu_sync_torture_deferred_free, 495 .deferred_free = rcu_sync_torture_deferred_free,
481 .sync = rcu_bh_torture_synchronize, 496 .sync = rcu_bh_torture_synchronize,
482 .cb_barrier = NULL, 497 .cb_barrier = NULL,
498 .fqs = rcu_bh_force_quiescent_state,
483 .stats = NULL, 499 .stats = NULL,
484 .irq_capable = 1, 500 .irq_capable = 1,
485 .name = "rcu_bh_sync" 501 .name = "rcu_bh_sync"
@@ -621,6 +637,7 @@ static struct rcu_torture_ops sched_ops = {
621 .deferred_free = rcu_sched_torture_deferred_free, 637 .deferred_free = rcu_sched_torture_deferred_free,
622 .sync = sched_torture_synchronize, 638 .sync = sched_torture_synchronize,
623 .cb_barrier = rcu_barrier_sched, 639 .cb_barrier = rcu_barrier_sched,
640 .fqs = rcu_sched_force_quiescent_state,
624 .stats = NULL, 641 .stats = NULL,
625 .irq_capable = 1, 642 .irq_capable = 1,
626 .name = "sched" 643 .name = "sched"
@@ -636,6 +653,7 @@ static struct rcu_torture_ops sched_sync_ops = {
636 .deferred_free = rcu_sync_torture_deferred_free, 653 .deferred_free = rcu_sync_torture_deferred_free,
637 .sync = sched_torture_synchronize, 654 .sync = sched_torture_synchronize,
638 .cb_barrier = NULL, 655 .cb_barrier = NULL,
656 .fqs = rcu_sched_force_quiescent_state,
639 .stats = NULL, 657 .stats = NULL,
640 .name = "sched_sync" 658 .name = "sched_sync"
641}; 659};
@@ -650,12 +668,45 @@ static struct rcu_torture_ops sched_expedited_ops = {
650 .deferred_free = rcu_sync_torture_deferred_free, 668 .deferred_free = rcu_sync_torture_deferred_free,
651 .sync = synchronize_sched_expedited, 669 .sync = synchronize_sched_expedited,
652 .cb_barrier = NULL, 670 .cb_barrier = NULL,
671 .fqs = rcu_sched_force_quiescent_state,
653 .stats = rcu_expedited_torture_stats, 672 .stats = rcu_expedited_torture_stats,
654 .irq_capable = 1, 673 .irq_capable = 1,
655 .name = "sched_expedited" 674 .name = "sched_expedited"
656}; 675};
657 676
658/* 677/*
678 * RCU torture force-quiescent-state kthread. Repeatedly induces
679 * bursts of calls to force_quiescent_state(), increasing the probability
680 * of occurrence of some important types of race conditions.
681 */
682static int
683rcu_torture_fqs(void *arg)
684{
685 unsigned long fqs_resume_time;
686 int fqs_burst_remaining;
687
688 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
689 do {
690 fqs_resume_time = jiffies + fqs_stutter * HZ;
691 while (jiffies - fqs_resume_time > LONG_MAX) {
692 schedule_timeout_interruptible(1);
693 }
694 fqs_burst_remaining = fqs_duration;
695 while (fqs_burst_remaining > 0) {
696 cur_ops->fqs();
697 udelay(fqs_holdoff);
698 fqs_burst_remaining -= fqs_holdoff;
699 }
700 rcu_stutter_wait("rcu_torture_fqs");
701 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
702 VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping");
703 rcutorture_shutdown_absorb("rcu_torture_fqs");
704 while (!kthread_should_stop())
705 schedule_timeout_uninterruptible(1);
706 return 0;
707}
708
709/*
659 * RCU torture writer kthread. Repeatedly substitutes a new structure 710 * RCU torture writer kthread. Repeatedly substitutes a new structure
660 * for that pointed to by rcu_torture_current, freeing the old structure 711 * for that pointed to by rcu_torture_current, freeing the old structure
661 * after a series of grace periods (the "pipeline"). 712 * after a series of grace periods (the "pipeline").
@@ -745,7 +796,11 @@ static void rcu_torture_timer(unsigned long unused)
745 796
746 idx = cur_ops->readlock(); 797 idx = cur_ops->readlock();
747 completed = cur_ops->completed(); 798 completed = cur_ops->completed();
748 p = rcu_dereference(rcu_torture_current); 799 p = rcu_dereference_check(rcu_torture_current,
800 rcu_read_lock_held() ||
801 rcu_read_lock_bh_held() ||
802 rcu_read_lock_sched_held() ||
803 srcu_read_lock_held(&srcu_ctl));
749 if (p == NULL) { 804 if (p == NULL) {
750 /* Leave because rcu_torture_writer is not yet underway */ 805 /* Leave because rcu_torture_writer is not yet underway */
751 cur_ops->readunlock(idx); 806 cur_ops->readunlock(idx);
@@ -763,13 +818,13 @@ static void rcu_torture_timer(unsigned long unused)
763 /* Should not happen, but... */ 818 /* Should not happen, but... */
764 pipe_count = RCU_TORTURE_PIPE_LEN; 819 pipe_count = RCU_TORTURE_PIPE_LEN;
765 } 820 }
766 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 821 __this_cpu_inc(rcu_torture_count[pipe_count]);
767 completed = cur_ops->completed() - completed; 822 completed = cur_ops->completed() - completed;
768 if (completed > RCU_TORTURE_PIPE_LEN) { 823 if (completed > RCU_TORTURE_PIPE_LEN) {
769 /* Should not happen, but... */ 824 /* Should not happen, but... */
770 completed = RCU_TORTURE_PIPE_LEN; 825 completed = RCU_TORTURE_PIPE_LEN;
771 } 826 }
772 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 827 __this_cpu_inc(rcu_torture_batch[completed]);
773 preempt_enable(); 828 preempt_enable();
774 cur_ops->readunlock(idx); 829 cur_ops->readunlock(idx);
775} 830}
@@ -798,11 +853,15 @@ rcu_torture_reader(void *arg)
798 do { 853 do {
799 if (irqreader && cur_ops->irq_capable) { 854 if (irqreader && cur_ops->irq_capable) {
800 if (!timer_pending(&t)) 855 if (!timer_pending(&t))
801 mod_timer(&t, 1); 856 mod_timer(&t, jiffies + 1);
802 } 857 }
803 idx = cur_ops->readlock(); 858 idx = cur_ops->readlock();
804 completed = cur_ops->completed(); 859 completed = cur_ops->completed();
805 p = rcu_dereference(rcu_torture_current); 860 p = rcu_dereference_check(rcu_torture_current,
861 rcu_read_lock_held() ||
862 rcu_read_lock_bh_held() ||
863 rcu_read_lock_sched_held() ||
864 srcu_read_lock_held(&srcu_ctl));
806 if (p == NULL) { 865 if (p == NULL) {
807 /* Wait for rcu_torture_writer to get underway */ 866 /* Wait for rcu_torture_writer to get underway */
808 cur_ops->readunlock(idx); 867 cur_ops->readunlock(idx);
@@ -818,13 +877,13 @@ rcu_torture_reader(void *arg)
818 /* Should not happen, but... */ 877 /* Should not happen, but... */
819 pipe_count = RCU_TORTURE_PIPE_LEN; 878 pipe_count = RCU_TORTURE_PIPE_LEN;
820 } 879 }
821 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 880 __this_cpu_inc(rcu_torture_count[pipe_count]);
822 completed = cur_ops->completed() - completed; 881 completed = cur_ops->completed() - completed;
823 if (completed > RCU_TORTURE_PIPE_LEN) { 882 if (completed > RCU_TORTURE_PIPE_LEN) {
824 /* Should not happen, but... */ 883 /* Should not happen, but... */
825 completed = RCU_TORTURE_PIPE_LEN; 884 completed = RCU_TORTURE_PIPE_LEN;
826 } 885 }
827 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 886 __this_cpu_inc(rcu_torture_batch[completed]);
828 preempt_enable(); 887 preempt_enable();
829 cur_ops->readunlock(idx); 888 cur_ops->readunlock(idx);
830 schedule(); 889 schedule();
@@ -1030,10 +1089,11 @@ rcu_torture_print_module_parms(char *tag)
1030 printk(KERN_ALERT "%s" TORTURE_FLAG 1089 printk(KERN_ALERT "%s" TORTURE_FLAG
1031 "--- %s: nreaders=%d nfakewriters=%d " 1090 "--- %s: nreaders=%d nfakewriters=%d "
1032 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1091 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1033 "shuffle_interval=%d stutter=%d irqreader=%d\n", 1092 "shuffle_interval=%d stutter=%d irqreader=%d "
1093 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
1034 torture_type, tag, nrealreaders, nfakewriters, 1094 torture_type, tag, nrealreaders, nfakewriters,
1035 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1095 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1036 stutter, irqreader); 1096 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
1037} 1097}
1038 1098
1039static struct notifier_block rcutorture_nb = { 1099static struct notifier_block rcutorture_nb = {
@@ -1109,6 +1169,12 @@ rcu_torture_cleanup(void)
1109 } 1169 }
1110 stats_task = NULL; 1170 stats_task = NULL;
1111 1171
1172 if (fqs_task) {
1173 VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
1174 kthread_stop(fqs_task);
1175 }
1176 fqs_task = NULL;
1177
1112 /* Wait for all RCU callbacks to fire. */ 1178 /* Wait for all RCU callbacks to fire. */
1113 1179
1114 if (cur_ops->cb_barrier != NULL) 1180 if (cur_ops->cb_barrier != NULL)
@@ -1154,6 +1220,11 @@ rcu_torture_init(void)
1154 mutex_unlock(&fullstop_mutex); 1220 mutex_unlock(&fullstop_mutex);
1155 return -EINVAL; 1221 return -EINVAL;
1156 } 1222 }
1223 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1224 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
1225 "fqs_duration, fqs disabled.\n");
1226 fqs_duration = 0;
1227 }
1157 if (cur_ops->init) 1228 if (cur_ops->init)
1158 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1229 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
1159 1230
@@ -1282,6 +1353,19 @@ rcu_torture_init(void)
1282 goto unwind; 1353 goto unwind;
1283 } 1354 }
1284 } 1355 }
1356 if (fqs_duration < 0)
1357 fqs_duration = 0;
1358 if (fqs_duration) {
1359 /* Create the stutter thread */
1360 fqs_task = kthread_run(rcu_torture_fqs, NULL,
1361 "rcu_torture_fqs");
1362 if (IS_ERR(fqs_task)) {
1363 firsterr = PTR_ERR(fqs_task);
1364 VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
1365 fqs_task = NULL;
1366 goto unwind;
1367 }
1368 }
1285 register_reboot_notifier(&rcutorture_nb); 1369 register_reboot_notifier(&rcutorture_nb);
1286 mutex_unlock(&fullstop_mutex); 1370 mutex_unlock(&fullstop_mutex);
1287 return 0; 1371 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 53ae9598f798..3ec8160fc75f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,7 +46,6 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
50 49
51#include "rcutree.h" 50#include "rcutree.h"
52 51
@@ -66,11 +65,11 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
66 .signaled = RCU_GP_IDLE, \ 65 .signaled = RCU_GP_IDLE, \
67 .gpnum = -300, \ 66 .gpnum = -300, \
68 .completed = -300, \ 67 .completed = -300, \
69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ 68 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \
70 .orphan_cbs_list = NULL, \ 69 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &name.orphan_cbs_list, \ 70 .orphan_cbs_tail = &name.orphan_cbs_list, \
72 .orphan_qlen = 0, \ 71 .orphan_qlen = 0, \
73 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ 72 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \
74 .n_force_qs = 0, \ 73 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 74 .n_force_qs_ngp = 0, \
76} 75}
@@ -81,9 +80,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 80struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 81DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
83 82
84static int rcu_scheduler_active __read_mostly;
85
86
87/* 83/*
88 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 84 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
89 * permit this function to be invoked without holding the root rcu_node 85 * permit this function to be invoked without holding the root rcu_node
@@ -157,6 +153,24 @@ long rcu_batches_completed_bh(void)
157EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); 153EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
158 154
159/* 155/*
156 * Force a quiescent state for RCU BH.
157 */
158void rcu_bh_force_quiescent_state(void)
159{
160 force_quiescent_state(&rcu_bh_state, 0);
161}
162EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
163
164/*
165 * Force a quiescent state for RCU-sched.
166 */
167void rcu_sched_force_quiescent_state(void)
168{
169 force_quiescent_state(&rcu_sched_state, 0);
170}
171EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
172
173/*
160 * Does the CPU have callbacks ready to be invoked? 174 * Does the CPU have callbacks ready to be invoked?
161 */ 175 */
162static int 176static int
@@ -439,10 +453,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
439 453
440 /* Only let one CPU complain about others per time interval. */ 454 /* Only let one CPU complain about others per time interval. */
441 455
442 spin_lock_irqsave(&rnp->lock, flags); 456 raw_spin_lock_irqsave(&rnp->lock, flags);
443 delta = jiffies - rsp->jiffies_stall; 457 delta = jiffies - rsp->jiffies_stall;
444 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 458 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
445 spin_unlock_irqrestore(&rnp->lock, flags); 459 raw_spin_unlock_irqrestore(&rnp->lock, flags);
446 return; 460 return;
447 } 461 }
448 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 462 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
@@ -452,13 +466,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
452 * due to CPU offlining. 466 * due to CPU offlining.
453 */ 467 */
454 rcu_print_task_stall(rnp); 468 rcu_print_task_stall(rnp);
455 spin_unlock_irqrestore(&rnp->lock, flags); 469 raw_spin_unlock_irqrestore(&rnp->lock, flags);
456 470
457 /* OK, time to rat on our buddy... */ 471 /* OK, time to rat on our buddy... */
458 472
459 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 473 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
460 rcu_for_each_leaf_node(rsp, rnp) { 474 rcu_for_each_leaf_node(rsp, rnp) {
475 raw_spin_lock_irqsave(&rnp->lock, flags);
461 rcu_print_task_stall(rnp); 476 rcu_print_task_stall(rnp);
477 raw_spin_unlock_irqrestore(&rnp->lock, flags);
462 if (rnp->qsmask == 0) 478 if (rnp->qsmask == 0)
463 continue; 479 continue;
464 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 480 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
@@ -469,6 +485,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
469 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 485 smp_processor_id(), (long)(jiffies - rsp->gp_start));
470 trigger_all_cpu_backtrace(); 486 trigger_all_cpu_backtrace();
471 487
488 /* If so configured, complain about tasks blocking the grace period. */
489
490 rcu_print_detail_task_stall(rsp);
491
472 force_quiescent_state(rsp, 0); /* Kick them all. */ 492 force_quiescent_state(rsp, 0); /* Kick them all. */
473} 493}
474 494
@@ -481,11 +501,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
481 smp_processor_id(), jiffies - rsp->gp_start); 501 smp_processor_id(), jiffies - rsp->gp_start);
482 trigger_all_cpu_backtrace(); 502 trigger_all_cpu_backtrace();
483 503
484 spin_lock_irqsave(&rnp->lock, flags); 504 raw_spin_lock_irqsave(&rnp->lock, flags);
485 if ((long)(jiffies - rsp->jiffies_stall) >= 0) 505 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
486 rsp->jiffies_stall = 506 rsp->jiffies_stall =
487 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 507 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
488 spin_unlock_irqrestore(&rnp->lock, flags); 508 raw_spin_unlock_irqrestore(&rnp->lock, flags);
489 509
490 set_need_resched(); /* kick ourselves to get things going. */ 510 set_need_resched(); /* kick ourselves to get things going. */
491} 511}
@@ -545,12 +565,12 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
545 local_irq_save(flags); 565 local_irq_save(flags);
546 rnp = rdp->mynode; 566 rnp = rdp->mynode;
547 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ 567 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
548 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 568 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
549 local_irq_restore(flags); 569 local_irq_restore(flags);
550 return; 570 return;
551 } 571 }
552 __note_new_gpnum(rsp, rnp, rdp); 572 __note_new_gpnum(rsp, rnp, rdp);
553 spin_unlock_irqrestore(&rnp->lock, flags); 573 raw_spin_unlock_irqrestore(&rnp->lock, flags);
554} 574}
555 575
556/* 576/*
@@ -609,12 +629,12 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
609 local_irq_save(flags); 629 local_irq_save(flags);
610 rnp = rdp->mynode; 630 rnp = rdp->mynode;
611 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ 631 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
612 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 632 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
613 local_irq_restore(flags); 633 local_irq_restore(flags);
614 return; 634 return;
615 } 635 }
616 __rcu_process_gp_end(rsp, rnp, rdp); 636 __rcu_process_gp_end(rsp, rnp, rdp);
617 spin_unlock_irqrestore(&rnp->lock, flags); 637 raw_spin_unlock_irqrestore(&rnp->lock, flags);
618} 638}
619 639
620/* 640/*
@@ -659,12 +679,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
659 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 679 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
660 struct rcu_node *rnp = rcu_get_root(rsp); 680 struct rcu_node *rnp = rcu_get_root(rsp);
661 681
662 if (!cpu_needs_another_gp(rsp, rdp)) { 682 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
683 if (cpu_needs_another_gp(rsp, rdp))
684 rsp->fqs_need_gp = 1;
663 if (rnp->completed == rsp->completed) { 685 if (rnp->completed == rsp->completed) {
664 spin_unlock_irqrestore(&rnp->lock, flags); 686 raw_spin_unlock_irqrestore(&rnp->lock, flags);
665 return; 687 return;
666 } 688 }
667 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 689 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
668 690
669 /* 691 /*
670 * Propagate new ->completed value to rcu_node structures 692 * Propagate new ->completed value to rcu_node structures
@@ -672,9 +694,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
672 * of the next grace period to process their callbacks. 694 * of the next grace period to process their callbacks.
673 */ 695 */
674 rcu_for_each_node_breadth_first(rsp, rnp) { 696 rcu_for_each_node_breadth_first(rsp, rnp) {
675 spin_lock(&rnp->lock); /* irqs already disabled. */ 697 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
676 rnp->completed = rsp->completed; 698 rnp->completed = rsp->completed;
677 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 699 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
678 } 700 }
679 local_irq_restore(flags); 701 local_irq_restore(flags);
680 return; 702 return;
@@ -695,15 +717,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
695 rnp->completed = rsp->completed; 717 rnp->completed = rsp->completed;
696 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 718 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
697 rcu_start_gp_per_cpu(rsp, rnp, rdp); 719 rcu_start_gp_per_cpu(rsp, rnp, rdp);
698 spin_unlock_irqrestore(&rnp->lock, flags); 720 raw_spin_unlock_irqrestore(&rnp->lock, flags);
699 return; 721 return;
700 } 722 }
701 723
702 spin_unlock(&rnp->lock); /* leave irqs disabled. */ 724 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
703 725
704 726
705 /* Exclude any concurrent CPU-hotplug operations. */ 727 /* Exclude any concurrent CPU-hotplug operations. */
706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 728 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
707 729
708 /* 730 /*
709 * Set the quiescent-state-needed bits in all the rcu_node 731 * Set the quiescent-state-needed bits in all the rcu_node
@@ -723,21 +745,21 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
723 * irqs disabled. 745 * irqs disabled.
724 */ 746 */
725 rcu_for_each_node_breadth_first(rsp, rnp) { 747 rcu_for_each_node_breadth_first(rsp, rnp) {
726 spin_lock(&rnp->lock); /* irqs already disabled. */ 748 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
727 rcu_preempt_check_blocked_tasks(rnp); 749 rcu_preempt_check_blocked_tasks(rnp);
728 rnp->qsmask = rnp->qsmaskinit; 750 rnp->qsmask = rnp->qsmaskinit;
729 rnp->gpnum = rsp->gpnum; 751 rnp->gpnum = rsp->gpnum;
730 rnp->completed = rsp->completed; 752 rnp->completed = rsp->completed;
731 if (rnp == rdp->mynode) 753 if (rnp == rdp->mynode)
732 rcu_start_gp_per_cpu(rsp, rnp, rdp); 754 rcu_start_gp_per_cpu(rsp, rnp, rdp);
733 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 755 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
734 } 756 }
735 757
736 rnp = rcu_get_root(rsp); 758 rnp = rcu_get_root(rsp);
737 spin_lock(&rnp->lock); /* irqs already disabled. */ 759 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
738 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 760 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
739 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 761 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
740 spin_unlock_irqrestore(&rsp->onofflock, flags); 762 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
741} 763}
742 764
743/* 765/*
@@ -776,14 +798,14 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
776 if (!(rnp->qsmask & mask)) { 798 if (!(rnp->qsmask & mask)) {
777 799
778 /* Our bit has already been cleared, so done. */ 800 /* Our bit has already been cleared, so done. */
779 spin_unlock_irqrestore(&rnp->lock, flags); 801 raw_spin_unlock_irqrestore(&rnp->lock, flags);
780 return; 802 return;
781 } 803 }
782 rnp->qsmask &= ~mask; 804 rnp->qsmask &= ~mask;
783 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 805 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
784 806
785 /* Other bits still set at this level, so done. */ 807 /* Other bits still set at this level, so done. */
786 spin_unlock_irqrestore(&rnp->lock, flags); 808 raw_spin_unlock_irqrestore(&rnp->lock, flags);
787 return; 809 return;
788 } 810 }
789 mask = rnp->grpmask; 811 mask = rnp->grpmask;
@@ -793,10 +815,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
793 815
794 break; 816 break;
795 } 817 }
796 spin_unlock_irqrestore(&rnp->lock, flags); 818 raw_spin_unlock_irqrestore(&rnp->lock, flags);
797 rnp_c = rnp; 819 rnp_c = rnp;
798 rnp = rnp->parent; 820 rnp = rnp->parent;
799 spin_lock_irqsave(&rnp->lock, flags); 821 raw_spin_lock_irqsave(&rnp->lock, flags);
800 WARN_ON_ONCE(rnp_c->qsmask); 822 WARN_ON_ONCE(rnp_c->qsmask);
801 } 823 }
802 824
@@ -825,7 +847,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
825 struct rcu_node *rnp; 847 struct rcu_node *rnp;
826 848
827 rnp = rdp->mynode; 849 rnp = rdp->mynode;
828 spin_lock_irqsave(&rnp->lock, flags); 850 raw_spin_lock_irqsave(&rnp->lock, flags);
829 if (lastcomp != rnp->completed) { 851 if (lastcomp != rnp->completed) {
830 852
831 /* 853 /*
@@ -837,12 +859,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
837 * race occurred. 859 * race occurred.
838 */ 860 */
839 rdp->passed_quiesc = 0; /* try again later! */ 861 rdp->passed_quiesc = 0; /* try again later! */
840 spin_unlock_irqrestore(&rnp->lock, flags); 862 raw_spin_unlock_irqrestore(&rnp->lock, flags);
841 return; 863 return;
842 } 864 }
843 mask = rdp->grpmask; 865 mask = rdp->grpmask;
844 if ((rnp->qsmask & mask) == 0) { 866 if ((rnp->qsmask & mask) == 0) {
845 spin_unlock_irqrestore(&rnp->lock, flags); 867 raw_spin_unlock_irqrestore(&rnp->lock, flags);
846 } else { 868 } else {
847 rdp->qs_pending = 0; 869 rdp->qs_pending = 0;
848 870
@@ -906,7 +928,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
906 928
907 if (rdp->nxtlist == NULL) 929 if (rdp->nxtlist == NULL)
908 return; /* irqs disabled, so comparison is stable. */ 930 return; /* irqs disabled, so comparison is stable. */
909 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 931 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
910 *rsp->orphan_cbs_tail = rdp->nxtlist; 932 *rsp->orphan_cbs_tail = rdp->nxtlist;
911 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 933 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
912 rdp->nxtlist = NULL; 934 rdp->nxtlist = NULL;
@@ -914,7 +936,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
914 rdp->nxttail[i] = &rdp->nxtlist; 936 rdp->nxttail[i] = &rdp->nxtlist;
915 rsp->orphan_qlen += rdp->qlen; 937 rsp->orphan_qlen += rdp->qlen;
916 rdp->qlen = 0; 938 rdp->qlen = 0;
917 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 939 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
918} 940}
919 941
920/* 942/*
@@ -925,10 +947,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
925 unsigned long flags; 947 unsigned long flags;
926 struct rcu_data *rdp; 948 struct rcu_data *rdp;
927 949
928 spin_lock_irqsave(&rsp->onofflock, flags); 950 raw_spin_lock_irqsave(&rsp->onofflock, flags);
929 rdp = rsp->rda[smp_processor_id()]; 951 rdp = rsp->rda[smp_processor_id()];
930 if (rsp->orphan_cbs_list == NULL) { 952 if (rsp->orphan_cbs_list == NULL) {
931 spin_unlock_irqrestore(&rsp->onofflock, flags); 953 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
932 return; 954 return;
933 } 955 }
934 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; 956 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
@@ -937,7 +959,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
937 rsp->orphan_cbs_list = NULL; 959 rsp->orphan_cbs_list = NULL;
938 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; 960 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
939 rsp->orphan_qlen = 0; 961 rsp->orphan_qlen = 0;
940 spin_unlock_irqrestore(&rsp->onofflock, flags); 962 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
941} 963}
942 964
943/* 965/*
@@ -953,23 +975,23 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
953 struct rcu_node *rnp; 975 struct rcu_node *rnp;
954 976
955 /* Exclude any attempts to start a new grace period. */ 977 /* Exclude any attempts to start a new grace period. */
956 spin_lock_irqsave(&rsp->onofflock, flags); 978 raw_spin_lock_irqsave(&rsp->onofflock, flags);
957 979
958 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 980 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
959 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ 981 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
960 mask = rdp->grpmask; /* rnp->grplo is constant. */ 982 mask = rdp->grpmask; /* rnp->grplo is constant. */
961 do { 983 do {
962 spin_lock(&rnp->lock); /* irqs already disabled. */ 984 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
963 rnp->qsmaskinit &= ~mask; 985 rnp->qsmaskinit &= ~mask;
964 if (rnp->qsmaskinit != 0) { 986 if (rnp->qsmaskinit != 0) {
965 if (rnp != rdp->mynode) 987 if (rnp != rdp->mynode)
966 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 988 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
967 break; 989 break;
968 } 990 }
969 if (rnp == rdp->mynode) 991 if (rnp == rdp->mynode)
970 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 992 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
971 else 993 else
972 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 994 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
973 mask = rnp->grpmask; 995 mask = rnp->grpmask;
974 rnp = rnp->parent; 996 rnp = rnp->parent;
975 } while (rnp != NULL); 997 } while (rnp != NULL);
@@ -980,12 +1002,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
980 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock 1002 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
981 * held leads to deadlock. 1003 * held leads to deadlock.
982 */ 1004 */
983 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1005 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
984 rnp = rdp->mynode; 1006 rnp = rdp->mynode;
985 if (need_report & RCU_OFL_TASKS_NORM_GP) 1007 if (need_report & RCU_OFL_TASKS_NORM_GP)
986 rcu_report_unblock_qs_rnp(rnp, flags); 1008 rcu_report_unblock_qs_rnp(rnp, flags);
987 else 1009 else
988 spin_unlock_irqrestore(&rnp->lock, flags); 1010 raw_spin_unlock_irqrestore(&rnp->lock, flags);
989 if (need_report & RCU_OFL_TASKS_EXP_GP) 1011 if (need_report & RCU_OFL_TASKS_EXP_GP)
990 rcu_report_exp_rnp(rsp, rnp); 1012 rcu_report_exp_rnp(rsp, rnp);
991 1013
@@ -1144,11 +1166,9 @@ void rcu_check_callbacks(int cpu, int user)
1144/* 1166/*
1145 * Scan the leaf rcu_node structures, processing dyntick state for any that 1167 * Scan the leaf rcu_node structures, processing dyntick state for any that
1146 * have not yet encountered a quiescent state, using the function specified. 1168 * have not yet encountered a quiescent state, using the function specified.
1147 * Returns 1 if the current grace period ends while scanning (possibly 1169 * The caller must have suppressed start of new grace periods.
1148 * because we made it end).
1149 */ 1170 */
1150static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, 1171static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1151 int (*f)(struct rcu_data *))
1152{ 1172{
1153 unsigned long bit; 1173 unsigned long bit;
1154 int cpu; 1174 int cpu;
@@ -1158,13 +1178,13 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1158 1178
1159 rcu_for_each_leaf_node(rsp, rnp) { 1179 rcu_for_each_leaf_node(rsp, rnp) {
1160 mask = 0; 1180 mask = 0;
1161 spin_lock_irqsave(&rnp->lock, flags); 1181 raw_spin_lock_irqsave(&rnp->lock, flags);
1162 if (rnp->completed != lastcomp) { 1182 if (!rcu_gp_in_progress(rsp)) {
1163 spin_unlock_irqrestore(&rnp->lock, flags); 1183 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1164 return 1; 1184 return;
1165 } 1185 }
1166 if (rnp->qsmask == 0) { 1186 if (rnp->qsmask == 0) {
1167 spin_unlock_irqrestore(&rnp->lock, flags); 1187 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1168 continue; 1188 continue;
1169 } 1189 }
1170 cpu = rnp->grplo; 1190 cpu = rnp->grplo;
@@ -1173,15 +1193,14 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1173 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1193 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1174 mask |= bit; 1194 mask |= bit;
1175 } 1195 }
1176 if (mask != 0 && rnp->completed == lastcomp) { 1196 if (mask != 0) {
1177 1197
1178 /* rcu_report_qs_rnp() releases rnp->lock. */ 1198 /* rcu_report_qs_rnp() releases rnp->lock. */
1179 rcu_report_qs_rnp(mask, rsp, rnp, flags); 1199 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1180 continue; 1200 continue;
1181 } 1201 }
1182 spin_unlock_irqrestore(&rnp->lock, flags); 1202 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1183 } 1203 }
1184 return 0;
1185} 1204}
1186 1205
1187/* 1206/*
@@ -1191,32 +1210,26 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1191static void force_quiescent_state(struct rcu_state *rsp, int relaxed) 1210static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1192{ 1211{
1193 unsigned long flags; 1212 unsigned long flags;
1194 long lastcomp;
1195 struct rcu_node *rnp = rcu_get_root(rsp); 1213 struct rcu_node *rnp = rcu_get_root(rsp);
1196 u8 signaled;
1197 u8 forcenow;
1198 1214
1199 if (!rcu_gp_in_progress(rsp)) 1215 if (!rcu_gp_in_progress(rsp))
1200 return; /* No grace period in progress, nothing to force. */ 1216 return; /* No grace period in progress, nothing to force. */
1201 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { 1217 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1202 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1218 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1203 return; /* Someone else is already on the job. */ 1219 return; /* Someone else is already on the job. */
1204 } 1220 }
1205 if (relaxed && 1221 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
1206 (long)(rsp->jiffies_force_qs - jiffies) >= 0) 1222 goto unlock_fqs_ret; /* no emergency and done recently. */
1207 goto unlock_ret; /* no emergency and done recently. */
1208 rsp->n_force_qs++; 1223 rsp->n_force_qs++;
1209 spin_lock(&rnp->lock); 1224 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1210 lastcomp = rsp->gpnum - 1;
1211 signaled = rsp->signaled;
1212 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1225 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1213 if(!rcu_gp_in_progress(rsp)) { 1226 if(!rcu_gp_in_progress(rsp)) {
1214 rsp->n_force_qs_ngp++; 1227 rsp->n_force_qs_ngp++;
1215 spin_unlock(&rnp->lock); 1228 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1216 goto unlock_ret; /* no GP in progress, time updated. */ 1229 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1217 } 1230 }
1218 spin_unlock(&rnp->lock); 1231 rsp->fqs_active = 1;
1219 switch (signaled) { 1232 switch (rsp->signaled) {
1220 case RCU_GP_IDLE: 1233 case RCU_GP_IDLE:
1221 case RCU_GP_INIT: 1234 case RCU_GP_INIT:
1222 1235
@@ -1224,45 +1237,38 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1224 1237
1225 case RCU_SAVE_DYNTICK: 1238 case RCU_SAVE_DYNTICK:
1226 1239
1240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1227 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1241 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1228 break; /* So gcc recognizes the dead code. */ 1242 break; /* So gcc recognizes the dead code. */
1229 1243
1230 /* Record dyntick-idle state. */ 1244 /* Record dyntick-idle state. */
1231 if (rcu_process_dyntick(rsp, lastcomp, 1245 force_qs_rnp(rsp, dyntick_save_progress_counter);
1232 dyntick_save_progress_counter)) 1246 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1233 goto unlock_ret; 1247 if (rcu_gp_in_progress(rsp))
1234 /* fall into next case. */
1235
1236 case RCU_SAVE_COMPLETED:
1237
1238 /* Update state, record completion counter. */
1239 forcenow = 0;
1240 spin_lock(&rnp->lock);
1241 if (lastcomp + 1 == rsp->gpnum &&
1242 lastcomp == rsp->completed &&
1243 rsp->signaled == signaled) {
1244 rsp->signaled = RCU_FORCE_QS; 1248 rsp->signaled = RCU_FORCE_QS;
1245 rsp->completed_fqs = lastcomp; 1249 break;
1246 forcenow = signaled == RCU_SAVE_COMPLETED;
1247 }
1248 spin_unlock(&rnp->lock);
1249 if (!forcenow)
1250 break;
1251 /* fall into next case. */
1252 1250
1253 case RCU_FORCE_QS: 1251 case RCU_FORCE_QS:
1254 1252
1255 /* Check dyntick-idle state, send IPI to laggarts. */ 1253 /* Check dyntick-idle state, send IPI to laggarts. */
1256 if (rcu_process_dyntick(rsp, rsp->completed_fqs, 1254 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1257 rcu_implicit_dynticks_qs)) 1255 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1258 goto unlock_ret;
1259 1256
1260 /* Leave state in case more forcing is required. */ 1257 /* Leave state in case more forcing is required. */
1261 1258
1259 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1262 break; 1260 break;
1263 } 1261 }
1264unlock_ret: 1262 rsp->fqs_active = 0;
1265 spin_unlock_irqrestore(&rsp->fqslock, flags); 1263 if (rsp->fqs_need_gp) {
1264 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1265 rsp->fqs_need_gp = 0;
1266 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1267 return;
1268 }
1269 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1270unlock_fqs_ret:
1271 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1266} 1272}
1267 1273
1268#else /* #ifdef CONFIG_SMP */ 1274#else /* #ifdef CONFIG_SMP */
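
The rework above holds the root rcu_node lock around force_quiescent_state()'s state transitions (dropping it only for the leaf-node scans) and replaces the old lastcomp/signaled snapshots with a small handshake: while ->fqs_active is set, rcu_start_gp() does not initialize a new grace period itself but records the request in ->fqs_need_gp, and force_quiescent_state() honors that request on its way out. A condensed stand-alone sketch of the handshake, with the locking and the per-state work elided and only the field names taken from the patch:

struct fqs_state {
	int fqs_active;		/* force_quiescent_state() is running. */
	int fqs_need_gp;	/* A grace-period start was deferred to it. */
};

static void start_gp(struct fqs_state *s, int need_gp)
{
	if (s->fqs_active) {			/* forcing in progress owns the root lock... */
		if (need_gp)
			s->fqs_need_gp = 1;	/* ...so just leave it a note. */
		return;
	}
	/* ...otherwise initialize the new grace period here. */
}

static void force_qs(struct fqs_state *s)
{
	s->fqs_active = 1;
	/* ...run the RCU_SAVE_DYNTICK / RCU_FORCE_QS passes here... */
	s->fqs_active = 0;
	if (s->fqs_need_gp) {
		s->fqs_need_gp = 0;
		start_gp(s, 1);		/* honor the deferred grace-period request. */
	}
}
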
@@ -1290,7 +1296,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1290 * If an RCU GP has gone long enough, go check for dyntick 1296 * If an RCU GP has gone long enough, go check for dyntick
1291 * idle CPUs and, if needed, send resched IPIs. 1297 * idle CPUs and, if needed, send resched IPIs.
1292 */ 1298 */
1293 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1299 if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1294 force_quiescent_state(rsp, 1); 1300 force_quiescent_state(rsp, 1);
1295 1301
1296 /* 1302 /*
@@ -1304,7 +1310,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1304 1310
1305 /* Does this CPU require a not-yet-started grace period? */ 1311 /* Does this CPU require a not-yet-started grace period? */
1306 if (cpu_needs_another_gp(rsp, rdp)) { 1312 if (cpu_needs_another_gp(rsp, rdp)) {
1307 spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); 1313 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1308 rcu_start_gp(rsp, flags); /* releases above lock */ 1314 rcu_start_gp(rsp, flags); /* releases above lock */
1309 } 1315 }
1310 1316
@@ -1335,6 +1341,9 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1335 * grace-period manipulations above. 1341 * grace-period manipulations above.
1336 */ 1342 */
1337 smp_mb(); /* See above block comment. */ 1343 smp_mb(); /* See above block comment. */
1344
1345 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1346 rcu_needs_cpu_flush();
1338} 1347}
1339 1348
1340static void 1349static void
@@ -1369,7 +1378,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1369 unsigned long nestflag; 1378 unsigned long nestflag;
1370 struct rcu_node *rnp_root = rcu_get_root(rsp); 1379 struct rcu_node *rnp_root = rcu_get_root(rsp);
1371 1380
1372 spin_lock_irqsave(&rnp_root->lock, nestflag); 1381 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1373 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ 1382 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1374 } 1383 }
1375 1384
@@ -1387,7 +1396,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1387 force_quiescent_state(rsp, 0); 1396 force_quiescent_state(rsp, 0);
1388 rdp->n_force_qs_snap = rsp->n_force_qs; 1397 rdp->n_force_qs_snap = rsp->n_force_qs;
1389 rdp->qlen_last_fqs_check = rdp->qlen; 1398 rdp->qlen_last_fqs_check = rdp->qlen;
1390 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1399 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1391 force_quiescent_state(rsp, 1); 1400 force_quiescent_state(rsp, 1);
1392 local_irq_restore(flags); 1401 local_irq_restore(flags);
1393} 1402}
@@ -1520,7 +1529,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1520 1529
1521 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1530 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1522 if (rcu_gp_in_progress(rsp) && 1531 if (rcu_gp_in_progress(rsp) &&
1523 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { 1532 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
1524 rdp->n_rp_need_fqs++; 1533 rdp->n_rp_need_fqs++;
1525 return 1; 1534 return 1;
1526 } 1535 }
@@ -1545,10 +1554,9 @@ static int rcu_pending(int cpu)
1545/* 1554/*
1546 * Check to see if any future RCU-related work will need to be done 1555 * Check to see if any future RCU-related work will need to be done
1547 * by the current CPU, even if none need be done immediately, returning 1556 * by the current CPU, even if none need be done immediately, returning
1548 * 1 if so. This function is part of the RCU implementation; it is -not- 1557 * 1 if so.
1549 * an exported member of the RCU API.
1550 */ 1558 */
1551int rcu_needs_cpu(int cpu) 1559static int rcu_needs_cpu_quick_check(int cpu)
1552{ 1560{
1553 /* RCU callbacks either ready or pending? */ 1561 /* RCU callbacks either ready or pending? */
1554 return per_cpu(rcu_sched_data, cpu).nxtlist || 1562 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1556,21 +1564,6 @@ int rcu_needs_cpu(int cpu)
1556 rcu_preempt_needs_cpu(cpu); 1564 rcu_preempt_needs_cpu(cpu);
1557} 1565}
1558 1566
1559/*
1560 * This function is invoked towards the end of the scheduler's initialization
1561 * process. Before this is called, the idle task might contain
1562 * RCU read-side critical sections (during which time, this idle
1563 * task is booting the system). After this function is called, the
1564 * idle tasks are prohibited from containing RCU read-side critical
1565 * sections.
1566 */
1567void rcu_scheduler_starting(void)
1568{
1569 WARN_ON(num_online_cpus() != 1);
1570 WARN_ON(nr_context_switches() > 0);
1571 rcu_scheduler_active = 1;
1572}
1573
1574static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 1567static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1575static atomic_t rcu_barrier_cpu_count; 1568static atomic_t rcu_barrier_cpu_count;
1576static DEFINE_MUTEX(rcu_barrier_mutex); 1569static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -1659,7 +1652,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1659 struct rcu_node *rnp = rcu_get_root(rsp); 1652 struct rcu_node *rnp = rcu_get_root(rsp);
1660 1653
1661 /* Set up local state, ensuring consistent view of global state. */ 1654 /* Set up local state, ensuring consistent view of global state. */
1662 spin_lock_irqsave(&rnp->lock, flags); 1655 raw_spin_lock_irqsave(&rnp->lock, flags);
1663 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 1656 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1664 rdp->nxtlist = NULL; 1657 rdp->nxtlist = NULL;
1665 for (i = 0; i < RCU_NEXT_SIZE; i++) 1658 for (i = 0; i < RCU_NEXT_SIZE; i++)
@@ -1669,7 +1662,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1669 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 1662 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1670#endif /* #ifdef CONFIG_NO_HZ */ 1663#endif /* #ifdef CONFIG_NO_HZ */
1671 rdp->cpu = cpu; 1664 rdp->cpu = cpu;
1672 spin_unlock_irqrestore(&rnp->lock, flags); 1665 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1673} 1666}
1674 1667
1675/* 1668/*
@@ -1687,7 +1680,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1687 struct rcu_node *rnp = rcu_get_root(rsp); 1680 struct rcu_node *rnp = rcu_get_root(rsp);
1688 1681
1689 /* Set up local state, ensuring consistent view of global state. */ 1682 /* Set up local state, ensuring consistent view of global state. */
1690 spin_lock_irqsave(&rnp->lock, flags); 1683 raw_spin_lock_irqsave(&rnp->lock, flags);
1691 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1684 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1692 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1685 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1693 rdp->beenonline = 1; /* We have now been online. */ 1686 rdp->beenonline = 1; /* We have now been online. */
@@ -1695,7 +1688,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1695 rdp->qlen_last_fqs_check = 0; 1688 rdp->qlen_last_fqs_check = 0;
1696 rdp->n_force_qs_snap = rsp->n_force_qs; 1689 rdp->n_force_qs_snap = rsp->n_force_qs;
1697 rdp->blimit = blimit; 1690 rdp->blimit = blimit;
1698 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1691 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1699 1692
1700 /* 1693 /*
1701 * A new grace period might start here. If so, we won't be part 1694 * A new grace period might start here. If so, we won't be part
@@ -1703,14 +1696,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1703 */ 1696 */
1704 1697
1705 /* Exclude any attempts to start a new GP on large systems. */ 1698 /* Exclude any attempts to start a new GP on large systems. */
1706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1699 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1707 1700
1708 /* Add CPU to rcu_node bitmasks. */ 1701 /* Add CPU to rcu_node bitmasks. */
1709 rnp = rdp->mynode; 1702 rnp = rdp->mynode;
1710 mask = rdp->grpmask; 1703 mask = rdp->grpmask;
1711 do { 1704 do {
1712 /* Exclude any attempts to start a new GP on small systems. */ 1705 /* Exclude any attempts to start a new GP on small systems. */
1713 spin_lock(&rnp->lock); /* irqs already disabled. */ 1706 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1714 rnp->qsmaskinit |= mask; 1707 rnp->qsmaskinit |= mask;
1715 mask = rnp->grpmask; 1708 mask = rnp->grpmask;
1716 if (rnp == rdp->mynode) { 1709 if (rnp == rdp->mynode) {
@@ -1718,11 +1711,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1718 rdp->completed = rnp->completed; 1711 rdp->completed = rnp->completed;
1719 rdp->passed_quiesc_completed = rnp->completed - 1; 1712 rdp->passed_quiesc_completed = rnp->completed - 1;
1720 } 1713 }
1721 spin_unlock(&rnp->lock); /* irqs already disabled. */ 1714 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
1722 rnp = rnp->parent; 1715 rnp = rnp->parent;
1723 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1716 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1724 1717
1725 spin_unlock_irqrestore(&rsp->onofflock, flags); 1718 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1726} 1719}
1727 1720
1728static void __cpuinit rcu_online_cpu(int cpu) 1721static void __cpuinit rcu_online_cpu(int cpu)
@@ -1806,11 +1799,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1806 */ 1799 */
1807static void __init rcu_init_one(struct rcu_state *rsp) 1800static void __init rcu_init_one(struct rcu_state *rsp)
1808{ 1801{
1802 static char *buf[] = { "rcu_node_level_0",
1803 "rcu_node_level_1",
1804 "rcu_node_level_2",
1805 "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */
1809 int cpustride = 1; 1806 int cpustride = 1;
1810 int i; 1807 int i;
1811 int j; 1808 int j;
1812 struct rcu_node *rnp; 1809 struct rcu_node *rnp;
1813 1810
1811 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
1812
1814 /* Initialize the level-tracking arrays. */ 1813 /* Initialize the level-tracking arrays. */
1815 1814
1816 for (i = 1; i < NUM_RCU_LVLS; i++) 1815 for (i = 1; i < NUM_RCU_LVLS; i++)
@@ -1823,8 +1822,9 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1823 cpustride *= rsp->levelspread[i]; 1822 cpustride *= rsp->levelspread[i];
1824 rnp = rsp->level[i]; 1823 rnp = rsp->level[i];
1825 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1824 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1826 spin_lock_init(&rnp->lock); 1825 raw_spin_lock_init(&rnp->lock);
1827 lockdep_set_class(&rnp->lock, &rcu_node_class[i]); 1826 lockdep_set_class_and_name(&rnp->lock,
1827 &rcu_node_class[i], buf[i]);
1828 rnp->gpnum = 0; 1828 rnp->gpnum = 0;
1829 rnp->qsmask = 0; 1829 rnp->qsmask = 0;
1830 rnp->qsmaskinit = 0; 1830 rnp->qsmaskinit = 0;
@@ -1876,7 +1876,7 @@ do { \
1876 1876
1877void __init rcu_init(void) 1877void __init rcu_init(void)
1878{ 1878{
1879 int i; 1879 int cpu;
1880 1880
1881 rcu_bootup_announce(); 1881 rcu_bootup_announce();
1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
@@ -1896,8 +1896,8 @@ void __init rcu_init(void)
1896 * or the scheduler are operational. 1896 * or the scheduler are operational.
1897 */ 1897 */
1898 cpu_notifier(rcu_cpu_notify, 0); 1898 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(i) 1899 for_each_online_cpu(cpu)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); 1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1901} 1901}
1902 1902
1903#include "rcutree_plugin.h" 1903#include "rcutree_plugin.h"
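
Most of the mechanical churn in the rcutree.c hunks above converts the rcu_node and rcu_state locks from spinlock_t to raw_spinlock_t, the variant that always remains a spinning lock even under configurations (such as the -rt patch set) where ordinary spinlocks can become sleeping locks; that is the usual rationale for such a conversion rather than anything stated in the diff itself. A minimal sketch of the raw API the patch switches to, with a hypothetical lock name:

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);	/* hypothetical, stands in for rnp->lock */

static void example_critical_section(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&example_lock, flags);
	/* ...work that must not sleep and must exclude local interrupts... */
	raw_spin_unlock_irqrestore(&example_lock, flags);
}
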
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index d2a0046f63b2..4a525a30e08e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -90,12 +90,12 @@ struct rcu_dynticks {
90 * Definition for node within the RCU grace-period-detection hierarchy. 90 * Definition for node within the RCU grace-period-detection hierarchy.
91 */ 91 */
92struct rcu_node { 92struct rcu_node {
93 spinlock_t lock; /* Root rcu_node's lock protects some */ 93 raw_spinlock_t lock; /* Root rcu_node's lock protects some */
94 /* rcu_state fields as well as following. */ 94 /* rcu_state fields as well as following. */
95 long gpnum; /* Current grace period for this node. */ 95 unsigned long gpnum; /* Current grace period for this node. */
96 /* This will either be equal to or one */ 96 /* This will either be equal to or one */
97 /* behind the root rcu_node's gpnum. */ 97 /* behind the root rcu_node's gpnum. */
98 long completed; /* Last grace period completed for this node. */ 98 unsigned long completed; /* Last GP completed for this node. */
99 /* This will either be equal to or one */ 99 /* This will either be equal to or one */
100 /* behind the root rcu_node's gpnum. */ 100 /* behind the root rcu_node's gpnum. */
101 unsigned long qsmask; /* CPUs or groups that need to switch in */ 101 unsigned long qsmask; /* CPUs or groups that need to switch in */
@@ -161,11 +161,11 @@ struct rcu_node {
161/* Per-CPU data for read-copy update. */ 161/* Per-CPU data for read-copy update. */
162struct rcu_data { 162struct rcu_data {
163 /* 1) quiescent-state and grace-period handling : */ 163 /* 1) quiescent-state and grace-period handling : */
164 long completed; /* Track rsp->completed gp number */ 164 unsigned long completed; /* Track rsp->completed gp number */
165 /* in order to detect GP end. */ 165 /* in order to detect GP end. */
166 long gpnum; /* Highest gp number that this CPU */ 166 unsigned long gpnum; /* Highest gp number that this CPU */
167 /* is aware of having started. */ 167 /* is aware of having started. */
168 long passed_quiesc_completed; 168 unsigned long passed_quiesc_completed;
169 /* Value of completed at time of qs. */ 169 /* Value of completed at time of qs. */
170 bool passed_quiesc; /* User-mode/idle loop etc. */ 170 bool passed_quiesc; /* User-mode/idle loop etc. */
171 bool qs_pending; /* Core waits for quiesc state. */ 171 bool qs_pending; /* Core waits for quiesc state. */
@@ -221,14 +221,14 @@ struct rcu_data {
221 unsigned long resched_ipi; /* Sent a resched IPI. */ 221 unsigned long resched_ipi; /* Sent a resched IPI. */
222 222
223 /* 5) __rcu_pending() statistics. */ 223 /* 5) __rcu_pending() statistics. */
224 long n_rcu_pending; /* rcu_pending() calls since boot. */ 224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
225 long n_rp_qs_pending; 225 unsigned long n_rp_qs_pending;
226 long n_rp_cb_ready; 226 unsigned long n_rp_cb_ready;
227 long n_rp_cpu_needs_gp; 227 unsigned long n_rp_cpu_needs_gp;
228 long n_rp_gp_completed; 228 unsigned long n_rp_gp_completed;
229 long n_rp_gp_started; 229 unsigned long n_rp_gp_started;
230 long n_rp_need_fqs; 230 unsigned long n_rp_need_fqs;
231 long n_rp_need_nothing; 231 unsigned long n_rp_need_nothing;
232 232
233 int cpu; 233 int cpu;
234}; 234};
@@ -237,25 +237,36 @@ struct rcu_data {
237#define RCU_GP_IDLE 0 /* No grace period in progress. */ 237#define RCU_GP_IDLE 0 /* No grace period in progress. */
238#define RCU_GP_INIT 1 /* Grace period being initialized. */ 238#define RCU_GP_INIT 1 /* Grace period being initialized. */
239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
240#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */ 240#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
241#define RCU_FORCE_QS 4 /* Need to force quiescent state. */
242#ifdef CONFIG_NO_HZ 241#ifdef CONFIG_NO_HZ
243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 242#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
244#else /* #ifdef CONFIG_NO_HZ */ 243#else /* #ifdef CONFIG_NO_HZ */
245#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED 244#define RCU_SIGNAL_INIT RCU_FORCE_QS
246#endif /* #else #ifdef CONFIG_NO_HZ */ 245#endif /* #else #ifdef CONFIG_NO_HZ */
247 246
248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 247#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
249#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 248#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
250#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ 249
251#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ 250#ifdef CONFIG_PROVE_RCU
252#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 251#define RCU_STALL_DELAY_DELTA (5 * HZ)
253 /* to take at least one */ 252#else
254 /* scheduling clock irq */ 253#define RCU_STALL_DELAY_DELTA 0
255 /* before ratting on them. */ 254#endif
255
256#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA)
257 /* for rsp->jiffies_stall */
258#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
259 /* for rsp->jiffies_stall */
260#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
261 /* to take at least one */
262 /* scheduling clock irq */
263 /* before ratting on them. */
256 264
257#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 265#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
258 266
267#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
268#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
269
259/* 270/*
260 * RCU global state, including node hierarchy. This hierarchy is 271 * RCU global state, including node hierarchy. This hierarchy is
261 * represented in "heap" form in a dense array. The root (first level) 272 * represented in "heap" form in a dense array. The root (first level)
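
The ULONG_CMP_GE()/ULONG_CMP_LT() macros added above are the wraparound-safe comparisons that replace the old (long)(a - b) casts on jiffies and grace-period counters: the unsigned subtraction is taken modulo 2^BITS_PER_LONG and compared against ULONG_MAX / 2, so the ordering stays correct across a counter wrap as long as the two values are within half the counter space of each other. A small user-space illustration (not kernel code) of that behavior:

#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long before_wrap = ULONG_MAX - 1;	/* e.g. jiffies just before the wrap */
	unsigned long after_wrap = 3;			/* a few ticks later, after the wrap */

	/* A naive comparison gets the ordering backwards across the wrap... */
	printf("naive >=           : %d\n", after_wrap >= before_wrap);			/* prints 0 */
	/* ...while the modular comparisons still order the values correctly. */
	printf("ULONG_CMP_GE(a, b) : %d\n", ULONG_CMP_GE(after_wrap, before_wrap));	/* prints 1 */
	printf("ULONG_CMP_LT(b, a) : %d\n", ULONG_CMP_LT(before_wrap, after_wrap));	/* prints 1 */
	return 0;
}
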
@@ -277,12 +288,19 @@ struct rcu_state {
277 288
278 u8 signaled ____cacheline_internodealigned_in_smp; 289 u8 signaled ____cacheline_internodealigned_in_smp;
279 /* Force QS state. */ 290 /* Force QS state. */
280 long gpnum; /* Current gp number. */ 291 u8 fqs_active; /* force_quiescent_state() */
281 long completed; /* # of last completed gp. */ 292 /* is running. */
293 u8 fqs_need_gp; /* A CPU was prevented from */
294 /* starting a new grace */
295 /* period because */
296 /* force_quiescent_state() */
297 /* was running. */
298 unsigned long gpnum; /* Current gp number. */
299 unsigned long completed; /* # of last completed gp. */
282 300
283 /* End of fields guarded by root rcu_node's lock. */ 301 /* End of fields guarded by root rcu_node's lock. */
284 302
285 spinlock_t onofflock; /* exclude on/offline and */ 303 raw_spinlock_t onofflock; /* exclude on/offline and */
286 /* starting new GP. Also */ 304 /* starting new GP. Also */
287 /* protects the following */ 305 /* protects the following */
288 /* orphan_cbs fields. */ 306 /* orphan_cbs fields. */
@@ -292,10 +310,8 @@ struct rcu_state {
292 /* going offline. */ 310 /* going offline. */
293 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ 311 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
294 long orphan_qlen; /* Number of orphaned cbs. */ 312 long orphan_qlen; /* Number of orphaned cbs. */
295 spinlock_t fqslock; /* Only one task forcing */ 313 raw_spinlock_t fqslock; /* Only one task forcing */
296 /* quiescent states. */ 314 /* quiescent states. */
297 long completed_fqs; /* Value of completed @ snap. */
298 /* Protected by fqslock. */
299 unsigned long jiffies_force_qs; /* Time at which to invoke */ 315 unsigned long jiffies_force_qs; /* Time at which to invoke */
300 /* force_quiescent_state(). */ 316 /* force_quiescent_state(). */
301 unsigned long n_force_qs; /* Number of calls to */ 317 unsigned long n_force_qs; /* Number of calls to */
@@ -319,8 +335,6 @@ struct rcu_state {
319#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ 335#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
320 /* GP were moved to root. */ 336 /* GP were moved to root. */
321 337
322#ifdef RCU_TREE_NONCORE
323
324/* 338/*
325 * RCU implementation internal declarations: 339 * RCU implementation internal declarations:
326 */ 340 */
@@ -335,7 +349,7 @@ extern struct rcu_state rcu_preempt_state;
335DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 349DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
336#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 350#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
337 351
338#else /* #ifdef RCU_TREE_NONCORE */ 352#ifndef RCU_TREE_NONCORE
339 353
340/* Forward declarations for rcutree_plugin.h */ 354/* Forward declarations for rcutree_plugin.h */
341static void rcu_bootup_announce(void); 355static void rcu_bootup_announce(void);
@@ -347,6 +361,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
347 unsigned long flags); 361 unsigned long flags);
348#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 362#endif /* #ifdef CONFIG_HOTPLUG_CPU */
349#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 363#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
364static void rcu_print_detail_task_stall(struct rcu_state *rsp);
350static void rcu_print_task_stall(struct rcu_node *rnp); 365static void rcu_print_task_stall(struct rcu_node *rnp);
351#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 366#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
352static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 367static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
@@ -367,5 +382,6 @@ static int rcu_preempt_needs_cpu(int cpu);
367static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 382static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
368static void rcu_preempt_send_cbs_to_orphanage(void); 383static void rcu_preempt_send_cbs_to_orphanage(void);
369static void __init __rcu_init_preempt(void); 384static void __init __rcu_init_preempt(void);
385static void rcu_needs_cpu_flush(void);
370 386
371#endif /* #else #ifdef RCU_TREE_NONCORE */ 387#endif /* #ifndef RCU_TREE_NONCORE */
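
With RCU_SAVE_COMPLETED gone, the ->signaled field defined above moves through a shorter progression: RCU_GP_IDLE and RCU_GP_INIT while there is nothing to force, RCU_SAVE_DYNTICK for the first forcing pass of a grace period, and RCU_FORCE_QS for every later pass (and RCU_SIGNAL_INIT maps straight to RCU_FORCE_QS when CONFIG_NO_HZ is off, since there is no dyntick state to snapshot). A condensed sketch of that progression, reusing the state names above but none of the real locking:

enum fqs_stage {
	RCU_GP_IDLE = 0,	/* No grace period in progress. */
	RCU_GP_INIT = 1,	/* Grace period being initialized. */
	RCU_SAVE_DYNTICK = 2,	/* Need to scan dyntick state. */
	RCU_FORCE_QS = 3,	/* Need to force quiescent state. */
};

static void force_qs_step(enum fqs_stage *signaled)
{
	switch (*signaled) {
	case RCU_GP_IDLE:
	case RCU_GP_INIT:
		break;			/* grace period idle or still initializing */
	case RCU_SAVE_DYNTICK:
		/* first pass: snapshot each CPU's dyntick-idle counters... */
		*signaled = RCU_FORCE_QS;
		break;
	case RCU_FORCE_QS:
		/* later passes: recheck the snapshots, resched the holdouts... */
		break;			/* stay here until the grace period completes */
	}
}
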
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 37fbccdf41d5..79b53bda8943 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -62,6 +62,15 @@ long rcu_batches_completed(void)
62EXPORT_SYMBOL_GPL(rcu_batches_completed); 62EXPORT_SYMBOL_GPL(rcu_batches_completed);
63 63
64/* 64/*
65 * Force a quiescent state for preemptible RCU.
66 */
67void rcu_force_quiescent_state(void)
68{
69 force_quiescent_state(&rcu_preempt_state, 0);
70}
71EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
72
73/*
65 * Record a preemptable-RCU quiescent state for the specified CPU. Note 74 * Record a preemptable-RCU quiescent state for the specified CPU. Note
66 * that this just means that the task currently running on the CPU is 75 * that this just means that the task currently running on the CPU is
67 * not in a quiescent state. There might be any number of tasks blocked 76 * not in a quiescent state. There might be any number of tasks blocked
@@ -102,7 +111,7 @@ static void rcu_preempt_note_context_switch(int cpu)
102 /* Possibly blocking in an RCU read-side critical section. */ 111 /* Possibly blocking in an RCU read-side critical section. */
103 rdp = rcu_preempt_state.rda[cpu]; 112 rdp = rcu_preempt_state.rda[cpu];
104 rnp = rdp->mynode; 113 rnp = rdp->mynode;
105 spin_lock_irqsave(&rnp->lock, flags); 114 raw_spin_lock_irqsave(&rnp->lock, flags);
106 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 115 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
107 t->rcu_blocked_node = rnp; 116 t->rcu_blocked_node = rnp;
108 117
@@ -123,7 +132,7 @@ static void rcu_preempt_note_context_switch(int cpu)
123 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 132 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
124 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 133 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
125 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 134 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
126 spin_unlock_irqrestore(&rnp->lock, flags); 135 raw_spin_unlock_irqrestore(&rnp->lock, flags);
127 } 136 }
128 137
129 /* 138 /*
@@ -180,7 +189,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
180 struct rcu_node *rnp_p; 189 struct rcu_node *rnp_p;
181 190
182 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 191 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
183 spin_unlock_irqrestore(&rnp->lock, flags); 192 raw_spin_unlock_irqrestore(&rnp->lock, flags);
184 return; /* Still need more quiescent states! */ 193 return; /* Still need more quiescent states! */
185 } 194 }
186 195
@@ -197,8 +206,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
197 206
198 /* Report up the rest of the hierarchy. */ 207 /* Report up the rest of the hierarchy. */
199 mask = rnp->grpmask; 208 mask = rnp->grpmask;
200 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 209 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
201 spin_lock(&rnp_p->lock); /* irqs already disabled. */ 210 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
202 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); 211 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
203} 212}
204 213
@@ -248,10 +257,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
248 */ 257 */
249 for (;;) { 258 for (;;) {
250 rnp = t->rcu_blocked_node; 259 rnp = t->rcu_blocked_node;
251 spin_lock(&rnp->lock); /* irqs already disabled. */ 260 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
252 if (rnp == t->rcu_blocked_node) 261 if (rnp == t->rcu_blocked_node)
253 break; 262 break;
254 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 263 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
255 } 264 }
256 empty = !rcu_preempted_readers(rnp); 265 empty = !rcu_preempted_readers(rnp);
257 empty_exp = !rcu_preempted_readers_exp(rnp); 266 empty_exp = !rcu_preempted_readers_exp(rnp);
@@ -265,7 +274,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
265 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 274 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
266 */ 275 */
267 if (empty) 276 if (empty)
268 spin_unlock_irqrestore(&rnp->lock, flags); 277 raw_spin_unlock_irqrestore(&rnp->lock, flags);
269 else 278 else
270 rcu_report_unblock_qs_rnp(rnp, flags); 279 rcu_report_unblock_qs_rnp(rnp, flags);
271 280
@@ -295,29 +304,73 @@ void __rcu_read_unlock(void)
295 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 304 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
296 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 305 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
297 rcu_read_unlock_special(t); 306 rcu_read_unlock_special(t);
307#ifdef CONFIG_PROVE_LOCKING
308 WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0);
309#endif /* #ifdef CONFIG_PROVE_LOCKING */
298} 310}
299EXPORT_SYMBOL_GPL(__rcu_read_unlock); 311EXPORT_SYMBOL_GPL(__rcu_read_unlock);
300 312
301#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 313#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
302 314
315#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
316
317/*
318 * Dump detailed information for all tasks blocking the current RCU
319 * grace period on the specified rcu_node structure.
320 */
321static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
322{
323 unsigned long flags;
324 struct list_head *lp;
325 int phase;
326 struct task_struct *t;
327
328 if (rcu_preempted_readers(rnp)) {
329 raw_spin_lock_irqsave(&rnp->lock, flags);
330 phase = rnp->gpnum & 0x1;
331 lp = &rnp->blocked_tasks[phase];
332 list_for_each_entry(t, lp, rcu_node_entry)
333 sched_show_task(t);
334 raw_spin_unlock_irqrestore(&rnp->lock, flags);
335 }
336}
337
338/*
339 * Dump detailed information for all tasks blocking the current RCU
340 * grace period.
341 */
342static void rcu_print_detail_task_stall(struct rcu_state *rsp)
343{
344 struct rcu_node *rnp = rcu_get_root(rsp);
345
346 rcu_print_detail_task_stall_rnp(rnp);
347 rcu_for_each_leaf_node(rsp, rnp)
348 rcu_print_detail_task_stall_rnp(rnp);
349}
350
351#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
352
353static void rcu_print_detail_task_stall(struct rcu_state *rsp)
354{
355}
356
357#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
358
303/* 359/*
304 * Scan the current list of tasks blocked within RCU read-side critical 360 * Scan the current list of tasks blocked within RCU read-side critical
305 * sections, printing out the tid of each. 361 * sections, printing out the tid of each.
306 */ 362 */
307static void rcu_print_task_stall(struct rcu_node *rnp) 363static void rcu_print_task_stall(struct rcu_node *rnp)
308{ 364{
309 unsigned long flags;
310 struct list_head *lp; 365 struct list_head *lp;
311 int phase; 366 int phase;
312 struct task_struct *t; 367 struct task_struct *t;
313 368
314 if (rcu_preempted_readers(rnp)) { 369 if (rcu_preempted_readers(rnp)) {
315 spin_lock_irqsave(&rnp->lock, flags);
316 phase = rnp->gpnum & 0x1; 370 phase = rnp->gpnum & 0x1;
317 lp = &rnp->blocked_tasks[phase]; 371 lp = &rnp->blocked_tasks[phase];
318 list_for_each_entry(t, lp, rcu_node_entry) 372 list_for_each_entry(t, lp, rcu_node_entry)
319 printk(" P%d", t->pid); 373 printk(" P%d", t->pid);
320 spin_unlock_irqrestore(&rnp->lock, flags);
321 } 374 }
322} 375}
323 376
@@ -388,11 +441,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
388 lp_root = &rnp_root->blocked_tasks[i]; 441 lp_root = &rnp_root->blocked_tasks[i];
389 while (!list_empty(lp)) { 442 while (!list_empty(lp)) {
390 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 443 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
391 spin_lock(&rnp_root->lock); /* irqs already disabled */ 444 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
392 list_del(&tp->rcu_node_entry); 445 list_del(&tp->rcu_node_entry);
393 tp->rcu_blocked_node = rnp_root; 446 tp->rcu_blocked_node = rnp_root;
394 list_add(&tp->rcu_node_entry, lp_root); 447 list_add(&tp->rcu_node_entry, lp_root);
395 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 448 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */
396 } 449 }
397 } 450 }
398 return retval; 451 return retval;
@@ -516,7 +569,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
516 unsigned long flags; 569 unsigned long flags;
517 unsigned long mask; 570 unsigned long mask;
518 571
519 spin_lock_irqsave(&rnp->lock, flags); 572 raw_spin_lock_irqsave(&rnp->lock, flags);
520 for (;;) { 573 for (;;) {
521 if (!sync_rcu_preempt_exp_done(rnp)) 574 if (!sync_rcu_preempt_exp_done(rnp))
522 break; 575 break;
@@ -525,12 +578,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
525 break; 578 break;
526 } 579 }
527 mask = rnp->grpmask; 580 mask = rnp->grpmask;
528 spin_unlock(&rnp->lock); /* irqs remain disabled */ 581 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
529 rnp = rnp->parent; 582 rnp = rnp->parent;
530 spin_lock(&rnp->lock); /* irqs already disabled */ 583 raw_spin_lock(&rnp->lock); /* irqs already disabled */
531 rnp->expmask &= ~mask; 584 rnp->expmask &= ~mask;
532 } 585 }
533 spin_unlock_irqrestore(&rnp->lock, flags); 586 raw_spin_unlock_irqrestore(&rnp->lock, flags);
534} 587}
535 588
536/* 589/*
@@ -545,11 +598,11 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
545{ 598{
546 int must_wait; 599 int must_wait;
547 600
548 spin_lock(&rnp->lock); /* irqs already disabled */ 601 raw_spin_lock(&rnp->lock); /* irqs already disabled */
549 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 602 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
550 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 603 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
551 must_wait = rcu_preempted_readers_exp(rnp); 604 must_wait = rcu_preempted_readers_exp(rnp);
552 spin_unlock(&rnp->lock); /* irqs remain disabled */ 605 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
553 if (!must_wait) 606 if (!must_wait)
554 rcu_report_exp_rnp(rsp, rnp); 607 rcu_report_exp_rnp(rsp, rnp);
555} 608}
@@ -594,13 +647,13 @@ void synchronize_rcu_expedited(void)
594 /* force all RCU readers onto blocked_tasks[]. */ 647 /* force all RCU readers onto blocked_tasks[]. */
595 synchronize_sched_expedited(); 648 synchronize_sched_expedited();
596 649
597 spin_lock_irqsave(&rsp->onofflock, flags); 650 raw_spin_lock_irqsave(&rsp->onofflock, flags);
598 651
599 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 652 /* Initialize ->expmask for all non-leaf rcu_node structures. */
600 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 653 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
601 spin_lock(&rnp->lock); /* irqs already disabled. */ 654 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
602 rnp->expmask = rnp->qsmaskinit; 655 rnp->expmask = rnp->qsmaskinit;
603 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 656 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
604 } 657 }
605 658
606 /* Snapshot current state of ->blocked_tasks[] lists. */ 659 /* Snapshot current state of ->blocked_tasks[] lists. */
@@ -609,7 +662,7 @@ void synchronize_rcu_expedited(void)
609 if (NUM_RCU_NODES > 1) 662 if (NUM_RCU_NODES > 1)
610 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); 663 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
611 664
612 spin_unlock_irqrestore(&rsp->onofflock, flags); 665 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
613 666
614 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 667 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
615 rnp = rcu_get_root(rsp); 668 rnp = rcu_get_root(rsp);
@@ -713,6 +766,16 @@ long rcu_batches_completed(void)
713EXPORT_SYMBOL_GPL(rcu_batches_completed); 766EXPORT_SYMBOL_GPL(rcu_batches_completed);
714 767
715/* 768/*
769 * Force a quiescent state for RCU, which, because there is no preemptible
770 * RCU, becomes the same as rcu-sched.
771 */
772void rcu_force_quiescent_state(void)
773{
774 rcu_sched_force_quiescent_state();
775}
776EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
777
778/*
716 * Because preemptable RCU does not exist, we never have to check for 779 * Because preemptable RCU does not exist, we never have to check for
717 * CPUs being in quiescent states. 780 * CPUs being in quiescent states.
718 */ 781 */
@@ -734,7 +797,7 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
734/* Because preemptible RCU does not exist, no quieting of tasks. */ 797/* Because preemptible RCU does not exist, no quieting of tasks. */
735static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 798static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
736{ 799{
737 spin_unlock_irqrestore(&rnp->lock, flags); 800 raw_spin_unlock_irqrestore(&rnp->lock, flags);
738} 801}
739 802
740#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 803#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -745,6 +808,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
745 * Because preemptable RCU does not exist, we never have to check for 808 * Because preemptable RCU does not exist, we never have to check for
746 * tasks blocked within RCU read-side critical sections. 809 * tasks blocked within RCU read-side critical sections.
747 */ 810 */
811static void rcu_print_detail_task_stall(struct rcu_state *rsp)
812{
813}
814
815/*
816 * Because preemptable RCU does not exist, we never have to check for
817 * tasks blocked within RCU read-side critical sections.
818 */
748static void rcu_print_task_stall(struct rcu_node *rnp) 819static void rcu_print_task_stall(struct rcu_node *rnp)
749{ 820{
750} 821}
@@ -884,3 +955,115 @@ static void __init __rcu_init_preempt(void)
884} 955}
885 956
886#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 957#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
958
959#if !defined(CONFIG_RCU_FAST_NO_HZ)
960
961/*
962 * Check to see if any future RCU-related work will need to be done
963 * by the current CPU, even if none need be done immediately, returning
964 * 1 if so. This function is part of the RCU implementation; it is -not-
965 * an exported member of the RCU API.
966 *
967 * Because we have preemptible RCU, just check whether this CPU needs
968 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption
969 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
970 */
971int rcu_needs_cpu(int cpu)
972{
973 return rcu_needs_cpu_quick_check(cpu);
974}
975
976/*
977 * Check to see if we need to continue a callback-flush operation to
978 * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle
979 * entry is not configured, so we never need to.
980 */
981static void rcu_needs_cpu_flush(void)
982{
983}
984
985#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
986
987#define RCU_NEEDS_CPU_FLUSHES 5
988static DEFINE_PER_CPU(int, rcu_dyntick_drain);
989static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
990
991/*
992 * Check to see if any future RCU-related work will need to be done
993 * by the current CPU, even if none need be done immediately, returning
994 * 1 if so. This function is part of the RCU implementation; it is -not-
995 * an exported member of the RCU API.
996 *
997 * Because we are not supporting preemptible RCU, attempt to accelerate
998 * any current grace periods so that RCU no longer needs this CPU, but
999 * only if all other CPUs are already in dynticks-idle mode. This will
1000 * allow the CPU cores to be powered down immediately, as opposed to after
1001 * waiting many milliseconds for grace periods to elapse.
1002 *
1003 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1004 * disabled, we do one pass of force_quiescent_state(), then do a
1005 * raise_softirq() to cause rcu_process_callbacks() to be invoked later.
1006 * The per-cpu rcu_dyntick_drain variable controls the sequencing.
1007 */
1008int rcu_needs_cpu(int cpu)
1009{
1010 int c = 0;
1011 int thatcpu;
1012
1013 /* Check for being in the holdoff period. */
1014 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
1015 return rcu_needs_cpu_quick_check(cpu);
1016
1017 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1018 for_each_cpu_not(thatcpu, nohz_cpu_mask)
1019 if (thatcpu != cpu) {
1020 per_cpu(rcu_dyntick_drain, cpu) = 0;
1021 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1022 return rcu_needs_cpu_quick_check(cpu);
1023 }
1024
1025 /* Check and update the rcu_dyntick_drain sequencing. */
1026 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1027 /* First time through, initialize the counter. */
1028 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
1029 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1030 /* We have hit the limit, so time to give up. */
1031 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1032 return rcu_needs_cpu_quick_check(cpu);
1033 }
1034
1035 /* Do one step pushing remaining RCU callbacks through. */
1036 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1037 rcu_sched_qs(cpu);
1038 force_quiescent_state(&rcu_sched_state, 0);
1039 c = c || per_cpu(rcu_sched_data, cpu).nxtlist;
1040 }
1041 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1042 rcu_bh_qs(cpu);
1043 force_quiescent_state(&rcu_bh_state, 0);
1044 c = c || per_cpu(rcu_bh_data, cpu).nxtlist;
1045 }
1046
1047 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1048 if (c)
1049 raise_softirq(RCU_SOFTIRQ);
1050 return c;
1051}
1052
1053/*
1054 * Check to see if we need to continue a callback-flush operation to
1055 * allow the last CPU to enter dyntick-idle mode.
1056 */
1057static void rcu_needs_cpu_flush(void)
1058{
1059 int cpu = smp_processor_id();
1060 unsigned long flags;
1061
1062 if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
1063 return;
1064 local_irq_save(flags);
1065 (void)rcu_needs_cpu(cpu);
1066 local_irq_restore(flags);
1067}
1068
1069#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
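As an aside on the CONFIG_RCU_FAST_NO_HZ path above: rcu_needs_cpu() budgets a handful of callback-flush passes per jiffy and then stamps a holdoff so the expensive path is not retried until time advances. A minimal user-space sketch of that drain/holdoff sequencing, using hypothetical names (FLUSH_BUDGET, struct drain_state) rather than the real per-CPU variables:

#include <stdbool.h>
#include <stdio.h>

#define FLUSH_BUDGET 5                  /* mirrors RCU_NEEDS_CPU_FLUSHES */

struct drain_state {
        int budget;                     /* flush passes left in this round */
        unsigned long holdoff_stamp;    /* "jiffy" at which we last gave up */
};

/* Return true if the caller should attempt another flush pass at time "now". */
bool keep_draining(struct drain_state *ds, unsigned long now)
{
        if (ds->holdoff_stamp == now)
                return false;           /* holding off: fall back to the quick check */
        if (ds->budget <= 0) {
                ds->budget = FLUSH_BUDGET;      /* first pass: arm the budget */
        } else if (--ds->budget <= 0) {
                ds->holdoff_stamp = now;        /* budget spent: hold off until time moves on */
                return false;
        }
        return true;
}

int main(void)
{
        struct drain_state ds = { 0, 1 };       /* holdoff stamp differs from "now" initially */
        unsigned long now = 100;

        for (int i = 0; i < 8; i++)
                printf("pass %d: %s\n", i, keep_draining(&ds, now) ? "drain" : "skip");
        return 0;
}

In the patch, each allowed pass pushes callbacks along via force_quiescent_state(), and the holdoff plus the rcu_needs_cpu_quick_check() fallback is what keeps the last non-idle CPU from paying this cost on every tick.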
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 9d2c88423b31..d45db2e35d27 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -50,7 +50,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 50{
51 if (!rdp->beenonline) 51 if (!rdp->beenonline)
52 return; 52 return;
53 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d", 53 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d",
54 rdp->cpu, 54 rdp->cpu,
55 cpu_is_offline(rdp->cpu) ? '!' : ' ', 55 cpu_is_offline(rdp->cpu) ? '!' : ' ',
56 rdp->completed, rdp->gpnum, 56 rdp->completed, rdp->gpnum,
@@ -105,7 +105,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
105{ 105{
106 if (!rdp->beenonline) 106 if (!rdp->beenonline)
107 return; 107 return;
108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d",
109 rdp->cpu, 109 rdp->cpu,
110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
111 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
@@ -155,13 +155,13 @@ static const struct file_operations rcudata_csv_fops = {
155 155
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 157{
158 long gpnum; 158 unsigned long gpnum;
159 int level = 0; 159 int level = 0;
160 int phase; 160 int phase;
161 struct rcu_node *rnp; 161 struct rcu_node *rnp;
162 162
163 gpnum = rsp->gpnum; 163 gpnum = rsp->gpnum;
164 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 164 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
166 rsp->completed, gpnum, rsp->signaled, 166 rsp->completed, gpnum, rsp->signaled,
167 (long)(rsp->jiffies_force_qs - jiffies), 167 (long)(rsp->jiffies_force_qs - jiffies),
@@ -215,12 +215,12 @@ static const struct file_operations rcuhier_fops = {
215static int show_rcugp(struct seq_file *m, void *unused) 215static int show_rcugp(struct seq_file *m, void *unused)
216{ 216{
217#ifdef CONFIG_TREE_PREEMPT_RCU 217#ifdef CONFIG_TREE_PREEMPT_RCU
218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n", 218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n",
219 rcu_preempt_state.completed, rcu_preempt_state.gpnum); 219 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", 221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n",
222 rcu_sched_state.completed, rcu_sched_state.gpnum); 222 rcu_sched_state.completed, rcu_sched_state.gpnum);
223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
224 rcu_bh_state.completed, rcu_bh_state.gpnum); 224 rcu_bh_state.completed, rcu_bh_state.gpnum);
225 return 0; 225 return 0;
226} 226}
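The rcutree_trace.c hunks above are format-string hygiene: the grace-period counters being switched to %lu are unsigned long, and printing them with %ld renders bogus negative numbers once they pass LONG_MAX. A tiny stand-alone illustration (the counter value is made up):

#include <limits.h>
#include <stdio.h>

int main(void)
{
        unsigned long gpnum = ULONG_MAX - 15;   /* hypothetical wrapped counter */

        printf("%%ld prints %ld\n", (long)gpnum);       /* typically -16 */
        printf("%%lu prints %lu\n", gpnum);             /* the real value */
        return 0;
}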
diff --git a/kernel/relay.c b/kernel/relay.c
index c705a41b4ba3..3d97f2821611 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
1215/* 1215/*
1216 * subbuf_splice_actor - splice up to one subbuf's worth of data 1216 * subbuf_splice_actor - splice up to one subbuf's worth of data
1217 */ 1217 */
1218static int subbuf_splice_actor(struct file *in, 1218static ssize_t subbuf_splice_actor(struct file *in,
1219 loff_t *ppos, 1219 loff_t *ppos,
1220 struct pipe_inode_info *pipe, 1220 struct pipe_inode_info *pipe,
1221 size_t len, 1221 size_t len,
1222 unsigned int flags, 1222 unsigned int flags,
1223 int *nonpad_ret) 1223 int *nonpad_ret)
1224{ 1224{
1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; 1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
1226 struct rchan_buf *rbuf = in->private_data; 1226 struct rchan_buf *rbuf = in->private_data;
1227 unsigned int subbuf_size = rbuf->chan->subbuf_size; 1227 unsigned int subbuf_size = rbuf->chan->subbuf_size;
1228 uint64_t pos = (uint64_t) *ppos; 1228 uint64_t pos = (uint64_t) *ppos;
@@ -1241,6 +1241,7 @@ static int subbuf_splice_actor(struct file *in,
1241 .ops = &relay_pipe_buf_ops, 1241 .ops = &relay_pipe_buf_ops,
1242 .spd_release = relay_page_release, 1242 .spd_release = relay_page_release,
1243 }; 1243 };
1244 ssize_t ret;
1244 1245
1245 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1246 return 0; 1247 return 0;
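The relay change is likewise a type fix: a splice actor reports either a byte count or a negative errno, and ssize_t expresses both without truncating counts larger than INT_MAX on 64-bit. A hedged sketch of that return convention, using a hypothetical helper rather than the relay code itself:

#include <errno.h>
#include <string.h>
#include <sys/types.h>

/* Copy up to cap bytes; return bytes copied, or -EINVAL on bad arguments. */
ssize_t copy_capped(char *dst, size_t cap, const char *src, size_t len)
{
        size_t n = len < cap ? len : cap;

        if (dst == NULL || src == NULL)
                return -EINVAL;         /* negative errno on failure */
        memcpy(dst, src, n);
        return (ssize_t)n;              /* byte count on success */
}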
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bcdabf37c40b..c7eaa37a768b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -10,7 +10,6 @@
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/parser.h> 11#include <linux/parser.h>
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/slab.h>
14#include <linux/res_counter.h> 13#include <linux/res_counter.h>
15#include <linux/uaccess.h> 14#include <linux/uaccess.h>
16#include <linux/mm.h> 15#include <linux/mm.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index af96c1e4b54b..9c358e263534 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -188,20 +188,65 @@ static int __release_resource(struct resource *old)
188 return -EINVAL; 188 return -EINVAL;
189} 189}
190 190
191static void __release_child_resources(struct resource *r)
192{
193 struct resource *tmp, *p;
194 resource_size_t size;
195
196 p = r->child;
197 r->child = NULL;
198 while (p) {
199 tmp = p;
200 p = p->sibling;
201
202 tmp->parent = NULL;
203 tmp->sibling = NULL;
204 __release_child_resources(tmp);
205
206 printk(KERN_DEBUG "release child resource %pR\n", tmp);
207 /* need to restore size, and keep flags */
208 size = resource_size(tmp);
209 tmp->start = 0;
210 tmp->end = size - 1;
211 }
212}
213
214void release_child_resources(struct resource *r)
215{
216 write_lock(&resource_lock);
217 __release_child_resources(r);
218 write_unlock(&resource_lock);
219}
220
191/** 221/**
192 * request_resource - request and reserve an I/O or memory resource 222 * request_resource_conflict - request and reserve an I/O or memory resource
193 * @root: root resource descriptor 223 * @root: root resource descriptor
194 * @new: resource descriptor desired by caller 224 * @new: resource descriptor desired by caller
195 * 225 *
196 * Returns 0 for success, negative error code on error. 226 * Returns NULL for success, or a pointer to the conflicting resource on error.
197 */ 227 */
198int request_resource(struct resource *root, struct resource *new) 228struct resource *request_resource_conflict(struct resource *root, struct resource *new)
199{ 229{
200 struct resource *conflict; 230 struct resource *conflict;
201 231
202 write_lock(&resource_lock); 232 write_lock(&resource_lock);
203 conflict = __request_resource(root, new); 233 conflict = __request_resource(root, new);
204 write_unlock(&resource_lock); 234 write_unlock(&resource_lock);
235 return conflict;
236}
237
238/**
239 * request_resource - request and reserve an I/O or memory resource
240 * @root: root resource descriptor
241 * @new: resource descriptor desired by caller
242 *
243 * Returns 0 for success, negative error code on error.
244 */
245int request_resource(struct resource *root, struct resource *new)
246{
247 struct resource *conflict;
248
249 conflict = request_resource_conflict(root, new);
205 return conflict ? -EBUSY : 0; 250 return conflict ? -EBUSY : 0;
206} 251}
207 252
@@ -274,7 +319,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
274 void *arg, int (*func)(unsigned long, unsigned long, void *)) 319 void *arg, int (*func)(unsigned long, unsigned long, void *))
275{ 320{
276 struct resource res; 321 struct resource res;
277 unsigned long pfn, len; 322 unsigned long pfn, end_pfn;
278 u64 orig_end; 323 u64 orig_end;
279 int ret = -1; 324 int ret = -1;
280 325
@@ -284,9 +329,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
284 orig_end = res.end; 329 orig_end = res.end;
285 while ((res.start < res.end) && 330 while ((res.start < res.end) &&
286 (find_next_system_ram(&res, "System RAM") >= 0)) { 331 (find_next_system_ram(&res, "System RAM") >= 0)) {
287 pfn = (unsigned long)(res.start >> PAGE_SHIFT); 332 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
288 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); 333 end_pfn = (res.end + 1) >> PAGE_SHIFT;
289 ret = (*func)(pfn, len, arg); 334 if (end_pfn > pfn)
335 ret = (*func)(pfn, end_pfn - pfn, arg);
290 if (ret) 336 if (ret)
291 break; 337 break;
292 res.start = res.end + 1; 338 res.start = res.end + 1;
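The walk_system_ram_range() fix above rounds the range start up and the end down to page boundaries and only invokes the callback when at least one whole page remains, rather than deriving a page count from the raw byte span. A small sketch of that rounding, assuming 4 KiB pages and made-up addresses:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12                   /* assume 4 KiB pages for the example */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
        uint64_t start = 0x1080;        /* hypothetical partially aligned range */
        uint64_t end   = 0x4fff;        /* inclusive end, as in struct resource */

        unsigned long pfn     = (start + PAGE_SIZE - 1) >> PAGE_SHIFT; /* round up   -> 2 */
        unsigned long end_pfn = (end + 1) >> PAGE_SHIFT;               /* round down -> 5 */

        if (end_pfn > pfn)
                printf("callback covers pfns [%lu, %lu), %lu pages\n",
                       pfn, end_pfn, end_pfn - pfn);
        else
                printf("no whole page in range, callback skipped\n");
        return 0;
}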
@@ -297,14 +343,29 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
297 343
298#endif 344#endif
299 345
346static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
347{
348 return 1;
349}
350/*
351 * This generic page_is_ram() returns true if the specified address is
352 * registered as "System RAM" in the iomem_resource list.
353 */
354int __weak page_is_ram(unsigned long pfn)
355{
356 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
357}
358
300/* 359/*
301 * Find empty slot in the resource tree given range and alignment. 360 * Find empty slot in the resource tree given range and alignment.
302 */ 361 */
303static int find_resource(struct resource *root, struct resource *new, 362static int find_resource(struct resource *root, struct resource *new,
304 resource_size_t size, resource_size_t min, 363 resource_size_t size, resource_size_t min,
305 resource_size_t max, resource_size_t align, 364 resource_size_t max, resource_size_t align,
306 void (*alignf)(void *, struct resource *, 365 resource_size_t (*alignf)(void *,
307 resource_size_t, resource_size_t), 366 const struct resource *,
367 resource_size_t,
368 resource_size_t),
308 void *alignf_data) 369 void *alignf_data)
309{ 370{
310 struct resource *this = root->child; 371 struct resource *this = root->child;
@@ -330,7 +391,7 @@ static int find_resource(struct resource *root, struct resource *new,
330 tmp.end = max; 391 tmp.end = max;
331 tmp.start = ALIGN(tmp.start, align); 392 tmp.start = ALIGN(tmp.start, align);
332 if (alignf) 393 if (alignf)
333 alignf(alignf_data, &tmp, size, align); 394 tmp.start = alignf(alignf_data, &tmp, size, align);
334 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { 395 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
335 new->start = tmp.start; 396 new->start = tmp.start;
336 new->end = tmp.start + size - 1; 397 new->end = tmp.start + size - 1;
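The alignf change visible here turns the alignment callback from one that mutated the candidate resource in place into a pure function returning the aligned start, which find_resource() then assigns back. A sketch of a callback written against that convention (simple power-of-two round-up; resource_size_t is redefined locally as a stand-in for the kernel type):

#include <stdio.h>

typedef unsigned long long resource_size_t;     /* stand-in for the kernel type */

struct window {
        resource_size_t start;
        resource_size_t end;            /* inclusive */
};

/* Return the aligned start within the window; the caller stores it back. */
resource_size_t simple_alignf(void *data, const struct window *avail,
                              resource_size_t size, resource_size_t align)
{
        (void)data;
        (void)size;
        return (avail->start + align - 1) & ~(align - 1);       /* round up */
}

int main(void)
{
        struct window avail = { 0x1001, 0x1ffff };

        avail.start = simple_alignf(NULL, &avail, 0x100, 0x1000);
        printf("aligned start: %#llx\n", avail.start);  /* 0x2000 */
        return 0;
}

Returning the value instead of editing the resource behind the allocator's back arguably makes it harder for an arch-specific callback to hand back an inconsistent candidate window.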
@@ -358,8 +419,10 @@ static int find_resource(struct resource *root, struct resource *new,
358int allocate_resource(struct resource *root, struct resource *new, 419int allocate_resource(struct resource *root, struct resource *new,
359 resource_size_t size, resource_size_t min, 420 resource_size_t size, resource_size_t min,
360 resource_size_t max, resource_size_t align, 421 resource_size_t max, resource_size_t align,
361 void (*alignf)(void *, struct resource *, 422 resource_size_t (*alignf)(void *,
362 resource_size_t, resource_size_t), 423 const struct resource *,
424 resource_size_t,
425 resource_size_t),
363 void *alignf_data) 426 void *alignf_data)
364{ 427{
365 int err; 428 int err;
@@ -426,25 +489,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
426} 489}
427 490
428/** 491/**
429 * insert_resource - Inserts a resource in the resource tree 492 * insert_resource_conflict - Inserts resource in the resource tree
430 * @parent: parent of the new resource 493 * @parent: parent of the new resource
431 * @new: new resource to insert 494 * @new: new resource to insert
432 * 495 *
433 * Returns 0 on success, -EBUSY if the resource can't be inserted. 496 * Returns NULL on success, or the conflicting resource if it can't be inserted.
434 * 497 *
435 * This function is equivalent to request_resource when no conflict 498 * This function is equivalent to request_resource_conflict when no conflict
436 * happens. If a conflict happens, and the conflicting resources 499 * happens. If a conflict happens, and the conflicting resources
437 * entirely fit within the range of the new resource, then the new 500 * entirely fit within the range of the new resource, then the new
438 * resource is inserted and the conflicting resources become children of 501 * resource is inserted and the conflicting resources become children of
439 * the new resource. 502 * the new resource.
440 */ 503 */
441int insert_resource(struct resource *parent, struct resource *new) 504struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
442{ 505{
443 struct resource *conflict; 506 struct resource *conflict;
444 507
445 write_lock(&resource_lock); 508 write_lock(&resource_lock);
446 conflict = __insert_resource(parent, new); 509 conflict = __insert_resource(parent, new);
447 write_unlock(&resource_lock); 510 write_unlock(&resource_lock);
511 return conflict;
512}
513
514/**
515 * insert_resource - Inserts a resource in the resource tree
516 * @parent: parent of the new resource
517 * @new: new resource to insert
518 *
519 * Returns 0 on success, -EBUSY if the resource can't be inserted.
520 */
521int insert_resource(struct resource *parent, struct resource *new)
522{
523 struct resource *conflict;
524
525 conflict = insert_resource_conflict(parent, new);
448 return conflict ? -EBUSY : 0; 526 return conflict ? -EBUSY : 0;
449} 527}
450 528
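request_resource() and insert_resource() above become thin wrappers around new *_conflict() variants, so callers that want to know which resource clashed can ask, while existing callers keep the 0 / -EBUSY contract. The shape of that split, sketched against a hypothetical slot table instead of the resource tree:

#include <errno.h>
#include <stdio.h>

static const char *table[8];            /* hypothetical stand-in for the resource tree */

/* Returns NULL on success, or the entry we collided with. */
const char *reserve_slot_conflict(unsigned int slot, const char *name)
{
        if (table[slot] != NULL)
                return table[slot];     /* report what we collided with */
        table[slot] = name;
        return NULL;
}

/* Legacy-style wrapper: 0 on success, -EBUSY on any conflict. */
int reserve_slot(unsigned int slot, const char *name)
{
        return reserve_slot_conflict(slot, name) ? -EBUSY : 0;
}

int main(void)
{
        reserve_slot(3, "first");
        printf("conflict with: %s\n", reserve_slot_conflict(3, "second"));
        printf("legacy rc: %d\n", reserve_slot(3, "third"));
        return 0;
}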
diff --git a/kernel/sched.c b/kernel/sched.c
index 7266b912139f..3c2a54f70ffe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@
71#include <linux/debugfs.h> 71#include <linux/debugfs.h>
72#include <linux/ctype.h> 72#include <linux/ctype.h>
73#include <linux/ftrace.h> 73#include <linux/ftrace.h>
74#include <linux/slab.h>
74 75
75#include <asm/tlb.h> 76#include <asm/tlb.h>
76#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
@@ -233,7 +234,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233 */ 234 */
234static DEFINE_MUTEX(sched_domains_mutex); 235static DEFINE_MUTEX(sched_domains_mutex);
235 236
236#ifdef CONFIG_GROUP_SCHED 237#ifdef CONFIG_CGROUP_SCHED
237 238
238#include <linux/cgroup.h> 239#include <linux/cgroup.h>
239 240
@@ -243,13 +244,7 @@ static LIST_HEAD(task_groups);
243 244
244/* task group related information */ 245/* task group related information */
245struct task_group { 246struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css; 247 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253 248
254#ifdef CONFIG_FAIR_GROUP_SCHED 249#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */ 250 /* schedulable entities of this group on each cpu */
@@ -274,35 +269,7 @@ struct task_group {
274 struct list_head children; 269 struct list_head children;
275}; 270};
276 271
277#ifdef CONFIG_USER_SCHED
278
279/* Helper function to pass uid information to create_sched_user() */
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285/*
286 * Root task group.
287 * Every UID task group (including init_task_group aka UID-0) will
288 * be a child to this group.
289 */
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293/* Default task group's sched entity on each cpu */
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295/* Default task group's cfs_rq on each cpu */
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif /* CONFIG_FAIR_GROUP_SCHED */
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 272#define root_task_group init_task_group
305#endif /* CONFIG_USER_SCHED */
306 273
307/* task_group_lock serializes add/remove of task groups and also changes to 274/* task_group_lock serializes add/remove of task groups and also changes to
308 * a task group's cpu shares. 275 * a task group's cpu shares.
@@ -318,11 +285,7 @@ static int root_task_group_empty(void)
318} 285}
319#endif 286#endif
320 287
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else /* !CONFIG_USER_SCHED */
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 288# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif /* CONFIG_USER_SCHED */
326 289
327/* 290/*
328 * A weight of 0 or 1 can cause arithmetics problems. 291 * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +311,7 @@ static inline struct task_group *task_group(struct task_struct *p)
348{ 311{
349 struct task_group *tg; 312 struct task_group *tg;
350 313
351#ifdef CONFIG_USER_SCHED 314#ifdef CONFIG_CGROUP_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 315 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css); 316 struct task_group, css);
358#else 317#else
@@ -364,6 +323,15 @@ static inline struct task_group *task_group(struct task_struct *p)
364/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 323/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
365static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 324static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
366{ 325{
326 /*
327 * Strictly speaking this rcu_read_lock() is not needed since the
328 * task_group is tied to the cgroup, which in turn can never go away
329 * as long as there are tasks attached to it.
330 *
331 * However since task_group() uses task_subsys_state() which is an
332 * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
333 */
334 rcu_read_lock();
367#ifdef CONFIG_FAIR_GROUP_SCHED 335#ifdef CONFIG_FAIR_GROUP_SCHED
368 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 336 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
369 p->se.parent = task_group(p)->se[cpu]; 337 p->se.parent = task_group(p)->se[cpu];
@@ -373,6 +341,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
373 p->rt.rt_rq = task_group(p)->rt_rq[cpu]; 341 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
374 p->rt.parent = task_group(p)->rt_se[cpu]; 342 p->rt.parent = task_group(p)->rt_se[cpu];
375#endif 343#endif
344 rcu_read_unlock();
376} 345}
377 346
378#else 347#else
@@ -383,7 +352,7 @@ static inline struct task_group *task_group(struct task_struct *p)
383 return NULL; 352 return NULL;
384} 353}
385 354
386#endif /* CONFIG_GROUP_SCHED */ 355#endif /* CONFIG_CGROUP_SCHED */
387 356
388/* CFS-related fields in a runqueue */ 357/* CFS-related fields in a runqueue */
389struct cfs_rq { 358struct cfs_rq {
@@ -478,7 +447,6 @@ struct rt_rq {
478 struct rq *rq; 447 struct rq *rq;
479 struct list_head leaf_rt_rq_list; 448 struct list_head leaf_rt_rq_list;
480 struct task_group *tg; 449 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif 450#endif
483}; 451};
484 452
@@ -645,6 +613,11 @@ static inline int cpu_of(struct rq *rq)
645#endif 613#endif
646} 614}
647 615
616#define rcu_dereference_check_sched_domain(p) \
617 rcu_dereference_check((p), \
618 rcu_read_lock_sched_held() || \
619 lockdep_is_held(&sched_domains_mutex))
620
648/* 621/*
649 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 622 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
650 * See detach_destroy_domains: synchronize_sched for details. 623 * See detach_destroy_domains: synchronize_sched for details.
@@ -653,7 +626,7 @@ static inline int cpu_of(struct rq *rq)
653 * preempt-disabled sections. 626 * preempt-disabled sections.
654 */ 627 */
655#define for_each_domain(cpu, __sd) \ 628#define for_each_domain(cpu, __sd) \
656 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 629 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
657 630
658#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 631#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
659#define this_rq() (&__get_cpu_var(runqueues)) 632#define this_rq() (&__get_cpu_var(runqueues))
@@ -941,16 +914,33 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
941#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 914#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
942 915
943/* 916/*
917 * Check whether the task is waking, we use this to synchronize against
918 * ttwu() so that task_cpu() reports a stable number.
919 *
920 * We need to make an exception for PF_STARTING tasks because the fork
921 * path might require task_rq_lock() to work, e.g. it can call
922 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
923 */
924static inline int task_is_waking(struct task_struct *p)
925{
926 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
927}
928
929/*
944 * __task_rq_lock - lock the runqueue a given task resides on. 930 * __task_rq_lock - lock the runqueue a given task resides on.
945 * Must be called with interrupts disabled. 931 * Must be called with interrupts disabled.
946 */ 932 */
947static inline struct rq *__task_rq_lock(struct task_struct *p) 933static inline struct rq *__task_rq_lock(struct task_struct *p)
948 __acquires(rq->lock) 934 __acquires(rq->lock)
949{ 935{
936 struct rq *rq;
937
950 for (;;) { 938 for (;;) {
951 struct rq *rq = task_rq(p); 939 while (task_is_waking(p))
940 cpu_relax();
941 rq = task_rq(p);
952 raw_spin_lock(&rq->lock); 942 raw_spin_lock(&rq->lock);
953 if (likely(rq == task_rq(p))) 943 if (likely(rq == task_rq(p) && !task_is_waking(p)))
954 return rq; 944 return rq;
955 raw_spin_unlock(&rq->lock); 945 raw_spin_unlock(&rq->lock);
956 } 946 }
@@ -967,10 +957,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
967 struct rq *rq; 957 struct rq *rq;
968 958
969 for (;;) { 959 for (;;) {
960 while (task_is_waking(p))
961 cpu_relax();
970 local_irq_save(*flags); 962 local_irq_save(*flags);
971 rq = task_rq(p); 963 rq = task_rq(p);
972 raw_spin_lock(&rq->lock); 964 raw_spin_lock(&rq->lock);
973 if (likely(rq == task_rq(p))) 965 if (likely(rq == task_rq(p) && !task_is_waking(p)))
974 return rq; 966 return rq;
975 raw_spin_unlock_irqrestore(&rq->lock, *flags); 967 raw_spin_unlock_irqrestore(&rq->lock, *flags);
976 } 968 }
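task_is_waking() lets both rq-lock helpers spin until an in-flight wakeup has settled, then take the lock and re-check the runqueue and the flag before trusting them. A rough user-space analogue of that wait/lock/revalidate loop (hypothetical names; the real code also exempts PF_STARTING tasks and handles interrupts):

#include <pthread.h>
#include <stdatomic.h>

struct obj {
        atomic_bool moving;                     /* analogue of TASK_WAKING */
        _Atomic(pthread_mutex_t *) home;        /* analogue of task_rq(p); a mover thread
                                                   sets moving, updates home, clears moving */
};

/* Lock the mutex the object currently belongs to, revalidating after locking. */
pthread_mutex_t *lock_home(struct obj *o)
{
        for (;;) {
                while (atomic_load(&o->moving))
                        ;                       /* cpu_relax()-style busy wait */
                pthread_mutex_t *m = atomic_load(&o->home);
                pthread_mutex_lock(m);
                if (m == atomic_load(&o->home) && !atomic_load(&o->moving))
                        return m;               /* still the right lock, not mid-move */
                pthread_mutex_unlock(m);        /* raced with a move: retry */
        }
}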
@@ -1390,32 +1382,6 @@ static const u32 prio_to_wmult[40] = {
1390 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1382 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1391}; 1383};
1392 1384
1393static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1394
1395/*
1396 * runqueue iterator, to support SMP load-balancing between different
1397 * scheduling classes, without having to expose their internal data
1398 * structures to the load-balancing proper:
1399 */
1400struct rq_iterator {
1401 void *arg;
1402 struct task_struct *(*start)(void *);
1403 struct task_struct *(*next)(void *);
1404};
1405
1406#ifdef CONFIG_SMP
1407static unsigned long
1408balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1409 unsigned long max_load_move, struct sched_domain *sd,
1410 enum cpu_idle_type idle, int *all_pinned,
1411 int *this_best_prio, struct rq_iterator *iterator);
1412
1413static int
1414iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1415 struct sched_domain *sd, enum cpu_idle_type idle,
1416 struct rq_iterator *iterator);
1417#endif
1418
1419/* Time spent by the tasks of the cpu accounting group executing in ... */ 1385/* Time spent by the tasks of the cpu accounting group executing in ... */
1420enum cpuacct_stat_index { 1386enum cpuacct_stat_index {
1421 CPUACCT_STAT_USER, /* ... user mode */ 1387 CPUACCT_STAT_USER, /* ... user mode */
@@ -1531,7 +1497,7 @@ static unsigned long target_load(int cpu, int type)
1531 1497
1532static struct sched_group *group_of(int cpu) 1498static struct sched_group *group_of(int cpu)
1533{ 1499{
1534 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); 1500 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1535 1501
1536 if (!sd) 1502 if (!sd)
1537 return NULL; 1503 return NULL;
@@ -1566,7 +1532,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1566 1532
1567#ifdef CONFIG_FAIR_GROUP_SCHED 1533#ifdef CONFIG_FAIR_GROUP_SCHED
1568 1534
1569static __read_mostly unsigned long *update_shares_data; 1535static __read_mostly unsigned long __percpu *update_shares_data;
1570 1536
1571static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1537static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1572 1538
@@ -1701,16 +1667,6 @@ static void update_shares(struct sched_domain *sd)
1701 } 1667 }
1702} 1668}
1703 1669
1704static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1705{
1706 if (root_task_group_empty())
1707 return;
1708
1709 raw_spin_unlock(&rq->lock);
1710 update_shares(sd);
1711 raw_spin_lock(&rq->lock);
1712}
1713
1714static void update_h_load(long cpu) 1670static void update_h_load(long cpu)
1715{ 1671{
1716 if (root_task_group_empty()) 1672 if (root_task_group_empty())
@@ -1725,10 +1681,6 @@ static inline void update_shares(struct sched_domain *sd)
1725{ 1681{
1726} 1682}
1727 1683
1728static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1729{
1730}
1731
1732#endif 1684#endif
1733 1685
1734#ifdef CONFIG_PREEMPT 1686#ifdef CONFIG_PREEMPT
@@ -1805,6 +1757,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 raw_spin_unlock(&busiest->lock); 1757 raw_spin_unlock(&busiest->lock);
1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1758 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1807} 1759}
1760
1761/*
1762 * double_rq_lock - safely lock two runqueues
1763 *
1764 * Note this does not disable interrupts like task_rq_lock,
1765 * you need to do so manually before calling.
1766 */
1767static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1768 __acquires(rq1->lock)
1769 __acquires(rq2->lock)
1770{
1771 BUG_ON(!irqs_disabled());
1772 if (rq1 == rq2) {
1773 raw_spin_lock(&rq1->lock);
1774 __acquire(rq2->lock); /* Fake it out ;) */
1775 } else {
1776 if (rq1 < rq2) {
1777 raw_spin_lock(&rq1->lock);
1778 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1779 } else {
1780 raw_spin_lock(&rq2->lock);
1781 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1782 }
1783 }
1784 update_rq_clock(rq1);
1785 update_rq_clock(rq2);
1786}
1787
1788/*
1789 * double_rq_unlock - safely unlock two runqueues
1790 *
1791 * Note this does not restore interrupts like task_rq_unlock,
1792 * you need to do so manually after calling.
1793 */
1794static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1795 __releases(rq1->lock)
1796 __releases(rq2->lock)
1797{
1798 raw_spin_unlock(&rq1->lock);
1799 if (rq1 != rq2)
1800 raw_spin_unlock(&rq2->lock);
1801 else
1802 __release(rq2->lock);
1803}
1804
1808#endif 1805#endif
1809 1806
1810#ifdef CONFIG_FAIR_GROUP_SCHED 1807#ifdef CONFIG_FAIR_GROUP_SCHED
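double_rq_lock()/double_rq_unlock() are only being moved up here so earlier code can use them, but the ordering rule they encode is worth spelling out: always take the lower-addressed lock first, and fake the second acquisition when both runqueues are the same, so two CPUs locking the same pair can never deadlock against each other. A minimal sketch of the same rule with plain mutexes:

#include <pthread.h>

/* Lock two mutexes in a globally consistent (address) order. */
void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);  /* one lock covers both "runqueues" */
        } else if (a < b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}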
@@ -1834,18 +1831,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1834#endif 1831#endif
1835} 1832}
1836 1833
1837#include "sched_stats.h" 1834static const struct sched_class rt_sched_class;
1838#include "sched_idletask.c"
1839#include "sched_fair.c"
1840#include "sched_rt.c"
1841#ifdef CONFIG_SCHED_DEBUG
1842# include "sched_debug.c"
1843#endif
1844 1835
1845#define sched_class_highest (&rt_sched_class) 1836#define sched_class_highest (&rt_sched_class)
1846#define for_each_class(class) \ 1837#define for_each_class(class) \
1847 for (class = sched_class_highest; class; class = class->next) 1838 for (class = sched_class_highest; class; class = class->next)
1848 1839
1840#include "sched_stats.h"
1841
1849static void inc_nr_running(struct rq *rq) 1842static void inc_nr_running(struct rq *rq)
1850{ 1843{
1851 rq->nr_running++; 1844 rq->nr_running++;
@@ -1883,13 +1876,14 @@ static void update_avg(u64 *avg, u64 sample)
1883 *avg += diff >> 3; 1876 *avg += diff >> 3;
1884} 1877}
1885 1878
1886static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1879static void
1880enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1887{ 1881{
1888 if (wakeup) 1882 if (wakeup)
1889 p->se.start_runtime = p->se.sum_exec_runtime; 1883 p->se.start_runtime = p->se.sum_exec_runtime;
1890 1884
1891 sched_info_queued(p); 1885 sched_info_queued(p);
1892 p->sched_class->enqueue_task(rq, p, wakeup); 1886 p->sched_class->enqueue_task(rq, p, wakeup, head);
1893 p->se.on_rq = 1; 1887 p->se.on_rq = 1;
1894} 1888}
1895 1889
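enqueue_task() grows a head flag here so a caller can ask for the task to be queued at the front of its list instead of the back. In list terms the choice looks roughly like this (toy singly linked queue, hypothetical names):

#include <stdbool.h>
#include <stdio.h>

struct node {
        int id;
        struct node *next;
};

struct queue {
        struct node *first;
};

/* Insert at the head or the tail depending on the flag, like enqueue_task(..., head). */
void enqueue(struct queue *q, struct node *n, bool head)
{
        if (head || q->first == NULL) {
                n->next = q->first;
                q->first = n;
                return;
        }
        struct node *p = q->first;
        while (p->next)
                p = p->next;            /* walk to the tail */
        p->next = n;
        n->next = NULL;
}

int main(void)
{
        struct queue q = { NULL };
        struct node a = { 1, NULL }, b = { 2, NULL }, c = { 3, NULL };

        enqueue(&q, &a, false);
        enqueue(&q, &b, false);         /* tail: 1 2 */
        enqueue(&q, &c, true);          /* head: 3 1 2 */
        for (struct node *p = q.first; p; p = p->next)
                printf("%d ", p->id);
        printf("\n");
        return 0;
}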
@@ -1912,6 +1906,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1912} 1906}
1913 1907
1914/* 1908/*
1909 * activate_task - move a task to the runqueue.
1910 */
1911static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1912{
1913 if (task_contributes_to_load(p))
1914 rq->nr_uninterruptible--;
1915
1916 enqueue_task(rq, p, wakeup, false);
1917 inc_nr_running(rq);
1918}
1919
1920/*
1921 * deactivate_task - remove a task from the runqueue.
1922 */
1923static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1924{
1925 if (task_contributes_to_load(p))
1926 rq->nr_uninterruptible++;
1927
1928 dequeue_task(rq, p, sleep);
1929 dec_nr_running(rq);
1930}
1931
1932#include "sched_idletask.c"
1933#include "sched_fair.c"
1934#include "sched_rt.c"
1935#ifdef CONFIG_SCHED_DEBUG
1936# include "sched_debug.c"
1937#endif
1938
1939/*
1915 * __normal_prio - return the priority that is based on the static prio 1940 * __normal_prio - return the priority that is based on the static prio
1916 */ 1941 */
1917static inline int __normal_prio(struct task_struct *p) 1942static inline int __normal_prio(struct task_struct *p)
@@ -1957,30 +1982,6 @@ static int effective_prio(struct task_struct *p)
1957 return p->prio; 1982 return p->prio;
1958} 1983}
1959 1984
1960/*
1961 * activate_task - move a task to the runqueue.
1962 */
1963static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1964{
1965 if (task_contributes_to_load(p))
1966 rq->nr_uninterruptible--;
1967
1968 enqueue_task(rq, p, wakeup);
1969 inc_nr_running(rq);
1970}
1971
1972/*
1973 * deactivate_task - remove a task from the runqueue.
1974 */
1975static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1976{
1977 if (task_contributes_to_load(p))
1978 rq->nr_uninterruptible++;
1979
1980 dequeue_task(rq, p, sleep);
1981 dec_nr_running(rq);
1982}
1983
1984/** 1985/**
1985 * task_curr - is this task currently executing on a CPU? 1986 * task_curr - is this task currently executing on a CPU?
1986 * @p: the task in question. 1987 * @p: the task in question.
@@ -2320,14 +2321,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2320} 2321}
2321 2322
2322/* 2323/*
2323 * Called from: 2324 * Gets called from 3 sites (exec, fork, wakeup), since it is called without
2324 * 2325 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2325 * - fork, @p is stable because it isn't on the tasklist yet 2326 * by:
2326 * 2327 *
2327 * - exec, @p is unstable, retry loop 2328 * exec: is unstable, retry loop
2328 * 2329 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2329 * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so
2330 * we should be good.
2331 */ 2330 */
2332static inline 2331static inline
2333int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2332int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
@@ -2371,7 +2370,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2371{ 2370{
2372 int cpu, orig_cpu, this_cpu, success = 0; 2371 int cpu, orig_cpu, this_cpu, success = 0;
2373 unsigned long flags; 2372 unsigned long flags;
2374 struct rq *rq, *orig_rq; 2373 struct rq *rq;
2375 2374
2376 if (!sched_feat(SYNC_WAKEUPS)) 2375 if (!sched_feat(SYNC_WAKEUPS))
2377 wake_flags &= ~WF_SYNC; 2376 wake_flags &= ~WF_SYNC;
@@ -2379,7 +2378,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2379 this_cpu = get_cpu(); 2378 this_cpu = get_cpu();
2380 2379
2381 smp_wmb(); 2380 smp_wmb();
2382 rq = orig_rq = task_rq_lock(p, &flags); 2381 rq = task_rq_lock(p, &flags);
2383 update_rq_clock(rq); 2382 update_rq_clock(rq);
2384 if (!(p->state & state)) 2383 if (!(p->state & state))
2385 goto out; 2384 goto out;
@@ -2410,14 +2409,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2410 __task_rq_unlock(rq); 2409 __task_rq_unlock(rq);
2411 2410
2412 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2411 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2413 if (cpu != orig_cpu) 2412 if (cpu != orig_cpu) {
2413 /*
2414 * Since we migrate the task without holding any rq->lock,
2415 * we need to be careful with task_rq_lock(), since that
2416 * might end up locking an invalid rq.
2417 */
2414 set_task_cpu(p, cpu); 2418 set_task_cpu(p, cpu);
2419 }
2415 2420
2416 rq = __task_rq_lock(p); 2421 rq = cpu_rq(cpu);
2422 raw_spin_lock(&rq->lock);
2417 update_rq_clock(rq); 2423 update_rq_clock(rq);
2418 2424
2425 /*
2426 * We migrated the task without holding either rq->lock, however
2427 * since the task is not on the task list itself, nobody else
2428 * will try and migrate the task, hence the rq should match the
2429 * cpu we just moved it to.
2430 */
2431 WARN_ON(task_cpu(p) != cpu);
2419 WARN_ON(p->state != TASK_WAKING); 2432 WARN_ON(p->state != TASK_WAKING);
2420 cpu = task_cpu(p);
2421 2433
2422#ifdef CONFIG_SCHEDSTATS 2434#ifdef CONFIG_SCHEDSTATS
2423 schedstat_inc(rq, ttwu_count); 2435 schedstat_inc(rq, ttwu_count);
@@ -2620,9 +2632,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
2620 if (p->sched_class->task_fork) 2632 if (p->sched_class->task_fork)
2621 p->sched_class->task_fork(p); 2633 p->sched_class->task_fork(p);
2622 2634
2623#ifdef CONFIG_SMP
2624 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2625#endif
2626 set_task_cpu(p, cpu); 2635 set_task_cpu(p, cpu);
2627 2636
2628#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2637#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2652,8 +2661,29 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2652{ 2661{
2653 unsigned long flags; 2662 unsigned long flags;
2654 struct rq *rq; 2663 struct rq *rq;
2664 int cpu __maybe_unused = get_cpu();
2665
2666#ifdef CONFIG_SMP
2667 /*
2668 * Fork balancing, do it here and not earlier because:
2669 * - cpus_allowed can change in the fork path
2670 * - any previously selected cpu might disappear through hotplug
2671 *
2672 * We still have TASK_WAKING but PF_STARTING is gone now, meaning
2673 * ->cpus_allowed is stable; we also have preemption disabled, meaning
2674 * cpu_online_mask is stable.
2675 */
2676 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2677 set_task_cpu(p, cpu);
2678#endif
2679
2680 /*
2681 * Since the task is not on the rq and we still have TASK_WAKING set
2682 * nobody else will migrate this task.
2683 */
2684 rq = cpu_rq(cpu);
2685 raw_spin_lock_irqsave(&rq->lock, flags);
2655 2686
2656 rq = task_rq_lock(p, &flags);
2657 BUG_ON(p->state != TASK_WAKING); 2687 BUG_ON(p->state != TASK_WAKING);
2658 p->state = TASK_RUNNING; 2688 p->state = TASK_RUNNING;
2659 update_rq_clock(rq); 2689 update_rq_clock(rq);
@@ -2665,6 +2695,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2665 p->sched_class->task_woken(rq, p); 2695 p->sched_class->task_woken(rq, p);
2666#endif 2696#endif
2667 task_rq_unlock(rq, &flags); 2697 task_rq_unlock(rq, &flags);
2698 put_cpu();
2668} 2699}
2669 2700
2670#ifdef CONFIG_PREEMPT_NOTIFIERS 2701#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -3094,50 +3125,6 @@ static void update_cpu_load(struct rq *this_rq)
3094#ifdef CONFIG_SMP 3125#ifdef CONFIG_SMP
3095 3126
3096/* 3127/*
3097 * double_rq_lock - safely lock two runqueues
3098 *
3099 * Note this does not disable interrupts like task_rq_lock,
3100 * you need to do so manually before calling.
3101 */
3102static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3103 __acquires(rq1->lock)
3104 __acquires(rq2->lock)
3105{
3106 BUG_ON(!irqs_disabled());
3107 if (rq1 == rq2) {
3108 raw_spin_lock(&rq1->lock);
3109 __acquire(rq2->lock); /* Fake it out ;) */
3110 } else {
3111 if (rq1 < rq2) {
3112 raw_spin_lock(&rq1->lock);
3113 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3114 } else {
3115 raw_spin_lock(&rq2->lock);
3116 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3117 }
3118 }
3119 update_rq_clock(rq1);
3120 update_rq_clock(rq2);
3121}
3122
3123/*
3124 * double_rq_unlock - safely unlock two runqueues
3125 *
3126 * Note this does not restore interrupts like task_rq_unlock,
3127 * you need to do so manually after calling.
3128 */
3129static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3130 __releases(rq1->lock)
3131 __releases(rq2->lock)
3132{
3133 raw_spin_unlock(&rq1->lock);
3134 if (rq1 != rq2)
3135 raw_spin_unlock(&rq2->lock);
3136 else
3137 __release(rq2->lock);
3138}
3139
3140/*
3141 * sched_exec - execve() is a valuable balancing opportunity, because at 3128 * sched_exec - execve() is a valuable balancing opportunity, because at
3142 * this point the task has the smallest effective memory and cache footprint. 3129 * this point the task has the smallest effective memory and cache footprint.
3143 */ 3130 */
@@ -3185,1771 +3172,6 @@ again:
3185 task_rq_unlock(rq, &flags); 3172 task_rq_unlock(rq, &flags);
3186} 3173}
3187 3174
3188/*
3189 * pull_task - move a task from a remote runqueue to the local runqueue.
3190 * Both runqueues must be locked.
3191 */
3192static void pull_task(struct rq *src_rq, struct task_struct *p,
3193 struct rq *this_rq, int this_cpu)
3194{
3195 deactivate_task(src_rq, p, 0);
3196 set_task_cpu(p, this_cpu);
3197 activate_task(this_rq, p, 0);
3198 check_preempt_curr(this_rq, p, 0);
3199}
3200
3201/*
3202 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3203 */
3204static
3205int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3206 struct sched_domain *sd, enum cpu_idle_type idle,
3207 int *all_pinned)
3208{
3209 int tsk_cache_hot = 0;
3210 /*
3211 * We do not migrate tasks that are:
3212 * 1) running (obviously), or
3213 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3214 * 3) are cache-hot on their current CPU.
3215 */
3216 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3217 schedstat_inc(p, se.nr_failed_migrations_affine);
3218 return 0;
3219 }
3220 *all_pinned = 0;
3221
3222 if (task_running(rq, p)) {
3223 schedstat_inc(p, se.nr_failed_migrations_running);
3224 return 0;
3225 }
3226
3227 /*
3228 * Aggressive migration if:
3229 * 1) task is cache cold, or
3230 * 2) too many balance attempts have failed.
3231 */
3232
3233 tsk_cache_hot = task_hot(p, rq->clock, sd);
3234 if (!tsk_cache_hot ||
3235 sd->nr_balance_failed > sd->cache_nice_tries) {
3236#ifdef CONFIG_SCHEDSTATS
3237 if (tsk_cache_hot) {
3238 schedstat_inc(sd, lb_hot_gained[idle]);
3239 schedstat_inc(p, se.nr_forced_migrations);
3240 }
3241#endif
3242 return 1;
3243 }
3244
3245 if (tsk_cache_hot) {
3246 schedstat_inc(p, se.nr_failed_migrations_hot);
3247 return 0;
3248 }
3249 return 1;
3250}
3251
3252static unsigned long
3253balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3254 unsigned long max_load_move, struct sched_domain *sd,
3255 enum cpu_idle_type idle, int *all_pinned,
3256 int *this_best_prio, struct rq_iterator *iterator)
3257{
3258 int loops = 0, pulled = 0, pinned = 0;
3259 struct task_struct *p;
3260 long rem_load_move = max_load_move;
3261
3262 if (max_load_move == 0)
3263 goto out;
3264
3265 pinned = 1;
3266
3267 /*
3268 * Start the load-balancing iterator:
3269 */
3270 p = iterator->start(iterator->arg);
3271next:
3272 if (!p || loops++ > sysctl_sched_nr_migrate)
3273 goto out;
3274
3275 if ((p->se.load.weight >> 1) > rem_load_move ||
3276 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3277 p = iterator->next(iterator->arg);
3278 goto next;
3279 }
3280
3281 pull_task(busiest, p, this_rq, this_cpu);
3282 pulled++;
3283 rem_load_move -= p->se.load.weight;
3284
3285#ifdef CONFIG_PREEMPT
3286 /*
3287 * NEWIDLE balancing is a source of latency, so preemptible kernels
3288 * will stop after the first task is pulled to minimize the critical
3289 * section.
3290 */
3291 if (idle == CPU_NEWLY_IDLE)
3292 goto out;
3293#endif
3294
3295 /*
3296 * We only want to steal up to the prescribed amount of weighted load.
3297 */
3298 if (rem_load_move > 0) {
3299 if (p->prio < *this_best_prio)
3300 *this_best_prio = p->prio;
3301 p = iterator->next(iterator->arg);
3302 goto next;
3303 }
3304out:
3305 /*
3306 * Right now, this is one of only two places pull_task() is called,
3307 * so we can safely collect pull_task() stats here rather than
3308 * inside pull_task().
3309 */
3310 schedstat_add(sd, lb_gained[idle], pulled);
3311
3312 if (all_pinned)
3313 *all_pinned = pinned;
3314
3315 return max_load_move - rem_load_move;
3316}
3317
3318/*
3319 * move_tasks tries to move up to max_load_move weighted load from busiest to
3320 * this_rq, as part of a balancing operation within domain "sd".
3321 * Returns 1 if successful and 0 otherwise.
3322 *
3323 * Called with both runqueues locked.
3324 */
3325static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3326 unsigned long max_load_move,
3327 struct sched_domain *sd, enum cpu_idle_type idle,
3328 int *all_pinned)
3329{
3330 const struct sched_class *class = sched_class_highest;
3331 unsigned long total_load_moved = 0;
3332 int this_best_prio = this_rq->curr->prio;
3333
3334 do {
3335 total_load_moved +=
3336 class->load_balance(this_rq, this_cpu, busiest,
3337 max_load_move - total_load_moved,
3338 sd, idle, all_pinned, &this_best_prio);
3339 class = class->next;
3340
3341#ifdef CONFIG_PREEMPT
3342 /*
3343 * NEWIDLE balancing is a source of latency, so preemptible
3344 * kernels will stop after the first task is pulled to minimize
3345 * the critical section.
3346 */
3347 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3348 break;
3349#endif
3350 } while (class && max_load_move > total_load_moved);
3351
3352 return total_load_moved > 0;
3353}
3354
3355static int
3356iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3357 struct sched_domain *sd, enum cpu_idle_type idle,
3358 struct rq_iterator *iterator)
3359{
3360 struct task_struct *p = iterator->start(iterator->arg);
3361 int pinned = 0;
3362
3363 while (p) {
3364 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3365 pull_task(busiest, p, this_rq, this_cpu);
3366 /*
3367 * Right now, this is only the second place pull_task()
3368 * is called, so we can safely collect pull_task()
3369 * stats here rather than inside pull_task().
3370 */
3371 schedstat_inc(sd, lb_gained[idle]);
3372
3373 return 1;
3374 }
3375 p = iterator->next(iterator->arg);
3376 }
3377
3378 return 0;
3379}
3380
3381/*
3382 * move_one_task tries to move exactly one task from busiest to this_rq, as
3383 * part of active balancing operations within "domain".
3384 * Returns 1 if successful and 0 otherwise.
3385 *
3386 * Called with both runqueues locked.
3387 */
3388static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3389 struct sched_domain *sd, enum cpu_idle_type idle)
3390{
3391 const struct sched_class *class;
3392
3393 for_each_class(class) {
3394 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3395 return 1;
3396 }
3397
3398 return 0;
3399}
3400/********** Helpers for find_busiest_group ************************/
3401/*
3402 * sd_lb_stats - Structure to store the statistics of a sched_domain
3403 * during load balancing.
3404 */
3405struct sd_lb_stats {
3406 struct sched_group *busiest; /* Busiest group in this sd */
3407 struct sched_group *this; /* Local group in this sd */
3408 unsigned long total_load; /* Total load of all groups in sd */
3409 unsigned long total_pwr; /* Total power of all groups in sd */
3410 unsigned long avg_load; /* Average load across all groups in sd */
3411
3412 /** Statistics of this group */
3413 unsigned long this_load;
3414 unsigned long this_load_per_task;
3415 unsigned long this_nr_running;
3416
3417 /* Statistics of the busiest group */
3418 unsigned long max_load;
3419 unsigned long busiest_load_per_task;
3420 unsigned long busiest_nr_running;
3421
3422 int group_imb; /* Is there imbalance in this sd */
3423#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3424 int power_savings_balance; /* Is powersave balance needed for this sd */
3425 struct sched_group *group_min; /* Least loaded group in sd */
3426 struct sched_group *group_leader; /* Group which relieves group_min */
3427 unsigned long min_load_per_task; /* load_per_task in group_min */
3428 unsigned long leader_nr_running; /* Nr running of group_leader */
3429 unsigned long min_nr_running; /* Nr running of group_min */
3430#endif
3431};
3432
3433/*
3434 * sg_lb_stats - stats of a sched_group required for load_balancing
3435 */
3436struct sg_lb_stats {
3437 unsigned long avg_load; /*Avg load across the CPUs of the group */
3438 unsigned long group_load; /* Total load over the CPUs of the group */
3439 unsigned long sum_nr_running; /* Nr tasks running in the group */
3440 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3441 unsigned long group_capacity;
3442 int group_imb; /* Is there an imbalance in the group ? */
3443};
3444
3445/**
3446 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3447 * @group: The group whose first cpu is to be returned.
3448 */
3449static inline unsigned int group_first_cpu(struct sched_group *group)
3450{
3451 return cpumask_first(sched_group_cpus(group));
3452}
3453
3454/**
3455 * get_sd_load_idx - Obtain the load index for a given sched domain.
3456 * @sd: The sched_domain whose load_idx is to be obtained.
3457 * @idle: The idle status of the CPU whose sd load_idx is obtained.
3458 */
3459static inline int get_sd_load_idx(struct sched_domain *sd,
3460 enum cpu_idle_type idle)
3461{
3462 int load_idx;
3463
3464 switch (idle) {
3465 case CPU_NOT_IDLE:
3466 load_idx = sd->busy_idx;
3467 break;
3468
3469 case CPU_NEWLY_IDLE:
3470 load_idx = sd->newidle_idx;
3471 break;
3472 default:
3473 load_idx = sd->idle_idx;
3474 break;
3475 }
3476
3477 return load_idx;
3478}
3479
3480
3481#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3482/**
3483 * init_sd_power_savings_stats - Initialize power savings statistics for
3484 * the given sched_domain, during load balancing.
3485 *
3486 * @sd: Sched domain whose power-savings statistics are to be initialized.
3487 * @sds: Variable containing the statistics for sd.
3488 * @idle: Idle status of the CPU at which we're performing load-balancing.
3489 */
3490static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3491 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3492{
3493 /*
3494 * Busy processors will not participate in power savings
3495 * balance.
3496 */
3497 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3498 sds->power_savings_balance = 0;
3499 else {
3500 sds->power_savings_balance = 1;
3501 sds->min_nr_running = ULONG_MAX;
3502 sds->leader_nr_running = 0;
3503 }
3504}
3505
3506/**
3507 * update_sd_power_savings_stats - Update the power saving stats for a
3508 * sched_domain while performing load balancing.
3509 *
3510 * @group: sched_group belonging to the sched_domain under consideration.
3511 * @sds: Variable containing the statistics of the sched_domain
3512 * @local_group: Does group contain the CPU for which we're performing
3513 * load balancing ?
3514 * @sgs: Variable containing the statistics of the group.
3515 */
3516static inline void update_sd_power_savings_stats(struct sched_group *group,
3517 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3518{
3519
3520 if (!sds->power_savings_balance)
3521 return;
3522
3523 /*
3524 * If the local group is idle or completely loaded
3525 * no need to do power savings balance at this domain
3526 */
3527 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3528 !sds->this_nr_running))
3529 sds->power_savings_balance = 0;
3530
3531 /*
3532 * If a group is already running at full capacity or idle,
3533 * don't include that group in power savings calculations
3534 */
3535 if (!sds->power_savings_balance ||
3536 sgs->sum_nr_running >= sgs->group_capacity ||
3537 !sgs->sum_nr_running)
3538 return;
3539
3540 /*
3541 * Calculate the group which has the least non-idle load.
3542 * This is the group from where we need to pick up the load
3543 * for saving power
3544 */
3545 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3546 (sgs->sum_nr_running == sds->min_nr_running &&
3547 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3548 sds->group_min = group;
3549 sds->min_nr_running = sgs->sum_nr_running;
3550 sds->min_load_per_task = sgs->sum_weighted_load /
3551 sgs->sum_nr_running;
3552 }
3553
3554 /*
3555 * Calculate the group which is nearly at its
3556 * capacity but still has some space to pick up some load
3557 * from other groups and save more power
3558 */
3559 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3560 return;
3561
3562 if (sgs->sum_nr_running > sds->leader_nr_running ||
3563 (sgs->sum_nr_running == sds->leader_nr_running &&
3564 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3565 sds->group_leader = group;
3566 sds->leader_nr_running = sgs->sum_nr_running;
3567 }
3568}
3569
3570/**
3571 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3572 * @sds: Variable containing the statistics of the sched_domain
3573 * under consideration.
3574 * @this_cpu: Cpu at which we're currently performing load-balancing.
3575 * @imbalance: Variable to store the imbalance.
3576 *
3577 * Description:
3578 * Check if we have potential to perform some power-savings balance.
3579 * If yes, set the busiest group to be the least loaded group in the
3580 * sched_domain, so that its CPUs can be put to idle.
3581 *
3582 * Returns 1 if there is potential to perform power-savings balance.
3583 * Else returns 0.
3584 */
3585static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3586 int this_cpu, unsigned long *imbalance)
3587{
3588 if (!sds->power_savings_balance)
3589 return 0;
3590
3591 if (sds->this != sds->group_leader ||
3592 sds->group_leader == sds->group_min)
3593 return 0;
3594
3595 *imbalance = sds->min_load_per_task;
3596 sds->busiest = sds->group_min;
3597
3598 return 1;
3599
3600}
3601#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3602static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3603 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3604{
3605 return;
3606}
3607
3608static inline void update_sd_power_savings_stats(struct sched_group *group,
3609 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3610{
3611 return;
3612}
3613
3614static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3615 int this_cpu, unsigned long *imbalance)
3616{
3617 return 0;
3618}
3619#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3620
3621
3622unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3623{
3624 return SCHED_LOAD_SCALE;
3625}
3626
3627unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3628{
3629 return default_scale_freq_power(sd, cpu);
3630}
3631
3632unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3633{
3634 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3635 unsigned long smt_gain = sd->smt_gain;
3636
3637 smt_gain /= weight;
3638
3639 return smt_gain;
3640}
3641
3642unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3643{
3644 return default_scale_smt_power(sd, cpu);
3645}
3646
3647unsigned long scale_rt_power(int cpu)
3648{
3649 struct rq *rq = cpu_rq(cpu);
3650 u64 total, available;
3651
3652 sched_avg_update(rq);
3653
3654 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3655 available = total - rq->rt_avg;
3656
3657 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3658 total = SCHED_LOAD_SCALE;
3659
3660 total >>= SCHED_LOAD_SHIFT;
3661
3662 return div_u64(available, total);
3663}
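
/*
 * A minimal standalone sketch of the arithmetic in scale_rt_power() above,
 * assuming SCHED_LOAD_SCALE = 1024: the return value is the fraction of
 * recent time left over for fair tasks after real-time activity, expressed
 * in 1/1024ths. The window and RT figures below are invented.
 */
#include <stdio.h>
#include <stdint.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1ULL << SCHED_LOAD_SHIFT)

int main(void)
{
	uint64_t total  = 1000000;	/* assumed: ns of wall time in the window */
	uint64_t rt_avg =  150000;	/* assumed: ns consumed by RT tasks */
	uint64_t available = total - rt_avg;

	if (total < SCHED_LOAD_SCALE)	/* guard against a tiny window */
		total = SCHED_LOAD_SCALE;
	total >>= SCHED_LOAD_SHIFT;

	/* ~870/1024 here, i.e. about 85% of the cpu is left for CFS */
	printf("rt scale factor = %llu / 1024\n",
	       (unsigned long long)(available / total));
	return 0;
}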
3664
3665static void update_cpu_power(struct sched_domain *sd, int cpu)
3666{
3667 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3668 unsigned long power = SCHED_LOAD_SCALE;
3669 struct sched_group *sdg = sd->groups;
3670
3671 if (sched_feat(ARCH_POWER))
3672 power *= arch_scale_freq_power(sd, cpu);
3673 else
3674 power *= default_scale_freq_power(sd, cpu);
3675
3676 power >>= SCHED_LOAD_SHIFT;
3677
3678 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3679 if (sched_feat(ARCH_POWER))
3680 power *= arch_scale_smt_power(sd, cpu);
3681 else
3682 power *= default_scale_smt_power(sd, cpu);
3683
3684 power >>= SCHED_LOAD_SHIFT;
3685 }
3686
3687 power *= scale_rt_power(cpu);
3688 power >>= SCHED_LOAD_SHIFT;
3689
3690 if (!power)
3691 power = 1;
3692
3693 sdg->cpu_power = power;
3694}
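
/*
 * A minimal userspace sketch of the fixed-point chain in update_cpu_power()
 * above, assuming SCHED_LOAD_SCALE = 1024: each factor is a fraction of 1024
 * and the running product is renormalized by shifting right by
 * SCHED_LOAD_SHIFT. The three scale factors are invented example values,
 * not numbers taken from any real machine.
 */
#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

static unsigned long apply_scale(unsigned long power, unsigned long factor)
{
	return (power * factor) >> SCHED_LOAD_SHIFT;	/* power * factor/1024 */
}

int main(void)
{
	unsigned long power = SCHED_LOAD_SCALE;	/* nominal capacity */
	unsigned long freq_scale = 1024;	/* assumed: running at full frequency */
	unsigned long smt_scale  = 589;		/* assumed: smt_gain 1178 / 2 siblings */
	unsigned long rt_scale   = 900;		/* assumed: ~12% of time taken by RT */

	power = apply_scale(power, freq_scale);
	power = apply_scale(power, smt_scale);
	power = apply_scale(power, rt_scale);
	if (!power)				/* never advertise zero power */
		power = 1;

	printf("effective cpu_power = %lu of %lu\n", power, SCHED_LOAD_SCALE);
	return 0;
}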
3695
3696static void update_group_power(struct sched_domain *sd, int cpu)
3697{
3698 struct sched_domain *child = sd->child;
3699 struct sched_group *group, *sdg = sd->groups;
3700 unsigned long power;
3701
3702 if (!child) {
3703 update_cpu_power(sd, cpu);
3704 return;
3705 }
3706
3707 power = 0;
3708
3709 group = child->groups;
3710 do {
3711 power += group->cpu_power;
3712 group = group->next;
3713 } while (group != child->groups);
3714
3715 sdg->cpu_power = power;
3716}
3717
3718/**
3719 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3720 * @sd: The sched_domain whose statistics are to be updated.
3721 * @group: sched_group whose statistics are to be updated.
3722 * @this_cpu: Cpu for which load balance is currently performed.
3723 * @idle: Idle status of this_cpu
3724 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3725 * @sd_idle: Idle status of the sched_domain containing group.
3726 * @local_group: Does group contain this_cpu.
3727 * @cpus: Set of cpus considered for load balancing.
3728 * @balance: Should we balance.
3729 * @sgs: variable to hold the statistics for this group.
3730 */
3731static inline void update_sg_lb_stats(struct sched_domain *sd,
3732 struct sched_group *group, int this_cpu,
3733 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3734 int local_group, const struct cpumask *cpus,
3735 int *balance, struct sg_lb_stats *sgs)
3736{
3737 unsigned long load, max_cpu_load, min_cpu_load;
3738 int i;
3739 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3740 unsigned long sum_avg_load_per_task;
3741 unsigned long avg_load_per_task;
3742
3743 if (local_group) {
3744 balance_cpu = group_first_cpu(group);
3745 if (balance_cpu == this_cpu)
3746 update_group_power(sd, this_cpu);
3747 }
3748
3749 /* Tally up the load of all CPUs in the group */
3750 sum_avg_load_per_task = avg_load_per_task = 0;
3751 max_cpu_load = 0;
3752 min_cpu_load = ~0UL;
3753
3754 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3755 struct rq *rq = cpu_rq(i);
3756
3757 if (*sd_idle && rq->nr_running)
3758 *sd_idle = 0;
3759
3760 /* Bias balancing toward cpus of our domain */
3761 if (local_group) {
3762 if (idle_cpu(i) && !first_idle_cpu) {
3763 first_idle_cpu = 1;
3764 balance_cpu = i;
3765 }
3766
3767 load = target_load(i, load_idx);
3768 } else {
3769 load = source_load(i, load_idx);
3770 if (load > max_cpu_load)
3771 max_cpu_load = load;
3772 if (min_cpu_load > load)
3773 min_cpu_load = load;
3774 }
3775
3776 sgs->group_load += load;
3777 sgs->sum_nr_running += rq->nr_running;
3778 sgs->sum_weighted_load += weighted_cpuload(i);
3779
3780 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3781 }
3782
3783 /*
3784	 * Only the first idle cpu, or the first cpu (the busiest one) in
3785	 * this sched group, is eligible to do load balancing at this and
3786	 * higher domains. In the newly-idle case, we allow all cpus to do
3787	 * the newly-idle load balance.
3788 */
3789 if (idle != CPU_NEWLY_IDLE && local_group &&
3790 balance_cpu != this_cpu && balance) {
3791 *balance = 0;
3792 return;
3793 }
3794
3795 /* Adjust by relative CPU power of the group */
3796 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3797
3798
3799 /*
3800 * Consider the group unbalanced when the imbalance is larger
3801 * than the average weight of two tasks.
3802 *
3803 * APZ: with cgroup the avg task weight can vary wildly and
3804 * might not be a suitable number - should we keep a
3805 * normalized nr_running number somewhere that negates
3806 * the hierarchy?
3807 */
3808 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3809 group->cpu_power;
3810
3811 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3812 sgs->group_imb = 1;
3813
3814 sgs->group_capacity =
3815 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3816}
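
/*
 * A rough, self-contained illustration of the normalization at the end of
 * update_sg_lb_stats() above: group_load is rescaled by the group's
 * cpu_power into avg_load, and cpu_power is rounded into a whole-task
 * capacity. The load and power figures are invented.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL
#define DIV_ROUND_CLOSEST(x, d)	(((x) + ((d) / 2)) / (d))

int main(void)
{
	unsigned long group_load = 3000;	/* assumed sum of per-cpu loads */
	unsigned long cpu_power  = 2048;	/* e.g. two cpus at nominal power */

	unsigned long avg_load = (group_load * SCHED_LOAD_SCALE) / cpu_power;
	unsigned long capacity = DIV_ROUND_CLOSEST(cpu_power, SCHED_LOAD_SCALE);

	printf("avg_load = %lu, group_capacity = %lu task(s)\n",
	       avg_load, capacity);
	return 0;
}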
3817
3818/**
3819 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
3820 * @sd: sched_domain whose statistics are to be updated.
3821 * @this_cpu: Cpu for which load balance is currently performed.
3822 * @idle: Idle status of this_cpu
3823 * @sd_idle: Idle status of the sched_domain containing group.
3824 * @cpus: Set of cpus considered for load balancing.
3825 * @balance: Should we balance.
3826 * @sds: variable to hold the statistics for this sched_domain.
3827 */
3828static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3829 enum cpu_idle_type idle, int *sd_idle,
3830 const struct cpumask *cpus, int *balance,
3831 struct sd_lb_stats *sds)
3832{
3833 struct sched_domain *child = sd->child;
3834 struct sched_group *group = sd->groups;
3835 struct sg_lb_stats sgs;
3836 int load_idx, prefer_sibling = 0;
3837
3838 if (child && child->flags & SD_PREFER_SIBLING)
3839 prefer_sibling = 1;
3840
3841 init_sd_power_savings_stats(sd, sds, idle);
3842 load_idx = get_sd_load_idx(sd, idle);
3843
3844 do {
3845 int local_group;
3846
3847 local_group = cpumask_test_cpu(this_cpu,
3848 sched_group_cpus(group));
3849 memset(&sgs, 0, sizeof(sgs));
3850 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3851 local_group, cpus, balance, &sgs);
3852
3853 if (local_group && balance && !(*balance))
3854 return;
3855
3856 sds->total_load += sgs.group_load;
3857 sds->total_pwr += group->cpu_power;
3858
3859 /*
3860 * In case the child domain prefers tasks go to siblings
3861 * first, lower the group capacity to one so that we'll try
3862 * and move all the excess tasks away.
3863 */
3864 if (prefer_sibling)
3865 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3866
3867 if (local_group) {
3868 sds->this_load = sgs.avg_load;
3869 sds->this = group;
3870 sds->this_nr_running = sgs.sum_nr_running;
3871 sds->this_load_per_task = sgs.sum_weighted_load;
3872 } else if (sgs.avg_load > sds->max_load &&
3873 (sgs.sum_nr_running > sgs.group_capacity ||
3874 sgs.group_imb)) {
3875 sds->max_load = sgs.avg_load;
3876 sds->busiest = group;
3877 sds->busiest_nr_running = sgs.sum_nr_running;
3878 sds->busiest_load_per_task = sgs.sum_weighted_load;
3879 sds->group_imb = sgs.group_imb;
3880 }
3881
3882 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3883 group = group->next;
3884 } while (group != sd->groups);
3885}
3886
3887/**
3888 * fix_small_imbalance - Calculate the minor imbalance that exists
3889 * amongst the groups of a sched_domain, during
3890 * load balancing.
3891 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3892 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3893 * @imbalance: Variable to store the imbalance.
3894 */
3895static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3896 int this_cpu, unsigned long *imbalance)
3897{
3898 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3899 unsigned int imbn = 2;
3900
3901 if (sds->this_nr_running) {
3902 sds->this_load_per_task /= sds->this_nr_running;
3903 if (sds->busiest_load_per_task >
3904 sds->this_load_per_task)
3905 imbn = 1;
3906 } else
3907 sds->this_load_per_task =
3908 cpu_avg_load_per_task(this_cpu);
3909
3910 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3911 sds->busiest_load_per_task * imbn) {
3912 *imbalance = sds->busiest_load_per_task;
3913 return;
3914 }
3915
3916 /*
3917 * OK, we don't have enough imbalance to justify moving tasks,
3918 * however we may be able to increase total CPU power used by
3919 * moving them.
3920 */
3921
3922 pwr_now += sds->busiest->cpu_power *
3923 min(sds->busiest_load_per_task, sds->max_load);
3924 pwr_now += sds->this->cpu_power *
3925 min(sds->this_load_per_task, sds->this_load);
3926 pwr_now /= SCHED_LOAD_SCALE;
3927
3928 /* Amount of load we'd subtract */
3929 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3930 sds->busiest->cpu_power;
3931 if (sds->max_load > tmp)
3932 pwr_move += sds->busiest->cpu_power *
3933 min(sds->busiest_load_per_task, sds->max_load - tmp);
3934
3935 /* Amount of load we'd add */
3936 if (sds->max_load * sds->busiest->cpu_power <
3937 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3938 tmp = (sds->max_load * sds->busiest->cpu_power) /
3939 sds->this->cpu_power;
3940 else
3941 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3942 sds->this->cpu_power;
3943 pwr_move += sds->this->cpu_power *
3944 min(sds->this_load_per_task, sds->this_load + tmp);
3945 pwr_move /= SCHED_LOAD_SCALE;
3946
3947 /* Move if we gain throughput */
3948 if (pwr_move > pwr_now)
3949 *imbalance = sds->busiest_load_per_task;
3950}
3951
3952/**
3953 * calculate_imbalance - Calculate the amount of imbalance present within the
3954 * groups of a given sched_domain during load balance.
3955 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3956 * @this_cpu: Cpu for which currently load balance is being performed.
3957 * @imbalance: The variable to store the imbalance.
3958 */
3959static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3960 unsigned long *imbalance)
3961{
3962 unsigned long max_pull;
3963 /*
3964 * In the presence of smp nice balancing, certain scenarios can have
3965	 * max load less than avg load (as we skip groups at or below
3966	 * their cpu_power while calculating max_load).
3967 */
3968 if (sds->max_load < sds->avg_load) {
3969 *imbalance = 0;
3970 return fix_small_imbalance(sds, this_cpu, imbalance);
3971 }
3972
3973 /* Don't want to pull so many tasks that a group would go idle */
3974 max_pull = min(sds->max_load - sds->avg_load,
3975 sds->max_load - sds->busiest_load_per_task);
3976
3977 /* How much load to actually move to equalise the imbalance */
3978 *imbalance = min(max_pull * sds->busiest->cpu_power,
3979 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3980 / SCHED_LOAD_SCALE;
3981
3982 /*
3983	 * If *imbalance is less than the average load per runnable task,
3984	 * there is no guarantee that any tasks will be moved, so consider
3985	 * bumping its value to force at least one task to be
3986	 * moved.
3987 */
3988 if (*imbalance < sds->busiest_load_per_task)
3989 return fix_small_imbalance(sds, this_cpu, imbalance);
3990
3991}
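
/*
 * The core arithmetic of calculate_imbalance() above, stripped of the
 * scheduler context so it can be followed in isolation. All load and power
 * values are invented and use the same 1/1024 fixed-point units as avg_load.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long max_load  = 1800;		/* busiest group, assumed */
	unsigned long avg_load  = 1200;		/* domain average, assumed */
	unsigned long this_load =  700;		/* local group, assumed */
	unsigned long busiest_load_per_task = 400;
	unsigned long busiest_power = 1024, this_power = 1024;

	/* don't pull so much that the busiest group drops below the average */
	unsigned long max_pull = min_ul(max_load - avg_load,
					max_load - busiest_load_per_task);

	/* move the smaller of what busiest can give and what we can take */
	unsigned long imbalance = min_ul(max_pull * busiest_power,
				(avg_load - this_load) * this_power)
				/ SCHED_LOAD_SCALE;

	printf("max_pull = %lu, imbalance = %lu\n", max_pull, imbalance);
	return 0;
}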
3992/******* find_busiest_group() helpers end here *********************/
3993
3994/**
3995 * find_busiest_group - Returns the busiest group within the sched_domain
3996 * if there is an imbalance. If there isn't an imbalance, and
3997 * the user has opted for power-savings, it returns a group whose
3998 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3999 * such a group exists.
4000 *
4001 * Also calculates the amount of weighted load which should be moved
4002 * to restore balance.
4003 *
4004 * @sd: The sched_domain whose busiest group is to be returned.
4005 * @this_cpu: The cpu for which load balancing is currently being performed.
4006 * @imbalance: Variable which stores amount of weighted load which should
4007 * be moved to restore balance/put a group to idle.
4008 * @idle: The idle status of this_cpu.
4009 * @sd_idle: The idleness of sd
4010 * @cpus: The set of CPUs under consideration for load-balancing.
4011 * @balance: Pointer to a variable indicating if this_cpu
4012 * is the appropriate cpu to perform load balancing at this_level.
4013 *
4014 * Returns: - the busiest group if imbalance exists.
4015 * - If no imbalance and user has opted for power-savings balance,
4016 * return the least loaded group whose CPUs can be
4017 * put to idle by rebalancing its tasks onto our group.
4018 */
4019static struct sched_group *
4020find_busiest_group(struct sched_domain *sd, int this_cpu,
4021 unsigned long *imbalance, enum cpu_idle_type idle,
4022 int *sd_idle, const struct cpumask *cpus, int *balance)
4023{
4024 struct sd_lb_stats sds;
4025
4026 memset(&sds, 0, sizeof(sds));
4027
4028 /*
4029	 * Compute the various statistics relevant for load balancing at
4030 * this level.
4031 */
4032 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4033 balance, &sds);
4034
4035 /* Cases where imbalance does not exist from POV of this_cpu */
4036 /* 1) this_cpu is not the appropriate cpu to perform load balancing
4037 * at this level.
4038 * 2) There is no busy sibling group to pull from.
4039 * 3) This group is the busiest group.
4040	 * 4) This group is busier than the average busyness of this
4041	 *    sched_domain.
4042 * 5) The imbalance is within the specified limit.
4043 * 6) Any rebalance would lead to ping-pong
4044 */
4045 if (balance && !(*balance))
4046 goto ret;
4047
4048 if (!sds.busiest || sds.busiest_nr_running == 0)
4049 goto out_balanced;
4050
4051 if (sds.this_load >= sds.max_load)
4052 goto out_balanced;
4053
4054 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4055
4056 if (sds.this_load >= sds.avg_load)
4057 goto out_balanced;
4058
4059 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4060 goto out_balanced;
4061
4062 sds.busiest_load_per_task /= sds.busiest_nr_running;
4063 if (sds.group_imb)
4064 sds.busiest_load_per_task =
4065 min(sds.busiest_load_per_task, sds.avg_load);
4066
4067 /*
4068 * We're trying to get all the cpus to the average_load, so we don't
4069 * want to push ourselves above the average load, nor do we wish to
4070 * reduce the max loaded cpu below the average load, as either of these
4071 * actions would just result in more rebalancing later, and ping-pong
4072 * tasks around. Thus we look for the minimum possible imbalance.
4073 * Negative imbalances (*we* are more loaded than anyone else) will
4074 * be counted as no imbalance for these purposes -- we can't fix that
4075 * by pulling tasks to us. Be careful of negative numbers as they'll
4076 * appear as very large values with unsigned longs.
4077 */
4078 if (sds.max_load <= sds.busiest_load_per_task)
4079 goto out_balanced;
4080
4081 /* Looks like there is an imbalance. Compute it */
4082 calculate_imbalance(&sds, this_cpu, imbalance);
4083 return sds.busiest;
4084
4085out_balanced:
4086 /*
4087 * There is no obvious imbalance. But check if we can do some balancing
4088 * to save power.
4089 */
4090 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4091 return sds.busiest;
4092ret:
4093 *imbalance = 0;
4094 return NULL;
4095}
4096
4097/*
4098 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4099 */
4100static struct rq *
4101find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4102 unsigned long imbalance, const struct cpumask *cpus)
4103{
4104 struct rq *busiest = NULL, *rq;
4105 unsigned long max_load = 0;
4106 int i;
4107
4108 for_each_cpu(i, sched_group_cpus(group)) {
4109 unsigned long power = power_of(i);
4110 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4111 unsigned long wl;
4112
4113 if (!cpumask_test_cpu(i, cpus))
4114 continue;
4115
4116 rq = cpu_rq(i);
4117 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4118 wl /= power;
4119
4120 if (capacity && rq->nr_running == 1 && wl > imbalance)
4121 continue;
4122
4123 if (wl > max_load) {
4124 max_load = wl;
4125 busiest = rq;
4126 }
4127 }
4128
4129 return busiest;
4130}
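
/*
 * A hedged sketch of the comparison done in find_busiest_queue() above:
 * each cpu's raw weighted load is rescaled by its cpu_power before the
 * maximum is taken, so a heavily loaded but weak cpu is judged fairly
 * against a stronger one. The loads and powers below are made up.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

int main(void)
{
	unsigned long load[3]  = { 2048, 1536, 1024 };	/* assumed weighted loads */
	unsigned long power[3] = { 1024,  512, 1024 };	/* assumed cpu_power */
	unsigned long max_wl = 0;
	int busiest = -1, i;

	for (i = 0; i < 3; i++) {
		unsigned long wl = load[i] * SCHED_LOAD_SCALE / power[i];

		if (wl > max_wl) {
			max_wl = wl;
			busiest = i;
		}
	}
	printf("busiest cpu = %d (scaled load %lu)\n", busiest, max_wl);
	return 0;
}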
4131
4132/*
4133 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4134 * so long as it is large enough.
4135 */
4136#define MAX_PINNED_INTERVAL 512
4137
4138/* Working cpumask for load_balance and load_balance_newidle. */
4139static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4140
4141/*
4142 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4143 * tasks if there is an imbalance.
4144 */
4145static int load_balance(int this_cpu, struct rq *this_rq,
4146 struct sched_domain *sd, enum cpu_idle_type idle,
4147 int *balance)
4148{
4149 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4150 struct sched_group *group;
4151 unsigned long imbalance;
4152 struct rq *busiest;
4153 unsigned long flags;
4154 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4155
4156 cpumask_copy(cpus, cpu_active_mask);
4157
4158 /*
4159	 * When the power savings policy is enabled for the parent domain, an
4160	 * idle sibling can pick up load irrespective of the busy siblings. In
4161	 * this case, let the state of the idle sibling percolate up as CPU_IDLE
4162	 * instead of portraying it as CPU_NOT_IDLE.
4163 */
4164 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4165 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4166 sd_idle = 1;
4167
4168 schedstat_inc(sd, lb_count[idle]);
4169
4170redo:
4171 update_shares(sd);
4172 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4173 cpus, balance);
4174
4175 if (*balance == 0)
4176 goto out_balanced;
4177
4178 if (!group) {
4179 schedstat_inc(sd, lb_nobusyg[idle]);
4180 goto out_balanced;
4181 }
4182
4183 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4184 if (!busiest) {
4185 schedstat_inc(sd, lb_nobusyq[idle]);
4186 goto out_balanced;
4187 }
4188
4189 BUG_ON(busiest == this_rq);
4190
4191 schedstat_add(sd, lb_imbalance[idle], imbalance);
4192
4193 ld_moved = 0;
4194 if (busiest->nr_running > 1) {
4195 /*
4196 * Attempt to move tasks. If find_busiest_group has found
4197 * an imbalance but busiest->nr_running <= 1, the group is
4198 * still unbalanced. ld_moved simply stays zero, so it is
4199 * correctly treated as an imbalance.
4200 */
4201 local_irq_save(flags);
4202 double_rq_lock(this_rq, busiest);
4203 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4204 imbalance, sd, idle, &all_pinned);
4205 double_rq_unlock(this_rq, busiest);
4206 local_irq_restore(flags);
4207
4208 /*
4209 * some other cpu did the load balance for us.
4210 */
4211 if (ld_moved && this_cpu != smp_processor_id())
4212 resched_cpu(this_cpu);
4213
4214 /* All tasks on this runqueue were pinned by CPU affinity */
4215 if (unlikely(all_pinned)) {
4216 cpumask_clear_cpu(cpu_of(busiest), cpus);
4217 if (!cpumask_empty(cpus))
4218 goto redo;
4219 goto out_balanced;
4220 }
4221 }
4222
4223 if (!ld_moved) {
4224 schedstat_inc(sd, lb_failed[idle]);
4225 sd->nr_balance_failed++;
4226
4227 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4228
4229 raw_spin_lock_irqsave(&busiest->lock, flags);
4230
4231			/* don't kick the migration_thread if the current
4232			 * task on the busiest cpu can't be moved to this_cpu
4233			 */
4234 if (!cpumask_test_cpu(this_cpu,
4235 &busiest->curr->cpus_allowed)) {
4236 raw_spin_unlock_irqrestore(&busiest->lock,
4237 flags);
4238 all_pinned = 1;
4239 goto out_one_pinned;
4240 }
4241
4242 if (!busiest->active_balance) {
4243 busiest->active_balance = 1;
4244 busiest->push_cpu = this_cpu;
4245 active_balance = 1;
4246 }
4247 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4248 if (active_balance)
4249 wake_up_process(busiest->migration_thread);
4250
4251 /*
4252 * We've kicked active balancing, reset the failure
4253 * counter.
4254 */
4255 sd->nr_balance_failed = sd->cache_nice_tries+1;
4256 }
4257 } else
4258 sd->nr_balance_failed = 0;
4259
4260 if (likely(!active_balance)) {
4261 /* We were unbalanced, so reset the balancing interval */
4262 sd->balance_interval = sd->min_interval;
4263 } else {
4264 /*
4265 * If we've begun active balancing, start to back off. This
4266 * case may not be covered by the all_pinned logic if there
4267 * is only 1 task on the busy runqueue (because we don't call
4268 * move_tasks).
4269 */
4270 if (sd->balance_interval < sd->max_interval)
4271 sd->balance_interval *= 2;
4272 }
4273
4274 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4275 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4276 ld_moved = -1;
4277
4278 goto out;
4279
4280out_balanced:
4281 schedstat_inc(sd, lb_balanced[idle]);
4282
4283 sd->nr_balance_failed = 0;
4284
4285out_one_pinned:
4286 /* tune up the balancing interval */
4287 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4288 (sd->balance_interval < sd->max_interval))
4289 sd->balance_interval *= 2;
4290
4291 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4292 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4293 ld_moved = -1;
4294 else
4295 ld_moved = 0;
4296out:
4297 if (ld_moved)
4298 update_shares(sd);
4299 return ld_moved;
4300}
4301
4302/*
4303 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4304 * tasks if there is an imbalance.
4305 *
4306 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4307 * this_rq is locked.
4308 */
4309static int
4310load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4311{
4312 struct sched_group *group;
4313 struct rq *busiest = NULL;
4314 unsigned long imbalance;
4315 int ld_moved = 0;
4316 int sd_idle = 0;
4317 int all_pinned = 0;
4318 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4319
4320 cpumask_copy(cpus, cpu_active_mask);
4321
4322 /*
4323	 * When the power savings policy is enabled for the parent domain, an
4324	 * idle sibling can pick up load irrespective of the busy siblings. In
4325	 * this case, let the state of the idle sibling percolate up as IDLE
4326	 * instead of portraying it as CPU_NOT_IDLE.
4327 */
4328 if (sd->flags & SD_SHARE_CPUPOWER &&
4329 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4330 sd_idle = 1;
4331
4332 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4333redo:
4334 update_shares_locked(this_rq, sd);
4335 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4336 &sd_idle, cpus, NULL);
4337 if (!group) {
4338 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4339 goto out_balanced;
4340 }
4341
4342 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4343 if (!busiest) {
4344 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4345 goto out_balanced;
4346 }
4347
4348 BUG_ON(busiest == this_rq);
4349
4350 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4351
4352 ld_moved = 0;
4353 if (busiest->nr_running > 1) {
4354 /* Attempt to move tasks */
4355 double_lock_balance(this_rq, busiest);
4356 /* this_rq->clock is already updated */
4357 update_rq_clock(busiest);
4358 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4359 imbalance, sd, CPU_NEWLY_IDLE,
4360 &all_pinned);
4361 double_unlock_balance(this_rq, busiest);
4362
4363 if (unlikely(all_pinned)) {
4364 cpumask_clear_cpu(cpu_of(busiest), cpus);
4365 if (!cpumask_empty(cpus))
4366 goto redo;
4367 }
4368 }
4369
4370 if (!ld_moved) {
4371 int active_balance = 0;
4372
4373 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4374 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4375 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4376 return -1;
4377
4378 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4379 return -1;
4380
4381 if (sd->nr_balance_failed++ < 2)
4382 return -1;
4383
4384 /*
4385		 * The only task running on a non-idle cpu can be moved to this
4386		 * cpu in an attempt to completely free up the other CPU
4387		 * package. The same method used to move tasks in load_balance()
4388		 * has been extended to load_balance_newidle() to speed up
4389		 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2).
4390		 *
4391		 * The package power saving logic comes from
4392		 * find_busiest_group(). If there is no imbalance, then
4393		 * f_b_g() will return NULL. However, when sched_mc={1,2},
4394		 * f_b_g() will select a group from which a running task may be
4395		 * pulled to this cpu in order to make the other package idle.
4396		 * If there is no opportunity to make a package idle and
4397		 * there is no imbalance, then f_b_g() will return NULL and no
4398 * action will be taken in load_balance_newidle().
4399 *
4400 * Under normal task pull operation due to imbalance, there
4401 * will be more than one task in the source run queue and
4402 * move_tasks() will succeed. ld_moved will be true and this
4403 * active balance code will not be triggered.
4404 */
4405
4406 /* Lock busiest in correct order while this_rq is held */
4407 double_lock_balance(this_rq, busiest);
4408
4409 /*
4410 * don't kick the migration_thread, if the curr
4411 * task on busiest cpu can't be moved to this_cpu
4412 */
4413 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4414 double_unlock_balance(this_rq, busiest);
4415 all_pinned = 1;
4416 return ld_moved;
4417 }
4418
4419 if (!busiest->active_balance) {
4420 busiest->active_balance = 1;
4421 busiest->push_cpu = this_cpu;
4422 active_balance = 1;
4423 }
4424
4425 double_unlock_balance(this_rq, busiest);
4426 /*
4427 * Should not call ttwu while holding a rq->lock
4428 */
4429 raw_spin_unlock(&this_rq->lock);
4430 if (active_balance)
4431 wake_up_process(busiest->migration_thread);
4432 raw_spin_lock(&this_rq->lock);
4433
4434 } else
4435 sd->nr_balance_failed = 0;
4436
4437 update_shares_locked(this_rq, sd);
4438 return ld_moved;
4439
4440out_balanced:
4441 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4442 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4443 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4444 return -1;
4445 sd->nr_balance_failed = 0;
4446
4447 return 0;
4448}
4449
4450/*
4451 * idle_balance is called by schedule() if this_cpu is about to become
4452 * idle. Attempts to pull tasks from other CPUs.
4453 */
4454static void idle_balance(int this_cpu, struct rq *this_rq)
4455{
4456 struct sched_domain *sd;
4457 int pulled_task = 0;
4458 unsigned long next_balance = jiffies + HZ;
4459
4460 this_rq->idle_stamp = this_rq->clock;
4461
4462 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4463 return;
4464
4465 for_each_domain(this_cpu, sd) {
4466 unsigned long interval;
4467
4468 if (!(sd->flags & SD_LOAD_BALANCE))
4469 continue;
4470
4471 if (sd->flags & SD_BALANCE_NEWIDLE)
4472 /* If we've pulled tasks over stop searching: */
4473 pulled_task = load_balance_newidle(this_cpu, this_rq,
4474 sd);
4475
4476 interval = msecs_to_jiffies(sd->balance_interval);
4477 if (time_after(next_balance, sd->last_balance + interval))
4478 next_balance = sd->last_balance + interval;
4479 if (pulled_task) {
4480 this_rq->idle_stamp = 0;
4481 break;
4482 }
4483 }
4484 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4485 /*
4486 * We are going idle. next_balance may be set based on
4487 * a busy processor. So reset next_balance.
4488 */
4489 this_rq->next_balance = next_balance;
4490 }
4491}
4492
4493/*
4494 * active_load_balance is run by migration threads. It pushes running tasks
4495 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4496 * running on each physical CPU where possible, and avoids physical /
4497 * logical imbalances.
4498 *
4499 * Called with busiest_rq locked.
4500 */
4501static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4502{
4503 int target_cpu = busiest_rq->push_cpu;
4504 struct sched_domain *sd;
4505 struct rq *target_rq;
4506
4507 /* Is there any task to move? */
4508 if (busiest_rq->nr_running <= 1)
4509 return;
4510
4511 target_rq = cpu_rq(target_cpu);
4512
4513 /*
4514 * This condition is "impossible", if it occurs
4515 * we need to fix it. Originally reported by
4516 * Bjorn Helgaas on a 128-cpu setup.
4517 */
4518 BUG_ON(busiest_rq == target_rq);
4519
4520 /* move a task from busiest_rq to target_rq */
4521 double_lock_balance(busiest_rq, target_rq);
4522 update_rq_clock(busiest_rq);
4523 update_rq_clock(target_rq);
4524
4525 /* Search for an sd spanning us and the target CPU. */
4526 for_each_domain(target_cpu, sd) {
4527 if ((sd->flags & SD_LOAD_BALANCE) &&
4528 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4529 break;
4530 }
4531
4532 if (likely(sd)) {
4533 schedstat_inc(sd, alb_count);
4534
4535 if (move_one_task(target_rq, target_cpu, busiest_rq,
4536 sd, CPU_IDLE))
4537 schedstat_inc(sd, alb_pushed);
4538 else
4539 schedstat_inc(sd, alb_failed);
4540 }
4541 double_unlock_balance(busiest_rq, target_rq);
4542}
4543
4544#ifdef CONFIG_NO_HZ
4545static struct {
4546 atomic_t load_balancer;
4547 cpumask_var_t cpu_mask;
4548 cpumask_var_t ilb_grp_nohz_mask;
4549} nohz ____cacheline_aligned = {
4550 .load_balancer = ATOMIC_INIT(-1),
4551};
4552
4553int get_nohz_load_balancer(void)
4554{
4555 return atomic_read(&nohz.load_balancer);
4556}
4557
4558#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4559/**
4560 * lowest_flag_domain - Return lowest sched_domain containing flag.
4561 * @cpu: The cpu whose lowest level of sched domain is to
4562 * be returned.
4563 * @flag: The flag to check for the lowest sched_domain
4564 * for the given cpu.
4565 *
4566 * Returns the lowest sched_domain of a cpu which contains the given flag.
4567 */
4568static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4569{
4570 struct sched_domain *sd;
4571
4572 for_each_domain(cpu, sd)
4573 if (sd && (sd->flags & flag))
4574 break;
4575
4576 return sd;
4577}
4578
4579/**
4580 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4581 * @cpu: The cpu whose domains we're iterating over.
4582 * @sd: variable holding the value of the power_savings_sd
4583 * for cpu.
4584 * @flag: The flag to filter the sched_domains to be iterated.
4585 *
4586 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4587 * set, starting from the lowest sched_domain to the highest.
4588 */
4589#define for_each_flag_domain(cpu, sd, flag) \
4590 for (sd = lowest_flag_domain(cpu, flag); \
4591 (sd && (sd->flags & flag)); sd = sd->parent)
4592
4593/**
4594 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4595 * @ilb_group: group to be checked for semi-idleness
4596 *
4597 * Returns: 1 if the group is semi-idle. 0 otherwise.
4598 *
4599 * We define a sched_group to be semi-idle if it has at least one idle
4600 * CPU and at least one non-idle CPU. This helper function checks if the
4601 * given sched_group is semi-idle or not.
4602 */
4603static inline int is_semi_idle_group(struct sched_group *ilb_group)
4604{
4605 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4606 sched_group_cpus(ilb_group));
4607
4608 /*
4609	 * A sched_group is semi-idle when it has at least one busy cpu
4610	 * and at least one idle cpu.
4611 */
4612 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4613 return 0;
4614
4615 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4616 return 0;
4617
4618 return 1;
4619}
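
/*
 * The semi-idle test above is just two set operations. The same check,
 * sketched with plain bitmasks standing in for cpumasks (the masks below
 * are arbitrary):
 */
#include <stdio.h>

static int is_semi_idle(unsigned long group_cpus, unsigned long nohz_cpus)
{
	unsigned long idle_in_group = group_cpus & nohz_cpus;

	if (idle_in_group == 0)			/* no idle cpu in the group */
		return 0;
	if (idle_in_group == group_cpus)	/* every cpu in the group is idle */
		return 0;
	return 1;				/* at least one idle and one busy */
}

int main(void)
{
	unsigned long group = 0x0f;	/* cpus 0-3, assumed */
	unsigned long nohz  = 0x06;	/* cpus 1-2 have stopped their tick */

	printf("semi-idle: %d\n", is_semi_idle(group, nohz));
	return 0;
}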
4620/**
4621 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4622 * @cpu: The cpu which is nominating a new idle_load_balancer.
4623 *
4624 * Returns:	the id of the idle load balancer if it exists,
4625 *		else a value >= nr_cpu_ids.
4626 *
4627 * This algorithm picks the idle load balancer such that it belongs to a
4628 * semi-idle powersavings sched_domain. The idea is to avoid using a
4629 * completely idle package/core just for the purpose of idle load balancing
4630 * when there are other idle cpus better suited for that job.
4631 */
4632static int find_new_ilb(int cpu)
4633{
4634 struct sched_domain *sd;
4635 struct sched_group *ilb_group;
4636
4637 /*
4638	 * Pick the idle load balancer from semi-idle packages only
4639	 * when power-aware load balancing is enabled.
4640 */
4641 if (!(sched_smt_power_savings || sched_mc_power_savings))
4642 goto out_done;
4643
4644 /*
4645 * Optimize for the case when we have no idle CPUs or only one
4646 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4647 */
4648 if (cpumask_weight(nohz.cpu_mask) < 2)
4649 goto out_done;
4650
4651 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4652 ilb_group = sd->groups;
4653
4654 do {
4655 if (is_semi_idle_group(ilb_group))
4656 return cpumask_first(nohz.ilb_grp_nohz_mask);
4657
4658 ilb_group = ilb_group->next;
4659
4660 } while (ilb_group != sd->groups);
4661 }
4662
4663out_done:
4664 return cpumask_first(nohz.cpu_mask);
4665}
4666#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4667static inline int find_new_ilb(int call_cpu)
4668{
4669 return cpumask_first(nohz.cpu_mask);
4670}
4671#endif
4672
4673/*
4674 * This routine tries to nominate an ilb (idle load balancing) owner
4675 * among the cpus whose ticks are stopped. The ilb owner does the idle
4676 * load balancing on behalf of all those cpus. If all the cpus in the
4677 * system go into this tickless mode, then there will be no ilb owner
4678 * (as there is no need for one) and all the cpus will sleep until the
4679 * next wakeup event arrives.
4680 *
4681 * The tick is not stopped for the ilb owner. This tick is used for
4682 * idle load balancing, and the ilb owner remains part of
4683 * nohz.cpu_mask.
4684 *
4685 * While stopping the tick, this cpu becomes the ilb owner if there is
4686 * no other owner, and stays the owner until it becomes busy or until
4687 * all cpus in the system stop their ticks, at which point there is no
4688 * need for an ilb owner.
4689 *
4690 * When the ilb owner becomes busy, it nominates another owner during
4691 * the next busy scheduler_tick().
4692 */
4693int select_nohz_load_balancer(int stop_tick)
4694{
4695 int cpu = smp_processor_id();
4696
4697 if (stop_tick) {
4698 cpu_rq(cpu)->in_nohz_recently = 1;
4699
4700 if (!cpu_active(cpu)) {
4701 if (atomic_read(&nohz.load_balancer) != cpu)
4702 return 0;
4703
4704 /*
4705 * If we are going offline and still the leader,
4706 * give up!
4707 */
4708 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4709 BUG();
4710
4711 return 0;
4712 }
4713
4714 cpumask_set_cpu(cpu, nohz.cpu_mask);
4715
4716		/* time for the ilb owner to sleep as well */
4717 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4718 if (atomic_read(&nohz.load_balancer) == cpu)
4719 atomic_set(&nohz.load_balancer, -1);
4720 return 0;
4721 }
4722
4723 if (atomic_read(&nohz.load_balancer) == -1) {
4724 /* make me the ilb owner */
4725 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4726 return 1;
4727 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4728 int new_ilb;
4729
4730 if (!(sched_smt_power_savings ||
4731 sched_mc_power_savings))
4732 return 1;
4733 /*
4734 * Check to see if there is a more power-efficient
4735 * ilb.
4736 */
4737 new_ilb = find_new_ilb(cpu);
4738 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4739 atomic_set(&nohz.load_balancer, -1);
4740 resched_cpu(new_ilb);
4741 return 0;
4742 }
4743 return 1;
4744 }
4745 } else {
4746 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4747 return 0;
4748
4749 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4750
4751 if (atomic_read(&nohz.load_balancer) == cpu)
4752 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4753 BUG();
4754 }
4755 return 0;
4756}
4757#endif
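
/*
 * The owner election in select_nohz_load_balancer() above boils down to a
 * compare-and-swap on one shared slot. Below is a rough C11 userspace
 * analogue of the claim/release protocol, with no real cpus or ticks
 * behind it; it only sketches the atomic hand-over.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int load_balancer = -1;		/* -1 means "no ilb owner" */

static int try_become_ilb(int cpu)
{
	int expected = -1;

	/* succeed only if nobody owns the slot yet */
	return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

static void give_up_ilb(int cpu)
{
	int expected = cpu;

	/* release only if we are still the owner */
	atomic_compare_exchange_strong(&load_balancer, &expected, -1);
}

int main(void)
{
	printf("cpu 2 claims: %d\n", try_become_ilb(2));	/* succeeds */
	printf("cpu 5 claims: %d\n", try_become_ilb(5));	/* fails, owned */
	give_up_ilb(2);
	printf("cpu 5 retries: %d\n", try_become_ilb(5));	/* succeeds */
	return 0;
}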
4758
4759static DEFINE_SPINLOCK(balancing);
4760
4761/*
4762 * Check each scheduling domain to see if it is due to be balanced,
4763 * and initiate a balancing operation if so.
4764 *
4765 * Balancing parameters are set up in arch_init_sched_domains.
4766 */
4767static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4768{
4769 int balance = 1;
4770 struct rq *rq = cpu_rq(cpu);
4771 unsigned long interval;
4772 struct sched_domain *sd;
4773 /* Earliest time when we have to do rebalance again */
4774 unsigned long next_balance = jiffies + 60*HZ;
4775 int update_next_balance = 0;
4776 int need_serialize;
4777
4778 for_each_domain(cpu, sd) {
4779 if (!(sd->flags & SD_LOAD_BALANCE))
4780 continue;
4781
4782 interval = sd->balance_interval;
4783 if (idle != CPU_IDLE)
4784 interval *= sd->busy_factor;
4785
4786 /* scale ms to jiffies */
4787 interval = msecs_to_jiffies(interval);
4788 if (unlikely(!interval))
4789 interval = 1;
4790 if (interval > HZ*NR_CPUS/10)
4791 interval = HZ*NR_CPUS/10;
4792
4793 need_serialize = sd->flags & SD_SERIALIZE;
4794
4795 if (need_serialize) {
4796 if (!spin_trylock(&balancing))
4797 goto out;
4798 }
4799
4800 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4801 if (load_balance(cpu, rq, sd, idle, &balance)) {
4802 /*
4803 * We've pulled tasks over so either we're no
4804 * longer idle, or one of our SMT siblings is
4805 * not idle.
4806 */
4807 idle = CPU_NOT_IDLE;
4808 }
4809 sd->last_balance = jiffies;
4810 }
4811 if (need_serialize)
4812 spin_unlock(&balancing);
4813out:
4814 if (time_after(next_balance, sd->last_balance + interval)) {
4815 next_balance = sd->last_balance + interval;
4816 update_next_balance = 1;
4817 }
4818
4819 /*
4820 * Stop the load balance at this level. There is another
4821 * CPU in our sched group which is doing load balancing more
4822 * actively.
4823 */
4824 if (!balance)
4825 break;
4826 }
4827
4828 /*
4829 * next_balance will be updated only when there is a need.
4830	 * When the cpu is attached to the null domain, for example, it will
4831	 * not be updated.
4832 */
4833 if (likely(update_next_balance))
4834 rq->next_balance = next_balance;
4835}
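
/*
 * The interval handling in rebalance_domains() above is simple saturating
 * arithmetic on milliseconds. A standalone sketch with assumed values for
 * HZ, NR_CPUS and the per-domain parameters:
 */
#include <stdio.h>

#define HZ	250UL		/* assumed tick rate */
#define NR_CPUS	64UL		/* assumed */

static unsigned long msecs_to_jiffies(unsigned long ms)
{
	return (ms * HZ + 999) / 1000;	/* simplified round-up conversion */
}

int main(void)
{
	unsigned long balance_interval = 8;	/* ms, assumed sd value */
	unsigned long busy_factor = 32;		/* assumed sd value */
	int cpu_is_idle = 0;
	unsigned long interval = balance_interval;

	if (!cpu_is_idle)
		interval *= busy_factor;	/* balance less often when busy */

	interval = msecs_to_jiffies(interval);
	if (!interval)
		interval = 1;
	if (interval > HZ * NR_CPUS / 10)	/* global upper clamp */
		interval = HZ * NR_CPUS / 10;

	printf("rebalance due every %lu jiffies\n", interval);
	return 0;
}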
4836
4837/*
4838 * run_rebalance_domains is triggered when needed from the scheduler tick.
4839 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4840 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4841 */
4842static void run_rebalance_domains(struct softirq_action *h)
4843{
4844 int this_cpu = smp_processor_id();
4845 struct rq *this_rq = cpu_rq(this_cpu);
4846 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4847 CPU_IDLE : CPU_NOT_IDLE;
4848
4849 rebalance_domains(this_cpu, idle);
4850
4851#ifdef CONFIG_NO_HZ
4852 /*
4853 * If this cpu is the owner for idle load balancing, then do the
4854 * balancing on behalf of the other idle cpus whose ticks are
4855 * stopped.
4856 */
4857 if (this_rq->idle_at_tick &&
4858 atomic_read(&nohz.load_balancer) == this_cpu) {
4859 struct rq *rq;
4860 int balance_cpu;
4861
4862 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4863 if (balance_cpu == this_cpu)
4864 continue;
4865
4866 /*
4867 * If this cpu gets work to do, stop the load balancing
4868 * work being done for other cpus. Next load
4869 * balancing owner will pick it up.
4870 */
4871 if (need_resched())
4872 break;
4873
4874 rebalance_domains(balance_cpu, CPU_IDLE);
4875
4876 rq = cpu_rq(balance_cpu);
4877 if (time_after(this_rq->next_balance, rq->next_balance))
4878 this_rq->next_balance = rq->next_balance;
4879 }
4880 }
4881#endif
4882}
4883
4884static inline int on_null_domain(int cpu)
4885{
4886 return !rcu_dereference(cpu_rq(cpu)->sd);
4887}
4888
4889/*
4890 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4891 *
4892 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4893 * idle load balancing owner or decide to stop the periodic load balancing,
4894 * if the whole system is idle.
4895 */
4896static inline void trigger_load_balance(struct rq *rq, int cpu)
4897{
4898#ifdef CONFIG_NO_HZ
4899 /*
4900 * If we were in the nohz mode recently and busy at the current
4901	 * scheduler tick, then check if we need to nominate a new idle
4902 * load balancer.
4903 */
4904 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4905 rq->in_nohz_recently = 0;
4906
4907 if (atomic_read(&nohz.load_balancer) == cpu) {
4908 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4909 atomic_set(&nohz.load_balancer, -1);
4910 }
4911
4912 if (atomic_read(&nohz.load_balancer) == -1) {
4913 int ilb = find_new_ilb(cpu);
4914
4915 if (ilb < nr_cpu_ids)
4916 resched_cpu(ilb);
4917 }
4918 }
4919
4920 /*
4921 * If this cpu is idle and doing idle load balancing for all the
4922 * cpus with ticks stopped, is it time for that to stop?
4923 */
4924 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4925 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4926 resched_cpu(cpu);
4927 return;
4928 }
4929
4930 /*
4931 * If this cpu is idle and the idle load balancing is done by
4932	 * someone else, then there is no need to raise the SCHED_SOFTIRQ
4933 */
4934 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4935 cpumask_test_cpu(cpu, nohz.cpu_mask))
4936 return;
4937#endif
4938 /* Don't need to rebalance while attached to NULL domain */
4939 if (time_after_eq(jiffies, rq->next_balance) &&
4940 likely(!on_null_domain(cpu)))
4941 raise_softirq(SCHED_SOFTIRQ);
4942}
4943
4944#else /* CONFIG_SMP */
4945
4946/*
4947 * On UP we do not need to balance between CPUs.
4948 */
4949static inline void idle_balance(int cpu, struct rq *rq)
4950{
4951}
4952
4953#endif 3175#endif
4954 3176
4955DEFINE_PER_CPU(struct kernel_stat, kstat); 3177DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -5568,7 +3790,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5568 * the mutex owner just released it and exited. 3790 * the mutex owner just released it and exited.
5569 */ 3791 */
5570 if (probe_kernel_address(&owner->cpu, cpu)) 3792 if (probe_kernel_address(&owner->cpu, cpu))
5571 goto out; 3793 return 0;
5572#else 3794#else
5573 cpu = owner->cpu; 3795 cpu = owner->cpu;
5574#endif 3796#endif
@@ -5578,14 +3800,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5578 * the cpu field may no longer be valid. 3800 * the cpu field may no longer be valid.
5579 */ 3801 */
5580 if (cpu >= nr_cpumask_bits) 3802 if (cpu >= nr_cpumask_bits)
5581 goto out; 3803 return 0;
5582 3804
5583 /* 3805 /*
5584 * We need to validate that we can do a 3806 * We need to validate that we can do a
5585 * get_cpu() and that we have the percpu area. 3807 * get_cpu() and that we have the percpu area.
5586 */ 3808 */
5587 if (!cpu_online(cpu)) 3809 if (!cpu_online(cpu))
5588 goto out; 3810 return 0;
5589 3811
5590 rq = cpu_rq(cpu); 3812 rq = cpu_rq(cpu);
5591 3813
@@ -5604,7 +3826,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5604 3826
5605 cpu_relax(); 3827 cpu_relax();
5606 } 3828 }
5607out: 3829
5608 return 1; 3830 return 1;
5609} 3831}
5610#endif 3832#endif
@@ -6049,7 +4271,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6049 unsigned long flags; 4271 unsigned long flags;
6050 int oldprio, on_rq, running; 4272 int oldprio, on_rq, running;
6051 struct rq *rq; 4273 struct rq *rq;
6052 const struct sched_class *prev_class = p->sched_class; 4274 const struct sched_class *prev_class;
6053 4275
6054 BUG_ON(prio < 0 || prio > MAX_PRIO); 4276 BUG_ON(prio < 0 || prio > MAX_PRIO);
6055 4277
@@ -6057,6 +4279,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6057 update_rq_clock(rq); 4279 update_rq_clock(rq);
6058 4280
6059 oldprio = p->prio; 4281 oldprio = p->prio;
4282 prev_class = p->sched_class;
6060 on_rq = p->se.on_rq; 4283 on_rq = p->se.on_rq;
6061 running = task_current(rq, p); 4284 running = task_current(rq, p);
6062 if (on_rq) 4285 if (on_rq)
@@ -6074,7 +4297,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6074 if (running) 4297 if (running)
6075 p->sched_class->set_curr_task(rq); 4298 p->sched_class->set_curr_task(rq);
6076 if (on_rq) { 4299 if (on_rq) {
6077 enqueue_task(rq, p, 0); 4300 enqueue_task(rq, p, 0, oldprio < prio);
6078 4301
6079 check_class_changed(rq, p, prev_class, oldprio, running); 4302 check_class_changed(rq, p, prev_class, oldprio, running);
6080 } 4303 }
@@ -6118,7 +4341,7 @@ void set_user_nice(struct task_struct *p, long nice)
6118 delta = p->prio - old_prio; 4341 delta = p->prio - old_prio;
6119 4342
6120 if (on_rq) { 4343 if (on_rq) {
6121 enqueue_task(rq, p, 0); 4344 enqueue_task(rq, p, 0, false);
6122 /* 4345 /*
6123 * If the task increased its priority or is running and 4346 * If the task increased its priority or is running and
6124 * lowered its priority, then reschedule its CPU: 4347 * lowered its priority, then reschedule its CPU:
@@ -6141,7 +4364,7 @@ int can_nice(const struct task_struct *p, const int nice)
6141 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4364 /* convert nice value [19,-20] to rlimit style value [1,40] */
6142 int nice_rlim = 20 - nice; 4365 int nice_rlim = 20 - nice;
6143 4366
6144 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 4367 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
6145 capable(CAP_SYS_NICE)); 4368 capable(CAP_SYS_NICE));
6146} 4369}
6147 4370
@@ -6276,7 +4499,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6276{ 4499{
6277 int retval, oldprio, oldpolicy = -1, on_rq, running; 4500 int retval, oldprio, oldpolicy = -1, on_rq, running;
6278 unsigned long flags; 4501 unsigned long flags;
6279 const struct sched_class *prev_class = p->sched_class; 4502 const struct sched_class *prev_class;
6280 struct rq *rq; 4503 struct rq *rq;
6281 int reset_on_fork; 4504 int reset_on_fork;
6282 4505
@@ -6318,7 +4541,7 @@ recheck:
6318 4541
6319 if (!lock_task_sighand(p, &flags)) 4542 if (!lock_task_sighand(p, &flags))
6320 return -ESRCH; 4543 return -ESRCH;
6321 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4544 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
6322 unlock_task_sighand(p, &flags); 4545 unlock_task_sighand(p, &flags);
6323 4546
6324 /* can't set/change the rt policy */ 4547 /* can't set/change the rt policy */
@@ -6390,6 +4613,7 @@ recheck:
6390 p->sched_reset_on_fork = reset_on_fork; 4613 p->sched_reset_on_fork = reset_on_fork;
6391 4614
6392 oldprio = p->prio; 4615 oldprio = p->prio;
4616 prev_class = p->sched_class;
6393 __setscheduler(rq, p, policy, param->sched_priority); 4617 __setscheduler(rq, p, policy, param->sched_priority);
6394 4618
6395 if (running) 4619 if (running)
@@ -6689,7 +4913,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6689 int ret; 4913 int ret;
6690 cpumask_var_t mask; 4914 cpumask_var_t mask;
6691 4915
6692 if (len < cpumask_size()) 4916 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4917 return -EINVAL;
4918 if (len & (sizeof(unsigned long)-1))
6693 return -EINVAL; 4919 return -EINVAL;
6694 4920
6695 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4921 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
@@ -6697,10 +4923,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6697 4923
6698 ret = sched_getaffinity(pid, mask); 4924 ret = sched_getaffinity(pid, mask);
6699 if (ret == 0) { 4925 if (ret == 0) {
6700 if (copy_to_user(user_mask_ptr, mask, cpumask_size())) 4926 size_t retlen = min_t(size_t, len, cpumask_size());
4927
4928 if (copy_to_user(user_mask_ptr, mask, retlen))
6701 ret = -EFAULT; 4929 ret = -EFAULT;
6702 else 4930 else
6703 ret = cpumask_size(); 4931 ret = retlen;
6704 } 4932 }
6705 free_cpumask_var(mask); 4933 free_cpumask_var(mask);
6706 4934
@@ -7140,23 +5368,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7140 struct rq *rq; 5368 struct rq *rq;
7141 int ret = 0; 5369 int ret = 0;
7142 5370
7143 /*
7144 * Since we rely on wake-ups to migrate sleeping tasks, don't change
7145 * the ->cpus_allowed mask from under waking tasks, which would be
7146 * possible when we change rq->lock in ttwu(), so synchronize against
7147 * TASK_WAKING to avoid that.
7148 */
7149again:
7150 while (p->state == TASK_WAKING)
7151 cpu_relax();
7152
7153 rq = task_rq_lock(p, &flags); 5371 rq = task_rq_lock(p, &flags);
7154 5372
7155 if (p->state == TASK_WAKING) {
7156 task_rq_unlock(rq, &flags);
7157 goto again;
7158 }
7159
7160 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5373 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7161 ret = -EINVAL; 5374 ret = -EINVAL;
7162 goto out; 5375 goto out;
@@ -7185,7 +5398,7 @@ again:
7185 5398
7186 get_task_struct(mt); 5399 get_task_struct(mt);
7187 task_rq_unlock(rq, &flags); 5400 task_rq_unlock(rq, &flags);
7188 wake_up_process(rq->migration_thread); 5401 wake_up_process(mt);
7189 put_task_struct(mt); 5402 put_task_struct(mt);
7190 wait_for_completion(&req.done); 5403 wait_for_completion(&req.done);
7191 tlb_migrate_finish(p->mm); 5404 tlb_migrate_finish(p->mm);
@@ -9208,11 +7421,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
9208 7421
9209#ifdef CONFIG_SCHED_MC 7422#ifdef CONFIG_SCHED_MC
9210static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7423static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7424 struct sysdev_class_attribute *attr,
9211 char *page) 7425 char *page)
9212{ 7426{
9213 return sprintf(page, "%u\n", sched_mc_power_savings); 7427 return sprintf(page, "%u\n", sched_mc_power_savings);
9214} 7428}
9215static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7429static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7430 struct sysdev_class_attribute *attr,
9216 const char *buf, size_t count) 7431 const char *buf, size_t count)
9217{ 7432{
9218 return sched_power_savings_store(buf, count, 0); 7433 return sched_power_savings_store(buf, count, 0);
@@ -9224,11 +7439,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
9224 7439
9225#ifdef CONFIG_SCHED_SMT 7440#ifdef CONFIG_SCHED_SMT
9226static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7441static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7442 struct sysdev_class_attribute *attr,
9227 char *page) 7443 char *page)
9228{ 7444{
9229 return sprintf(page, "%u\n", sched_smt_power_savings); 7445 return sprintf(page, "%u\n", sched_smt_power_savings);
9230} 7446}
9231static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7447static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7448 struct sysdev_class_attribute *attr,
9232 const char *buf, size_t count) 7449 const char *buf, size_t count)
9233{ 7450{
9234 return sched_power_savings_store(buf, count, 1); 7451 return sched_power_savings_store(buf, count, 1);
@@ -9443,7 +7660,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9443 tg->rt_rq[cpu] = rt_rq; 7660 tg->rt_rq[cpu] = rt_rq;
9444 init_rt_rq(rt_rq, rq); 7661 init_rt_rq(rt_rq, rq);
9445 rt_rq->tg = tg; 7662 rt_rq->tg = tg;
9446 rt_rq->rt_se = rt_se;
9447 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7663 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9448 if (add) 7664 if (add)
9449 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7665 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9474,9 +7690,6 @@ void __init sched_init(void)
9474#ifdef CONFIG_RT_GROUP_SCHED 7690#ifdef CONFIG_RT_GROUP_SCHED
9475 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7691 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9476#endif 7692#endif
9477#ifdef CONFIG_USER_SCHED
9478 alloc_size *= 2;
9479#endif
9480#ifdef CONFIG_CPUMASK_OFFSTACK 7693#ifdef CONFIG_CPUMASK_OFFSTACK
9481 alloc_size += num_possible_cpus() * cpumask_size(); 7694 alloc_size += num_possible_cpus() * cpumask_size();
9482#endif 7695#endif
@@ -9490,13 +7703,6 @@ void __init sched_init(void)
9490 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7703 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9491 ptr += nr_cpu_ids * sizeof(void **); 7704 ptr += nr_cpu_ids * sizeof(void **);
9492 7705
9493#ifdef CONFIG_USER_SCHED
9494 root_task_group.se = (struct sched_entity **)ptr;
9495 ptr += nr_cpu_ids * sizeof(void **);
9496
9497 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9498 ptr += nr_cpu_ids * sizeof(void **);
9499#endif /* CONFIG_USER_SCHED */
9500#endif /* CONFIG_FAIR_GROUP_SCHED */ 7706#endif /* CONFIG_FAIR_GROUP_SCHED */
9501#ifdef CONFIG_RT_GROUP_SCHED 7707#ifdef CONFIG_RT_GROUP_SCHED
9502 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7708 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9505,13 +7711,6 @@ void __init sched_init(void)
9505 init_task_group.rt_rq = (struct rt_rq **)ptr; 7711 init_task_group.rt_rq = (struct rt_rq **)ptr;
9506 ptr += nr_cpu_ids * sizeof(void **); 7712 ptr += nr_cpu_ids * sizeof(void **);
9507 7713
9508#ifdef CONFIG_USER_SCHED
9509 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9510 ptr += nr_cpu_ids * sizeof(void **);
9511
9512 root_task_group.rt_rq = (struct rt_rq **)ptr;
9513 ptr += nr_cpu_ids * sizeof(void **);
9514#endif /* CONFIG_USER_SCHED */
9515#endif /* CONFIG_RT_GROUP_SCHED */ 7714#endif /* CONFIG_RT_GROUP_SCHED */
9516#ifdef CONFIG_CPUMASK_OFFSTACK 7715#ifdef CONFIG_CPUMASK_OFFSTACK
9517 for_each_possible_cpu(i) { 7716 for_each_possible_cpu(i) {
@@ -9531,22 +7730,13 @@ void __init sched_init(void)
9531#ifdef CONFIG_RT_GROUP_SCHED 7730#ifdef CONFIG_RT_GROUP_SCHED
9532 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7731 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9533 global_rt_period(), global_rt_runtime()); 7732 global_rt_period(), global_rt_runtime());
9534#ifdef CONFIG_USER_SCHED
9535 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9536 global_rt_period(), RUNTIME_INF);
9537#endif /* CONFIG_USER_SCHED */
9538#endif /* CONFIG_RT_GROUP_SCHED */ 7733#endif /* CONFIG_RT_GROUP_SCHED */
9539 7734
9540#ifdef CONFIG_GROUP_SCHED 7735#ifdef CONFIG_CGROUP_SCHED
9541 list_add(&init_task_group.list, &task_groups); 7736 list_add(&init_task_group.list, &task_groups);
9542 INIT_LIST_HEAD(&init_task_group.children); 7737 INIT_LIST_HEAD(&init_task_group.children);
9543 7738
9544#ifdef CONFIG_USER_SCHED 7739#endif /* CONFIG_CGROUP_SCHED */
9545 INIT_LIST_HEAD(&root_task_group.children);
9546 init_task_group.parent = &root_task_group;
9547 list_add(&init_task_group.siblings, &root_task_group.children);
9548#endif /* CONFIG_USER_SCHED */
9549#endif /* CONFIG_GROUP_SCHED */
9550 7740
9551#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7741#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9552 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7742 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9586,25 +7776,6 @@ void __init sched_init(void)
9586 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7776 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9587 */ 7777 */
9588 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7778 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9589#elif defined CONFIG_USER_SCHED
9590 root_task_group.shares = NICE_0_LOAD;
9591 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9592 /*
9593 * In case of task-groups formed thr' the user id of tasks,
9594 * init_task_group represents tasks belonging to root user.
9595 * Hence it forms a sibling of all subsequent groups formed.
9596 * In this case, init_task_group gets only a fraction of overall
9597 * system cpu resource, based on the weight assigned to root
9598 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9599 * by letting tasks of init_task_group sit in a separate cfs_rq
9600 * (init_tg_cfs_rq) and having one entity represent this group of
9601 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9602 */
9603 init_tg_cfs_entry(&init_task_group,
9604 &per_cpu(init_tg_cfs_rq, i),
9605 &per_cpu(init_sched_entity, i), i, 1,
9606 root_task_group.se[i]);
9607
9608#endif 7779#endif
9609#endif /* CONFIG_FAIR_GROUP_SCHED */ 7780#endif /* CONFIG_FAIR_GROUP_SCHED */
9610 7781
@@ -9613,12 +7784,6 @@ void __init sched_init(void)
9613 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7784 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9614#ifdef CONFIG_CGROUP_SCHED 7785#ifdef CONFIG_CGROUP_SCHED
9615 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7786 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9616#elif defined CONFIG_USER_SCHED
9617 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9618 init_tg_rt_entry(&init_task_group,
9619 &per_cpu(init_rt_rq_var, i),
9620 &per_cpu(init_sched_rt_entity, i), i, 1,
9621 root_task_group.rt_se[i]);
9622#endif 7787#endif
9623#endif 7788#endif
9624 7789
@@ -9703,7 +7868,7 @@ static inline int preempt_count_equals(int preempt_offset)
9703 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7868 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9704} 7869}
9705 7870
9706void __might_sleep(char *file, int line, int preempt_offset) 7871void __might_sleep(const char *file, int line, int preempt_offset)
9707{ 7872{
9708#ifdef in_atomic 7873#ifdef in_atomic
9709 static unsigned long prev_jiffy; /* ratelimiting */ 7874 static unsigned long prev_jiffy; /* ratelimiting */
@@ -10014,7 +8179,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
10014} 8179}
10015#endif /* CONFIG_RT_GROUP_SCHED */ 8180#endif /* CONFIG_RT_GROUP_SCHED */
10016 8181
10017#ifdef CONFIG_GROUP_SCHED 8182#ifdef CONFIG_CGROUP_SCHED
10018static void free_sched_group(struct task_group *tg) 8183static void free_sched_group(struct task_group *tg)
10019{ 8184{
10020 free_fair_sched_group(tg); 8185 free_fair_sched_group(tg);
@@ -10119,11 +8284,11 @@ void sched_move_task(struct task_struct *tsk)
10119 if (unlikely(running)) 8284 if (unlikely(running))
10120 tsk->sched_class->set_curr_task(rq); 8285 tsk->sched_class->set_curr_task(rq);
10121 if (on_rq) 8286 if (on_rq)
10122 enqueue_task(rq, tsk, 0); 8287 enqueue_task(rq, tsk, 0, false);
10123 8288
10124 task_rq_unlock(rq, &flags); 8289 task_rq_unlock(rq, &flags);
10125} 8290}
10126#endif /* CONFIG_GROUP_SCHED */ 8291#endif /* CONFIG_CGROUP_SCHED */
10127 8292
10128#ifdef CONFIG_FAIR_GROUP_SCHED 8293#ifdef CONFIG_FAIR_GROUP_SCHED
10129static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8294static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10265,13 +8430,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10265 runtime = d->rt_runtime; 8430 runtime = d->rt_runtime;
10266 } 8431 }
10267 8432
10268#ifdef CONFIG_USER_SCHED
10269 if (tg == &root_task_group) {
10270 period = global_rt_period();
10271 runtime = global_rt_runtime();
10272 }
10273#endif
10274
10275 /* 8433 /*
10276 * Cannot have more runtime than the period. 8434 * Cannot have more runtime than the period.
10277 */ 8435 */
@@ -10674,7 +8832,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
10674struct cpuacct { 8832struct cpuacct {
10675 struct cgroup_subsys_state css; 8833 struct cgroup_subsys_state css;
10676 /* cpuusage holds pointer to a u64-type object on every cpu */ 8834 /* cpuusage holds pointer to a u64-type object on every cpu */
10677 u64 *cpuusage; 8835 u64 __percpu *cpuusage;
10678 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8836 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
10679 struct cpuacct *parent; 8837 struct cpuacct *parent;
10680}; 8838};
@@ -10891,12 +9049,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10891} 9049}
10892 9050
10893/* 9051/*
9052 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9053 * in cputime_t units. As a result, cpuacct_update_stats calls
9054 * percpu_counter_add with values large enough to always overflow the
 9055 * per-cpu batch limit, causing bad SMP scalability.
9056 *
9057 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9058 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9059 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9060 */
9061#ifdef CONFIG_SMP
9062#define CPUACCT_BATCH \
9063 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9064#else
9065#define CPUACCT_BATCH 0
9066#endif
9067
9068/*
10894 * Charge the system/user time to the task's accounting group. 9069 * Charge the system/user time to the task's accounting group.
10895 */ 9070 */
10896static void cpuacct_update_stats(struct task_struct *tsk, 9071static void cpuacct_update_stats(struct task_struct *tsk,
10897 enum cpuacct_stat_index idx, cputime_t val) 9072 enum cpuacct_stat_index idx, cputime_t val)
10898{ 9073{
10899 struct cpuacct *ca; 9074 struct cpuacct *ca;
9075 int batch = CPUACCT_BATCH;
10900 9076
10901 if (unlikely(!cpuacct_subsys.active)) 9077 if (unlikely(!cpuacct_subsys.active))
10902 return; 9078 return;
@@ -10905,7 +9081,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10905 ca = task_ca(tsk); 9081 ca = task_ca(tsk);
10906 9082
10907 do { 9083 do {
10908 percpu_counter_add(&ca->cpustat[idx], val); 9084 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10909 ca = ca->parent; 9085 ca = ca->parent;
10910 } while (ca); 9086 } while (ca);
10911 rcu_read_unlock(); 9087 rcu_read_unlock();
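The batch clamp introduced above keeps __percpu_counter_add() batching comparable whether or not CONFIG_VIRT_CPU_ACCOUNTING inflates one jiffy. A minimal userspace sketch of the arithmetic; the values given to percpu_counter_batch and cputime_one_jiffy below are illustrative assumptions, not values taken from this tree:

	#include <limits.h>
	#include <stdio.h>

	/* Stand-ins for the kernel symbols: percpu_counter_batch roughly
	 * scales with the CPU count, and cputime_one_jiffy is typically 1
	 * unless CONFIG_VIRT_CPU_ACCOUNTING makes cputime_t fine-grained. */
	static long percpu_counter_batch = 32;     /* assumed */
	static long cputime_one_jiffy = 10000000;  /* assumed, fine-grained cputime */

	int main(void)
	{
		long batch = percpu_counter_batch * cputime_one_jiffy;

		if (batch > INT_MAX)
			batch = INT_MAX;   /* same clamp as min_t(long, ..., INT_MAX) */

		printf("effective cpuacct batch: %ld\n", batch);
		return 0;
	}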
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 597b33099dfa..e6871cb3fc83 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -27,6 +27,7 @@
27 * of the License. 27 * of the License.
28 */ 28 */
29 29
30#include <linux/gfp.h>
30#include "sched_cpupri.h" 31#include "sched_cpupri.h"
31 32
32/* Convert between a 140 based task->prio, and our 102 based cpupri */ 33/* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -47,9 +48,7 @@ static int convert_prio(int prio)
47} 48}
48 49
49#define for_each_cpupri_active(array, idx) \ 50#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ 51 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53 52
54/** 53/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system 54 * cpupri_find - find the best (lowest-pri) CPU in the system
@@ -58,7 +57,7 @@ static int convert_prio(int prio)
58 * @lowest_mask: A mask to fill in with selected CPUs (or NULL) 57 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
59 * 58 *
60 * Note: This function returns the recommended CPUs as calculated during the 59 * Note: This function returns the recommended CPUs as calculated during the
61 * current invokation. By the time the call returns, the CPUs may have in 60 * current invocation. By the time the call returns, the CPUs may have in
62 * fact changed priorities any number of times. While not ideal, it is not 61 * fact changed priorities any number of times. While not ideal, it is not
63 * an issue of correctness since the normal rebalancer logic will correct 62 * an issue of correctness since the normal rebalancer logic will correct
64 * any discrepancies created by racing against the uncertainty of the current 63 * any discrepancies created by racing against the uncertainty of the current
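The for_each_cpupri_active() change above only replaces the open-coded find_first_bit()/find_next_bit() loop with for_each_set_bit(), which visits the indices of set bits in ascending order. A rough userspace approximation of that iteration (not the kernel macro itself):

	#include <stdio.h>

	int main(void)
	{
		unsigned long active = 0x15;	/* example word: bits 0, 2 and 4 set */
		unsigned int idx;

		/* Visit every set bit in ascending order, which is what both the
		 * old find_first_bit/find_next_bit loop and for_each_set_bit() do. */
		for (idx = 0; idx < 8 * sizeof(active); idx++) {
			if (!(active & (1UL << idx)))
				continue;
			printf("priority index %u is active\n", idx);
		}
		return 0;
	}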
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 67f95aada4b9..19be00ba6123 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
114 { 114 {
115 char path[64]; 115 char path[64];
116 116
117 rcu_read_lock();
117 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); 118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
118 SEQ_printf(m, " %s", path); 120 SEQ_printf(m, " %s", path);
119 } 121 }
120#endif 122#endif
@@ -518,8 +520,4 @@ void proc_sched_set_task(struct task_struct *p)
518 p->se.nr_wakeups_idle = 0; 520 p->se.nr_wakeups_idle = 0;
519 p->sched_info.bkl_count = 0; 521 p->sched_info.bkl_count = 0;
520#endif 522#endif
521 p->se.sum_exec_runtime = 0;
522 p->se.prev_sum_exec_runtime = 0;
523 p->nvcsw = 0;
524 p->nivcsw = 0;
525} 523}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8fe7ee81c552..5a5ea2cd924f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1053,7 +1053,8 @@ static inline void hrtick_update(struct rq *rq)
1053 * increased. Here we update the fair scheduling stats and 1053 * increased. Here we update the fair scheduling stats and
1054 * then put the task into the rbtree: 1054 * then put the task into the rbtree:
1055 */ 1055 */
1056static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 1056static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1057{ 1058{
1058 struct cfs_rq *cfs_rq; 1059 struct cfs_rq *cfs_rq;
1059 struct sched_entity *se = &p->se; 1060 struct sched_entity *se = &p->se;
@@ -1815,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1815 */ 1816 */
1816 1817
1817/* 1818/*
1818 * Load-balancing iterator. Note: while the runqueue stays locked 1819 * pull_task - move a task from a remote runqueue to the local runqueue.
1819 * during the whole iteration, the current task might be 1820 * Both runqueues must be locked.
1820 * dequeued so the iterator has to be dequeue-safe. Here we
1821 * achieve that by always pre-iterating before returning
1822 * the current task:
1823 */ 1821 */
1824static struct task_struct * 1822static void pull_task(struct rq *src_rq, struct task_struct *p,
1825__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) 1823 struct rq *this_rq, int this_cpu)
1826{ 1824{
1827 struct task_struct *p = NULL; 1825 deactivate_task(src_rq, p, 0);
1828 struct sched_entity *se; 1826 set_task_cpu(p, this_cpu);
1827 activate_task(this_rq, p, 0);
1828 check_preempt_curr(this_rq, p, 0);
1829}
1829 1830
1830 if (next == &cfs_rq->tasks) 1831/*
1831 return NULL; 1832 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1833 */
1834static
1835int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1836 struct sched_domain *sd, enum cpu_idle_type idle,
1837 int *all_pinned)
1838{
1839 int tsk_cache_hot = 0;
1840 /*
1841 * We do not migrate tasks that are:
1842 * 1) running (obviously), or
1843 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1844 * 3) are cache-hot on their current CPU.
1845 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine);
1848 return 0;
1849 }
1850 *all_pinned = 0;
1832 1851
1833 se = list_entry(next, struct sched_entity, group_node); 1852 if (task_running(rq, p)) {
1834 p = task_of(se); 1853 schedstat_inc(p, se.nr_failed_migrations_running);
1835 cfs_rq->balance_iterator = next->next; 1854 return 0;
1855 }
1836 1856
1837 return p; 1857 /*
1838} 1858 * Aggressive migration if:
1859 * 1) task is cache cold, or
1860 * 2) too many balance attempts have failed.
1861 */
1839 1862
1840static struct task_struct *load_balance_start_fair(void *arg) 1863 tsk_cache_hot = task_hot(p, rq->clock, sd);
1841{ 1864 if (!tsk_cache_hot ||
1842 struct cfs_rq *cfs_rq = arg; 1865 sd->nr_balance_failed > sd->cache_nice_tries) {
1866#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations);
1870 }
1871#endif
1872 return 1;
1873 }
1843 1874
1844 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); 1875 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot);
1877 return 0;
1878 }
1879 return 1;
1845} 1880}
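The can_migrate_task() rules added above (skip pinned tasks, skip the running task, and only move cache-hot tasks once enough balance attempts have failed) can be summarised in a small self-contained decision function. The field and parameter names here are simplified stand-ins, not the kernel structures:

	#include <stdbool.h>
	#include <stdio.h>

	struct fake_task {
		bool allowed_on_dst;	/* stands in for cpumask_test_cpu(dst, cpus_allowed) */
		bool running;		/* stands in for task_running() */
		bool cache_hot;		/* stands in for task_hot() */
	};

	/* Mirrors the order of checks in can_migrate_task(): affinity first,
	 * then the currently running task, then cache hotness weighed against
	 * the number of failed balance attempts. */
	static bool may_migrate(const struct fake_task *p, int balance_failures,
				int cache_nice_tries)
	{
		if (!p->allowed_on_dst)
			return false;
		if (p->running)
			return false;
		if (!p->cache_hot || balance_failures > cache_nice_tries)
			return true;		/* cold task, or time to be aggressive */
		return false;			/* hot task, not desperate yet */
	}

	int main(void)
	{
		struct fake_task t = { .allowed_on_dst = true, .running = false,
				       .cache_hot = true };

		printf("migrate hot task, 1 failure:  %d\n", may_migrate(&t, 1, 3));
		printf("migrate hot task, 5 failures: %d\n", may_migrate(&t, 5, 3));
		return 0;
	}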
1846 1881
1847static struct task_struct *load_balance_next_fair(void *arg) 1882/*
1883 * move_one_task tries to move exactly one task from busiest to this_rq, as
1884 * part of active balancing operations within "domain".
1885 * Returns 1 if successful and 0 otherwise.
1886 *
1887 * Called with both runqueues locked.
1888 */
1889static int
1890move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1891 struct sched_domain *sd, enum cpu_idle_type idle)
1848{ 1892{
1849 struct cfs_rq *cfs_rq = arg; 1893 struct task_struct *p, *n;
1894 struct cfs_rq *cfs_rq;
1895 int pinned = 0;
1896
1897 for_each_leaf_cfs_rq(busiest, cfs_rq) {
1898 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
1899
1900 if (!can_migrate_task(p, busiest, this_cpu,
1901 sd, idle, &pinned))
1902 continue;
1850 1903
1851 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1904 pull_task(busiest, p, this_rq, this_cpu);
1905 /*
1906 * Right now, this is only the second place pull_task()
1907 * is called, so we can safely collect pull_task()
1908 * stats here rather than inside pull_task().
1909 */
1910 schedstat_inc(sd, lb_gained[idle]);
1911 return 1;
1912 }
1913 }
1914
1915 return 0;
1852} 1916}
1853 1917
1854static unsigned long 1918static unsigned long
1855__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1919balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1856 unsigned long max_load_move, struct sched_domain *sd, 1920 unsigned long max_load_move, struct sched_domain *sd,
1857 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, 1921 enum cpu_idle_type idle, int *all_pinned,
1858 struct cfs_rq *cfs_rq) 1922 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
1859{ 1923{
1860 struct rq_iterator cfs_rq_iterator; 1924 int loops = 0, pulled = 0, pinned = 0;
1925 long rem_load_move = max_load_move;
1926 struct task_struct *p, *n;
1861 1927
1862 cfs_rq_iterator.start = load_balance_start_fair; 1928 if (max_load_move == 0)
1863 cfs_rq_iterator.next = load_balance_next_fair; 1929 goto out;
1864 cfs_rq_iterator.arg = cfs_rq;
1865 1930
1866 return balance_tasks(this_rq, this_cpu, busiest, 1931 pinned = 1;
1867 max_load_move, sd, idle, all_pinned, 1932
1868 this_best_prio, &cfs_rq_iterator); 1933 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1934 if (loops++ > sysctl_sched_nr_migrate)
1935 break;
1936
1937 if ((p->se.load.weight >> 1) > rem_load_move ||
1938 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
1939 continue;
1940
1941 pull_task(busiest, p, this_rq, this_cpu);
1942 pulled++;
1943 rem_load_move -= p->se.load.weight;
1944
1945#ifdef CONFIG_PREEMPT
1946 /*
1947 * NEWIDLE balancing is a source of latency, so preemptible
1948 * kernels will stop after the first task is pulled to minimize
1949 * the critical section.
1950 */
1951 if (idle == CPU_NEWLY_IDLE)
1952 break;
1953#endif
1954
1955 /*
1956 * We only want to steal up to the prescribed amount of
1957 * weighted load.
1958 */
1959 if (rem_load_move <= 0)
1960 break;
1961
1962 if (p->prio < *this_best_prio)
1963 *this_best_prio = p->prio;
1964 }
1965out:
1966 /*
1967 * Right now, this is one of only two places pull_task() is called,
1968 * so we can safely collect pull_task() stats here rather than
1969 * inside pull_task().
1970 */
1971 schedstat_add(sd, lb_gained[idle], pulled);
1972
1973 if (all_pinned)
1974 *all_pinned = pinned;
1975
1976 return max_load_move - rem_load_move;
1869} 1977}
1870 1978
1871#ifdef CONFIG_FAIR_GROUP_SCHED 1979#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1897,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1897 rem_load = (u64)rem_load_move * busiest_weight; 2005 rem_load = (u64)rem_load_move * busiest_weight;
1898 rem_load = div_u64(rem_load, busiest_h_load + 1); 2006 rem_load = div_u64(rem_load, busiest_h_load + 1);
1899 2007
1900 moved_load = __load_balance_fair(this_rq, this_cpu, busiest, 2008 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1901 rem_load, sd, idle, all_pinned, this_best_prio, 2009 rem_load, sd, idle, all_pinned, this_best_prio,
1902 tg->cfs_rq[busiest_cpu]); 2010 busiest_cfs_rq);
1903 2011
1904 if (!moved_load) 2012 if (!moved_load)
1905 continue; 2013 continue;
@@ -1922,35 +2030,1509 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1922 struct sched_domain *sd, enum cpu_idle_type idle, 2030 struct sched_domain *sd, enum cpu_idle_type idle,
1923 int *all_pinned, int *this_best_prio) 2031 int *all_pinned, int *this_best_prio)
1924{ 2032{
1925 return __load_balance_fair(this_rq, this_cpu, busiest, 2033 return balance_tasks(this_rq, this_cpu, busiest,
1926 max_load_move, sd, idle, all_pinned, 2034 max_load_move, sd, idle, all_pinned,
1927 this_best_prio, &busiest->cfs); 2035 this_best_prio, &busiest->cfs);
1928} 2036}
1929#endif 2037#endif
1930 2038
1931static int 2039/*
1932move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2040 * move_tasks tries to move up to max_load_move weighted load from busiest to
1933 struct sched_domain *sd, enum cpu_idle_type idle) 2041 * this_rq, as part of a balancing operation within domain "sd".
2042 * Returns 1 if successful and 0 otherwise.
2043 *
2044 * Called with both runqueues locked.
2045 */
2046static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2047 unsigned long max_load_move,
2048 struct sched_domain *sd, enum cpu_idle_type idle,
2049 int *all_pinned)
1934{ 2050{
1935 struct cfs_rq *busy_cfs_rq; 2051 unsigned long total_load_moved = 0, load_moved;
1936 struct rq_iterator cfs_rq_iterator; 2052 int this_best_prio = this_rq->curr->prio;
1937 2053
1938 cfs_rq_iterator.start = load_balance_start_fair; 2054 do {
1939 cfs_rq_iterator.next = load_balance_next_fair; 2055 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2056 max_load_move - total_load_moved,
2057 sd, idle, all_pinned, &this_best_prio);
1940 2058
1941 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 2059 total_load_moved += load_moved;
2060
2061#ifdef CONFIG_PREEMPT
1942 /* 2062 /*
1943 * pass busy_cfs_rq argument into 2063 * NEWIDLE balancing is a source of latency, so preemptible
1944 * load_balance_[start|next]_fair iterators 2064 * kernels will stop after the first task is pulled to minimize
2065 * the critical section.
1945 */ 2066 */
1946 cfs_rq_iterator.arg = busy_cfs_rq; 2067 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
1947 if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 2068 break;
1948 &cfs_rq_iterator)) 2069
1949 return 1; 2070 if (raw_spin_is_contended(&this_rq->lock) ||
2071 raw_spin_is_contended(&busiest->lock))
2072 break;
2073#endif
2074 } while (load_moved && max_load_move > total_load_moved);
2075
2076 return total_load_moved > 0;
2077}
2078
2079/********** Helpers for find_busiest_group ************************/
2080/*
2081 * sd_lb_stats - Structure to store the statistics of a sched_domain
2082 * during load balancing.
2083 */
2084struct sd_lb_stats {
2085 struct sched_group *busiest; /* Busiest group in this sd */
2086 struct sched_group *this; /* Local group in this sd */
2087 unsigned long total_load; /* Total load of all groups in sd */
2088 unsigned long total_pwr; /* Total power of all groups in sd */
2089 unsigned long avg_load; /* Average load across all groups in sd */
2090
2091 /** Statistics of this group */
2092 unsigned long this_load;
2093 unsigned long this_load_per_task;
2094 unsigned long this_nr_running;
2095
2096 /* Statistics of the busiest group */
2097 unsigned long max_load;
2098 unsigned long busiest_load_per_task;
2099 unsigned long busiest_nr_running;
2100 unsigned long busiest_group_capacity;
2101
2102 int group_imb; /* Is there imbalance in this sd */
2103#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2104 int power_savings_balance; /* Is powersave balance needed for this sd */
2105 struct sched_group *group_min; /* Least loaded group in sd */
2106 struct sched_group *group_leader; /* Group which relieves group_min */
2107 unsigned long min_load_per_task; /* load_per_task in group_min */
2108 unsigned long leader_nr_running; /* Nr running of group_leader */
2109 unsigned long min_nr_running; /* Nr running of group_min */
2110#endif
2111};
2112
2113/*
2114 * sg_lb_stats - stats of a sched_group required for load_balancing
2115 */
2116struct sg_lb_stats {
2117 unsigned long avg_load; /*Avg load across the CPUs of the group */
2118 unsigned long group_load; /* Total load over the CPUs of the group */
2119 unsigned long sum_nr_running; /* Nr tasks running in the group */
2120 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2121 unsigned long group_capacity;
2122 int group_imb; /* Is there an imbalance in the group ? */
2123};
2124
2125/**
2126 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
2127 * @group: The group whose first cpu is to be returned.
2128 */
2129static inline unsigned int group_first_cpu(struct sched_group *group)
2130{
2131 return cpumask_first(sched_group_cpus(group));
2132}
2133
2134/**
2135 * get_sd_load_idx - Obtain the load index for a given sched domain.
2136 * @sd: The sched_domain whose load_idx is to be obtained.
 2137 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
2138 */
2139static inline int get_sd_load_idx(struct sched_domain *sd,
2140 enum cpu_idle_type idle)
2141{
2142 int load_idx;
2143
2144 switch (idle) {
2145 case CPU_NOT_IDLE:
2146 load_idx = sd->busy_idx;
2147 break;
2148
2149 case CPU_NEWLY_IDLE:
2150 load_idx = sd->newidle_idx;
2151 break;
2152 default:
2153 load_idx = sd->idle_idx;
2154 break;
1950 } 2155 }
1951 2156
2157 return load_idx;
2158}
2159
2160
2161#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2162/**
2163 * init_sd_power_savings_stats - Initialize power savings statistics for
2164 * the given sched_domain, during load balancing.
2165 *
2166 * @sd: Sched domain whose power-savings statistics are to be initialized.
2167 * @sds: Variable containing the statistics for sd.
2168 * @idle: Idle status of the CPU at which we're performing load-balancing.
2169 */
2170static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2171 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2172{
2173 /*
2174 * Busy processors will not participate in power savings
2175 * balance.
2176 */
2177 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2178 sds->power_savings_balance = 0;
2179 else {
2180 sds->power_savings_balance = 1;
2181 sds->min_nr_running = ULONG_MAX;
2182 sds->leader_nr_running = 0;
2183 }
2184}
2185
2186/**
2187 * update_sd_power_savings_stats - Update the power saving stats for a
2188 * sched_domain while performing load balancing.
2189 *
2190 * @group: sched_group belonging to the sched_domain under consideration.
2191 * @sds: Variable containing the statistics of the sched_domain
2192 * @local_group: Does group contain the CPU for which we're performing
2193 * load balancing ?
2194 * @sgs: Variable containing the statistics of the group.
2195 */
2196static inline void update_sd_power_savings_stats(struct sched_group *group,
2197 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2198{
2199
2200 if (!sds->power_savings_balance)
2201 return;
2202
2203 /*
2204 * If the local group is idle or completely loaded
2205 * no need to do power savings balance at this domain
2206 */
2207 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
2208 !sds->this_nr_running))
2209 sds->power_savings_balance = 0;
2210
2211 /*
2212 * If a group is already running at full capacity or idle,
2213 * don't include that group in power savings calculations
2214 */
2215 if (!sds->power_savings_balance ||
2216 sgs->sum_nr_running >= sgs->group_capacity ||
2217 !sgs->sum_nr_running)
2218 return;
2219
2220 /*
2221 * Calculate the group which has the least non-idle load.
2222 * This is the group from where we need to pick up the load
2223 * for saving power
2224 */
2225 if ((sgs->sum_nr_running < sds->min_nr_running) ||
2226 (sgs->sum_nr_running == sds->min_nr_running &&
2227 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
2228 sds->group_min = group;
2229 sds->min_nr_running = sgs->sum_nr_running;
2230 sds->min_load_per_task = sgs->sum_weighted_load /
2231 sgs->sum_nr_running;
2232 }
2233
2234 /*
 2235 * Calculate the group which is nearly at its
 2236 * capacity but still has some room to pick up load
 2237 * from another group and save more power
2238 */
2239 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
2240 return;
2241
2242 if (sgs->sum_nr_running > sds->leader_nr_running ||
2243 (sgs->sum_nr_running == sds->leader_nr_running &&
2244 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
2245 sds->group_leader = group;
2246 sds->leader_nr_running = sgs->sum_nr_running;
2247 }
2248}
2249
2250/**
2251 * check_power_save_busiest_group - see if there is potential for some power-savings balance
2252 * @sds: Variable containing the statistics of the sched_domain
2253 * under consideration.
2254 * @this_cpu: Cpu at which we're currently performing load-balancing.
2255 * @imbalance: Variable to store the imbalance.
2256 *
2257 * Description:
2258 * Check if we have potential to perform some power-savings balance.
2259 * If yes, set the busiest group to be the least loaded group in the
 2260 * sched_domain, so that its CPUs can be put to idle.
2261 *
2262 * Returns 1 if there is potential to perform power-savings balance.
2263 * Else returns 0.
2264 */
2265static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2266 int this_cpu, unsigned long *imbalance)
2267{
2268 if (!sds->power_savings_balance)
2269 return 0;
2270
2271 if (sds->this != sds->group_leader ||
2272 sds->group_leader == sds->group_min)
2273 return 0;
2274
2275 *imbalance = sds->min_load_per_task;
2276 sds->busiest = sds->group_min;
2277
2278 return 1;
2279
2280}
2281#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2282static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2283 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2284{
2285 return;
2286}
2287
2288static inline void update_sd_power_savings_stats(struct sched_group *group,
2289 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2290{
2291 return;
2292}
2293
2294static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2295 int this_cpu, unsigned long *imbalance)
2296{
1952 return 0; 2297 return 0;
1953} 2298}
2299#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2300
2301
2302unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2303{
2304 return SCHED_LOAD_SCALE;
2305}
2306
2307unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2308{
2309 return default_scale_freq_power(sd, cpu);
2310}
2311
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2315 unsigned long smt_gain = sd->smt_gain;
2316
2317 smt_gain /= weight;
2318
2319 return smt_gain;
2320}
2321
2322unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
2323{
2324 return default_scale_smt_power(sd, cpu);
2325}
2326
2327unsigned long scale_rt_power(int cpu)
2328{
2329 struct rq *rq = cpu_rq(cpu);
2330 u64 total, available;
2331
2332 sched_avg_update(rq);
2333
2334 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2335 available = total - rq->rt_avg;
2336
2337 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2338 total = SCHED_LOAD_SCALE;
2339
2340 total >>= SCHED_LOAD_SHIFT;
2341
2342 return div_u64(available, total);
2343}
2344
2345static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2348 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups;
2350
2351 if (sched_feat(ARCH_POWER))
2352 power *= arch_scale_freq_power(sd, cpu);
2353 else
2354 power *= default_scale_freq_power(sd, cpu);
2355
2356 power >>= SCHED_LOAD_SHIFT;
2357
2358 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2359 if (sched_feat(ARCH_POWER))
2360 power *= arch_scale_smt_power(sd, cpu);
2361 else
2362 power *= default_scale_smt_power(sd, cpu);
2363
2364 power >>= SCHED_LOAD_SHIFT;
2365 }
2366
2367 power *= scale_rt_power(cpu);
2368 power >>= SCHED_LOAD_SHIFT;
2369
2370 if (!power)
2371 power = 1;
2372
2373 sdg->cpu_power = power;
2374}
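update_cpu_power() above composes several scaling factors, each expressed relative to SCHED_LOAD_SCALE (1024). A worked example with assumed inputs (frequency scaling neutral, one sibling of a 2-thread SMT core with an assumed smt_gain of 1178, and roughly 10% of the CPU consumed by RT tasks):

	#include <stdio.h>

	#define SCHED_LOAD_SHIFT 10
	#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

	int main(void)
	{
		/* Assumed inputs for one SMT sibling of a 2-thread core. */
		unsigned long freq_scale = SCHED_LOAD_SCALE;	/* arch reports no freq capping */
		unsigned long smt_gain = 1178;			/* assumed per-core SMT gain */
		unsigned long weight = 2;			/* threads sharing the core */
		unsigned long rt_scale = 922;			/* ~90% left after RT, in 1/1024ths */

		unsigned long power = SCHED_LOAD_SCALE;

		power = (power * freq_scale) >> SCHED_LOAD_SHIFT;	   /* frequency factor */
		power = (power * (smt_gain / weight)) >> SCHED_LOAD_SHIFT; /* SMT factor */
		power = (power * rt_scale) >> SCHED_LOAD_SHIFT;		   /* RT-time factor */

		if (!power)
			power = 1;

		printf("cpu_power = %lu (vs. SCHED_LOAD_SCALE = %lu)\n",
		       power, SCHED_LOAD_SCALE);
		return 0;
	}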
2375
2376static void update_group_power(struct sched_domain *sd, int cpu)
2377{
2378 struct sched_domain *child = sd->child;
2379 struct sched_group *group, *sdg = sd->groups;
2380 unsigned long power;
2381
2382 if (!child) {
2383 update_cpu_power(sd, cpu);
2384 return;
2385 }
2386
2387 power = 0;
2388
2389 group = child->groups;
2390 do {
2391 power += group->cpu_power;
2392 group = group->next;
2393 } while (group != child->groups);
2394
2395 sdg->cpu_power = power;
2396}
2397
2398/**
2399 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2400 * @sd: The sched_domain whose statistics are to be updated.
2401 * @group: sched_group whose statistics are to be updated.
2402 * @this_cpu: Cpu for which load balance is currently performed.
2403 * @idle: Idle status of this_cpu
2404 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2405 * @sd_idle: Idle status of the sched_domain containing group.
2406 * @local_group: Does group contain this_cpu.
2407 * @cpus: Set of cpus considered for load balancing.
2408 * @balance: Should we balance.
2409 * @sgs: variable to hold the statistics for this group.
2410 */
2411static inline void update_sg_lb_stats(struct sched_domain *sd,
2412 struct sched_group *group, int this_cpu,
2413 enum cpu_idle_type idle, int load_idx, int *sd_idle,
2414 int local_group, const struct cpumask *cpus,
2415 int *balance, struct sg_lb_stats *sgs)
2416{
2417 unsigned long load, max_cpu_load, min_cpu_load;
2418 int i;
2419 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2420 unsigned long avg_load_per_task = 0;
2421
2422 if (local_group)
2423 balance_cpu = group_first_cpu(group);
2424
2425 /* Tally up the load of all CPUs in the group */
2426 max_cpu_load = 0;
2427 min_cpu_load = ~0UL;
2428
2429 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2430 struct rq *rq = cpu_rq(i);
2431
2432 if (*sd_idle && rq->nr_running)
2433 *sd_idle = 0;
2434
2435 /* Bias balancing toward cpus of our domain */
2436 if (local_group) {
2437 if (idle_cpu(i) && !first_idle_cpu) {
2438 first_idle_cpu = 1;
2439 balance_cpu = i;
2440 }
2441
2442 load = target_load(i, load_idx);
2443 } else {
2444 load = source_load(i, load_idx);
2445 if (load > max_cpu_load)
2446 max_cpu_load = load;
2447 if (min_cpu_load > load)
2448 min_cpu_load = load;
2449 }
2450
2451 sgs->group_load += load;
2452 sgs->sum_nr_running += rq->nr_running;
2453 sgs->sum_weighted_load += weighted_cpuload(i);
2454
2455 }
2456
2457 /*
 2458 * The first idle CPU or the first CPU (busiest) in this sched group
 2459 * is eligible for doing load balancing at this and above
 2460 * domains. In the newly idle case, we will allow all the CPUs
2461 * to do the newly idle load balance.
2462 */
2463 if (idle != CPU_NEWLY_IDLE && local_group &&
2464 balance_cpu != this_cpu) {
2465 *balance = 0;
2466 return;
2467 }
2468
2469 update_group_power(sd, this_cpu);
2470
2471 /* Adjust by relative CPU power of the group */
2472 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2473
2474 /*
2475 * Consider the group unbalanced when the imbalance is larger
2476 * than the average weight of two tasks.
2477 *
2478 * APZ: with cgroup the avg task weight can vary wildly and
2479 * might not be a suitable number - should we keep a
2480 * normalized nr_running number somewhere that negates
2481 * the hierarchy?
2482 */
2483 if (sgs->sum_nr_running)
2484 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2485
2486 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
2487 sgs->group_imb = 1;
2488
2489 sgs->group_capacity =
2490 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2491}
2492
2493/**
 2494 * update_sd_lb_stats - Update the sched_domain's statistics for load balancing.
2495 * @sd: sched_domain whose statistics are to be updated.
2496 * @this_cpu: Cpu for which load balance is currently performed.
2497 * @idle: Idle status of this_cpu
2498 * @sd_idle: Idle status of the sched_domain containing group.
2499 * @cpus: Set of cpus considered for load balancing.
2500 * @balance: Should we balance.
2501 * @sds: variable to hold the statistics for this sched_domain.
2502 */
2503static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2504 enum cpu_idle_type idle, int *sd_idle,
2505 const struct cpumask *cpus, int *balance,
2506 struct sd_lb_stats *sds)
2507{
2508 struct sched_domain *child = sd->child;
2509 struct sched_group *group = sd->groups;
2510 struct sg_lb_stats sgs;
2511 int load_idx, prefer_sibling = 0;
2512
2513 if (child && child->flags & SD_PREFER_SIBLING)
2514 prefer_sibling = 1;
2515
2516 init_sd_power_savings_stats(sd, sds, idle);
2517 load_idx = get_sd_load_idx(sd, idle);
2518
2519 do {
2520 int local_group;
2521
2522 local_group = cpumask_test_cpu(this_cpu,
2523 sched_group_cpus(group));
2524 memset(&sgs, 0, sizeof(sgs));
2525 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
2526 local_group, cpus, balance, &sgs);
2527
2528 if (local_group && !(*balance))
2529 return;
2530
2531 sds->total_load += sgs.group_load;
2532 sds->total_pwr += group->cpu_power;
2533
2534 /*
2535 * In case the child domain prefers tasks go to siblings
2536 * first, lower the group capacity to one so that we'll try
2537 * and move all the excess tasks away.
2538 */
2539 if (prefer_sibling)
2540 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2541
2542 if (local_group) {
2543 sds->this_load = sgs.avg_load;
2544 sds->this = group;
2545 sds->this_nr_running = sgs.sum_nr_running;
2546 sds->this_load_per_task = sgs.sum_weighted_load;
2547 } else if (sgs.avg_load > sds->max_load &&
2548 (sgs.sum_nr_running > sgs.group_capacity ||
2549 sgs.group_imb)) {
2550 sds->max_load = sgs.avg_load;
2551 sds->busiest = group;
2552 sds->busiest_nr_running = sgs.sum_nr_running;
2553 sds->busiest_group_capacity = sgs.group_capacity;
2554 sds->busiest_load_per_task = sgs.sum_weighted_load;
2555 sds->group_imb = sgs.group_imb;
2556 }
2557
2558 update_sd_power_savings_stats(group, sds, local_group, &sgs);
2559 group = group->next;
2560 } while (group != sd->groups);
2561}
2562
2563/**
2564 * fix_small_imbalance - Calculate the minor imbalance that exists
2565 * amongst the groups of a sched_domain, during
2566 * load balancing.
2567 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
2568 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2569 * @imbalance: Variable to store the imbalance.
2570 */
2571static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2572 int this_cpu, unsigned long *imbalance)
2573{
2574 unsigned long tmp, pwr_now = 0, pwr_move = 0;
2575 unsigned int imbn = 2;
2576 unsigned long scaled_busy_load_per_task;
2577
2578 if (sds->this_nr_running) {
2579 sds->this_load_per_task /= sds->this_nr_running;
2580 if (sds->busiest_load_per_task >
2581 sds->this_load_per_task)
2582 imbn = 1;
2583 } else
2584 sds->this_load_per_task =
2585 cpu_avg_load_per_task(this_cpu);
2586
2587 scaled_busy_load_per_task = sds->busiest_load_per_task
2588 * SCHED_LOAD_SCALE;
2589 scaled_busy_load_per_task /= sds->busiest->cpu_power;
2590
2591 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2592 (scaled_busy_load_per_task * imbn)) {
2593 *imbalance = sds->busiest_load_per_task;
2594 return;
2595 }
2596
2597 /*
2598 * OK, we don't have enough imbalance to justify moving tasks,
2599 * however we may be able to increase total CPU power used by
2600 * moving them.
2601 */
2602
2603 pwr_now += sds->busiest->cpu_power *
2604 min(sds->busiest_load_per_task, sds->max_load);
2605 pwr_now += sds->this->cpu_power *
2606 min(sds->this_load_per_task, sds->this_load);
2607 pwr_now /= SCHED_LOAD_SCALE;
2608
2609 /* Amount of load we'd subtract */
2610 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2611 sds->busiest->cpu_power;
2612 if (sds->max_load > tmp)
2613 pwr_move += sds->busiest->cpu_power *
2614 min(sds->busiest_load_per_task, sds->max_load - tmp);
2615
2616 /* Amount of load we'd add */
2617 if (sds->max_load * sds->busiest->cpu_power <
2618 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
2619 tmp = (sds->max_load * sds->busiest->cpu_power) /
2620 sds->this->cpu_power;
2621 else
2622 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2623 sds->this->cpu_power;
2624 pwr_move += sds->this->cpu_power *
2625 min(sds->this_load_per_task, sds->this_load + tmp);
2626 pwr_move /= SCHED_LOAD_SCALE;
2627
2628 /* Move if we gain throughput */
2629 if (pwr_move > pwr_now)
2630 *imbalance = sds->busiest_load_per_task;
2631}
2632
2633/**
2634 * calculate_imbalance - Calculate the amount of imbalance present within the
2635 * groups of a given sched_domain during load balance.
2636 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
2637 * @this_cpu: Cpu for which currently load balance is being performed.
2638 * @imbalance: The variable to store the imbalance.
2639 */
2640static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2641 unsigned long *imbalance)
2642{
2643 unsigned long max_pull, load_above_capacity = ~0UL;
2644
2645 sds->busiest_load_per_task /= sds->busiest_nr_running;
2646 if (sds->group_imb) {
2647 sds->busiest_load_per_task =
2648 min(sds->busiest_load_per_task, sds->avg_load);
2649 }
2650
2651 /*
2652 * In the presence of smp nice balancing, certain scenarios can have
 2653 * max load less than avg load (as we skip the groups at or below
 2654 * its cpu_power while calculating max_load).
2655 */
2656 if (sds->max_load < sds->avg_load) {
2657 *imbalance = 0;
2658 return fix_small_imbalance(sds, this_cpu, imbalance);
2659 }
2660
2661 if (!sds->group_imb) {
2662 /*
2663 * Don't want to pull so many tasks that a group would go idle.
2664 */
2665 load_above_capacity = (sds->busiest_nr_running -
2666 sds->busiest_group_capacity);
2667
2668 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
2669
2670 load_above_capacity /= sds->busiest->cpu_power;
2671 }
2672
2673 /*
2674 * We're trying to get all the cpus to the average_load, so we don't
2675 * want to push ourselves above the average load, nor do we wish to
2676 * reduce the max loaded cpu below the average load. At the same time,
2677 * we also don't want to reduce the group load below the group capacity
2678 * (so that we can implement power-savings policies etc). Thus we look
2679 * for the minimum possible imbalance.
2680 * Be careful of negative numbers as they'll appear as very large values
2681 * with unsigned longs.
2682 */
2683 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
2684
2685 /* How much load to actually move to equalise the imbalance */
2686 *imbalance = min(max_pull * sds->busiest->cpu_power,
2687 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
2688 / SCHED_LOAD_SCALE;
2689
2690 /*
2691 * if *imbalance is less than the average load per runnable task
 2692 * there is no guarantee that any tasks will be moved, so we'll have
2693 * a think about bumping its value to force at least one task to be
2694 * moved
2695 */
2696 if (*imbalance < sds->busiest_load_per_task)
2697 return fix_small_imbalance(sds, this_cpu, imbalance);
2698
2699}
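Once the statistics are in hand, the imbalance computation above reduces to two min() operations: take the smaller of "excess load on the busiest group" and "load above its capacity", then cap it by the headroom on the local group. A worked example with assumed per-group numbers, all in SCHED_LOAD_SCALE units and already scaled the way the code scales them:

	#include <stdio.h>

	#define SCHED_LOAD_SCALE 1024UL

	static unsigned long min_ul(unsigned long a, unsigned long b)
	{
		return a < b ? a : b;
	}

	int main(void)
	{
		/* Assumed statistics, as update_sd_lb_stats() might produce them. */
		unsigned long max_load = 3000, avg_load = 2000, this_load = 1200;
		unsigned long busiest_power = 1024, this_power = 1024;
		unsigned long load_above_capacity = 1500;	/* assumed, pre-scaled */

		unsigned long max_pull = min_ul(max_load - avg_load, load_above_capacity);
		unsigned long imbalance = min_ul(max_pull * busiest_power,
						 (avg_load - this_load) * this_power)
					  / SCHED_LOAD_SCALE;

		printf("max_pull = %lu, imbalance = %lu\n", max_pull, imbalance);
		return 0;
	}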
2700/******* find_busiest_group() helpers end here *********************/
2701
2702/**
2703 * find_busiest_group - Returns the busiest group within the sched_domain
2704 * if there is an imbalance. If there isn't an imbalance, and
2705 * the user has opted for power-savings, it returns a group whose
2706 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
2707 * such a group exists.
2708 *
2709 * Also calculates the amount of weighted load which should be moved
2710 * to restore balance.
2711 *
2712 * @sd: The sched_domain whose busiest group is to be returned.
2713 * @this_cpu: The cpu for which load balancing is currently being performed.
2714 * @imbalance: Variable which stores amount of weighted load which should
2715 * be moved to restore balance/put a group to idle.
2716 * @idle: The idle status of this_cpu.
2717 * @sd_idle: The idleness of sd
2718 * @cpus: The set of CPUs under consideration for load-balancing.
2719 * @balance: Pointer to a variable indicating if this_cpu
2720 * is the appropriate cpu to perform load balancing at this_level.
2721 *
2722 * Returns: - the busiest group if imbalance exists.
2723 * - If no imbalance and user has opted for power-savings balance,
2724 * return the least loaded group whose CPUs can be
2725 * put to idle by rebalancing its tasks onto our group.
2726 */
2727static struct sched_group *
2728find_busiest_group(struct sched_domain *sd, int this_cpu,
2729 unsigned long *imbalance, enum cpu_idle_type idle,
2730 int *sd_idle, const struct cpumask *cpus, int *balance)
2731{
2732 struct sd_lb_stats sds;
2733
2734 memset(&sds, 0, sizeof(sds));
2735
2736 /*
 2737 * Compute the various statistics relevant for load balancing at
2738 * this level.
2739 */
2740 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
2741 balance, &sds);
2742
2743 /* Cases where imbalance does not exist from POV of this_cpu */
2744 /* 1) this_cpu is not the appropriate cpu to perform load balancing
2745 * at this level.
2746 * 2) There is no busy sibling group to pull from.
2747 * 3) This group is the busiest group.
 2748 * 4) This group is busier than the average busyness at this
2749 * sched_domain.
2750 * 5) The imbalance is within the specified limit.
2751 */
2752 if (!(*balance))
2753 goto ret;
2754
2755 if (!sds.busiest || sds.busiest_nr_running == 0)
2756 goto out_balanced;
2757
2758 if (sds.this_load >= sds.max_load)
2759 goto out_balanced;
2760
2761 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
2762
2763 if (sds.this_load >= sds.avg_load)
2764 goto out_balanced;
2765
2766 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2767 goto out_balanced;
2768
2769 /* Looks like there is an imbalance. Compute it */
2770 calculate_imbalance(&sds, this_cpu, imbalance);
2771 return sds.busiest;
2772
2773out_balanced:
2774 /*
2775 * There is no obvious imbalance. But check if we can do some balancing
2776 * to save power.
2777 */
2778 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
2779 return sds.busiest;
2780ret:
2781 *imbalance = 0;
2782 return NULL;
2783}
2784
2785/*
2786 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2787 */
2788static struct rq *
2789find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2790 unsigned long imbalance, const struct cpumask *cpus)
2791{
2792 struct rq *busiest = NULL, *rq;
2793 unsigned long max_load = 0;
2794 int i;
2795
2796 for_each_cpu(i, sched_group_cpus(group)) {
2797 unsigned long power = power_of(i);
2798 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2799 unsigned long wl;
2800
2801 if (!cpumask_test_cpu(i, cpus))
2802 continue;
2803
2804 rq = cpu_rq(i);
2805 wl = weighted_cpuload(i);
2806
2807 /*
2808 * When comparing with imbalance, use weighted_cpuload()
2809 * which is not scaled with the cpu power.
2810 */
2811 if (capacity && rq->nr_running == 1 && wl > imbalance)
2812 continue;
2813
2814 /*
2815 * For the load comparisons with the other cpu's, consider
2816 * the weighted_cpuload() scaled with the cpu power, so that
2817 * the load can be moved away from the cpu that is potentially
2818 * running at a lower capacity.
2819 */
2820 wl = (wl * SCHED_LOAD_SCALE) / power;
2821
2822 if (wl > max_load) {
2823 max_load = wl;
2824 busiest = rq;
2825 }
2826 }
2827
2828 return busiest;
2829}
2830
2831/*
2832 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2833 * so long as it is large enough.
2834 */
2835#define MAX_PINNED_INTERVAL 512
2836
2837/* Working cpumask for load_balance and load_balance_newidle. */
2838static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2839
2840static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2841{
2842 if (idle == CPU_NEWLY_IDLE) {
2843 /*
2844 * The only task running in a non-idle cpu can be moved to this
 2845 * cpu in an attempt to completely free up the other CPU
2846 * package.
2847 *
2848 * The package power saving logic comes from
 2849 * find_busiest_group(). If there is no imbalance, then
2850 * f_b_g() will return NULL. However when sched_mc={1,2} then
2851 * f_b_g() will select a group from which a running task may be
2852 * pulled to this cpu in order to make the other package idle.
2853 * If there is no opportunity to make a package idle and if
 2854 * there is no imbalance, then f_b_g() will return NULL and no
2855 * action will be taken in load_balance_newidle().
2856 *
2857 * Under normal task pull operation due to imbalance, there
2858 * will be more than one task in the source run queue and
2859 * move_tasks() will succeed. ld_moved will be true and this
2860 * active balance code will not be triggered.
2861 */
2862 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2863 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2864 return 0;
2865
2866 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
2867 return 0;
2868 }
2869
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871}
2872
2873/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance.
2876 */
2877static int load_balance(int this_cpu, struct rq *this_rq,
2878 struct sched_domain *sd, enum cpu_idle_type idle,
2879 int *balance)
2880{
2881 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2882 struct sched_group *group;
2883 unsigned long imbalance;
2884 struct rq *busiest;
2885 unsigned long flags;
2886 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
2887
2888 cpumask_copy(cpus, cpu_active_mask);
2889
2890 /*
2891 * When power savings policy is enabled for the parent domain, idle
2892 * sibling can pick up load irrespective of busy siblings. In this case,
2893 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2894 * portraying it as CPU_NOT_IDLE.
2895 */
2896 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2897 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2898 sd_idle = 1;
2899
2900 schedstat_inc(sd, lb_count[idle]);
2901
2902redo:
2903 update_shares(sd);
2904 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2905 cpus, balance);
2906
2907 if (*balance == 0)
2908 goto out_balanced;
2909
2910 if (!group) {
2911 schedstat_inc(sd, lb_nobusyg[idle]);
2912 goto out_balanced;
2913 }
2914
2915 busiest = find_busiest_queue(group, idle, imbalance, cpus);
2916 if (!busiest) {
2917 schedstat_inc(sd, lb_nobusyq[idle]);
2918 goto out_balanced;
2919 }
2920
2921 BUG_ON(busiest == this_rq);
2922
2923 schedstat_add(sd, lb_imbalance[idle], imbalance);
2924
2925 ld_moved = 0;
2926 if (busiest->nr_running > 1) {
2927 /*
2928 * Attempt to move tasks. If find_busiest_group has found
2929 * an imbalance but busiest->nr_running <= 1, the group is
2930 * still unbalanced. ld_moved simply stays zero, so it is
2931 * correctly treated as an imbalance.
2932 */
2933 local_irq_save(flags);
2934 double_rq_lock(this_rq, busiest);
2935 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2936 imbalance, sd, idle, &all_pinned);
2937 double_rq_unlock(this_rq, busiest);
2938 local_irq_restore(flags);
2939
2940 /*
2941 * some other cpu did the load balance for us.
2942 */
2943 if (ld_moved && this_cpu != smp_processor_id())
2944 resched_cpu(this_cpu);
2945
2946 /* All tasks on this runqueue were pinned by CPU affinity */
2947 if (unlikely(all_pinned)) {
2948 cpumask_clear_cpu(cpu_of(busiest), cpus);
2949 if (!cpumask_empty(cpus))
2950 goto redo;
2951 goto out_balanced;
2952 }
2953 }
2954
2955 if (!ld_moved) {
2956 schedstat_inc(sd, lb_failed[idle]);
2957 sd->nr_balance_failed++;
2958
2959 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags);
2961
2962 /* don't kick the migration_thread, if the curr
2963 * task on busiest cpu can't be moved to this_cpu
2964 */
2965 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) {
2967 raw_spin_unlock_irqrestore(&busiest->lock,
2968 flags);
2969 all_pinned = 1;
2970 goto out_one_pinned;
2971 }
2972
2973 if (!busiest->active_balance) {
2974 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu;
2976 active_balance = 1;
2977 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2979 if (active_balance)
2980 wake_up_process(busiest->migration_thread);
2981
2982 /*
2983 * We've kicked active balancing, reset the failure
2984 * counter.
2985 */
2986 sd->nr_balance_failed = sd->cache_nice_tries+1;
2987 }
2988 } else
2989 sd->nr_balance_failed = 0;
2990
2991 if (likely(!active_balance)) {
2992 /* We were unbalanced, so reset the balancing interval */
2993 sd->balance_interval = sd->min_interval;
2994 } else {
2995 /*
2996 * If we've begun active balancing, start to back off. This
2997 * case may not be covered by the all_pinned logic if there
2998 * is only 1 task on the busy runqueue (because we don't call
2999 * move_tasks).
3000 */
3001 if (sd->balance_interval < sd->max_interval)
3002 sd->balance_interval *= 2;
3003 }
3004
3005 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3006 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3007 ld_moved = -1;
3008
3009 goto out;
3010
3011out_balanced:
3012 schedstat_inc(sd, lb_balanced[idle]);
3013
3014 sd->nr_balance_failed = 0;
3015
3016out_one_pinned:
3017 /* tune up the balancing interval */
3018 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3019 (sd->balance_interval < sd->max_interval))
3020 sd->balance_interval *= 2;
3021
3022 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3023 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3024 ld_moved = -1;
3025 else
3026 ld_moved = 0;
3027out:
3028 if (ld_moved)
3029 update_shares(sd);
3030 return ld_moved;
3031}
3032
3033/*
3034 * idle_balance is called by schedule() if this_cpu is about to become
3035 * idle. Attempts to pull tasks from other CPUs.
3036 */
3037static void idle_balance(int this_cpu, struct rq *this_rq)
3038{
3039 struct sched_domain *sd;
3040 int pulled_task = 0;
3041 unsigned long next_balance = jiffies + HZ;
3042
3043 this_rq->idle_stamp = this_rq->clock;
3044
3045 if (this_rq->avg_idle < sysctl_sched_migration_cost)
3046 return;
3047
3048 /*
3049 * Drop the rq->lock, but keep IRQ/preempt disabled.
3050 */
3051 raw_spin_unlock(&this_rq->lock);
3052
3053 for_each_domain(this_cpu, sd) {
3054 unsigned long interval;
3055 int balance = 1;
3056
3057 if (!(sd->flags & SD_LOAD_BALANCE))
3058 continue;
3059
3060 if (sd->flags & SD_BALANCE_NEWIDLE) {
3061 /* If we've pulled tasks over stop searching: */
3062 pulled_task = load_balance(this_cpu, this_rq,
3063 sd, CPU_NEWLY_IDLE, &balance);
3064 }
3065
3066 interval = msecs_to_jiffies(sd->balance_interval);
3067 if (time_after(next_balance, sd->last_balance + interval))
3068 next_balance = sd->last_balance + interval;
3069 if (pulled_task) {
3070 this_rq->idle_stamp = 0;
3071 break;
3072 }
3073 }
3074
3075 raw_spin_lock(&this_rq->lock);
3076
3077 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3078 /*
3079 * We are going idle. next_balance may be set based on
3080 * a busy processor. So reset next_balance.
3081 */
3082 this_rq->next_balance = next_balance;
3083 }
3084}
3085
3086/*
3087 * active_load_balance is run by migration threads. It pushes running tasks
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
3089 * running on each physical CPU where possible, and avoids physical /
3090 * logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3095{
3096 int target_cpu = busiest_rq->push_cpu;
3097 struct sched_domain *sd;
3098 struct rq *target_rq;
3099
3100 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1)
3102 return;
3103
3104 target_rq = cpu_rq(target_cpu);
3105
3106 /*
3107 * This condition is "impossible", if it occurs
3108 * we need to fix it. Originally reported by
3109 * Bjorn Helgaas on a 128-cpu setup.
3110 */
3111 BUG_ON(busiest_rq == target_rq);
3112
3113 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117
3118 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) {
3120 if ((sd->flags & SD_LOAD_BALANCE) &&
3121 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3122 break;
3123 }
3124
3125 if (likely(sd)) {
3126 schedstat_inc(sd, alb_count);
3127
3128 if (move_one_task(target_rq, target_cpu, busiest_rq,
3129 sd, CPU_IDLE))
3130 schedstat_inc(sd, alb_pushed);
3131 else
3132 schedstat_inc(sd, alb_failed);
3133 }
3134 double_unlock_balance(busiest_rq, target_rq);
3135}
3136
3137#ifdef CONFIG_NO_HZ
3138static struct {
3139 atomic_t load_balancer;
3140 cpumask_var_t cpu_mask;
3141 cpumask_var_t ilb_grp_nohz_mask;
3142} nohz ____cacheline_aligned = {
3143 .load_balancer = ATOMIC_INIT(-1),
3144};
3145
3146int get_nohz_load_balancer(void)
3147{
3148 return atomic_read(&nohz.load_balancer);
3149}
3150
3151#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3152/**
3153 * lowest_flag_domain - Return lowest sched_domain containing flag.
3154 * @cpu: The cpu whose lowest level of sched domain is to
3155 * be returned.
3156 * @flag: The flag to check for the lowest sched_domain
3157 * for the given cpu.
3158 *
3159 * Returns the lowest sched_domain of a cpu which contains the given flag.
3160 */
3161static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3162{
3163 struct sched_domain *sd;
3164
3165 for_each_domain(cpu, sd)
3166 if (sd && (sd->flags & flag))
3167 break;
3168
3169 return sd;
3170}
3171
3172/**
3173 * for_each_flag_domain - Iterates over sched_domains containing the flag.
3174 * @cpu: The cpu whose domains we're iterating over.
3175 * @sd: variable holding the value of the power_savings_sd
3176 * for cpu.
3177 * @flag: The flag to filter the sched_domains to be iterated.
3178 *
3179 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
3180 * set, starting from the lowest sched_domain to the highest.
3181 */
3182#define for_each_flag_domain(cpu, sd, flag) \
3183 for (sd = lowest_flag_domain(cpu, flag); \
3184 (sd && (sd->flags & flag)); sd = sd->parent)
3185
3186/**
3187 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
3188 * @ilb_group: group to be checked for semi-idleness
3189 *
3190 * Returns: 1 if the group is semi-idle. 0 otherwise.
3191 *
 3192 * We define a sched_group to be semi-idle if it has at least one idle CPU
 3193 * and at least one non-idle CPU. This helper function checks if the given
3194 * sched_group is semi-idle or not.
3195 */
3196static inline int is_semi_idle_group(struct sched_group *ilb_group)
3197{
3198 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
3199 sched_group_cpus(ilb_group));
3200
3201 /*
 3202 * A sched_group is semi-idle when it has at least one busy cpu
 3203 * and at least one idle cpu.
3204 */
3205 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
3206 return 0;
3207
3208 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
3209 return 0;
3210
3211 return 1;
3212}
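The semi-idle test above is just two cpumask comparisons: the group's intersection with the nohz (tick-stopped) mask must be non-empty but must not cover the whole group. With small bitmasks standing in for cpumask_t, the same check reads:

	#include <stdbool.h>
	#include <stdio.h>

	/* Toy cpumasks as plain words; the kernel uses cpumask_and(),
	 * cpumask_empty() and cpumask_equal() on real cpumask_t values. */
	static bool is_semi_idle(unsigned long group_cpus, unsigned long idle_cpus)
	{
		unsigned long idle_in_group = group_cpus & idle_cpus;

		if (!idle_in_group)			/* no idle CPU in the group */
			return false;
		if (idle_in_group == group_cpus)	/* every CPU in the group is idle */
			return false;
		return true;				/* at least one idle and one busy CPU */
	}

	int main(void)
	{
		printf("group 0x0f, idle 0x03 -> %d\n", is_semi_idle(0x0f, 0x03)); /* 1 */
		printf("group 0x0f, idle 0x0f -> %d\n", is_semi_idle(0x0f, 0x0f)); /* 0 */
		printf("group 0x0f, idle 0x30 -> %d\n", is_semi_idle(0x0f, 0x30)); /* 0 */
		return 0;
	}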
3213/**
3214 * find_new_ilb - Finds the optimum idle load balancer for nomination.
3215 * @cpu: The cpu which is nominating a new idle_load_balancer.
3216 *
 3217 * Returns: The id of the idle load balancer if it exists,
3218 * Else, returns >= nr_cpu_ids.
3219 *
3220 * This algorithm picks the idle load balancer such that it belongs to a
3221 * semi-idle powersavings sched_domain. The idea is to try and avoid
3222 * completely idle packages/cores just for the purpose of idle load balancing
 3223 * when there are other idle CPUs which are better suited for that job.
3224 */
3225static int find_new_ilb(int cpu)
3226{
3227 struct sched_domain *sd;
3228 struct sched_group *ilb_group;
3229
3230 /*
3231 * Have idle load balancer selection from semi-idle packages only
3232 * when power-aware load balancing is enabled
3233 */
3234 if (!(sched_smt_power_savings || sched_mc_power_savings))
3235 goto out_done;
3236
3237 /*
3238 * Optimize for the case when we have no idle CPUs or only one
3239 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3240 */
3241 if (cpumask_weight(nohz.cpu_mask) < 2)
3242 goto out_done;
3243
3244 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3245 ilb_group = sd->groups;
3246
3247 do {
3248 if (is_semi_idle_group(ilb_group))
3249 return cpumask_first(nohz.ilb_grp_nohz_mask);
3250
3251 ilb_group = ilb_group->next;
3252
3253 } while (ilb_group != sd->groups);
3254 }
3255
3256out_done:
3257 return cpumask_first(nohz.cpu_mask);
3258}
3259#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3260static inline int find_new_ilb(int call_cpu)
3261{
3262 return cpumask_first(nohz.cpu_mask);
3263}
3264#endif
3265
3266/*
3267 * This routine will try to nominate the ilb (idle load balancing)
3268 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3269 * load balancing on behalf of all those cpus. If all the cpus in the system
3270 * go into this tickless mode, then there will be no ilb owner (as there is
3271 * no need for one) and all the cpus will sleep till the next wakeup event
3272 * arrives...
3273 *
3274 * For the ilb owner, tick is not stopped. And this tick will be used
3275 * for idle load balancing. ilb owner will still be part of
3276 * nohz.cpu_mask..
3277 *
3278 * While stopping the tick, this cpu will become the ilb owner if there
3279 * is no other owner. And will be the owner till that cpu becomes busy
3280 * or if all cpus in the system stop their ticks at which point
3281 * there is no need for ilb owner.
3282 *
3283 * When the ilb owner becomes busy, it nominates another owner, during the
3284 * next busy scheduler_tick()
3285 */
3286int select_nohz_load_balancer(int stop_tick)
3287{
3288 int cpu = smp_processor_id();
3289
3290 if (stop_tick) {
3291 cpu_rq(cpu)->in_nohz_recently = 1;
3292
3293 if (!cpu_active(cpu)) {
3294 if (atomic_read(&nohz.load_balancer) != cpu)
3295 return 0;
3296
3297 /*
3298 * If we are going offline and still the leader,
3299 * give up!
3300 */
3301 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3302 BUG();
3303
3304 return 0;
3305 }
3306
3307 cpumask_set_cpu(cpu, nohz.cpu_mask);
3308
3309 /* time for ilb owner also to sleep */
3310 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
3311 if (atomic_read(&nohz.load_balancer) == cpu)
3312 atomic_set(&nohz.load_balancer, -1);
3313 return 0;
3314 }
3315
3316 if (atomic_read(&nohz.load_balancer) == -1) {
3317 /* make me the ilb owner */
3318 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3319 return 1;
3320 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3321 int new_ilb;
3322
3323 if (!(sched_smt_power_savings ||
3324 sched_mc_power_savings))
3325 return 1;
3326 /*
3327 * Check to see if there is a more power-efficient
3328 * ilb.
3329 */
3330 new_ilb = find_new_ilb(cpu);
3331 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3332 atomic_set(&nohz.load_balancer, -1);
3333 resched_cpu(new_ilb);
3334 return 0;
3335 }
3336 return 1;
3337 }
3338 } else {
3339 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3340 return 0;
3341
3342 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3343
3344 if (atomic_read(&nohz.load_balancer) == cpu)
3345 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3346 BUG();
3347 }
3348 return 0;
3349}
3350#endif
3351
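select_nohz_load_balancer() above elects and retires the ilb owner with atomic_cmpxchg() on nohz.load_balancer, so exactly one tickless cpu wins the nomination. A small C11 sketch of that claim/release pattern, using stdatomic in place of the kernel's atomic_t (the cpu ids and function names are assumptions for illustration only):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int load_balancer = -1;    /* -1 means: no ilb owner */

/* Mirror of the atomic_cmpxchg(&nohz.load_balancer, -1, cpu) step. */
static int claim_ilb(int cpu)
{
        int expected = -1;
        return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

/* Give up ownership when the cpu goes busy or offline. */
static int release_ilb(int cpu)
{
        int expected = cpu;
        return atomic_compare_exchange_strong(&load_balancer, &expected, -1);
}

int main(void)
{
        printf("cpu2 claims: %d\n", claim_ilb(2));    /* 1: became owner      */
        printf("cpu3 claims: %d\n", claim_ilb(3));    /* 0: cpu2 already owns */
        printf("cpu2 drops:  %d\n", release_ilb(2));  /* 1: ownership cleared */
        return 0;
}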
3352static DEFINE_SPINLOCK(balancing);
3353
3354/*
3355 * It checks each scheduling domain to see if it is due to be balanced,
3356 * and initiates a balancing operation if so.
3357 *
3358 * Balancing parameters are set up in arch_init_sched_domains.
3359 */
3360static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3361{
3362 int balance = 1;
3363 struct rq *rq = cpu_rq(cpu);
3364 unsigned long interval;
3365 struct sched_domain *sd;
3366 /* Earliest time when we have to do rebalance again */
3367 unsigned long next_balance = jiffies + 60*HZ;
3368 int update_next_balance = 0;
3369 int need_serialize;
3370
3371 for_each_domain(cpu, sd) {
3372 if (!(sd->flags & SD_LOAD_BALANCE))
3373 continue;
3374
3375 interval = sd->balance_interval;
3376 if (idle != CPU_IDLE)
3377 interval *= sd->busy_factor;
3378
3379 /* scale ms to jiffies */
3380 interval = msecs_to_jiffies(interval);
3381 if (unlikely(!interval))
3382 interval = 1;
3383 if (interval > HZ*NR_CPUS/10)
3384 interval = HZ*NR_CPUS/10;
3385
3386 need_serialize = sd->flags & SD_SERIALIZE;
3387
3388 if (need_serialize) {
3389 if (!spin_trylock(&balancing))
3390 goto out;
3391 }
3392
3393 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3394 if (load_balance(cpu, rq, sd, idle, &balance)) {
3395 /*
3396 * We've pulled tasks over so either we're no
3397 * longer idle, or one of our SMT siblings is
3398 * not idle.
3399 */
3400 idle = CPU_NOT_IDLE;
3401 }
3402 sd->last_balance = jiffies;
3403 }
3404 if (need_serialize)
3405 spin_unlock(&balancing);
3406out:
3407 if (time_after(next_balance, sd->last_balance + interval)) {
3408 next_balance = sd->last_balance + interval;
3409 update_next_balance = 1;
3410 }
3411
3412 /*
3413 * Stop the load balance at this level. There is another
3414 * CPU in our sched group which is doing load balancing more
3415 * actively.
3416 */
3417 if (!balance)
3418 break;
3419 }
3420
3421 /*
3422 * next_balance will be updated only when there is a need.
3423 * When the cpu is attached to a null domain, for example, it will not be
3424 * updated.
3425 */
3426 if (likely(update_next_balance))
3427 rq->next_balance = next_balance;
3428}
3429
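rebalance_domains() above derives each domain's balancing period from balance_interval, multiplies it by busy_factor when the cpu is not idle, converts the result to jiffies and clamps it. A rough userspace sketch of that arithmetic; the HZ and NR_CPUS values and the rounding inside msecs_to_jiffies() are assumptions, not the kernel's exact definitions:

#include <stdio.h>

#define HZ      250
#define NR_CPUS 8

static unsigned long msecs_to_jiffies(unsigned long ms)
{
        return (ms * HZ + 999) / 1000;           /* crude round-up model */
}

static unsigned long balance_interval(unsigned long base_ms,
                                      unsigned int busy_factor, int idle)
{
        unsigned long interval = base_ms;

        if (!idle)
                interval *= busy_factor;         /* balance less often when busy */
        interval = msecs_to_jiffies(interval);
        if (!interval)
                interval = 1;
        if (interval > HZ * NR_CPUS / 10)
                interval = HZ * NR_CPUS / 10;    /* same upper clamp as above */
        return interval;
}

int main(void)
{
        printf("idle: %lu jiffies\n", balance_interval(64, 32, 1));
        printf("busy: %lu jiffies\n", balance_interval(64, 32, 0));
        return 0;
}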
3430/*
3431 * run_rebalance_domains is triggered when needed from the scheduler tick.
3432 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3433 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3434 */
3435static void run_rebalance_domains(struct softirq_action *h)
3436{
3437 int this_cpu = smp_processor_id();
3438 struct rq *this_rq = cpu_rq(this_cpu);
3439 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3440 CPU_IDLE : CPU_NOT_IDLE;
3441
3442 rebalance_domains(this_cpu, idle);
3443
3444#ifdef CONFIG_NO_HZ
3445 /*
3446 * If this cpu is the owner for idle load balancing, then do the
3447 * balancing on behalf of the other idle cpus whose ticks are
3448 * stopped.
3449 */
3450 if (this_rq->idle_at_tick &&
3451 atomic_read(&nohz.load_balancer) == this_cpu) {
3452 struct rq *rq;
3453 int balance_cpu;
3454
3455 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3456 if (balance_cpu == this_cpu)
3457 continue;
3458
3459 /*
3460 * If this cpu gets work to do, stop the load balancing
3461 * work being done for other cpus. Next load
3462 * balancing owner will pick it up.
3463 */
3464 if (need_resched())
3465 break;
3466
3467 rebalance_domains(balance_cpu, CPU_IDLE);
3468
3469 rq = cpu_rq(balance_cpu);
3470 if (time_after(this_rq->next_balance, rq->next_balance))
3471 this_rq->next_balance = rq->next_balance;
3472 }
3473 }
3474#endif
3475}
3476
3477static inline int on_null_domain(int cpu)
3478{
3479 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
3480}
3481
3482/*
3483 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3484 *
3485 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3486 * idle load balancing owner or decide to stop the periodic load balancing,
3487 * if the whole system is idle.
3488 */
3489static inline void trigger_load_balance(struct rq *rq, int cpu)
3490{
3491#ifdef CONFIG_NO_HZ
3492 /*
3493 * If we were in the nohz mode recently and busy at the current
3494 * scheduler tick, then check if we need to nominate a new idle
3495 * load balancer.
3496 */
3497 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3498 rq->in_nohz_recently = 0;
3499
3500 if (atomic_read(&nohz.load_balancer) == cpu) {
3501 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3502 atomic_set(&nohz.load_balancer, -1);
3503 }
3504
3505 if (atomic_read(&nohz.load_balancer) == -1) {
3506 int ilb = find_new_ilb(cpu);
3507
3508 if (ilb < nr_cpu_ids)
3509 resched_cpu(ilb);
3510 }
3511 }
3512
3513 /*
3514 * If this cpu is idle and doing idle load balancing for all the
3515 * cpus with ticks stopped, is it time for that to stop?
3516 */
3517 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3518 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3519 resched_cpu(cpu);
3520 return;
3521 }
3522
3523 /*
3524 * If this cpu is idle and the idle load balancing is done by
3525 * someone else, then there is no need to raise the SCHED_SOFTIRQ.
3526 */
3527 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3528 cpumask_test_cpu(cpu, nohz.cpu_mask))
3529 return;
3530#endif
3531 /* Don't need to rebalance while attached to NULL domain */
3532 if (time_after_eq(jiffies, rq->next_balance) &&
3533 likely(!on_null_domain(cpu)))
3534 raise_softirq(SCHED_SOFTIRQ);
3535}
1954 3536
1955static void rq_online_fair(struct rq *rq) 3537static void rq_online_fair(struct rq *rq)
1956{ 3538{
@@ -1962,6 +3544,15 @@ static void rq_offline_fair(struct rq *rq)
1962 update_sysctl(); 3544 update_sysctl();
1963} 3545}
1964 3546
3547#else /* CONFIG_SMP */
3548
3549/*
3550 * on UP we do not need to balance between CPUs:
3551 */
3552static inline void idle_balance(int cpu, struct rq *rq)
3553{
3554}
3555
1965#endif /* CONFIG_SMP */ 3556#endif /* CONFIG_SMP */
1966 3557
1967/* 3558/*
@@ -2076,7 +3667,7 @@ static void moved_group_fair(struct task_struct *p, int on_rq)
2076} 3667}
2077#endif 3668#endif
2078 3669
2079unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 3670static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
2080{ 3671{
2081 struct sched_entity *se = &task->se; 3672 struct sched_entity *se = &task->se;
2082 unsigned int rr_interval = 0; 3673 unsigned int rr_interval = 0;
@@ -2108,8 +3699,6 @@ static const struct sched_class fair_sched_class = {
2108#ifdef CONFIG_SMP 3699#ifdef CONFIG_SMP
2109 .select_task_rq = select_task_rq_fair, 3700 .select_task_rq = select_task_rq_fair,
2110 3701
2111 .load_balance = load_balance_fair,
2112 .move_one_task = move_one_task_fair,
2113 .rq_online = rq_online_fair, 3702 .rq_online = rq_online_fair,
2114 .rq_offline = rq_offline_fair, 3703 .rq_offline = rq_offline_fair,
2115 3704
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 5f93b570d383..a8a6d8a50947 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
44{ 44{
45} 45}
46 46
47#ifdef CONFIG_SMP
48static unsigned long
49load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
50 unsigned long max_load_move,
51 struct sched_domain *sd, enum cpu_idle_type idle,
52 int *all_pinned, int *this_best_prio)
53{
54 return 0;
55}
56
57static int
58move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
59 struct sched_domain *sd, enum cpu_idle_type idle)
60{
61 return 0;
62}
63#endif
64
65static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 47static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
66{ 48{
67} 49}
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 79 check_preempt_curr(rq, p, 0);
98} 80}
99 81
100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 83{
102 return 0; 84 return 0;
103} 85}
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = {
119 101
120#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
121 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
122
123 .load_balance = load_balance_idle,
124 .move_one_task = move_one_task_idle,
125#endif 104#endif
126 105
127 .set_curr_task = set_curr_task_idle, 106 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f48328ac216f..b5b920ae2ea7 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
194 return rt_se->my_q; 194 return rt_se->my_q;
195} 195}
196 196
197static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 197static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
198static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 198static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
199 199
200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 int this_cpu = smp_processor_id();
202 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 203 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
203 struct sched_rt_entity *rt_se = rt_rq->rt_se; 204 struct sched_rt_entity *rt_se;
205
206 rt_se = rt_rq->tg->rt_se[this_cpu];
204 207
205 if (rt_rq->rt_nr_running) { 208 if (rt_rq->rt_nr_running) {
206 if (rt_se && !on_rt_rq(rt_se)) 209 if (rt_se && !on_rt_rq(rt_se))
207 enqueue_rt_entity(rt_se); 210 enqueue_rt_entity(rt_se, false);
208 if (rt_rq->highest_prio.curr < curr->prio) 211 if (rt_rq->highest_prio.curr < curr->prio)
209 resched_task(curr); 212 resched_task(curr);
210 } 213 }
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212 215
213static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 216static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
214{ 217{
215 struct sched_rt_entity *rt_se = rt_rq->rt_se; 218 int this_cpu = smp_processor_id();
219 struct sched_rt_entity *rt_se;
220
221 rt_se = rt_rq->tg->rt_se[this_cpu];
216 222
217 if (rt_se && on_rt_rq(rt_se)) 223 if (rt_se && on_rt_rq(rt_se))
218 dequeue_rt_entity(rt_se); 224 dequeue_rt_entity(rt_se);
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
803 dec_rt_group(rt_se, rt_rq); 809 dec_rt_group(rt_se, rt_rq);
804} 810}
805 811
806static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 812static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
807{ 813{
808 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 814 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
809 struct rt_prio_array *array = &rt_rq->active; 815 struct rt_prio_array *array = &rt_rq->active;
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
819 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
820 return; 826 return;
821 827
822 list_add_tail(&rt_se->run_list, queue); 828 if (head)
829 list_add(&rt_se->run_list, queue);
830 else
831 list_add_tail(&rt_se->run_list, queue);
823 __set_bit(rt_se_prio(rt_se), array->bitmap); 832 __set_bit(rt_se_prio(rt_se), array->bitmap);
824 833
825 inc_rt_tasks(rt_se, rt_rq); 834 inc_rt_tasks(rt_se, rt_rq);
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
856 } 865 }
857} 866}
858 867
859static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 868static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
860{ 869{
861 dequeue_rt_stack(rt_se); 870 dequeue_rt_stack(rt_se);
862 for_each_sched_rt_entity(rt_se) 871 for_each_sched_rt_entity(rt_se)
863 __enqueue_rt_entity(rt_se); 872 __enqueue_rt_entity(rt_se, head);
864} 873}
865 874
866static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 875static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
@@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
871 struct rt_rq *rt_rq = group_rt_rq(rt_se); 880 struct rt_rq *rt_rq = group_rt_rq(rt_se);
872 881
873 if (rt_rq && rt_rq->rt_nr_running) 882 if (rt_rq && rt_rq->rt_nr_running)
874 __enqueue_rt_entity(rt_se); 883 __enqueue_rt_entity(rt_se, false);
875 } 884 }
876} 885}
877 886
878/* 887/*
879 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
880 */ 889 */
881static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
882{ 892{
883 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
884 894
885 if (wakeup) 895 if (wakeup)
886 rt_se->timeout = 0; 896 rt_se->timeout = 0;
887 897
888 enqueue_rt_entity(rt_se); 898 enqueue_rt_entity(rt_se, head);
889 899
890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
891 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
@@ -1136,7 +1146,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1136 if (next && next->prio < idx) 1146 if (next && next->prio < idx)
1137 continue; 1147 continue;
1138 list_for_each_entry(rt_se, array->queue + idx, run_list) { 1148 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1139 struct task_struct *p = rt_task_of(rt_se); 1149 struct task_struct *p;
1150
1151 if (!rt_entity_is_task(rt_se))
1152 continue;
1153
1154 p = rt_task_of(rt_se);
1140 if (pick_rt_task(rq, p, cpu)) { 1155 if (pick_rt_task(rq, p, cpu)) {
1141 next = p; 1156 next = p;
1142 break; 1157 break;
@@ -1481,24 +1496,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1481 push_rt_tasks(rq); 1496 push_rt_tasks(rq);
1482} 1497}
1483 1498
1484static unsigned long
1485load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1486 unsigned long max_load_move,
1487 struct sched_domain *sd, enum cpu_idle_type idle,
1488 int *all_pinned, int *this_best_prio)
1489{
1490 /* don't touch RT tasks */
1491 return 0;
1492}
1493
1494static int
1495move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1496 struct sched_domain *sd, enum cpu_idle_type idle)
1497{
1498 /* don't touch RT tasks */
1499 return 0;
1500}
1501
1502static void set_cpus_allowed_rt(struct task_struct *p, 1499static void set_cpus_allowed_rt(struct task_struct *p,
1503 const struct cpumask *new_mask) 1500 const struct cpumask *new_mask)
1504{ 1501{
@@ -1670,8 +1667,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1670 if (!p->signal) 1667 if (!p->signal)
1671 return; 1668 return;
1672 1669
1673 soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; 1670 /* max may change after cur was read, this will be fixed next tick */
1674 hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; 1671 soft = task_rlimit(p, RLIMIT_RTTIME);
1672 hard = task_rlimit_max(p, RLIMIT_RTTIME);
1675 1673
1676 if (soft != RLIM_INFINITY) { 1674 if (soft != RLIM_INFINITY) {
1677 unsigned long next; 1675 unsigned long next;
@@ -1721,7 +1719,7 @@ static void set_curr_task_rt(struct rq *rq)
1721 dequeue_pushable_task(rq, p); 1719 dequeue_pushable_task(rq, p);
1722} 1720}
1723 1721
1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 1722static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1725{ 1723{
1726 /* 1724 /*
1727 * Time slice is 0 for SCHED_FIFO tasks 1725 * Time slice is 0 for SCHED_FIFO tasks
@@ -1746,8 +1744,6 @@ static const struct sched_class rt_sched_class = {
1746#ifdef CONFIG_SMP 1744#ifdef CONFIG_SMP
1747 .select_task_rq = select_task_rq_rt, 1745 .select_task_rq = select_task_rq_rt,
1748 1746
1749 .load_balance = load_balance_rt,
1750 .move_one_task = move_one_task_rt,
1751 .set_cpus_allowed = set_cpus_allowed_rt, 1747 .set_cpus_allowed = set_cpus_allowed_rt,
1752 .rq_online = rq_online_rt, 1748 .rq_online = rq_online_rt,
1753 .rq_offline = rq_offline_rt, 1749 .rq_offline = rq_offline_rt,
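The sched_rt.c hunks above thread a head flag down to __enqueue_rt_entity() so an entity can be requeued at the front of its priority list (list_add()) rather than at the tail (list_add_tail()). A toy userspace model of the head-versus-tail difference, with a plain array standing in for the kernel list API (entirely illustrative):

#include <stdio.h>

#define NQ 8

static int queue[NQ];
static int count;

static void add_tail(int task)
{
        queue[count++] = task;                  /* list_add_tail() analogue */
}

static void add_head(int task)
{
        for (int i = count; i > 0; i--)         /* list_add() analogue      */
                queue[i] = queue[i - 1];
        queue[0] = task;
        count++;
}

int main(void)
{
        add_tail(1);            /* ordinary wakeups go to the back  */
        add_tail(2);
        add_head(3);            /* e.g. a preempted task goes first */

        for (int i = 0; i < count; i++)
                printf("%d ", queue[i]);        /* prints: 3 1 2 */
        printf("\n");
        return 0;
}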
diff --git a/kernel/signal.c b/kernel/signal.c
index 934ae5e687b9..dbd7fe073c55 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -159,6 +159,10 @@ void recalc_sigpending(void)
159 159
160/* Given the mask, find the first available signal that should be serviced. */ 160/* Given the mask, find the first available signal that should be serviced. */
161 161
162#define SYNCHRONOUS_MASK \
163 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
164 sigmask(SIGTRAP) | sigmask(SIGFPE))
165
162int next_signal(struct sigpending *pending, sigset_t *mask) 166int next_signal(struct sigpending *pending, sigset_t *mask)
163{ 167{
164 unsigned long i, *s, *m, x; 168 unsigned long i, *s, *m, x;
@@ -166,26 +170,39 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
166 170
167 s = pending->signal.sig; 171 s = pending->signal.sig;
168 m = mask->sig; 172 m = mask->sig;
173
174 /*
175 * Handle the first word specially: it contains the
176 * synchronous signals that need to be dequeued first.
177 */
178 x = *s &~ *m;
179 if (x) {
180 if (x & SYNCHRONOUS_MASK)
181 x &= SYNCHRONOUS_MASK;
182 sig = ffz(~x) + 1;
183 return sig;
184 }
185
169 switch (_NSIG_WORDS) { 186 switch (_NSIG_WORDS) {
170 default: 187 default:
171 for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) 188 for (i = 1; i < _NSIG_WORDS; ++i) {
172 if ((x = *s &~ *m) != 0) { 189 x = *++s &~ *++m;
173 sig = ffz(~x) + i*_NSIG_BPW + 1; 190 if (!x)
174 break; 191 continue;
175 } 192 sig = ffz(~x) + i*_NSIG_BPW + 1;
193 break;
194 }
176 break; 195 break;
177 196
178 case 2: if ((x = s[0] &~ m[0]) != 0) 197 case 2:
179 sig = 1; 198 x = s[1] &~ m[1];
180 else if ((x = s[1] &~ m[1]) != 0) 199 if (!x)
181 sig = _NSIG_BPW + 1;
182 else
183 break; 200 break;
184 sig += ffz(~x); 201 sig = ffz(~x) + _NSIG_BPW + 1;
185 break; 202 break;
186 203
187 case 1: if ((x = *s &~ *m) != 0) 204 case 1:
188 sig = ffz(~x) + 1; 205 /* Nothing to do */
189 break; 206 break;
190 } 207 }
191 208
@@ -228,7 +245,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
228 245
229 if (override_rlimit || 246 if (override_rlimit ||
230 atomic_read(&user->sigpending) <= 247 atomic_read(&user->sigpending) <=
231 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { 248 task_rlimit(t, RLIMIT_SIGPENDING)) {
232 q = kmem_cache_alloc(sigqueue_cachep, flags); 249 q = kmem_cache_alloc(sigqueue_cachep, flags);
233 } else { 250 } else {
234 print_dropped_signal(sig); 251 print_dropped_signal(sig);
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 7494bbf5a270..7d3f4fa9ef4f 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -637,7 +637,7 @@ int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
637 goto cancelled; 637 goto cancelled;
638 638
639 /* the timer holds a reference whilst it is pending */ 639 /* the timer holds a reference whilst it is pending */
640 ret = work->ops->get_ref(work); 640 ret = slow_work_get_ref(work);
641 if (ret < 0) 641 if (ret < 0)
642 goto cant_get_ref; 642 goto cant_get_ref;
643 643
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
index 321f3c59d732..a29ebd1ef41d 100644
--- a/kernel/slow-work.h
+++ b/kernel/slow-work.h
@@ -43,28 +43,28 @@ extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
43 */ 43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid) 44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{ 45{
46#ifdef CONFIG_SLOW_WORK_PROC 46#ifdef CONFIG_SLOW_WORK_DEBUG
47 slow_work_pids[id] = pid; 47 slow_work_pids[id] = pid;
48#endif 48#endif
49} 49}
50 50
51static inline void slow_work_mark_time(struct slow_work *work) 51static inline void slow_work_mark_time(struct slow_work *work)
52{ 52{
53#ifdef CONFIG_SLOW_WORK_PROC 53#ifdef CONFIG_SLOW_WORK_DEBUG
54 work->mark = CURRENT_TIME; 54 work->mark = CURRENT_TIME;
55#endif 55#endif
56} 56}
57 57
58static inline void slow_work_begin_exec(int id, struct slow_work *work) 58static inline void slow_work_begin_exec(int id, struct slow_work *work)
59{ 59{
60#ifdef CONFIG_SLOW_WORK_PROC 60#ifdef CONFIG_SLOW_WORK_DEBUG
61 slow_work_execs[id] = work; 61 slow_work_execs[id] = work;
62#endif 62#endif
63} 63}
64 64
65static inline void slow_work_end_exec(int id, struct slow_work *work) 65static inline void slow_work_end_exec(int id, struct slow_work *work)
66{ 66{
67#ifdef CONFIG_SLOW_WORK_PROC 67#ifdef CONFIG_SLOW_WORK_DEBUG
68 write_lock(&slow_work_execs_lock); 68 write_lock(&slow_work_execs_lock);
69 slow_work_execs[id] = NULL; 69 slow_work_execs[id] = NULL;
70 write_unlock(&slow_work_execs_lock); 70 write_unlock(&slow_work_execs_lock);
diff --git a/kernel/smp.c b/kernel/smp.c
index f10408422444..3fc697336183 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -9,11 +9,10 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/gfp.h>
12#include <linux/smp.h> 13#include <linux/smp.h>
13#include <linux/cpu.h> 14#include <linux/cpu.h>
14 15
15static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16
17static struct { 16static struct {
18 struct list_head queue; 17 struct list_head queue;
19 raw_spinlock_t lock; 18 raw_spinlock_t lock;
@@ -33,12 +32,14 @@ struct call_function_data {
33 cpumask_var_t cpumask; 32 cpumask_var_t cpumask;
34}; 33};
35 34
35static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
36
36struct call_single_queue { 37struct call_single_queue {
37 struct list_head list; 38 struct list_head list;
38 raw_spinlock_t lock; 39 raw_spinlock_t lock;
39}; 40};
40 41
41static DEFINE_PER_CPU(struct call_function_data, cfd_data); 42static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue);
42 43
43static int 44static int
44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 45hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -256,7 +257,7 @@ void generic_smp_call_function_single_interrupt(void)
256 } 257 }
257} 258}
258 259
259static DEFINE_PER_CPU(struct call_single_data, csd_data); 260static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
260 261
261/* 262/*
262 * smp_call_function_single - Run a function on a specific CPU 263 * smp_call_function_single - Run a function on a specific CPU
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a09502e2ef75..7c1a67ef0274 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill);
500 */ 500 */
501 501
502/* 502/*
503 * The trampoline is called when the hrtimer expires. If this is 503 * The trampoline is called when the hrtimer expires. It schedules a tasklet
504 * called from the hrtimer interrupt then we schedule the tasklet as 504 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
505 * the timer callback function expects to run in softirq context. If 505 * hrtimer callback, but from softirq context.
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */ 506 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) 507static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{ 508{
511 struct tasklet_hrtimer *ttimer = 509 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer); 510 container_of(timer, struct tasklet_hrtimer, timer);
513 511
514 if (hrtimer_is_hres_active(timer)) { 512 tasklet_hi_schedule(&ttimer->tasklet);
515 tasklet_hi_schedule(&ttimer->tasklet); 513 return HRTIMER_NORESTART;
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519} 514}
520 515
521/* 516/*
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d22579087e27..4b493f67dcb5 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock);
25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ 25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ 26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
28 29
29static int __read_mostly did_panic; 30static int __read_mostly did_panic;
30int __read_mostly softlockup_thresh = 60; 31int __read_mostly softlockup_thresh = 60;
@@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void)
79} 80}
80EXPORT_SYMBOL(touch_softlockup_watchdog); 81EXPORT_SYMBOL(touch_softlockup_watchdog);
81 82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
82void touch_all_softlockup_watchdogs(void) 89void touch_all_softlockup_watchdogs(void)
83{ 90{
84 int cpu; 91 int cpu;
@@ -118,6 +125,14 @@ void softlockup_tick(void)
118 } 125 }
119 126
120 if (touch_ts == 0) { 127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
121 __touch_softlockup_watchdog(); 136 __touch_softlockup_watchdog();
122 return; 137 return;
123 } 138 }
@@ -140,11 +155,11 @@ void softlockup_tick(void)
140 * Wake up the high-prio watchdog task twice per 155 * Wake up the high-prio watchdog task twice per
141 * threshold timespan. 156 * threshold timespan.
142 */ 157 */
143 if (now > touch_ts + softlockup_thresh/2) 158 if (time_after(now - softlockup_thresh/2, touch_ts))
144 wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); 159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
145 160
146 /* Warn about unreasonable delays: */ 161 /* Warn about unreasonable delays: */
147 if (now <= (touch_ts + softlockup_thresh)) 162 if (time_before_eq(now - softlockup_thresh, touch_ts))
148 return; 163 return;
149 164
150 per_cpu(softlockup_print_ts, this_cpu) = touch_ts; 165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
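The softlockup hunk above moves the threshold checks over to the jiffies-style wrap-safe comparisons. A tiny sketch of why time_after() keeps giving the right answer across a counter wrap (the values are chosen only to force the wrap):

#include <stdio.h>
#include <limits.h>

#define time_after(a, b)  ((long)((b) - (a)) < 0)

int main(void)
{
        unsigned long touch_ts = ULONG_MAX - 15;  /* just before the counter wraps  */
        unsigned long now      = 16;              /* 32 ticks later, after the wrap */

        /* A plain "now > touch_ts" would be false here; time_after() is not. */
        printf("wrapped but later: %d\n", time_after(now, touch_ts));  /* prints 1 */
        return 0;
}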
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 818d7d9aa03c..2980da3fd509 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -30,10 +30,33 @@
30#include <linux/preempt.h> 30#include <linux/preempt.h>
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/slab.h>
34#include <linux/smp.h> 33#include <linux/smp.h>
35#include <linux/srcu.h> 34#include <linux/srcu.h>
36 35
36static int init_srcu_struct_fields(struct srcu_struct *sp)
37{
38 sp->completed = 0;
39 mutex_init(&sp->mutex);
40 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
41 return sp->per_cpu_ref ? 0 : -ENOMEM;
42}
43
44#ifdef CONFIG_DEBUG_LOCK_ALLOC
45
46int __init_srcu_struct(struct srcu_struct *sp, const char *name,
47 struct lock_class_key *key)
48{
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50 /* Don't re-initialize a lock while it is held. */
51 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
52 lockdep_init_map(&sp->dep_map, name, key, 0);
53#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
54 return init_srcu_struct_fields(sp);
55}
56EXPORT_SYMBOL_GPL(__init_srcu_struct);
57
58#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
59
37/** 60/**
38 * init_srcu_struct - initialize a sleep-RCU structure 61 * init_srcu_struct - initialize a sleep-RCU structure
39 * @sp: structure to initialize. 62 * @sp: structure to initialize.
@@ -44,13 +67,12 @@
44 */ 67 */
45int init_srcu_struct(struct srcu_struct *sp) 68int init_srcu_struct(struct srcu_struct *sp)
46{ 69{
47 sp->completed = 0; 70 return init_srcu_struct_fields(sp);
48 mutex_init(&sp->mutex);
49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
50 return (sp->per_cpu_ref ? 0 : -ENOMEM);
51} 71}
52EXPORT_SYMBOL_GPL(init_srcu_struct); 72EXPORT_SYMBOL_GPL(init_srcu_struct);
53 73
74#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
75
54/* 76/*
55 * srcu_readers_active_idx -- returns approximate number of readers 77 * srcu_readers_active_idx -- returns approximate number of readers
56 * active on the specified rank of per-CPU counters. 78 * active on the specified rank of per-CPU counters.
@@ -100,15 +122,12 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
100} 122}
101EXPORT_SYMBOL_GPL(cleanup_srcu_struct); 123EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
102 124
103/** 125/*
104 * srcu_read_lock - register a new reader for an SRCU-protected structure.
105 * @sp: srcu_struct in which to register the new reader.
106 *
107 * Counts the new reader in the appropriate per-CPU element of the 126 * Counts the new reader in the appropriate per-CPU element of the
108 * srcu_struct. Must be called from process context. 127 * srcu_struct. Must be called from process context.
109 * Returns an index that must be passed to the matching srcu_read_unlock(). 128 * Returns an index that must be passed to the matching srcu_read_unlock().
110 */ 129 */
111int srcu_read_lock(struct srcu_struct *sp) 130int __srcu_read_lock(struct srcu_struct *sp)
112{ 131{
113 int idx; 132 int idx;
114 133
@@ -120,31 +139,27 @@ int srcu_read_lock(struct srcu_struct *sp)
120 preempt_enable(); 139 preempt_enable();
121 return idx; 140 return idx;
122} 141}
123EXPORT_SYMBOL_GPL(srcu_read_lock); 142EXPORT_SYMBOL_GPL(__srcu_read_lock);
124 143
125/** 144/*
126 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
127 * @sp: srcu_struct in which to unregister the old reader.
128 * @idx: return value from corresponding srcu_read_lock().
129 *
130 * Removes the count for the old reader from the appropriate per-CPU 145 * Removes the count for the old reader from the appropriate per-CPU
131 * element of the srcu_struct. Note that this may well be a different 146 * element of the srcu_struct. Note that this may well be a different
132 * CPU than that which was incremented by the corresponding srcu_read_lock(). 147 * CPU than that which was incremented by the corresponding srcu_read_lock().
133 * Must be called from process context. 148 * Must be called from process context.
134 */ 149 */
135void srcu_read_unlock(struct srcu_struct *sp, int idx) 150void __srcu_read_unlock(struct srcu_struct *sp, int idx)
136{ 151{
137 preempt_disable(); 152 preempt_disable();
138 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 153 srcu_barrier(); /* ensure compiler won't misorder critical section. */
139 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 154 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
140 preempt_enable(); 155 preempt_enable();
141} 156}
142EXPORT_SYMBOL_GPL(srcu_read_unlock); 157EXPORT_SYMBOL_GPL(__srcu_read_unlock);
143 158
144/* 159/*
145 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 160 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
146 */ 161 */
147void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 162static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
148{ 163{
149 int idx; 164 int idx;
150 165
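The srcu.c changes above split the read side into __srcu_read_lock()/__srcu_read_unlock() fast paths that count readers against the currently active index. A single-threaded toy model of that two-index counting; per-CPU distribution, barriers and the grace-period flip are omitted and all names are illustrative:

#include <stdio.h>

struct toy_srcu {
        int completed;          /* low bit selects the active index */
        int c[2];               /* readers counted per index        */
};

static int toy_read_lock(struct toy_srcu *sp)
{
        int idx = sp->completed & 0x1;
        sp->c[idx]++;
        return idx;             /* caller hands this back to unlock */
}

static void toy_read_unlock(struct toy_srcu *sp, int idx)
{
        sp->c[idx]--;
}

int main(void)
{
        struct toy_srcu sp = { 0, { 0, 0 } };
        int idx = toy_read_lock(&sp);

        printf("readers on idx %d: %d\n", idx, sp.c[idx]);  /* 1 */
        toy_read_unlock(&sp, idx);
        printf("readers on idx %d: %d\n", idx, sp.c[idx]);  /* 0 */
        return 0;
}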
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 912823e2a11b..9bb9fb1bd79c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -45,7 +45,7 @@ static int refcount;
45static struct workqueue_struct *stop_machine_wq; 45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle; 46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus; 47static const struct cpumask *active_cpus;
48static void *stop_machine_work; 48static void __percpu *stop_machine_work;
49 49
50static void set_state(enum stopmachine_state newstate) 50static void set_state(enum stopmachine_state newstate)
51{ 51{
diff --git a/kernel/sys.c b/kernel/sys.c
index 26a6b73a6b85..7cb426a58965 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -33,8 +33,10 @@
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/personality.h>
36#include <linux/ptrace.h> 37#include <linux/ptrace.h>
37#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
39#include <linux/gfp.h>
38 40
39#include <linux/compat.h> 41#include <linux/compat.h>
40#include <linux/syscalls.h> 42#include <linux/syscalls.h>
@@ -222,6 +224,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
222 if (which > PRIO_USER || which < PRIO_PROCESS) 224 if (which > PRIO_USER || which < PRIO_PROCESS)
223 return -EINVAL; 225 return -EINVAL;
224 226
227 rcu_read_lock();
225 read_lock(&tasklist_lock); 228 read_lock(&tasklist_lock);
226 switch (which) { 229 switch (which) {
227 case PRIO_PROCESS: 230 case PRIO_PROCESS:
@@ -267,6 +270,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
267 } 270 }
268out_unlock: 271out_unlock:
269 read_unlock(&tasklist_lock); 272 read_unlock(&tasklist_lock);
273 rcu_read_unlock();
270 274
271 return retval; 275 return retval;
272} 276}
@@ -569,13 +573,7 @@ static int set_user(struct cred *new)
569 if (!new_user) 573 if (!new_user)
570 return -EAGAIN; 574 return -EAGAIN;
571 575
572 if (!task_can_switch_user(new_user, current)) { 576 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
573 free_uid(new_user);
574 return -EINVAL;
575 }
576
577 if (atomic_read(&new_user->processes) >=
578 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
579 new_user != INIT_USER) { 577 new_user != INIT_USER) {
580 free_uid(new_user); 578 free_uid(new_user);
581 return -EAGAIN; 579 return -EAGAIN;
@@ -1118,6 +1116,15 @@ out:
1118 1116
1119DECLARE_RWSEM(uts_sem); 1117DECLARE_RWSEM(uts_sem);
1120 1118
1119#ifdef COMPAT_UTS_MACHINE
1120#define override_architecture(name) \
1121 (personality(current->personality) == PER_LINUX32 && \
1122 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
1123 sizeof(COMPAT_UTS_MACHINE)))
1124#else
1125#define override_architecture(name) 0
1126#endif
1127
1121SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1128SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1122{ 1129{
1123 int errno = 0; 1130 int errno = 0;
@@ -1126,9 +1133,66 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1126 if (copy_to_user(name, utsname(), sizeof *name)) 1133 if (copy_to_user(name, utsname(), sizeof *name))
1127 errno = -EFAULT; 1134 errno = -EFAULT;
1128 up_read(&uts_sem); 1135 up_read(&uts_sem);
1136
1137 if (!errno && override_architecture(name))
1138 errno = -EFAULT;
1129 return errno; 1139 return errno;
1130} 1140}
1131 1141
1142#ifdef __ARCH_WANT_SYS_OLD_UNAME
1143/*
1144 * Old cruft
1145 */
1146SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
1147{
1148 int error = 0;
1149
1150 if (!name)
1151 return -EFAULT;
1152
1153 down_read(&uts_sem);
1154 if (copy_to_user(name, utsname(), sizeof(*name)))
1155 error = -EFAULT;
1156 up_read(&uts_sem);
1157
1158 if (!error && override_architecture(name))
1159 error = -EFAULT;
1160 return error;
1161}
1162
1163SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
1164{
1165 int error;
1166
1167 if (!name)
1168 return -EFAULT;
1169 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
1170 return -EFAULT;
1171
1172 down_read(&uts_sem);
1173 error = __copy_to_user(&name->sysname, &utsname()->sysname,
1174 __OLD_UTS_LEN);
1175 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
1176 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
1177 __OLD_UTS_LEN);
1178 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
1179 error |= __copy_to_user(&name->release, &utsname()->release,
1180 __OLD_UTS_LEN);
1181 error |= __put_user(0, name->release + __OLD_UTS_LEN);
1182 error |= __copy_to_user(&name->version, &utsname()->version,
1183 __OLD_UTS_LEN);
1184 error |= __put_user(0, name->version + __OLD_UTS_LEN);
1185 error |= __copy_to_user(&name->machine, &utsname()->machine,
1186 __OLD_UTS_LEN);
1187 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
1188 up_read(&uts_sem);
1189
1190 if (!error && override_architecture(name))
1191 error = -EFAULT;
1192 return error ? -EFAULT : 0;
1193}
1194#endif
1195
1132SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) 1196SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1133{ 1197{
1134 int errno; 1198 int errno;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 695384f12a7d..70f2ea758ffe 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -126,6 +126,7 @@ cond_syscall(sys_setreuid16);
126cond_syscall(sys_setuid16); 126cond_syscall(sys_setuid16);
127cond_syscall(sys_vm86old); 127cond_syscall(sys_vm86old);
128cond_syscall(sys_vm86); 128cond_syscall(sys_vm86);
129cond_syscall(sys_ipc);
129cond_syscall(compat_sys_ipc); 130cond_syscall(compat_sys_ipc);
130cond_syscall(compat_sys_sysctl); 131cond_syscall(compat_sys_sysctl);
131cond_syscall(sys_flock); 132cond_syscall(sys_flock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ac72c9e6bd9b..a38af430f0d8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,6 +23,7 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/signal.h>
26#include <linux/proc_fs.h> 27#include <linux/proc_fs.h>
27#include <linux/security.h> 28#include <linux/security.h>
28#include <linux/ctype.h> 29#include <linux/ctype.h>
@@ -50,6 +51,7 @@
50#include <linux/ftrace.h> 51#include <linux/ftrace.h>
51#include <linux/slow-work.h> 52#include <linux/slow-work.h>
52#include <linux/perf_event.h> 53#include <linux/perf_event.h>
54#include <linux/kprobes.h>
53 55
54#include <asm/uaccess.h> 56#include <asm/uaccess.h>
55#include <asm/processor.h> 57#include <asm/processor.h>
@@ -59,6 +61,18 @@
59#include <asm/stacktrace.h> 61#include <asm/stacktrace.h>
60#include <asm/io.h> 62#include <asm/io.h>
61#endif 63#endif
64#ifdef CONFIG_BSD_PROCESS_ACCT
65#include <linux/acct.h>
66#endif
67#ifdef CONFIG_RT_MUTEXES
68#include <linux/rtmutex.h>
69#endif
70#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
71#include <linux/lockdep.h>
72#endif
73#ifdef CONFIG_CHR_DEV_SG
74#include <scsi/sg.h>
75#endif
62 76
63#ifdef CONFIG_NMI_WATCHDOG 77#ifdef CONFIG_NMI_WATCHDOG
64#include <linux/nmi.h> 78#include <linux/nmi.h>
@@ -68,8 +82,6 @@
68#if defined(CONFIG_SYSCTL) 82#if defined(CONFIG_SYSCTL)
69 83
70/* External variables not in a header file. */ 84/* External variables not in a header file. */
71extern int C_A_D;
72extern int print_fatal_signals;
73extern int sysctl_overcommit_memory; 85extern int sysctl_overcommit_memory;
74extern int sysctl_overcommit_ratio; 86extern int sysctl_overcommit_ratio;
75extern int sysctl_panic_on_oom; 87extern int sysctl_panic_on_oom;
@@ -91,9 +103,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
91#ifndef CONFIG_MMU 103#ifndef CONFIG_MMU
92extern int sysctl_nr_trim_pages; 104extern int sysctl_nr_trim_pages;
93#endif 105#endif
94#ifdef CONFIG_RCU_TORTURE_TEST
95extern int rcutorture_runnable;
96#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
97#ifdef CONFIG_BLOCK 106#ifdef CONFIG_BLOCK
98extern int blk_iopoll_enabled; 107extern int blk_iopoll_enabled;
99#endif 108#endif
@@ -123,14 +132,6 @@ static int min_percpu_pagelist_fract = 8;
123 132
124static int ngroups_max = NGROUPS_MAX; 133static int ngroups_max = NGROUPS_MAX;
125 134
126#ifdef CONFIG_MODULES
127extern char modprobe_path[];
128extern int modules_disabled;
129#endif
130#ifdef CONFIG_CHR_DEV_SG
131extern int sg_big_buff;
132#endif
133
134#ifdef CONFIG_SPARC 135#ifdef CONFIG_SPARC
135#include <asm/system.h> 136#include <asm/system.h>
136#endif 137#endif
@@ -152,10 +153,6 @@ extern int sysctl_userprocess_debug;
152extern int spin_retry; 153extern int spin_retry;
153#endif 154#endif
154 155
155#ifdef CONFIG_BSD_PROCESS_ACCT
156extern int acct_parm[];
157#endif
158
159#ifdef CONFIG_IA64 156#ifdef CONFIG_IA64
160extern int no_unaligned_warning; 157extern int no_unaligned_warning;
161extern int unaligned_dump_stack; 158extern int unaligned_dump_stack;
@@ -163,10 +160,6 @@ extern int unaligned_dump_stack;
163 160
164extern struct ratelimit_state printk_ratelimit_state; 161extern struct ratelimit_state printk_ratelimit_state;
165 162
166#ifdef CONFIG_RT_MUTEXES
167extern int max_lock_depth;
168#endif
169
170#ifdef CONFIG_PROC_SYSCTL 163#ifdef CONFIG_PROC_SYSCTL
171static int proc_do_cad_pid(struct ctl_table *table, int write, 164static int proc_do_cad_pid(struct ctl_table *table, int write,
172 void __user *buffer, size_t *lenp, loff_t *ppos); 165 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -205,9 +198,6 @@ extern struct ctl_table epoll_table[];
205int sysctl_legacy_va_layout; 198int sysctl_legacy_va_layout;
206#endif 199#endif
207 200
208extern int prove_locking;
209extern int lock_stat;
210
211/* The default sysctl tables: */ 201/* The default sysctl tables: */
212 202
213static struct ctl_table root_table[] = { 203static struct ctl_table root_table[] = {
@@ -1454,7 +1444,7 @@ static struct ctl_table fs_table[] = {
1454}; 1444};
1455 1445
1456static struct ctl_table debug_table[] = { 1446static struct ctl_table debug_table[] = {
1457#if defined(CONFIG_X86) || defined(CONFIG_PPC) 1447#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC)
1458 { 1448 {
1459 .procname = "exception-trace", 1449 .procname = "exception-trace",
1460 .data = &show_unhandled_signals, 1450 .data = &show_unhandled_signals,
@@ -1463,6 +1453,17 @@ static struct ctl_table debug_table[] = {
1463 .proc_handler = proc_dointvec 1453 .proc_handler = proc_dointvec
1464 }, 1454 },
1465#endif 1455#endif
1456#if defined(CONFIG_OPTPROBES)
1457 {
1458 .procname = "kprobes-optimization",
1459 .data = &sysctl_kprobes_optimization,
1460 .maxlen = sizeof(int),
1461 .mode = 0644,
1462 .proc_handler = proc_kprobes_optimization_handler,
1463 .extra1 = &zero,
1464 .extra2 = &one,
1465 },
1466#endif
1466 { } 1467 { }
1467}; 1468};
1468 1469
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 8f5d16e0707a..59030570f5ca 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -13,6 +13,7 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/netdevice.h> 15#include <linux/netdevice.h>
16#include <linux/slab.h>
16 17
17#ifdef CONFIG_SYSCTL_SYSCALL 18#ifdef CONFIG_SYSCTL_SYSCALL
18 19
@@ -1331,7 +1332,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1331 ssize_t result; 1332 ssize_t result;
1332 char *pathname; 1333 char *pathname;
1333 int flags; 1334 int flags;
1334 int acc_mode, fmode; 1335 int acc_mode;
1335 1336
1336 pathname = sysctl_getname(name, nlen, &table); 1337 pathname = sysctl_getname(name, nlen, &table);
1337 result = PTR_ERR(pathname); 1338 result = PTR_ERR(pathname);
@@ -1342,15 +1343,12 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1342 if (oldval && oldlen && newval && newlen) { 1343 if (oldval && oldlen && newval && newlen) {
1343 flags = O_RDWR; 1344 flags = O_RDWR;
1344 acc_mode = MAY_READ | MAY_WRITE; 1345 acc_mode = MAY_READ | MAY_WRITE;
1345 fmode = FMODE_READ | FMODE_WRITE;
1346 } else if (newval && newlen) { 1346 } else if (newval && newlen) {
1347 flags = O_WRONLY; 1347 flags = O_WRONLY;
1348 acc_mode = MAY_WRITE; 1348 acc_mode = MAY_WRITE;
1349 fmode = FMODE_WRITE;
1350 } else if (oldval && oldlen) { 1349 } else if (oldval && oldlen) {
1351 flags = O_RDONLY; 1350 flags = O_RDONLY;
1352 acc_mode = MAY_READ; 1351 acc_mode = MAY_READ;
1353 fmode = FMODE_READ;
1354 } else { 1352 } else {
1355 result = 0; 1353 result = 0;
1356 goto out_putname; 1354 goto out_putname;
@@ -1361,7 +1359,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1361 if (result) 1359 if (result)
1362 goto out_putname; 1360 goto out_putname;
1363 1361
1364 result = may_open(&nd.path, acc_mode, fmode); 1362 result = may_open(&nd.path, acc_mode, flags);
1365 if (result) 1363 if (result)
1366 goto out_putpath; 1364 goto out_putpath;
1367 1365
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea8384d3caa7..11281d5792bd 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -22,6 +22,7 @@
22#include <linux/delayacct.h> 22#include <linux/delayacct.h>
23#include <linux/cpumask.h> 23#include <linux/cpumask.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/slab.h>
25#include <linux/cgroupstats.h> 26#include <linux/cgroupstats.h>
26#include <linux/cgroup.h> 27#include <linux/cgroup.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
@@ -46,15 +47,13 @@ static struct genl_family family = {
46 .maxattr = TASKSTATS_CMD_ATTR_MAX, 47 .maxattr = TASKSTATS_CMD_ATTR_MAX,
47}; 48};
48 49
49static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] 50static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
50__read_mostly = {
51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
55 55
56static struct nla_policy 56static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
57cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 57 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
59}; 58};
60 59
diff --git a/kernel/time.c b/kernel/time.c
index 804798005d19..656dccfe1cbb 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -35,7 +35,6 @@
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h>
39#include <linux/math64.h> 38#include <linux/math64.h>
40#include <linux/ptrace.h> 39#include <linux/ptrace.h>
41 40
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e85c23404d34..1f5dde637457 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -343,7 +343,19 @@ static void clocksource_resume_watchdog(void)
343{ 343{
344 unsigned long flags; 344 unsigned long flags;
345 345
346 spin_lock_irqsave(&watchdog_lock, flags); 346 /*
347 * We use trylock here to avoid a potential deadlock when
348 * kgdb calls this code after the kernel has been stopped with
349 * watchdog_lock held. When watchdog_lock is held we just
350 * return and accept that the watchdog might trigger and mark
351 * the monitored clock source (usually TSC) unstable.
352 *
353 * This does not affect the other caller clocksource_resume()
354 * because at this point the kernel is UP, interrupts are
355 * disabled and nothing can hold watchdog_lock.
356 */
357 if (!spin_trylock_irqsave(&watchdog_lock, flags))
358 return;
347 clocksource_reset_watchdog(); 359 clocksource_reset_watchdog();
348 spin_unlock_irqrestore(&watchdog_lock, flags); 360 spin_unlock_irqrestore(&watchdog_lock, flags);
349} 361}
@@ -441,6 +453,18 @@ static inline int clocksource_watchdog_kthread(void *data) { return 0; }
441#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 453#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
442 454
443/** 455/**
456 * clocksource_suspend - suspend the clocksource(s)
457 */
458void clocksource_suspend(void)
459{
460 struct clocksource *cs;
461
462 list_for_each_entry_reverse(cs, &clocksource_list, list)
463 if (cs->suspend)
464 cs->suspend(cs);
465}
466
467/**
444 * clocksource_resume - resume the clocksource(s) 468 * clocksource_resume - resume the clocksource(s)
445 */ 469 */
446void clocksource_resume(void) 470void clocksource_resume(void)
@@ -449,7 +473,7 @@ void clocksource_resume(void)
449 473
450 list_for_each_entry(cs, &clocksource_list, list) 474 list_for_each_entry(cs, &clocksource_list, list)
451 if (cs->resume) 475 if (cs->resume)
452 cs->resume(); 476 cs->resume(cs);
453 477
454 clocksource_resume_watchdog(); 478 clocksource_resume_watchdog();
455} 479}
@@ -458,8 +482,8 @@ void clocksource_resume(void)
458 * clocksource_touch_watchdog - Update watchdog 482 * clocksource_touch_watchdog - Update watchdog
459 * 483 *
460 * Update the watchdog after exception contexts such as kgdb so as not 484 * Update the watchdog after exception contexts such as kgdb so as not
461 * to incorrectly trip the watchdog. 485 * to incorrectly trip the watchdog. This might fail when the kernel
462 * 486 * was stopped in code which holds watchdog_lock.
463 */ 487 */
464void clocksource_touch_watchdog(void) 488void clocksource_touch_watchdog(void)
465{ 489{
@@ -568,6 +592,10 @@ static inline void clocksource_select(void) { }
568 */ 592 */
569static int __init clocksource_done_booting(void) 593static int __init clocksource_done_booting(void)
570{ 594{
595 mutex_lock(&clocksource_mutex);
596 curr_clocksource = clocksource_default_clock();
597 mutex_unlock(&clocksource_mutex);
598
571 finished_booting = 1; 599 finished_booting = 1;
572 600
573 /* 601 /*
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4800f933910e..7c0f180d6e9d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -58,10 +58,10 @@ static s64 time_offset;
58static long time_constant = 2; 58static long time_constant = 2;
59 59
60/* maximum error (usecs): */ 60/* maximum error (usecs): */
61long time_maxerror = NTP_PHASE_LIMIT; 61static long time_maxerror = NTP_PHASE_LIMIT;
62 62
63/* estimated error (usecs): */ 63/* estimated error (usecs): */
64long time_esterror = NTP_PHASE_LIMIT; 64static long time_esterror = NTP_PHASE_LIMIT;
65 65
66/* frequency offset (scaled nsecs/secs): */ 66/* frequency offset (scaled nsecs/secs): */
67static s64 time_freq; 67static s64 time_freq;
@@ -142,11 +142,11 @@ static void ntp_update_offset(long offset)
142 * Select how the frequency is to be controlled 142 * Select how the frequency is to be controlled
143 * and in which mode (PLL or FLL). 143 * and in which mode (PLL or FLL).
144 */ 144 */
145 secs = xtime.tv_sec - time_reftime; 145 secs = get_seconds() - time_reftime;
146 if (unlikely(time_status & STA_FREQHOLD)) 146 if (unlikely(time_status & STA_FREQHOLD))
147 secs = 0; 147 secs = 0;
148 148
149 time_reftime = xtime.tv_sec; 149 time_reftime = get_seconds();
150 150
151 offset64 = offset; 151 offset64 = offset;
152 freq_adj = (offset64 * secs) << 152 freq_adj = (offset64 * secs) <<
@@ -368,7 +368,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
368 * reference time to current time. 368 * reference time to current time.
369 */ 369 */
370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) 370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
371 time_reftime = xtime.tv_sec; 371 time_reftime = get_seconds();
372 372
373 /* only set allowed bits */ 373 /* only set allowed bits */
374 time_status &= STA_RONLY; 374 time_status &= STA_RONLY;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 0a8a213016f0..aada0e52680a 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -22,6 +22,29 @@
22 22
23#include "tick-internal.h" 23#include "tick-internal.h"
24 24
25/* Limit min_delta to a jiffie */
26#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
27
28static int tick_increase_min_delta(struct clock_event_device *dev)
29{
30 /* Nothing to do if we already reached the limit */
31 if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
32 return -ETIME;
33
34 if (dev->min_delta_ns < 5000)
35 dev->min_delta_ns = 5000;
36 else
37 dev->min_delta_ns += dev->min_delta_ns >> 1;
38
39 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
40 dev->min_delta_ns = MIN_DELTA_LIMIT;
41
42 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
43 dev->name ? dev->name : "?",
44 (unsigned long long) dev->min_delta_ns);
45 return 0;
46}
47
25/** 48/**
26 * tick_program_event internal worker function 49 * tick_program_event internal worker function
27 */ 50 */
@@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
37 if (!ret || !force) 60 if (!ret || !force)
38 return ret; 61 return ret;
39 62
63 dev->retries++;
40 /* 64 /*
41 * We tried 2 times to program the device with the given 65 * We tried 3 times to program the device with the given
42 * min_delta_ns. If that's not working then we double it 66 * min_delta_ns. If that's not working then we increase it
43 * and emit a warning. 67 * and emit a warning.
44 */ 68 */
45 if (++i > 2) { 69 if (++i > 2) {
46 /* Increase the min. delta and try again */ 70 /* Increase the min. delta and try again */
47 if (!dev->min_delta_ns) 71 if (tick_increase_min_delta(dev)) {
48 dev->min_delta_ns = 5000; 72 /*
49 else 73 * Get out of the loop if min_delta_ns
50 dev->min_delta_ns += dev->min_delta_ns >> 1; 74 * hit the limit already. That's
51 75 * better than staying here forever.
52 printk(KERN_WARNING 76 *
53 "CE: %s increasing min_delta_ns to %llu nsec\n", 77 * We clear next_event so we have a
54 dev->name ? dev->name : "?", 78 * chance that the box survives.
55 (unsigned long long) dev->min_delta_ns << 1); 79 */
56 80 printk(KERN_WARNING
81 "CE: Reprogramming failure. Giving up\n");
82 dev->next_event.tv64 = KTIME_MAX;
83 return -ETIME;
84 }
57 i = 0; 85 i = 0;
58 } 86 }
59 87
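tick_increase_min_delta() above backs the programmable minimum off in 50% steps, starting at 5000 ns and capped at one jiffy. A standalone sketch that prints the resulting escalation sequence; the HZ value is an assumption, not taken from the patch:

#include <stdio.h>

#define NSEC_PER_SEC    1000000000ULL
#define HZ              250
#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)

int main(void)
{
        unsigned long long min_delta_ns = 0;

        while (min_delta_ns < MIN_DELTA_LIMIT) {
                if (min_delta_ns < 5000)
                        min_delta_ns = 5000;                /* first escalation   */
                else
                        min_delta_ns += min_delta_ns >> 1;  /* grow by 50%        */
                if (min_delta_ns > MIN_DELTA_LIMIT)
                        min_delta_ns = MIN_DELTA_LIMIT;     /* clamp to one jiffy */
                printf("min_delta_ns = %llu\n", min_delta_ns);
        }
        return 0;
}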
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index 12f5c55090be..ac38fbb176cc 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/timecompare.h> 20#include <linux/timecompare.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/slab.h>
22#include <linux/math64.h> 23#include <linux/math64.h>
23 24
24/* 25/*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7faaa32fbf4f..39f6177fafac 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -622,6 +622,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
622 write_sequnlock_irqrestore(&xtime_lock, flags); 622 write_sequnlock_irqrestore(&xtime_lock, flags);
623 623
624 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 624 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
625 clocksource_suspend();
625 626
626 return 0; 627 return 0;
627} 628}
@@ -817,7 +818,8 @@ void update_wall_time(void)
817 shift = min(shift, maxshift); 818 shift = min(shift, maxshift);
818 while (offset >= timekeeper.cycle_interval) { 819 while (offset >= timekeeper.cycle_interval) {
819 offset = logarithmic_accumulation(offset, shift); 820 offset = logarithmic_accumulation(offset, shift);
820 shift--; 821 if(offset < timekeeper.cycle_interval<<shift)
822 shift--;
821 } 823 }
822 824
823 /* correct the clock when NTP error is too big */ 825 /* correct the clock when NTP error is too big */
@@ -880,6 +882,7 @@ void getboottime(struct timespec *ts)
880 882
881 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 883 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
882} 884}
885EXPORT_SYMBOL_GPL(getboottime);
883 886
884/** 887/**
885 * monotonic_to_bootbased - Convert the monotonic time to boot based. 888 * monotonic_to_bootbased - Convert the monotonic time to boot based.
@@ -889,6 +892,7 @@ void monotonic_to_bootbased(struct timespec *ts)
889{ 892{
890 *ts = timespec_add_safe(*ts, total_sleep_time); 893 *ts = timespec_add_safe(*ts, total_sleep_time);
891} 894}
895EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
892 896
893unsigned long get_seconds(void) 897unsigned long get_seconds(void)
894{ 898{
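The one-line change to update_wall_time() above only lowers `shift` once the remaining offset no longer covers a whole `cycle_interval << shift` chunk, so a large backlog (for instance after a long suspend) keeps being consumed in the biggest chunks available instead of degenerating into many small steps. A toy model of that loop; accumulate() and its arguments are invented names, and logarithmic_accumulation() is reduced to a plain subtraction:

#include <stdio.h>

static unsigned long long accumulate(unsigned long long offset,
				     unsigned long long cycle_interval,
				     int shift)
{
	while (offset >= cycle_interval) {
		unsigned long long chunk = cycle_interval << shift;

		if (offset >= chunk)
			offset -= chunk;	/* stands in for logarithmic_accumulation() */
		if (offset < chunk && shift > 0)
			shift--;		/* the guarded decrement from the hunk above */
	}
	return offset;
}

int main(void)
{
	/* 10000 cycle_intervals of backlog are consumed in a handful of passes. */
	printf("left over: %llu\n", accumulate(1000000, 100, 8));
	return 0;
}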
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index bdfb8dd1050c..1a4a7dd78777 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -228,6 +228,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
228 SEQ_printf(m, " event_handler: "); 228 SEQ_printf(m, " event_handler: ");
229 print_name_offset(m, dev->event_handler); 229 print_name_offset(m, dev->event_handler);
230 SEQ_printf(m, "\n"); 230 SEQ_printf(m, "\n");
231 SEQ_printf(m, " retries: %lu\n", dev->retries);
231} 232}
232 233
233static void timer_list_show_tickdevices(struct seq_file *m) 234static void timer_list_show_tickdevices(struct seq_file *m)
@@ -257,7 +258,7 @@ static int timer_list_show(struct seq_file *m, void *v)
257 u64 now = ktime_to_ns(ktime_get()); 258 u64 now = ktime_to_ns(ktime_get());
258 int cpu; 259 int cpu;
259 260
260 SEQ_printf(m, "Timer List Version: v0.5\n"); 261 SEQ_printf(m, "Timer List Version: v0.6\n");
261 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 262 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
262 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 263 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
263 264
diff --git a/kernel/timer.c b/kernel/timer.c
index c61a7949387f..aeb6a54f2771 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_event.h> 40#include <linux/perf_event.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/slab.h>
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44#include <asm/unistd.h> 45#include <asm/unistd.h>
@@ -880,6 +881,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
880 if (base->running_timer == timer) 881 if (base->running_timer == timer)
881 goto out; 882 goto out;
882 883
884 timer_stats_timer_clear_start_info(timer);
883 ret = 0; 885 ret = 0;
884 if (timer_pending(timer)) { 886 if (timer_pending(timer)) {
885 detach_timer(timer, 1); 887 detach_timer(timer, 1);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 6c22d8a2f289..13e13d428cd3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -27,9 +27,7 @@ config HAVE_FUNCTION_GRAPH_TRACER
27config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
28 bool 28 bool
29 help 29 help
30 An arch may pass in a unique value (frame pointer) to both the 30 See Documentation/trace/ftrace-design.txt
31 entering and exiting of a function. On exit, the value is compared
32 and if it does not match, then it will panic the kernel.
33 31
34config HAVE_FUNCTION_TRACE_MCOUNT_TEST 32config HAVE_FUNCTION_TRACE_MCOUNT_TEST
35 bool 33 bool
@@ -330,15 +328,6 @@ config BRANCH_TRACER
330 328
331 Say N if unsure. 329 Say N if unsure.
332 330
333config POWER_TRACER
334 bool "Trace power consumption behavior"
335 depends on X86
336 select GENERIC_TRACER
337 help
338 This tracer helps developers to analyze and optimize the kernel's
339 power management decisions, specifically the C-state and P-state
340 behavior.
341
342config KSYM_TRACER 331config KSYM_TRACER
343 bool "Trace read and write access on kernel memory locations" 332 bool "Trace read and write access on kernel memory locations"
344 depends on HAVE_HW_BREAKPOINT 333 depends on HAVE_HW_BREAKPOINT
@@ -451,7 +440,7 @@ config BLK_DEV_IO_TRACE
451 440
452config KPROBE_EVENT 441config KPROBE_EVENT
453 depends on KPROBES 442 depends on KPROBES
454 depends on X86 443 depends on HAVE_REGS_AND_STACK_ACCESS_API
455 bool "Enable kprobes-based dynamic events" 444 bool "Enable kprobes-based dynamic events"
456 select TRACING 445 select TRACING
457 default y 446 default y
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d00c6fe23f54..78edc6490038 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events.o
52obj-$(CONFIG_EVENT_TRACING) += trace_export.o 52obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54ifeq ($(CONFIG_PERF_EVENTS),y) 54ifeq ($(CONFIG_PERF_EVENTS),y)
55obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o 55obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
56endif 56endif
57obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 57obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
58obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 58obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d9d6206e0b14..b3bc91a3f510 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -21,6 +21,7 @@
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h>
24#include <linux/debugfs.h> 25#include <linux/debugfs.h>
25#include <linux/smp_lock.h> 26#include <linux/smp_lock.h>
26#include <linux/time.h> 27#include <linux/time.h>
@@ -540,9 +541,10 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
540 if (ret) 541 if (ret)
541 return ret; 542 return ret;
542 543
543 if (copy_to_user(arg, &buts, sizeof(buts))) 544 if (copy_to_user(arg, &buts, sizeof(buts))) {
545 blk_trace_remove(q);
544 return -EFAULT; 546 return -EFAULT;
545 547 }
546 return 0; 548 return 0;
547} 549}
548EXPORT_SYMBOL_GPL(blk_trace_setup); 550EXPORT_SYMBOL_GPL(blk_trace_setup);
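The blk_trace_setup() fix above tears the freshly created trace back down when the settings cannot be copied back to user space, instead of returning -EFAULT and leaving an orphaned trace attached to the queue. The general shape of that error path as a standalone sketch; do_setup, copy_out and do_teardown are invented stand-ins for do_blk_trace_setup(), copy_to_user() and blk_trace_remove():

#include <errno.h>
#include <stdio.h>

struct cfg { int id; };

static int  do_setup(struct cfg *c)    { c->id = 42; return 0; }
static void do_teardown(struct cfg *c) { c->id = 0; }
static int  copy_out(struct cfg *c)    { (void)c; return 1; /* simulate a fault */ }

/* Undo the completed setup step when a later step fails, so no state leaks. */
static int setup_and_report(struct cfg *c)
{
	int ret = do_setup(c);
	if (ret)
		return ret;
	if (copy_out(c)) {
		do_teardown(c);
		return -EFAULT;
	}
	return 0;
}

int main(void)
{
	struct cfg c = { 0 };

	printf("setup_and_report: %d, id after failure: %d\n",
	       setup_and_report(&c), c.id);
	return 0;
}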
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1904797f4a8a..2404b59b3097 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -24,9 +24,11 @@
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/ftrace.h> 25#include <linux/ftrace.h>
26#include <linux/sysctl.h> 26#include <linux/sysctl.h>
27#include <linux/slab.h>
27#include <linux/ctype.h> 28#include <linux/ctype.h>
28#include <linux/list.h> 29#include <linux/list.h>
29#include <linux/hash.h> 30#include <linux/hash.h>
31#include <linux/rcupdate.h>
30 32
31#include <trace/events/sched.h> 33#include <trace/events/sched.h>
32 34
@@ -84,22 +86,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
84ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 86ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
85ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 87ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
86 88
87#ifdef CONFIG_FUNCTION_GRAPH_TRACER 89/*
88static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); 90 * Traverse the ftrace_list, invoking all entries. The reason that we
89#endif 91 * can use rcu_dereference_raw() is that elements removed from this list
90 92 * are simply leaked, so there is no need to interact with a grace-period
93 * mechanism. The rcu_dereference_raw() calls are needed to handle
94 * concurrent insertions into the ftrace_list.
95 *
96 * Silly Alpha and silly pointer-speculation compiler optimizations!
97 */
91static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 98static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
92{ 99{
93 struct ftrace_ops *op = ftrace_list; 100 struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/
94
95 /* in case someone actually ports this to alpha! */
96 read_barrier_depends();
97 101
98 while (op != &ftrace_list_end) { 102 while (op != &ftrace_list_end) {
99 /* silly alpha */
100 read_barrier_depends();
101 op->func(ip, parent_ip); 103 op->func(ip, parent_ip);
102 op = op->next; 104 op = rcu_dereference_raw(op->next); /*see above*/
103 }; 105 };
104} 106}
105 107
@@ -154,8 +156,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
154 * the ops->next pointer is valid before another CPU sees 156 * the ops->next pointer is valid before another CPU sees
155 * the ops pointer included into the ftrace_list. 157 * the ops pointer included into the ftrace_list.
156 */ 158 */
157 smp_wmb(); 159 rcu_assign_pointer(ftrace_list, ops);
158 ftrace_list = ops;
159 160
160 if (ftrace_enabled) { 161 if (ftrace_enabled) {
161 ftrace_func_t func; 162 ftrace_func_t func;
@@ -2276,6 +2277,8 @@ __setup("ftrace_filter=", set_ftrace_filter);
2276 2277
2277#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2278#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2278static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; 2279static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
2280static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
2281
2279static int __init set_graph_function(char *str) 2282static int __init set_graph_function(char *str)
2280{ 2283{
2281 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 2284 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -2402,6 +2405,7 @@ static const struct file_operations ftrace_notrace_fops = {
2402static DEFINE_MUTEX(graph_lock); 2405static DEFINE_MUTEX(graph_lock);
2403 2406
2404int ftrace_graph_count; 2407int ftrace_graph_count;
2408int ftrace_graph_filter_enabled;
2405unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2409unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2406 2410
2407static void * 2411static void *
@@ -2424,7 +2428,7 @@ static void *g_start(struct seq_file *m, loff_t *pos)
2424 mutex_lock(&graph_lock); 2428 mutex_lock(&graph_lock);
2425 2429
2426 /* Nothing, tell g_show to print all functions are enabled */ 2430 /* Nothing, tell g_show to print all functions are enabled */
2427 if (!ftrace_graph_count && !*pos) 2431 if (!ftrace_graph_filter_enabled && !*pos)
2428 return (void *)1; 2432 return (void *)1;
2429 2433
2430 return __g_next(m, pos); 2434 return __g_next(m, pos);
@@ -2470,6 +2474,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2470 mutex_lock(&graph_lock); 2474 mutex_lock(&graph_lock);
2471 if ((file->f_mode & FMODE_WRITE) && 2475 if ((file->f_mode & FMODE_WRITE) &&
2472 (file->f_flags & O_TRUNC)) { 2476 (file->f_flags & O_TRUNC)) {
2477 ftrace_graph_filter_enabled = 0;
2473 ftrace_graph_count = 0; 2478 ftrace_graph_count = 0;
2474 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2479 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2475 } 2480 }
@@ -2495,7 +2500,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2495 struct dyn_ftrace *rec; 2500 struct dyn_ftrace *rec;
2496 struct ftrace_page *pg; 2501 struct ftrace_page *pg;
2497 int search_len; 2502 int search_len;
2498 int found = 0; 2503 int fail = 1;
2499 int type, not; 2504 int type, not;
2500 char *search; 2505 char *search;
2501 bool exists; 2506 bool exists;
@@ -2506,37 +2511,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2506 2511
2507 /* decode regex */ 2512 /* decode regex */
2508 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 2513 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2509 if (not) 2514 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
2510 return -EINVAL; 2515 return -EBUSY;
2511 2516
2512 search_len = strlen(search); 2517 search_len = strlen(search);
2513 2518
2514 mutex_lock(&ftrace_lock); 2519 mutex_lock(&ftrace_lock);
2515 do_for_each_ftrace_rec(pg, rec) { 2520 do_for_each_ftrace_rec(pg, rec) {
2516 2521
2517 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2518 break;
2519
2520 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 2522 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
2521 continue; 2523 continue;
2522 2524
2523 if (ftrace_match_record(rec, search, search_len, type)) { 2525 if (ftrace_match_record(rec, search, search_len, type)) {
2524 /* ensure it is not already in the array */ 2526 /* if it is in the array */
2525 exists = false; 2527 exists = false;
2526 for (i = 0; i < *idx; i++) 2528 for (i = 0; i < *idx; i++) {
2527 if (array[i] == rec->ip) { 2529 if (array[i] == rec->ip) {
2528 exists = true; 2530 exists = true;
2529 break; 2531 break;
2530 } 2532 }
2531 if (!exists) 2533 }
2532 array[(*idx)++] = rec->ip; 2534
2533 found = 1; 2535 if (!not) {
2536 fail = 0;
2537 if (!exists) {
2538 array[(*idx)++] = rec->ip;
2539 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2540 goto out;
2541 }
2542 } else {
2543 if (exists) {
2544 array[i] = array[--(*idx)];
2545 array[*idx] = 0;
2546 fail = 0;
2547 }
2548 }
2534 } 2549 }
2535 } while_for_each_ftrace_rec(); 2550 } while_for_each_ftrace_rec();
2536 2551out:
2537 mutex_unlock(&ftrace_lock); 2552 mutex_unlock(&ftrace_lock);
2538 2553
2539 return found ? 0 : -EINVAL; 2554 if (fail)
2555 return -EINVAL;
2556
2557 ftrace_graph_filter_enabled = 1;
2558 return 0;
2540} 2559}
2541 2560
2542static ssize_t 2561static ssize_t
@@ -2546,16 +2565,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2546 struct trace_parser parser; 2565 struct trace_parser parser;
2547 ssize_t read, ret; 2566 ssize_t read, ret;
2548 2567
2549 if (!cnt || cnt < 0) 2568 if (!cnt)
2550 return 0; 2569 return 0;
2551 2570
2552 mutex_lock(&graph_lock); 2571 mutex_lock(&graph_lock);
2553 2572
2554 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
2555 ret = -EBUSY;
2556 goto out_unlock;
2557 }
2558
2559 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { 2573 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2560 ret = -ENOMEM; 2574 ret = -ENOMEM;
2561 goto out_unlock; 2575 goto out_unlock;
@@ -3340,6 +3354,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3340{ 3354{
3341 /* Make sure we do not use the parent ret_stack */ 3355 /* Make sure we do not use the parent ret_stack */
3342 t->ret_stack = NULL; 3356 t->ret_stack = NULL;
3357 t->curr_ret_stack = -1;
3343 3358
3344 if (ftrace_graph_active) { 3359 if (ftrace_graph_active) {
3345 struct ftrace_ret_stack *ret_stack; 3360 struct ftrace_ret_stack *ret_stack;
@@ -3349,7 +3364,6 @@ void ftrace_graph_init_task(struct task_struct *t)
3349 GFP_KERNEL); 3364 GFP_KERNEL);
3350 if (!ret_stack) 3365 if (!ret_stack)
3351 return; 3366 return;
3352 t->curr_ret_stack = -1;
3353 atomic_set(&t->tracing_graph_pause, 0); 3367 atomic_set(&t->tracing_graph_pause, 0);
3354 atomic_set(&t->trace_overrun, 0); 3368 atomic_set(&t->trace_overrun, 0);
3355 t->ftrace_timestamp = 0; 3369 t->ftrace_timestamp = 0;
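The ftrace.c changes above replace the hand-rolled smp_wmb()/read_barrier_depends() pairs with rcu_assign_pointer()/rcu_dereference_raw(), which provide the same publish/subscribe ordering for the ftrace_list walk plus the usual RCU annotations; entries are never freed, so no grace period is needed. A userspace analogy of that ordering using C11 atomics; this only illustrates the idea and is not the kernel RCU API:

#include <stdatomic.h>
#include <stdio.h>

struct op {
	void (*func)(void);
	struct op *next;
};

static struct op end_marker;			/* plays the role of ftrace_list_end */
static _Atomic(struct op *) list_head;		/* plays the role of ftrace_list */

/* Publisher: fully initialise the node, then release-store the head,
 * which is the ordering rcu_assign_pointer() provides. */
static void publish(struct op *op, void (*f)(void))
{
	op->func = f;
	op->next = atomic_load_explicit(&list_head, memory_order_relaxed);
	atomic_store_explicit(&list_head, op, memory_order_release);
}

/* Reader: acquire-load the head and walk the list without blocking.
 * rcu_dereference_raw() gives the cheaper dependency ordering that even
 * Alpha needs; this demo is single-threaded, so plain next loads suffice. */
static void call_all(void)
{
	struct op *op = atomic_load_explicit(&list_head, memory_order_acquire);

	while (op != &end_marker) {
		op->func();
		op = op->next;
	}
}

static void hello(void) { puts("hello"); }

int main(void)
{
	struct op a;

	atomic_init(&list_head, &end_marker);
	publish(&a, hello);
	call_all();
	return 0;
}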
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 9f4f565b01e6..a22582a06161 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -9,7 +9,6 @@
9#include <linux/workqueue.h> 9#include <linux/workqueue.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/slab.h>
13 12
14#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
15#include <trace/events/power.h> 14#include <trace/events/power.h>
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index edefe3b2801b..41ca394feb22 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -14,12 +14,14 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/percpu.h> 15#include <linux/percpu.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/hash.h> 19#include <linux/hash.h>
19#include <linux/list.h> 20#include <linux/list.h>
20#include <linux/cpu.h> 21#include <linux/cpu.h>
21#include <linux/fs.h> 22#include <linux/fs.h>
22 23
24#include <asm/local.h>
23#include "trace.h" 25#include "trace.h"
24 26
25/* 27/*
@@ -206,6 +208,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
206#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
207#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ 209#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
208 210
211#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
212# define RB_FORCE_8BYTE_ALIGNMENT 0
213# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
214#else
215# define RB_FORCE_8BYTE_ALIGNMENT 1
216# define RB_ARCH_ALIGNMENT 8U
217#endif
218
209/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 219/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
210#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 220#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
211 221
@@ -464,6 +474,8 @@ struct ring_buffer_iter {
464 struct ring_buffer_per_cpu *cpu_buffer; 474 struct ring_buffer_per_cpu *cpu_buffer;
465 unsigned long head; 475 unsigned long head;
466 struct buffer_page *head_page; 476 struct buffer_page *head_page;
477 struct buffer_page *cache_reader_page;
478 unsigned long cache_read;
467 u64 read_stamp; 479 u64 read_stamp;
468}; 480};
469 481
@@ -1198,18 +1210,19 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1198 1210
1199 for (i = 0; i < nr_pages; i++) { 1211 for (i = 0; i < nr_pages; i++) {
1200 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1212 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1201 return; 1213 goto out;
1202 p = cpu_buffer->pages->next; 1214 p = cpu_buffer->pages->next;
1203 bpage = list_entry(p, struct buffer_page, list); 1215 bpage = list_entry(p, struct buffer_page, list);
1204 list_del_init(&bpage->list); 1216 list_del_init(&bpage->list);
1205 free_buffer_page(bpage); 1217 free_buffer_page(bpage);
1206 } 1218 }
1207 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1219 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1208 return; 1220 goto out;
1209 1221
1210 rb_reset_cpu(cpu_buffer); 1222 rb_reset_cpu(cpu_buffer);
1211 rb_check_pages(cpu_buffer); 1223 rb_check_pages(cpu_buffer);
1212 1224
1225out:
1213 spin_unlock_irq(&cpu_buffer->reader_lock); 1226 spin_unlock_irq(&cpu_buffer->reader_lock);
1214} 1227}
1215 1228
@@ -1226,7 +1239,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1226 1239
1227 for (i = 0; i < nr_pages; i++) { 1240 for (i = 0; i < nr_pages; i++) {
1228 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1241 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
1229 return; 1242 goto out;
1230 p = pages->next; 1243 p = pages->next;
1231 bpage = list_entry(p, struct buffer_page, list); 1244 bpage = list_entry(p, struct buffer_page, list);
1232 list_del_init(&bpage->list); 1245 list_del_init(&bpage->list);
@@ -1235,6 +1248,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1235 rb_reset_cpu(cpu_buffer); 1248 rb_reset_cpu(cpu_buffer);
1236 rb_check_pages(cpu_buffer); 1249 rb_check_pages(cpu_buffer);
1237 1250
1251out:
1238 spin_unlock_irq(&cpu_buffer->reader_lock); 1252 spin_unlock_irq(&cpu_buffer->reader_lock);
1239} 1253}
1240 1254
@@ -1544,7 +1558,7 @@ rb_update_event(struct ring_buffer_event *event,
1544 1558
1545 case 0: 1559 case 0:
1546 length -= RB_EVNT_HDR_SIZE; 1560 length -= RB_EVNT_HDR_SIZE;
1547 if (length > RB_MAX_SMALL_DATA) 1561 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1548 event->array[0] = length; 1562 event->array[0] = length;
1549 else 1563 else
1550 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1564 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
@@ -1719,11 +1733,11 @@ static unsigned rb_calculate_event_length(unsigned length)
1719 if (!length) 1733 if (!length)
1720 length = 1; 1734 length = 1;
1721 1735
1722 if (length > RB_MAX_SMALL_DATA) 1736 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1723 length += sizeof(event.array[0]); 1737 length += sizeof(event.array[0]);
1724 1738
1725 length += RB_EVNT_HDR_SIZE; 1739 length += RB_EVNT_HDR_SIZE;
1726 length = ALIGN(length, RB_ALIGNMENT); 1740 length = ALIGN(length, RB_ARCH_ALIGNMENT);
1727 1741
1728 return length; 1742 return length;
1729} 1743}
@@ -2230,12 +2244,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2230 if (ring_buffer_flags != RB_BUFFERS_ON) 2244 if (ring_buffer_flags != RB_BUFFERS_ON)
2231 return NULL; 2245 return NULL;
2232 2246
2233 if (atomic_read(&buffer->record_disabled))
2234 return NULL;
2235
2236 /* If we are tracing schedule, we don't want to recurse */ 2247 /* If we are tracing schedule, we don't want to recurse */
2237 resched = ftrace_preempt_disable(); 2248 resched = ftrace_preempt_disable();
2238 2249
2250 if (atomic_read(&buffer->record_disabled))
2251 goto out_nocheck;
2252
2239 if (trace_recursive_lock()) 2253 if (trace_recursive_lock())
2240 goto out_nocheck; 2254 goto out_nocheck;
2241 2255
@@ -2467,11 +2481,11 @@ int ring_buffer_write(struct ring_buffer *buffer,
2467 if (ring_buffer_flags != RB_BUFFERS_ON) 2481 if (ring_buffer_flags != RB_BUFFERS_ON)
2468 return -EBUSY; 2482 return -EBUSY;
2469 2483
2470 if (atomic_read(&buffer->record_disabled))
2471 return -EBUSY;
2472
2473 resched = ftrace_preempt_disable(); 2484 resched = ftrace_preempt_disable();
2474 2485
2486 if (atomic_read(&buffer->record_disabled))
2487 goto out;
2488
2475 cpu = raw_smp_processor_id(); 2489 cpu = raw_smp_processor_id();
2476 2490
2477 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2491 if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -2539,7 +2553,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
2539 * @buffer: The ring buffer to enable writes 2553 * @buffer: The ring buffer to enable writes
2540 * 2554 *
2541 * Note, multiple disables will need the same number of enables 2555 * Note, multiple disables will need the same number of enables
2542 * to truely enable the writing (much like preempt_disable). 2556 * to truly enable the writing (much like preempt_disable).
2543 */ 2557 */
2544void ring_buffer_record_enable(struct ring_buffer *buffer) 2558void ring_buffer_record_enable(struct ring_buffer *buffer)
2545{ 2559{
@@ -2575,7 +2589,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
2575 * @cpu: The CPU to enable. 2589 * @cpu: The CPU to enable.
2576 * 2590 *
2577 * Note, multiple disables will need the same number of enables 2591 * Note, multiple disables will need the same number of enables
2578 * to truely enable the writing (much like preempt_disable). 2592 * to truly enable the writing (much like preempt_disable).
2579 */ 2593 */
2580void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) 2594void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2581{ 2595{
@@ -2716,6 +2730,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2716 iter->read_stamp = cpu_buffer->read_stamp; 2730 iter->read_stamp = cpu_buffer->read_stamp;
2717 else 2731 else
2718 iter->read_stamp = iter->head_page->page->time_stamp; 2732 iter->read_stamp = iter->head_page->page->time_stamp;
2733 iter->cache_reader_page = cpu_buffer->reader_page;
2734 iter->cache_read = cpu_buffer->read;
2719} 2735}
2720 2736
2721/** 2737/**
@@ -3060,13 +3076,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3060 struct ring_buffer_event *event; 3076 struct ring_buffer_event *event;
3061 int nr_loops = 0; 3077 int nr_loops = 0;
3062 3078
3063 if (ring_buffer_iter_empty(iter))
3064 return NULL;
3065
3066 cpu_buffer = iter->cpu_buffer; 3079 cpu_buffer = iter->cpu_buffer;
3067 buffer = cpu_buffer->buffer; 3080 buffer = cpu_buffer->buffer;
3068 3081
3082 /*
3083 * Check if someone performed a consuming read to
3084 * the buffer. A consuming read invalidates the iterator
3085 * and we need to reset the iterator in this case.
3086 */
3087 if (unlikely(iter->cache_read != cpu_buffer->read ||
3088 iter->cache_reader_page != cpu_buffer->reader_page))
3089 rb_iter_reset(iter);
3090
3069 again: 3091 again:
3092 if (ring_buffer_iter_empty(iter))
3093 return NULL;
3094
3070 /* 3095 /*
3071 * We repeat when a timestamp is encountered. 3096 * We repeat when a timestamp is encountered.
3072 * We can get multiple timestamps by nested interrupts or also 3097 * We can get multiple timestamps by nested interrupts or also
@@ -3081,6 +3106,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3081 if (rb_per_cpu_empty(cpu_buffer)) 3106 if (rb_per_cpu_empty(cpu_buffer))
3082 return NULL; 3107 return NULL;
3083 3108
3109 if (iter->head >= local_read(&iter->head_page->page->commit)) {
3110 rb_inc_iter(iter);
3111 goto again;
3112 }
3113
3084 event = rb_iter_head_event(iter); 3114 event = rb_iter_head_event(iter);
3085 3115
3086 switch (event->type_len) { 3116 switch (event->type_len) {
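rb_iter_peek() above now snapshots the reader page and the read count when the iterator is (re)initialised, and transparently resets itself whenever a consuming read has changed either value, since such a read invalidates the iterator's notion of where the head is. The same "cached generation" idiom in standalone form; struct buf, struct iter and read_gen are invented for illustration, not the ring-buffer API:

#include <stdio.h>

struct buf {
	unsigned long read_gen;		/* bumped by every consuming read */
	int data[8];
	int head;
};

struct iter {
	struct buf *b;
	int pos;
	unsigned long cache_gen;	/* snapshot of b->read_gen at reset time */
};

static void iter_reset(struct iter *it)
{
	it->pos = 0;
	it->cache_gen = it->b->read_gen;
}

static int iter_peek(struct iter *it)
{
	/* A consuming read invalidated our position: start over. */
	if (it->cache_gen != it->b->read_gen)
		iter_reset(it);

	return it->pos < it->b->head ? it->b->data[it->pos] : -1;
}

int main(void)
{
	struct buf b = { .head = 2, .data = { 10, 20 } };
	struct iter it = { .b = &b };

	iter_reset(&it);
	it.pos = 1;			/* iterator had advanced to the second entry */
	b.read_gen++;			/* someone performed a consuming read */
	printf("%d\n", iter_peek(&it));	/* prints 10: the iterator re-synced itself */
	return 0;
}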
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index b2477caf09c2..df74c7982255 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -8,6 +8,7 @@
8#include <linux/kthread.h> 8#include <linux/kthread.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <asm/local.h>
11 12
12struct rb_page { 13struct rb_page {
13 u64 ts; 14 u64 ts;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0df1b0f2cb9e..44f916a04065 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -32,10 +32,11 @@
32#include <linux/splice.h> 32#include <linux/splice.h>
33#include <linux/kdebug.h> 33#include <linux/kdebug.h>
34#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/rwsem.h>
36#include <linux/slab.h>
35#include <linux/ctype.h> 37#include <linux/ctype.h>
36#include <linux/init.h> 38#include <linux/init.h>
37#include <linux/poll.h> 39#include <linux/poll.h>
38#include <linux/gfp.h>
39#include <linux/fs.h> 40#include <linux/fs.h>
40 41
41#include "trace.h" 42#include "trace.h"
@@ -91,20 +92,17 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled);
91static inline void ftrace_disable_cpu(void) 92static inline void ftrace_disable_cpu(void)
92{ 93{
93 preempt_disable(); 94 preempt_disable();
94 __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); 95 __this_cpu_inc(ftrace_cpu_disabled);
95} 96}
96 97
97static inline void ftrace_enable_cpu(void) 98static inline void ftrace_enable_cpu(void)
98{ 99{
99 __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); 100 __this_cpu_dec(ftrace_cpu_disabled);
100 preempt_enable(); 101 preempt_enable();
101} 102}
102 103
103static cpumask_var_t __read_mostly tracing_buffer_mask; 104static cpumask_var_t __read_mostly tracing_buffer_mask;
104 105
105/* Define which cpu buffers are currently read in trace_pipe */
106static cpumask_var_t tracing_reader_cpumask;
107
108#define for_each_tracing_cpu(cpu) \ 106#define for_each_tracing_cpu(cpu) \
109 for_each_cpu(cpu, tracing_buffer_mask) 107 for_each_cpu(cpu, tracing_buffer_mask)
110 108
@@ -243,12 +241,91 @@ static struct tracer *current_trace __read_mostly;
243 241
244/* 242/*
245 * trace_types_lock is used to protect the trace_types list. 243 * trace_types_lock is used to protect the trace_types list.
246 * This lock is also used to keep user access serialized.
247 * Accesses from userspace will grab this lock while userspace
248 * activities happen inside the kernel.
249 */ 244 */
250static DEFINE_MUTEX(trace_types_lock); 245static DEFINE_MUTEX(trace_types_lock);
251 246
247/*
248 * Serialize access to the ring buffer.
249 *
250 * The ring buffer serializes readers, but that is only low-level protection.
251 * The validity of events (as returned by ring_buffer_peek() etc.) is not
252 * protected by the ring buffer.
253 *
254 * The content of events may become garbage if we allow other processes to
255 * consume these events concurrently:
256 * A) the page holding the consumed events may become a normal page
257 * (not a reader page) in the ring buffer, and this page will be rewritten
258 * by the events producer.
259 * B) the page holding the consumed events may become a page for splice_read,
260 * and this page will be returned to the system.
261 *
262 * These primitives allow multiple processes to access different cpu ring
263 * buffers concurrently.
264 *
265 * These primitives don't distinguish read-only from read-consume access.
266 * Multiple read-only accesses are also serialized.

267 */
268
269#ifdef CONFIG_SMP
270static DECLARE_RWSEM(all_cpu_access_lock);
271static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
272
273static inline void trace_access_lock(int cpu)
274{
275 if (cpu == TRACE_PIPE_ALL_CPU) {
276 /* gain it for accessing the whole ring buffer. */
277 down_write(&all_cpu_access_lock);
278 } else {
279 /* gain it for accessing a cpu ring buffer. */
280
281 /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
282 down_read(&all_cpu_access_lock);
283
284 /* Secondly block other access to this @cpu ring buffer. */
285 mutex_lock(&per_cpu(cpu_access_lock, cpu));
286 }
287}
288
289static inline void trace_access_unlock(int cpu)
290{
291 if (cpu == TRACE_PIPE_ALL_CPU) {
292 up_write(&all_cpu_access_lock);
293 } else {
294 mutex_unlock(&per_cpu(cpu_access_lock, cpu));
295 up_read(&all_cpu_access_lock);
296 }
297}
298
299static inline void trace_access_lock_init(void)
300{
301 int cpu;
302
303 for_each_possible_cpu(cpu)
304 mutex_init(&per_cpu(cpu_access_lock, cpu));
305}
306
307#else
308
309static DEFINE_MUTEX(access_lock);
310
311static inline void trace_access_lock(int cpu)
312{
313 (void)cpu;
314 mutex_lock(&access_lock);
315}
316
317static inline void trace_access_unlock(int cpu)
318{
319 (void)cpu;
320 mutex_unlock(&access_lock);
321}
322
323static inline void trace_access_lock_init(void)
324{
325}
326
327#endif
328
252/* trace_wait is a waitqueue for tasks blocked on trace_poll */ 329/* trace_wait is a waitqueue for tasks blocked on trace_poll */
253static DECLARE_WAIT_QUEUE_HEAD(trace_wait); 330static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
254 331
@@ -297,6 +374,21 @@ static int __init set_buf_size(char *str)
297} 374}
298__setup("trace_buf_size=", set_buf_size); 375__setup("trace_buf_size=", set_buf_size);
299 376
377static int __init set_tracing_thresh(char *str)
378{
379	unsigned long threshold;
380	int ret;
381
382	if (!str)
383		return 0;
384	ret = strict_strtoul(str, 0, &threshold);
385	if (ret < 0)
386		return 0;
387	tracing_thresh = threshold * 1000;
388 return 1;
389}
390__setup("tracing_thresh=", set_tracing_thresh);
391
300unsigned long nsecs_to_usecs(unsigned long nsecs) 392unsigned long nsecs_to_usecs(unsigned long nsecs)
301{ 393{
302 return nsecs / 1000; 394 return nsecs / 1000;
@@ -502,9 +594,10 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
502static arch_spinlock_t ftrace_max_lock = 594static arch_spinlock_t ftrace_max_lock =
503 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 595 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
504 596
597unsigned long __read_mostly tracing_thresh;
598
505#ifdef CONFIG_TRACER_MAX_TRACE 599#ifdef CONFIG_TRACER_MAX_TRACE
506unsigned long __read_mostly tracing_max_latency; 600unsigned long __read_mostly tracing_max_latency;
507unsigned long __read_mostly tracing_thresh;
508 601
509/* 602/*
510 * Copy the new maximum trace into the separate maximum-trace 603 * Copy the new maximum trace into the separate maximum-trace
@@ -515,7 +608,7 @@ static void
515__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 608__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
516{ 609{
517 struct trace_array_cpu *data = tr->data[cpu]; 610 struct trace_array_cpu *data = tr->data[cpu];
518 struct trace_array_cpu *max_data = tr->data[cpu]; 611 struct trace_array_cpu *max_data;
519 612
520 max_tr.cpu = cpu; 613 max_tr.cpu = cpu;
521 max_tr.time_start = data->preempt_timestamp; 614 max_tr.time_start = data->preempt_timestamp;
@@ -525,7 +618,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
525 max_data->critical_start = data->critical_start; 618 max_data->critical_start = data->critical_start;
526 max_data->critical_end = data->critical_end; 619 max_data->critical_end = data->critical_end;
527 620
528 memcpy(data->comm, tsk->comm, TASK_COMM_LEN); 621 memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
529 max_data->pid = tsk->pid; 622 max_data->pid = tsk->pid;
530 max_data->uid = task_uid(tsk); 623 max_data->uid = task_uid(tsk);
531 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; 624 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
@@ -747,10 +840,10 @@ out:
747 mutex_unlock(&trace_types_lock); 840 mutex_unlock(&trace_types_lock);
748} 841}
749 842
750static void __tracing_reset(struct trace_array *tr, int cpu) 843static void __tracing_reset(struct ring_buffer *buffer, int cpu)
751{ 844{
752 ftrace_disable_cpu(); 845 ftrace_disable_cpu();
753 ring_buffer_reset_cpu(tr->buffer, cpu); 846 ring_buffer_reset_cpu(buffer, cpu);
754 ftrace_enable_cpu(); 847 ftrace_enable_cpu();
755} 848}
756 849
@@ -762,7 +855,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
762 855
763 /* Make sure all commits have finished */ 856 /* Make sure all commits have finished */
764 synchronize_sched(); 857 synchronize_sched();
765 __tracing_reset(tr, cpu); 858 __tracing_reset(buffer, cpu);
766 859
767 ring_buffer_record_enable(buffer); 860 ring_buffer_record_enable(buffer);
768} 861}
@@ -780,7 +873,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
780 tr->time_start = ftrace_now(tr->cpu); 873 tr->time_start = ftrace_now(tr->cpu);
781 874
782 for_each_online_cpu(cpu) 875 for_each_online_cpu(cpu)
783 __tracing_reset(tr, cpu); 876 __tracing_reset(buffer, cpu);
784 877
785 ring_buffer_record_enable(buffer); 878 ring_buffer_record_enable(buffer);
786} 879}
@@ -857,6 +950,8 @@ void tracing_start(void)
857 goto out; 950 goto out;
858 } 951 }
859 952
953 /* Prevent the buffers from switching */
954 arch_spin_lock(&ftrace_max_lock);
860 955
861 buffer = global_trace.buffer; 956 buffer = global_trace.buffer;
862 if (buffer) 957 if (buffer)
@@ -866,6 +961,8 @@ void tracing_start(void)
866 if (buffer) 961 if (buffer)
867 ring_buffer_record_enable(buffer); 962 ring_buffer_record_enable(buffer);
868 963
964 arch_spin_unlock(&ftrace_max_lock);
965
869 ftrace_start(); 966 ftrace_start();
870 out: 967 out:
871 spin_unlock_irqrestore(&tracing_start_lock, flags); 968 spin_unlock_irqrestore(&tracing_start_lock, flags);
@@ -887,6 +984,9 @@ void tracing_stop(void)
887 if (trace_stop_count++) 984 if (trace_stop_count++)
888 goto out; 985 goto out;
889 986
987 /* Prevent the buffers from switching */
988 arch_spin_lock(&ftrace_max_lock);
989
890 buffer = global_trace.buffer; 990 buffer = global_trace.buffer;
891 if (buffer) 991 if (buffer)
892 ring_buffer_record_disable(buffer); 992 ring_buffer_record_disable(buffer);
@@ -895,6 +995,8 @@ void tracing_stop(void)
895 if (buffer) 995 if (buffer)
896 ring_buffer_record_disable(buffer); 996 ring_buffer_record_disable(buffer);
897 997
998 arch_spin_unlock(&ftrace_max_lock);
999
898 out: 1000 out:
899 spin_unlock_irqrestore(&tracing_start_lock, flags); 1001 spin_unlock_irqrestore(&tracing_start_lock, flags);
900} 1002}
@@ -951,6 +1053,11 @@ void trace_find_cmdline(int pid, char comm[])
951 return; 1053 return;
952 } 1054 }
953 1055
1056 if (WARN_ON_ONCE(pid < 0)) {
1057 strcpy(comm, "<XXX>");
1058 return;
1059 }
1060
954 if (pid > PID_MAX_DEFAULT) { 1061 if (pid > PID_MAX_DEFAULT) {
955 strcpy(comm, "<...>"); 1062 strcpy(comm, "<...>");
956 return; 1063 return;
@@ -1084,7 +1191,7 @@ trace_function(struct trace_array *tr,
1084 struct ftrace_entry *entry; 1191 struct ftrace_entry *entry;
1085 1192
1086 /* If we are reading the ring buffer, don't trace */ 1193 /* If we are reading the ring buffer, don't trace */
1087 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 1194 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
1088 return; 1195 return;
1089 1196
1090 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), 1197 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1177,6 +1284,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1177 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1284 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1178 return; 1285 return;
1179 1286
1287 /*
1288 * NMIs can not handle page faults, even with fix ups.
1289 * The save user stack can (and often does) fault.
1290 */
1291 if (unlikely(in_nmi()))
1292 return;
1293
1180 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1294 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1181 sizeof(*entry), flags, pc); 1295 sizeof(*entry), flags, pc);
1182 if (!event) 1296 if (!event)
@@ -1315,8 +1429,10 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1315 entry->fmt = fmt; 1429 entry->fmt = fmt;
1316 1430
1317 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1431 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1318 if (!filter_check_discard(call, entry, buffer, event)) 1432 if (!filter_check_discard(call, entry, buffer, event)) {
1319 ring_buffer_unlock_commit(buffer, event); 1433 ring_buffer_unlock_commit(buffer, event);
1434 ftrace_trace_stack(buffer, flags, 6, pc);
1435 }
1320 1436
1321out_unlock: 1437out_unlock:
1322 arch_spin_unlock(&trace_buf_lock); 1438 arch_spin_unlock(&trace_buf_lock);
@@ -1389,8 +1505,10 @@ int trace_array_vprintk(struct trace_array *tr,
1389 1505
1390 memcpy(&entry->buf, trace_buf, len); 1506 memcpy(&entry->buf, trace_buf, len);
1391 entry->buf[len] = '\0'; 1507 entry->buf[len] = '\0';
1392 if (!filter_check_discard(call, entry, buffer, event)) 1508 if (!filter_check_discard(call, entry, buffer, event)) {
1393 ring_buffer_unlock_commit(buffer, event); 1509 ring_buffer_unlock_commit(buffer, event);
1510 ftrace_trace_stack(buffer, irq_flags, 6, pc);
1511 }
1394 1512
1395 out_unlock: 1513 out_unlock:
1396 arch_spin_unlock(&trace_buf_lock); 1514 arch_spin_unlock(&trace_buf_lock);
@@ -1580,12 +1698,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1580} 1698}
1581 1699
1582/* 1700/*
1583 * No necessary locking here. The worst thing which can
1584 * happen is loosing events consumed at the same time
1585 * by a trace_pipe reader.
1586 * Other than that, we don't risk to crash the ring buffer
1587 * because it serializes the readers.
1588 *
1589 * The current tracer is copied to avoid a global locking 1701 * The current tracer is copied to avoid a global locking
1590 * all around. 1702 * all around.
1591 */ 1703 */
@@ -1623,6 +1735,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1623 1735
1624 ftrace_enable_cpu(); 1736 ftrace_enable_cpu();
1625 1737
1738 iter->leftover = 0;
1626 for (p = iter; p && l < *pos; p = s_next(m, p, &l)) 1739 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1627 ; 1740 ;
1628 1741
@@ -1640,12 +1753,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1640 } 1753 }
1641 1754
1642 trace_event_read_lock(); 1755 trace_event_read_lock();
1756 trace_access_lock(cpu_file);
1643 return p; 1757 return p;
1644} 1758}
1645 1759
1646static void s_stop(struct seq_file *m, void *p) 1760static void s_stop(struct seq_file *m, void *p)
1647{ 1761{
1762 struct trace_iterator *iter = m->private;
1763
1648 atomic_dec(&trace_record_cmdline_disabled); 1764 atomic_dec(&trace_record_cmdline_disabled);
1765 trace_access_unlock(iter->cpu_file);
1649 trace_event_read_unlock(); 1766 trace_event_read_unlock();
1650} 1767}
1651 1768
@@ -2836,22 +2953,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2836 2953
2837 mutex_lock(&trace_types_lock); 2954 mutex_lock(&trace_types_lock);
2838 2955
2839 /* We only allow one reader per cpu */
2840 if (cpu_file == TRACE_PIPE_ALL_CPU) {
2841 if (!cpumask_empty(tracing_reader_cpumask)) {
2842 ret = -EBUSY;
2843 goto out;
2844 }
2845 cpumask_setall(tracing_reader_cpumask);
2846 } else {
2847 if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
2848 cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
2849 else {
2850 ret = -EBUSY;
2851 goto out;
2852 }
2853 }
2854
2855 /* create a buffer to store the information to pass to userspace */ 2956 /* create a buffer to store the information to pass to userspace */
2856 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2957 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2857 if (!iter) { 2958 if (!iter) {
@@ -2907,12 +3008,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2907 3008
2908 mutex_lock(&trace_types_lock); 3009 mutex_lock(&trace_types_lock);
2909 3010
2910 if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
2911 cpumask_clear(tracing_reader_cpumask);
2912 else
2913 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2914
2915
2916 if (iter->trace->pipe_close) 3011 if (iter->trace->pipe_close)
2917 iter->trace->pipe_close(iter); 3012 iter->trace->pipe_close(iter);
2918 3013
@@ -3074,6 +3169,7 @@ waitagain:
3074 iter->pos = -1; 3169 iter->pos = -1;
3075 3170
3076 trace_event_read_lock(); 3171 trace_event_read_lock();
3172 trace_access_lock(iter->cpu_file);
3077 while (find_next_entry_inc(iter) != NULL) { 3173 while (find_next_entry_inc(iter) != NULL) {
3078 enum print_line_t ret; 3174 enum print_line_t ret;
3079 int len = iter->seq.len; 3175 int len = iter->seq.len;
@@ -3090,6 +3186,7 @@ waitagain:
3090 if (iter->seq.len >= cnt) 3186 if (iter->seq.len >= cnt)
3091 break; 3187 break;
3092 } 3188 }
3189 trace_access_unlock(iter->cpu_file);
3093 trace_event_read_unlock(); 3190 trace_event_read_unlock();
3094 3191
3095 /* Now copy what we have to the user */ 3192 /* Now copy what we have to the user */
@@ -3215,6 +3312,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3215 } 3312 }
3216 3313
3217 trace_event_read_lock(); 3314 trace_event_read_lock();
3315 trace_access_lock(iter->cpu_file);
3218 3316
3219 /* Fill as many pages as possible. */ 3317 /* Fill as many pages as possible. */
3220 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3318 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
@@ -3238,6 +3336,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3238 trace_seq_init(&iter->seq); 3336 trace_seq_init(&iter->seq);
3239 } 3337 }
3240 3338
3339 trace_access_unlock(iter->cpu_file);
3241 trace_event_read_unlock(); 3340 trace_event_read_unlock();
3242 mutex_unlock(&iter->mutex); 3341 mutex_unlock(&iter->mutex);
3243 3342
@@ -3539,10 +3638,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3539 3638
3540 info->read = 0; 3639 info->read = 0;
3541 3640
3641 trace_access_lock(info->cpu);
3542 ret = ring_buffer_read_page(info->tr->buffer, 3642 ret = ring_buffer_read_page(info->tr->buffer,
3543 &info->spare, 3643 &info->spare,
3544 count, 3644 count,
3545 info->cpu, 0); 3645 info->cpu, 0);
3646 trace_access_unlock(info->cpu);
3546 if (ret < 0) 3647 if (ret < 0)
3547 return 0; 3648 return 0;
3548 3649
@@ -3670,6 +3771,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3670 len &= PAGE_MASK; 3771 len &= PAGE_MASK;
3671 } 3772 }
3672 3773
3774 trace_access_lock(info->cpu);
3673 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3775 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3674 3776
3675 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { 3777 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
@@ -3717,6 +3819,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3717 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3819 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3718 } 3820 }
3719 3821
3822 trace_access_unlock(info->cpu);
3720 spd.nr_pages = i; 3823 spd.nr_pages = i;
3721 3824
3722 /* did we read anything? */ 3825 /* did we read anything? */
@@ -4153,6 +4256,8 @@ static __init int tracer_init_debugfs(void)
4153 struct dentry *d_tracer; 4256 struct dentry *d_tracer;
4154 int cpu; 4257 int cpu;
4155 4258
4259 trace_access_lock_init();
4260
4156 d_tracer = tracing_init_dentry(); 4261 d_tracer = tracing_init_dentry();
4157 4262
4158 trace_create_file("tracing_enabled", 0644, d_tracer, 4263 trace_create_file("tracing_enabled", 0644, d_tracer,
@@ -4176,10 +4281,10 @@ static __init int tracer_init_debugfs(void)
4176#ifdef CONFIG_TRACER_MAX_TRACE 4281#ifdef CONFIG_TRACER_MAX_TRACE
4177 trace_create_file("tracing_max_latency", 0644, d_tracer, 4282 trace_create_file("tracing_max_latency", 0644, d_tracer,
4178 &tracing_max_latency, &tracing_max_lat_fops); 4283 &tracing_max_latency, &tracing_max_lat_fops);
4284#endif
4179 4285
4180 trace_create_file("tracing_thresh", 0644, d_tracer, 4286 trace_create_file("tracing_thresh", 0644, d_tracer,
4181 &tracing_thresh, &tracing_max_lat_fops); 4287 &tracing_thresh, &tracing_max_lat_fops);
4182#endif
4183 4288
4184 trace_create_file("README", 0444, d_tracer, 4289 trace_create_file("README", 0444, d_tracer,
4185 NULL, &tracing_readme_fops); 4290 NULL, &tracing_readme_fops);
@@ -4387,9 +4492,6 @@ __init static int tracer_alloc_buffers(void)
4387 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4492 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4388 goto out_free_buffer_mask; 4493 goto out_free_buffer_mask;
4389 4494
4390 if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4391 goto out_free_tracing_cpumask;
4392
4393 /* To save memory, keep the ring buffer size to its minimum */ 4495 /* To save memory, keep the ring buffer size to its minimum */
4394 if (ring_buffer_expanded) 4496 if (ring_buffer_expanded)
4395 ring_buf_size = trace_buf_size; 4497 ring_buf_size = trace_buf_size;
@@ -4447,8 +4549,6 @@ __init static int tracer_alloc_buffers(void)
4447 return 0; 4549 return 0;
4448 4550
4449out_free_cpumask: 4551out_free_cpumask:
4450 free_cpumask_var(tracing_reader_cpumask);
4451out_free_tracing_cpumask:
4452 free_cpumask_var(tracing_cpumask); 4552 free_cpumask_var(tracing_cpumask);
4453out_free_buffer_mask: 4553out_free_buffer_mask:
4454 free_cpumask_var(tracing_buffer_mask); 4554 free_cpumask_var(tracing_buffer_mask);
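The trace_access_lock() scheme added above lets any number of per-cpu trace_pipe readers run in parallel while a TRACE_PIPE_ALL_CPU reader excludes them all: per-cpu readers take the rwsem shared plus their cpu's mutex, the all-cpu reader takes the rwsem exclusive. A userspace analogy of that two-level pattern with POSIX primitives; NCPU, access_lock() and the pthread rwlock are illustrative choices, not the kernel code:

#include <pthread.h>

#define NCPU 4

static pthread_rwlock_t all_cpu_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t cpu_lock[NCPU] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* cpu >= 0: serialize only against readers of that cpu.
 * cpu < 0 (the "all cpus" case): exclude every other reader. */
static void access_lock(int cpu)
{
	if (cpu < 0) {
		pthread_rwlock_wrlock(&all_cpu_lock);
	} else {
		pthread_rwlock_rdlock(&all_cpu_lock);	/* holds off "all cpus" readers */
		pthread_mutex_lock(&cpu_lock[cpu]);	/* holds off readers of this cpu */
	}
}

static void access_unlock(int cpu)
{
	if (cpu < 0) {
		pthread_rwlock_unlock(&all_cpu_lock);
	} else {
		pthread_mutex_unlock(&cpu_lock[cpu]);
		pthread_rwlock_unlock(&all_cpu_lock);
	}
}

int main(void)
{
	access_lock(1);
	access_unlock(1);
	access_lock(-1);
	access_unlock(-1);
	return 0;
}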
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4df6a77eb196..2825ef2c0b15 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -396,9 +396,10 @@ extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
396 396
397extern unsigned long nsecs_to_usecs(unsigned long nsecs); 397extern unsigned long nsecs_to_usecs(unsigned long nsecs);
398 398
399extern unsigned long tracing_thresh;
400
399#ifdef CONFIG_TRACER_MAX_TRACE 401#ifdef CONFIG_TRACER_MAX_TRACE
400extern unsigned long tracing_max_latency; 402extern unsigned long tracing_max_latency;
401extern unsigned long tracing_thresh;
402 403
403void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 404void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
404void update_max_tr_single(struct trace_array *tr, 405void update_max_tr_single(struct trace_array *tr,
@@ -497,6 +498,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
497#ifdef CONFIG_DYNAMIC_FTRACE 498#ifdef CONFIG_DYNAMIC_FTRACE
498/* TODO: make this variable */ 499/* TODO: make this variable */
499#define FTRACE_GRAPH_MAX_FUNCS 32 500#define FTRACE_GRAPH_MAX_FUNCS 32
501extern int ftrace_graph_filter_enabled;
500extern int ftrace_graph_count; 502extern int ftrace_graph_count;
501extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; 503extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
502 504
@@ -504,7 +506,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
504{ 506{
505 int i; 507 int i;
506 508
507 if (!ftrace_graph_count || test_tsk_trace_graph(current)) 509 if (!ftrace_graph_filter_enabled)
508 return 1; 510 return 1;
509 511
510 for (i = 0; i < ftrace_graph_count; i++) { 512 for (i = 0; i < ftrace_graph_count; i++) {
@@ -549,7 +551,7 @@ static inline int ftrace_trace_task(struct task_struct *task)
549 * struct trace_parser - servers for reading the user input separated by spaces 551 * struct trace_parser - servers for reading the user input separated by spaces
550 * @cont: set if the input is not complete - no final space char was found 552 * @cont: set if the input is not complete - no final space char was found
551 * @buffer: holds the parsed user input 553 * @buffer: holds the parsed user input
552 * @idx: user input lenght 554 * @idx: user input length
553 * @size: buffer size 555 * @size: buffer size
554 */ 556 */
555struct trace_parser { 557struct trace_parser {
@@ -791,7 +793,8 @@ extern const char *__stop___trace_bprintk_fmt[];
791 793
792#undef FTRACE_ENTRY 794#undef FTRACE_ENTRY
793#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ 795#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
794 extern struct ftrace_event_call event_##call; 796 extern struct ftrace_event_call \
797 __attribute__((__aligned__(4))) event_##call;
795#undef FTRACE_ENTRY_DUP 798#undef FTRACE_ENTRY_DUP
796#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ 799#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
797 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 800 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4a194f08f88c..b9bc4d470177 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -307,8 +307,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
307 return -1; 307 return -1;
308 if (percent_a > percent_b) 308 if (percent_a > percent_b)
309 return 1; 309 return 1;
310 else 310
311 return 0; 311 if (a->incorrect < b->incorrect)
312 return -1;
313 if (a->incorrect > b->incorrect)
314 return 1;
315
316 /*
317 * Since the above shows worse (incorrect) cases
318 * first, we continue that by showing best (correct)
319 * cases last.
320 */
321 if (a->correct > b->correct)
322 return -1;
323 if (a->correct < b->correct)
324 return 1;
325
326 return 0;
312} 327}
313 328
314static struct tracer_stat annotated_branch_stats = { 329static struct tracer_stat annotated_branch_stats = {
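annotated_branch_stat_cmp() above grows two tie-breakers, the absolute incorrect count and then the correct count, so entries with identical incorrect percentages still sort in a stable and meaningful order instead of falling back to insertion order. A standalone example of the same chained-comparator shape; struct stat_entry and its fields are illustrative, not the branch-profiling structures:

#include <stdio.h>
#include <stdlib.h>

struct stat_entry { long percent; long incorrect; long correct; };

/* Compare by the most significant key first and fall through to the
 * next key only on a tie, like the patched comparator. */
static int stat_cmp(const void *pa, const void *pb)
{
	const struct stat_entry *a = pa, *b = pb;

	if (a->percent != b->percent)
		return a->percent < b->percent ? -1 : 1;
	if (a->incorrect != b->incorrect)
		return a->incorrect < b->incorrect ? -1 : 1;
	if (a->correct != b->correct)		/* final tie-break, reversed */
		return a->correct > b->correct ? -1 : 1;
	return 0;
}

int main(void)
{
	struct stat_entry e[] = { { 50, 10, 10 }, { 50, 10, 99 }, { 90, 1, 1 } };

	qsort(e, 3, sizeof(e[0]), stat_cmp);
	for (int i = 0; i < 3; i++)
		printf("%ld %ld %ld\n", e[i].percent, e[i].incorrect, e[i].correct);
	return 0;
}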
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 84a3a7ba072a..9d589d8dcd1a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -13,6 +13,7 @@
13 * Tracer plugins will chose a default from these clocks. 13 * Tracer plugins will chose a default from these clocks.
14 */ 14 */
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/irqflags.h>
16#include <linux/hardirq.h> 17#include <linux/hardirq.h>
17#include <linux/module.h> 18#include <linux/module.h>
18#include <linux/percpu.h> 19#include <linux/percpu.h>
@@ -83,7 +84,7 @@ u64 notrace trace_clock_global(void)
83 int this_cpu; 84 int this_cpu;
84 u64 now; 85 u64 now;
85 86
86 raw_local_irq_save(flags); 87 local_irq_save(flags);
87 88
88 this_cpu = raw_smp_processor_id(); 89 this_cpu = raw_smp_processor_id();
89 now = cpu_clock(this_cpu); 90 now = cpu_clock(this_cpu);
@@ -109,7 +110,7 @@ u64 notrace trace_clock_global(void)
109 arch_spin_unlock(&trace_clock_struct.lock); 110 arch_spin_unlock(&trace_clock_struct.lock);
110 111
111 out: 112 out:
112 raw_local_irq_restore(flags); 113 local_irq_restore(flags);
113 114
114 return now; 115 return now;
115} 116}
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_perf.c
index f0d693005075..0565bb42566f 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_perf.c
@@ -1,32 +1,41 @@
1/* 1/*
2 * trace event based perf counter profiling 2 * trace event based perf event profiling/tracing
3 * 3 *
4 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> 4 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
5 * 5 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
13EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
14
15EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
12 16
13static char *perf_trace_buf; 17static char *perf_trace_buf;
14static char *perf_trace_buf_nmi; 18static char *perf_trace_buf_nmi;
15 19
16typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; 20/*
21 * Force it to be aligned to unsigned long to avoid misaligned access
22 * surprises
23 */
24typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
25 perf_trace_t;
17 26
18/* Count the events in use (per event id, not per instance) */ 27/* Count the events in use (per event id, not per instance) */
19static int total_profile_count; 28static int total_ref_count;
20 29
21static int ftrace_profile_enable_event(struct ftrace_event_call *event) 30static int perf_trace_event_enable(struct ftrace_event_call *event)
22{ 31{
23 char *buf; 32 char *buf;
24 int ret = -ENOMEM; 33 int ret = -ENOMEM;
25 34
26 if (event->profile_count++ > 0) 35 if (event->perf_refcount++ > 0)
27 return 0; 36 return 0;
28 37
29 if (!total_profile_count) { 38 if (!total_ref_count) {
30 buf = (char *)alloc_percpu(perf_trace_t); 39 buf = (char *)alloc_percpu(perf_trace_t);
31 if (!buf) 40 if (!buf)
32 goto fail_buf; 41 goto fail_buf;
@@ -40,35 +49,35 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
40 rcu_assign_pointer(perf_trace_buf_nmi, buf); 49 rcu_assign_pointer(perf_trace_buf_nmi, buf);
41 } 50 }
42 51
43 ret = event->profile_enable(event); 52 ret = event->perf_event_enable(event);
44 if (!ret) { 53 if (!ret) {
45 total_profile_count++; 54 total_ref_count++;
46 return 0; 55 return 0;
47 } 56 }
48 57
49fail_buf_nmi: 58fail_buf_nmi:
50 if (!total_profile_count) { 59 if (!total_ref_count) {
51 free_percpu(perf_trace_buf_nmi); 60 free_percpu(perf_trace_buf_nmi);
52 free_percpu(perf_trace_buf); 61 free_percpu(perf_trace_buf);
53 perf_trace_buf_nmi = NULL; 62 perf_trace_buf_nmi = NULL;
54 perf_trace_buf = NULL; 63 perf_trace_buf = NULL;
55 } 64 }
56fail_buf: 65fail_buf:
57 event->profile_count--; 66 event->perf_refcount--;
58 67
59 return ret; 68 return ret;
60} 69}
61 70
62int ftrace_profile_enable(int event_id) 71int perf_trace_enable(int event_id)
63{ 72{
64 struct ftrace_event_call *event; 73 struct ftrace_event_call *event;
65 int ret = -EINVAL; 74 int ret = -EINVAL;
66 75
67 mutex_lock(&event_mutex); 76 mutex_lock(&event_mutex);
68 list_for_each_entry(event, &ftrace_events, list) { 77 list_for_each_entry(event, &ftrace_events, list) {
69 if (event->id == event_id && event->profile_enable && 78 if (event->id == event_id && event->perf_event_enable &&
70 try_module_get(event->mod)) { 79 try_module_get(event->mod)) {
71 ret = ftrace_profile_enable_event(event); 80 ret = perf_trace_event_enable(event);
72 break; 81 break;
73 } 82 }
74 } 83 }
@@ -77,16 +86,16 @@ int ftrace_profile_enable(int event_id)
77 return ret; 86 return ret;
78} 87}
79 88
80static void ftrace_profile_disable_event(struct ftrace_event_call *event) 89static void perf_trace_event_disable(struct ftrace_event_call *event)
81{ 90{
82 char *buf, *nmi_buf; 91 char *buf, *nmi_buf;
83 92
84 if (--event->profile_count > 0) 93 if (--event->perf_refcount > 0)
85 return; 94 return;
86 95
87 event->profile_disable(event); 96 event->perf_event_disable(event);
88 97
89 if (!--total_profile_count) { 98 if (!--total_ref_count) {
90 buf = perf_trace_buf; 99 buf = perf_trace_buf;
91 rcu_assign_pointer(perf_trace_buf, NULL); 100 rcu_assign_pointer(perf_trace_buf, NULL);
92 101
@@ -104,14 +113,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
104 } 113 }
105} 114}
106 115
107void ftrace_profile_disable(int event_id) 116void perf_trace_disable(int event_id)
108{ 117{
109 struct ftrace_event_call *event; 118 struct ftrace_event_call *event;
110 119
111 mutex_lock(&event_mutex); 120 mutex_lock(&event_mutex);
112 list_for_each_entry(event, &ftrace_events, list) { 121 list_for_each_entry(event, &ftrace_events, list) {
113 if (event->id == event_id) { 122 if (event->id == event_id) {
114 ftrace_profile_disable_event(event); 123 perf_trace_event_disable(event);
115 module_put(event->mod); 124 module_put(event->mod);
116 break; 125 break;
117 } 126 }
@@ -119,13 +128,15 @@ void ftrace_profile_disable(int event_id)
119 mutex_unlock(&event_mutex); 128 mutex_unlock(&event_mutex);
120} 129}
121 130
122__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, 131__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
123 int *rctxp, unsigned long *irq_flags) 132 int *rctxp, unsigned long *irq_flags)
124{ 133{
125 struct trace_entry *entry; 134 struct trace_entry *entry;
126 char *trace_buf, *raw_data; 135 char *trace_buf, *raw_data;
127 int pc, cpu; 136 int pc, cpu;
128 137
138 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
139
129 pc = preempt_count(); 140 pc = preempt_count();
130 141
131 /* Protect the per cpu buffer, begin the rcu read side */ 142 /* Protect the per cpu buffer, begin the rcu read side */
@@ -138,9 +149,9 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
138 cpu = smp_processor_id(); 149 cpu = smp_processor_id();
139 150
140 if (in_nmi()) 151 if (in_nmi())
141 trace_buf = rcu_dereference(perf_trace_buf_nmi); 152 trace_buf = rcu_dereference_sched(perf_trace_buf_nmi);
142 else 153 else
143 trace_buf = rcu_dereference(perf_trace_buf); 154 trace_buf = rcu_dereference_sched(perf_trace_buf);
144 155
145 if (!trace_buf) 156 if (!trace_buf)
146 goto err; 157 goto err;
@@ -148,7 +159,7 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
148 raw_data = per_cpu_ptr(trace_buf, cpu); 159 raw_data = per_cpu_ptr(trace_buf, cpu);
149 160
150 /* zero the dead bytes from align to not leak stack to user */ 161 /* zero the dead bytes from align to not leak stack to user */
151 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 162 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
152 163
153 entry = (struct trace_entry *)raw_data; 164 entry = (struct trace_entry *)raw_data;
154 tracing_generic_entry_update(entry, *irq_flags, pc); 165 tracing_generic_entry_update(entry, *irq_flags, pc);
@@ -161,4 +172,4 @@ err_recursion:
161 local_irq_restore(*irq_flags); 172 local_irq_restore(*irq_flags);
162 return NULL; 173 return NULL;
163} 174}
164EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare); 175EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
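The hunk above renames the profile machinery to perf_trace_* but keeps its allocation scheme: a single total_ref_count guards two shared per-cpu buffers (one for normal context, one for NMI) that are allocated by the first event enabling perf tracing and freed by the last one disabling it. Below is a minimal userspace C sketch of that refcounting pattern only; it is not part of the patch, and calloc/free stand in for alloc_percpu()/free_percpu().

#include <stdio.h>
#include <stdlib.h>

static char *trace_buf;		/* stands in for perf_trace_buf */
static char *trace_buf_nmi;	/* stands in for perf_trace_buf_nmi */
static int total_ref_count;

/* first caller allocates both shared buffers, later callers only take a ref */
static int sketch_enable(void)
{
	if (!total_ref_count) {
		trace_buf = calloc(1, 8192);
		if (!trace_buf)
			return -1;
		trace_buf_nmi = calloc(1, 8192);
		if (!trace_buf_nmi) {
			free(trace_buf);
			trace_buf = NULL;
			return -1;
		}
	}
	total_ref_count++;
	return 0;
}

/* last caller to drop its reference releases both buffers */
static void sketch_disable(void)
{
	if (--total_ref_count)
		return;
	free(trace_buf_nmi);
	free(trace_buf);
	trace_buf = trace_buf_nmi = NULL;
}

int main(void)
{
	sketch_enable();
	sketch_enable();
	sketch_disable();
	sketch_disable();	/* buffers are freed here */
	printf("ref count is now %d\n", total_ref_count);
	return 0;
}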
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 189b09baf4fb..c697c7043349 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,6 +15,7 @@
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/slab.h>
18#include <linux/delay.h> 19#include <linux/delay.h>
19 20
20#include <asm/setup.h> 21#include <asm/setup.h>
@@ -60,10 +61,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
60 return 0; 61 return 0;
61 62
62err: 63err:
63 if (field) { 64 if (field)
64 kfree(field->name); 65 kfree(field->name);
65 kfree(field->type);
66 }
67 kfree(field); 66 kfree(field);
68 67
69 return -ENOMEM; 68 return -ENOMEM;
@@ -520,41 +519,16 @@ out:
520 return ret; 519 return ret;
521} 520}
522 521
523extern char *__bad_type_size(void);
524
525#undef FIELD
526#define FIELD(type, name) \
527 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
528 #type, "common_" #name, offsetof(typeof(field), name), \
529 sizeof(field.name), is_signed_type(type)
530
531static int trace_write_header(struct trace_seq *s)
532{
533 struct trace_entry field;
534
535 /* struct trace_entry */
536 return trace_seq_printf(s,
537 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
538 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
539 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
540 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
541 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
542 "\n",
543 FIELD(unsigned short, type),
544 FIELD(unsigned char, flags),
545 FIELD(unsigned char, preempt_count),
546 FIELD(int, pid),
547 FIELD(int, lock_depth));
548}
549
550static ssize_t 522static ssize_t
551event_format_read(struct file *filp, char __user *ubuf, size_t cnt, 523event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
552 loff_t *ppos) 524 loff_t *ppos)
553{ 525{
554 struct ftrace_event_call *call = filp->private_data; 526 struct ftrace_event_call *call = filp->private_data;
527 struct ftrace_event_field *field;
555 struct trace_seq *s; 528 struct trace_seq *s;
529 int common_field_count = 5;
556 char *buf; 530 char *buf;
557 int r; 531 int r = 0;
558 532
559 if (*ppos) 533 if (*ppos)
560 return 0; 534 return 0;
@@ -565,14 +539,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
565 539
566 trace_seq_init(s); 540 trace_seq_init(s);
567 541
568 /* If any of the first writes fail, so will the show_format. */
569
570 trace_seq_printf(s, "name: %s\n", call->name); 542 trace_seq_printf(s, "name: %s\n", call->name);
571 trace_seq_printf(s, "ID: %d\n", call->id); 543 trace_seq_printf(s, "ID: %d\n", call->id);
572 trace_seq_printf(s, "format:\n"); 544 trace_seq_printf(s, "format:\n");
573 trace_write_header(s);
574 545
575 r = call->show_format(call, s); 546 list_for_each_entry_reverse(field, &call->fields, link) {
547 /*
548 * Smartly shows the array type(except dynamic array).
549 * Normal:
550 * field:TYPE VAR
551 * If TYPE := TYPE[LEN], it is shown:
552 * field:TYPE VAR[LEN]
553 */
554 const char *array_descriptor = strchr(field->type, '[');
555
556 if (!strncmp(field->type, "__data_loc", 10))
557 array_descriptor = NULL;
558
559 if (!array_descriptor) {
560 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
561 "\tsize:%u;\tsigned:%d;\n",
562 field->type, field->name, field->offset,
563 field->size, !!field->is_signed);
564 } else {
565 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
566 "\tsize:%u;\tsigned:%d;\n",
567 (int)(array_descriptor - field->type),
568 field->type, field->name,
569 array_descriptor, field->offset,
570 field->size, !!field->is_signed);
571 }
572
573 if (--common_field_count == 0)
574 r = trace_seq_printf(s, "\n");
575
576 if (!r)
577 break;
578 }
579
580 if (r)
581 r = trace_seq_printf(s, "\nprint fmt: %s\n",
582 call->print_fmt);
583
576 if (!r) { 584 if (!r) {
577 /* 585 /*
578 * ug! The format output is bigger than a PAGE!! 586 * ug! The format output is bigger than a PAGE!!
@@ -931,7 +939,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
931 trace_create_file("enable", 0644, call->dir, call, 939 trace_create_file("enable", 0644, call->dir, call,
932 enable); 940 enable);
933 941
934 if (call->id && call->profile_enable) 942 if (call->id && call->perf_event_enable)
935 trace_create_file("id", 0444, call->dir, call, 943 trace_create_file("id", 0444, call->dir, call,
936 id); 944 id);
937 945
@@ -948,10 +956,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
948 filter); 956 filter);
949 } 957 }
950 958
951 /* A trace may not want to export its format */
952 if (!call->show_format)
953 return 0;
954
955 trace_create_file("format", 0444, call->dir, call, 959 trace_create_file("format", 0444, call->dir, call,
956 format); 960 format);
957 961
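With this change event_format_read() builds the format file by walking call->fields instead of calling a per-event show_format() callback, and it detects fixed-size array fields by looking for '[' in the stored type string (skipping __data_loc dynamic arrays) so that a "TYPE[LEN]" type is printed as "TYPE VAR[LEN]". The self-contained sketch below shows just that formatting decision; it is illustrative only, and the field names and offsets are invented.

#include <stdio.h>
#include <string.h>

static void print_field(const char *type, const char *name,
			unsigned int offset, unsigned int size, int is_signed)
{
	const char *array_descriptor = strchr(type, '[');

	/* dynamic arrays keep their __data_loc type string untouched */
	if (!strncmp(type, "__data_loc", 10))
		array_descriptor = NULL;

	if (!array_descriptor)
		printf("\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
		       type, name, offset, size, !!is_signed);
	else
		printf("\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
		       (int)(array_descriptor - type), type, name,
		       array_descriptor, offset, size, !!is_signed);
}

int main(void)
{
	print_field("int", "pid", 8, 4, 1);
	print_field("char[16]", "comm", 12, 16, 0); /* -> field:char comm[16] */
	return 0;
}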
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 4615f62a04f1..88c0b6dbd7fe 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -22,6 +22,7 @@
22#include <linux/ctype.h> 22#include <linux/ctype.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/perf_event.h> 24#include <linux/perf_event.h>
25#include <linux/slab.h>
25 26
26#include "trace.h" 27#include "trace.h"
27#include "trace_output.h" 28#include "trace_output.h"
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d4fa5dc1ee4e..e091f64ba6ce 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \
62 62
63#include "trace_entries.h" 63#include "trace_entries.h"
64 64
65
66#undef __field
67#define __field(type, item) \
68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
70 offsetof(typeof(field), item), \
71 sizeof(field.item), is_signed_type(type)); \
72 if (!ret) \
73 return 0;
74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item), \
81 is_signed_type(type)); \
82 if (!ret) \
83 return 0;
84
85#undef __array
86#define __array(type, item, len) \
87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
89 offsetof(typeof(field), item), \
90 sizeof(field.item), is_signed_type(type)); \
91 if (!ret) \
92 return 0;
93
94#undef __array_desc
95#define __array_desc(type, container, item, len) \
96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
98 offsetof(typeof(field), container.item), \
99 sizeof(field.container.item), \
100 is_signed_type(type)); \
101 if (!ret) \
102 return 0;
103
104#undef __dynamic_array
105#define __dynamic_array(type, item) \
106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
110 if (!ret) \
111 return 0;
112
113#undef F_printk
114#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
115
116#undef __entry
117#define __entry REC
118
119#undef FTRACE_ENTRY
120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
121static int \
122ftrace_format_##name(struct ftrace_event_call *unused, \
123 struct trace_seq *s) \
124{ \
125 struct struct_name field __attribute__((unused)); \
126 int ret = 0; \
127 \
128 tstruct; \
129 \
130 trace_seq_printf(s, "\nprint fmt: " print); \
131 \
132 return ret; \
133}
134
135#include "trace_entries.h"
136
137#undef __field 65#undef __field
138#define __field(type, item) \ 66#define __field(type, item) \
139 ret = trace_define_field(event_call, #type, #item, \ 67 ret = trace_define_field(event_call, #type, #item, \
@@ -175,7 +103,12 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
175 return ret; 103 return ret;
176 104
177#undef __dynamic_array 105#undef __dynamic_array
178#define __dynamic_array(type, item) 106#define __dynamic_array(type, item) \
107 ret = trace_define_field(event_call, #type, #item, \
108 offsetof(typeof(field), item), \
109 0, is_signed_type(type), FILTER_OTHER);\
110 if (ret) \
111 return ret;
179 112
180#undef FTRACE_ENTRY 113#undef FTRACE_ENTRY
181#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 114#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
@@ -198,6 +131,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
198 return 0; 131 return 0;
199} 132}
200 133
134#undef __entry
135#define __entry REC
136
201#undef __field 137#undef __field
202#define __field(type, item) 138#define __field(type, item)
203 139
@@ -213,6 +149,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
213#undef __dynamic_array 149#undef __dynamic_array
214#define __dynamic_array(type, item) 150#define __dynamic_array(type, item)
215 151
152#undef F_printk
153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154
216#undef FTRACE_ENTRY 155#undef FTRACE_ENTRY
217#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 156#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
218 \ 157 \
@@ -223,7 +162,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
223 .id = type, \ 162 .id = type, \
224 .system = __stringify(TRACE_SYSTEM), \ 163 .system = __stringify(TRACE_SYSTEM), \
225 .raw_init = ftrace_raw_init_event, \ 164 .raw_init = ftrace_raw_init_event, \
226 .show_format = ftrace_format_##call, \ 165 .print_fmt = print, \
227 .define_fields = ftrace_define_fields_##call, \ 166 .define_fields = ftrace_define_fields_##call, \
228}; \ 167}; \
229 168
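With the generated ftrace_format_##name() helpers removed, each F-event now carries a ready-made print_fmt string produced by the new F_printk(), which stringifies the format and its arguments at compile time. The tiny standalone example below illustrates that stringification trick using the same GNU-style variadic macro shape as the patch; the "func" field is invented for the example and the code is not part of the patch.

#include <stdio.h>

#define __stringify_1(x...)	#x
#define __stringify(x...)	__stringify_1(x)

/* same shape as the definitions introduced above */
#define __entry			REC
#define F_printk(fmt, args...)	#fmt ", " __stringify(args)

int main(void)
{
	/* REC->func is never evaluated, only turned into text */
	const char *print_fmt = F_printk("func=%lx", __entry->func);

	puts(print_fmt);	/* prints: "func=%lx", REC->func */
	return 0;
}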
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b1342c5d37cf..9aed1a5cf553 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -9,6 +9,7 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <linux/slab.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13 14
14#include "trace.h" 15#include "trace.h"
@@ -18,6 +19,7 @@ struct fgraph_cpu_data {
18 pid_t last_pid; 19 pid_t last_pid;
19 int depth; 20 int depth;
20 int ignore; 21 int ignore;
22 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
21}; 23};
22 24
23struct fgraph_data { 25struct fgraph_data {
@@ -187,7 +189,7 @@ static int __trace_graph_entry(struct trace_array *tr,
187 struct ring_buffer *buffer = tr->buffer; 189 struct ring_buffer *buffer = tr->buffer;
188 struct ftrace_graph_ent_entry *entry; 190 struct ftrace_graph_ent_entry *entry;
189 191
190 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 192 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
191 return 0; 193 return 0;
192 194
193 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, 195 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -212,13 +214,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
212 int cpu; 214 int cpu;
213 int pc; 215 int pc;
214 216
215 if (unlikely(!tr))
216 return 0;
217
218 if (!ftrace_trace_task(current)) 217 if (!ftrace_trace_task(current))
219 return 0; 218 return 0;
220 219
221 if (!ftrace_graph_addr(trace->func)) 220 /* trace it when it is-nested-in or is a function enabled. */
221 if (!(trace->depth || ftrace_graph_addr(trace->func)))
222 return 0; 222 return 0;
223 223
224 local_irq_save(flags); 224 local_irq_save(flags);
@@ -231,9 +231,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
231 } else { 231 } else {
232 ret = 0; 232 ret = 0;
233 } 233 }
234 /* Only do the atomic if it is not already set */
235 if (!test_tsk_trace_graph(current))
236 set_tsk_trace_graph(current);
237 234
238 atomic_dec(&data->disabled); 235 atomic_dec(&data->disabled);
239 local_irq_restore(flags); 236 local_irq_restore(flags);
@@ -241,6 +238,14 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
241 return ret; 238 return ret;
242} 239}
243 240
241int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
242{
243 if (tracing_thresh)
244 return 1;
245 else
246 return trace_graph_entry(trace);
247}
248
244static void __trace_graph_return(struct trace_array *tr, 249static void __trace_graph_return(struct trace_array *tr,
245 struct ftrace_graph_ret *trace, 250 struct ftrace_graph_ret *trace,
246 unsigned long flags, 251 unsigned long flags,
@@ -251,7 +256,7 @@ static void __trace_graph_return(struct trace_array *tr,
251 struct ring_buffer *buffer = tr->buffer; 256 struct ring_buffer *buffer = tr->buffer;
252 struct ftrace_graph_ret_entry *entry; 257 struct ftrace_graph_ret_entry *entry;
253 258
254 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 259 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
255 return; 260 return;
256 261
257 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, 262 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -281,19 +286,39 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
281 pc = preempt_count(); 286 pc = preempt_count();
282 __trace_graph_return(tr, trace, flags, pc); 287 __trace_graph_return(tr, trace, flags, pc);
283 } 288 }
284 if (!trace->depth)
285 clear_tsk_trace_graph(current);
286 atomic_dec(&data->disabled); 289 atomic_dec(&data->disabled);
287 local_irq_restore(flags); 290 local_irq_restore(flags);
288} 291}
289 292
293void set_graph_array(struct trace_array *tr)
294{
295 graph_array = tr;
296
297 /* Make graph_array visible before we start tracing */
298
299 smp_mb();
300}
301
302void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
303{
304 if (tracing_thresh &&
305 (trace->rettime - trace->calltime < tracing_thresh))
306 return;
307 else
308 trace_graph_return(trace);
309}
310
290static int graph_trace_init(struct trace_array *tr) 311static int graph_trace_init(struct trace_array *tr)
291{ 312{
292 int ret; 313 int ret;
293 314
294 graph_array = tr; 315 set_graph_array(tr);
295 ret = register_ftrace_graph(&trace_graph_return, 316 if (tracing_thresh)
296 &trace_graph_entry); 317 ret = register_ftrace_graph(&trace_graph_thresh_return,
318 &trace_graph_thresh_entry);
319 else
320 ret = register_ftrace_graph(&trace_graph_return,
321 &trace_graph_entry);
297 if (ret) 322 if (ret)
298 return ret; 323 return ret;
299 tracing_start_cmdline_record(); 324 tracing_start_cmdline_record();
@@ -301,11 +326,6 @@ static int graph_trace_init(struct trace_array *tr)
301 return 0; 326 return 0;
302} 327}
303 328
304void set_graph_array(struct trace_array *tr)
305{
306 graph_array = tr;
307}
308
309static void graph_trace_reset(struct trace_array *tr) 329static void graph_trace_reset(struct trace_array *tr)
310{ 330{
311 tracing_stop_cmdline_record(); 331 tracing_stop_cmdline_record();
@@ -673,15 +693,21 @@ print_graph_entry_leaf(struct trace_iterator *iter,
673 duration = graph_ret->rettime - graph_ret->calltime; 693 duration = graph_ret->rettime - graph_ret->calltime;
674 694
675 if (data) { 695 if (data) {
696 struct fgraph_cpu_data *cpu_data;
676 int cpu = iter->cpu; 697 int cpu = iter->cpu;
677 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 698
699 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
678 700
679 /* 701 /*
680 * Comments display at + 1 to depth. Since 702 * Comments display at + 1 to depth. Since
681 * this is a leaf function, keep the comments 703 * this is a leaf function, keep the comments
682 * equal to this depth. 704 * equal to this depth.
683 */ 705 */
684 *depth = call->depth - 1; 706 cpu_data->depth = call->depth - 1;
707
708 /* No need to keep this function around for this depth */
709 if (call->depth < FTRACE_RETFUNC_DEPTH)
710 cpu_data->enter_funcs[call->depth] = 0;
685 } 711 }
686 712
687 /* Overhead */ 713 /* Overhead */
@@ -721,10 +747,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
721 int i; 747 int i;
722 748
723 if (data) { 749 if (data) {
750 struct fgraph_cpu_data *cpu_data;
724 int cpu = iter->cpu; 751 int cpu = iter->cpu;
725 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
726 752
727 *depth = call->depth; 753 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
754 cpu_data->depth = call->depth;
755
756 /* Save this function pointer to see if the exit matches */
757 if (call->depth < FTRACE_RETFUNC_DEPTH)
758 cpu_data->enter_funcs[call->depth] = call->func;
728 } 759 }
729 760
730 /* No overhead */ 761 /* No overhead */
@@ -854,19 +885,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
854 struct fgraph_data *data = iter->private; 885 struct fgraph_data *data = iter->private;
855 pid_t pid = ent->pid; 886 pid_t pid = ent->pid;
856 int cpu = iter->cpu; 887 int cpu = iter->cpu;
888 int func_match = 1;
857 int ret; 889 int ret;
858 int i; 890 int i;
859 891
860 if (data) { 892 if (data) {
893 struct fgraph_cpu_data *cpu_data;
861 int cpu = iter->cpu; 894 int cpu = iter->cpu;
862 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 895
896 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
863 897
864 /* 898 /*
865 * Comments display at + 1 to depth. This is the 899 * Comments display at + 1 to depth. This is the
866 * return from a function, we now want the comments 900 * return from a function, we now want the comments
867 * to display at the same level of the bracket. 901 * to display at the same level of the bracket.
868 */ 902 */
869 *depth = trace->depth - 1; 903 cpu_data->depth = trace->depth - 1;
904
905 if (trace->depth < FTRACE_RETFUNC_DEPTH) {
906 if (cpu_data->enter_funcs[trace->depth] != trace->func)
907 func_match = 0;
908 cpu_data->enter_funcs[trace->depth] = 0;
909 }
870 } 910 }
871 911
872 if (print_graph_prologue(iter, s, 0, 0)) 912 if (print_graph_prologue(iter, s, 0, 0))
@@ -891,9 +931,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
891 return TRACE_TYPE_PARTIAL_LINE; 931 return TRACE_TYPE_PARTIAL_LINE;
892 } 932 }
893 933
894 ret = trace_seq_printf(s, "}\n"); 934 /*
895 if (!ret) 935 * If the return function does not have a matching entry,
896 return TRACE_TYPE_PARTIAL_LINE; 936 * then the entry was lost. Instead of just printing
937 * the '}' and letting the user guess what function this
938 * belongs to, write out the function name.
939 */
940 if (func_match) {
941 ret = trace_seq_printf(s, "}\n");
942 if (!ret)
943 return TRACE_TYPE_PARTIAL_LINE;
944 } else {
945 ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
946 if (!ret)
947 return TRACE_TYPE_PARTIAL_LINE;
948 }
897 949
898 /* Overrun */ 950 /* Overrun */
899 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 951 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
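The function-graph changes above record, per CPU and per depth, which function was entered, so that print_graph_return() can tell whether the closing brace it is about to emit matches the recorded entry; if the entry event was lost, the brace is annotated with the function (via %ps in the real code). The userspace sketch below shows only the depth-indexed matching and prints a raw address instead of resolving a symbol; it is not part of the patch.

#include <stdio.h>

#define RETFUNC_DEPTH 50

static unsigned long enter_funcs[RETFUNC_DEPTH];

/* entry hook: remember which function was entered at this depth */
static void on_entry(int depth, unsigned long func)
{
	if (depth < RETFUNC_DEPTH)
		enter_funcs[depth] = func;
}

/* return hook: print a bare brace only when the recorded entry matches */
static void on_return(int depth, unsigned long func)
{
	int func_match = 1;

	if (depth < RETFUNC_DEPTH) {
		if (enter_funcs[depth] != func)
			func_match = 0;	/* the entry event was lost */
		enter_funcs[depth] = 0;
	}

	if (func_match)
		printf("}\n");
	else
		printf("} /* %#lx */\n", func);
}

int main(void)
{
	on_entry(0, 0x1000);
	on_return(0, 0x1000);	/* matching entry: plain "}" */
	on_return(0, 0x2000);	/* lost entry: annotated brace */
	return 0;
}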
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 6178abf3637e..1251e367bae9 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -635,12 +635,12 @@ static int create_trace_probe(int argc, char **argv)
635 event = strchr(group, '/') + 1; 635 event = strchr(group, '/') + 1;
636 event[-1] = '\0'; 636 event[-1] = '\0';
637 if (strlen(group) == 0) { 637 if (strlen(group) == 0) {
638 pr_info("Group name is not specifiled\n"); 638 pr_info("Group name is not specified\n");
639 return -EINVAL; 639 return -EINVAL;
640 } 640 }
641 } 641 }
642 if (strlen(event) == 0) { 642 if (strlen(event) == 0) {
643 pr_info("Event name is not specifiled\n"); 643 pr_info("Event name is not specified\n");
644 return -EINVAL; 644 return -EINVAL;
645 } 645 }
646 } 646 }
@@ -673,7 +673,7 @@ static int create_trace_probe(int argc, char **argv)
673 return -EINVAL; 673 return -EINVAL;
674 } 674 }
675 /* an address specified */ 675 /* an address specified */
676 ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); 676 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
677 if (ret) { 677 if (ret) {
678 pr_info("Failed to parse address.\n"); 678 pr_info("Failed to parse address.\n");
679 return ret; 679 return ret;
@@ -1155,86 +1155,66 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1155 return 0; 1155 return 0;
1156} 1156}
1157 1157
1158static int __probe_event_show_format(struct trace_seq *s, 1158static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1159 struct trace_probe *tp, const char *fmt,
1160 const char *arg)
1161{ 1159{
1162 int i; 1160 int i;
1161 int pos = 0;
1163 1162
1164 /* Show format */ 1163 const char *fmt, *arg;
1165 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1166 return 0;
1167 1164
1168 for (i = 0; i < tp->nr_args; i++) 1165 if (!probe_is_return(tp)) {
1169 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) 1166 fmt = "(%lx)";
1170 return 0; 1167 arg = "REC->" FIELD_STRING_IP;
1168 } else {
1169 fmt = "(%lx <- %lx)";
1170 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
1171 }
1171 1172
1172 if (!trace_seq_printf(s, "\", %s", arg)) 1173 /* When len=0, we just calculate the needed length */
1173 return 0; 1174#define LEN_OR_ZERO (len ? len - pos : 0)
1174 1175
1175 for (i = 0; i < tp->nr_args; i++) 1176 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1176 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
1177 return 0;
1178 1177
1179 return trace_seq_puts(s, "\n"); 1178 for (i = 0; i < tp->nr_args; i++) {
1180} 1179 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx",
1180 tp->args[i].name);
1181 }
1181 1182
1182#undef SHOW_FIELD 1183 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1183#define SHOW_FIELD(type, item, name) \
1184 do { \
1185 ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \
1186 "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\
1187 (unsigned int)offsetof(typeof(field), item),\
1188 (unsigned int)sizeof(type), \
1189 is_signed_type(type)); \
1190 if (!ret) \
1191 return 0; \
1192 } while (0)
1193 1184
1194static int kprobe_event_show_format(struct ftrace_event_call *call, 1185 for (i = 0; i < tp->nr_args; i++) {
1195 struct trace_seq *s) 1186 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1196{ 1187 tp->args[i].name);
1197 struct kprobe_trace_entry field __attribute__((unused)); 1188 }
1198 int ret, i;
1199 struct trace_probe *tp = (struct trace_probe *)call->data;
1200
1201 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP);
1202 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1203 1189
1204 /* Show fields */ 1190#undef LEN_OR_ZERO
1205 for (i = 0; i < tp->nr_args; i++)
1206 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1207 trace_seq_puts(s, "\n");
1208 1191
1209 return __probe_event_show_format(s, tp, "(%lx)", 1192 /* return the length of print_fmt */
1210 "REC->" FIELD_STRING_IP); 1193 return pos;
1211} 1194}
1212 1195
1213static int kretprobe_event_show_format(struct ftrace_event_call *call, 1196static int set_print_fmt(struct trace_probe *tp)
1214 struct trace_seq *s)
1215{ 1197{
1216 struct kretprobe_trace_entry field __attribute__((unused)); 1198 int len;
1217 int ret, i; 1199 char *print_fmt;
1218 struct trace_probe *tp = (struct trace_probe *)call->data;
1219 1200
1220 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); 1201 /* First: called with 0 length to calculate the needed length */
1221 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); 1202 len = __set_print_fmt(tp, NULL, 0);
1222 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); 1203 print_fmt = kmalloc(len + 1, GFP_KERNEL);
1204 if (!print_fmt)
1205 return -ENOMEM;
1223 1206
1224 /* Show fields */ 1207 /* Second: actually write the @print_fmt */
1225 for (i = 0; i < tp->nr_args; i++) 1208 __set_print_fmt(tp, print_fmt, len + 1);
1226 SHOW_FIELD(unsigned long, args[i], tp->args[i].name); 1209 tp->call.print_fmt = print_fmt;
1227 trace_seq_puts(s, "\n");
1228 1210
1229 return __probe_event_show_format(s, tp, "(%lx <- %lx)", 1211 return 0;
1230 "REC->" FIELD_STRING_FUNC
1231 ", REC->" FIELD_STRING_RETIP);
1232} 1212}
1233 1213
1234#ifdef CONFIG_PERF_EVENTS 1214#ifdef CONFIG_PERF_EVENTS
1235 1215
1236/* Kprobe profile handler */ 1216/* Kprobe profile handler */
1237static __kprobes void kprobe_profile_func(struct kprobe *kp, 1217static __kprobes void kprobe_perf_func(struct kprobe *kp,
1238 struct pt_regs *regs) 1218 struct pt_regs *regs)
1239{ 1219{
1240 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1220 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
@@ -1247,11 +1227,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp,
1247 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1227 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1248 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1228 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1249 size -= sizeof(u32); 1229 size -= sizeof(u32);
1250 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1230 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1251 "profile buffer not large enough")) 1231 "profile buffer not large enough"))
1252 return; 1232 return;
1253 1233
1254 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); 1234 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
1255 if (!entry) 1235 if (!entry)
1256 return; 1236 return;
1257 1237
@@ -1260,11 +1240,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp,
1260 for (i = 0; i < tp->nr_args; i++) 1240 for (i = 0; i < tp->nr_args; i++)
1261 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1241 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1262 1242
1263 ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags); 1243 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
1264} 1244}
1265 1245
1266/* Kretprobe profile handler */ 1246/* Kretprobe profile handler */
1267static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, 1247static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1268 struct pt_regs *regs) 1248 struct pt_regs *regs)
1269{ 1249{
1270 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1250 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1277,11 +1257,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
1277 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1257 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1278 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1258 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1279 size -= sizeof(u32); 1259 size -= sizeof(u32);
1280 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1260 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1281 "profile buffer not large enough")) 1261 "profile buffer not large enough"))
1282 return; 1262 return;
1283 1263
1284 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); 1264 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
1285 if (!entry) 1265 if (!entry)
1286 return; 1266 return;
1287 1267
@@ -1291,10 +1271,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
1291 for (i = 0; i < tp->nr_args; i++) 1271 for (i = 0; i < tp->nr_args; i++)
1292 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1272 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1293 1273
1294 ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags); 1274 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
1275 irq_flags, regs);
1295} 1276}
1296 1277
1297static int probe_profile_enable(struct ftrace_event_call *call) 1278static int probe_perf_enable(struct ftrace_event_call *call)
1298{ 1279{
1299 struct trace_probe *tp = (struct trace_probe *)call->data; 1280 struct trace_probe *tp = (struct trace_probe *)call->data;
1300 1281
@@ -1306,7 +1287,7 @@ static int probe_profile_enable(struct ftrace_event_call *call)
1306 return enable_kprobe(&tp->rp.kp); 1287 return enable_kprobe(&tp->rp.kp);
1307} 1288}
1308 1289
1309static void probe_profile_disable(struct ftrace_event_call *call) 1290static void probe_perf_disable(struct ftrace_event_call *call)
1310{ 1291{
1311 struct trace_probe *tp = (struct trace_probe *)call->data; 1292 struct trace_probe *tp = (struct trace_probe *)call->data;
1312 1293
@@ -1331,7 +1312,7 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1331 kprobe_trace_func(kp, regs); 1312 kprobe_trace_func(kp, regs);
1332#ifdef CONFIG_PERF_EVENTS 1313#ifdef CONFIG_PERF_EVENTS
1333 if (tp->flags & TP_FLAG_PROFILE) 1314 if (tp->flags & TP_FLAG_PROFILE)
1334 kprobe_profile_func(kp, regs); 1315 kprobe_perf_func(kp, regs);
1335#endif 1316#endif
1336 return 0; /* We don't tweek kernel, so just return 0 */ 1317 return 0; /* We don't tweek kernel, so just return 0 */
1337} 1318}
@@ -1345,7 +1326,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1345 kretprobe_trace_func(ri, regs); 1326 kretprobe_trace_func(ri, regs);
1346#ifdef CONFIG_PERF_EVENTS 1327#ifdef CONFIG_PERF_EVENTS
1347 if (tp->flags & TP_FLAG_PROFILE) 1328 if (tp->flags & TP_FLAG_PROFILE)
1348 kretprobe_profile_func(ri, regs); 1329 kretprobe_perf_func(ri, regs);
1349#endif 1330#endif
1350 return 0; /* We don't tweek kernel, so just return 0 */ 1331 return 0; /* We don't tweek kernel, so just return 0 */
1351} 1332}
@@ -1359,30 +1340,33 @@ static int register_probe_event(struct trace_probe *tp)
1359 if (probe_is_return(tp)) { 1340 if (probe_is_return(tp)) {
1360 tp->event.trace = print_kretprobe_event; 1341 tp->event.trace = print_kretprobe_event;
1361 call->raw_init = probe_event_raw_init; 1342 call->raw_init = probe_event_raw_init;
1362 call->show_format = kretprobe_event_show_format;
1363 call->define_fields = kretprobe_event_define_fields; 1343 call->define_fields = kretprobe_event_define_fields;
1364 } else { 1344 } else {
1365 tp->event.trace = print_kprobe_event; 1345 tp->event.trace = print_kprobe_event;
1366 call->raw_init = probe_event_raw_init; 1346 call->raw_init = probe_event_raw_init;
1367 call->show_format = kprobe_event_show_format;
1368 call->define_fields = kprobe_event_define_fields; 1347 call->define_fields = kprobe_event_define_fields;
1369 } 1348 }
1349 if (set_print_fmt(tp) < 0)
1350 return -ENOMEM;
1370 call->event = &tp->event; 1351 call->event = &tp->event;
1371 call->id = register_ftrace_event(&tp->event); 1352 call->id = register_ftrace_event(&tp->event);
1372 if (!call->id) 1353 if (!call->id) {
1354 kfree(call->print_fmt);
1373 return -ENODEV; 1355 return -ENODEV;
1356 }
1374 call->enabled = 0; 1357 call->enabled = 0;
1375 call->regfunc = probe_event_enable; 1358 call->regfunc = probe_event_enable;
1376 call->unregfunc = probe_event_disable; 1359 call->unregfunc = probe_event_disable;
1377 1360
1378#ifdef CONFIG_PERF_EVENTS 1361#ifdef CONFIG_PERF_EVENTS
1379 call->profile_enable = probe_profile_enable; 1362 call->perf_event_enable = probe_perf_enable;
1380 call->profile_disable = probe_profile_disable; 1363 call->perf_event_disable = probe_perf_disable;
1381#endif 1364#endif
1382 call->data = tp; 1365 call->data = tp;
1383 ret = trace_add_event_call(call); 1366 ret = trace_add_event_call(call);
1384 if (ret) { 1367 if (ret) {
1385 pr_info("Failed to register kprobe event: %s\n", call->name); 1368 pr_info("Failed to register kprobe event: %s\n", call->name);
1369 kfree(call->print_fmt);
1386 unregister_ftrace_event(&tp->event); 1370 unregister_ftrace_event(&tp->event);
1387 } 1371 }
1388 return ret; 1372 return ret;
@@ -1392,6 +1376,7 @@ static void unregister_probe_event(struct trace_probe *tp)
1392{ 1376{
1393 /* tp->event is unregistered in trace_remove_event_call() */ 1377 /* tp->event is unregistered in trace_remove_event_call() */
1394 trace_remove_event_call(&tp->call); 1378 trace_remove_event_call(&tp->call);
1379 kfree(tp->call.print_fmt);
1395} 1380}
1396 1381
1397/* Make a debugfs interface for controling probe points */ 1382/* Make a debugfs interface for controling probe points */
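set_print_fmt()/__set_print_fmt() above replace the old show_format callbacks with a precomputed string, built with a two-pass snprintf idiom: the first call passes len == 0 and only measures, the second writes into a buffer of exactly the measured size. The standalone sketch below shows the idiom with invented argument names; it is illustrative only, not the kernel code.

#include <stdio.h>
#include <stdlib.h>

static int build_print_fmt(char *buf, int len, const char *names[], int nr)
{
	int i, pos = 0;

/* when len == 0 nothing is written; only the needed length is accumulated */
#define LEN_OR_ZERO (len ? len - pos : 0)

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"(%%lx)");
	for (i = 0; i < nr; i++)
		pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", names[i]);
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\", REC->ip");
	for (i = 0; i < nr; i++)
		pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", names[i]);

#undef LEN_OR_ZERO

	return pos;	/* length of the final string, NUL not included */
}

int main(void)
{
	const char *names[] = { "arg1", "arg2" };
	int len = build_print_fmt(NULL, 0, names, 2);	/* pass 1: measure */
	char *print_fmt = malloc(len + 1);

	if (!print_fmt)
		return 1;
	build_print_fmt(print_fmt, len + 1, names, 2);	/* pass 2: write */
	puts(print_fmt);
	free(print_fmt);
	return 0;
}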
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 94103cdcf9d8..d59cd6879477 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -23,6 +23,7 @@
23#include <linux/debugfs.h> 23#include <linux/debugfs.h>
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/slab.h>
26#include <linux/fs.h> 27#include <linux/fs.h>
27 28
28#include "trace_output.h" 29#include "trace_output.h"
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0acd834659ed..017fa376505d 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,7 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/mmiotrace.h> 10#include <linux/mmiotrace.h>
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/slab.h>
12#include <linux/time.h> 13#include <linux/time.h>
13 14
14#include <asm/atomic.h> 15#include <asm/atomic.h>
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 280fea470d67..81003b4d617f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -3,6 +3,7 @@
3#include <linux/stringify.h> 3#include <linux/stringify.h>
4#include <linux/kthread.h> 4#include <linux/kthread.h>
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/slab.h>
6 7
7static inline int trace_valid_entry(struct trace_entry *entry) 8static inline int trace_valid_entry(struct trace_entry *entry)
8{ 9{
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 678a5120ee30..f4bc9b27de5f 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
157 unsigned long val, flags; 157 unsigned long val, flags;
158 char buf[64]; 158 char buf[64];
159 int ret; 159 int ret;
160 int cpu;
160 161
161 if (count >= sizeof(buf)) 162 if (count >= sizeof(buf))
162 return -EINVAL; 163 return -EINVAL;
@@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
171 return ret; 172 return ret;
172 173
173 local_irq_save(flags); 174 local_irq_save(flags);
175
176 /*
177 * In case we trace inside arch_spin_lock() or after (NMI),
178 * we will cause circular lock, so we also need to increase
179 * the percpu trace_active here.
180 */
181 cpu = smp_processor_id();
182 per_cpu(trace_active, cpu)++;
183
174 arch_spin_lock(&max_stack_lock); 184 arch_spin_lock(&max_stack_lock);
175 *ptr = val; 185 *ptr = val;
176 arch_spin_unlock(&max_stack_lock); 186 arch_spin_unlock(&max_stack_lock);
187
188 per_cpu(trace_active, cpu)--;
177 local_irq_restore(flags); 189 local_irq_restore(flags);
178 190
179 return count; 191 return count;
@@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
206 218
207static void *t_start(struct seq_file *m, loff_t *pos) 219static void *t_start(struct seq_file *m, loff_t *pos)
208{ 220{
221 int cpu;
222
209 local_irq_disable(); 223 local_irq_disable();
224
225 cpu = smp_processor_id();
226 per_cpu(trace_active, cpu)++;
227
210 arch_spin_lock(&max_stack_lock); 228 arch_spin_lock(&max_stack_lock);
211 229
212 if (*pos == 0) 230 if (*pos == 0)
@@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
217 235
218static void t_stop(struct seq_file *m, void *p) 236static void t_stop(struct seq_file *m, void *p)
219{ 237{
238 int cpu;
239
220 arch_spin_unlock(&max_stack_lock); 240 arch_spin_unlock(&max_stack_lock);
241
242 cpu = smp_processor_id();
243 per_cpu(trace_active, cpu)--;
244
221 local_irq_enable(); 245 local_irq_enable();
222} 246}
223 247
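The stack tracer hunk raises the per-CPU trace_active counter before taking max_stack_lock so that a trace hit from inside the locked region (including NMI context) bails out instead of spinning on the lock it already holds. The deliberately simplified, single-threaded sketch below shows only the recursion-guard idea; the lock, the per-CPU storage, and NMI context are all elided, and it is not part of the patch.

#include <stdio.h>

static int trace_active;	/* stands in for per_cpu(trace_active, cpu) */

/* the tracer path: refuses to run while the guard is raised */
static void stack_trace_check(void)
{
	if (trace_active)
		return;

	trace_active++;
	printf("sampling stack depth\n");
	trace_active--;
}

/* the sysfs write path: raises the guard around its critical section */
static void write_max_size(long val)
{
	trace_active++;
	stack_trace_check();	/* in the real code this hit would deadlock */
	printf("max size set to %ld\n", val);
	trace_active--;
}

int main(void)
{
	stack_trace_check();	/* runs normally */
	write_max_size(64);	/* nested trace hit is ignored */
	return 0;
}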
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index a4bb239eb987..96cffb269e73 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -10,6 +10,7 @@
10 10
11 11
12#include <linux/list.h> 12#include <linux/list.h>
13#include <linux/slab.h>
13#include <linux/rbtree.h> 14#include <linux/rbtree.h>
14#include <linux/debugfs.h> 15#include <linux/debugfs.h>
15#include "trace_stat.h" 16#include "trace_stat.h"
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 4e332b9e449c..4d6d711717f2 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,6 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/slab.h>
3#include <linux/kernel.h> 4#include <linux/kernel.h>
4#include <linux/ftrace.h> 5#include <linux/ftrace.h>
5#include <linux/perf_event.h> 6#include <linux/perf_event.h>
@@ -143,70 +144,65 @@ extern char *__bad_type_size(void);
143 #type, #name, offsetof(typeof(trace), name), \ 144 #type, #name, offsetof(typeof(trace), name), \
144 sizeof(trace.name), is_signed_type(type) 145 sizeof(trace.name), is_signed_type(type)
145 146
146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 147static
148int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
147{ 149{
148 int i; 150 int i;
149 int ret; 151 int pos = 0;
150 struct syscall_metadata *entry = call->data;
151 struct syscall_trace_enter trace;
152 int offset = offsetof(struct syscall_trace_enter, args);
153 152
154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 153 /* When len=0, we just calculate the needed length */
155 "\tsigned:%u;\n", 154#define LEN_OR_ZERO (len ? len - pos : 0)
156 SYSCALL_FIELD(int, nr));
157 if (!ret)
158 return 0;
159 155
156 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
160 for (i = 0; i < entry->nb_args; i++) { 157 for (i = 0; i < entry->nb_args; i++) {
161 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], 158 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
162 entry->args[i]); 159 entry->args[i], sizeof(unsigned long),
163 if (!ret) 160 i == entry->nb_args - 1 ? "" : ", ");
164 return 0;
165 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
166 "\tsigned:%u;\n", offset,
167 sizeof(unsigned long),
168 is_signed_type(unsigned long));
169 if (!ret)
170 return 0;
171 offset += sizeof(unsigned long);
172 } 161 }
162 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
173 163
174 trace_seq_puts(s, "\nprint fmt: \"");
175 for (i = 0; i < entry->nb_args; i++) { 164 for (i = 0; i < entry->nb_args; i++) {
176 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], 165 pos += snprintf(buf + pos, LEN_OR_ZERO,
177 sizeof(unsigned long), 166 ", ((unsigned long)(REC->%s))", entry->args[i]);
178 i == entry->nb_args - 1 ? "" : ", ");
179 if (!ret)
180 return 0;
181 } 167 }
182 trace_seq_putc(s, '"');
183 168
184 for (i = 0; i < entry->nb_args; i++) { 169#undef LEN_OR_ZERO
185 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
186 entry->args[i]);
187 if (!ret)
188 return 0;
189 }
190 170
191 return trace_seq_putc(s, '\n'); 171 /* return the length of print_fmt */
172 return pos;
192} 173}
193 174
194int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) 175static int set_syscall_print_fmt(struct ftrace_event_call *call)
195{ 176{
196 int ret; 177 char *print_fmt;
197 struct syscall_trace_exit trace; 178 int len;
179 struct syscall_metadata *entry = call->data;
198 180
199 ret = trace_seq_printf(s, 181 if (entry->enter_event != call) {
200 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 182 call->print_fmt = "\"0x%lx\", REC->ret";
201 "\tsigned:%u;\n"
202 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
203 "\tsigned:%u;\n",
204 SYSCALL_FIELD(int, nr),
205 SYSCALL_FIELD(long, ret));
206 if (!ret)
207 return 0; 183 return 0;
184 }
208 185
209 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); 186 /* First: called with 0 length to calculate the needed length */
187 len = __set_enter_print_fmt(entry, NULL, 0);
188
189 print_fmt = kmalloc(len + 1, GFP_KERNEL);
190 if (!print_fmt)
191 return -ENOMEM;
192
193 /* Second: actually write the @print_fmt */
194 __set_enter_print_fmt(entry, print_fmt, len + 1);
195 call->print_fmt = print_fmt;
196
197 return 0;
198}
199
200static void free_syscall_print_fmt(struct ftrace_event_call *call)
201{
202 struct syscall_metadata *entry = call->data;
203
204 if (entry->enter_event == call)
205 kfree(call->print_fmt);
210} 206}
211 207
212int syscall_enter_define_fields(struct ftrace_event_call *call) 208int syscall_enter_define_fields(struct ftrace_event_call *call)
@@ -386,12 +382,22 @@ int init_syscall_trace(struct ftrace_event_call *call)
386{ 382{
387 int id; 383 int id;
388 384
389 id = register_ftrace_event(call->event); 385 if (set_syscall_print_fmt(call) < 0)
390 if (!id) 386 return -ENOMEM;
391 return -ENODEV; 387
392 call->id = id; 388 id = trace_event_raw_init(call);
393 INIT_LIST_HEAD(&call->fields); 389
394 return 0; 390 if (id < 0) {
391 free_syscall_print_fmt(call);
392 return id;
393 }
394
395 return id;
396}
397
398unsigned long __init arch_syscall_addr(int nr)
399{
400 return (unsigned long)sys_call_table[nr];
395} 401}
396 402
397int __init init_ftrace_syscalls(void) 403int __init init_ftrace_syscalls(void)
@@ -423,12 +429,12 @@ core_initcall(init_ftrace_syscalls);
423 429
424#ifdef CONFIG_PERF_EVENTS 430#ifdef CONFIG_PERF_EVENTS
425 431
426static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); 432static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
427static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); 433static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
428static int sys_prof_refcount_enter; 434static int sys_perf_refcount_enter;
429static int sys_prof_refcount_exit; 435static int sys_perf_refcount_exit;
430 436
431static void prof_syscall_enter(struct pt_regs *regs, long id) 437static void perf_syscall_enter(struct pt_regs *regs, long id)
432{ 438{
433 struct syscall_metadata *sys_data; 439 struct syscall_metadata *sys_data;
434 struct syscall_trace_enter *rec; 440 struct syscall_trace_enter *rec;
@@ -438,7 +444,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
438 int size; 444 int size;
439 445
440 syscall_nr = syscall_get_nr(current, regs); 446 syscall_nr = syscall_get_nr(current, regs);
441 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 447 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
442 return; 448 return;
443 449
444 sys_data = syscall_nr_to_meta(syscall_nr); 450 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -450,11 +456,11 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
450 size = ALIGN(size + sizeof(u32), sizeof(u64)); 456 size = ALIGN(size + sizeof(u32), sizeof(u64));
451 size -= sizeof(u32); 457 size -= sizeof(u32);
452 458
453 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 459 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
454 "profile buffer not large enough")) 460 "perf buffer not large enough"))
455 return; 461 return;
456 462
457 rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size, 463 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
458 sys_data->enter_event->id, &rctx, &flags); 464 sys_data->enter_event->id, &rctx, &flags);
459 if (!rec) 465 if (!rec)
460 return; 466 return;
@@ -462,10 +468,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
462 rec->nr = syscall_nr; 468 rec->nr = syscall_nr;
463 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 469 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
464 (unsigned long *)&rec->args); 470 (unsigned long *)&rec->args);
465 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); 471 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
466} 472}
467 473
468int prof_sysenter_enable(struct ftrace_event_call *call) 474int perf_sysenter_enable(struct ftrace_event_call *call)
469{ 475{
470 int ret = 0; 476 int ret = 0;
471 int num; 477 int num;
@@ -473,34 +479,34 @@ int prof_sysenter_enable(struct ftrace_event_call *call)
473 num = ((struct syscall_metadata *)call->data)->syscall_nr; 479 num = ((struct syscall_metadata *)call->data)->syscall_nr;
474 480
475 mutex_lock(&syscall_trace_lock); 481 mutex_lock(&syscall_trace_lock);
476 if (!sys_prof_refcount_enter) 482 if (!sys_perf_refcount_enter)
477 ret = register_trace_sys_enter(prof_syscall_enter); 483 ret = register_trace_sys_enter(perf_syscall_enter);
478 if (ret) { 484 if (ret) {
479 pr_info("event trace: Could not activate" 485 pr_info("event trace: Could not activate"
480 "syscall entry trace point"); 486 "syscall entry trace point");
481 } else { 487 } else {
482 set_bit(num, enabled_prof_enter_syscalls); 488 set_bit(num, enabled_perf_enter_syscalls);
483 sys_prof_refcount_enter++; 489 sys_perf_refcount_enter++;
484 } 490 }
485 mutex_unlock(&syscall_trace_lock); 491 mutex_unlock(&syscall_trace_lock);
486 return ret; 492 return ret;
487} 493}
488 494
489void prof_sysenter_disable(struct ftrace_event_call *call) 495void perf_sysenter_disable(struct ftrace_event_call *call)
490{ 496{
491 int num; 497 int num;
492 498
493 num = ((struct syscall_metadata *)call->data)->syscall_nr; 499 num = ((struct syscall_metadata *)call->data)->syscall_nr;
494 500
495 mutex_lock(&syscall_trace_lock); 501 mutex_lock(&syscall_trace_lock);
496 sys_prof_refcount_enter--; 502 sys_perf_refcount_enter--;
497 clear_bit(num, enabled_prof_enter_syscalls); 503 clear_bit(num, enabled_perf_enter_syscalls);
498 if (!sys_prof_refcount_enter) 504 if (!sys_perf_refcount_enter)
499 unregister_trace_sys_enter(prof_syscall_enter); 505 unregister_trace_sys_enter(perf_syscall_enter);
500 mutex_unlock(&syscall_trace_lock); 506 mutex_unlock(&syscall_trace_lock);
501} 507}
502 508
503static void prof_syscall_exit(struct pt_regs *regs, long ret) 509static void perf_syscall_exit(struct pt_regs *regs, long ret)
504{ 510{
505 struct syscall_metadata *sys_data; 511 struct syscall_metadata *sys_data;
506 struct syscall_trace_exit *rec; 512 struct syscall_trace_exit *rec;
@@ -510,7 +516,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
510 int size; 516 int size;
511 517
512 syscall_nr = syscall_get_nr(current, regs); 518 syscall_nr = syscall_get_nr(current, regs);
513 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 519 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
514 return; 520 return;
515 521
516 sys_data = syscall_nr_to_meta(syscall_nr); 522 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -525,11 +531,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
525 * Impossible, but be paranoid with the future 531 * Impossible, but be paranoid with the future
526 * How to put this check outside runtime? 532 * How to put this check outside runtime?
527 */ 533 */
528 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 534 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
529 "exit event has grown above profile buffer size")) 535 "exit event has grown above perf buffer size"))
530 return; 536 return;
531 537
532 rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size, 538 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
533 sys_data->exit_event->id, &rctx, &flags); 539 sys_data->exit_event->id, &rctx, &flags);
534 if (!rec) 540 if (!rec)
535 return; 541 return;
@@ -537,10 +543,10 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
537 rec->nr = syscall_nr; 543 rec->nr = syscall_nr;
538 rec->ret = syscall_get_return_value(current, regs); 544 rec->ret = syscall_get_return_value(current, regs);
539 545
540 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); 546 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
541} 547}
542 548
543int prof_sysexit_enable(struct ftrace_event_call *call) 549int perf_sysexit_enable(struct ftrace_event_call *call)
544{ 550{
545 int ret = 0; 551 int ret = 0;
546 int num; 552 int num;
@@ -548,30 +554,30 @@ int prof_sysexit_enable(struct ftrace_event_call *call)
548 num = ((struct syscall_metadata *)call->data)->syscall_nr; 554 num = ((struct syscall_metadata *)call->data)->syscall_nr;
549 555
550 mutex_lock(&syscall_trace_lock); 556 mutex_lock(&syscall_trace_lock);
551 if (!sys_prof_refcount_exit) 557 if (!sys_perf_refcount_exit)
552 ret = register_trace_sys_exit(prof_syscall_exit); 558 ret = register_trace_sys_exit(perf_syscall_exit);
553 if (ret) { 559 if (ret) {
554 pr_info("event trace: Could not activate" 560 pr_info("event trace: Could not activate"
555 "syscall entry trace point"); 561 "syscall exit trace point");
556 } else { 562 } else {
557 set_bit(num, enabled_prof_exit_syscalls); 563 set_bit(num, enabled_perf_exit_syscalls);
558 sys_prof_refcount_exit++; 564 sys_perf_refcount_exit++;
559 } 565 }
560 mutex_unlock(&syscall_trace_lock); 566 mutex_unlock(&syscall_trace_lock);
561 return ret; 567 return ret;
562} 568}
563 569
564void prof_sysexit_disable(struct ftrace_event_call *call) 570void perf_sysexit_disable(struct ftrace_event_call *call)
565{ 571{
566 int num; 572 int num;
567 573
568 num = ((struct syscall_metadata *)call->data)->syscall_nr; 574 num = ((struct syscall_metadata *)call->data)->syscall_nr;
569 575
570 mutex_lock(&syscall_trace_lock); 576 mutex_lock(&syscall_trace_lock);
571 sys_prof_refcount_exit--; 577 sys_perf_refcount_exit--;
572 clear_bit(num, enabled_prof_exit_syscalls); 578 clear_bit(num, enabled_perf_exit_syscalls);
573 if (!sys_prof_refcount_exit) 579 if (!sys_perf_refcount_exit)
574 unregister_trace_sys_exit(prof_syscall_exit); 580 unregister_trace_sys_exit(perf_syscall_exit);
575 mutex_unlock(&syscall_trace_lock); 581 mutex_unlock(&syscall_trace_lock);
576} 582}
577 583
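The syscall perf hooks keep two pieces of state: a global refcount that decides when the shared sys_enter/sys_exit tracepoint is registered or unregistered, and a per-syscall bitmap that decides which syscalls a registered hook actually records. The simplified userspace sketch below mirrors that split with plain arrays and no locking; it is illustrative only, not part of the patch.

#include <stdio.h>

#define NR_SYSCALLS 64

static unsigned char enabled_perf_enter[NR_SYSCALLS];
static int sys_perf_refcount_enter;
static int tracepoint_registered;

static void perf_sysenter_enable_sketch(int nr)
{
	if (!sys_perf_refcount_enter)
		tracepoint_registered = 1;	/* register_trace_sys_enter() */
	enabled_perf_enter[nr] = 1;
	sys_perf_refcount_enter++;
}

static void perf_sysenter_disable_sketch(int nr)
{
	sys_perf_refcount_enter--;
	enabled_perf_enter[nr] = 0;
	if (!sys_perf_refcount_enter)
		tracepoint_registered = 0;	/* unregister_trace_sys_enter() */
}

/* the hook itself: only record syscalls whose bit is set */
static void perf_syscall_enter_sketch(int nr)
{
	if (!tracepoint_registered || !enabled_perf_enter[nr])
		return;
	printf("recording entry of syscall %d\n", nr);
}

int main(void)
{
	perf_sysenter_enable_sketch(3);
	perf_syscall_enter_sketch(3);	/* recorded */
	perf_syscall_enter_sketch(4);	/* filtered by the per-syscall bit */
	perf_sysenter_disable_sketch(3);
	perf_syscall_enter_sketch(3);	/* tracepoint gone, nothing recorded */
	return 0;
}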
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 40cafb07dffd..cc2d2faa7d9e 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
9#include <trace/events/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/slab.h>
12#include <linux/kref.h> 13#include <linux/kref.h>
13#include "trace_stat.h" 14#include "trace_stat.h"
14#include "trace.h" 15#include "trace.h"
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 00d59d048edf..0a67e041edf8 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -21,6 +21,7 @@
21#include <linux/tsacct_kern.h> 21#include <linux/tsacct_kern.h>
22#include <linux/acct.h> 22#include <linux/acct.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/mm.h>
24 25
25/* 26/*
26 * fill in basic accounting fields 27 * fill in basic accounting fields
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165ca70c..766467b3bcb7 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -56,9 +56,6 @@ struct user_struct root_user = {
56 .sigpending = ATOMIC_INIT(0), 56 .sigpending = ATOMIC_INIT(0),
57 .locked_shm = 0, 57 .locked_shm = 0,
58 .user_ns = &init_user_ns, 58 .user_ns = &init_user_ns,
59#ifdef CONFIG_USER_SCHED
60 .tg = &init_task_group,
61#endif
62}; 59};
63 60
64/* 61/*
@@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 72 put_user_ns(up->user_ns);
76} 73}
77 74
78#ifdef CONFIG_USER_SCHED
79
80static void sched_destroy_user(struct user_struct *up)
81{
82 sched_destroy_group(up->tg);
83}
84
85static int sched_create_user(struct user_struct *up)
86{
87 int rc = 0;
88
89 up->tg = sched_create_group(&root_task_group);
90 if (IS_ERR(up->tg))
91 rc = -ENOMEM;
92
93 set_tg_uid(up);
94
95 return rc;
96}
97
98#else /* CONFIG_USER_SCHED */
99
100static void sched_destroy_user(struct user_struct *up) { }
101static int sched_create_user(struct user_struct *up) { return 0; }
102
103#endif /* CONFIG_USER_SCHED */
104
105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
125static DEFINE_MUTEX(uids_mutex);
126
127static inline void uids_mutex_lock(void)
128{
129 mutex_lock(&uids_mutex);
130}
131
132static inline void uids_mutex_unlock(void)
133{
134 mutex_unlock(&uids_mutex);
135}
136
137/* uid directory attributes */
138#ifdef CONFIG_FAIR_GROUP_SCHED
139static ssize_t cpu_shares_show(struct kobject *kobj,
140 struct kobj_attribute *attr,
141 char *buf)
142{
143 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144
145 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
146}
147
148static ssize_t cpu_shares_store(struct kobject *kobj,
149 struct kobj_attribute *attr,
150 const char *buf, size_t size)
151{
152 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
153 unsigned long shares;
154 int rc;
155
156 sscanf(buf, "%lu", &shares);
157
158 rc = sched_group_set_shares(up->tg, shares);
159
160 return (rc ? rc : size);
161}
162
163static struct kobj_attribute cpu_share_attr =
164 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
165#endif
166
167#ifdef CONFIG_RT_GROUP_SCHED
168static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169 struct kobj_attribute *attr,
170 char *buf)
171{
172 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
173
174 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
175}
176
177static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
178 struct kobj_attribute *attr,
179 const char *buf, size_t size)
180{
181 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
182 unsigned long rt_runtime;
183 int rc;
184
185 sscanf(buf, "%ld", &rt_runtime);
186
187 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
188
189 return (rc ? rc : size);
190}
191
192static struct kobj_attribute cpu_rt_runtime_attr =
193 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
194
195static ssize_t cpu_rt_period_show(struct kobject *kobj,
196 struct kobj_attribute *attr,
197 char *buf)
198{
199 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
200
201 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
202}
203
204static ssize_t cpu_rt_period_store(struct kobject *kobj,
205 struct kobj_attribute *attr,
206 const char *buf, size_t size)
207{
208 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
209 unsigned long rt_period;
210 int rc;
211
212 sscanf(buf, "%lu", &rt_period);
213
214 rc = sched_group_set_rt_period(up->tg, rt_period);
215
216 return (rc ? rc : size);
217}
218
219static struct kobj_attribute cpu_rt_period_attr =
220 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
221#endif
222
223/* default attributes per uid directory */
224static struct attribute *uids_attributes[] = {
225#ifdef CONFIG_FAIR_GROUP_SCHED
226 &cpu_share_attr.attr,
227#endif
228#ifdef CONFIG_RT_GROUP_SCHED
229 &cpu_rt_runtime_attr.attr,
230 &cpu_rt_period_attr.attr,
231#endif
232 NULL
233};
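
With these default attributes attached to the uids_ktype defined just below, every registered uid kobject appears as /sys/kernel/uids/<uid>/ containing cpu_share (plus cpu_rt_runtime and cpu_rt_period when RT group scheduling is enabled). A minimal userspace sketch that reads and then updates cpu_share for uid 0, assuming a kernel built with CONFIG_USER_SCHED/CONFIG_FAIR_GROUP_SCHED and root privileges (the value 2048 is only an example, twice the default group share of 1024):

/* Sketch: read and update /sys/kernel/uids/0/cpu_share from userspace. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/uids/0/cpu_share";
	char buf[64];
	ssize_t n;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("current cpu_share: %s", buf);
	}
	close(fd);

	/* Double the default group share of 1024 for uid 0's task group. */
	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "2048\n", 5) != 5)
		perror("write");
	close(fd);
	return 0;
}
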
234
235/* the lifetime of user_struct is not managed by the core (now) */
236static void uids_release(struct kobject *kobj)
237{
238 return;
239}
240
241static struct kobj_type uids_ktype = {
242 .sysfs_ops = &kobj_sysfs_ops,
243 .default_attrs = uids_attributes,
244 .release = uids_release,
245};
246
247/*
248 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
249 * We do not create this file for users in a user namespace (until
250 * sysfs tagging is implemented).
251 *
252 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
253 */
254static int uids_user_create(struct user_struct *up)
255{
256 struct kobject *kobj = &up->kobj;
257 int error;
258
259 memset(kobj, 0, sizeof(struct kobject));
260 if (up->user_ns != &init_user_ns)
261 return 0;
262 kobj->kset = uids_kset;
263 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
264 if (error) {
265 kobject_put(kobj);
266 goto done;
267 }
268
269 kobject_uevent(kobj, KOBJ_ADD);
270done:
271 return error;
272}
273
274/* create these entries in sysfs:
275 * "/sys/kernel/uids" directory
276 * "/sys/kernel/uids/0" directory (for root user)
277 * "/sys/kernel/uids/0/cpu_share" file (for root user)
278 */
279int __init uids_sysfs_init(void)
280{
281 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
282 if (!uids_kset)
283 return -ENOMEM;
284
285 return uids_user_create(&root_user);
286}
287
288/* delayed work function to remove sysfs directory for a user and free up
289 * corresponding structures.
290 */
291static void cleanup_user_struct(struct work_struct *w)
292{
293 struct user_struct *up = container_of(w, struct user_struct, work.work);
294 unsigned long flags;
295 int remove_user = 0;
296
297 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
298 * atomic.
299 */
300 uids_mutex_lock();
301
302 spin_lock_irqsave(&uidhash_lock, flags);
303 if (atomic_read(&up->__count) == 0) {
304 uid_hash_remove(up);
305 remove_user = 1;
306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
308
309 if (!remove_user)
310 goto done;
311
312 if (up->user_ns == &init_user_ns) {
313 kobject_uevent(&up->kobj, KOBJ_REMOVE);
314 kobject_del(&up->kobj);
315 kobject_put(&up->kobj);
316 }
317
318 sched_destroy_user(up);
319 key_put(up->uid_keyring);
320 key_put(up->session_keyring);
321 kmem_cache_free(uid_cachep, up);
322
323done:
324 uids_mutex_unlock();
325}
326
327/* IRQs are disabled and uidhash_lock is held upon function entry.
328 * IRQ state (as stored in flags) is restored and uidhash_lock released
329 * upon function exit.
330 */
331static void free_user(struct user_struct *up, unsigned long flags)
332{
333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336}
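
Taken together, free_user(), cleanup_user_struct() and the resurrection check in uid_hash_find() form a deferred-free protocol: the final put schedules delayed cleanup instead of freeing immediately, and the cleanup re-validates the refcount under uidhash_lock so that a concurrent lookup can safely bring the object back. A compressed userspace model of that protocol, assuming a pthread mutex in place of uidhash_lock and a sleeping thread in place of the workqueue (all names illustrative, not kernel API):

/* Sketch: deferred free with resurrection, modelled in userspace. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct obj {
	atomic_int count;
};

static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *the_obj;		/* stands in for the uid hash slot */

/* "Delayed work": remove and free only if nobody resurrected the object. */
static void *cleanup_worker(void *arg)
{
	struct obj *o = arg;
	int remove = 0;

	usleep(100 * 1000);		/* the "delayed" part */

	pthread_mutex_lock(&hash_lock);
	if (atomic_load(&o->count) == 0) {
		the_obj = NULL;		/* plays the role of uid_hash_remove() */
		remove = 1;
	}
	pthread_mutex_unlock(&hash_lock);

	if (remove)
		free(o);
	return NULL;
}

/* Last put: schedule deferred cleanup instead of freeing right away. */
static void obj_put(struct obj *o)
{
	pthread_t t;

	if (atomic_fetch_sub(&o->count, 1) != 1)
		return;			/* not the last reference */
	pthread_create(&t, NULL, cleanup_worker, o);
	pthread_detach(t);
}

/* Lookup: take a reference, resurrecting an "almost deleted" object. */
static struct obj *obj_find(void)
{
	struct obj *o;

	pthread_mutex_lock(&hash_lock);
	o = the_obj;
	if (o)
		atomic_fetch_add(&o->count, 1);
	pthread_mutex_unlock(&hash_lock);
	return o;
}

int main(void)
{
	the_obj = calloc(1, sizeof(*the_obj));
	if (!the_obj)
		return 1;
	atomic_store(&the_obj->count, 1);

	obj_put(the_obj);		/* count drops to 0, cleanup queued */
	printf("resurrected: %s\n", obj_find() ? "yes" : "no");

	sleep(1);			/* let the worker run and bail out */
	return 0;
}

The worker re-checks the count under the same lock that guards the lookup, which is the property the kernel code relies on: even if the cancel in the lookup path races with an already-running work item, cleanup_user_struct() finds a nonzero count and leaves the object alone.
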
337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
339
 static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
 {
 	struct user_struct *user;
@@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
 	return NULL;
 }
 
-int uids_sysfs_init(void) { return 0; }
-static inline int uids_user_create(struct user_struct *up) { return 0; }
-static inline void uids_mutex_lock(void) { }
-static inline void uids_mutex_unlock(void) { }
-
 /* IRQs are disabled and uidhash_lock is held upon function entry.
  * IRQ state (as stored in flags) is restored and uidhash_lock released
  * upon function exit.
@@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags)
 {
 	uid_hash_remove(up);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
-	sched_destroy_user(up);
 	key_put(up->uid_keyring);
 	key_put(up->session_keyring);
 	kmem_cache_free(uid_cachep, up);
 }
 
-#endif
-
-#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
-/*
- * We need to check if a setuid can take place. This function should be called
- * before successfully completing the setuid.
- */
-int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
-{
-
-	return sched_rt_can_attach(up->tg, tsk);
-
-}
-#else
-int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
-{
-	return 1;
-}
-#endif
-
 /*
  * Locate the user_struct for the passed UID. If found, take a ref on it. The
  * caller must undo that ref with free_uid().
@@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 	/* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
 	 * atomic.
 	 */
-	uids_mutex_lock();
-
 	spin_lock_irq(&uidhash_lock);
 	up = uid_hash_find(uid, hashent);
 	spin_unlock_irq(&uidhash_lock);
@@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 	new->uid = uid;
 	atomic_set(&new->__count, 1);
 
-	if (sched_create_user(new) < 0)
-		goto out_free_user;
-
 	new->user_ns = get_user_ns(ns);
 
-	if (uids_user_create(new))
-		goto out_destoy_sched;
-
 	/*
 	 * Before adding this, check whether we raced
 	 * on adding the same user already..
@@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 		spin_unlock_irq(&uidhash_lock);
 	}
 
-	uids_mutex_unlock();
-
 	return up;
 
-out_destoy_sched:
-	sched_destroy_user(new);
 	put_user_ns(new->user_ns);
-out_free_user:
 	kmem_cache_free(uid_cachep, new);
 out_unlock:
-	uids_mutex_unlock();
 	return NULL;
 }
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dee48658805c..5bfb213984b2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -774,7 +774,7 @@ void flush_delayed_work(struct delayed_work *dwork)
 {
 	if (del_timer_sync(&dwork->timer)) {
 		struct cpu_workqueue_struct *cwq;
-		cwq = wq_per_cpu(keventd_wq, get_cpu());
+		cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu());
 		__queue_work(cwq, &dwork->work);
 		put_cpu();
 	}
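
The kernel/workqueue.c hunk changes flush_delayed_work() so that, when it cancels the pending timer, it requeues the work on the workqueue the delayed work was actually queued on (looked up via get_wq_data()) instead of unconditionally using the global keventd_wq; with the old code, work queued on a private workqueue would have been run on keventd_wq instead. The difference only matters for callers with their own workqueue, roughly as in this sketch of a hypothetical module (boilerplate only, not code from this tree):

/* Sketch: a hypothetical user of its own workqueue, showing why
 * flush_delayed_work() must target dwork's queue rather than keventd_wq.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;
static struct delayed_work my_dwork;

static void my_work_fn(struct work_struct *work)
{
	pr_info("my_work_fn ran\n");
}

static int __init my_init(void)
{
	my_wq = create_singlethread_workqueue("my_wq");
	if (!my_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&my_dwork, my_work_fn);
	queue_delayed_work(my_wq, &my_dwork, msecs_to_jiffies(500));

	/*
	 * With the old code this pushed my_dwork onto keventd_wq;
	 * with the fix it runs on my_wq, where it was queued.
	 */
	flush_delayed_work(&my_dwork);
	return 0;
}

static void __exit my_exit(void)
{
	cancel_delayed_work_sync(&my_dwork);
	destroy_workqueue(my_wq);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
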