aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
author Ingo Molnar <mingo@elte.hu> 2010-04-02 14:02:55 -0400
committer Ingo Molnar <mingo@elte.hu> 2010-04-02 14:03:08 -0400
commit c9494727cf293ae2ec66af57547a3e79c724fec2 (patch)
tree 44ae197b64fa7530ee695a90ad31326dda06f1e1 /kernel
parent 6427462bfa50f50dc6c088c07037264fcc73eca1 (diff)
parent 42be79e37e264557f12860fa4cc84b4de3685954 (diff)
Merge branch 'linus' into sched/core
Merge reason: update to latest upstream

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Makefile 6
-rw-r--r-- kernel/acct.c 10
-rw-r--r-- kernel/audit.c 2
-rw-r--r-- kernel/audit_tree.c 100
-rw-r--r-- kernel/auditsc.c 7
-rw-r--r-- kernel/capability.c 4
-rw-r--r-- kernel/cgroup.c 694
-rw-r--r-- kernel/cpu.c 2
-rw-r--r-- kernel/cpuset.c 106
-rw-r--r-- kernel/cred.c 6
-rw-r--r-- kernel/early_res.c 584
-rw-r--r-- kernel/elfcore.c 28
-rw-r--r-- kernel/exit.c 7
-rw-r--r-- kernel/fork.c 66
-rw-r--r-- kernel/hw_breakpoint.c 11
-rw-r--r-- kernel/irq/chip.c 89
-rw-r--r-- kernel/irq/devres.c 4
-rw-r--r-- kernel/irq/handle.c 58
-rw-r--r-- kernel/irq/internals.h 6
-rw-r--r-- kernel/irq/manage.c 22
-rw-r--r-- kernel/irq/numa_migrate.c 4
-rw-r--r-- kernel/kexec.c 2
-rw-r--r-- kernel/kprobes.c 648
-rw-r--r-- kernel/ksysfs.c 2
-rw-r--r-- kernel/kthread.c 2
-rw-r--r-- kernel/lockdep.c 10
-rw-r--r-- kernel/module.c 32
-rw-r--r-- kernel/nsproxy.c 13
-rw-r--r-- kernel/padata.c 8
-rw-r--r-- kernel/panic.c 46
-rw-r--r-- kernel/params.c 12
-rw-r--r-- kernel/perf_event.c 133
-rw-r--r-- kernel/pid.c 6
-rw-r--r-- kernel/pid_namespace.c 7
-rw-r--r-- kernel/posix-cpu-timers.c 46
-rw-r--r-- kernel/power/hibernate.c 9
-rw-r--r-- kernel/power/suspend.c 3
-rw-r--r-- kernel/printk.c 55
-rw-r--r-- kernel/range.c 163
-rw-r--r-- kernel/rcupdate.c 23
-rw-r--r-- kernel/rcutorture.c 8
-rw-r--r-- kernel/rcutree.h 21
-rw-r--r-- kernel/rcutree_plugin.h 8
-rw-r--r-- kernel/relay.c 5
-rw-r--r-- kernel/resource.c 53
-rw-r--r-- kernel/sched.c 24
-rw-r--r-- kernel/sched_cpupri.c 4
-rw-r--r-- kernel/sched_fair.c 2
-rw-r--r-- kernel/sched_rt.c 5
-rw-r--r-- kernel/signal.c 45
-rw-r--r-- kernel/slow-work.c 2
-rw-r--r-- kernel/slow-work.h 8
-rw-r--r-- kernel/softlockup.c 4
-rw-r--r-- kernel/stop_machine.c 2
-rw-r--r-- kernel/sys.c 70
-rw-r--r-- kernel/sys_ni.c 1
-rw-r--r-- kernel/sysctl.c 51
-rw-r--r-- kernel/sysctl_binary.c 7
-rw-r--r-- kernel/taskstats.c 6
-rw-r--r-- kernel/time/clocksource.c 4
-rw-r--r-- kernel/time/tick-oneshot.c 52
-rw-r--r-- kernel/time/timekeeping.c 3
-rw-r--r-- kernel/time/timer_list.c 3
-rw-r--r-- kernel/timer.c 1
-rw-r--r-- kernel/trace/Makefile 2
-rw-r--r-- kernel/trace/ftrace.c 30
-rw-r--r-- kernel/trace/ring_buffer.c 31
-rw-r--r-- kernel/trace/ring_buffer_benchmark.c 1
-rw-r--r-- kernel/trace/trace.c 55
-rw-r--r-- kernel/trace/trace.h 5
-rw-r--r-- kernel/trace/trace_clock.c 1
-rw-r--r-- kernel/trace/trace_event_perf.c (renamed from kernel/trace/trace_event_profile.c) 54
-rw-r--r-- kernel/trace/trace_events.c 2
-rw-r--r-- kernel/trace/trace_functions_graph.c 31
-rw-r--r-- kernel/trace/trace_kprobe.c 29
-rw-r--r-- kernel/trace/trace_syscalls.c 72
-rw-r--r-- kernel/tsacct.c 1
77 files changed, 2950 insertions, 789 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6aebdeb2aa34..a987aa1676b5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o 13 async.o range.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
14obj-y += groups.o 15obj-y += groups.o
15 16
16ifdef CONFIG_FUNCTION_TRACER 17ifdef CONFIG_FUNCTION_TRACER
@@ -90,6 +91,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
90obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 91obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
91obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 92obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
92obj-$(CONFIG_LATENCYTOP) += latencytop.o 93obj-$(CONFIG_LATENCYTOP) += latencytop.o
94obj-$(CONFIG_BINFMT_ELF) += elfcore.o
95obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
96obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
93obj-$(CONFIG_FUNCTION_TRACER) += trace/ 97obj-$(CONFIG_FUNCTION_TRACER) += trace/
94obj-$(CONFIG_TRACING) += trace/ 98obj-$(CONFIG_TRACING) += trace/
95obj-$(CONFIG_X86_DS) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
diff --git a/kernel/acct.c b/kernel/acct.c
index a6605ca921b6..24f8c81fc48d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -588,16 +588,6 @@ out:
588} 588}
589 589
590/** 590/**
591 * acct_init_pacct - initialize a new pacct_struct
592 * @pacct: per-process accounting info struct to initialize
593 */
594void acct_init_pacct(struct pacct_struct *pacct)
595{
596 memset(pacct, 0, sizeof(struct pacct_struct));
597 pacct->ac_utime = pacct->ac_stime = cputime_zero;
598}
599
600/**
601 * acct_collect - collect accounting information into pacct_struct 591 * acct_collect - collect accounting information into pacct_struct
602 * @exitcode: task exit code 592 * @exitcode: task exit code
603 * @group_dead: not 0, if this thread is the last one in the process. 593 * @group_dead: not 0, if this thread is the last one in the process.
diff --git a/kernel/audit.c b/kernel/audit.c
index 5feed232be9d..78f7f86aa238 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -398,7 +398,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
398 skb_get(skb); 398 skb_get(skb);
399 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); 399 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
400 if (err < 0) { 400 if (err < 0) {
401 BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ 401 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
403 audit_log_lost("auditd dissapeared\n"); 403 audit_log_lost("auditd dissapeared\n");
404 audit_pid = 0; 404 audit_pid = 0;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 4b05bd9479db..028e85663f27 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -548,6 +548,11 @@ int audit_remove_tree_rule(struct audit_krule *rule)
548 return 0; 548 return 0;
549} 549}
550 550
551static int compare_root(struct vfsmount *mnt, void *arg)
552{
553 return mnt->mnt_root->d_inode == arg;
554}
555
551void audit_trim_trees(void) 556void audit_trim_trees(void)
552{ 557{
553 struct list_head cursor; 558 struct list_head cursor;
@@ -559,7 +564,6 @@ void audit_trim_trees(void)
559 struct path path; 564 struct path path;
560 struct vfsmount *root_mnt; 565 struct vfsmount *root_mnt;
561 struct node *node; 566 struct node *node;
562 struct list_head list;
563 int err; 567 int err;
564 568
565 tree = container_of(cursor.next, struct audit_tree, list); 569 tree = container_of(cursor.next, struct audit_tree, list);
@@ -577,24 +581,16 @@ void audit_trim_trees(void)
577 if (!root_mnt) 581 if (!root_mnt)
578 goto skip_it; 582 goto skip_it;
579 583
580 list_add_tail(&list, &root_mnt->mnt_list);
581 spin_lock(&hash_lock); 584 spin_lock(&hash_lock);
582 list_for_each_entry(node, &tree->chunks, list) { 585 list_for_each_entry(node, &tree->chunks, list) {
583 struct audit_chunk *chunk = find_chunk(node); 586 struct inode *inode = find_chunk(node)->watch.inode;
584 struct inode *inode = chunk->watch.inode;
585 struct vfsmount *mnt;
586 node->index |= 1U<<31; 587 node->index |= 1U<<31;
587 list_for_each_entry(mnt, &list, mnt_list) { 588 if (iterate_mounts(compare_root, inode, root_mnt))
588 if (mnt->mnt_root->d_inode == inode) { 589 node->index &= ~(1U<<31);
589 node->index &= ~(1U<<31);
590 break;
591 }
592 }
593 } 590 }
594 spin_unlock(&hash_lock); 591 spin_unlock(&hash_lock);
595 trim_marked(tree); 592 trim_marked(tree);
596 put_tree(tree); 593 put_tree(tree);
597 list_del_init(&list);
598 drop_collected_mounts(root_mnt); 594 drop_collected_mounts(root_mnt);
599skip_it: 595skip_it:
600 mutex_lock(&audit_filter_mutex); 596 mutex_lock(&audit_filter_mutex);
@@ -603,22 +599,6 @@ skip_it:
603 mutex_unlock(&audit_filter_mutex); 599 mutex_unlock(&audit_filter_mutex);
604} 600}
605 601
606static int is_under(struct vfsmount *mnt, struct dentry *dentry,
607 struct path *path)
608{
609 if (mnt != path->mnt) {
610 for (;;) {
611 if (mnt->mnt_parent == mnt)
612 return 0;
613 if (mnt->mnt_parent == path->mnt)
614 break;
615 mnt = mnt->mnt_parent;
616 }
617 dentry = mnt->mnt_mountpoint;
618 }
619 return is_subdir(dentry, path->dentry);
620}
621
622int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) 602int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
623{ 603{
624 604
@@ -638,13 +618,17 @@ void audit_put_tree(struct audit_tree *tree)
638 put_tree(tree); 618 put_tree(tree);
639} 619}
640 620
621static int tag_mount(struct vfsmount *mnt, void *arg)
622{
623 return tag_chunk(mnt->mnt_root->d_inode, arg);
624}
625
641/* called with audit_filter_mutex */ 626/* called with audit_filter_mutex */
642int audit_add_tree_rule(struct audit_krule *rule) 627int audit_add_tree_rule(struct audit_krule *rule)
643{ 628{
644 struct audit_tree *seed = rule->tree, *tree; 629 struct audit_tree *seed = rule->tree, *tree;
645 struct path path; 630 struct path path;
646 struct vfsmount *mnt, *p; 631 struct vfsmount *mnt;
647 struct list_head list;
648 int err; 632 int err;
649 633
650 list_for_each_entry(tree, &tree_list, list) { 634 list_for_each_entry(tree, &tree_list, list) {
@@ -670,16 +654,9 @@ int audit_add_tree_rule(struct audit_krule *rule)
670 err = -ENOMEM; 654 err = -ENOMEM;
671 goto Err; 655 goto Err;
672 } 656 }
673 list_add_tail(&list, &mnt->mnt_list);
674 657
675 get_tree(tree); 658 get_tree(tree);
676 list_for_each_entry(p, &list, mnt_list) { 659 err = iterate_mounts(tag_mount, tree, mnt);
677 err = tag_chunk(p->mnt_root->d_inode, tree);
678 if (err)
679 break;
680 }
681
682 list_del(&list);
683 drop_collected_mounts(mnt); 660 drop_collected_mounts(mnt);
684 661
685 if (!err) { 662 if (!err) {
@@ -714,31 +691,23 @@ int audit_tag_tree(char *old, char *new)
714{ 691{
715 struct list_head cursor, barrier; 692 struct list_head cursor, barrier;
716 int failed = 0; 693 int failed = 0;
717 struct path path; 694 struct path path1, path2;
718 struct vfsmount *tagged; 695 struct vfsmount *tagged;
719 struct list_head list;
720 struct vfsmount *mnt;
721 struct dentry *dentry;
722 int err; 696 int err;
723 697
724 err = kern_path(new, 0, &path); 698 err = kern_path(new, 0, &path2);
725 if (err) 699 if (err)
726 return err; 700 return err;
727 tagged = collect_mounts(&path); 701 tagged = collect_mounts(&path2);
728 path_put(&path); 702 path_put(&path2);
729 if (!tagged) 703 if (!tagged)
730 return -ENOMEM; 704 return -ENOMEM;
731 705
732 err = kern_path(old, 0, &path); 706 err = kern_path(old, 0, &path1);
733 if (err) { 707 if (err) {
734 drop_collected_mounts(tagged); 708 drop_collected_mounts(tagged);
735 return err; 709 return err;
736 } 710 }
737 mnt = mntget(path.mnt);
738 dentry = dget(path.dentry);
739 path_put(&path);
740
741 list_add_tail(&list, &tagged->mnt_list);
742 711
743 mutex_lock(&audit_filter_mutex); 712 mutex_lock(&audit_filter_mutex);
744 list_add(&barrier, &tree_list); 713 list_add(&barrier, &tree_list);
@@ -746,7 +715,7 @@ int audit_tag_tree(char *old, char *new)
746 715
747 while (cursor.next != &tree_list) { 716 while (cursor.next != &tree_list) {
748 struct audit_tree *tree; 717 struct audit_tree *tree;
749 struct vfsmount *p; 718 int good_one = 0;
750 719
751 tree = container_of(cursor.next, struct audit_tree, list); 720 tree = container_of(cursor.next, struct audit_tree, list);
752 get_tree(tree); 721 get_tree(tree);
@@ -754,30 +723,19 @@ int audit_tag_tree(char *old, char *new)
754 list_add(&cursor, &tree->list); 723 list_add(&cursor, &tree->list);
755 mutex_unlock(&audit_filter_mutex); 724 mutex_unlock(&audit_filter_mutex);
756 725
757 err = kern_path(tree->pathname, 0, &path); 726 err = kern_path(tree->pathname, 0, &path2);
758 if (err) { 727 if (!err) {
759 put_tree(tree); 728 good_one = path_is_under(&path1, &path2);
760 mutex_lock(&audit_filter_mutex); 729 path_put(&path2);
761 continue;
762 } 730 }
763 731
764 spin_lock(&vfsmount_lock); 732 if (!good_one) {
765 if (!is_under(mnt, dentry, &path)) {
766 spin_unlock(&vfsmount_lock);
767 path_put(&path);
768 put_tree(tree); 733 put_tree(tree);
769 mutex_lock(&audit_filter_mutex); 734 mutex_lock(&audit_filter_mutex);
770 continue; 735 continue;
771 } 736 }
772 spin_unlock(&vfsmount_lock);
773 path_put(&path);
774
775 list_for_each_entry(p, &list, mnt_list) {
776 failed = tag_chunk(p->mnt_root->d_inode, tree);
777 if (failed)
778 break;
779 }
780 737
738 failed = iterate_mounts(tag_mount, tree, tagged);
781 if (failed) { 739 if (failed) {
782 put_tree(tree); 740 put_tree(tree);
783 mutex_lock(&audit_filter_mutex); 741 mutex_lock(&audit_filter_mutex);
@@ -818,10 +776,8 @@ int audit_tag_tree(char *old, char *new)
818 } 776 }
819 list_del(&barrier); 777 list_del(&barrier);
820 list_del(&cursor); 778 list_del(&cursor);
821 list_del(&list);
822 mutex_unlock(&audit_filter_mutex); 779 mutex_unlock(&audit_filter_mutex);
823 dput(dentry); 780 path_put(&path1);
824 mntput(mnt);
825 drop_collected_mounts(tagged); 781 drop_collected_mounts(tagged);
826 return failed; 782 return failed;
827} 783}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fc0f928167e7..f3a461c0970a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1988,7 +1988,6 @@ void __audit_inode(const char *name, const struct dentry *dentry)
1988 1988
1989/** 1989/**
1990 * audit_inode_child - collect inode info for created/removed objects 1990 * audit_inode_child - collect inode info for created/removed objects
1991 * @dname: inode's dentry name
1992 * @dentry: dentry being audited 1991 * @dentry: dentry being audited
1993 * @parent: inode of dentry parent 1992 * @parent: inode of dentry parent
1994 * 1993 *
@@ -2000,13 +1999,14 @@ void __audit_inode(const char *name, const struct dentry *dentry)
2000 * must be hooked prior, in order to capture the target inode during 1999 * must be hooked prior, in order to capture the target inode during
2001 * unsuccessful attempts. 2000 * unsuccessful attempts.
2002 */ 2001 */
2003void __audit_inode_child(const char *dname, const struct dentry *dentry, 2002void __audit_inode_child(const struct dentry *dentry,
2004 const struct inode *parent) 2003 const struct inode *parent)
2005{ 2004{
2006 int idx; 2005 int idx;
2007 struct audit_context *context = current->audit_context; 2006 struct audit_context *context = current->audit_context;
2008 const char *found_parent = NULL, *found_child = NULL; 2007 const char *found_parent = NULL, *found_child = NULL;
2009 const struct inode *inode = dentry->d_inode; 2008 const struct inode *inode = dentry->d_inode;
2009 const char *dname = dentry->d_name.name;
2010 int dirlen = 0; 2010 int dirlen = 0;
2011 2011
2012 if (!context->in_syscall) 2012 if (!context->in_syscall)
@@ -2014,9 +2014,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
2014 2014
2015 if (inode) 2015 if (inode)
2016 handle_one(inode); 2016 handle_one(inode);
2017 /* determine matching parent */
2018 if (!dname)
2019 goto add_names;
2020 2017
2021 /* parent is more likely, look for it first */ 2018 /* parent is more likely, look for it first */
2022 for (idx = 0; idx < context->name_count; idx++) { 2019 for (idx = 0; idx < context->name_count; idx++) {
diff --git a/kernel/capability.c b/kernel/capability.c
index 7f876e60521f..9e4697e9b276 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -135,7 +135,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
135 if (pid && (pid != task_pid_vnr(current))) { 135 if (pid && (pid != task_pid_vnr(current))) {
136 struct task_struct *target; 136 struct task_struct *target;
137 137
138 read_lock(&tasklist_lock); 138 rcu_read_lock();
139 139
140 target = find_task_by_vpid(pid); 140 target = find_task_by_vpid(pid);
141 if (!target) 141 if (!target)
@@ -143,7 +143,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
143 else 143 else
144 ret = security_capget(target, pEp, pIp, pPp); 144 ret = security_capget(target, pEp, pIp, pPp);
145 145
146 read_unlock(&tasklist_lock); 146 rcu_read_unlock();
147 } else 147 } else
148 ret = security_capget(current, pEp, pIp, pPp); 148 ret = security_capget(current, pEp, pIp, pPp);
149 149
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4fd90e129772..e2769e13980c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Notifications support
8 * Copyright (C) 2009 Nokia Corporation
9 * Author: Kirill A. Shutemov
10 *
7 * Copyright notices from the original cpuset code: 11 * Copyright notices from the original cpuset code:
8 * -------------------------------------------------- 12 * --------------------------------------------------
9 * Copyright (C) 2003 BULL SA. 13 * Copyright (C) 2003 BULL SA.
@@ -23,7 +27,6 @@
23 */ 27 */
24 28
25#include <linux/cgroup.h> 29#include <linux/cgroup.h>
26#include <linux/module.h>
27#include <linux/ctype.h> 30#include <linux/ctype.h>
28#include <linux/errno.h> 31#include <linux/errno.h>
29#include <linux/fs.h> 32#include <linux/fs.h>
@@ -44,6 +47,7 @@
44#include <linux/string.h> 47#include <linux/string.h>
45#include <linux/sort.h> 48#include <linux/sort.h>
46#include <linux/kmod.h> 49#include <linux/kmod.h>
50#include <linux/module.h>
47#include <linux/delayacct.h> 51#include <linux/delayacct.h>
48#include <linux/cgroupstats.h> 52#include <linux/cgroupstats.h>
49#include <linux/hash.h> 53#include <linux/hash.h>
@@ -52,15 +56,21 @@
52#include <linux/pid_namespace.h> 56#include <linux/pid_namespace.h>
53#include <linux/idr.h> 57#include <linux/idr.h>
54#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59#include <linux/eventfd.h>
60#include <linux/poll.h>
55 61
56#include <asm/atomic.h> 62#include <asm/atomic.h>
57 63
58static DEFINE_MUTEX(cgroup_mutex); 64static DEFINE_MUTEX(cgroup_mutex);
59 65
60/* Generate an array of cgroup subsystem pointers */ 66/*
67 * Generate an array of cgroup subsystem pointers. At boot time, this is
68 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
69 * registered after that. The mutable section of this array is protected by
70 * cgroup_mutex.
71 */
61#define SUBSYS(_x) &_x ## _subsys, 72#define SUBSYS(_x) &_x ## _subsys,
62 73static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
63static struct cgroup_subsys *subsys[] = {
64#include <linux/cgroup_subsys.h> 74#include <linux/cgroup_subsys.h>
65}; 75};
66 76
@@ -147,6 +157,35 @@ struct css_id {
147 unsigned short stack[0]; /* Array of Length (depth+1) */ 157 unsigned short stack[0]; /* Array of Length (depth+1) */
148}; 158};
149 159
160/*
161 * cgroup_event represents events which userspace want to recieve.
162 */
163struct cgroup_event {
164 /*
165 * Cgroup which the event belongs to.
166 */
167 struct cgroup *cgrp;
168 /*
169 * Control file which the event associated.
170 */
171 struct cftype *cft;
172 /*
173 * eventfd to signal userspace about the event.
174 */
175 struct eventfd_ctx *eventfd;
176 /*
177 * Each of these stored in a list by the cgroup.
178 */
179 struct list_head list;
180 /*
181 * All fields below needed to unregister event when
182 * userspace closes eventfd.
183 */
184 poll_table pt;
185 wait_queue_head_t *wqh;
186 wait_queue_t wait;
187 struct work_struct remove;
188};
150 189
151/* The list of hierarchy roots */ 190/* The list of hierarchy roots */
152 191
@@ -250,7 +289,8 @@ struct cg_cgroup_link {
250static struct css_set init_css_set; 289static struct css_set init_css_set;
251static struct cg_cgroup_link init_css_set_link; 290static struct cg_cgroup_link init_css_set_link;
252 291
253static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); 292static int cgroup_init_idr(struct cgroup_subsys *ss,
293 struct cgroup_subsys_state *css);
254 294
255/* css_set_lock protects the list of css_set objects, and the 295/* css_set_lock protects the list of css_set objects, and the
256 * chain of tasks off each css_set. Nests outside task->alloc_lock 296 * chain of tasks off each css_set. Nests outside task->alloc_lock
@@ -448,8 +488,11 @@ static struct css_set *find_existing_css_set(
448 struct hlist_node *node; 488 struct hlist_node *node;
449 struct css_set *cg; 489 struct css_set *cg;
450 490
451 /* Built the set of subsystem state objects that we want to 491 /*
452 * see in the new css_set */ 492 * Build the set of subsystem state objects that we want to see in the
493 * new css_set. while subsystems can change globally, the entries here
494 * won't change, so no need for locking.
495 */
453 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 496 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
454 if (root->subsys_bits & (1UL << i)) { 497 if (root->subsys_bits & (1UL << i)) {
455 /* Subsystem is in this hierarchy. So we want 498 /* Subsystem is in this hierarchy. So we want
@@ -696,6 +739,7 @@ void cgroup_lock(void)
696{ 739{
697 mutex_lock(&cgroup_mutex); 740 mutex_lock(&cgroup_mutex);
698} 741}
742EXPORT_SYMBOL_GPL(cgroup_lock);
699 743
700/** 744/**
701 * cgroup_unlock - release lock on cgroup changes 745 * cgroup_unlock - release lock on cgroup changes
@@ -706,6 +750,7 @@ void cgroup_unlock(void)
706{ 750{
707 mutex_unlock(&cgroup_mutex); 751 mutex_unlock(&cgroup_mutex);
708} 752}
753EXPORT_SYMBOL_GPL(cgroup_unlock);
709 754
710/* 755/*
711 * A couple of forward declarations required, due to cyclic reference loop: 756 * A couple of forward declarations required, due to cyclic reference loop:
@@ -757,6 +802,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
757 if (ret) 802 if (ret)
758 break; 803 break;
759 } 804 }
805
760 return ret; 806 return ret;
761} 807}
762 808
@@ -884,7 +930,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
884 css_put(css); 930 css_put(css);
885} 931}
886 932
887 933/*
934 * Call with cgroup_mutex held. Drops reference counts on modules, including
935 * any duplicate ones that parse_cgroupfs_options took. If this function
936 * returns an error, no reference counts are touched.
937 */
888static int rebind_subsystems(struct cgroupfs_root *root, 938static int rebind_subsystems(struct cgroupfs_root *root,
889 unsigned long final_bits) 939 unsigned long final_bits)
890{ 940{
@@ -892,6 +942,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
892 struct cgroup *cgrp = &root->top_cgroup; 942 struct cgroup *cgrp = &root->top_cgroup;
893 int i; 943 int i;
894 944
945 BUG_ON(!mutex_is_locked(&cgroup_mutex));
946
895 removed_bits = root->actual_subsys_bits & ~final_bits; 947 removed_bits = root->actual_subsys_bits & ~final_bits;
896 added_bits = final_bits & ~root->actual_subsys_bits; 948 added_bits = final_bits & ~root->actual_subsys_bits;
897 /* Check that any added subsystems are currently free */ 949 /* Check that any added subsystems are currently free */
@@ -900,6 +952,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
900 struct cgroup_subsys *ss = subsys[i]; 952 struct cgroup_subsys *ss = subsys[i];
901 if (!(bit & added_bits)) 953 if (!(bit & added_bits))
902 continue; 954 continue;
955 /*
956 * Nobody should tell us to do a subsys that doesn't exist:
957 * parse_cgroupfs_options should catch that case and refcounts
958 * ensure that subsystems won't disappear once selected.
959 */
960 BUG_ON(ss == NULL);
903 if (ss->root != &rootnode) { 961 if (ss->root != &rootnode) {
904 /* Subsystem isn't free */ 962 /* Subsystem isn't free */
905 return -EBUSY; 963 return -EBUSY;
@@ -919,6 +977,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
919 unsigned long bit = 1UL << i; 977 unsigned long bit = 1UL << i;
920 if (bit & added_bits) { 978 if (bit & added_bits) {
921 /* We're binding this subsystem to this hierarchy */ 979 /* We're binding this subsystem to this hierarchy */
980 BUG_ON(ss == NULL);
922 BUG_ON(cgrp->subsys[i]); 981 BUG_ON(cgrp->subsys[i]);
923 BUG_ON(!dummytop->subsys[i]); 982 BUG_ON(!dummytop->subsys[i]);
924 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 983 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -930,8 +989,10 @@ static int rebind_subsystems(struct cgroupfs_root *root,
930 if (ss->bind) 989 if (ss->bind)
931 ss->bind(ss, cgrp); 990 ss->bind(ss, cgrp);
932 mutex_unlock(&ss->hierarchy_mutex); 991 mutex_unlock(&ss->hierarchy_mutex);
992 /* refcount was already taken, and we're keeping it */
933 } else if (bit & removed_bits) { 993 } else if (bit & removed_bits) {
934 /* We're removing this subsystem */ 994 /* We're removing this subsystem */
995 BUG_ON(ss == NULL);
935 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 996 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
936 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 997 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
937 mutex_lock(&ss->hierarchy_mutex); 998 mutex_lock(&ss->hierarchy_mutex);
@@ -942,9 +1003,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
942 subsys[i]->root = &rootnode; 1003 subsys[i]->root = &rootnode;
943 list_move(&ss->sibling, &rootnode.subsys_list); 1004 list_move(&ss->sibling, &rootnode.subsys_list);
944 mutex_unlock(&ss->hierarchy_mutex); 1005 mutex_unlock(&ss->hierarchy_mutex);
1006 /* subsystem is now free - drop reference on module */
1007 module_put(ss->module);
945 } else if (bit & final_bits) { 1008 } else if (bit & final_bits) {
946 /* Subsystem state should already exist */ 1009 /* Subsystem state should already exist */
1010 BUG_ON(ss == NULL);
947 BUG_ON(!cgrp->subsys[i]); 1011 BUG_ON(!cgrp->subsys[i]);
1012 /*
1013 * a refcount was taken, but we already had one, so
1014 * drop the extra reference.
1015 */
1016 module_put(ss->module);
1017#ifdef CONFIG_MODULE_UNLOAD
1018 BUG_ON(ss->module && !module_refcount(ss->module));
1019#endif
948 } else { 1020 } else {
949 /* Subsystem state shouldn't exist */ 1021 /* Subsystem state shouldn't exist */
950 BUG_ON(cgrp->subsys[i]); 1022 BUG_ON(cgrp->subsys[i]);
@@ -986,13 +1058,20 @@ struct cgroup_sb_opts {
986 1058
987}; 1059};
988 1060
989/* Convert a hierarchy specifier into a bitmask of subsystems and 1061/*
990 * flags. */ 1062 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
991static int parse_cgroupfs_options(char *data, 1063 * with cgroup_mutex held to protect the subsys[] array. This function takes
992 struct cgroup_sb_opts *opts) 1064 * refcounts on subsystems to be used, unless it returns error, in which case
1065 * no refcounts are taken.
1066 */
1067static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
993{ 1068{
994 char *token, *o = data ?: "all"; 1069 char *token, *o = data ?: "all";
995 unsigned long mask = (unsigned long)-1; 1070 unsigned long mask = (unsigned long)-1;
1071 int i;
1072 bool module_pin_failed = false;
1073
1074 BUG_ON(!mutex_is_locked(&cgroup_mutex));
996 1075
997#ifdef CONFIG_CPUSETS 1076#ifdef CONFIG_CPUSETS
998 mask = ~(1UL << cpuset_subsys_id); 1077 mask = ~(1UL << cpuset_subsys_id);
@@ -1005,10 +1084,11 @@ static int parse_cgroupfs_options(char *data,
1005 return -EINVAL; 1084 return -EINVAL;
1006 if (!strcmp(token, "all")) { 1085 if (!strcmp(token, "all")) {
1007 /* Add all non-disabled subsystems */ 1086 /* Add all non-disabled subsystems */
1008 int i;
1009 opts->subsys_bits = 0; 1087 opts->subsys_bits = 0;
1010 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1088 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1011 struct cgroup_subsys *ss = subsys[i]; 1089 struct cgroup_subsys *ss = subsys[i];
1090 if (ss == NULL)
1091 continue;
1012 if (!ss->disabled) 1092 if (!ss->disabled)
1013 opts->subsys_bits |= 1ul << i; 1093 opts->subsys_bits |= 1ul << i;
1014 } 1094 }
@@ -1026,7 +1106,6 @@ static int parse_cgroupfs_options(char *data,
1026 if (!opts->release_agent) 1106 if (!opts->release_agent)
1027 return -ENOMEM; 1107 return -ENOMEM;
1028 } else if (!strncmp(token, "name=", 5)) { 1108 } else if (!strncmp(token, "name=", 5)) {
1029 int i;
1030 const char *name = token + 5; 1109 const char *name = token + 5;
1031 /* Can't specify an empty name */ 1110 /* Can't specify an empty name */
1032 if (!strlen(name)) 1111 if (!strlen(name))
@@ -1050,9 +1129,10 @@ static int parse_cgroupfs_options(char *data,
1050 return -ENOMEM; 1129 return -ENOMEM;
1051 } else { 1130 } else {
1052 struct cgroup_subsys *ss; 1131 struct cgroup_subsys *ss;
1053 int i;
1054 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1132 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1055 ss = subsys[i]; 1133 ss = subsys[i];
1134 if (ss == NULL)
1135 continue;
1056 if (!strcmp(token, ss->name)) { 1136 if (!strcmp(token, ss->name)) {
1057 if (!ss->disabled) 1137 if (!ss->disabled)
1058 set_bit(i, &opts->subsys_bits); 1138 set_bit(i, &opts->subsys_bits);
@@ -1087,9 +1167,54 @@ static int parse_cgroupfs_options(char *data,
1087 if (!opts->subsys_bits && !opts->name) 1167 if (!opts->subsys_bits && !opts->name)
1088 return -EINVAL; 1168 return -EINVAL;
1089 1169
1170 /*
1171 * Grab references on all the modules we'll need, so the subsystems
1172 * don't dance around before rebind_subsystems attaches them. This may
1173 * take duplicate reference counts on a subsystem that's already used,
1174 * but rebind_subsystems handles this case.
1175 */
1176 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1177 unsigned long bit = 1UL << i;
1178
1179 if (!(bit & opts->subsys_bits))
1180 continue;
1181 if (!try_module_get(subsys[i]->module)) {
1182 module_pin_failed = true;
1183 break;
1184 }
1185 }
1186 if (module_pin_failed) {
1187 /*
1188 * oops, one of the modules was going away. this means that we
1189 * raced with a module_delete call, and to the user this is
1190 * essentially a "subsystem doesn't exist" case.
1191 */
1192 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
1193 /* drop refcounts only on the ones we took */
1194 unsigned long bit = 1UL << i;
1195
1196 if (!(bit & opts->subsys_bits))
1197 continue;
1198 module_put(subsys[i]->module);
1199 }
1200 return -ENOENT;
1201 }
1202
1090 return 0; 1203 return 0;
1091} 1204}
1092 1205
1206static void drop_parsed_module_refcounts(unsigned long subsys_bits)
1207{
1208 int i;
1209 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1210 unsigned long bit = 1UL << i;
1211
1212 if (!(bit & subsys_bits))
1213 continue;
1214 module_put(subsys[i]->module);
1215 }
1216}
1217
1093static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1218static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1094{ 1219{
1095 int ret = 0; 1220 int ret = 0;
@@ -1106,21 +1231,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1106 if (ret) 1231 if (ret)
1107 goto out_unlock; 1232 goto out_unlock;
1108 1233
1109 /* Don't allow flags to change at remount */ 1234 /* Don't allow flags or name to change at remount */
1110 if (opts.flags != root->flags) { 1235 if (opts.flags != root->flags ||
1111 ret = -EINVAL; 1236 (opts.name && strcmp(opts.name, root->name))) {
1112 goto out_unlock;
1113 }
1114
1115 /* Don't allow name to change at remount */
1116 if (opts.name && strcmp(opts.name, root->name)) {
1117 ret = -EINVAL; 1237 ret = -EINVAL;
1238 drop_parsed_module_refcounts(opts.subsys_bits);
1118 goto out_unlock; 1239 goto out_unlock;
1119 } 1240 }
1120 1241
1121 ret = rebind_subsystems(root, opts.subsys_bits); 1242 ret = rebind_subsystems(root, opts.subsys_bits);
1122 if (ret) 1243 if (ret) {
1244 drop_parsed_module_refcounts(opts.subsys_bits);
1123 goto out_unlock; 1245 goto out_unlock;
1246 }
1124 1247
1125 /* (re)populate subsystem files */ 1248 /* (re)populate subsystem files */
1126 cgroup_populate_dir(cgrp); 1249 cgroup_populate_dir(cgrp);
@@ -1151,6 +1274,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1151 INIT_LIST_HEAD(&cgrp->release_list); 1274 INIT_LIST_HEAD(&cgrp->release_list);
1152 INIT_LIST_HEAD(&cgrp->pidlists); 1275 INIT_LIST_HEAD(&cgrp->pidlists);
1153 mutex_init(&cgrp->pidlist_mutex); 1276 mutex_init(&cgrp->pidlist_mutex);
1277 INIT_LIST_HEAD(&cgrp->event_list);
1278 spin_lock_init(&cgrp->event_list_lock);
1154} 1279}
1155 1280
1156static void init_cgroup_root(struct cgroupfs_root *root) 1281static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1306,7 +1431,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1306 struct cgroupfs_root *new_root; 1431 struct cgroupfs_root *new_root;
1307 1432
1308 /* First find the desired set of subsystems */ 1433 /* First find the desired set of subsystems */
1434 mutex_lock(&cgroup_mutex);
1309 ret = parse_cgroupfs_options(data, &opts); 1435 ret = parse_cgroupfs_options(data, &opts);
1436 mutex_unlock(&cgroup_mutex);
1310 if (ret) 1437 if (ret)
1311 goto out_err; 1438 goto out_err;
1312 1439
@@ -1317,7 +1444,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1317 new_root = cgroup_root_from_opts(&opts); 1444 new_root = cgroup_root_from_opts(&opts);
1318 if (IS_ERR(new_root)) { 1445 if (IS_ERR(new_root)) {
1319 ret = PTR_ERR(new_root); 1446 ret = PTR_ERR(new_root);
1320 goto out_err; 1447 goto drop_modules;
1321 } 1448 }
1322 opts.new_root = new_root; 1449 opts.new_root = new_root;
1323 1450
@@ -1326,7 +1453,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1326 if (IS_ERR(sb)) { 1453 if (IS_ERR(sb)) {
1327 ret = PTR_ERR(sb); 1454 ret = PTR_ERR(sb);
1328 cgroup_drop_root(opts.new_root); 1455 cgroup_drop_root(opts.new_root);
1329 goto out_err; 1456 goto drop_modules;
1330 } 1457 }
1331 1458
1332 root = sb->s_fs_info; 1459 root = sb->s_fs_info;
@@ -1382,6 +1509,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1382 free_cg_links(&tmp_cg_links); 1509 free_cg_links(&tmp_cg_links);
1383 goto drop_new_super; 1510 goto drop_new_super;
1384 } 1511 }
1512 /*
1513 * There must be no failure case after here, since rebinding
1514 * takes care of subsystems' refcounts, which are explicitly
1515 * dropped in the failure exit path.
1516 */
1385 1517
1386 /* EBUSY should be the only error here */ 1518 /* EBUSY should be the only error here */
1387 BUG_ON(ret); 1519 BUG_ON(ret);
@@ -1420,6 +1552,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1420 * any) is not needed 1552 * any) is not needed
1421 */ 1553 */
1422 cgroup_drop_root(opts.new_root); 1554 cgroup_drop_root(opts.new_root);
1555 /* no subsys rebinding, so refcounts don't change */
1556 drop_parsed_module_refcounts(opts.subsys_bits);
1423 } 1557 }
1424 1558
1425 simple_set_mnt(mnt, sb); 1559 simple_set_mnt(mnt, sb);
@@ -1429,6 +1563,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1429 1563
1430 drop_new_super: 1564 drop_new_super:
1431 deactivate_locked_super(sb); 1565 deactivate_locked_super(sb);
1566 drop_modules:
1567 drop_parsed_module_refcounts(opts.subsys_bits);
1432 out_err: 1568 out_err:
1433 kfree(opts.release_agent); 1569 kfree(opts.release_agent);
1434 kfree(opts.name); 1570 kfree(opts.name);
@@ -1542,6 +1678,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1542 memmove(buf, start, buf + buflen - start); 1678 memmove(buf, start, buf + buflen - start);
1543 return 0; 1679 return 0;
1544} 1680}
1681EXPORT_SYMBOL_GPL(cgroup_path);
1545 1682
1546/** 1683/**
1547 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1684 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
@@ -1554,7 +1691,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1554int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1691int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1555{ 1692{
1556 int retval = 0; 1693 int retval = 0;
1557 struct cgroup_subsys *ss; 1694 struct cgroup_subsys *ss, *failed_ss = NULL;
1558 struct cgroup *oldcgrp; 1695 struct cgroup *oldcgrp;
1559 struct css_set *cg; 1696 struct css_set *cg;
1560 struct css_set *newcg; 1697 struct css_set *newcg;
@@ -1568,8 +1705,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1568 for_each_subsys(root, ss) { 1705 for_each_subsys(root, ss) {
1569 if (ss->can_attach) { 1706 if (ss->can_attach) {
1570 retval = ss->can_attach(ss, cgrp, tsk, false); 1707 retval = ss->can_attach(ss, cgrp, tsk, false);
1571 if (retval) 1708 if (retval) {
1572 return retval; 1709 /*
1710 * Remember on which subsystem the can_attach()
1711 * failed, so that we only call cancel_attach()
1712 * against the subsystems whose can_attach()
1713 * succeeded. (See below)
1714 */
1715 failed_ss = ss;
1716 goto out;
1717 }
1573 } 1718 }
1574 } 1719 }
1575 1720
@@ -1583,14 +1728,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1583 */ 1728 */
1584 newcg = find_css_set(cg, cgrp); 1729 newcg = find_css_set(cg, cgrp);
1585 put_css_set(cg); 1730 put_css_set(cg);
1586 if (!newcg) 1731 if (!newcg) {
1587 return -ENOMEM; 1732 retval = -ENOMEM;
1733 goto out;
1734 }
1588 1735
1589 task_lock(tsk); 1736 task_lock(tsk);
1590 if (tsk->flags & PF_EXITING) { 1737 if (tsk->flags & PF_EXITING) {
1591 task_unlock(tsk); 1738 task_unlock(tsk);
1592 put_css_set(newcg); 1739 put_css_set(newcg);
1593 return -ESRCH; 1740 retval = -ESRCH;
1741 goto out;
1594 } 1742 }
1595 rcu_assign_pointer(tsk->cgroups, newcg); 1743 rcu_assign_pointer(tsk->cgroups, newcg);
1596 task_unlock(tsk); 1744 task_unlock(tsk);
@@ -1616,7 +1764,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1616 * is no longer empty. 1764 * is no longer empty.
1617 */ 1765 */
1618 cgroup_wakeup_rmdir_waiter(cgrp); 1766 cgroup_wakeup_rmdir_waiter(cgrp);
1619 return 0; 1767out:
1768 if (retval) {
1769 for_each_subsys(root, ss) {
1770 if (ss == failed_ss)
1771 /*
1772 * This subsystem was the one that failed the
1773 * can_attach() check earlier, so we don't need
1774 * to call cancel_attach() against it or any
1775 * remaining subsystems.
1776 */
1777 break;
1778 if (ss->cancel_attach)
1779 ss->cancel_attach(ss, cgrp, tsk, false);
1780 }
1781 }
1782 return retval;
1620} 1783}
1621 1784
1622/* 1785/*
@@ -1682,6 +1845,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp)
1682 } 1845 }
1683 return true; 1846 return true;
1684} 1847}
1848EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
1685 1849
1686static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 1850static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1687 const char *buffer) 1851 const char *buffer)
@@ -1950,6 +2114,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
1950 .rename = cgroup_rename, 2114 .rename = cgroup_rename,
1951}; 2115};
1952 2116
2117/*
2118 * Check if a file is a control file
2119 */
2120static inline struct cftype *__file_cft(struct file *file)
2121{
2122 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
2123 return ERR_PTR(-EINVAL);
2124 return __d_cft(file->f_dentry);
2125}
2126
1953static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2127static int cgroup_create_file(struct dentry *dentry, mode_t mode,
1954 struct super_block *sb) 2128 struct super_block *sb)
1955{ 2129{
@@ -2069,6 +2243,7 @@ int cgroup_add_file(struct cgroup *cgrp,
2069 error = PTR_ERR(dentry); 2243 error = PTR_ERR(dentry);
2070 return error; 2244 return error;
2071} 2245}
2246EXPORT_SYMBOL_GPL(cgroup_add_file);
2072 2247
2073int cgroup_add_files(struct cgroup *cgrp, 2248int cgroup_add_files(struct cgroup *cgrp,
2074 struct cgroup_subsys *subsys, 2249 struct cgroup_subsys *subsys,
@@ -2083,6 +2258,7 @@ int cgroup_add_files(struct cgroup *cgrp,
2083 } 2258 }
2084 return 0; 2259 return 0;
2085} 2260}
2261EXPORT_SYMBOL_GPL(cgroup_add_files);
2086 2262
2087/** 2263/**
2088 * cgroup_task_count - count the number of tasks in a cgroup. 2264 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2468,7 +2644,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2468{ 2644{
2469 struct cgroup_pidlist *l; 2645 struct cgroup_pidlist *l;
2470 /* don't need task_nsproxy() if we're looking at ourself */ 2646 /* don't need task_nsproxy() if we're looking at ourself */
2471 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); 2647 struct pid_namespace *ns = current->nsproxy->pid_ns;
2648
2472 /* 2649 /*
2473 * We can't drop the pidlist_mutex before taking the l->mutex in case 2650 * We can't drop the pidlist_mutex before taking the l->mutex in case
2474 * the last ref-holder is trying to remove l from the list at the same 2651 * the last ref-holder is trying to remove l from the list at the same
@@ -2478,8 +2655,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2478 mutex_lock(&cgrp->pidlist_mutex); 2655 mutex_lock(&cgrp->pidlist_mutex);
2479 list_for_each_entry(l, &cgrp->pidlists, links) { 2656 list_for_each_entry(l, &cgrp->pidlists, links) {
2480 if (l->key.type == type && l->key.ns == ns) { 2657 if (l->key.type == type && l->key.ns == ns) {
2481 /* found a matching list - drop the extra refcount */
2482 put_pid_ns(ns);
2483 /* make sure l doesn't vanish out from under us */ 2658 /* make sure l doesn't vanish out from under us */
2484 down_write(&l->mutex); 2659 down_write(&l->mutex);
2485 mutex_unlock(&cgrp->pidlist_mutex); 2660 mutex_unlock(&cgrp->pidlist_mutex);
@@ -2490,13 +2665,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2490 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 2665 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2491 if (!l) { 2666 if (!l) {
2492 mutex_unlock(&cgrp->pidlist_mutex); 2667 mutex_unlock(&cgrp->pidlist_mutex);
2493 put_pid_ns(ns);
2494 return l; 2668 return l;
2495 } 2669 }
2496 init_rwsem(&l->mutex); 2670 init_rwsem(&l->mutex);
2497 down_write(&l->mutex); 2671 down_write(&l->mutex);
2498 l->key.type = type; 2672 l->key.type = type;
2499 l->key.ns = ns; 2673 l->key.ns = get_pid_ns(ns);
2500 l->use_count = 0; /* don't increment here */ 2674 l->use_count = 0; /* don't increment here */
2501 l->list = NULL; 2675 l->list = NULL;
2502 l->owner = cgrp; 2676 l->owner = cgrp;
@@ -2804,6 +2978,174 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2804} 2978}
2805 2979
2806/* 2980/*
2981 * Unregister event and free resources.
2982 *
2983 * Gets called from workqueue.
2984 */
2985static void cgroup_event_remove(struct work_struct *work)
2986{
2987 struct cgroup_event *event = container_of(work, struct cgroup_event,
2988 remove);
2989 struct cgroup *cgrp = event->cgrp;
2990
2991 /* TODO: check return code */
2992 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
2993
2994 eventfd_ctx_put(event->eventfd);
2995 kfree(event);
2996 dput(cgrp->dentry);
2997}
2998
2999/*
3000 * Gets called on POLLHUP on eventfd when user closes it.
3001 *
3002 * Called with wqh->lock held and interrupts disabled.
3003 */
3004static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3005 int sync, void *key)
3006{
3007 struct cgroup_event *event = container_of(wait,
3008 struct cgroup_event, wait);
3009 struct cgroup *cgrp = event->cgrp;
3010 unsigned long flags = (unsigned long)key;
3011
3012 if (flags & POLLHUP) {
3013 remove_wait_queue_locked(event->wqh, &event->wait);
3014 spin_lock(&cgrp->event_list_lock);
3015 list_del(&event->list);
3016 spin_unlock(&cgrp->event_list_lock);
3017 /*
3018 * We are in atomic context, but cgroup_event_remove() may
3019 * sleep, so we have to call it in workqueue.
3020 */
3021 schedule_work(&event->remove);
3022 }
3023
3024 return 0;
3025}
3026
3027static void cgroup_event_ptable_queue_proc(struct file *file,
3028 wait_queue_head_t *wqh, poll_table *pt)
3029{
3030 struct cgroup_event *event = container_of(pt,
3031 struct cgroup_event, pt);
3032
3033 event->wqh = wqh;
3034 add_wait_queue(wqh, &event->wait);
3035}
3036
3037/*
3038 * Parse input and register new cgroup event handler.
3039 *
3040 * Input must be in format '<event_fd> <control_fd> <args>'.
3041 * Interpretation of args is defined by control file implementation.
3042 */
3043static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3044 const char *buffer)
3045{
3046 struct cgroup_event *event = NULL;
3047 unsigned int efd, cfd;
3048 struct file *efile = NULL;
3049 struct file *cfile = NULL;
3050 char *endp;
3051 int ret;
3052
3053 efd = simple_strtoul(buffer, &endp, 10);
3054 if (*endp != ' ')
3055 return -EINVAL;
3056 buffer = endp + 1;
3057
3058 cfd = simple_strtoul(buffer, &endp, 10);
3059 if ((*endp != ' ') && (*endp != '\0'))
3060 return -EINVAL;
3061 buffer = endp + 1;
3062
3063 event = kzalloc(sizeof(*event), GFP_KERNEL);
3064 if (!event)
3065 return -ENOMEM;
3066 event->cgrp = cgrp;
3067 INIT_LIST_HEAD(&event->list);
3068 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3069 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3070 INIT_WORK(&event->remove, cgroup_event_remove);
3071
3072 efile = eventfd_fget(efd);
3073 if (IS_ERR(efile)) {
3074 ret = PTR_ERR(efile);
3075 goto fail;
3076 }
3077
3078 event->eventfd = eventfd_ctx_fileget(efile);
3079 if (IS_ERR(event->eventfd)) {
3080 ret = PTR_ERR(event->eventfd);
3081 goto fail;
3082 }
3083
3084 cfile = fget(cfd);
3085 if (!cfile) {
3086 ret = -EBADF;
3087 goto fail;
3088 }
3089
3090 /* the process need read permission on control file */
3091 ret = file_permission(cfile, MAY_READ);
3092 if (ret < 0)
3093 goto fail;
3094
3095 event->cft = __file_cft(cfile);
3096 if (IS_ERR(event->cft)) {
3097 ret = PTR_ERR(event->cft);
3098 goto fail;
3099 }
3100
3101 if (!event->cft->register_event || !event->cft->unregister_event) {
3102 ret = -EINVAL;
3103 goto fail;
3104 }
3105
3106 ret = event->cft->register_event(cgrp, event->cft,
3107 event->eventfd, buffer);
3108 if (ret)
3109 goto fail;
3110
3111 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
3112 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3113 ret = 0;
3114 goto fail;
3115 }
3116
3117 /*
3118 * Events should be removed after rmdir of cgroup directory, but before
3119 * destroying subsystem state objects. Let's take reference to cgroup
3120 * directory dentry to do that.
3121 */
3122 dget(cgrp->dentry);
3123
3124 spin_lock(&cgrp->event_list_lock);
3125 list_add(&event->list, &cgrp->event_list);
3126 spin_unlock(&cgrp->event_list_lock);
3127
3128 fput(cfile);
3129 fput(efile);
3130
3131 return 0;
3132
3133fail:
3134 if (cfile)
3135 fput(cfile);
3136
3137 if (event && event->eventfd && !IS_ERR(event->eventfd))
3138 eventfd_ctx_put(event->eventfd);
3139
3140 if (!IS_ERR_OR_NULL(efile))
3141 fput(efile);
3142
3143 kfree(event);
3144
3145 return ret;
3146}
3147
3148/*
2807 * for the common functions, 'private' gives the type of file 3149 * for the common functions, 'private' gives the type of file
2808 */ 3150 */
2809/* for hysterical raisins, we can't put this on the older files */ 3151/* for hysterical raisins, we can't put this on the older files */
@@ -2828,6 +3170,11 @@ static struct cftype files[] = {
2828 .read_u64 = cgroup_read_notify_on_release, 3170 .read_u64 = cgroup_read_notify_on_release,
2829 .write_u64 = cgroup_write_notify_on_release, 3171 .write_u64 = cgroup_write_notify_on_release,
2830 }, 3172 },
3173 {
3174 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3175 .write_string = cgroup_write_event_control,
3176 .mode = S_IWUGO,
3177 },
2831}; 3178};
2832 3179
2833static struct cftype cft_release_agent = { 3180static struct cftype cft_release_agent = {
@@ -2892,8 +3239,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2892 /* We need to take each hierarchy_mutex in a consistent order */ 3239 /* We need to take each hierarchy_mutex in a consistent order */
2893 int i; 3240 int i;
2894 3241
3242 /*
3243 * No worry about a race with rebind_subsystems that might mess up the
3244 * locking order, since both parties are under cgroup_mutex.
3245 */
2895 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3246 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2896 struct cgroup_subsys *ss = subsys[i]; 3247 struct cgroup_subsys *ss = subsys[i];
3248 if (ss == NULL)
3249 continue;
2897 if (ss->root == root) 3250 if (ss->root == root)
2898 mutex_lock(&ss->hierarchy_mutex); 3251 mutex_lock(&ss->hierarchy_mutex);
2899 } 3252 }
@@ -2905,6 +3258,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2905 3258
2906 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3259 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2907 struct cgroup_subsys *ss = subsys[i]; 3260 struct cgroup_subsys *ss = subsys[i];
3261 if (ss == NULL)
3262 continue;
2908 if (ss->root == root) 3263 if (ss->root == root)
2909 mutex_unlock(&ss->hierarchy_mutex); 3264 mutex_unlock(&ss->hierarchy_mutex);
2910 } 3265 }
@@ -3028,11 +3383,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3028 * synchronization other than RCU, and the subsystem linked 3383 * synchronization other than RCU, and the subsystem linked
3029 * list isn't RCU-safe */ 3384 * list isn't RCU-safe */
3030 int i; 3385 int i;
3386 /*
3387 * We won't need to lock the subsys array, because the subsystems
3388 * we're concerned about aren't going anywhere since our cgroup root
3389 * has a reference on them.
3390 */
3031 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3391 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3032 struct cgroup_subsys *ss = subsys[i]; 3392 struct cgroup_subsys *ss = subsys[i];
3033 struct cgroup_subsys_state *css; 3393 struct cgroup_subsys_state *css;
3034 /* Skip subsystems not in this hierarchy */ 3394 /* Skip subsystems not present or not in this hierarchy */
3035 if (ss->root != cgrp->root) 3395 if (ss == NULL || ss->root != cgrp->root)
3036 continue; 3396 continue;
3037 css = cgrp->subsys[ss->subsys_id]; 3397 css = cgrp->subsys[ss->subsys_id];
3038 /* When called from check_for_release() it's possible 3398 /* When called from check_for_release() it's possible
@@ -3106,6 +3466,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
3106 struct dentry *d; 3466 struct dentry *d;
3107 struct cgroup *parent; 3467 struct cgroup *parent;
3108 DEFINE_WAIT(wait); 3468 DEFINE_WAIT(wait);
3469 struct cgroup_event *event, *tmp;
3109 int ret; 3470 int ret;
3110 3471
3111 /* the vfs holds both inode->i_mutex already */ 3472 /* the vfs holds both inode->i_mutex already */
@@ -3189,6 +3550,20 @@ again:
3189 set_bit(CGRP_RELEASABLE, &parent->flags); 3550 set_bit(CGRP_RELEASABLE, &parent->flags);
3190 check_for_release(parent); 3551 check_for_release(parent);
3191 3552
3553 /*
3554 * Unregister events and notify userspace.
3555 * Notify userspace about cgroup removing only after rmdir of cgroup
3556 * directory to avoid race between userspace and kernelspace
3557 */
3558 spin_lock(&cgrp->event_list_lock);
3559 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
3560 list_del(&event->list);
3561 remove_wait_queue(event->wqh, &event->wait);
3562 eventfd_signal(event->eventfd, 1);
3563 schedule_work(&event->remove);
3564 }
3565 spin_unlock(&cgrp->event_list_lock);
3566
3192 mutex_unlock(&cgroup_mutex); 3567 mutex_unlock(&cgroup_mutex);
3193 return 0; 3568 return 0;
3194} 3569}
@@ -3223,9 +3598,198 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
3223 mutex_init(&ss->hierarchy_mutex); 3598 mutex_init(&ss->hierarchy_mutex);
3224 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); 3599 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3225 ss->active = 1; 3600 ss->active = 1;
3601
3602 /* this function shouldn't be used with modular subsystems, since they
3603 * need to register a subsys_id, among other things */
3604 BUG_ON(ss->module);
3226} 3605}
3227 3606
3228/** 3607/**
3608 * cgroup_load_subsys: load and register a modular subsystem at runtime
3609 * @ss: the subsystem to load
3610 *
3611 * This function should be called in a modular subsystem's initcall. If the
3612 * subsystem is built as a module, it will be assigned a new subsys_id and set
3613 * up for use. If the subsystem is built-in anyway, work is delegated to the
3614 * simpler cgroup_init_subsys.
3615 */
3616int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
3617{
3618 int i;
3619 struct cgroup_subsys_state *css;
3620
3621 /* check name and function validity */
3622 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
3623 ss->create == NULL || ss->destroy == NULL)
3624 return -EINVAL;
3625
3626 /*
3627 * we don't support callbacks in modular subsystems. this check is
3628 * before the ss->module check for consistency; a subsystem that could
3629 * be a module should still have no callbacks even if the user isn't
3630 * compiling it as one.
3631 */
3632 if (ss->fork || ss->exit)
3633 return -EINVAL;
3634
3635 /*
3636 * an optionally modular subsystem is built-in: we want to do nothing,
3637 * since cgroup_init_subsys will have already taken care of it.
3638 */
3639 if (ss->module == NULL) {
3640 /* a few sanity checks */
3641 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
3642 BUG_ON(subsys[ss->subsys_id] != ss);
3643 return 0;
3644 }
3645
3646 /*
3647 * need to register a subsys id before anything else - for example,
3648 * init_cgroup_css needs it.
3649 */
3650 mutex_lock(&cgroup_mutex);
3651 /* find the first empty slot in the array */
3652 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
3653 if (subsys[i] == NULL)
3654 break;
3655 }
3656 if (i == CGROUP_SUBSYS_COUNT) {
3657 /* maximum number of subsystems already registered! */
3658 mutex_unlock(&cgroup_mutex);
3659 return -EBUSY;
3660 }
3661 /* assign ourselves the subsys_id */
3662 ss->subsys_id = i;
3663 subsys[i] = ss;
3664
3665 /*
3666 * no ss->create seems to need anything important in the ss struct, so
3667 * this can happen first (i.e. before the rootnode attachment).
3668 */
3669 css = ss->create(ss, dummytop);
3670 if (IS_ERR(css)) {
3671 /* failure case - need to deassign the subsys[] slot. */
3672 subsys[i] = NULL;
3673 mutex_unlock(&cgroup_mutex);
3674 return PTR_ERR(css);
3675 }
3676
3677 list_add(&ss->sibling, &rootnode.subsys_list);
3678 ss->root = &rootnode;
3679
3680 /* our new subsystem will be attached to the dummy hierarchy. */
3681 init_cgroup_css(css, ss, dummytop);
3682 /* init_idr must be after init_cgroup_css because it sets css->id. */
3683 if (ss->use_id) {
3684 int ret = cgroup_init_idr(ss, css);
3685 if (ret) {
3686 dummytop->subsys[ss->subsys_id] = NULL;
3687 ss->destroy(ss, dummytop);
3688 subsys[i] = NULL;
3689 mutex_unlock(&cgroup_mutex);
3690 return ret;
3691 }
3692 }
3693
3694 /*
3695 * Now we need to entangle the css into the existing css_sets. unlike
3696 * in cgroup_init_subsys, there are now multiple css_sets, so each one
3697 * will need a new pointer to it; done by iterating the css_set_table.
3698 * furthermore, modifying the existing css_sets will corrupt the hash
3699 * table state, so each changed css_set will need its hash recomputed.
3700 * this is all done under the css_set_lock.
3701 */
3702 write_lock(&css_set_lock);
3703 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
3704 struct css_set *cg;
3705 struct hlist_node *node, *tmp;
3706 struct hlist_head *bucket = &css_set_table[i], *new_bucket;
3707
3708 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
3709 /* skip entries that we already rehashed */
3710 if (cg->subsys[ss->subsys_id])
3711 continue;
3712 /* remove existing entry */
3713 hlist_del(&cg->hlist);
3714 /* set new value */
3715 cg->subsys[ss->subsys_id] = css;
3716 /* recompute hash and restore entry */
3717 new_bucket = css_set_hash(cg->subsys);
3718 hlist_add_head(&cg->hlist, new_bucket);
3719 }
3720 }
3721 write_unlock(&css_set_lock);
3722
3723 mutex_init(&ss->hierarchy_mutex);
3724 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3725 ss->active = 1;
3726
3727 /* success! */
3728 mutex_unlock(&cgroup_mutex);
3729 return 0;
3730}
3731EXPORT_SYMBOL_GPL(cgroup_load_subsys);
3732
3733/**
3734 * cgroup_unload_subsys: unload a modular subsystem
3735 * @ss: the subsystem to unload
3736 *
3737 * This function should be called in a modular subsystem's exitcall. When this
3738 * function is invoked, the refcount on the subsystem's module will be 0, so
3739 * the subsystem will not be attached to any hierarchy.
3740 */
3741void cgroup_unload_subsys(struct cgroup_subsys *ss)
3742{
3743 struct cg_cgroup_link *link;
3744 struct hlist_head *hhead;
3745
3746 BUG_ON(ss->module == NULL);
3747
3748 /*
3749 * we shouldn't be called if the subsystem is in use, and the use of
3750 * try_module_get in parse_cgroupfs_options should ensure that it
3751 * doesn't start being used while we're killing it off.
3752 */
3753 BUG_ON(ss->root != &rootnode);
3754
3755 mutex_lock(&cgroup_mutex);
3756 /* deassign the subsys_id */
3757 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
3758 subsys[ss->subsys_id] = NULL;
3759
3760 /* remove subsystem from rootnode's list of subsystems */
3761 list_del(&ss->sibling);
3762
3763 /*
3764 * disentangle the css from all css_sets attached to the dummytop. as
3765 * in loading, we need to pay our respects to the hashtable gods.
3766 */
3767 write_lock(&css_set_lock);
3768 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
3769 struct css_set *cg = link->cg;
3770
3771 hlist_del(&cg->hlist);
3772 BUG_ON(!cg->subsys[ss->subsys_id]);
3773 cg->subsys[ss->subsys_id] = NULL;
3774 hhead = css_set_hash(cg->subsys);
3775 hlist_add_head(&cg->hlist, hhead);
3776 }
3777 write_unlock(&css_set_lock);
3778
3779 /*
3780 * remove subsystem's css from the dummytop and free it - need to free
3781 * before marking as null because ss->destroy needs the cgrp->subsys
3782 * pointer to find their state. note that this also takes care of
3783 * freeing the css_id.
3784 */
3785 ss->destroy(ss, dummytop);
3786 dummytop->subsys[ss->subsys_id] = NULL;
3787
3788 mutex_unlock(&cgroup_mutex);
3789}
3790EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
3791
3792/**
3229 * cgroup_init_early - cgroup initialization at system boot 3793 * cgroup_init_early - cgroup initialization at system boot
3230 * 3794 *
3231 * Initialize cgroups at system boot, and initialize any 3795 * Initialize cgroups at system boot, and initialize any
@@ -3253,7 +3817,8 @@ int __init cgroup_init_early(void)
3253 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 3817 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
3254 INIT_HLIST_HEAD(&css_set_table[i]); 3818 INIT_HLIST_HEAD(&css_set_table[i]);
3255 3819
3256 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3820 /* at bootup time, we don't worry about modular subsystems */
3821 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3257 struct cgroup_subsys *ss = subsys[i]; 3822 struct cgroup_subsys *ss = subsys[i];
3258 3823
3259 BUG_ON(!ss->name); 3824 BUG_ON(!ss->name);
@@ -3288,12 +3853,13 @@ int __init cgroup_init(void)
3288 if (err) 3853 if (err)
3289 return err; 3854 return err;
3290 3855
3291 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3856 /* at bootup time, we don't worry about modular subsystems */
3857 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3292 struct cgroup_subsys *ss = subsys[i]; 3858 struct cgroup_subsys *ss = subsys[i];
3293 if (!ss->early_init) 3859 if (!ss->early_init)
3294 cgroup_init_subsys(ss); 3860 cgroup_init_subsys(ss);
3295 if (ss->use_id) 3861 if (ss->use_id)
3296 cgroup_subsys_init_idr(ss); 3862 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
3297 } 3863 }
3298 3864
3299 /* Add init_css_set to the hash table */ 3865 /* Add init_css_set to the hash table */
@@ -3397,9 +3963,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3397 int i; 3963 int i;
3398 3964
3399 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 3965 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
3966 /*
3967 * ideally we don't want subsystems moving around while we do this.
3968 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
3969 * subsys/hierarchy state.
3970 */
3400 mutex_lock(&cgroup_mutex); 3971 mutex_lock(&cgroup_mutex);
3401 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3972 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3402 struct cgroup_subsys *ss = subsys[i]; 3973 struct cgroup_subsys *ss = subsys[i];
3974 if (ss == NULL)
3975 continue;
3403 seq_printf(m, "%s\t%d\t%d\t%d\n", 3976 seq_printf(m, "%s\t%d\t%d\t%d\n",
3404 ss->name, ss->root->hierarchy_id, 3977 ss->name, ss->root->hierarchy_id,
3405 ss->root->number_of_cgroups, !ss->disabled); 3978 ss->root->number_of_cgroups, !ss->disabled);
@@ -3457,7 +4030,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
3457{ 4030{
3458 if (need_forkexit_callback) { 4031 if (need_forkexit_callback) {
3459 int i; 4032 int i;
3460 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4033 /*
4034 * forkexit callbacks are only supported for builtin
4035 * subsystems, and the builtin section of the subsys array is
4036 * immutable, so we don't need to lock the subsys array here.
4037 */
4038 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3461 struct cgroup_subsys *ss = subsys[i]; 4039 struct cgroup_subsys *ss = subsys[i];
3462 if (ss->fork) 4040 if (ss->fork)
3463 ss->fork(ss, child); 4041 ss->fork(ss, child);
@@ -3526,7 +4104,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
3526 struct css_set *cg; 4104 struct css_set *cg;
3527 4105
3528 if (run_callbacks && need_forkexit_callback) { 4106 if (run_callbacks && need_forkexit_callback) {
3529 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4107 /*
4108 * modular subsystems can't use callbacks, so no need to lock
4109 * the subsys array
4110 */
4111 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3530 struct cgroup_subsys *ss = subsys[i]; 4112 struct cgroup_subsys *ss = subsys[i];
3531 if (ss->exit) 4113 if (ss->exit)
3532 ss->exit(ss, tsk); 4114 ss->exit(ss, tsk);
@@ -3720,12 +4302,13 @@ static void check_for_release(struct cgroup *cgrp)
3720 } 4302 }
3721} 4303}
3722 4304
3723void __css_put(struct cgroup_subsys_state *css) 4305/* Caller must verify that the css is not for root cgroup */
4306void __css_put(struct cgroup_subsys_state *css, int count)
3724{ 4307{
3725 struct cgroup *cgrp = css->cgroup; 4308 struct cgroup *cgrp = css->cgroup;
3726 int val; 4309 int val;
3727 rcu_read_lock(); 4310 rcu_read_lock();
3728 val = atomic_dec_return(&css->refcnt); 4311 val = atomic_sub_return(count, &css->refcnt);
3729 if (val == 1) { 4312 if (val == 1) {
3730 if (notify_on_release(cgrp)) { 4313 if (notify_on_release(cgrp)) {
3731 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4314 set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3736,6 +4319,7 @@ void __css_put(struct cgroup_subsys_state *css)
3736 rcu_read_unlock(); 4319 rcu_read_unlock();
3737 WARN_ON_ONCE(val < 1); 4320 WARN_ON_ONCE(val < 1);
3738} 4321}
4322EXPORT_SYMBOL_GPL(__css_put);
3739 4323
3740/* 4324/*
3741 * Notify userspace when a cgroup is released, by running the 4325 * Notify userspace when a cgroup is released, by running the
@@ -3817,8 +4401,11 @@ static int __init cgroup_disable(char *str)
3817 while ((token = strsep(&str, ",")) != NULL) { 4401 while ((token = strsep(&str, ",")) != NULL) {
3818 if (!*token) 4402 if (!*token)
3819 continue; 4403 continue;
3820 4404 /*
3821 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4405 * cgroup_disable, being at boot time, can't know about module
4406 * subsystems, so we don't worry about them.
4407 */
4408 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3822 struct cgroup_subsys *ss = subsys[i]; 4409 struct cgroup_subsys *ss = subsys[i];
3823 4410
3824 if (!strcmp(token, ss->name)) { 4411 if (!strcmp(token, ss->name)) {
@@ -3848,6 +4435,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
3848 return cssid->id; 4435 return cssid->id;
3849 return 0; 4436 return 0;
3850} 4437}
4438EXPORT_SYMBOL_GPL(css_id);
3851 4439
3852unsigned short css_depth(struct cgroup_subsys_state *css) 4440unsigned short css_depth(struct cgroup_subsys_state *css)
3853{ 4441{
@@ -3857,6 +4445,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
3857 return cssid->depth; 4445 return cssid->depth;
3858 return 0; 4446 return 0;
3859} 4447}
4448EXPORT_SYMBOL_GPL(css_depth);
3860 4449
3861bool css_is_ancestor(struct cgroup_subsys_state *child, 4450bool css_is_ancestor(struct cgroup_subsys_state *child,
3862 const struct cgroup_subsys_state *root) 4451 const struct cgroup_subsys_state *root)
@@ -3893,6 +4482,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
3893 spin_unlock(&ss->id_lock); 4482 spin_unlock(&ss->id_lock);
3894 call_rcu(&id->rcu_head, __free_css_id_cb); 4483 call_rcu(&id->rcu_head, __free_css_id_cb);
3895} 4484}
4485EXPORT_SYMBOL_GPL(free_css_id);
3896 4486
3897/* 4487/*
3898 * This is called by init or create(). Then, calls to this function are 4488 * This is called by init or create(). Then, calls to this function are
@@ -3942,15 +4532,14 @@ err_out:
3942 4532
3943} 4533}
3944 4534
3945static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) 4535static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4536 struct cgroup_subsys_state *rootcss)
3946{ 4537{
3947 struct css_id *newid; 4538 struct css_id *newid;
3948 struct cgroup_subsys_state *rootcss;
3949 4539
3950 spin_lock_init(&ss->id_lock); 4540 spin_lock_init(&ss->id_lock);
3951 idr_init(&ss->idr); 4541 idr_init(&ss->idr);
3952 4542
3953 rootcss = init_css_set.subsys[ss->subsys_id];
3954 newid = get_new_cssid(ss, 0); 4543 newid = get_new_cssid(ss, 0);
3955 if (IS_ERR(newid)) 4544 if (IS_ERR(newid))
3956 return PTR_ERR(newid); 4545 return PTR_ERR(newid);
@@ -4010,6 +4599,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
4010 4599
4011 return rcu_dereference(cssid->css); 4600 return rcu_dereference(cssid->css);
4012} 4601}
4602EXPORT_SYMBOL_GPL(css_lookup);
4013 4603
4014/** 4604/**
4015 * css_get_next - lookup next cgroup under specified hierarchy. 4605 * css_get_next - lookup next cgroup under specified hierarchy.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 677f25376a38..f8cced2692b3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -338,7 +338,7 @@ int __cpuinit cpu_up(unsigned int cpu)
338 if (!cpu_possible(cpu)) { 338 if (!cpu_possible(cpu)) {
339 printk(KERN_ERR "can't online cpu %d because it is not " 339 printk(KERN_ERR "can't online cpu %d because it is not "
340 "configured as may-hotadd at boot time\n", cpu); 340 "configured as may-hotadd at boot time\n", cpu);
341#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 341#if defined(CONFIG_IA64)
342 printk(KERN_ERR "please check additional_cpus= boot " 342 printk(KERN_ERR "please check additional_cpus= boot "
343 "parameter\n"); 343 "parameter\n");
344#endif 344#endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ba401fab459f..d10946748ec2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 * call to guarantee_online_mems(), as we know no one is changing 920 * call to guarantee_online_mems(), as we know no one is changing
921 * our task's cpuset. 921 * our task's cpuset.
922 * 922 *
923 * Hold callback_mutex around the two modifications of our tasks
924 * mems_allowed to synchronize with cpuset_mems_allowed().
925 *
926 * While the mm_struct we are migrating is typically from some 923 * While the mm_struct we are migrating is typically from some
927 * other task, the task_struct mems_allowed that we are hacking 924 * other task, the task_struct mems_allowed that we are hacking
928 * is for our current task, which must allocate new pages for that 925 * is for our current task, which must allocate new pages for that
@@ -973,15 +970,20 @@ static void cpuset_change_nodemask(struct task_struct *p,
973 struct cpuset *cs; 970 struct cpuset *cs;
974 int migrate; 971 int migrate;
975 const nodemask_t *oldmem = scan->data; 972 const nodemask_t *oldmem = scan->data;
976 nodemask_t newmems; 973 NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
974
975 if (!newmems)
976 return;
977 977
978 cs = cgroup_cs(scan->cg); 978 cs = cgroup_cs(scan->cg);
979 guarantee_online_mems(cs, &newmems); 979 guarantee_online_mems(cs, newmems);
980 980
981 task_lock(p); 981 task_lock(p);
982 cpuset_change_task_nodemask(p, &newmems); 982 cpuset_change_task_nodemask(p, newmems);
983 task_unlock(p); 983 task_unlock(p);
984 984
985 NODEMASK_FREE(newmems);
986
985 mm = get_task_mm(p); 987 mm = get_task_mm(p);
986 if (!mm) 988 if (!mm)
987 return; 989 return;
@@ -1051,16 +1053,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1051static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 1053static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1052 const char *buf) 1054 const char *buf)
1053{ 1055{
1054 nodemask_t oldmem; 1056 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1055 int retval; 1057 int retval;
1056 struct ptr_heap heap; 1058 struct ptr_heap heap;
1057 1059
1060 if (!oldmem)
1061 return -ENOMEM;
1062
1058 /* 1063 /*
1059 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1064 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
1060 * it's read-only 1065 * it's read-only
1061 */ 1066 */
1062 if (cs == &top_cpuset) 1067 if (cs == &top_cpuset) {
1063 return -EACCES; 1068 retval = -EACCES;
1069 goto done;
1070 }
1064 1071
1065 /* 1072 /*
1066 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 1073 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
@@ -1076,11 +1083,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1076 goto done; 1083 goto done;
1077 1084
1078 if (!nodes_subset(trialcs->mems_allowed, 1085 if (!nodes_subset(trialcs->mems_allowed,
1079 node_states[N_HIGH_MEMORY])) 1086 node_states[N_HIGH_MEMORY])) {
1080 return -EINVAL; 1087 retval = -EINVAL;
1088 goto done;
1089 }
1081 } 1090 }
1082 oldmem = cs->mems_allowed; 1091 *oldmem = cs->mems_allowed;
1083 if (nodes_equal(oldmem, trialcs->mems_allowed)) { 1092 if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
1084 retval = 0; /* Too easy - nothing to do */ 1093 retval = 0; /* Too easy - nothing to do */
1085 goto done; 1094 goto done;
1086 } 1095 }
@@ -1096,10 +1105,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1096 cs->mems_allowed = trialcs->mems_allowed; 1105 cs->mems_allowed = trialcs->mems_allowed;
1097 mutex_unlock(&callback_mutex); 1106 mutex_unlock(&callback_mutex);
1098 1107
1099 update_tasks_nodemask(cs, &oldmem, &heap); 1108 update_tasks_nodemask(cs, oldmem, &heap);
1100 1109
1101 heap_free(&heap); 1110 heap_free(&heap);
1102done: 1111done:
1112 NODEMASK_FREE(oldmem);
1103 return retval; 1113 return retval;
1104} 1114}
1105 1115
@@ -1384,40 +1394,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1384 struct cgroup *oldcont, struct task_struct *tsk, 1394 struct cgroup *oldcont, struct task_struct *tsk,
1385 bool threadgroup) 1395 bool threadgroup)
1386{ 1396{
1387 nodemask_t from, to;
1388 struct mm_struct *mm; 1397 struct mm_struct *mm;
1389 struct cpuset *cs = cgroup_cs(cont); 1398 struct cpuset *cs = cgroup_cs(cont);
1390 struct cpuset *oldcs = cgroup_cs(oldcont); 1399 struct cpuset *oldcs = cgroup_cs(oldcont);
1400 NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
1401 NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
1402
1403 if (from == NULL || to == NULL)
1404 goto alloc_fail;
1391 1405
1392 if (cs == &top_cpuset) { 1406 if (cs == &top_cpuset) {
1393 cpumask_copy(cpus_attach, cpu_possible_mask); 1407 cpumask_copy(cpus_attach, cpu_possible_mask);
1394 to = node_possible_map;
1395 } else { 1408 } else {
1396 guarantee_online_cpus(cs, cpus_attach); 1409 guarantee_online_cpus(cs, cpus_attach);
1397 guarantee_online_mems(cs, &to);
1398 } 1410 }
1411 guarantee_online_mems(cs, to);
1399 1412
1400 /* do per-task migration stuff possibly for each in the threadgroup */ 1413 /* do per-task migration stuff possibly for each in the threadgroup */
1401 cpuset_attach_task(tsk, &to, cs); 1414 cpuset_attach_task(tsk, to, cs);
1402 if (threadgroup) { 1415 if (threadgroup) {
1403 struct task_struct *c; 1416 struct task_struct *c;
1404 rcu_read_lock(); 1417 rcu_read_lock();
1405 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1418 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1406 cpuset_attach_task(c, &to, cs); 1419 cpuset_attach_task(c, to, cs);
1407 } 1420 }
1408 rcu_read_unlock(); 1421 rcu_read_unlock();
1409 } 1422 }
1410 1423
1411 /* change mm; only needs to be done once even if threadgroup */ 1424 /* change mm; only needs to be done once even if threadgroup */
1412 from = oldcs->mems_allowed; 1425 *from = oldcs->mems_allowed;
1413 to = cs->mems_allowed; 1426 *to = cs->mems_allowed;
1414 mm = get_task_mm(tsk); 1427 mm = get_task_mm(tsk);
1415 if (mm) { 1428 if (mm) {
1416 mpol_rebind_mm(mm, &to); 1429 mpol_rebind_mm(mm, to);
1417 if (is_memory_migrate(cs)) 1430 if (is_memory_migrate(cs))
1418 cpuset_migrate_mm(mm, &from, &to); 1431 cpuset_migrate_mm(mm, from, to);
1419 mmput(mm); 1432 mmput(mm);
1420 } 1433 }
1434
1435alloc_fail:
1436 NODEMASK_FREE(from);
1437 NODEMASK_FREE(to);
1421} 1438}
1422 1439
1423/* The various types of files and directories in a cpuset file system */ 1440/* The various types of files and directories in a cpuset file system */
@@ -1562,13 +1579,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1562 1579
1563static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1580static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1564{ 1581{
1565 nodemask_t mask; 1582 NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
1583 int retval;
1584
1585 if (mask == NULL)
1586 return -ENOMEM;
1566 1587
1567 mutex_lock(&callback_mutex); 1588 mutex_lock(&callback_mutex);
1568 mask = cs->mems_allowed; 1589 *mask = cs->mems_allowed;
1569 mutex_unlock(&callback_mutex); 1590 mutex_unlock(&callback_mutex);
1570 1591
1571 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1592 retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
1593
1594 NODEMASK_FREE(mask);
1595
1596 return retval;
1572} 1597}
1573 1598
1574static ssize_t cpuset_common_file_read(struct cgroup *cont, 1599static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1997,7 +2022,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1997 struct cpuset *cp; /* scans cpusets being updated */ 2022 struct cpuset *cp; /* scans cpusets being updated */
1998 struct cpuset *child; /* scans child cpusets of cp */ 2023 struct cpuset *child; /* scans child cpusets of cp */
1999 struct cgroup *cont; 2024 struct cgroup *cont;
2000 nodemask_t oldmems; 2025 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2026
2027 if (oldmems == NULL)
2028 return;
2001 2029
2002 list_add_tail((struct list_head *)&root->stack_list, &queue); 2030 list_add_tail((struct list_head *)&root->stack_list, &queue);
2003 2031
@@ -2014,7 +2042,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2042 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2015 continue; 2043 continue;
2016 2044
2017 oldmems = cp->mems_allowed; 2045 *oldmems = cp->mems_allowed;
2018 2046
2019 /* Remove offline cpus and mems from this cpuset. */ 2047 /* Remove offline cpus and mems from this cpuset. */
2020 mutex_lock(&callback_mutex); 2048 mutex_lock(&callback_mutex);
@@ -2030,9 +2058,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2030 remove_tasks_in_empty_cpuset(cp); 2058 remove_tasks_in_empty_cpuset(cp);
2031 else { 2059 else {
2032 update_tasks_cpumask(cp, NULL); 2060 update_tasks_cpumask(cp, NULL);
2033 update_tasks_nodemask(cp, &oldmems, NULL); 2061 update_tasks_nodemask(cp, oldmems, NULL);
2034 } 2062 }
2035 } 2063 }
2064 NODEMASK_FREE(oldmems);
2036} 2065}
2037 2066
2038/* 2067/*
@@ -2090,20 +2119,33 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2090static int cpuset_track_online_nodes(struct notifier_block *self, 2119static int cpuset_track_online_nodes(struct notifier_block *self,
2091 unsigned long action, void *arg) 2120 unsigned long action, void *arg)
2092{ 2121{
2122 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2123
2124 if (oldmems == NULL)
2125 return NOTIFY_DONE;
2126
2093 cgroup_lock(); 2127 cgroup_lock();
2094 switch (action) { 2128 switch (action) {
2095 case MEM_ONLINE: 2129 case MEM_ONLINE:
2096 case MEM_OFFLINE: 2130 *oldmems = top_cpuset.mems_allowed;
2097 mutex_lock(&callback_mutex); 2131 mutex_lock(&callback_mutex);
2098 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2132 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2099 mutex_unlock(&callback_mutex); 2133 mutex_unlock(&callback_mutex);
2100 if (action == MEM_OFFLINE) 2134 update_tasks_nodemask(&top_cpuset, oldmems, NULL);
2101 scan_for_empty_cpusets(&top_cpuset); 2135 break;
2136 case MEM_OFFLINE:
2137 /*
2138 * needn't update top_cpuset.mems_allowed explicitly because
2139 * scan_for_empty_cpusets() will update it.
2140 */
2141 scan_for_empty_cpusets(&top_cpuset);
2102 break; 2142 break;
2103 default: 2143 default:
2104 break; 2144 break;
2105 } 2145 }
2106 cgroup_unlock(); 2146 cgroup_unlock();
2147
2148 NODEMASK_FREE(oldmems);
2107 return NOTIFY_OK; 2149 return NOTIFY_OK;
2108} 2150}
2109#endif 2151#endif
diff --git a/kernel/cred.c b/kernel/cred.c
index 1ed8ca18790c..1b1129d0cce8 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -364,7 +364,7 @@ struct cred *prepare_usermodehelper_creds(void)
364 364
365 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC); 365 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
366 if (!new) 366 if (!new)
367 return NULL; 367 goto free_tgcred;
368 368
369 kdebug("prepare_usermodehelper_creds() alloc %p", new); 369 kdebug("prepare_usermodehelper_creds() alloc %p", new);
370 370
@@ -397,6 +397,10 @@ struct cred *prepare_usermodehelper_creds(void)
397 397
398error: 398error:
399 put_cred(new); 399 put_cred(new);
400free_tgcred:
401#ifdef CONFIG_KEYS
402 kfree(tgcred);
403#endif
400 return NULL; 404 return NULL;
401} 405}
402 406
diff --git a/kernel/early_res.c b/kernel/early_res.c
new file mode 100644
index 000000000000..31aa9332ef3f
--- /dev/null
+++ b/kernel/early_res.c
@@ -0,0 +1,584 @@
1/*
2 * early_res, could be used to replace bootmem
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/bootmem.h>
8#include <linux/mm.h>
9#include <linux/early_res.h>
10
11/*
12 * Early reserved memory areas.
13 */
14/*
15 * need to make sure this one is big enough before
16 * find_fw_memmap_area could be used
17 */
18#define MAX_EARLY_RES_X 32
19
20struct early_res {
21 u64 start, end;
22 char name[15];
23 char overlap_ok;
24};
25static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
26
27static int max_early_res __initdata = MAX_EARLY_RES_X;
28static struct early_res *early_res __initdata = &early_res_x[0];
29static int early_res_count __initdata;
30
31static int __init find_overlapped_early(u64 start, u64 end)
32{
33 int i;
34 struct early_res *r;
35
36 for (i = 0; i < max_early_res && early_res[i].end; i++) {
37 r = &early_res[i];
38 if (end > r->start && start < r->end)
39 break;
40 }
41
42 return i;
43}
44
45/*
46 * Drop the i-th range from the early reservation map,
47 * by copying any higher ranges down one over it, and
48 * clearing what had been the last slot.
49 */
50static void __init drop_range(int i)
51{
52 int j;
53
54 for (j = i + 1; j < max_early_res && early_res[j].end; j++)
55 ;
56
57 memmove(&early_res[i], &early_res[i + 1],
58 (j - 1 - i) * sizeof(struct early_res));
59
60 early_res[j - 1].end = 0;
61 early_res_count--;
62}
63
64static void __init drop_range_partial(int i, u64 start, u64 end)
65{
66 u64 common_start, common_end;
67 u64 old_start, old_end;
68
69 old_start = early_res[i].start;
70 old_end = early_res[i].end;
71 common_start = max(old_start, start);
72 common_end = min(old_end, end);
73
74 /* no overlap ? */
75 if (common_start >= common_end)
76 return;
77
78 if (old_start < common_start) {
79 /* make head segment */
80 early_res[i].end = common_start;
81 if (old_end > common_end) {
82 char name[15];
83
84 /*
85 * Save a local copy of the name, since the
86 * early_res array could get resized inside
87 * reserve_early_without_check() ->
88 * __check_and_double_early_res(), which would
89 * make the current name pointer invalid.
90 */
91 strncpy(name, early_res[i].name,
92 sizeof(early_res[i].name) - 1);
93 /* add another for left over on tail */
94 reserve_early_without_check(common_end, old_end, name);
95 }
96 return;
97 } else {
98 if (old_end > common_end) {
99 /* reuse the entry for tail left */
100 early_res[i].start = common_end;
101 return;
102 }
103 /* all covered */
104 drop_range(i);
105 }
106}
107
108/*
109 * Split any existing ranges that:
110 * 1) are marked 'overlap_ok', and
111 * 2) overlap with the stated range [start, end)
112 * into whatever portion (if any) of the existing range is entirely
113 * below or entirely above the stated range. Drop the portion
114 * of the existing range that overlaps with the stated range,
115 * which will allow the caller of this routine to then add that
116 * stated range without conflicting with any existing range.
117 */
118static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
119{
120 int i;
121 struct early_res *r;
122 u64 lower_start, lower_end;
123 u64 upper_start, upper_end;
124 char name[15];
125
126 for (i = 0; i < max_early_res && early_res[i].end; i++) {
127 r = &early_res[i];
128
129 /* Continue past non-overlapping ranges */
130 if (end <= r->start || start >= r->end)
131 continue;
132
133 /*
134 * Leave non-ok overlaps as is; let caller
135 * panic "Overlapping early reservations"
136 * when it hits this overlap.
137 */
138 if (!r->overlap_ok)
139 return;
140
141 /*
142 * We have an ok overlap. We will drop it from the early
143 * reservation map, and add back in any non-overlapping
144 * portions (lower or upper) as separate, overlap_ok,
145 * non-overlapping ranges.
146 */
147
148 /* 1. Note any non-overlapping (lower or upper) ranges. */
149 strncpy(name, r->name, sizeof(name) - 1);
150
151 lower_start = lower_end = 0;
152 upper_start = upper_end = 0;
153 if (r->start < start) {
154 lower_start = r->start;
155 lower_end = start;
156 }
157 if (r->end > end) {
158 upper_start = end;
159 upper_end = r->end;
160 }
161
162 /* 2. Drop the original ok overlapping range */
163 drop_range(i);
164
165 i--; /* resume for-loop on copied down entry */
166
167 /* 3. Add back in any non-overlapping ranges. */
168 if (lower_end)
169 reserve_early_overlap_ok(lower_start, lower_end, name);
170 if (upper_end)
171 reserve_early_overlap_ok(upper_start, upper_end, name);
172 }
173}
174
175static void __init __reserve_early(u64 start, u64 end, char *name,
176 int overlap_ok)
177{
178 int i;
179 struct early_res *r;
180
181 i = find_overlapped_early(start, end);
182 if (i >= max_early_res)
183 panic("Too many early reservations");
184 r = &early_res[i];
185 if (r->end)
186 panic("Overlapping early reservations "
187 "%llx-%llx %s to %llx-%llx %s\n",
188 start, end - 1, name ? name : "", r->start,
189 r->end - 1, r->name);
190 r->start = start;
191 r->end = end;
192 r->overlap_ok = overlap_ok;
193 if (name)
194 strncpy(r->name, name, sizeof(r->name) - 1);
195 early_res_count++;
196}
197
198/*
199 * A few early reservations come here.
200 *
201 * The 'overlap_ok' in the name of this routine does -not- mean it
202 * is ok for these reservations to overlap an earlier reservation.
203 * Rather it means that it is ok for subsequent reservations to
204 * overlap this one.
205 *
206 * Use this entry point to reserve early ranges when you are doing
207 * so out of "Paranoia", reserving perhaps more memory than you need,
208 * just in case, and don't mind a subsequent overlapping reservation
209 * that is known to be needed.
210 *
211 * The drop_overlaps_that_are_ok() call here isn't really needed.
212 * It would be needed if we had two colliding 'overlap_ok'
213 * reservations, so that the second such would not panic on the
214 * overlap with the first. We don't have any such as of this
215 * writing, but might as well tolerate such if it happens in
216 * the future.
217 */
218void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
219{
220 drop_overlaps_that_are_ok(start, end);
221 __reserve_early(start, end, name, 1);
222}
223
224static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
225{
226 u64 start, end, size, mem;
227 struct early_res *new;
228
229 /* do we have enough slots left ? */
230 if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
231 return;
232
233 /* double it */
234 mem = -1ULL;
235 size = sizeof(struct early_res) * max_early_res * 2;
236 if (early_res == early_res_x)
237 start = 0;
238 else
239 start = early_res[0].end;
240 end = ex_start;
241 if (start + size < end)
242 mem = find_fw_memmap_area(start, end, size,
243 sizeof(struct early_res));
244 if (mem == -1ULL) {
245 start = ex_end;
246 end = get_max_mapped();
247 if (start + size < end)
248 mem = find_fw_memmap_area(start, end, size,
249 sizeof(struct early_res));
250 }
251 if (mem == -1ULL)
252 panic("can not find more space for early_res array");
253
254 new = __va(mem);
255 /* save the first one for own */
256 new[0].start = mem;
257 new[0].end = mem + size;
258 new[0].overlap_ok = 0;
259 /* copy old to new */
260 if (early_res == early_res_x) {
261 memcpy(&new[1], &early_res[0],
262 sizeof(struct early_res) * max_early_res);
263 memset(&new[max_early_res+1], 0,
264 sizeof(struct early_res) * (max_early_res - 1));
265 early_res_count++;
266 } else {
267 memcpy(&new[1], &early_res[1],
268 sizeof(struct early_res) * (max_early_res - 1));
269 memset(&new[max_early_res], 0,
270 sizeof(struct early_res) * max_early_res);
271 }
272 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
273 early_res = new;
274 max_early_res *= 2;
275 printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
276 max_early_res, mem, mem + size - 1);
277}
278
279/*
280 * Most early reservations come here.
281 *
282 * We first have drop_overlaps_that_are_ok() drop any pre-existing
283 * 'overlap_ok' ranges, so that we can then reserve this memory
284 * range without risk of panic'ing on an overlapping overlap_ok
285 * early reservation.
286 */
287void __init reserve_early(u64 start, u64 end, char *name)
288{
289 if (start >= end)
290 return;
291
292 __check_and_double_early_res(start, end);
293
294 drop_overlaps_that_are_ok(start, end);
295 __reserve_early(start, end, name, 0);
296}
297
298void __init reserve_early_without_check(u64 start, u64 end, char *name)
299{
300 struct early_res *r;
301
302 if (start >= end)
303 return;
304
305 __check_and_double_early_res(start, end);
306
307 r = &early_res[early_res_count];
308
309 r->start = start;
310 r->end = end;
311 r->overlap_ok = 0;
312 if (name)
313 strncpy(r->name, name, sizeof(r->name) - 1);
314 early_res_count++;
315}
316
317void __init free_early(u64 start, u64 end)
318{
319 struct early_res *r;
320 int i;
321
322 i = find_overlapped_early(start, end);
323 r = &early_res[i];
324 if (i >= max_early_res || r->end != end || r->start != start)
325 panic("free_early on not reserved area: %llx-%llx!",
326 start, end - 1);
327
328 drop_range(i);
329}
330
331void __init free_early_partial(u64 start, u64 end)
332{
333 struct early_res *r;
334 int i;
335
336 if (start == end)
337 return;
338
339 if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end))
340 return;
341
342try_next:
343 i = find_overlapped_early(start, end);
344 if (i >= max_early_res)
345 return;
346
347 r = &early_res[i];
348 /* hole ? */
349 if (r->end >= end && r->start <= start) {
350 drop_range_partial(i, start, end);
351 return;
352 }
353
354 drop_range_partial(i, start, end);
355 goto try_next;
356}
357
358#ifdef CONFIG_NO_BOOTMEM
359static void __init subtract_early_res(struct range *range, int az)
360{
361 int i, count;
362 u64 final_start, final_end;
363 int idx = 0;
364
365 count = 0;
366 for (i = 0; i < max_early_res && early_res[i].end; i++)
367 count++;
368
369 /* need to skip first one ?*/
370 if (early_res != early_res_x)
371 idx = 1;
372
373#define DEBUG_PRINT_EARLY_RES 1
374
375#if DEBUG_PRINT_EARLY_RES
376 printk(KERN_INFO "Subtract (%d early reservations)\n", count);
377#endif
378 for (i = idx; i < count; i++) {
379 struct early_res *r = &early_res[i];
380#if DEBUG_PRINT_EARLY_RES
381 printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
382 r->start, r->end, r->name);
383#endif
384 final_start = PFN_DOWN(r->start);
385 final_end = PFN_UP(r->end);
386 if (final_start >= final_end)
387 continue;
388 subtract_range(range, az, final_start, final_end);
389 }
390
391}
392
393int __init get_free_all_memory_range(struct range **rangep, int nodeid)
394{
395 int i, count;
396 u64 start = 0, end;
397 u64 size;
398 u64 mem;
399 struct range *range;
400 int nr_range;
401
402 count = 0;
403 for (i = 0; i < max_early_res && early_res[i].end; i++)
404 count++;
405
406 count *= 2;
407
408 size = sizeof(struct range) * count;
409 end = get_max_mapped();
410#ifdef MAX_DMA32_PFN
411 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
412 start = MAX_DMA32_PFN << PAGE_SHIFT;
413#endif
414 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
415 if (mem == -1ULL)
416 panic("can not find more space for range free");
417
418 range = __va(mem);
419 /* use early_node_map[] and early_res to get range array at first */
420 memset(range, 0, size);
421 nr_range = 0;
422
423 /* need to go over early_node_map to find out good range for node */
424 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
425#ifdef CONFIG_X86_32
426 subtract_range(range, count, max_low_pfn, -1ULL);
427#endif
428 subtract_early_res(range, count);
429 nr_range = clean_sort_range(range, count);
430
431 /* need to clear it ? */
432 if (nodeid == MAX_NUMNODES) {
433 memset(&early_res[0], 0,
434 sizeof(struct early_res) * max_early_res);
435 early_res = NULL;
436 max_early_res = 0;
437 }
438
439 *rangep = range;
440 return nr_range;
441}
442#else
443void __init early_res_to_bootmem(u64 start, u64 end)
444{
445 int i, count;
446 u64 final_start, final_end;
447 int idx = 0;
448
449 count = 0;
450 for (i = 0; i < max_early_res && early_res[i].end; i++)
451 count++;
452
453 /* need to skip first one ?*/
454 if (early_res != early_res_x)
455 idx = 1;
456
457 printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
458 count - idx, max_early_res, start, end);
459 for (i = idx; i < count; i++) {
460 struct early_res *r = &early_res[i];
461 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
462 r->start, r->end, r->name);
463 final_start = max(start, r->start);
464 final_end = min(end, r->end);
465 if (final_start >= final_end) {
466 printk(KERN_CONT "\n");
467 continue;
468 }
469 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
470 final_start, final_end);
471 reserve_bootmem_generic(final_start, final_end - final_start,
472 BOOTMEM_DEFAULT);
473 }
474 /* clear them */
475 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
476 early_res = NULL;
477 max_early_res = 0;
478 early_res_count = 0;
479}
480#endif
481
482/* Check for already reserved areas */
483static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
484{
485 int i;
486 u64 addr = *addrp;
487 int changed = 0;
488 struct early_res *r;
489again:
490 i = find_overlapped_early(addr, addr + size);
491 r = &early_res[i];
492 if (i < max_early_res && r->end) {
493 *addrp = addr = round_up(r->end, align);
494 changed = 1;
495 goto again;
496 }
497 return changed;
498}
499
500/* Check for already reserved areas */
501static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
502{
503 int i;
504 u64 addr = *addrp, last;
505 u64 size = *sizep;
506 int changed = 0;
507again:
508 last = addr + size;
509 for (i = 0; i < max_early_res && early_res[i].end; i++) {
510 struct early_res *r = &early_res[i];
511 if (last > r->start && addr < r->start) {
512 size = r->start - addr;
513 changed = 1;
514 goto again;
515 }
516 if (last > r->end && addr < r->end) {
517 addr = round_up(r->end, align);
518 size = last - addr;
519 changed = 1;
520 goto again;
521 }
522 if (last <= r->end && addr >= r->start) {
523 (*sizep)++;
524 return 0;
525 }
526 }
527 if (changed) {
528 *addrp = addr;
529 *sizep = size;
530 }
531 return changed;
532}
533
534/*
535 * Find a free area with specified alignment in a specific range.
536 * only with the area between start to end is active range from early_node_map
537 * so they are good as RAM
538 */
539u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
540 u64 size, u64 align)
541{
542 u64 addr, last;
543
544 addr = round_up(ei_start, align);
545 if (addr < start)
546 addr = round_up(start, align);
547 if (addr >= ei_last)
548 goto out;
549 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
550 ;
551 last = addr + size;
552 if (last > ei_last)
553 goto out;
554 if (last > end)
555 goto out;
556
557 return addr;
558
559out:
560 return -1ULL;
561}
562
563u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
564 u64 *sizep, u64 align)
565{
566 u64 addr, last;
567
568 addr = round_up(ei_start, align);
569 if (addr < start)
570 addr = round_up(start, align);
571 if (addr >= ei_last)
572 goto out;
573 *sizep = ei_last - addr;
574 while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
575 ;
576 last = addr + *sizep;
577 if (last > ei_last)
578 goto out;
579
580 return addr;
581
582out:
583 return -1ULL;
584}
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
new file mode 100644
index 000000000000..ff915efef66d
--- /dev/null
+++ b/kernel/elfcore.c
@@ -0,0 +1,28 @@
1#include <linux/elf.h>
2#include <linux/fs.h>
3#include <linux/mm.h>
4
5#include <asm/elf.h>
6
7
8Elf_Half __weak elf_core_extra_phdrs(void)
9{
10 return 0;
11}
12
13int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
14 unsigned long limit)
15{
16 return 1;
17}
18
19int __weak elf_core_write_extra_data(struct file *file, size_t *size,
20 unsigned long limit)
21{
22 return 1;
23}
24
25size_t __weak elf_core_extra_data_size(void)
26{
27 return 0;
28}
diff --git a/kernel/exit.c b/kernel/exit.c
index 45ed043b8bf5..cce59cb5ee6a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -87,7 +87,7 @@ static void __exit_signal(struct task_struct *tsk)
87 87
88 sighand = rcu_dereference_check(tsk->sighand, 88 sighand = rcu_dereference_check(tsk->sighand,
89 rcu_read_lock_held() || 89 rcu_read_lock_held() ||
90 lockdep_is_held(&tasklist_lock)); 90 lockdep_tasklist_lock_is_held());
91 spin_lock(&sighand->siglock); 91 spin_lock(&sighand->siglock);
92 92
93 posix_cpu_timers_exit(tsk); 93 posix_cpu_timers_exit(tsk);
@@ -952,7 +952,8 @@ NORET_TYPE void do_exit(long code)
952 preempt_count()); 952 preempt_count());
953 953
954 acct_update_integrals(tsk); 954 acct_update_integrals(tsk);
955 955 /* sync mm's RSS info before statistics gathering */
956 sync_mm_rss(tsk, tsk->mm);
956 group_dead = atomic_dec_and_test(&tsk->signal->live); 957 group_dead = atomic_dec_and_test(&tsk->signal->live);
957 if (group_dead) { 958 if (group_dead) {
958 hrtimer_cancel(&tsk->signal->real_timer); 959 hrtimer_cancel(&tsk->signal->real_timer);
@@ -1188,7 +1189,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1188 1189
1189 if (unlikely(wo->wo_flags & WNOWAIT)) { 1190 if (unlikely(wo->wo_flags & WNOWAIT)) {
1190 int exit_code = p->exit_code; 1191 int exit_code = p->exit_code;
1191 int why, status; 1192 int why;
1192 1193
1193 get_task_struct(p); 1194 get_task_struct(p);
1194 read_unlock(&tasklist_lock); 1195 read_unlock(&tasklist_lock);
diff --git a/kernel/fork.c b/kernel/fork.c
index 17bbf093356d..4799c5f0e6d0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -86,7 +86,14 @@ int max_threads; /* tunable limit on nr_threads */
86DEFINE_PER_CPU(unsigned long, process_counts) = 0; 86DEFINE_PER_CPU(unsigned long, process_counts) = 0;
87 87
88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
89EXPORT_SYMBOL_GPL(tasklist_lock); 89
90#ifdef CONFIG_PROVE_RCU
91int lockdep_tasklist_lock_is_held(void)
92{
93 return lockdep_is_held(&tasklist_lock);
94}
95EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
96#endif /* #ifdef CONFIG_PROVE_RCU */
90 97
91int nr_processes(void) 98int nr_processes(void)
92{ 99{
@@ -329,15 +336,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
329 if (!tmp) 336 if (!tmp)
330 goto fail_nomem; 337 goto fail_nomem;
331 *tmp = *mpnt; 338 *tmp = *mpnt;
339 INIT_LIST_HEAD(&tmp->anon_vma_chain);
332 pol = mpol_dup(vma_policy(mpnt)); 340 pol = mpol_dup(vma_policy(mpnt));
333 retval = PTR_ERR(pol); 341 retval = PTR_ERR(pol);
334 if (IS_ERR(pol)) 342 if (IS_ERR(pol))
335 goto fail_nomem_policy; 343 goto fail_nomem_policy;
336 vma_set_policy(tmp, pol); 344 vma_set_policy(tmp, pol);
345 if (anon_vma_fork(tmp, mpnt))
346 goto fail_nomem_anon_vma_fork;
337 tmp->vm_flags &= ~VM_LOCKED; 347 tmp->vm_flags &= ~VM_LOCKED;
338 tmp->vm_mm = mm; 348 tmp->vm_mm = mm;
339 tmp->vm_next = NULL; 349 tmp->vm_next = NULL;
340 anon_vma_link(tmp);
341 file = tmp->vm_file; 350 file = tmp->vm_file;
342 if (file) { 351 if (file) {
343 struct inode *inode = file->f_path.dentry->d_inode; 352 struct inode *inode = file->f_path.dentry->d_inode;
@@ -392,6 +401,8 @@ out:
392 flush_tlb_mm(oldmm); 401 flush_tlb_mm(oldmm);
393 up_write(&oldmm->mmap_sem); 402 up_write(&oldmm->mmap_sem);
394 return retval; 403 return retval;
404fail_nomem_anon_vma_fork:
405 mpol_put(pol);
395fail_nomem_policy: 406fail_nomem_policy:
396 kmem_cache_free(vm_area_cachep, tmp); 407 kmem_cache_free(vm_area_cachep, tmp);
397fail_nomem: 408fail_nomem:
@@ -455,8 +466,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
455 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; 466 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
456 mm->core_state = NULL; 467 mm->core_state = NULL;
457 mm->nr_ptes = 0; 468 mm->nr_ptes = 0;
458 set_mm_counter(mm, file_rss, 0); 469 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
459 set_mm_counter(mm, anon_rss, 0);
460 spin_lock_init(&mm->page_table_lock); 470 spin_lock_init(&mm->page_table_lock);
461 mm->free_area_cache = TASK_UNMAPPED_BASE; 471 mm->free_area_cache = TASK_UNMAPPED_BASE;
462 mm->cached_hole_size = ~0UL; 472 mm->cached_hole_size = ~0UL;
@@ -825,23 +835,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
825 */ 835 */
826static void posix_cpu_timers_init_group(struct signal_struct *sig) 836static void posix_cpu_timers_init_group(struct signal_struct *sig)
827{ 837{
838 unsigned long cpu_limit;
839
828 /* Thread group counters. */ 840 /* Thread group counters. */
829 thread_group_cputime_init(sig); 841 thread_group_cputime_init(sig);
830 842
831 /* Expiration times and increments. */ 843 cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
832 sig->it[CPUCLOCK_PROF].expires = cputime_zero; 844 if (cpu_limit != RLIM_INFINITY) {
833 sig->it[CPUCLOCK_PROF].incr = cputime_zero; 845 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
834 sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
835 sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
836
837 /* Cached expiration times. */
838 sig->cputime_expires.prof_exp = cputime_zero;
839 sig->cputime_expires.virt_exp = cputime_zero;
840 sig->cputime_expires.sched_exp = 0;
841
842 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
843 sig->cputime_expires.prof_exp =
844 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
845 sig->cputimer.running = 1; 846 sig->cputimer.running = 1;
846 } 847 }
847 848
@@ -858,7 +859,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
858 if (clone_flags & CLONE_THREAD) 859 if (clone_flags & CLONE_THREAD)
859 return 0; 860 return 0;
860 861
861 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 862 sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
862 tsk->signal = sig; 863 tsk->signal = sig;
863 if (!sig) 864 if (!sig)
864 return -ENOMEM; 865 return -ENOMEM;
@@ -866,46 +867,21 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
866 atomic_set(&sig->count, 1); 867 atomic_set(&sig->count, 1);
867 atomic_set(&sig->live, 1); 868 atomic_set(&sig->live, 1);
868 init_waitqueue_head(&sig->wait_chldexit); 869 init_waitqueue_head(&sig->wait_chldexit);
869 sig->flags = 0;
870 if (clone_flags & CLONE_NEWPID) 870 if (clone_flags & CLONE_NEWPID)
871 sig->flags |= SIGNAL_UNKILLABLE; 871 sig->flags |= SIGNAL_UNKILLABLE;
872 sig->group_exit_code = 0;
873 sig->group_exit_task = NULL;
874 sig->group_stop_count = 0;
875 sig->curr_target = tsk; 872 sig->curr_target = tsk;
876 init_sigpending(&sig->shared_pending); 873 init_sigpending(&sig->shared_pending);
877 INIT_LIST_HEAD(&sig->posix_timers); 874 INIT_LIST_HEAD(&sig->posix_timers);
878 875
879 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 876 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
880 sig->it_real_incr.tv64 = 0;
881 sig->real_timer.function = it_real_fn; 877 sig->real_timer.function = it_real_fn;
882 878
883 sig->leader = 0; /* session leadership doesn't inherit */
884 sig->tty_old_pgrp = NULL;
885 sig->tty = NULL;
886
887 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
888 sig->gtime = cputime_zero;
889 sig->cgtime = cputime_zero;
890#ifndef CONFIG_VIRT_CPU_ACCOUNTING
891 sig->prev_utime = sig->prev_stime = cputime_zero;
892#endif
893 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
894 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
895 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
896 sig->maxrss = sig->cmaxrss = 0;
897 task_io_accounting_init(&sig->ioac);
898 sig->sum_sched_runtime = 0;
899 taskstats_tgid_init(sig);
900
901 task_lock(current->group_leader); 879 task_lock(current->group_leader);
902 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 880 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
903 task_unlock(current->group_leader); 881 task_unlock(current->group_leader);
904 882
905 posix_cpu_timers_init_group(sig); 883 posix_cpu_timers_init_group(sig);
906 884
907 acct_init_pacct(&sig->pacct);
908
909 tty_audit_fork(sig); 885 tty_audit_fork(sig);
910 886
911 sig->oom_adj = current->signal->oom_adj; 887 sig->oom_adj = current->signal->oom_adj;
@@ -1034,7 +1010,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1034#endif 1010#endif
1035 retval = -EAGAIN; 1011 retval = -EAGAIN;
1036 if (atomic_read(&p->real_cred->user->processes) >= 1012 if (atomic_read(&p->real_cred->user->processes) >=
1037 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 1013 task_rlimit(p, RLIMIT_NPROC)) {
1038 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1014 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
1039 p->real_cred->user != INIT_USER) 1015 p->real_cred->user != INIT_USER)
1040 goto bad_fork_free; 1016 goto bad_fork_free;
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 967e66143e11..03808ed342a6 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -413,17 +413,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
413 * 413 *
414 * @return a set of per_cpu pointers to perf events 414 * @return a set of per_cpu pointers to perf events
415 */ 415 */
416struct perf_event ** 416struct perf_event * __percpu *
417register_wide_hw_breakpoint(struct perf_event_attr *attr, 417register_wide_hw_breakpoint(struct perf_event_attr *attr,
418 perf_overflow_handler_t triggered) 418 perf_overflow_handler_t triggered)
419{ 419{
420 struct perf_event **cpu_events, **pevent, *bp; 420 struct perf_event * __percpu *cpu_events, **pevent, *bp;
421 long err; 421 long err;
422 int cpu; 422 int cpu;
423 423
424 cpu_events = alloc_percpu(typeof(*cpu_events)); 424 cpu_events = alloc_percpu(typeof(*cpu_events));
425 if (!cpu_events) 425 if (!cpu_events)
426 return ERR_PTR(-ENOMEM); 426 return (void __percpu __force *)ERR_PTR(-ENOMEM);
427 427
428 get_online_cpus(); 428 get_online_cpus();
429 for_each_online_cpu(cpu) { 429 for_each_online_cpu(cpu) {
@@ -451,7 +451,7 @@ fail:
451 put_online_cpus(); 451 put_online_cpus();
452 452
453 free_percpu(cpu_events); 453 free_percpu(cpu_events);
454 return ERR_PTR(err); 454 return (void __percpu __force *)ERR_PTR(err);
455} 455}
456EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); 456EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
457 457
@@ -459,7 +459,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
459 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel 459 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
460 * @cpu_events: the per cpu set of events to unregister 460 * @cpu_events: the per cpu set of events to unregister
461 */ 461 */
462void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) 462void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
463{ 463{
464 int cpu; 464 int cpu;
465 struct perf_event **pevent; 465 struct perf_event **pevent;
@@ -489,5 +489,4 @@ struct pmu perf_ops_bp = {
489 .enable = arch_install_hw_breakpoint, 489 .enable = arch_install_hw_breakpoint,
490 .disable = arch_uninstall_hw_breakpoint, 490 .disable = arch_uninstall_hw_breakpoint,
491 .read = hw_breakpoint_pmu_read, 491 .read = hw_breakpoint_pmu_read,
492 .unthrottle = hw_breakpoint_pmu_unthrottle
493}; 492};
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ecc3fa28f666..b7091d5ca2f8 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,11 +18,7 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22 * dynamic_irq_init - initialize a dynamically allocated irq
23 * @irq: irq number to initialize
24 */
25void dynamic_irq_init(unsigned int irq)
26{ 22{
27 struct irq_desc *desc; 23 struct irq_desc *desc;
28 unsigned long flags; 24 unsigned long flags;
@@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq)
41 desc->depth = 1; 37 desc->depth = 1;
42 desc->msi_desc = NULL; 38 desc->msi_desc = NULL;
43 desc->handler_data = NULL; 39 desc->handler_data = NULL;
44 desc->chip_data = NULL; 40 if (!keep_chip_data)
41 desc->chip_data = NULL;
45 desc->action = NULL; 42 desc->action = NULL;
46 desc->irq_count = 0; 43 desc->irq_count = 0;
47 desc->irqs_unhandled = 0; 44 desc->irqs_unhandled = 0;
@@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq)
55} 52}
56 53
57/** 54/**
58 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 55 * dynamic_irq_init - initialize a dynamically allocated irq
59 * @irq: irq number to initialize 56 * @irq: irq number to initialize
60 */ 57 */
61void dynamic_irq_cleanup(unsigned int irq) 58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
62{ 75{
63 struct irq_desc *desc = irq_to_desc(irq); 76 struct irq_desc *desc = irq_to_desc(irq);
64 unsigned long flags; 77 unsigned long flags;
@@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq)
77 } 90 }
78 desc->msi_desc = NULL; 91 desc->msi_desc = NULL;
79 desc->handler_data = NULL; 92 desc->handler_data = NULL;
80 desc->chip_data = NULL; 93 if (!keep_chip_data)
94 desc->chip_data = NULL;
81 desc->handle_irq = handle_bad_irq; 95 desc->handle_irq = handle_bad_irq;
82 desc->chip = &no_irq_chip; 96 desc->chip = &no_irq_chip;
83 desc->name = NULL; 97 desc->name = NULL;
@@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq)
85 raw_spin_unlock_irqrestore(&desc->lock, flags); 99 raw_spin_unlock_irqrestore(&desc->lock, flags);
86} 100}
87 101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to initialize
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to initialize
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
120}
121
88 122
89/** 123/**
90 * set_irq_chip - set the irq chip for an irq 124 * set_irq_chip - set the irq chip for an irq
@@ -325,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
325 if (desc->chip->ack) 359 if (desc->chip->ack)
326 desc->chip->ack(irq); 360 desc->chip->ack(irq);
327 } 361 }
362 desc->status |= IRQ_MASKED;
363}
364
365static inline void mask_irq(struct irq_desc *desc, int irq)
366{
367 if (desc->chip->mask) {
368 desc->chip->mask(irq);
369 desc->status |= IRQ_MASKED;
370 }
371}
372
373static inline void unmask_irq(struct irq_desc *desc, int irq)
374{
375 if (desc->chip->unmask) {
376 desc->chip->unmask(irq);
377 desc->status &= ~IRQ_MASKED;
378 }
328} 379}
329 380
330/* 381/*
@@ -450,10 +501,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
450 raw_spin_lock(&desc->lock); 501 raw_spin_lock(&desc->lock);
451 desc->status &= ~IRQ_INPROGRESS; 502 desc->status &= ~IRQ_INPROGRESS;
452 503
453 if (unlikely(desc->status & IRQ_ONESHOT)) 504 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
454 desc->status |= IRQ_MASKED; 505 unmask_irq(desc, irq);
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
456 desc->chip->unmask(irq);
457out_unlock: 506out_unlock:
458 raw_spin_unlock(&desc->lock); 507 raw_spin_unlock(&desc->lock);
459} 508}
@@ -490,8 +539,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
490 action = desc->action; 539 action = desc->action;
491 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 540 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
492 desc->status |= IRQ_PENDING; 541 desc->status |= IRQ_PENDING;
493 if (desc->chip->mask) 542 mask_irq(desc, irq);
494 desc->chip->mask(irq);
495 goto out; 543 goto out;
496 } 544 }
497 545
@@ -520,7 +568,7 @@ out:
520 * signal. The occurence is latched into the irq controller hardware 568 * signal. The occurence is latched into the irq controller hardware
521 * and must be acked in order to be reenabled. After the ack another 569 * and must be acked in order to be reenabled. After the ack another
522 * interrupt can happen on the same source even before the first one 570 * interrupt can happen on the same source even before the first one
523 * is handled by the assosiacted event handler. If this happens it 571 * is handled by the associated event handler. If this happens it
524 * might be necessary to disable (mask) the interrupt depending on the 572 * might be necessary to disable (mask) the interrupt depending on the
525 * controller hardware. This requires to reenable the interrupt inside 573 * controller hardware. This requires to reenable the interrupt inside
526 * of the loop which handles the interrupts which have arrived while 574 * of the loop which handles the interrupts which have arrived while
@@ -559,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
559 irqreturn_t action_ret; 607 irqreturn_t action_ret;
560 608
561 if (unlikely(!action)) { 609 if (unlikely(!action)) {
562 desc->chip->mask(irq); 610 mask_irq(desc, irq);
563 goto out_unlock; 611 goto out_unlock;
564 } 612 }
565 613
@@ -571,8 +619,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
571 if (unlikely((desc->status & 619 if (unlikely((desc->status &
572 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 620 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
573 (IRQ_PENDING | IRQ_MASKED))) { 621 (IRQ_PENDING | IRQ_MASKED))) {
574 desc->chip->unmask(irq); 622 unmask_irq(desc, irq);
575 desc->status &= ~IRQ_MASKED;
576 } 623 }
577 624
578 desc->status &= ~IRQ_PENDING; 625 desc->status &= ~IRQ_PENDING;
@@ -682,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
682 __set_irq_handler(irq, handle, 0, name); 729 __set_irq_handler(irq, handle, 0, name);
683} 730}
684 731
685void __init set_irq_noprobe(unsigned int irq) 732void set_irq_noprobe(unsigned int irq)
686{ 733{
687 struct irq_desc *desc = irq_to_desc(irq); 734 struct irq_desc *desc = irq_to_desc(irq);
688 unsigned long flags; 735 unsigned long flags;
@@ -697,7 +744,7 @@ void __init set_irq_noprobe(unsigned int irq)
697 raw_spin_unlock_irqrestore(&desc->lock, flags); 744 raw_spin_unlock_irqrestore(&desc->lock, flags);
698} 745}
699 746
700void __init set_irq_probe(unsigned int irq) 747void set_irq_probe(unsigned int irq)
701{ 748{
702 struct irq_desc *desc = irq_to_desc(irq); 749 struct irq_desc *desc = irq_to_desc(irq);
703 unsigned long flags; 750 unsigned long flags;
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index d06df9c41cba..1ef4ffcdfa55 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
42 * automatically freed on driver detach. 42 * automatically freed on driver detach.
43 * 43 *
44 * If an IRQ allocated with this function needs to be freed 44 * If an IRQ allocated with this function needs to be freed
45 * separately, dev_free_irq() must be used. 45 * separately, devm_free_irq() must be used.
46 */ 46 */
47int devm_request_threaded_irq(struct device *dev, unsigned int irq, 47int devm_request_threaded_irq(struct device *dev, unsigned int irq,
48 irq_handler_t handler, irq_handler_t thread_fn, 48 irq_handler_t handler, irq_handler_t thread_fn,
@@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
81 * Except for the extra @dev argument, this function takes the 81 * Except for the extra @dev argument, this function takes the
82 * same arguments and performs the same function as free_irq(). 82 * same arguments and performs the same function as free_irq().
83 * This function instead of free_irq() should be used to manually 83 * This function instead of free_irq() should be used to manually
84 * free IRQs allocated with dev_request_irq(). 84 * free IRQs allocated with devm_request_irq().
85 */ 85 */
86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) 86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
87{ 87{
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 814940e7f485..76d5a671bfe1 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -19,7 +19,7 @@
19#include <linux/kernel_stat.h> 19#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 20#include <linux/rculist.h>
21#include <linux/hash.h> 21#include <linux/hash.h>
22#include <linux/bootmem.h> 22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 23#include <trace/events/irq.h>
24 24
25#include "internals.h" 25#include "internals.h"
@@ -87,12 +87,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{ 87{
88 void *ptr; 88 void *ptr;
89 89
90 if (slab_is_available()) 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), 91 GFP_ATOMIC, node);
92 GFP_ATOMIC, node);
93 else
94 ptr = alloc_bootmem_node(NODE_DATA(node),
95 nr * sizeof(*desc->kstat_irqs));
96 92
97 /* 93 /*
98 * don't overwite if can not get new one 94 * don't overwite if can not get new one
@@ -132,7 +128,26 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
132 */ 128 */
133DEFINE_RAW_SPINLOCK(sparse_irq_lock); 129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
134 130
135struct irq_desc **irq_desc_ptrs __read_mostly; 131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
136 151
137static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { 152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
138 [0 ... NR_IRQS_LEGACY-1] = { 153 [0 ... NR_IRQS_LEGACY-1] = {
@@ -164,9 +179,6 @@ int __init early_irq_init(void)
164 legacy_count = ARRAY_SIZE(irq_desc_legacy); 179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
165 node = first_online_node; 180 node = first_online_node;
166 181
167 /* allocate irq_desc_ptrs array based on nr_irqs */
168 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
169
170 /* allocate based on nr_cpu_ids */ 182 /* allocate based on nr_cpu_ids */
171 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * 183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
172 sizeof(int), GFP_NOWAIT, node); 184 sizeof(int), GFP_NOWAIT, node);
@@ -180,23 +192,12 @@ int __init early_irq_init(void)
180 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
181 alloc_desc_masks(&desc[i], node, true); 193 alloc_desc_masks(&desc[i], node, true);
182 init_desc_masks(&desc[i]); 194 init_desc_masks(&desc[i]);
183 irq_desc_ptrs[i] = desc + i; 195 set_irq_desc(i, &desc[i]);
184 } 196 }
185 197
186 for (i = legacy_count; i < nr_irqs; i++)
187 irq_desc_ptrs[i] = NULL;
188
189 return arch_early_irq_init(); 198 return arch_early_irq_init();
190} 199}
191 200
192struct irq_desc *irq_to_desc(unsigned int irq)
193{
194 if (irq_desc_ptrs && irq < nr_irqs)
195 return irq_desc_ptrs[irq];
196
197 return NULL;
198}
199
200struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) 201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
201{ 202{
202 struct irq_desc *desc; 203 struct irq_desc *desc;
@@ -208,21 +209,18 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
208 return NULL; 209 return NULL;
209 } 210 }
210 211
211 desc = irq_desc_ptrs[irq]; 212 desc = irq_to_desc(irq);
212 if (desc) 213 if (desc)
213 return desc; 214 return desc;
214 215
215 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
216 217
217 /* We have to check it to avoid races with another CPU */ 218 /* We have to check it to avoid races with another CPU */
218 desc = irq_desc_ptrs[irq]; 219 desc = irq_to_desc(irq);
219 if (desc) 220 if (desc)
220 goto out_unlock; 221 goto out_unlock;
221 222
222 if (slab_is_available()) 223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224 else
225 desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
226 224
227 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); 225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
228 if (!desc) { 226 if (!desc) {
@@ -231,7 +229,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
231 } 229 }
232 init_one_irq_desc(irq, desc, node); 230 init_one_irq_desc(irq, desc, node);
233 231
234 irq_desc_ptrs[irq] = desc; 232 set_irq_desc(irq, desc);
235 233
236out_unlock: 234out_unlock:
237 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b2821f070a3d..c63f3bc88f0b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -21,11 +21,7 @@ extern void clear_kstat_irqs(struct irq_desc *desc);
21extern raw_spinlock_t sparse_irq_lock; 21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */ 24void replace_irq_desc(unsigned int irq, struct irq_desc *desc);
25extern struct irq_desc **irq_desc_ptrs;
26#else
27/* irq_desc_ptrs is a fixed size array */
28extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
29#endif 25#endif
30 26
31#ifdef CONFIG_PROC_FS 27#ifdef CONFIG_PROC_FS
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index eb6078ca60c7..398fda155f6e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -382,6 +382,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
382{ 382{
383 struct irq_desc *desc = irq_to_desc(irq); 383 struct irq_desc *desc = irq_to_desc(irq);
384 struct irqaction *action; 384 struct irqaction *action;
385 unsigned long flags;
385 386
386 if (!desc) 387 if (!desc)
387 return 0; 388 return 0;
@@ -389,11 +390,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
389 if (desc->status & IRQ_NOREQUEST) 390 if (desc->status & IRQ_NOREQUEST)
390 return 0; 391 return 0;
391 392
393 raw_spin_lock_irqsave(&desc->lock, flags);
392 action = desc->action; 394 action = desc->action;
393 if (action) 395 if (action)
394 if (irqflags & action->flags & IRQF_SHARED) 396 if (irqflags & action->flags & IRQF_SHARED)
395 action = NULL; 397 action = NULL;
396 398
399 raw_spin_unlock_irqrestore(&desc->lock, flags);
400
397 return !action; 401 return !action;
398} 402}
399 403
@@ -483,8 +487,26 @@ static int irq_wait_for_interrupt(struct irqaction *action)
483 */ 487 */
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 488static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{ 489{
490again:
486 chip_bus_lock(irq, desc); 491 chip_bus_lock(irq, desc);
487 raw_spin_lock_irq(&desc->lock); 492 raw_spin_lock_irq(&desc->lock);
493
494 /*
495 * Implausible though it may be we need to protect us against
496 * the following scenario:
497 *
498 * The thread is faster done than the hard interrupt handler
499 * on the other CPU. If we unmask the irq line then the
500 * interrupt can come in again and masks the line, leaves due
501 * to IRQ_INPROGRESS and the irq line is masked forever.
502 */
503 if (unlikely(desc->status & IRQ_INPROGRESS)) {
504 raw_spin_unlock_irq(&desc->lock);
505 chip_bus_sync_unlock(irq, desc);
506 cpu_relax();
507 goto again;
508 }
509
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 510 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED; 511 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq); 512 desc->chip->unmask(irq);
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 26bac9d8f860..963559dbd858 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -70,7 +70,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
70 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 70 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
71 71
72 /* We have to check it to avoid races with another CPU */ 72 /* We have to check it to avoid races with another CPU */
73 desc = irq_desc_ptrs[irq]; 73 desc = irq_to_desc(irq);
74 74
75 if (desc && old_desc != desc) 75 if (desc && old_desc != desc)
76 goto out_unlock; 76 goto out_unlock;
@@ -90,7 +90,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
90 goto out_unlock; 90 goto out_unlock;
91 } 91 }
92 92
93 irq_desc_ptrs[irq] = desc; 93 replace_irq_desc(irq, desc);
94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
95 95
96 /* free the old one */ 96 /* free the old one */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ef077fb73155..87ebe8adc474 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -41,7 +41,7 @@
41#include <asm/sections.h> 41#include <asm/sections.h>
42 42
43/* Per cpu memory for storing cpu states in case of system crash. */ 43/* Per cpu memory for storing cpu states in case of system crash. */
44note_buf_t* crash_notes; 44note_buf_t __percpu *crash_notes;
45 45
46/* vmcoreinfo stuff */ 46/* vmcoreinfo stuff */
47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ccec774c716d..0ed46f3e51e9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -42,9 +42,11 @@
42#include <linux/freezer.h> 42#include <linux/freezer.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/sysctl.h>
45#include <linux/kdebug.h> 46#include <linux/kdebug.h>
46#include <linux/memory.h> 47#include <linux/memory.h>
47#include <linux/ftrace.h> 48#include <linux/ftrace.h>
49#include <linux/cpu.h>
48 50
49#include <asm-generic/sections.h> 51#include <asm-generic/sections.h>
50#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
@@ -105,57 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
105 * stepping on the instruction on a vmalloced/kmalloced/data page 107 * stepping on the instruction on a vmalloced/kmalloced/data page
106 * is a recipe for disaster 108 * is a recipe for disaster
107 */ 109 */
108#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
109
110struct kprobe_insn_page { 110struct kprobe_insn_page {
111 struct list_head list; 111 struct list_head list;
112 kprobe_opcode_t *insns; /* Page of instruction slots */ 112 kprobe_opcode_t *insns; /* Page of instruction slots */
113 char slot_used[INSNS_PER_PAGE];
114 int nused; 113 int nused;
115 int ngarbage; 114 int ngarbage;
115 char slot_used[];
116};
117
118#define KPROBE_INSN_PAGE_SIZE(slots) \
119 (offsetof(struct kprobe_insn_page, slot_used) + \
120 (sizeof(char) * (slots)))
121
122struct kprobe_insn_cache {
123 struct list_head pages; /* list of kprobe_insn_page */
124 size_t insn_size; /* size of instruction slot */
125 int nr_garbage;
116}; 126};
117 127
128static int slots_per_page(struct kprobe_insn_cache *c)
129{
130 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
131}
132
118enum kprobe_slot_state { 133enum kprobe_slot_state {
119 SLOT_CLEAN = 0, 134 SLOT_CLEAN = 0,
120 SLOT_DIRTY = 1, 135 SLOT_DIRTY = 1,
121 SLOT_USED = 2, 136 SLOT_USED = 2,
122}; 137};
123 138
124static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 139static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */
125static LIST_HEAD(kprobe_insn_pages); 140static struct kprobe_insn_cache kprobe_insn_slots = {
126static int kprobe_garbage_slots; 141 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
127static int collect_garbage_slots(void); 142 .insn_size = MAX_INSN_SIZE,
143 .nr_garbage = 0,
144};
145static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
128 146
129/** 147/**
130 * __get_insn_slot() - Find a slot on an executable page for an instruction. 148 * __get_insn_slot() - Find a slot on an executable page for an instruction.
131 * We allocate an executable page if there's no room on existing ones. 149 * We allocate an executable page if there's no room on existing ones.
132 */ 150 */
133static kprobe_opcode_t __kprobes *__get_insn_slot(void) 151static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
134{ 152{
135 struct kprobe_insn_page *kip; 153 struct kprobe_insn_page *kip;
136 154
137 retry: 155 retry:
138 list_for_each_entry(kip, &kprobe_insn_pages, list) { 156 list_for_each_entry(kip, &c->pages, list) {
139 if (kip->nused < INSNS_PER_PAGE) { 157 if (kip->nused < slots_per_page(c)) {
140 int i; 158 int i;
141 for (i = 0; i < INSNS_PER_PAGE; i++) { 159 for (i = 0; i < slots_per_page(c); i++) {
142 if (kip->slot_used[i] == SLOT_CLEAN) { 160 if (kip->slot_used[i] == SLOT_CLEAN) {
143 kip->slot_used[i] = SLOT_USED; 161 kip->slot_used[i] = SLOT_USED;
144 kip->nused++; 162 kip->nused++;
145 return kip->insns + (i * MAX_INSN_SIZE); 163 return kip->insns + (i * c->insn_size);
146 } 164 }
147 } 165 }
148 /* Surprise! No unused slots. Fix kip->nused. */ 166 /* kip->nused is broken. Fix it. */
149 kip->nused = INSNS_PER_PAGE; 167 kip->nused = slots_per_page(c);
168 WARN_ON(1);
150 } 169 }
151 } 170 }
152 171
153 /* If there are any garbage slots, collect it and try again. */ 172 /* If there are any garbage slots, collect it and try again. */
154 if (kprobe_garbage_slots && collect_garbage_slots() == 0) { 173 if (c->nr_garbage && collect_garbage_slots(c) == 0)
155 goto retry; 174 goto retry;
156 } 175
157 /* All out of space. Need to allocate a new page. Use slot 0. */ 176 /* All out of space. Need to allocate a new page. */
158 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 177 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
159 if (!kip) 178 if (!kip)
160 return NULL; 179 return NULL;
161 180
@@ -170,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
170 return NULL; 189 return NULL;
171 } 190 }
172 INIT_LIST_HEAD(&kip->list); 191 INIT_LIST_HEAD(&kip->list);
173 list_add(&kip->list, &kprobe_insn_pages); 192 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
174 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
175 kip->slot_used[0] = SLOT_USED; 193 kip->slot_used[0] = SLOT_USED;
176 kip->nused = 1; 194 kip->nused = 1;
177 kip->ngarbage = 0; 195 kip->ngarbage = 0;
196 list_add(&kip->list, &c->pages);
178 return kip->insns; 197 return kip->insns;
179} 198}
180 199
200
181kprobe_opcode_t __kprobes *get_insn_slot(void) 201kprobe_opcode_t __kprobes *get_insn_slot(void)
182{ 202{
183 kprobe_opcode_t *ret; 203 kprobe_opcode_t *ret = NULL;
204
184 mutex_lock(&kprobe_insn_mutex); 205 mutex_lock(&kprobe_insn_mutex);
185 ret = __get_insn_slot(); 206 ret = __get_insn_slot(&kprobe_insn_slots);
186 mutex_unlock(&kprobe_insn_mutex); 207 mutex_unlock(&kprobe_insn_mutex);
208
187 return ret; 209 return ret;
188} 210}
189 211
@@ -199,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
199 * so as not to have to set it up again the 221 * so as not to have to set it up again the
200 * next time somebody inserts a probe. 222 * next time somebody inserts a probe.
201 */ 223 */
202 if (!list_is_singular(&kprobe_insn_pages)) { 224 if (!list_is_singular(&kip->list)) {
203 list_del(&kip->list); 225 list_del(&kip->list);
204 module_free(NULL, kip->insns); 226 module_free(NULL, kip->insns);
205 kfree(kip); 227 kfree(kip);
@@ -209,51 +231,85 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
209 return 0; 231 return 0;
210} 232}
211 233
212static int __kprobes collect_garbage_slots(void) 234static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
213{ 235{
214 struct kprobe_insn_page *kip, *next; 236 struct kprobe_insn_page *kip, *next;
215 237
216 /* Ensure no-one is interrupted on the garbages */ 238 /* Ensure no-one is interrupted on the garbages */
217 synchronize_sched(); 239 synchronize_sched();
218 240
219 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { 241 list_for_each_entry_safe(kip, next, &c->pages, list) {
220 int i; 242 int i;
221 if (kip->ngarbage == 0) 243 if (kip->ngarbage == 0)
222 continue; 244 continue;
223 kip->ngarbage = 0; /* we will collect all garbages */ 245 kip->ngarbage = 0; /* we will collect all garbages */
224 for (i = 0; i < INSNS_PER_PAGE; i++) { 246 for (i = 0; i < slots_per_page(c); i++) {
225 if (kip->slot_used[i] == SLOT_DIRTY && 247 if (kip->slot_used[i] == SLOT_DIRTY &&
226 collect_one_slot(kip, i)) 248 collect_one_slot(kip, i))
227 break; 249 break;
228 } 250 }
229 } 251 }
230 kprobe_garbage_slots = 0; 252 c->nr_garbage = 0;
231 return 0; 253 return 0;
232} 254}
233 255
234void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 256static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
257 kprobe_opcode_t *slot, int dirty)
235{ 258{
236 struct kprobe_insn_page *kip; 259 struct kprobe_insn_page *kip;
237 260
238 mutex_lock(&kprobe_insn_mutex); 261 list_for_each_entry(kip, &c->pages, list) {
239 list_for_each_entry(kip, &kprobe_insn_pages, list) { 262 long idx = ((long)slot - (long)kip->insns) /
240 if (kip->insns <= slot && 263 (c->insn_size * sizeof(kprobe_opcode_t));
241 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 264 if (idx >= 0 && idx < slots_per_page(c)) {
242 int i = (slot - kip->insns) / MAX_INSN_SIZE; 265 WARN_ON(kip->slot_used[idx] != SLOT_USED);
243 if (dirty) { 266 if (dirty) {
244 kip->slot_used[i] = SLOT_DIRTY; 267 kip->slot_used[idx] = SLOT_DIRTY;
245 kip->ngarbage++; 268 kip->ngarbage++;
269 if (++c->nr_garbage > slots_per_page(c))
270 collect_garbage_slots(c);
246 } else 271 } else
247 collect_one_slot(kip, i); 272 collect_one_slot(kip, idx);
248 break; 273 return;
249 } 274 }
250 } 275 }
276 /* Could not free this slot. */
277 WARN_ON(1);
278}
251 279
252 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) 280void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
253 collect_garbage_slots(); 281{
254 282 mutex_lock(&kprobe_insn_mutex);
283 __free_insn_slot(&kprobe_insn_slots, slot, dirty);
255 mutex_unlock(&kprobe_insn_mutex); 284 mutex_unlock(&kprobe_insn_mutex);
256} 285}
286#ifdef CONFIG_OPTPROBES
287/* For optimized_kprobe buffer */
288static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */
289static struct kprobe_insn_cache kprobe_optinsn_slots = {
290 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
291 /* .insn_size is initialized later */
292 .nr_garbage = 0,
293};
294/* Get a slot for optimized_kprobe buffer */
295kprobe_opcode_t __kprobes *get_optinsn_slot(void)
296{
297 kprobe_opcode_t *ret = NULL;
298
299 mutex_lock(&kprobe_optinsn_mutex);
300 ret = __get_insn_slot(&kprobe_optinsn_slots);
301 mutex_unlock(&kprobe_optinsn_mutex);
302
303 return ret;
304}
305
306void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
307{
308 mutex_lock(&kprobe_optinsn_mutex);
309 __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
310 mutex_unlock(&kprobe_optinsn_mutex);
311}
312#endif
257#endif 313#endif
258 314
259/* We have preemption disabled.. so it is safe to use __ versions */ 315/* We have preemption disabled.. so it is safe to use __ versions */
@@ -284,23 +340,401 @@ struct kprobe __kprobes *get_kprobe(void *addr)
284 if (p->addr == addr) 340 if (p->addr == addr)
285 return p; 341 return p;
286 } 342 }
343
344 return NULL;
345}
346
347static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
348
349/* Return true if the kprobe is an aggregator */
350static inline int kprobe_aggrprobe(struct kprobe *p)
351{
352 return p->pre_handler == aggr_pre_handler;
353}
354
355/*
356 * Keep all fields in the kprobe consistent
357 */
358static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
359{
360 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
361 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
362}
363
364#ifdef CONFIG_OPTPROBES
365/* NOTE: change this value only with kprobe_mutex held */
366static bool kprobes_allow_optimization;
367
368/*
369 * Call all pre_handler on the list, but ignores its return value.
370 * This must be called from arch-dep optimized caller.
371 */
372void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
373{
374 struct kprobe *kp;
375
376 list_for_each_entry_rcu(kp, &p->list, list) {
377 if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
378 set_kprobe_instance(kp);
379 kp->pre_handler(kp, regs);
380 }
381 reset_kprobe_instance();
382 }
383}
384
385/* Return true(!0) if the kprobe is ready for optimization. */
386static inline int kprobe_optready(struct kprobe *p)
387{
388 struct optimized_kprobe *op;
389
390 if (kprobe_aggrprobe(p)) {
391 op = container_of(p, struct optimized_kprobe, kp);
392 return arch_prepared_optinsn(&op->optinsn);
393 }
394
395 return 0;
396}
397
398/*
399 * Return an optimized kprobe whose optimizing code replaces
400 * instructions including addr (exclude breakpoint).
401 */
402struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
403{
404 int i;
405 struct kprobe *p = NULL;
406 struct optimized_kprobe *op;
407
408 /* Don't check i == 0, since that is a breakpoint case. */
409 for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
410 p = get_kprobe((void *)(addr - i));
411
412 if (p && kprobe_optready(p)) {
413 op = container_of(p, struct optimized_kprobe, kp);
414 if (arch_within_optimized_kprobe(op, addr))
415 return p;
416 }
417
287 return NULL; 418 return NULL;
288} 419}
289 420
421/* Optimization staging list, protected by kprobe_mutex */
422static LIST_HEAD(optimizing_list);
423
424static void kprobe_optimizer(struct work_struct *work);
425static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
426#define OPTIMIZE_DELAY 5
427
428/* Kprobe jump optimizer */
429static __kprobes void kprobe_optimizer(struct work_struct *work)
430{
431 struct optimized_kprobe *op, *tmp;
432
433 /* Lock modules while optimizing kprobes */
434 mutex_lock(&module_mutex);
435 mutex_lock(&kprobe_mutex);
436 if (kprobes_all_disarmed || !kprobes_allow_optimization)
437 goto end;
438
439 /*
440 * Wait for quiesence period to ensure all running interrupts
441 * are done. Because optprobe may modify multiple instructions
442 * there is a chance that Nth instruction is interrupted. In that
443 * case, running interrupt can return to 2nd-Nth byte of jump
444 * instruction. This wait is for avoiding it.
445 */
446 synchronize_sched();
447
448 /*
449 * The optimization/unoptimization refers online_cpus via
450 * stop_machine() and cpu-hotplug modifies online_cpus.
451 * And same time, text_mutex will be held in cpu-hotplug and here.
452 * This combination can cause a deadlock (cpu-hotplug try to lock
453 * text_mutex but stop_machine can not be done because online_cpus
454 * has been changed)
455 * To avoid this deadlock, we need to call get_online_cpus()
456 * for preventing cpu-hotplug outside of text_mutex locking.
457 */
458 get_online_cpus();
459 mutex_lock(&text_mutex);
460 list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
461 WARN_ON(kprobe_disabled(&op->kp));
462 if (arch_optimize_kprobe(op) < 0)
463 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
464 list_del_init(&op->list);
465 }
466 mutex_unlock(&text_mutex);
467 put_online_cpus();
468end:
469 mutex_unlock(&kprobe_mutex);
470 mutex_unlock(&module_mutex);
471}
472
473/* Optimize kprobe if p is ready to be optimized */
474static __kprobes void optimize_kprobe(struct kprobe *p)
475{
476 struct optimized_kprobe *op;
477
478 /* Check if the kprobe is disabled or not ready for optimization. */
479 if (!kprobe_optready(p) || !kprobes_allow_optimization ||
480 (kprobe_disabled(p) || kprobes_all_disarmed))
481 return;
482
483 /* Both of break_handler and post_handler are not supported. */
484 if (p->break_handler || p->post_handler)
485 return;
486
487 op = container_of(p, struct optimized_kprobe, kp);
488
489 /* Check there is no other kprobes at the optimized instructions */
490 if (arch_check_optimized_kprobe(op) < 0)
491 return;
492
493 /* Check if it is already optimized. */
494 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
495 return;
496
497 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
498 list_add(&op->list, &optimizing_list);
499 if (!delayed_work_pending(&optimizing_work))
500 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
501}
502
503/* Unoptimize a kprobe if p is optimized */
504static __kprobes void unoptimize_kprobe(struct kprobe *p)
505{
506 struct optimized_kprobe *op;
507
508 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
509 op = container_of(p, struct optimized_kprobe, kp);
510 if (!list_empty(&op->list))
511 /* Dequeue from the optimization queue */
512 list_del_init(&op->list);
513 else
514 /* Replace jump with break */
515 arch_unoptimize_kprobe(op);
516 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
517 }
518}
519
520/* Remove optimized instructions */
521static void __kprobes kill_optimized_kprobe(struct kprobe *p)
522{
523 struct optimized_kprobe *op;
524
525 op = container_of(p, struct optimized_kprobe, kp);
526 if (!list_empty(&op->list)) {
527 /* Dequeue from the optimization queue */
528 list_del_init(&op->list);
529 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
530 }
531 /* Don't unoptimize, because the target code will be freed. */
532 arch_remove_optimized_kprobe(op);
533}
534
535/* Try to prepare optimized instructions */
536static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
537{
538 struct optimized_kprobe *op;
539
540 op = container_of(p, struct optimized_kprobe, kp);
541 arch_prepare_optimized_kprobe(op);
542}
543
544/* Free optimized instructions and optimized_kprobe */
545static __kprobes void free_aggr_kprobe(struct kprobe *p)
546{
547 struct optimized_kprobe *op;
548
549 op = container_of(p, struct optimized_kprobe, kp);
550 arch_remove_optimized_kprobe(op);
551 kfree(op);
552}
553
554/* Allocate new optimized_kprobe and try to prepare optimized instructions */
555static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
556{
557 struct optimized_kprobe *op;
558
559 op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
560 if (!op)
561 return NULL;
562
563 INIT_LIST_HEAD(&op->list);
564 op->kp.addr = p->addr;
565 arch_prepare_optimized_kprobe(op);
566
567 return &op->kp;
568}
569
570static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
571
572/*
573 * Prepare an optimized_kprobe and optimize it
574 * NOTE: p must be a normal registered kprobe
575 */
576static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
577{
578 struct kprobe *ap;
579 struct optimized_kprobe *op;
580
581 ap = alloc_aggr_kprobe(p);
582 if (!ap)
583 return;
584
585 op = container_of(ap, struct optimized_kprobe, kp);
586 if (!arch_prepared_optinsn(&op->optinsn)) {
587 /* If failed to setup optimizing, fallback to kprobe */
588 free_aggr_kprobe(ap);
589 return;
590 }
591
592 init_aggr_kprobe(ap, p);
593 optimize_kprobe(ap);
594}
595
596#ifdef CONFIG_SYSCTL
597static void __kprobes optimize_all_kprobes(void)
598{
599 struct hlist_head *head;
600 struct hlist_node *node;
601 struct kprobe *p;
602 unsigned int i;
603
604 /* If optimization is already allowed, just return */
605 if (kprobes_allow_optimization)
606 return;
607
608 kprobes_allow_optimization = true;
609 mutex_lock(&text_mutex);
610 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
611 head = &kprobe_table[i];
612 hlist_for_each_entry_rcu(p, node, head, hlist)
613 if (!kprobe_disabled(p))
614 optimize_kprobe(p);
615 }
616 mutex_unlock(&text_mutex);
617 printk(KERN_INFO "Kprobes globally optimized\n");
618}
619
620static void __kprobes unoptimize_all_kprobes(void)
621{
622 struct hlist_head *head;
623 struct hlist_node *node;
624 struct kprobe *p;
625 unsigned int i;
626
627 /* If optimization is already prohibited, just return */
628 if (!kprobes_allow_optimization)
629 return;
630
631 kprobes_allow_optimization = false;
632 printk(KERN_INFO "Kprobes globally unoptimized\n");
633 get_online_cpus(); /* For avoiding text_mutex deadlock */
634 mutex_lock(&text_mutex);
635 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
636 head = &kprobe_table[i];
637 hlist_for_each_entry_rcu(p, node, head, hlist) {
638 if (!kprobe_disabled(p))
639 unoptimize_kprobe(p);
640 }
641 }
642
643 mutex_unlock(&text_mutex);
644 put_online_cpus();
645 /* Allow all currently running kprobes to complete */
646 synchronize_sched();
647}
648
649int sysctl_kprobes_optimization;
650int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
651 void __user *buffer, size_t *length,
652 loff_t *ppos)
653{
654 int ret;
655
656 mutex_lock(&kprobe_mutex);
657 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
658 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
659
660 if (sysctl_kprobes_optimization)
661 optimize_all_kprobes();
662 else
663 unoptimize_all_kprobes();
664 mutex_unlock(&kprobe_mutex);
665
666 return ret;
667}
668#endif /* CONFIG_SYSCTL */
669
670static void __kprobes __arm_kprobe(struct kprobe *p)
671{
672 struct kprobe *old_p;
673
674 /* Check collision with other optimized kprobes */
675 old_p = get_optimized_kprobe((unsigned long)p->addr);
676 if (unlikely(old_p))
677 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
678
679 arch_arm_kprobe(p);
680 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
681}
682
683static void __kprobes __disarm_kprobe(struct kprobe *p)
684{
685 struct kprobe *old_p;
686
687 unoptimize_kprobe(p); /* Try to unoptimize */
688 arch_disarm_kprobe(p);
689
690 /* If another kprobe was blocked, optimize it. */
691 old_p = get_optimized_kprobe((unsigned long)p->addr);
692 if (unlikely(old_p))
693 optimize_kprobe(old_p);
694}
695
696#else /* !CONFIG_OPTPROBES */
697
698#define optimize_kprobe(p) do {} while (0)
699#define unoptimize_kprobe(p) do {} while (0)
700#define kill_optimized_kprobe(p) do {} while (0)
701#define prepare_optimized_kprobe(p) do {} while (0)
702#define try_to_optimize_kprobe(p) do {} while (0)
703#define __arm_kprobe(p) arch_arm_kprobe(p)
704#define __disarm_kprobe(p) arch_disarm_kprobe(p)
705
706static __kprobes void free_aggr_kprobe(struct kprobe *p)
707{
708 kfree(p);
709}
710
711static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
712{
713 return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
714}
715#endif /* CONFIG_OPTPROBES */
716
290/* Arm a kprobe with text_mutex */ 717/* Arm a kprobe with text_mutex */
291static void __kprobes arm_kprobe(struct kprobe *kp) 718static void __kprobes arm_kprobe(struct kprobe *kp)
292{ 719{
720 /*
721 * Here, since __arm_kprobe() doesn't use stop_machine(),
722 * this doesn't cause deadlock on text_mutex. So, we don't
723 * need get_online_cpus().
724 */
293 mutex_lock(&text_mutex); 725 mutex_lock(&text_mutex);
294 arch_arm_kprobe(kp); 726 __arm_kprobe(kp);
295 mutex_unlock(&text_mutex); 727 mutex_unlock(&text_mutex);
296} 728}
297 729
298/* Disarm a kprobe with text_mutex */ 730/* Disarm a kprobe with text_mutex */
299static void __kprobes disarm_kprobe(struct kprobe *kp) 731static void __kprobes disarm_kprobe(struct kprobe *kp)
300{ 732{
733 get_online_cpus(); /* For avoiding text_mutex deadlock */
301 mutex_lock(&text_mutex); 734 mutex_lock(&text_mutex);
302 arch_disarm_kprobe(kp); 735 __disarm_kprobe(kp);
303 mutex_unlock(&text_mutex); 736 mutex_unlock(&text_mutex);
737 put_online_cpus();
304} 738}
305 739
306/* 740/*
@@ -369,7 +803,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
369void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) 803void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
370{ 804{
371 struct kprobe *kp; 805 struct kprobe *kp;
372 if (p->pre_handler != aggr_pre_handler) { 806 if (!kprobe_aggrprobe(p)) {
373 p->nmissed++; 807 p->nmissed++;
374 } else { 808 } else {
375 list_for_each_entry_rcu(kp, &p->list, list) 809 list_for_each_entry_rcu(kp, &p->list, list)
@@ -493,21 +927,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
493} 927}
494 928
495/* 929/*
496 * Keep all fields in the kprobe consistent
497 */
498static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
499{
500 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
501 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
502}
503
504/*
505* Add the new probe to ap->list. Fail if this is the 930* Add the new probe to ap->list. Fail if this is the
506* second jprobe at the address - two jprobes can't coexist 931* second jprobe at the address - two jprobes can't coexist
507*/ 932*/
508static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) 933static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
509{ 934{
510 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 935 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
936
937 if (p->break_handler || p->post_handler)
938 unoptimize_kprobe(ap); /* Fall back to normal kprobe */
939
511 if (p->break_handler) { 940 if (p->break_handler) {
512 if (ap->break_handler) 941 if (ap->break_handler)
513 return -EEXIST; 942 return -EEXIST;
@@ -522,7 +951,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
522 ap->flags &= ~KPROBE_FLAG_DISABLED; 951 ap->flags &= ~KPROBE_FLAG_DISABLED;
523 if (!kprobes_all_disarmed) 952 if (!kprobes_all_disarmed)
524 /* Arm the breakpoint again. */ 953 /* Arm the breakpoint again. */
525 arm_kprobe(ap); 954 __arm_kprobe(ap);
526 } 955 }
527 return 0; 956 return 0;
528} 957}
@@ -531,12 +960,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
531 * Fill in the required fields of the "manager kprobe". Replace the 960 * Fill in the required fields of the "manager kprobe". Replace the
532 * earlier kprobe in the hlist with the manager kprobe 961 * earlier kprobe in the hlist with the manager kprobe
533 */ 962 */
534static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 963static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
535{ 964{
965 /* Copy p's insn slot to ap */
536 copy_kprobe(p, ap); 966 copy_kprobe(p, ap);
537 flush_insn_slot(ap); 967 flush_insn_slot(ap);
538 ap->addr = p->addr; 968 ap->addr = p->addr;
539 ap->flags = p->flags; 969 ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
540 ap->pre_handler = aggr_pre_handler; 970 ap->pre_handler = aggr_pre_handler;
541 ap->fault_handler = aggr_fault_handler; 971 ap->fault_handler = aggr_fault_handler;
542 /* We don't care the kprobe which has gone. */ 972 /* We don't care the kprobe which has gone. */
@@ -546,8 +976,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
546 ap->break_handler = aggr_break_handler; 976 ap->break_handler = aggr_break_handler;
547 977
548 INIT_LIST_HEAD(&ap->list); 978 INIT_LIST_HEAD(&ap->list);
549 list_add_rcu(&p->list, &ap->list); 979 INIT_HLIST_NODE(&ap->hlist);
550 980
981 list_add_rcu(&p->list, &ap->list);
551 hlist_replace_rcu(&p->hlist, &ap->hlist); 982 hlist_replace_rcu(&p->hlist, &ap->hlist);
552} 983}
553 984
@@ -561,12 +992,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
561 int ret = 0; 992 int ret = 0;
562 struct kprobe *ap = old_p; 993 struct kprobe *ap = old_p;
563 994
564 if (old_p->pre_handler != aggr_pre_handler) { 995 if (!kprobe_aggrprobe(old_p)) {
565 /* If old_p is not an aggr_probe, create new aggr_kprobe. */ 996 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
566 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 997 ap = alloc_aggr_kprobe(old_p);
567 if (!ap) 998 if (!ap)
568 return -ENOMEM; 999 return -ENOMEM;
569 add_aggr_kprobe(ap, old_p); 1000 init_aggr_kprobe(ap, old_p);
570 } 1001 }
571 1002
572 if (kprobe_gone(ap)) { 1003 if (kprobe_gone(ap)) {
@@ -585,6 +1016,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
585 */ 1016 */
586 return ret; 1017 return ret;
587 1018
1019 /* Prepare optimized instructions if possible. */
1020 prepare_optimized_kprobe(ap);
1021
588 /* 1022 /*
589 * Clear gone flag to prevent allocating new slot again, and 1023 * Clear gone flag to prevent allocating new slot again, and
590 * set disabled flag because it is not armed yet. 1024 * set disabled flag because it is not armed yet.
@@ -593,6 +1027,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
593 | KPROBE_FLAG_DISABLED; 1027 | KPROBE_FLAG_DISABLED;
594 } 1028 }
595 1029
1030 /* Copy ap's insn slot to p */
596 copy_kprobe(ap, p); 1031 copy_kprobe(ap, p);
597 return add_new_kprobe(ap, p); 1032 return add_new_kprobe(ap, p);
598} 1033}
@@ -743,27 +1178,34 @@ int __kprobes register_kprobe(struct kprobe *p)
743 p->nmissed = 0; 1178 p->nmissed = 0;
744 INIT_LIST_HEAD(&p->list); 1179 INIT_LIST_HEAD(&p->list);
745 mutex_lock(&kprobe_mutex); 1180 mutex_lock(&kprobe_mutex);
1181
1182 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1183 mutex_lock(&text_mutex);
1184
746 old_p = get_kprobe(p->addr); 1185 old_p = get_kprobe(p->addr);
747 if (old_p) { 1186 if (old_p) {
1187 /* Since this may unoptimize old_p, locking text_mutex. */
748 ret = register_aggr_kprobe(old_p, p); 1188 ret = register_aggr_kprobe(old_p, p);
749 goto out; 1189 goto out;
750 } 1190 }
751 1191
752 mutex_lock(&text_mutex);
753 ret = arch_prepare_kprobe(p); 1192 ret = arch_prepare_kprobe(p);
754 if (ret) 1193 if (ret)
755 goto out_unlock_text; 1194 goto out;
756 1195
757 INIT_HLIST_NODE(&p->hlist); 1196 INIT_HLIST_NODE(&p->hlist);
758 hlist_add_head_rcu(&p->hlist, 1197 hlist_add_head_rcu(&p->hlist,
759 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 1198 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
760 1199
761 if (!kprobes_all_disarmed && !kprobe_disabled(p)) 1200 if (!kprobes_all_disarmed && !kprobe_disabled(p))
762 arch_arm_kprobe(p); 1201 __arm_kprobe(p);
1202
1203 /* Try to optimize kprobe */
1204 try_to_optimize_kprobe(p);
763 1205
764out_unlock_text:
765 mutex_unlock(&text_mutex);
766out: 1206out:
1207 mutex_unlock(&text_mutex);
1208 put_online_cpus();
767 mutex_unlock(&kprobe_mutex); 1209 mutex_unlock(&kprobe_mutex);
768 1210
769 if (probed_mod) 1211 if (probed_mod)
@@ -785,7 +1227,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
785 return -EINVAL; 1227 return -EINVAL;
786 1228
787 if (old_p == p || 1229 if (old_p == p ||
788 (old_p->pre_handler == aggr_pre_handler && 1230 (kprobe_aggrprobe(old_p) &&
789 list_is_singular(&old_p->list))) { 1231 list_is_singular(&old_p->list))) {
790 /* 1232 /*
791 * Only probe on the hash list. Disarm only if kprobes are 1233 * Only probe on the hash list. Disarm only if kprobes are
@@ -793,7 +1235,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
793 * already have been removed. We save on flushing icache. 1235 * already have been removed. We save on flushing icache.
794 */ 1236 */
795 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1237 if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
796 disarm_kprobe(p); 1238 disarm_kprobe(old_p);
797 hlist_del_rcu(&old_p->hlist); 1239 hlist_del_rcu(&old_p->hlist);
798 } else { 1240 } else {
799 if (p->break_handler && !kprobe_gone(p)) 1241 if (p->break_handler && !kprobe_gone(p))
@@ -809,8 +1251,13 @@ noclean:
809 list_del_rcu(&p->list); 1251 list_del_rcu(&p->list);
810 if (!kprobe_disabled(old_p)) { 1252 if (!kprobe_disabled(old_p)) {
811 try_to_disable_aggr_kprobe(old_p); 1253 try_to_disable_aggr_kprobe(old_p);
812 if (!kprobes_all_disarmed && kprobe_disabled(old_p)) 1254 if (!kprobes_all_disarmed) {
813 disarm_kprobe(old_p); 1255 if (kprobe_disabled(old_p))
1256 disarm_kprobe(old_p);
1257 else
1258 /* Try to optimize this probe again */
1259 optimize_kprobe(old_p);
1260 }
814 } 1261 }
815 } 1262 }
816 return 0; 1263 return 0;
@@ -827,7 +1274,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
827 old_p = list_entry(p->list.next, struct kprobe, list); 1274 old_p = list_entry(p->list.next, struct kprobe, list);
828 list_del(&p->list); 1275 list_del(&p->list);
829 arch_remove_kprobe(old_p); 1276 arch_remove_kprobe(old_p);
830 kfree(old_p); 1277 free_aggr_kprobe(old_p);
831 } 1278 }
832} 1279}
833 1280
@@ -1123,7 +1570,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1123 struct kprobe *kp; 1570 struct kprobe *kp;
1124 1571
1125 p->flags |= KPROBE_FLAG_GONE; 1572 p->flags |= KPROBE_FLAG_GONE;
1126 if (p->pre_handler == aggr_pre_handler) { 1573 if (kprobe_aggrprobe(p)) {
1127 /* 1574 /*
1128 * If this is an aggr_kprobe, we have to list all the 1575 * If this is an aggr_kprobe, we have to list all the
1129 * chained probes and mark them GONE. 1576 * chained probes and mark them GONE.
@@ -1132,6 +1579,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1132 kp->flags |= KPROBE_FLAG_GONE; 1579 kp->flags |= KPROBE_FLAG_GONE;
1133 p->post_handler = NULL; 1580 p->post_handler = NULL;
1134 p->break_handler = NULL; 1581 p->break_handler = NULL;
1582 kill_optimized_kprobe(p);
1135 } 1583 }
1136 /* 1584 /*
1137 * Here, we can remove insn_slot safely, because no thread calls 1585 * Here, we can remove insn_slot safely, because no thread calls
@@ -1241,6 +1689,15 @@ static int __init init_kprobes(void)
1241 } 1689 }
1242 } 1690 }
1243 1691
1692#if defined(CONFIG_OPTPROBES)
1693#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
1694 /* Init kprobe_optinsn_slots */
1695 kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
1696#endif
1697 /* By default, kprobes can be optimized */
1698 kprobes_allow_optimization = true;
1699#endif
1700
1244 /* By default, kprobes are armed */ 1701 /* By default, kprobes are armed */
1245 kprobes_all_disarmed = false; 1702 kprobes_all_disarmed = false;
1246 1703
@@ -1259,7 +1716,7 @@ static int __init init_kprobes(void)
1259 1716
1260#ifdef CONFIG_DEBUG_FS 1717#ifdef CONFIG_DEBUG_FS
1261static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, 1718static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1262 const char *sym, int offset,char *modname) 1719 const char *sym, int offset, char *modname, struct kprobe *pp)
1263{ 1720{
1264 char *kprobe_type; 1721 char *kprobe_type;
1265 1722
@@ -1269,19 +1726,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1269 kprobe_type = "j"; 1726 kprobe_type = "j";
1270 else 1727 else
1271 kprobe_type = "k"; 1728 kprobe_type = "k";
1729
1272 if (sym) 1730 if (sym)
1273 seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", 1731 seq_printf(pi, "%p %s %s+0x%x %s ",
1274 p->addr, kprobe_type, sym, offset, 1732 p->addr, kprobe_type, sym, offset,
1275 (modname ? modname : " "), 1733 (modname ? modname : " "));
1276 (kprobe_gone(p) ? "[GONE]" : ""),
1277 ((kprobe_disabled(p) && !kprobe_gone(p)) ?
1278 "[DISABLED]" : ""));
1279 else 1734 else
1280 seq_printf(pi, "%p %s %p %s%s\n", 1735 seq_printf(pi, "%p %s %p ",
1281 p->addr, kprobe_type, p->addr, 1736 p->addr, kprobe_type, p->addr);
1282 (kprobe_gone(p) ? "[GONE]" : ""), 1737
1283 ((kprobe_disabled(p) && !kprobe_gone(p)) ? 1738 if (!pp)
1284 "[DISABLED]" : "")); 1739 pp = p;
1740 seq_printf(pi, "%s%s%s\n",
1741 (kprobe_gone(p) ? "[GONE]" : ""),
1742 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
1743 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
1285} 1744}
1286 1745
1287static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1746static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1317,11 +1776,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
1317 hlist_for_each_entry_rcu(p, node, head, hlist) { 1776 hlist_for_each_entry_rcu(p, node, head, hlist) {
1318 sym = kallsyms_lookup((unsigned long)p->addr, NULL, 1777 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
1319 &offset, &modname, namebuf); 1778 &offset, &modname, namebuf);
1320 if (p->pre_handler == aggr_pre_handler) { 1779 if (kprobe_aggrprobe(p)) {
1321 list_for_each_entry_rcu(kp, &p->list, list) 1780 list_for_each_entry_rcu(kp, &p->list, list)
1322 report_probe(pi, kp, sym, offset, modname); 1781 report_probe(pi, kp, sym, offset, modname, p);
1323 } else 1782 } else
1324 report_probe(pi, p, sym, offset, modname); 1783 report_probe(pi, p, sym, offset, modname, NULL);
1325 } 1784 }
1326 preempt_enable(); 1785 preempt_enable();
1327 return 0; 1786 return 0;
@@ -1399,12 +1858,13 @@ int __kprobes enable_kprobe(struct kprobe *kp)
1399 goto out; 1858 goto out;
1400 } 1859 }
1401 1860
1402 if (!kprobes_all_disarmed && kprobe_disabled(p))
1403 arm_kprobe(p);
1404
1405 p->flags &= ~KPROBE_FLAG_DISABLED;
1406 if (p != kp) 1861 if (p != kp)
1407 kp->flags &= ~KPROBE_FLAG_DISABLED; 1862 kp->flags &= ~KPROBE_FLAG_DISABLED;
1863
1864 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1865 p->flags &= ~KPROBE_FLAG_DISABLED;
1866 arm_kprobe(p);
1867 }
1408out: 1868out:
1409 mutex_unlock(&kprobe_mutex); 1869 mutex_unlock(&kprobe_mutex);
1410 return ret; 1870 return ret;
@@ -1424,12 +1884,13 @@ static void __kprobes arm_all_kprobes(void)
1424 if (!kprobes_all_disarmed) 1884 if (!kprobes_all_disarmed)
1425 goto already_enabled; 1885 goto already_enabled;
1426 1886
1887 /* Arming kprobes doesn't optimize kprobe itself */
1427 mutex_lock(&text_mutex); 1888 mutex_lock(&text_mutex);
1428 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1889 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1429 head = &kprobe_table[i]; 1890 head = &kprobe_table[i];
1430 hlist_for_each_entry_rcu(p, node, head, hlist) 1891 hlist_for_each_entry_rcu(p, node, head, hlist)
1431 if (!kprobe_disabled(p)) 1892 if (!kprobe_disabled(p))
1432 arch_arm_kprobe(p); 1893 __arm_kprobe(p);
1433 } 1894 }
1434 mutex_unlock(&text_mutex); 1895 mutex_unlock(&text_mutex);
1435 1896
@@ -1456,16 +1917,23 @@ static void __kprobes disarm_all_kprobes(void)
1456 1917
1457 kprobes_all_disarmed = true; 1918 kprobes_all_disarmed = true;
1458 printk(KERN_INFO "Kprobes globally disabled\n"); 1919 printk(KERN_INFO "Kprobes globally disabled\n");
1920
1921 /*
1922 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1923 * because disarming may also unoptimize kprobes.
1924 */
1925 get_online_cpus();
1459 mutex_lock(&text_mutex); 1926 mutex_lock(&text_mutex);
1460 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1927 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1461 head = &kprobe_table[i]; 1928 head = &kprobe_table[i];
1462 hlist_for_each_entry_rcu(p, node, head, hlist) { 1929 hlist_for_each_entry_rcu(p, node, head, hlist) {
1463 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 1930 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1464 arch_disarm_kprobe(p); 1931 __disarm_kprobe(p);
1465 } 1932 }
1466 } 1933 }
1467 1934
1468 mutex_unlock(&text_mutex); 1935 mutex_unlock(&text_mutex);
1936 put_online_cpus();
1469 mutex_unlock(&kprobe_mutex); 1937 mutex_unlock(&kprobe_mutex);
1470 /* Allow all currently running kprobes to complete */ 1938 /* Allow all currently running kprobes to complete */
1471 synchronize_sched(); 1939 synchronize_sched();
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6b1ccc3f0205..21fe3c426948 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
33} 33}
34KERNEL_ATTR_RO(uevent_seqnum); 34KERNEL_ATTR_RO(uevent_seqnum);
35 35
36/* uevent helper program, used during early boo */ 36/* uevent helper program, used during early boot */
37static ssize_t uevent_helper_show(struct kobject *kobj, 37static ssize_t uevent_helper_show(struct kobject *kobj,
38 struct kobj_attribute *attr, char *buf) 38 struct kobj_attribute *attr, char *buf)
39{ 39{
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 82ed0ea15194..83911c780175 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -219,7 +219,7 @@ int kthreadd(void *unused)
219 set_task_comm(tsk, "kthreadd"); 219 set_task_comm(tsk, "kthreadd");
220 ignore_signals(tsk); 220 ignore_signals(tsk);
221 set_cpus_allowed_ptr(tsk, cpu_all_mask); 221 set_cpus_allowed_ptr(tsk, cpu_all_mask);
222 set_mems_allowed(node_possible_map); 222 set_mems_allowed(node_states[N_HIGH_MEMORY]);
223 223
224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
225 225
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0c30d0455de1..c927a549db2c 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3211,8 +3211,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3211{ 3211{
3212 unsigned long flags; 3212 unsigned long flags;
3213 3213
3214 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
3215
3216 if (unlikely(current->lockdep_recursion)) 3214 if (unlikely(current->lockdep_recursion))
3217 return; 3215 return;
3218 3216
@@ -3220,6 +3218,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3220 check_flags(flags); 3218 check_flags(flags);
3221 3219
3222 current->lockdep_recursion = 1; 3220 current->lockdep_recursion = 1;
3221 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
3223 __lock_acquire(lock, subclass, trylock, read, check, 3222 __lock_acquire(lock, subclass, trylock, read, check,
3224 irqs_disabled_flags(flags), nest_lock, ip, 0); 3223 irqs_disabled_flags(flags), nest_lock, ip, 0);
3225 current->lockdep_recursion = 0; 3224 current->lockdep_recursion = 0;
@@ -3232,14 +3231,13 @@ void lock_release(struct lockdep_map *lock, int nested,
3232{ 3231{
3233 unsigned long flags; 3232 unsigned long flags;
3234 3233
3235 trace_lock_release(lock, nested, ip);
3236
3237 if (unlikely(current->lockdep_recursion)) 3234 if (unlikely(current->lockdep_recursion))
3238 return; 3235 return;
3239 3236
3240 raw_local_irq_save(flags); 3237 raw_local_irq_save(flags);
3241 check_flags(flags); 3238 check_flags(flags);
3242 current->lockdep_recursion = 1; 3239 current->lockdep_recursion = 1;
3240 trace_lock_release(lock, nested, ip);
3243 __lock_release(lock, nested, ip); 3241 __lock_release(lock, nested, ip);
3244 current->lockdep_recursion = 0; 3242 current->lockdep_recursion = 0;
3245 raw_local_irq_restore(flags); 3243 raw_local_irq_restore(flags);
@@ -3413,8 +3411,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3413{ 3411{
3414 unsigned long flags; 3412 unsigned long flags;
3415 3413
3416 trace_lock_contended(lock, ip);
3417
3418 if (unlikely(!lock_stat)) 3414 if (unlikely(!lock_stat))
3419 return; 3415 return;
3420 3416
@@ -3424,6 +3420,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3424 raw_local_irq_save(flags); 3420 raw_local_irq_save(flags);
3425 check_flags(flags); 3421 check_flags(flags);
3426 current->lockdep_recursion = 1; 3422 current->lockdep_recursion = 1;
3423 trace_lock_contended(lock, ip);
3427 __lock_contended(lock, ip); 3424 __lock_contended(lock, ip);
3428 current->lockdep_recursion = 0; 3425 current->lockdep_recursion = 0;
3429 raw_local_irq_restore(flags); 3426 raw_local_irq_restore(flags);
@@ -3822,6 +3819,7 @@ void lockdep_rcu_dereference(const char *file, const int line)
3822 printk("%s:%d invoked rcu_dereference_check() without protection!\n", 3819 printk("%s:%d invoked rcu_dereference_check() without protection!\n",
3823 file, line); 3820 file, line);
3824 printk("\nother info that might help us debug this:\n\n"); 3821 printk("\nother info that might help us debug this:\n\n");
3822 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
3825 lockdep_print_held_locks(curr); 3823 lockdep_print_held_locks(curr);
3826 printk("\nstack backtrace:\n"); 3824 printk("\nstack backtrace:\n");
3827 dump_stack(); 3825 dump_stack();
diff --git a/kernel/module.c b/kernel/module.c
index f82386bd9ee9..c968d3606dca 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -474,9 +474,10 @@ static void module_unload_init(struct module *mod)
474 474
475 INIT_LIST_HEAD(&mod->modules_which_use_me); 475 INIT_LIST_HEAD(&mod->modules_which_use_me);
476 for_each_possible_cpu(cpu) 476 for_each_possible_cpu(cpu)
477 local_set(__module_ref_addr(mod, cpu), 0); 477 per_cpu_ptr(mod->refptr, cpu)->count = 0;
478
478 /* Hold reference count during initialization. */ 479 /* Hold reference count during initialization. */
479 local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); 480 __this_cpu_write(mod->refptr->count, 1);
480 /* Backwards compatibility macros put refcount during init. */ 481 /* Backwards compatibility macros put refcount during init. */
481 mod->waiter = current; 482 mod->waiter = current;
482} 483}
@@ -619,7 +620,7 @@ unsigned int module_refcount(struct module *mod)
619 int cpu; 620 int cpu;
620 621
621 for_each_possible_cpu(cpu) 622 for_each_possible_cpu(cpu)
622 total += local_read(__module_ref_addr(mod, cpu)); 623 total += per_cpu_ptr(mod->refptr, cpu)->count;
623 return total; 624 return total;
624} 625}
625EXPORT_SYMBOL(module_refcount); 626EXPORT_SYMBOL(module_refcount);
@@ -796,14 +797,15 @@ static struct module_attribute refcnt = {
796void module_put(struct module *module) 797void module_put(struct module *module)
797{ 798{
798 if (module) { 799 if (module) {
799 unsigned int cpu = get_cpu(); 800 preempt_disable();
800 local_dec(__module_ref_addr(module, cpu)); 801 __this_cpu_dec(module->refptr->count);
802
801 trace_module_put(module, _RET_IP_, 803 trace_module_put(module, _RET_IP_,
802 local_read(__module_ref_addr(module, cpu))); 804 __this_cpu_read(module->refptr->count));
803 /* Maybe they're waiting for us to drop reference? */ 805 /* Maybe they're waiting for us to drop reference? */
804 if (unlikely(!module_is_live(module))) 806 if (unlikely(!module_is_live(module)))
805 wake_up_process(module->waiter); 807 wake_up_process(module->waiter);
806 put_cpu(); 808 preempt_enable();
807 } 809 }
808} 810}
809EXPORT_SYMBOL(module_put); 811EXPORT_SYMBOL(module_put);
@@ -1083,6 +1085,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1083 if (sattr->name == NULL) 1085 if (sattr->name == NULL)
1084 goto out; 1086 goto out;
1085 sect_attrs->nsections++; 1087 sect_attrs->nsections++;
1088 sysfs_attr_init(&sattr->mattr.attr);
1086 sattr->mattr.show = module_sect_show; 1089 sattr->mattr.show = module_sect_show;
1087 sattr->mattr.store = NULL; 1090 sattr->mattr.store = NULL;
1088 sattr->mattr.attr.name = sattr->name; 1091 sattr->mattr.attr.name = sattr->name;
@@ -1178,6 +1181,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1178 if (sect_empty(&sechdrs[i])) 1181 if (sect_empty(&sechdrs[i]))
1179 continue; 1182 continue;
1180 if (sechdrs[i].sh_type == SHT_NOTE) { 1183 if (sechdrs[i].sh_type == SHT_NOTE) {
1184 sysfs_bin_attr_init(nattr);
1181 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1185 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
1182 nattr->attr.mode = S_IRUGO; 1186 nattr->attr.mode = S_IRUGO;
1183 nattr->size = sechdrs[i].sh_size; 1187 nattr->size = sechdrs[i].sh_size;
@@ -1250,6 +1254,7 @@ int module_add_modinfo_attrs(struct module *mod)
1250 if (!attr->test || 1254 if (!attr->test ||
1251 (attr->test && attr->test(mod))) { 1255 (attr->test && attr->test(mod))) {
1252 memcpy(temp_attr, attr, sizeof(*temp_attr)); 1256 memcpy(temp_attr, attr, sizeof(*temp_attr));
1257 sysfs_attr_init(&temp_attr->attr);
1253 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); 1258 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
1254 ++temp_attr; 1259 ++temp_attr;
1255 } 1260 }
@@ -1397,9 +1402,9 @@ static void free_module(struct module *mod)
1397 kfree(mod->args); 1402 kfree(mod->args);
1398 if (mod->percpu) 1403 if (mod->percpu)
1399 percpu_modfree(mod->percpu); 1404 percpu_modfree(mod->percpu);
1400#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 1405#if defined(CONFIG_MODULE_UNLOAD)
1401 if (mod->refptr) 1406 if (mod->refptr)
1402 percpu_modfree(mod->refptr); 1407 free_percpu(mod->refptr);
1403#endif 1408#endif
1404 /* Free lock-classes: */ 1409 /* Free lock-classes: */
1405 lockdep_free_key_range(mod->module_core, mod->core_size); 1410 lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -2162,9 +2167,8 @@ static noinline struct module *load_module(void __user *umod,
2162 mod = (void *)sechdrs[modindex].sh_addr; 2167 mod = (void *)sechdrs[modindex].sh_addr;
2163 kmemleak_load_module(mod, hdr, sechdrs, secstrings); 2168 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2164 2169
2165#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2170#if defined(CONFIG_MODULE_UNLOAD)
2166 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), 2171 mod->refptr = alloc_percpu(struct module_ref);
2167 mod->name);
2168 if (!mod->refptr) { 2172 if (!mod->refptr) {
2169 err = -ENOMEM; 2173 err = -ENOMEM;
2170 goto free_init; 2174 goto free_init;
@@ -2396,8 +2400,8 @@ static noinline struct module *load_module(void __user *umod,
2396 kobject_put(&mod->mkobj.kobj); 2400 kobject_put(&mod->mkobj.kobj);
2397 free_unload: 2401 free_unload:
2398 module_unload_free(mod); 2402 module_unload_free(mod);
2399#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2403#if defined(CONFIG_MODULE_UNLOAD)
2400 percpu_modfree(mod->refptr); 2404 free_percpu(mod->refptr);
2401 free_init: 2405 free_init:
2402#endif 2406#endif
2403 module_free(mod, mod->module_init); 2407 module_free(mod, mod->module_init);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 09b4ff9711b2..2ab67233ee8f 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -24,7 +24,18 @@
24 24
25static struct kmem_cache *nsproxy_cachep; 25static struct kmem_cache *nsproxy_cachep;
26 26
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 27struct nsproxy init_nsproxy = {
28 .count = ATOMIC_INIT(1),
29 .uts_ns = &init_uts_ns,
30#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
31 .ipc_ns = &init_ipc_ns,
32#endif
33 .mnt_ns = NULL,
34 .pid_ns = &init_pid_ns,
35#ifdef CONFIG_NET
36 .net_ns = &init_net,
37#endif
38};
28 39
29static inline struct nsproxy *create_nsproxy(void) 40static inline struct nsproxy *create_nsproxy(void)
30{ 41{
diff --git a/kernel/padata.c b/kernel/padata.c
index 6f9bcb8313d6..93caf65ff57c 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -642,6 +642,9 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
642 if (!pd) 642 if (!pd)
643 goto err_free_inst; 643 goto err_free_inst;
644 644
645 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
646 goto err_free_pd;
647
645 rcu_assign_pointer(pinst->pd, pd); 648 rcu_assign_pointer(pinst->pd, pd);
646 649
647 pinst->wq = wq; 650 pinst->wq = wq;
@@ -654,12 +657,14 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
654 pinst->cpu_notifier.priority = 0; 657 pinst->cpu_notifier.priority = 0;
655 err = register_hotcpu_notifier(&pinst->cpu_notifier); 658 err = register_hotcpu_notifier(&pinst->cpu_notifier);
656 if (err) 659 if (err)
657 goto err_free_pd; 660 goto err_free_cpumask;
658 661
659 mutex_init(&pinst->lock); 662 mutex_init(&pinst->lock);
660 663
661 return pinst; 664 return pinst;
662 665
666err_free_cpumask:
667 free_cpumask_var(pinst->cpumask);
663err_free_pd: 668err_free_pd:
664 padata_free_pd(pd); 669 padata_free_pd(pd);
665err_free_inst: 670err_free_inst:
@@ -685,6 +690,7 @@ void padata_free(struct padata_instance *pinst)
685 690
686 unregister_hotcpu_notifier(&pinst->cpu_notifier); 691 unregister_hotcpu_notifier(&pinst->cpu_notifier);
687 padata_free_pd(pinst->pd); 692 padata_free_pd(pinst->pd);
693 free_cpumask_var(pinst->cpumask);
688 kfree(pinst); 694 kfree(pinst);
689} 695}
690EXPORT_SYMBOL(padata_free); 696EXPORT_SYMBOL(padata_free);
diff --git a/kernel/panic.c b/kernel/panic.c
index c787333282b8..13d966b4c14a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -36,15 +36,36 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
36 36
37EXPORT_SYMBOL(panic_notifier_list); 37EXPORT_SYMBOL(panic_notifier_list);
38 38
39static long no_blink(long time)
40{
41 return 0;
42}
43
44/* Returns how long it waited in ms */ 39/* Returns how long it waited in ms */
45long (*panic_blink)(long time); 40long (*panic_blink)(long time);
46EXPORT_SYMBOL(panic_blink); 41EXPORT_SYMBOL(panic_blink);
47 42
43static void panic_blink_one_second(void)
44{
45 static long i = 0, end;
46
47 if (panic_blink) {
48 end = i + MSEC_PER_SEC;
49
50 while (i < end) {
51 i += panic_blink(i);
52 mdelay(1);
53 i++;
54 }
55 } else {
56 /*
57 * When running under a hypervisor a small mdelay may get
58 * rounded up to the hypervisor timeslice. For example, with
59 * a 1ms in 10ms hypervisor timeslice we might inflate a
60 * mdelay(1) loop by 10x.
61 *
62 * If we have nothing to blink, spin on 1 second calls to
63 * mdelay to avoid this.
64 */
65 mdelay(MSEC_PER_SEC);
66 }
67}
68
48/** 69/**
49 * panic - halt the system 70 * panic - halt the system
50 * @fmt: The text string to print 71 * @fmt: The text string to print
@@ -95,9 +116,6 @@ NORET_TYPE void panic(const char * fmt, ...)
95 116
96 bust_spinlocks(0); 117 bust_spinlocks(0);
97 118
98 if (!panic_blink)
99 panic_blink = no_blink;
100
101 if (panic_timeout > 0) { 119 if (panic_timeout > 0) {
102 /* 120 /*
103 * Delay timeout seconds before rebooting the machine. 121 * Delay timeout seconds before rebooting the machine.
@@ -105,11 +123,9 @@ NORET_TYPE void panic(const char * fmt, ...)
105 */ 123 */
106 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); 124 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
107 125
108 for (i = 0; i < panic_timeout*1000; ) { 126 for (i = 0; i < panic_timeout; i++) {
109 touch_nmi_watchdog(); 127 touch_nmi_watchdog();
110 i += panic_blink(i); 128 panic_blink_one_second();
111 mdelay(1);
112 i++;
113 } 129 }
114 /* 130 /*
115 * This will not be a clean reboot, with everything 131 * This will not be a clean reboot, with everything
@@ -135,11 +151,9 @@ NORET_TYPE void panic(const char * fmt, ...)
135 } 151 }
136#endif 152#endif
137 local_irq_enable(); 153 local_irq_enable();
138 for (i = 0; ; ) { 154 while (1) {
139 touch_softlockup_watchdog(); 155 touch_softlockup_watchdog();
140 i += panic_blink(i); 156 panic_blink_one_second();
141 mdelay(1);
142 i++;
143 } 157 }
144} 158}
145 159
diff --git a/kernel/params.c b/kernel/params.c
index cf1b69183127..0b30ecd53a52 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,7 +24,6 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27#include <linux/string.h>
28 27
29#if 0 28#if 0
30#define DEBUGP printk 29#define DEBUGP printk
@@ -402,8 +401,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
402} 401}
403 402
404/* sysfs output in /sys/modules/XYZ/parameters/ */ 403/* sysfs output in /sys/modules/XYZ/parameters/ */
405#define to_module_attr(n) container_of(n, struct module_attribute, attr); 404#define to_module_attr(n) container_of(n, struct module_attribute, attr)
406#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); 405#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
407 406
408extern struct kernel_param __start___param[], __stop___param[]; 407extern struct kernel_param __start___param[], __stop___param[];
409 408
@@ -421,7 +420,7 @@ struct module_param_attrs
421}; 420};
422 421
423#ifdef CONFIG_SYSFS 422#ifdef CONFIG_SYSFS
424#define to_param_attr(n) container_of(n, struct param_attribute, mattr); 423#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
425 424
426static ssize_t param_attr_show(struct module_attribute *mattr, 425static ssize_t param_attr_show(struct module_attribute *mattr,
427 struct module *mod, char *buf) 426 struct module *mod, char *buf)
@@ -517,6 +516,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
517 new->grp.attrs = attrs; 516 new->grp.attrs = attrs;
518 517
519 /* Tack new one on the end. */ 518 /* Tack new one on the end. */
519 sysfs_attr_init(&new->attrs[num].mattr.attr);
520 new->attrs[num].param = kp; 520 new->attrs[num].param = kp;
521 new->attrs[num].mattr.show = param_attr_show; 521 new->attrs[num].mattr.show = param_attr_show;
522 new->attrs[num].mattr.store = param_attr_store; 522 new->attrs[num].mattr.store = param_attr_store;
@@ -723,7 +723,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
723 return ret; 723 return ret;
724} 724}
725 725
726static struct sysfs_ops module_sysfs_ops = { 726static const struct sysfs_ops module_sysfs_ops = {
727 .show = module_attr_show, 727 .show = module_attr_show,
728 .store = module_attr_store, 728 .store = module_attr_store,
729}; 729};
@@ -737,7 +737,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj)
737 return 0; 737 return 0;
738} 738}
739 739
740static struct kset_uevent_ops module_uevent_ops = { 740static const struct kset_uevent_ops module_uevent_ops = {
741 .filter = uevent_filter, 741 .filter = uevent_filter,
742}; 742};
743 743
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a661e7991865..574ee58a3046 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -56,21 +56,6 @@ static atomic_t nr_task_events __read_mostly;
56 */ 56 */
57int sysctl_perf_event_paranoid __read_mostly = 1; 57int sysctl_perf_event_paranoid __read_mostly = 1;
58 58
59static inline bool perf_paranoid_tracepoint_raw(void)
60{
61 return sysctl_perf_event_paranoid > -1;
62}
63
64static inline bool perf_paranoid_cpu(void)
65{
66 return sysctl_perf_event_paranoid > 0;
67}
68
69static inline bool perf_paranoid_kernel(void)
70{
71 return sysctl_perf_event_paranoid > 1;
72}
73
74int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 59int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
75 60
76/* 61/*
@@ -96,10 +81,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
96void __weak hw_perf_disable(void) { barrier(); } 81void __weak hw_perf_disable(void) { barrier(); }
97void __weak hw_perf_enable(void) { barrier(); } 82void __weak hw_perf_enable(void) { barrier(); }
98 83
99void __weak hw_perf_event_setup(int cpu) { barrier(); }
100void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
101void __weak hw_perf_event_setup_offline(int cpu) { barrier(); }
102
103int __weak 84int __weak
104hw_perf_group_sched_in(struct perf_event *group_leader, 85hw_perf_group_sched_in(struct perf_event *group_leader,
105 struct perf_cpu_context *cpuctx, 86 struct perf_cpu_context *cpuctx,
@@ -112,25 +93,15 @@ void __weak perf_event_print_debug(void) { }
112 93
113static DEFINE_PER_CPU(int, perf_disable_count); 94static DEFINE_PER_CPU(int, perf_disable_count);
114 95
115void __perf_disable(void)
116{
117 __get_cpu_var(perf_disable_count)++;
118}
119
120bool __perf_enable(void)
121{
122 return !--__get_cpu_var(perf_disable_count);
123}
124
125void perf_disable(void) 96void perf_disable(void)
126{ 97{
127 __perf_disable(); 98 if (!__get_cpu_var(perf_disable_count)++)
128 hw_perf_disable(); 99 hw_perf_disable();
129} 100}
130 101
131void perf_enable(void) 102void perf_enable(void)
132{ 103{
133 if (__perf_enable()) 104 if (!--__get_cpu_var(perf_disable_count))
134 hw_perf_enable(); 105 hw_perf_enable();
135} 106}
136 107
@@ -1553,12 +1524,15 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1553 */ 1524 */
1554 if (interrupts == MAX_INTERRUPTS) { 1525 if (interrupts == MAX_INTERRUPTS) {
1555 perf_log_throttle(event, 1); 1526 perf_log_throttle(event, 1);
1527 perf_disable();
1556 event->pmu->unthrottle(event); 1528 event->pmu->unthrottle(event);
1529 perf_enable();
1557 } 1530 }
1558 1531
1559 if (!event->attr.freq || !event->attr.sample_freq) 1532 if (!event->attr.freq || !event->attr.sample_freq)
1560 continue; 1533 continue;
1561 1534
1535 perf_disable();
1562 event->pmu->read(event); 1536 event->pmu->read(event);
1563 now = atomic64_read(&event->count); 1537 now = atomic64_read(&event->count);
1564 delta = now - hwc->freq_count_stamp; 1538 delta = now - hwc->freq_count_stamp;
@@ -1566,6 +1540,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1566 1540
1567 if (delta > 0) 1541 if (delta > 0)
1568 perf_adjust_period(event, TICK_NSEC, delta); 1542 perf_adjust_period(event, TICK_NSEC, delta);
1543 perf_enable();
1569 } 1544 }
1570 raw_spin_unlock(&ctx->lock); 1545 raw_spin_unlock(&ctx->lock);
1571} 1546}
@@ -1575,9 +1550,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1575 */ 1550 */
1576static void rotate_ctx(struct perf_event_context *ctx) 1551static void rotate_ctx(struct perf_event_context *ctx)
1577{ 1552{
1578 if (!ctx->nr_events)
1579 return;
1580
1581 raw_spin_lock(&ctx->lock); 1553 raw_spin_lock(&ctx->lock);
1582 1554
1583 /* Rotate the first entry last of non-pinned groups */ 1555 /* Rotate the first entry last of non-pinned groups */
@@ -1590,19 +1562,28 @@ void perf_event_task_tick(struct task_struct *curr)
1590{ 1562{
1591 struct perf_cpu_context *cpuctx; 1563 struct perf_cpu_context *cpuctx;
1592 struct perf_event_context *ctx; 1564 struct perf_event_context *ctx;
1565 int rotate = 0;
1593 1566
1594 if (!atomic_read(&nr_events)) 1567 if (!atomic_read(&nr_events))
1595 return; 1568 return;
1596 1569
1597 cpuctx = &__get_cpu_var(perf_cpu_context); 1570 cpuctx = &__get_cpu_var(perf_cpu_context);
1598 ctx = curr->perf_event_ctxp; 1571 if (cpuctx->ctx.nr_events &&
1572 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1573 rotate = 1;
1599 1574
1600 perf_disable(); 1575 ctx = curr->perf_event_ctxp;
1576 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
1577 rotate = 1;
1601 1578
1602 perf_ctx_adjust_freq(&cpuctx->ctx); 1579 perf_ctx_adjust_freq(&cpuctx->ctx);
1603 if (ctx) 1580 if (ctx)
1604 perf_ctx_adjust_freq(ctx); 1581 perf_ctx_adjust_freq(ctx);
1605 1582
1583 if (!rotate)
1584 return;
1585
1586 perf_disable();
1606 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 1587 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1607 if (ctx) 1588 if (ctx)
1608 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 1589 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1614,7 +1595,6 @@ void perf_event_task_tick(struct task_struct *curr)
1614 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 1595 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1615 if (ctx) 1596 if (ctx)
1616 task_ctx_sched_in(curr, EVENT_FLEXIBLE); 1597 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1617
1618 perf_enable(); 1598 perf_enable();
1619} 1599}
1620 1600
@@ -2610,7 +2590,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2610 if (user_locked > user_lock_limit) 2590 if (user_locked > user_lock_limit)
2611 extra = user_locked - user_lock_limit; 2591 extra = user_locked - user_lock_limit;
2612 2592
2613 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 2593 lock_limit = rlimit(RLIMIT_MEMLOCK);
2614 lock_limit >>= PAGE_SHIFT; 2594 lock_limit >>= PAGE_SHIFT;
2615 locked = vma->vm_mm->locked_vm + extra; 2595 locked = vma->vm_mm->locked_vm + extra;
2616 2596
@@ -2806,6 +2786,13 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2806 return NULL; 2786 return NULL;
2807} 2787}
2808 2788
2789#ifdef CONFIG_EVENT_TRACING
2790__weak
2791void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2792{
2793}
2794#endif
2795
2809/* 2796/*
2810 * Output 2797 * Output
2811 */ 2798 */
@@ -4123,8 +4110,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4123 if (rctx < 0) 4110 if (rctx < 0)
4124 return; 4111 return;
4125 4112
4126 data.addr = addr; 4113 perf_sample_data_init(&data, addr);
4127 data.raw = NULL;
4128 4114
4129 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4115 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4130 4116
@@ -4169,11 +4155,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4169 struct perf_event *event; 4155 struct perf_event *event;
4170 u64 period; 4156 u64 period;
4171 4157
4172 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 4158 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4173 event->pmu->read(event); 4159 event->pmu->read(event);
4174 4160
4175 data.addr = 0; 4161 perf_sample_data_init(&data, 0);
4176 data.raw = NULL;
4177 data.period = event->hw.last_period; 4162 data.period = event->hw.last_period;
4178 regs = get_irq_regs(); 4163 regs = get_irq_regs();
4179 /* 4164 /*
@@ -4335,26 +4320,20 @@ static const struct pmu perf_ops_task_clock = {
4335#ifdef CONFIG_EVENT_TRACING 4320#ifdef CONFIG_EVENT_TRACING
4336 4321
4337void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4322void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4338 int entry_size) 4323 int entry_size, struct pt_regs *regs)
4339{ 4324{
4325 struct perf_sample_data data;
4340 struct perf_raw_record raw = { 4326 struct perf_raw_record raw = {
4341 .size = entry_size, 4327 .size = entry_size,
4342 .data = record, 4328 .data = record,
4343 }; 4329 };
4344 4330
4345 struct perf_sample_data data = { 4331 perf_sample_data_init(&data, addr);
4346 .addr = addr, 4332 data.raw = &raw;
4347 .raw = &raw,
4348 };
4349
4350 struct pt_regs *regs = get_irq_regs();
4351
4352 if (!regs)
4353 regs = task_pt_regs(current);
4354 4333
4355 /* Trace events already protected against recursion */ 4334 /* Trace events already protected against recursion */
4356 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4335 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4357 &data, regs); 4336 &data, regs);
4358} 4337}
4359EXPORT_SYMBOL_GPL(perf_tp_event); 4338EXPORT_SYMBOL_GPL(perf_tp_event);
4360 4339
@@ -4370,7 +4349,7 @@ static int perf_tp_event_match(struct perf_event *event,
4370 4349
4371static void tp_perf_event_destroy(struct perf_event *event) 4350static void tp_perf_event_destroy(struct perf_event *event)
4372{ 4351{
4373 ftrace_profile_disable(event->attr.config); 4352 perf_trace_disable(event->attr.config);
4374} 4353}
4375 4354
4376static const struct pmu *tp_perf_event_init(struct perf_event *event) 4355static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4384,7 +4363,7 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4384 !capable(CAP_SYS_ADMIN)) 4363 !capable(CAP_SYS_ADMIN))
4385 return ERR_PTR(-EPERM); 4364 return ERR_PTR(-EPERM);
4386 4365
4387 if (ftrace_profile_enable(event->attr.config)) 4366 if (perf_trace_enable(event->attr.config))
4388 return NULL; 4367 return NULL;
4389 4368
4390 event->destroy = tp_perf_event_destroy; 4369 event->destroy = tp_perf_event_destroy;
@@ -4463,8 +4442,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
4463 struct perf_sample_data sample; 4442 struct perf_sample_data sample;
4464 struct pt_regs *regs = data; 4443 struct pt_regs *regs = data;
4465 4444
4466 sample.raw = NULL; 4445 perf_sample_data_init(&sample, bp->attr.bp_addr);
4467 sample.addr = bp->attr.bp_addr;
4468 4446
4469 if (!perf_exclude_event(bp, regs)) 4447 if (!perf_exclude_event(bp, regs))
4470 perf_swevent_add(bp, 1, 1, &sample, regs); 4448 perf_swevent_add(bp, 1, 1, &sample, regs);
@@ -5392,18 +5370,26 @@ int perf_event_init_task(struct task_struct *child)
5392 return ret; 5370 return ret;
5393} 5371}
5394 5372
5373static void __init perf_event_init_all_cpus(void)
5374{
5375 int cpu;
5376 struct perf_cpu_context *cpuctx;
5377
5378 for_each_possible_cpu(cpu) {
5379 cpuctx = &per_cpu(perf_cpu_context, cpu);
5380 __perf_event_init_context(&cpuctx->ctx, NULL);
5381 }
5382}
5383
5395static void __cpuinit perf_event_init_cpu(int cpu) 5384static void __cpuinit perf_event_init_cpu(int cpu)
5396{ 5385{
5397 struct perf_cpu_context *cpuctx; 5386 struct perf_cpu_context *cpuctx;
5398 5387
5399 cpuctx = &per_cpu(perf_cpu_context, cpu); 5388 cpuctx = &per_cpu(perf_cpu_context, cpu);
5400 __perf_event_init_context(&cpuctx->ctx, NULL);
5401 5389
5402 spin_lock(&perf_resource_lock); 5390 spin_lock(&perf_resource_lock);
5403 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 5391 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5404 spin_unlock(&perf_resource_lock); 5392 spin_unlock(&perf_resource_lock);
5405
5406 hw_perf_event_setup(cpu);
5407} 5393}
5408 5394
5409#ifdef CONFIG_HOTPLUG_CPU 5395#ifdef CONFIG_HOTPLUG_CPU
@@ -5443,20 +5429,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5443 perf_event_init_cpu(cpu); 5429 perf_event_init_cpu(cpu);
5444 break; 5430 break;
5445 5431
5446 case CPU_ONLINE:
5447 case CPU_ONLINE_FROZEN:
5448 hw_perf_event_setup_online(cpu);
5449 break;
5450
5451 case CPU_DOWN_PREPARE: 5432 case CPU_DOWN_PREPARE:
5452 case CPU_DOWN_PREPARE_FROZEN: 5433 case CPU_DOWN_PREPARE_FROZEN:
5453 perf_event_exit_cpu(cpu); 5434 perf_event_exit_cpu(cpu);
5454 break; 5435 break;
5455 5436
5456 case CPU_DEAD:
5457 hw_perf_event_setup_offline(cpu);
5458 break;
5459
5460 default: 5437 default:
5461 break; 5438 break;
5462 } 5439 }
@@ -5474,6 +5451,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = {
5474 5451
5475void __init perf_event_init(void) 5452void __init perf_event_init(void)
5476{ 5453{
5454 perf_event_init_all_cpus();
5477 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 5455 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5478 (void *)(long)smp_processor_id()); 5456 (void *)(long)smp_processor_id());
5479 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 5457 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
@@ -5481,13 +5459,16 @@ void __init perf_event_init(void)
5481 register_cpu_notifier(&perf_cpu_nb); 5459 register_cpu_notifier(&perf_cpu_nb);
5482} 5460}
5483 5461
5484static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) 5462static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5463 struct sysdev_class_attribute *attr,
5464 char *buf)
5485{ 5465{
5486 return sprintf(buf, "%d\n", perf_reserved_percpu); 5466 return sprintf(buf, "%d\n", perf_reserved_percpu);
5487} 5467}
5488 5468
5489static ssize_t 5469static ssize_t
5490perf_set_reserve_percpu(struct sysdev_class *class, 5470perf_set_reserve_percpu(struct sysdev_class *class,
5471 struct sysdev_class_attribute *attr,
5491 const char *buf, 5472 const char *buf,
5492 size_t count) 5473 size_t count)
5493{ 5474{
@@ -5516,13 +5497,17 @@ perf_set_reserve_percpu(struct sysdev_class *class,
5516 return count; 5497 return count;
5517} 5498}
5518 5499
5519static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) 5500static ssize_t perf_show_overcommit(struct sysdev_class *class,
5501 struct sysdev_class_attribute *attr,
5502 char *buf)
5520{ 5503{
5521 return sprintf(buf, "%d\n", perf_overcommit); 5504 return sprintf(buf, "%d\n", perf_overcommit);
5522} 5505}
5523 5506
5524static ssize_t 5507static ssize_t
5525perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) 5508perf_set_overcommit(struct sysdev_class *class,
5509 struct sysdev_class_attribute *attr,
5510 const char *buf, size_t count)
5526{ 5511{
5527 unsigned long val; 5512 unsigned long val;
5528 int err; 5513 int err;
diff --git a/kernel/pid.c b/kernel/pid.c
index b08e697cd83f..aebb30d9c233 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -367,7 +367,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
367 struct task_struct *result = NULL; 367 struct task_struct *result = NULL;
368 if (pid) { 368 if (pid) {
369 struct hlist_node *first; 369 struct hlist_node *first;
370 first = rcu_dereference_check(pid->tasks[type].first, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock)); 370 first = rcu_dereference_check(pid->tasks[type].first,
371 rcu_read_lock_held() ||
372 lockdep_tasklist_lock_is_held());
371 if (first) 373 if (first)
372 result = hlist_entry(first, struct task_struct, pids[(type)].node); 374 result = hlist_entry(first, struct task_struct, pids[(type)].node);
373 } 375 }
@@ -376,7 +378,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
376EXPORT_SYMBOL(pid_task); 378EXPORT_SYMBOL(pid_task);
377 379
378/* 380/*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 381 * Must be called under rcu_read_lock().
380 */ 382 */
381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 383struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382{ 384{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 86b3796b0436..79aac93acf99 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -161,13 +161,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
161 rcu_read_lock(); 161 rcu_read_lock();
162 162
163 /* 163 /*
164 * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring 164 * Any nested-container's init processes won't ignore the
165 * any nested-container's init processes don't ignore the 165 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
166 * signal
167 */ 166 */
168 task = pid_task(find_vpid(nr), PIDTYPE_PID); 167 task = pid_task(find_vpid(nr), PIDTYPE_PID);
169 if (task) 168 if (task)
170 force_sig(SIGKILL, task); 169 send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
171 170
172 rcu_read_unlock(); 171 rcu_read_unlock();
173 172
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 438ff4523513..bc7704b3a443 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -982,6 +982,7 @@ static void check_thread_timers(struct task_struct *tsk,
982 int maxfire; 982 int maxfire;
983 struct list_head *timers = tsk->cpu_timers; 983 struct list_head *timers = tsk->cpu_timers;
984 struct signal_struct *const sig = tsk->signal; 984 struct signal_struct *const sig = tsk->signal;
985 unsigned long soft;
985 986
986 maxfire = 20; 987 maxfire = 20;
987 tsk->cputime_expires.prof_exp = cputime_zero; 988 tsk->cputime_expires.prof_exp = cputime_zero;
@@ -1030,9 +1031,10 @@ static void check_thread_timers(struct task_struct *tsk,
1030 /* 1031 /*
1031 * Check for the special case thread timers. 1032 * Check for the special case thread timers.
1032 */ 1033 */
1033 if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { 1034 soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
1034 unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; 1035 if (soft != RLIM_INFINITY) {
1035 unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; 1036 unsigned long hard =
1037 ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
1036 1038
1037 if (hard != RLIM_INFINITY && 1039 if (hard != RLIM_INFINITY &&
1038 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { 1040 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -1043,14 +1045,13 @@ static void check_thread_timers(struct task_struct *tsk,
1043 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1045 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1044 return; 1046 return;
1045 } 1047 }
1046 if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { 1048 if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
1047 /* 1049 /*
1048 * At the soft limit, send a SIGXCPU every second. 1050 * At the soft limit, send a SIGXCPU every second.
1049 */ 1051 */
1050 if (sig->rlim[RLIMIT_RTTIME].rlim_cur 1052 if (soft < hard) {
1051 < sig->rlim[RLIMIT_RTTIME].rlim_max) { 1053 soft += USEC_PER_SEC;
1052 sig->rlim[RLIMIT_RTTIME].rlim_cur += 1054 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
1053 USEC_PER_SEC;
1054 } 1055 }
1055 printk(KERN_INFO 1056 printk(KERN_INFO
1056 "RT Watchdog Timeout: %s[%d]\n", 1057 "RT Watchdog Timeout: %s[%d]\n",
@@ -1060,9 +1061,9 @@ static void check_thread_timers(struct task_struct *tsk,
1060 } 1061 }
1061} 1062}
1062 1063
1063static void stop_process_timers(struct task_struct *tsk) 1064static void stop_process_timers(struct signal_struct *sig)
1064{ 1065{
1065 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 1066 struct thread_group_cputimer *cputimer = &sig->cputimer;
1066 unsigned long flags; 1067 unsigned long flags;
1067 1068
1068 if (!cputimer->running) 1069 if (!cputimer->running)
@@ -1071,6 +1072,10 @@ static void stop_process_timers(struct task_struct *tsk)
1071 spin_lock_irqsave(&cputimer->lock, flags); 1072 spin_lock_irqsave(&cputimer->lock, flags);
1072 cputimer->running = 0; 1073 cputimer->running = 0;
1073 spin_unlock_irqrestore(&cputimer->lock, flags); 1074 spin_unlock_irqrestore(&cputimer->lock, flags);
1075
1076 sig->cputime_expires.prof_exp = cputime_zero;
1077 sig->cputime_expires.virt_exp = cputime_zero;
1078 sig->cputime_expires.sched_exp = 0;
1074} 1079}
1075 1080
1076static u32 onecputick; 1081static u32 onecputick;
@@ -1121,6 +1126,7 @@ static void check_process_timers(struct task_struct *tsk,
1121 unsigned long long sum_sched_runtime, sched_expires; 1126 unsigned long long sum_sched_runtime, sched_expires;
1122 struct list_head *timers = sig->cpu_timers; 1127 struct list_head *timers = sig->cpu_timers;
1123 struct task_cputime cputime; 1128 struct task_cputime cputime;
1129 unsigned long soft;
1124 1130
1125 /* 1131 /*
1126 * Don't sample the current process CPU clocks if there are no timers. 1132 * Don't sample the current process CPU clocks if there are no timers.
@@ -1131,7 +1137,7 @@ static void check_process_timers(struct task_struct *tsk,
1131 list_empty(&timers[CPUCLOCK_VIRT]) && 1137 list_empty(&timers[CPUCLOCK_VIRT]) &&
1132 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && 1138 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1133 list_empty(&timers[CPUCLOCK_SCHED])) { 1139 list_empty(&timers[CPUCLOCK_SCHED])) {
1134 stop_process_timers(tsk); 1140 stop_process_timers(sig);
1135 return; 1141 return;
1136 } 1142 }
1137 1143
@@ -1193,11 +1199,13 @@ static void check_process_timers(struct task_struct *tsk,
1193 SIGPROF); 1199 SIGPROF);
1194 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, 1200 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1195 SIGVTALRM); 1201 SIGVTALRM);
1196 1202 soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1197 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 1203 if (soft != RLIM_INFINITY) {
1198 unsigned long psecs = cputime_to_secs(ptime); 1204 unsigned long psecs = cputime_to_secs(ptime);
1205 unsigned long hard =
1206 ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
1199 cputime_t x; 1207 cputime_t x;
1200 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { 1208 if (psecs >= hard) {
1201 /* 1209 /*
1202 * At the hard limit, we just die. 1210 * At the hard limit, we just die.
1203 * No need to calculate anything else now. 1211 * No need to calculate anything else now.
@@ -1205,17 +1213,17 @@ static void check_process_timers(struct task_struct *tsk,
1205 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1213 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1206 return; 1214 return;
1207 } 1215 }
1208 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { 1216 if (psecs >= soft) {
1209 /* 1217 /*
1210 * At the soft limit, send a SIGXCPU every second. 1218 * At the soft limit, send a SIGXCPU every second.
1211 */ 1219 */
1212 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 1220 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1213 if (sig->rlim[RLIMIT_CPU].rlim_cur 1221 if (soft < hard) {
1214 < sig->rlim[RLIMIT_CPU].rlim_max) { 1222 soft++;
1215 sig->rlim[RLIMIT_CPU].rlim_cur++; 1223 sig->rlim[RLIMIT_CPU].rlim_cur = soft;
1216 } 1224 }
1217 } 1225 }
1218 x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 1226 x = secs_to_cputime(soft);
1219 if (cputime_eq(prof_expires, cputime_zero) || 1227 if (cputime_eq(prof_expires, cputime_zero) ||
1220 cputime_lt(x, prof_expires)) { 1228 cputime_lt(x, prof_expires)) {
1221 prof_expires = x; 1229 prof_expires = x;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index bbfe472d7524..da5288ec2392 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -323,6 +323,7 @@ static int create_image(int platform_mode)
323int hibernation_snapshot(int platform_mode) 323int hibernation_snapshot(int platform_mode)
324{ 324{
325 int error; 325 int error;
326 gfp_t saved_mask;
326 327
327 error = platform_begin(platform_mode); 328 error = platform_begin(platform_mode);
328 if (error) 329 if (error)
@@ -334,6 +335,7 @@ int hibernation_snapshot(int platform_mode)
334 goto Close; 335 goto Close;
335 336
336 suspend_console(); 337 suspend_console();
338 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
337 error = dpm_suspend_start(PMSG_FREEZE); 339 error = dpm_suspend_start(PMSG_FREEZE);
338 if (error) 340 if (error)
339 goto Recover_platform; 341 goto Recover_platform;
@@ -351,6 +353,7 @@ int hibernation_snapshot(int platform_mode)
351 353
352 dpm_resume_end(in_suspend ? 354 dpm_resume_end(in_suspend ?
353 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 355 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
356 set_gfp_allowed_mask(saved_mask);
354 resume_console(); 357 resume_console();
355 Close: 358 Close:
356 platform_end(platform_mode); 359 platform_end(platform_mode);
@@ -445,14 +448,17 @@ static int resume_target_kernel(bool platform_mode)
445int hibernation_restore(int platform_mode) 448int hibernation_restore(int platform_mode)
446{ 449{
447 int error; 450 int error;
451 gfp_t saved_mask;
448 452
449 pm_prepare_console(); 453 pm_prepare_console();
450 suspend_console(); 454 suspend_console();
455 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
451 error = dpm_suspend_start(PMSG_QUIESCE); 456 error = dpm_suspend_start(PMSG_QUIESCE);
452 if (!error) { 457 if (!error) {
453 error = resume_target_kernel(platform_mode); 458 error = resume_target_kernel(platform_mode);
454 dpm_resume_end(PMSG_RECOVER); 459 dpm_resume_end(PMSG_RECOVER);
455 } 460 }
461 set_gfp_allowed_mask(saved_mask);
456 resume_console(); 462 resume_console();
457 pm_restore_console(); 463 pm_restore_console();
458 return error; 464 return error;
@@ -466,6 +472,7 @@ int hibernation_restore(int platform_mode)
466int hibernation_platform_enter(void) 472int hibernation_platform_enter(void)
467{ 473{
468 int error; 474 int error;
475 gfp_t saved_mask;
469 476
470 if (!hibernation_ops) 477 if (!hibernation_ops)
471 return -ENOSYS; 478 return -ENOSYS;
@@ -481,6 +488,7 @@ int hibernation_platform_enter(void)
481 488
482 entering_platform_hibernation = true; 489 entering_platform_hibernation = true;
483 suspend_console(); 490 suspend_console();
491 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
484 error = dpm_suspend_start(PMSG_HIBERNATE); 492 error = dpm_suspend_start(PMSG_HIBERNATE);
485 if (error) { 493 if (error) {
486 if (hibernation_ops->recover) 494 if (hibernation_ops->recover)
@@ -518,6 +526,7 @@ int hibernation_platform_enter(void)
518 Resume_devices: 526 Resume_devices:
519 entering_platform_hibernation = false; 527 entering_platform_hibernation = false;
520 dpm_resume_end(PMSG_RESTORE); 528 dpm_resume_end(PMSG_RESTORE);
529 set_gfp_allowed_mask(saved_mask);
521 resume_console(); 530 resume_console();
522 531
523 Close: 532 Close:
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6f10dfc2d3e9..44cce10b582d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -189,6 +189,7 @@ static int suspend_enter(suspend_state_t state)
189int suspend_devices_and_enter(suspend_state_t state) 189int suspend_devices_and_enter(suspend_state_t state)
190{ 190{
191 int error; 191 int error;
192 gfp_t saved_mask;
192 193
193 if (!suspend_ops) 194 if (!suspend_ops)
194 return -ENOSYS; 195 return -ENOSYS;
@@ -199,6 +200,7 @@ int suspend_devices_and_enter(suspend_state_t state)
199 goto Close; 200 goto Close;
200 } 201 }
201 suspend_console(); 202 suspend_console();
203 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
202 suspend_test_start(); 204 suspend_test_start();
203 error = dpm_suspend_start(PMSG_SUSPEND); 205 error = dpm_suspend_start(PMSG_SUSPEND);
204 if (error) { 206 if (error) {
@@ -215,6 +217,7 @@ int suspend_devices_and_enter(suspend_state_t state)
215 suspend_test_start(); 217 suspend_test_start();
216 dpm_resume_end(PMSG_RESUME); 218 dpm_resume_end(PMSG_RESUME);
217 suspend_test_finish("resume devices"); 219 suspend_test_finish("resume devices");
220 set_gfp_allowed_mask(saved_mask);
218 resume_console(); 221 resume_console();
219 Close: 222 Close:
220 if (suspend_ops->end) 223 if (suspend_ops->end)
diff --git a/kernel/printk.c b/kernel/printk.c
index 1751c456b71f..75077ad0b537 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -35,6 +35,7 @@
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h> 36#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h> 37#include <linux/kmsg_dump.h>
38#include <linux/syslog.h>
38 39
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40 41
@@ -69,8 +70,6 @@ int console_printk[4] = {
69 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 70 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
70}; 71};
71 72
72static int saved_console_loglevel = -1;
73
74/* 73/*
75 * Low level drivers may need that to know if they can schedule in 74 * Low level drivers may need that to know if they can schedule in
76 * their unblank() callback or not. So let's export it. 75 * their unblank() callback or not. So let's export it.
@@ -145,6 +144,7 @@ static char __log_buf[__LOG_BUF_LEN];
145static char *log_buf = __log_buf; 144static char *log_buf = __log_buf;
146static int log_buf_len = __LOG_BUF_LEN; 145static int log_buf_len = __LOG_BUF_LEN;
147static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 146static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
147static int saved_console_loglevel = -1;
148 148
149#ifdef CONFIG_KEXEC 149#ifdef CONFIG_KEXEC
150/* 150/*
@@ -258,38 +258,23 @@ static inline void boot_delay_msec(void)
258} 258}
259#endif 259#endif
260 260
261/* 261int do_syslog(int type, char __user *buf, int len, bool from_file)
262 * Commands to do_syslog:
263 *
264 * 0 -- Close the log. Currently a NOP.
265 * 1 -- Open the log. Currently a NOP.
266 * 2 -- Read from the log.
267 * 3 -- Read all messages remaining in the ring buffer.
268 * 4 -- Read and clear all messages remaining in the ring buffer
269 * 5 -- Clear ring buffer.
270 * 6 -- Disable printk's to console
271 * 7 -- Enable printk's to console
272 * 8 -- Set level of messages printed to console
273 * 9 -- Return number of unread characters in the log buffer
274 * 10 -- Return size of the log buffer
275 */
276int do_syslog(int type, char __user *buf, int len)
277{ 262{
278 unsigned i, j, limit, count; 263 unsigned i, j, limit, count;
279 int do_clear = 0; 264 int do_clear = 0;
280 char c; 265 char c;
281 int error = 0; 266 int error = 0;
282 267
283 error = security_syslog(type); 268 error = security_syslog(type, from_file);
284 if (error) 269 if (error)
285 return error; 270 return error;
286 271
287 switch (type) { 272 switch (type) {
288 case 0: /* Close log */ 273 case SYSLOG_ACTION_CLOSE: /* Close log */
289 break; 274 break;
290 case 1: /* Open log */ 275 case SYSLOG_ACTION_OPEN: /* Open log */
291 break; 276 break;
292 case 2: /* Read from log */ 277 case SYSLOG_ACTION_READ: /* Read from log */
293 error = -EINVAL; 278 error = -EINVAL;
294 if (!buf || len < 0) 279 if (!buf || len < 0)
295 goto out; 280 goto out;
@@ -320,10 +305,12 @@ int do_syslog(int type, char __user *buf, int len)
320 if (!error) 305 if (!error)
321 error = i; 306 error = i;
322 break; 307 break;
323 case 4: /* Read/clear last kernel messages */ 308 /* Read/clear last kernel messages */
309 case SYSLOG_ACTION_READ_CLEAR:
324 do_clear = 1; 310 do_clear = 1;
325 /* FALL THRU */ 311 /* FALL THRU */
326 case 3: /* Read last kernel messages */ 312 /* Read last kernel messages */
313 case SYSLOG_ACTION_READ_ALL:
327 error = -EINVAL; 314 error = -EINVAL;
328 if (!buf || len < 0) 315 if (!buf || len < 0)
329 goto out; 316 goto out;
@@ -376,21 +363,25 @@ int do_syslog(int type, char __user *buf, int len)
376 } 363 }
377 } 364 }
378 break; 365 break;
379 case 5: /* Clear ring buffer */ 366 /* Clear ring buffer */
367 case SYSLOG_ACTION_CLEAR:
380 logged_chars = 0; 368 logged_chars = 0;
381 break; 369 break;
382 case 6: /* Disable logging to console */ 370 /* Disable logging to console */
371 case SYSLOG_ACTION_CONSOLE_OFF:
383 if (saved_console_loglevel == -1) 372 if (saved_console_loglevel == -1)
384 saved_console_loglevel = console_loglevel; 373 saved_console_loglevel = console_loglevel;
385 console_loglevel = minimum_console_loglevel; 374 console_loglevel = minimum_console_loglevel;
386 break; 375 break;
387 case 7: /* Enable logging to console */ 376 /* Enable logging to console */
377 case SYSLOG_ACTION_CONSOLE_ON:
388 if (saved_console_loglevel != -1) { 378 if (saved_console_loglevel != -1) {
389 console_loglevel = saved_console_loglevel; 379 console_loglevel = saved_console_loglevel;
390 saved_console_loglevel = -1; 380 saved_console_loglevel = -1;
391 } 381 }
392 break; 382 break;
393 case 8: /* Set level of messages printed to console */ 383 /* Set level of messages printed to console */
384 case SYSLOG_ACTION_CONSOLE_LEVEL:
394 error = -EINVAL; 385 error = -EINVAL;
395 if (len < 1 || len > 8) 386 if (len < 1 || len > 8)
396 goto out; 387 goto out;
@@ -401,10 +392,12 @@ int do_syslog(int type, char __user *buf, int len)
401 saved_console_loglevel = -1; 392 saved_console_loglevel = -1;
402 error = 0; 393 error = 0;
403 break; 394 break;
404 case 9: /* Number of chars in the log buffer */ 395 /* Number of chars in the log buffer */
396 case SYSLOG_ACTION_SIZE_UNREAD:
405 error = log_end - log_start; 397 error = log_end - log_start;
406 break; 398 break;
407 case 10: /* Size of the log buffer */ 399 /* Size of the log buffer */
400 case SYSLOG_ACTION_SIZE_BUFFER:
408 error = log_buf_len; 401 error = log_buf_len;
409 break; 402 break;
410 default: 403 default:
@@ -417,7 +410,7 @@ out:
417 410
418SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 411SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
419{ 412{
420 return do_syslog(type, buf, len); 413 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
421} 414}
422 415
423/* 416/*
diff --git a/kernel/range.c b/kernel/range.c
new file mode 100644
index 000000000000..74e2e6114927
--- /dev/null
+++ b/kernel/range.c
@@ -0,0 +1,163 @@
1/*
2 * Range add and subtract
3 */
4#include <linux/module.h>
5#include <linux/init.h>
6#include <linux/sort.h>
7
8#include <linux/range.h>
9
10#ifndef ARRAY_SIZE
11#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
12#endif
13
14int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
15{
16 if (start >= end)
17 return nr_range;
18
19 /* Out of slots: */
20 if (nr_range >= az)
21 return nr_range;
22
23 range[nr_range].start = start;
24 range[nr_range].end = end;
25
26 nr_range++;
27
28 return nr_range;
29}
30
31int add_range_with_merge(struct range *range, int az, int nr_range,
32 u64 start, u64 end)
33{
34 int i;
35
36 if (start >= end)
37 return nr_range;
38
39 /* Try to merge it with old one: */
40 for (i = 0; i < nr_range; i++) {
41 u64 final_start, final_end;
42 u64 common_start, common_end;
43
44 if (!range[i].end)
45 continue;
46
47 common_start = max(range[i].start, start);
48 common_end = min(range[i].end, end);
49 if (common_start > common_end)
50 continue;
51
52 final_start = min(range[i].start, start);
53 final_end = max(range[i].end, end);
54
55 range[i].start = final_start;
56 range[i].end = final_end;
57 return nr_range;
58 }
59
60 /* Need to add it: */
61 return add_range(range, az, nr_range, start, end);
62}
63
64void subtract_range(struct range *range, int az, u64 start, u64 end)
65{
66 int i, j;
67
68 if (start >= end)
69 return;
70
71 for (j = 0; j < az; j++) {
72 if (!range[j].end)
73 continue;
74
75 if (start <= range[j].start && end >= range[j].end) {
76 range[j].start = 0;
77 range[j].end = 0;
78 continue;
79 }
80
81 if (start <= range[j].start && end < range[j].end &&
82 range[j].start < end) {
83 range[j].start = end;
84 continue;
85 }
86
87
88 if (start > range[j].start && end >= range[j].end &&
89 range[j].end > start) {
90 range[j].end = start;
91 continue;
92 }
93
94 if (start > range[j].start && end < range[j].end) {
95 /* Find the new spare: */
96 for (i = 0; i < az; i++) {
97 if (range[i].end == 0)
98 break;
99 }
100 if (i < az) {
101 range[i].end = range[j].end;
102 range[i].start = end;
103 } else {
104 printk(KERN_ERR "run of slot in ranges\n");
105 }
106 range[j].end = start;
107 continue;
108 }
109 }
110}
111
112static int cmp_range(const void *x1, const void *x2)
113{
114 const struct range *r1 = x1;
115 const struct range *r2 = x2;
116 s64 start1, start2;
117
118 start1 = r1->start;
119 start2 = r2->start;
120
121 return start1 - start2;
122}
123
124int clean_sort_range(struct range *range, int az)
125{
126 int i, j, k = az - 1, nr_range = 0;
127
128 for (i = 0; i < k; i++) {
129 if (range[i].end)
130 continue;
131 for (j = k; j > i; j--) {
132 if (range[j].end) {
133 k = j;
134 break;
135 }
136 }
137 if (j == i)
138 break;
139 range[i].start = range[k].start;
140 range[i].end = range[k].end;
141 range[k].start = 0;
142 range[k].end = 0;
143 k--;
144 }
145 /* count it */
146 for (i = 0; i < az; i++) {
147 if (!range[i].end) {
148 nr_range = i;
149 break;
150 }
151 }
152
153 /* sort them */
154 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
155
156 return nr_range;
157}
158
159void sort_range(struct range *range, int nr_range)
160{
161 /* sort them */
162 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
163}
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f1125c1a6321..63fe25433980 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,6 +45,7 @@
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h> 47#include <linux/kernel_stat.h>
48#include <linux/hardirq.h>
48 49
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 50#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 51static struct lock_class_key rcu_lock_key;
@@ -66,6 +67,28 @@ EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
66int rcu_scheduler_active __read_mostly; 67int rcu_scheduler_active __read_mostly;
67EXPORT_SYMBOL_GPL(rcu_scheduler_active); 68EXPORT_SYMBOL_GPL(rcu_scheduler_active);
68 69
70#ifdef CONFIG_DEBUG_LOCK_ALLOC
71
72/**
73 * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
74 *
75 * Check for bottom half being disabled, which covers both the
76 * CONFIG_PROVE_RCU and not cases. Note that if someone uses
77 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
78 * will show the situation.
79 *
80 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
81 */
int rcu_read_lock_bh_held(void)
{
	/* With the lockdep-RCU checks active, report the softirq state. */
	if (debug_lockdep_rcu_enabled())
		return in_softirq();

	/* Checks not active: answer "held" so callers raise no complaint. */
	return 1;
}
88EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
89
90#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
91
69/* 92/*
70 * This function is invoked towards the end of the scheduler's initialization 93 * This function is invoked towards the end of the scheduler's initialization
71 * process. Before this is called, the idle task might contain 94 * process. Before this is called, the idle task might contain
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 258cdf0a91eb..58df55bf83ed 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -818,13 +818,13 @@ static void rcu_torture_timer(unsigned long unused)
818 /* Should not happen, but... */ 818 /* Should not happen, but... */
819 pipe_count = RCU_TORTURE_PIPE_LEN; 819 pipe_count = RCU_TORTURE_PIPE_LEN;
820 } 820 }
821 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 821 __this_cpu_inc(rcu_torture_count[pipe_count]);
822 completed = cur_ops->completed() - completed; 822 completed = cur_ops->completed() - completed;
823 if (completed > RCU_TORTURE_PIPE_LEN) { 823 if (completed > RCU_TORTURE_PIPE_LEN) {
824 /* Should not happen, but... */ 824 /* Should not happen, but... */
825 completed = RCU_TORTURE_PIPE_LEN; 825 completed = RCU_TORTURE_PIPE_LEN;
826 } 826 }
827 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 827 __this_cpu_inc(rcu_torture_batch[completed]);
828 preempt_enable(); 828 preempt_enable();
829 cur_ops->readunlock(idx); 829 cur_ops->readunlock(idx);
830} 830}
@@ -877,13 +877,13 @@ rcu_torture_reader(void *arg)
877 /* Should not happen, but... */ 877 /* Should not happen, but... */
878 pipe_count = RCU_TORTURE_PIPE_LEN; 878 pipe_count = RCU_TORTURE_PIPE_LEN;
879 } 879 }
880 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 880 __this_cpu_inc(rcu_torture_count[pipe_count]);
881 completed = cur_ops->completed() - completed; 881 completed = cur_ops->completed() - completed;
882 if (completed > RCU_TORTURE_PIPE_LEN) { 882 if (completed > RCU_TORTURE_PIPE_LEN) {
883 /* Should not happen, but... */ 883 /* Should not happen, but... */
884 completed = RCU_TORTURE_PIPE_LEN; 884 completed = RCU_TORTURE_PIPE_LEN;
885 } 885 }
886 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 886 __this_cpu_inc(rcu_torture_batch[completed]);
887 preempt_enable(); 887 preempt_enable();
888 cur_ops->readunlock(idx); 888 cur_ops->readunlock(idx);
889 schedule(); 889 schedule();
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1439eb504c22..4a525a30e08e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -246,12 +246,21 @@ struct rcu_data {
246 246
247#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 247#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
248#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 248#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
249#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ 249
250#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ 250#ifdef CONFIG_PROVE_RCU
251#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 251#define RCU_STALL_DELAY_DELTA (5 * HZ)
252 /* to take at least one */ 252#else
253 /* scheduling clock irq */ 253#define RCU_STALL_DELAY_DELTA 0
254 /* before ratting on them. */ 254#endif
255
256#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA)
257 /* for rsp->jiffies_stall */
258#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
259 /* for rsp->jiffies_stall */
260#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
261 /* to take at least one */
262 /* scheduling clock irq */
263 /* before ratting on them. */
255 264
256#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 265#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
257 266
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 464ad2cdee00..79b53bda8943 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1010,6 +1010,10 @@ int rcu_needs_cpu(int cpu)
1010 int c = 0; 1010 int c = 0;
1011 int thatcpu; 1011 int thatcpu;
1012 1012
1013 /* Check for being in the holdoff period. */
1014 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
1015 return rcu_needs_cpu_quick_check(cpu);
1016
1013 /* Don't bother unless we are the last non-dyntick-idle CPU. */ 1017 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1014 for_each_cpu_not(thatcpu, nohz_cpu_mask) 1018 for_each_cpu_not(thatcpu, nohz_cpu_mask)
1015 if (thatcpu != cpu) { 1019 if (thatcpu != cpu) {
@@ -1041,10 +1045,8 @@ int rcu_needs_cpu(int cpu)
1041 } 1045 }
1042 1046
1043 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 1047 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1044 if (c) { 1048 if (c)
1045 raise_softirq(RCU_SOFTIRQ); 1049 raise_softirq(RCU_SOFTIRQ);
1046 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1047 }
1048 return c; 1050 return c;
1049} 1051}
1050 1052
diff --git a/kernel/relay.c b/kernel/relay.c
index c705a41b4ba3..3d97f2821611 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
1215/* 1215/*
1216 * subbuf_splice_actor - splice up to one subbuf's worth of data 1216 * subbuf_splice_actor - splice up to one subbuf's worth of data
1217 */ 1217 */
1218static int subbuf_splice_actor(struct file *in, 1218static ssize_t subbuf_splice_actor(struct file *in,
1219 loff_t *ppos, 1219 loff_t *ppos,
1220 struct pipe_inode_info *pipe, 1220 struct pipe_inode_info *pipe,
1221 size_t len, 1221 size_t len,
1222 unsigned int flags, 1222 unsigned int flags,
1223 int *nonpad_ret) 1223 int *nonpad_ret)
1224{ 1224{
1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; 1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
1226 struct rchan_buf *rbuf = in->private_data; 1226 struct rchan_buf *rbuf = in->private_data;
1227 unsigned int subbuf_size = rbuf->chan->subbuf_size; 1227 unsigned int subbuf_size = rbuf->chan->subbuf_size;
1228 uint64_t pos = (uint64_t) *ppos; 1228 uint64_t pos = (uint64_t) *ppos;
@@ -1241,6 +1241,7 @@ static int subbuf_splice_actor(struct file *in,
1241 .ops = &relay_pipe_buf_ops, 1241 .ops = &relay_pipe_buf_ops,
1242 .spd_release = relay_page_release, 1242 .spd_release = relay_page_release,
1243 }; 1243 };
1244 ssize_t ret;
1244 1245
1245 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1246 return 0; 1247 return 0;
diff --git a/kernel/resource.c b/kernel/resource.c
index 4e9d87fd7bc5..9c358e263534 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -219,19 +219,34 @@ void release_child_resources(struct resource *r)
219} 219}
220 220
221/** 221/**
222 * request_resource - request and reserve an I/O or memory resource 222 * request_resource_conflict - request and reserve an I/O or memory resource
223 * @root: root resource descriptor 223 * @root: root resource descriptor
224 * @new: resource descriptor desired by caller 224 * @new: resource descriptor desired by caller
225 * 225 *
226 * Returns 0 for success, negative error code on error. 226 * Returns 0 for success, conflict resource on error.
227 */ 227 */
228int request_resource(struct resource *root, struct resource *new) 228struct resource *request_resource_conflict(struct resource *root, struct resource *new)
229{ 229{
230 struct resource *conflict; 230 struct resource *conflict;
231 231
232 write_lock(&resource_lock); 232 write_lock(&resource_lock);
233 conflict = __request_resource(root, new); 233 conflict = __request_resource(root, new);
234 write_unlock(&resource_lock); 234 write_unlock(&resource_lock);
235 return conflict;
236}
237
238/**
239 * request_resource - request and reserve an I/O or memory resource
240 * @root: root resource descriptor
241 * @new: resource descriptor desired by caller
242 *
243 * Returns 0 for success, negative error code on error.
244 */
245int request_resource(struct resource *root, struct resource *new)
246{
247 struct resource *conflict;
248
249 conflict = request_resource_conflict(root, new);
235 return conflict ? -EBUSY : 0; 250 return conflict ? -EBUSY : 0;
236} 251}
237 252
@@ -304,7 +319,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
304 void *arg, int (*func)(unsigned long, unsigned long, void *)) 319 void *arg, int (*func)(unsigned long, unsigned long, void *))
305{ 320{
306 struct resource res; 321 struct resource res;
307 unsigned long pfn, len; 322 unsigned long pfn, end_pfn;
308 u64 orig_end; 323 u64 orig_end;
309 int ret = -1; 324 int ret = -1;
310 325
@@ -314,9 +329,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
314 orig_end = res.end; 329 orig_end = res.end;
315 while ((res.start < res.end) && 330 while ((res.start < res.end) &&
316 (find_next_system_ram(&res, "System RAM") >= 0)) { 331 (find_next_system_ram(&res, "System RAM") >= 0)) {
317 pfn = (unsigned long)(res.start >> PAGE_SHIFT); 332 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
318 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); 333 end_pfn = (res.end + 1) >> PAGE_SHIFT;
319 ret = (*func)(pfn, len, arg); 334 if (end_pfn > pfn)
335 ret = (*func)(pfn, end_pfn - pfn, arg);
320 if (ret) 336 if (ret)
321 break; 337 break;
322 res.start = res.end + 1; 338 res.start = res.end + 1;
@@ -473,25 +489,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
473} 489}
474 490
475/** 491/**
476 * insert_resource - Inserts a resource in the resource tree 492 * insert_resource_conflict - Inserts resource in the resource tree
477 * @parent: parent of the new resource 493 * @parent: parent of the new resource
478 * @new: new resource to insert 494 * @new: new resource to insert
479 * 495 *
480 * Returns 0 on success, -EBUSY if the resource can't be inserted. 496 * Returns 0 on success, conflict resource if the resource can't be inserted.
481 * 497 *
482 * This function is equivalent to request_resource when no conflict 498 * This function is equivalent to request_resource_conflict when no conflict
483 * happens. If a conflict happens, and the conflicting resources 499 * happens. If a conflict happens, and the conflicting resources
484 * entirely fit within the range of the new resource, then the new 500 * entirely fit within the range of the new resource, then the new
485 * resource is inserted and the conflicting resources become children of 501 * resource is inserted and the conflicting resources become children of
486 * the new resource. 502 * the new resource.
487 */ 503 */
488int insert_resource(struct resource *parent, struct resource *new) 504struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
489{ 505{
490 struct resource *conflict; 506 struct resource *conflict;
491 507
492 write_lock(&resource_lock); 508 write_lock(&resource_lock);
493 conflict = __insert_resource(parent, new); 509 conflict = __insert_resource(parent, new);
494 write_unlock(&resource_lock); 510 write_unlock(&resource_lock);
511 return conflict;
512}
513
514/**
515 * insert_resource - Inserts a resource in the resource tree
516 * @parent: parent of the new resource
517 * @new: new resource to insert
518 *
519 * Returns 0 on success, -EBUSY if the resource can't be inserted.
520 */
521int insert_resource(struct resource *parent, struct resource *new)
522{
523 struct resource *conflict;
524
525 conflict = insert_resource_conflict(parent, new);
495 return conflict ? -EBUSY : 0; 526 return conflict ? -EBUSY : 0;
496} 527}
497 528
diff --git a/kernel/sched.c b/kernel/sched.c
index cc6dc8caa380..52b7efd27416 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1543,7 +1543,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1543 1543
1544#ifdef CONFIG_FAIR_GROUP_SCHED 1544#ifdef CONFIG_FAIR_GROUP_SCHED
1545 1545
1546static __read_mostly unsigned long *update_shares_data; 1546static __read_mostly unsigned long __percpu *update_shares_data;
1547 1547
1548static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1548static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1549 1549
@@ -2604,7 +2604,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2604{ 2604{
2605 unsigned long flags; 2605 unsigned long flags;
2606 struct rq *rq; 2606 struct rq *rq;
2607 int cpu = get_cpu(); 2607 int cpu __maybe_unused = get_cpu();
2608 2608
2609#ifdef CONFIG_SMP 2609#ifdef CONFIG_SMP
2610 /* 2610 /*
@@ -4289,7 +4289,7 @@ int can_nice(const struct task_struct *p, const int nice)
4289 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4289 /* convert nice value [19,-20] to rlimit style value [1,40] */
4290 int nice_rlim = 20 - nice; 4290 int nice_rlim = 20 - nice;
4291 4291
4292 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 4292 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4293 capable(CAP_SYS_NICE)); 4293 capable(CAP_SYS_NICE));
4294} 4294}
4295 4295
@@ -4466,7 +4466,7 @@ recheck:
4466 4466
4467 if (!lock_task_sighand(p, &flags)) 4467 if (!lock_task_sighand(p, &flags))
4468 return -ESRCH; 4468 return -ESRCH;
4469 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4469 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4470 unlock_task_sighand(p, &flags); 4470 unlock_task_sighand(p, &flags);
4471 4471
4472 /* can't set/change the rt policy */ 4472 /* can't set/change the rt policy */
@@ -4837,7 +4837,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4837 int ret; 4837 int ret;
4838 cpumask_var_t mask; 4838 cpumask_var_t mask;
4839 4839
4840 if (len < cpumask_size()) 4840 if (len < nr_cpu_ids)
4841 return -EINVAL;
4842 if (len & (sizeof(unsigned long)-1))
4841 return -EINVAL; 4843 return -EINVAL;
4842 4844
4843 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4845 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
@@ -4845,10 +4847,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4845 4847
4846 ret = sched_getaffinity(pid, mask); 4848 ret = sched_getaffinity(pid, mask);
4847 if (ret == 0) { 4849 if (ret == 0) {
4848 if (copy_to_user(user_mask_ptr, mask, cpumask_size())) 4850 size_t retlen = min_t(size_t, len, cpumask_size());
4851
4852 if (copy_to_user(user_mask_ptr, mask, retlen))
4849 ret = -EFAULT; 4853 ret = -EFAULT;
4850 else 4854 else
4851 ret = cpumask_size(); 4855 ret = retlen;
4852 } 4856 }
4853 free_cpumask_var(mask); 4857 free_cpumask_var(mask);
4854 4858
@@ -7338,11 +7342,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7338 7342
7339#ifdef CONFIG_SCHED_MC 7343#ifdef CONFIG_SCHED_MC
7340static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7344static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7345 struct sysdev_class_attribute *attr,
7341 char *page) 7346 char *page)
7342{ 7347{
7343 return sprintf(page, "%u\n", sched_mc_power_savings); 7348 return sprintf(page, "%u\n", sched_mc_power_savings);
7344} 7349}
7345static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7350static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7351 struct sysdev_class_attribute *attr,
7346 const char *buf, size_t count) 7352 const char *buf, size_t count)
7347{ 7353{
7348 return sched_power_savings_store(buf, count, 0); 7354 return sched_power_savings_store(buf, count, 0);
@@ -7354,11 +7360,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7354 7360
7355#ifdef CONFIG_SCHED_SMT 7361#ifdef CONFIG_SCHED_SMT
7356static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7362static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7363 struct sysdev_class_attribute *attr,
7357 char *page) 7364 char *page)
7358{ 7365{
7359 return sprintf(page, "%u\n", sched_smt_power_savings); 7366 return sprintf(page, "%u\n", sched_smt_power_savings);
7360} 7367}
7361static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7368static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7369 struct sysdev_class_attribute *attr,
7362 const char *buf, size_t count) 7370 const char *buf, size_t count)
7363{ 7371{
7364 return sched_power_savings_store(buf, count, 1); 7372 return sched_power_savings_store(buf, count, 1);
@@ -8742,7 +8750,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8742struct cpuacct { 8750struct cpuacct {
8743 struct cgroup_subsys_state css; 8751 struct cgroup_subsys_state css;
8744 /* cpuusage holds pointer to a u64-type object on every cpu */ 8752 /* cpuusage holds pointer to a u64-type object on every cpu */
8745 u64 *cpuusage; 8753 u64 __percpu *cpuusage;
8746 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8754 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
8747 struct cpuacct *parent; 8755 struct cpuacct *parent;
8748}; 8756};
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index eeb3506c4834..fccf9fbb0d7b 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,7 +47,7 @@ static int convert_prio(int prio)
47} 47}
48 48
49#define for_each_cpupri_active(array, idx) \ 49#define for_each_cpupri_active(array, idx) \
50 for_each_bit(idx, array, CPUPRI_NR_PRIORITIES) 50 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
51 51
52/** 52/**
53 * cpupri_find - find the best (lowest-pri) CPU in the system 53 * cpupri_find - find the best (lowest-pri) CPU in the system
@@ -56,7 +56,7 @@ static int convert_prio(int prio)
56 * @lowest_mask: A mask to fill in with selected CPUs (or NULL) 56 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
57 * 57 *
58 * Note: This function returns the recommended CPUs as calculated during the 58 * Note: This function returns the recommended CPUs as calculated during the
59 * current invokation. By the time the call returns, the CPUs may have in 59 * current invocation. By the time the call returns, the CPUs may have in
60 * fact changed priorities any number of times. While not ideal, it is not 60 * fact changed priorities any number of times. While not ideal, it is not
61 * an issue of correctness since the normal rebalancer logic will correct 61 * an issue of correctness since the normal rebalancer logic will correct
62 * any discrepancies created by racing against the uncertainty of the current 62 * any discrepancies created by racing against the uncertainty of the current
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 35a5c649638b..49ad99378f82 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3411,7 +3411,7 @@ static void run_rebalance_domains(struct softirq_action *h)
3411 3411
3412static inline int on_null_domain(int cpu) 3412static inline int on_null_domain(int cpu)
3413{ 3413{
3414 return !rcu_dereference(cpu_rq(cpu)->sd); 3414 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
3415} 3415}
3416 3416
3417/* 3417/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 0335e87f5204..012d69bb67c7 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1667,8 +1667,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1667 if (!p->signal) 1667 if (!p->signal)
1668 return; 1668 return;
1669 1669
1670 soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; 1670 /* max may change after cur was read, this will be fixed next tick */
1671 hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; 1671 soft = task_rlimit(p, RLIMIT_RTTIME);
1672 hard = task_rlimit_max(p, RLIMIT_RTTIME);
1672 1673
1673 if (soft != RLIM_INFINITY) { 1674 if (soft != RLIM_INFINITY) {
1674 unsigned long next; 1675 unsigned long next;
diff --git a/kernel/signal.c b/kernel/signal.c
index 934ae5e687b9..dbd7fe073c55 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -159,6 +159,10 @@ void recalc_sigpending(void)
159 159
160/* Given the mask, find the first available signal that should be serviced. */ 160/* Given the mask, find the first available signal that should be serviced. */
161 161
162#define SYNCHRONOUS_MASK \
163 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
164 sigmask(SIGTRAP) | sigmask(SIGFPE))
165
162int next_signal(struct sigpending *pending, sigset_t *mask) 166int next_signal(struct sigpending *pending, sigset_t *mask)
163{ 167{
164 unsigned long i, *s, *m, x; 168 unsigned long i, *s, *m, x;
@@ -166,26 +170,39 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
166 170
167 s = pending->signal.sig; 171 s = pending->signal.sig;
168 m = mask->sig; 172 m = mask->sig;
173
174 /*
175 * Handle the first word specially: it contains the
176 * synchronous signals that need to be dequeued first.
177 */
178 x = *s &~ *m;
179 if (x) {
180 if (x & SYNCHRONOUS_MASK)
181 x &= SYNCHRONOUS_MASK;
182 sig = ffz(~x) + 1;
183 return sig;
184 }
185
169 switch (_NSIG_WORDS) { 186 switch (_NSIG_WORDS) {
170 default: 187 default:
171 for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) 188 for (i = 1; i < _NSIG_WORDS; ++i) {
172 if ((x = *s &~ *m) != 0) { 189 x = *++s &~ *++m;
173 sig = ffz(~x) + i*_NSIG_BPW + 1; 190 if (!x)
174 break; 191 continue;
175 } 192 sig = ffz(~x) + i*_NSIG_BPW + 1;
193 break;
194 }
176 break; 195 break;
177 196
178 case 2: if ((x = s[0] &~ m[0]) != 0) 197 case 2:
179 sig = 1; 198 x = s[1] &~ m[1];
180 else if ((x = s[1] &~ m[1]) != 0) 199 if (!x)
181 sig = _NSIG_BPW + 1;
182 else
183 break; 200 break;
184 sig += ffz(~x); 201 sig = ffz(~x) + _NSIG_BPW + 1;
185 break; 202 break;
186 203
187 case 1: if ((x = *s &~ *m) != 0) 204 case 1:
188 sig = ffz(~x) + 1; 205 /* Nothing to do */
189 break; 206 break;
190 } 207 }
191 208
@@ -228,7 +245,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
228 245
229 if (override_rlimit || 246 if (override_rlimit ||
230 atomic_read(&user->sigpending) <= 247 atomic_read(&user->sigpending) <=
231 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { 248 task_rlimit(t, RLIMIT_SIGPENDING)) {
232 q = kmem_cache_alloc(sigqueue_cachep, flags); 249 q = kmem_cache_alloc(sigqueue_cachep, flags);
233 } else { 250 } else {
234 print_dropped_signal(sig); 251 print_dropped_signal(sig);
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 7494bbf5a270..7d3f4fa9ef4f 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -637,7 +637,7 @@ int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
637 goto cancelled; 637 goto cancelled;
638 638
639 /* the timer holds a reference whilst it is pending */ 639 /* the timer holds a reference whilst it is pending */
640 ret = work->ops->get_ref(work); 640 ret = slow_work_get_ref(work);
641 if (ret < 0) 641 if (ret < 0)
642 goto cant_get_ref; 642 goto cant_get_ref;
643 643
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
index 321f3c59d732..a29ebd1ef41d 100644
--- a/kernel/slow-work.h
+++ b/kernel/slow-work.h
@@ -43,28 +43,28 @@ extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
43 */ 43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid) 44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{ 45{
46#ifdef CONFIG_SLOW_WORK_PROC 46#ifdef CONFIG_SLOW_WORK_DEBUG
47 slow_work_pids[id] = pid; 47 slow_work_pids[id] = pid;
48#endif 48#endif
49} 49}
50 50
51static inline void slow_work_mark_time(struct slow_work *work) 51static inline void slow_work_mark_time(struct slow_work *work)
52{ 52{
53#ifdef CONFIG_SLOW_WORK_PROC 53#ifdef CONFIG_SLOW_WORK_DEBUG
54 work->mark = CURRENT_TIME; 54 work->mark = CURRENT_TIME;
55#endif 55#endif
56} 56}
57 57
58static inline void slow_work_begin_exec(int id, struct slow_work *work) 58static inline void slow_work_begin_exec(int id, struct slow_work *work)
59{ 59{
60#ifdef CONFIG_SLOW_WORK_PROC 60#ifdef CONFIG_SLOW_WORK_DEBUG
61 slow_work_execs[id] = work; 61 slow_work_execs[id] = work;
62#endif 62#endif
63} 63}
64 64
65static inline void slow_work_end_exec(int id, struct slow_work *work) 65static inline void slow_work_end_exec(int id, struct slow_work *work)
66{ 66{
67#ifdef CONFIG_SLOW_WORK_PROC 67#ifdef CONFIG_SLOW_WORK_DEBUG
68 write_lock(&slow_work_execs_lock); 68 write_lock(&slow_work_execs_lock);
69 slow_work_execs[id] = NULL; 69 slow_work_execs[id] = NULL;
70 write_unlock(&slow_work_execs_lock); 70 write_unlock(&slow_work_execs_lock);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 0d4c7898ab80..4b493f67dcb5 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -155,11 +155,11 @@ void softlockup_tick(void)
155 * Wake up the high-prio watchdog task twice per 155 * Wake up the high-prio watchdog task twice per
156 * threshold timespan. 156 * threshold timespan.
157 */ 157 */
158 if (now > touch_ts + softlockup_thresh/2) 158 if (time_after(now - softlockup_thresh/2, touch_ts))
159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); 159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
160 160
161 /* Warn about unreasonable delays: */ 161 /* Warn about unreasonable delays: */
162 if (now <= (touch_ts + softlockup_thresh)) 162 if (time_before_eq(now - softlockup_thresh, touch_ts))
163 return; 163 return;
164 164
165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts; 165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 912823e2a11b..9bb9fb1bd79c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -45,7 +45,7 @@ static int refcount;
45static struct workqueue_struct *stop_machine_wq; 45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle; 46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus; 47static const struct cpumask *active_cpus;
48static void *stop_machine_work; 48static void __percpu *stop_machine_work;
49 49
50static void set_state(enum stopmachine_state newstate) 50static void set_state(enum stopmachine_state newstate)
51{ 51{
diff --git a/kernel/sys.c b/kernel/sys.c
index 877fe4f8e05e..8298878f4f71 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -33,6 +33,7 @@
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/personality.h>
36#include <linux/ptrace.h> 37#include <linux/ptrace.h>
37#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
38 39
@@ -571,8 +572,7 @@ static int set_user(struct cred *new)
571 if (!new_user) 572 if (!new_user)
572 return -EAGAIN; 573 return -EAGAIN;
573 574
574 if (atomic_read(&new_user->processes) >= 575 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
575 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
576 new_user != INIT_USER) { 576 new_user != INIT_USER) {
577 free_uid(new_user); 577 free_uid(new_user);
578 return -EAGAIN; 578 return -EAGAIN;
@@ -1115,6 +1115,15 @@ out:
1115 1115
1116DECLARE_RWSEM(uts_sem); 1116DECLARE_RWSEM(uts_sem);
1117 1117
1118#ifdef COMPAT_UTS_MACHINE
1119#define override_architecture(name) \
1120 (current->personality == PER_LINUX32 && \
1121 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
1122 sizeof(COMPAT_UTS_MACHINE)))
1123#else
1124#define override_architecture(name) 0
1125#endif
1126
1118SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1127SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1119{ 1128{
1120 int errno = 0; 1129 int errno = 0;
@@ -1123,9 +1132,66 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1123 if (copy_to_user(name, utsname(), sizeof *name)) 1132 if (copy_to_user(name, utsname(), sizeof *name))
1124 errno = -EFAULT; 1133 errno = -EFAULT;
1125 up_read(&uts_sem); 1134 up_read(&uts_sem);
1135
1136 if (!errno && override_architecture(name))
1137 errno = -EFAULT;
1126 return errno; 1138 return errno;
1127} 1139}
1128 1140
1141#ifdef __ARCH_WANT_SYS_OLD_UNAME
1142/*
1143 * Old cruft
1144 */
1145SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
1146{
1147 int error = 0;
1148
1149 if (!name)
1150 return -EFAULT;
1151
1152 down_read(&uts_sem);
1153 if (copy_to_user(name, utsname(), sizeof(*name)))
1154 error = -EFAULT;
1155 up_read(&uts_sem);
1156
1157 if (!error && override_architecture(name))
1158 error = -EFAULT;
1159 return error;
1160}
1161
1162SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
1163{
1164 int error;
1165
1166 if (!name)
1167 return -EFAULT;
1168 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
1169 return -EFAULT;
1170
1171 down_read(&uts_sem);
1172 error = __copy_to_user(&name->sysname, &utsname()->sysname,
1173 __OLD_UTS_LEN);
1174 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
1175 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
1176 __OLD_UTS_LEN);
1177 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
1178 error |= __copy_to_user(&name->release, &utsname()->release,
1179 __OLD_UTS_LEN);
1180 error |= __put_user(0, name->release + __OLD_UTS_LEN);
1181 error |= __copy_to_user(&name->version, &utsname()->version,
1182 __OLD_UTS_LEN);
1183 error |= __put_user(0, name->version + __OLD_UTS_LEN);
1184 error |= __copy_to_user(&name->machine, &utsname()->machine,
1185 __OLD_UTS_LEN);
1186 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
1187 up_read(&uts_sem);
1188
1189 if (!error && override_architecture(name))
1190 error = -EFAULT;
1191 return error ? -EFAULT : 0;
1192}
1193#endif
1194
1129SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) 1195SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1130{ 1196{
1131 int errno; 1197 int errno;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 695384f12a7d..70f2ea758ffe 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -126,6 +126,7 @@ cond_syscall(sys_setreuid16);
126cond_syscall(sys_setuid16); 126cond_syscall(sys_setuid16);
127cond_syscall(sys_vm86old); 127cond_syscall(sys_vm86old);
128cond_syscall(sys_vm86); 128cond_syscall(sys_vm86);
129cond_syscall(sys_ipc);
129cond_syscall(compat_sys_ipc); 130cond_syscall(compat_sys_ipc);
130cond_syscall(compat_sys_sysctl); 131cond_syscall(compat_sys_sysctl);
131cond_syscall(sys_flock); 132cond_syscall(sys_flock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8a68b2448468..8686b0f5fc12 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,6 +23,7 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/signal.h>
26#include <linux/proc_fs.h> 27#include <linux/proc_fs.h>
27#include <linux/security.h> 28#include <linux/security.h>
28#include <linux/ctype.h> 29#include <linux/ctype.h>
@@ -50,6 +51,7 @@
50#include <linux/ftrace.h> 51#include <linux/ftrace.h>
51#include <linux/slow-work.h> 52#include <linux/slow-work.h>
52#include <linux/perf_event.h> 53#include <linux/perf_event.h>
54#include <linux/kprobes.h>
53 55
54#include <asm/uaccess.h> 56#include <asm/uaccess.h>
55#include <asm/processor.h> 57#include <asm/processor.h>
@@ -59,13 +61,23 @@
59#include <asm/stacktrace.h> 61#include <asm/stacktrace.h>
60#include <asm/io.h> 62#include <asm/io.h>
61#endif 63#endif
64#ifdef CONFIG_BSD_PROCESS_ACCT
65#include <linux/acct.h>
66#endif
67#ifdef CONFIG_RT_MUTEXES
68#include <linux/rtmutex.h>
69#endif
70#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
71#include <linux/lockdep.h>
72#endif
73#ifdef CONFIG_CHR_DEV_SG
74#include <scsi/sg.h>
75#endif
62 76
63 77
64#if defined(CONFIG_SYSCTL) 78#if defined(CONFIG_SYSCTL)
65 79
66/* External variables not in a header file. */ 80/* External variables not in a header file. */
67extern int C_A_D;
68extern int print_fatal_signals;
69extern int sysctl_overcommit_memory; 81extern int sysctl_overcommit_memory;
70extern int sysctl_overcommit_ratio; 82extern int sysctl_overcommit_ratio;
71extern int sysctl_panic_on_oom; 83extern int sysctl_panic_on_oom;
@@ -87,9 +99,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
87#ifndef CONFIG_MMU 99#ifndef CONFIG_MMU
88extern int sysctl_nr_trim_pages; 100extern int sysctl_nr_trim_pages;
89#endif 101#endif
90#ifdef CONFIG_RCU_TORTURE_TEST
91extern int rcutorture_runnable;
92#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
93#ifdef CONFIG_BLOCK 102#ifdef CONFIG_BLOCK
94extern int blk_iopoll_enabled; 103extern int blk_iopoll_enabled;
95#endif 104#endif
@@ -119,14 +128,6 @@ static int min_percpu_pagelist_fract = 8;
119 128
120static int ngroups_max = NGROUPS_MAX; 129static int ngroups_max = NGROUPS_MAX;
121 130
122#ifdef CONFIG_MODULES
123extern char modprobe_path[];
124extern int modules_disabled;
125#endif
126#ifdef CONFIG_CHR_DEV_SG
127extern int sg_big_buff;
128#endif
129
130#ifdef CONFIG_SPARC 131#ifdef CONFIG_SPARC
131#include <asm/system.h> 132#include <asm/system.h>
132#endif 133#endif
@@ -148,10 +149,6 @@ extern int sysctl_userprocess_debug;
148extern int spin_retry; 149extern int spin_retry;
149#endif 150#endif
150 151
151#ifdef CONFIG_BSD_PROCESS_ACCT
152extern int acct_parm[];
153#endif
154
155#ifdef CONFIG_IA64 152#ifdef CONFIG_IA64
156extern int no_unaligned_warning; 153extern int no_unaligned_warning;
157extern int unaligned_dump_stack; 154extern int unaligned_dump_stack;
@@ -159,10 +156,6 @@ extern int unaligned_dump_stack;
159 156
160extern struct ratelimit_state printk_ratelimit_state; 157extern struct ratelimit_state printk_ratelimit_state;
161 158
162#ifdef CONFIG_RT_MUTEXES
163extern int max_lock_depth;
164#endif
165
166#ifdef CONFIG_PROC_SYSCTL 159#ifdef CONFIG_PROC_SYSCTL
167static int proc_do_cad_pid(struct ctl_table *table, int write, 160static int proc_do_cad_pid(struct ctl_table *table, int write,
168 void __user *buffer, size_t *lenp, loff_t *ppos); 161 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -201,9 +194,6 @@ extern struct ctl_table epoll_table[];
201int sysctl_legacy_va_layout; 194int sysctl_legacy_va_layout;
202#endif 195#endif
203 196
204extern int prove_locking;
205extern int lock_stat;
206
207/* The default sysctl tables: */ 197/* The default sysctl tables: */
208 198
209static struct ctl_table root_table[] = { 199static struct ctl_table root_table[] = {
@@ -1441,7 +1431,7 @@ static struct ctl_table fs_table[] = {
1441}; 1431};
1442 1432
1443static struct ctl_table debug_table[] = { 1433static struct ctl_table debug_table[] = {
1444#if defined(CONFIG_X86) || defined(CONFIG_PPC) 1434#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC)
1445 { 1435 {
1446 .procname = "exception-trace", 1436 .procname = "exception-trace",
1447 .data = &show_unhandled_signals, 1437 .data = &show_unhandled_signals,
@@ -1450,6 +1440,17 @@ static struct ctl_table debug_table[] = {
1450 .proc_handler = proc_dointvec 1440 .proc_handler = proc_dointvec
1451 }, 1441 },
1452#endif 1442#endif
1443#if defined(CONFIG_OPTPROBES)
1444 {
1445 .procname = "kprobes-optimization",
1446 .data = &sysctl_kprobes_optimization,
1447 .maxlen = sizeof(int),
1448 .mode = 0644,
1449 .proc_handler = proc_kprobes_optimization_handler,
1450 .extra1 = &zero,
1451 .extra2 = &one,
1452 },
1453#endif
1453 { } 1454 { }
1454}; 1455};
1455 1456
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 8f5d16e0707a..8cd50d8f9bde 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1331,7 +1331,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1331 ssize_t result; 1331 ssize_t result;
1332 char *pathname; 1332 char *pathname;
1333 int flags; 1333 int flags;
1334 int acc_mode, fmode; 1334 int acc_mode;
1335 1335
1336 pathname = sysctl_getname(name, nlen, &table); 1336 pathname = sysctl_getname(name, nlen, &table);
1337 result = PTR_ERR(pathname); 1337 result = PTR_ERR(pathname);
@@ -1342,15 +1342,12 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1342 if (oldval && oldlen && newval && newlen) { 1342 if (oldval && oldlen && newval && newlen) {
1343 flags = O_RDWR; 1343 flags = O_RDWR;
1344 acc_mode = MAY_READ | MAY_WRITE; 1344 acc_mode = MAY_READ | MAY_WRITE;
1345 fmode = FMODE_READ | FMODE_WRITE;
1346 } else if (newval && newlen) { 1345 } else if (newval && newlen) {
1347 flags = O_WRONLY; 1346 flags = O_WRONLY;
1348 acc_mode = MAY_WRITE; 1347 acc_mode = MAY_WRITE;
1349 fmode = FMODE_WRITE;
1350 } else if (oldval && oldlen) { 1348 } else if (oldval && oldlen) {
1351 flags = O_RDONLY; 1349 flags = O_RDONLY;
1352 acc_mode = MAY_READ; 1350 acc_mode = MAY_READ;
1353 fmode = FMODE_READ;
1354 } else { 1351 } else {
1355 result = 0; 1352 result = 0;
1356 goto out_putname; 1353 goto out_putname;
@@ -1361,7 +1358,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1361 if (result) 1358 if (result)
1362 goto out_putname; 1359 goto out_putname;
1363 1360
1364 result = may_open(&nd.path, acc_mode, fmode); 1361 result = may_open(&nd.path, acc_mode, flags);
1365 if (result) 1362 if (result)
1366 goto out_putpath; 1363 goto out_putpath;
1367 1364
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea8384d3caa7..899ca51be5e8 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -46,15 +46,13 @@ static struct genl_family family = {
46 .maxattr = TASKSTATS_CMD_ATTR_MAX, 46 .maxattr = TASKSTATS_CMD_ATTR_MAX,
47}; 47};
48 48
49static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] 49static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
50__read_mostly = {
51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 50 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 51 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 52 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 53 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
55 54
56static struct nla_policy 55static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
57cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 56 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
59}; 57};
60 58
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1f663d23e85e..1f5dde637457 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -592,6 +592,10 @@ static inline void clocksource_select(void) { }
592 */ 592 */
593static int __init clocksource_done_booting(void) 593static int __init clocksource_done_booting(void)
594{ 594{
595 mutex_lock(&clocksource_mutex);
596 curr_clocksource = clocksource_default_clock();
597 mutex_unlock(&clocksource_mutex);
598
595 finished_booting = 1; 599 finished_booting = 1;
596 600
597 /* 601 /*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 0a8a213016f0..aada0e52680a 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -22,6 +22,29 @@
22 22
23#include "tick-internal.h" 23#include "tick-internal.h"
24 24
25/* Limit min_delta to a jiffie */
26#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
27
28static int tick_increase_min_delta(struct clock_event_device *dev)
29{
30 /* Nothing to do if we already reached the limit */
31 if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
32 return -ETIME;
33
34 if (dev->min_delta_ns < 5000)
35 dev->min_delta_ns = 5000;
36 else
37 dev->min_delta_ns += dev->min_delta_ns >> 1;
38
39 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
40 dev->min_delta_ns = MIN_DELTA_LIMIT;
41
42 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
43 dev->name ? dev->name : "?",
44 (unsigned long long) dev->min_delta_ns);
45 return 0;
46}
47
25/** 48/**
26 * tick_program_event internal worker function 49 * tick_program_event internal worker function
27 */ 50 */
@@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
37 if (!ret || !force) 60 if (!ret || !force)
38 return ret; 61 return ret;
39 62
63 dev->retries++;
40 /* 64 /*
41 * We tried 2 times to program the device with the given 65 * We tried 3 times to program the device with the given
42 * min_delta_ns. If that's not working then we double it 66 * min_delta_ns. If that's not working then we increase it
43 * and emit a warning. 67 * and emit a warning.
44 */ 68 */
45 if (++i > 2) { 69 if (++i > 2) {
46 /* Increase the min. delta and try again */ 70 /* Increase the min. delta and try again */
47 if (!dev->min_delta_ns) 71 if (tick_increase_min_delta(dev)) {
48 dev->min_delta_ns = 5000; 72 /*
49 else 73 * Get out of the loop if min_delta_ns
50 dev->min_delta_ns += dev->min_delta_ns >> 1; 74 * hit the limit already. That's
51 75 * better than staying here forever.
52 printk(KERN_WARNING 76 *
53 "CE: %s increasing min_delta_ns to %llu nsec\n", 77 * We clear next_event so we have a
54 dev->name ? dev->name : "?", 78 * chance that the box survives.
55 (unsigned long long) dev->min_delta_ns << 1); 79 */
56 80 printk(KERN_WARNING
81 "CE: Reprogramming failure. Giving up\n");
82 dev->next_event.tv64 = KTIME_MAX;
83 return -ETIME;
84 }
57 i = 0; 85 i = 0;
58 } 86 }
59 87
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 16736379a9ca..39f6177fafac 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -818,7 +818,8 @@ void update_wall_time(void)
818 shift = min(shift, maxshift); 818 shift = min(shift, maxshift);
819 while (offset >= timekeeper.cycle_interval) { 819 while (offset >= timekeeper.cycle_interval) {
820 offset = logarithmic_accumulation(offset, shift); 820 offset = logarithmic_accumulation(offset, shift);
821 shift--; 821 if(offset < timekeeper.cycle_interval<<shift)
822 shift--;
822 } 823 }
823 824
824 /* correct the clock when NTP error is too big */ 825 /* correct the clock when NTP error is too big */
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index bdfb8dd1050c..1a4a7dd78777 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -228,6 +228,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
228 SEQ_printf(m, " event_handler: "); 228 SEQ_printf(m, " event_handler: ");
229 print_name_offset(m, dev->event_handler); 229 print_name_offset(m, dev->event_handler);
230 SEQ_printf(m, "\n"); 230 SEQ_printf(m, "\n");
231 SEQ_printf(m, " retries: %lu\n", dev->retries);
231} 232}
232 233
233static void timer_list_show_tickdevices(struct seq_file *m) 234static void timer_list_show_tickdevices(struct seq_file *m)
@@ -257,7 +258,7 @@ static int timer_list_show(struct seq_file *m, void *v)
257 u64 now = ktime_to_ns(ktime_get()); 258 u64 now = ktime_to_ns(ktime_get());
258 int cpu; 259 int cpu;
259 260
260 SEQ_printf(m, "Timer List Version: v0.5\n"); 261 SEQ_printf(m, "Timer List Version: v0.6\n");
261 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 262 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
262 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 263 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
263 264
diff --git a/kernel/timer.c b/kernel/timer.c
index c61a7949387f..fc965eae0e87 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -880,6 +880,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
880 if (base->running_timer == timer) 880 if (base->running_timer == timer)
881 goto out; 881 goto out;
882 882
883 timer_stats_timer_clear_start_info(timer);
883 ret = 0; 884 ret = 0;
884 if (timer_pending(timer)) { 885 if (timer_pending(timer)) {
885 detach_timer(timer, 1); 886 detach_timer(timer, 1);
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d00c6fe23f54..78edc6490038 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events.o
52obj-$(CONFIG_EVENT_TRACING) += trace_export.o 52obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54ifeq ($(CONFIG_PERF_EVENTS),y) 54ifeq ($(CONFIG_PERF_EVENTS),y)
55obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o 55obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
56endif 56endif
57obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 57obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
58obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 58obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 83783579378f..d9062f5cc0c0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -27,6 +27,7 @@
27#include <linux/ctype.h> 27#include <linux/ctype.h>
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/hash.h> 29#include <linux/hash.h>
30#include <linux/rcupdate.h>
30 31
31#include <trace/events/sched.h> 32#include <trace/events/sched.h>
32 33
@@ -84,22 +85,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
84ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 85ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
85ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 86ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
86 87
87#ifdef CONFIG_FUNCTION_GRAPH_TRACER 88/*
88static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); 89 * Traverse the ftrace_list, invoking all entries. The reason that we
89#endif 90 * can use rcu_dereference_raw() is that elements removed from this list
90 91 * are simply leaked, so there is no need to interact with a grace-period
92 * mechanism. The rcu_dereference_raw() calls are needed to handle
93 * concurrent insertions into the ftrace_list.
94 *
95 * Silly Alpha and silly pointer-speculation compiler optimizations!
96 */
91static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 97static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
92{ 98{
93 struct ftrace_ops *op = ftrace_list; 99 struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/
94
95 /* in case someone actually ports this to alpha! */
96 read_barrier_depends();
97 100
98 while (op != &ftrace_list_end) { 101 while (op != &ftrace_list_end) {
99 /* silly alpha */
100 read_barrier_depends();
101 op->func(ip, parent_ip); 102 op->func(ip, parent_ip);
102 op = op->next; 103 op = rcu_dereference_raw(op->next); /*see above*/
103 }; 104 };
104} 105}
105 106
@@ -154,8 +155,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
154 * the ops->next pointer is valid before another CPU sees 155 * the ops->next pointer is valid before another CPU sees
155 * the ops pointer included into the ftrace_list. 156 * the ops pointer included into the ftrace_list.
156 */ 157 */
157 smp_wmb(); 158 rcu_assign_pointer(ftrace_list, ops);
158 ftrace_list = ops;
159 159
160 if (ftrace_enabled) { 160 if (ftrace_enabled) {
161 ftrace_func_t func; 161 ftrace_func_t func;
@@ -2276,6 +2276,8 @@ __setup("ftrace_filter=", set_ftrace_filter);
2276 2276
2277#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2277#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2278static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; 2278static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
2279static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
2280
2279static int __init set_graph_function(char *str) 2281static int __init set_graph_function(char *str)
2280{ 2282{
2281 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 2283 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -3351,6 +3353,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3351{ 3353{
3352 /* Make sure we do not use the parent ret_stack */ 3354 /* Make sure we do not use the parent ret_stack */
3353 t->ret_stack = NULL; 3355 t->ret_stack = NULL;
3356 t->curr_ret_stack = -1;
3354 3357
3355 if (ftrace_graph_active) { 3358 if (ftrace_graph_active) {
3356 struct ftrace_ret_stack *ret_stack; 3359 struct ftrace_ret_stack *ret_stack;
@@ -3360,7 +3363,6 @@ void ftrace_graph_init_task(struct task_struct *t)
3360 GFP_KERNEL); 3363 GFP_KERNEL);
3361 if (!ret_stack) 3364 if (!ret_stack)
3362 return; 3365 return;
3363 t->curr_ret_stack = -1;
3364 atomic_set(&t->tracing_graph_pause, 0); 3366 atomic_set(&t->tracing_graph_pause, 0);
3365 atomic_set(&t->trace_overrun, 0); 3367 atomic_set(&t->trace_overrun, 0);
3366 t->ftrace_timestamp = 0; 3368 t->ftrace_timestamp = 0;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8c1b2d290718..d1187ef20caf 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -20,6 +20,7 @@
20#include <linux/cpu.h> 20#include <linux/cpu.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22 22
23#include <asm/local.h>
23#include "trace.h" 24#include "trace.h"
24 25
25/* 26/*
@@ -206,6 +207,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
206#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 207#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
207#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ 208#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
208 209
210#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
211# define RB_FORCE_8BYTE_ALIGNMENT 0
212# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
213#else
214# define RB_FORCE_8BYTE_ALIGNMENT 1
215# define RB_ARCH_ALIGNMENT 8U
216#endif
217
209/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 218/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
210#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 219#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
211 220
@@ -1546,7 +1555,7 @@ rb_update_event(struct ring_buffer_event *event,
1546 1555
1547 case 0: 1556 case 0:
1548 length -= RB_EVNT_HDR_SIZE; 1557 length -= RB_EVNT_HDR_SIZE;
1549 if (length > RB_MAX_SMALL_DATA) 1558 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1550 event->array[0] = length; 1559 event->array[0] = length;
1551 else 1560 else
1552 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1561 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
@@ -1721,11 +1730,11 @@ static unsigned rb_calculate_event_length(unsigned length)
1721 if (!length) 1730 if (!length)
1722 length = 1; 1731 length = 1;
1723 1732
1724 if (length > RB_MAX_SMALL_DATA) 1733 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1725 length += sizeof(event.array[0]); 1734 length += sizeof(event.array[0]);
1726 1735
1727 length += RB_EVNT_HDR_SIZE; 1736 length += RB_EVNT_HDR_SIZE;
1728 length = ALIGN(length, RB_ALIGNMENT); 1737 length = ALIGN(length, RB_ARCH_ALIGNMENT);
1729 1738
1730 return length; 1739 return length;
1731} 1740}
@@ -2232,12 +2241,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2232 if (ring_buffer_flags != RB_BUFFERS_ON) 2241 if (ring_buffer_flags != RB_BUFFERS_ON)
2233 return NULL; 2242 return NULL;
2234 2243
2235 if (atomic_read(&buffer->record_disabled))
2236 return NULL;
2237
2238 /* If we are tracing schedule, we don't want to recurse */ 2244 /* If we are tracing schedule, we don't want to recurse */
2239 resched = ftrace_preempt_disable(); 2245 resched = ftrace_preempt_disable();
2240 2246
2247 if (atomic_read(&buffer->record_disabled))
2248 goto out_nocheck;
2249
2241 if (trace_recursive_lock()) 2250 if (trace_recursive_lock())
2242 goto out_nocheck; 2251 goto out_nocheck;
2243 2252
@@ -2469,11 +2478,11 @@ int ring_buffer_write(struct ring_buffer *buffer,
2469 if (ring_buffer_flags != RB_BUFFERS_ON) 2478 if (ring_buffer_flags != RB_BUFFERS_ON)
2470 return -EBUSY; 2479 return -EBUSY;
2471 2480
2472 if (atomic_read(&buffer->record_disabled))
2473 return -EBUSY;
2474
2475 resched = ftrace_preempt_disable(); 2481 resched = ftrace_preempt_disable();
2476 2482
2483 if (atomic_read(&buffer->record_disabled))
2484 goto out;
2485
2477 cpu = raw_smp_processor_id(); 2486 cpu = raw_smp_processor_id();
2478 2487
2479 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2488 if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -2541,7 +2550,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
2541 * @buffer: The ring buffer to enable writes 2550 * @buffer: The ring buffer to enable writes
2542 * 2551 *
2543 * Note, multiple disables will need the same number of enables 2552 * Note, multiple disables will need the same number of enables
2544 * to truely enable the writing (much like preempt_disable). 2553 * to truly enable the writing (much like preempt_disable).
2545 */ 2554 */
2546void ring_buffer_record_enable(struct ring_buffer *buffer) 2555void ring_buffer_record_enable(struct ring_buffer *buffer)
2547{ 2556{
@@ -2577,7 +2586,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
2577 * @cpu: The CPU to enable. 2586 * @cpu: The CPU to enable.
2578 * 2587 *
2579 * Note, multiple disables will need the same number of enables 2588 * Note, multiple disables will need the same number of enables
2580 * to truely enable the writing (much like preempt_disable). 2589 * to truly enable the writing (much like preempt_disable).
2581 */ 2590 */
2582void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) 2591void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2583{ 2592{
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index b2477caf09c2..df74c7982255 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -8,6 +8,7 @@
8#include <linux/kthread.h> 8#include <linux/kthread.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <asm/local.h>
11 12
12struct rb_page { 13struct rb_page {
13 u64 ts; 14 u64 ts;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 032c57ca6502..3ec2ee6f6560 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -92,12 +92,12 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled);
92static inline void ftrace_disable_cpu(void) 92static inline void ftrace_disable_cpu(void)
93{ 93{
94 preempt_disable(); 94 preempt_disable();
95 __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); 95 __this_cpu_inc(ftrace_cpu_disabled);
96} 96}
97 97
98static inline void ftrace_enable_cpu(void) 98static inline void ftrace_enable_cpu(void)
99{ 99{
100 __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); 100 __this_cpu_dec(ftrace_cpu_disabled);
101 preempt_enable(); 101 preempt_enable();
102} 102}
103 103
@@ -374,6 +374,21 @@ static int __init set_buf_size(char *str)
374} 374}
375__setup("trace_buf_size=", set_buf_size); 375__setup("trace_buf_size=", set_buf_size);
376 376
377static int __init set_tracing_thresh(char *str)
378{
379 unsigned long threshhold;
380 int ret;
381
382 if (!str)
383 return 0;
384 ret = strict_strtoul(str, 0, &threshhold);
385 if (ret < 0)
386 return 0;
387 tracing_thresh = threshhold * 1000;
388 return 1;
389}
390__setup("tracing_thresh=", set_tracing_thresh);
391
377unsigned long nsecs_to_usecs(unsigned long nsecs) 392unsigned long nsecs_to_usecs(unsigned long nsecs)
378{ 393{
379 return nsecs / 1000; 394 return nsecs / 1000;
@@ -579,9 +594,10 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
579static arch_spinlock_t ftrace_max_lock = 594static arch_spinlock_t ftrace_max_lock =
580 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 595 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
581 596
597unsigned long __read_mostly tracing_thresh;
598
582#ifdef CONFIG_TRACER_MAX_TRACE 599#ifdef CONFIG_TRACER_MAX_TRACE
583unsigned long __read_mostly tracing_max_latency; 600unsigned long __read_mostly tracing_max_latency;
584unsigned long __read_mostly tracing_thresh;
585 601
586/* 602/*
587 * Copy the new maximum trace into the separate maximum-trace 603 * Copy the new maximum trace into the separate maximum-trace
@@ -592,7 +608,7 @@ static void
592__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 608__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
593{ 609{
594 struct trace_array_cpu *data = tr->data[cpu]; 610 struct trace_array_cpu *data = tr->data[cpu];
595 struct trace_array_cpu *max_data = tr->data[cpu]; 611 struct trace_array_cpu *max_data;
596 612
597 max_tr.cpu = cpu; 613 max_tr.cpu = cpu;
598 max_tr.time_start = data->preempt_timestamp; 614 max_tr.time_start = data->preempt_timestamp;
@@ -602,7 +618,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
602 max_data->critical_start = data->critical_start; 618 max_data->critical_start = data->critical_start;
603 max_data->critical_end = data->critical_end; 619 max_data->critical_end = data->critical_end;
604 620
605 memcpy(data->comm, tsk->comm, TASK_COMM_LEN); 621 memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
606 max_data->pid = tsk->pid; 622 max_data->pid = tsk->pid;
607 max_data->uid = task_uid(tsk); 623 max_data->uid = task_uid(tsk);
608 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; 624 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
@@ -824,10 +840,10 @@ out:
824 mutex_unlock(&trace_types_lock); 840 mutex_unlock(&trace_types_lock);
825} 841}
826 842
827static void __tracing_reset(struct trace_array *tr, int cpu) 843static void __tracing_reset(struct ring_buffer *buffer, int cpu)
828{ 844{
829 ftrace_disable_cpu(); 845 ftrace_disable_cpu();
830 ring_buffer_reset_cpu(tr->buffer, cpu); 846 ring_buffer_reset_cpu(buffer, cpu);
831 ftrace_enable_cpu(); 847 ftrace_enable_cpu();
832} 848}
833 849
@@ -839,7 +855,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
839 855
840 /* Make sure all commits have finished */ 856 /* Make sure all commits have finished */
841 synchronize_sched(); 857 synchronize_sched();
842 __tracing_reset(tr, cpu); 858 __tracing_reset(buffer, cpu);
843 859
844 ring_buffer_record_enable(buffer); 860 ring_buffer_record_enable(buffer);
845} 861}
@@ -857,7 +873,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
857 tr->time_start = ftrace_now(tr->cpu); 873 tr->time_start = ftrace_now(tr->cpu);
858 874
859 for_each_online_cpu(cpu) 875 for_each_online_cpu(cpu)
860 __tracing_reset(tr, cpu); 876 __tracing_reset(buffer, cpu);
861 877
862 ring_buffer_record_enable(buffer); 878 ring_buffer_record_enable(buffer);
863} 879}
@@ -934,6 +950,8 @@ void tracing_start(void)
934 goto out; 950 goto out;
935 } 951 }
936 952
953 /* Prevent the buffers from switching */
954 arch_spin_lock(&ftrace_max_lock);
937 955
938 buffer = global_trace.buffer; 956 buffer = global_trace.buffer;
939 if (buffer) 957 if (buffer)
@@ -943,6 +961,8 @@ void tracing_start(void)
943 if (buffer) 961 if (buffer)
944 ring_buffer_record_enable(buffer); 962 ring_buffer_record_enable(buffer);
945 963
964 arch_spin_unlock(&ftrace_max_lock);
965
946 ftrace_start(); 966 ftrace_start();
947 out: 967 out:
948 spin_unlock_irqrestore(&tracing_start_lock, flags); 968 spin_unlock_irqrestore(&tracing_start_lock, flags);
@@ -964,6 +984,9 @@ void tracing_stop(void)
964 if (trace_stop_count++) 984 if (trace_stop_count++)
965 goto out; 985 goto out;
966 986
987 /* Prevent the buffers from switching */
988 arch_spin_lock(&ftrace_max_lock);
989
967 buffer = global_trace.buffer; 990 buffer = global_trace.buffer;
968 if (buffer) 991 if (buffer)
969 ring_buffer_record_disable(buffer); 992 ring_buffer_record_disable(buffer);
@@ -972,6 +995,8 @@ void tracing_stop(void)
972 if (buffer) 995 if (buffer)
973 ring_buffer_record_disable(buffer); 996 ring_buffer_record_disable(buffer);
974 997
998 arch_spin_unlock(&ftrace_max_lock);
999
975 out: 1000 out:
976 spin_unlock_irqrestore(&tracing_start_lock, flags); 1001 spin_unlock_irqrestore(&tracing_start_lock, flags);
977} 1002}
@@ -1166,7 +1191,7 @@ trace_function(struct trace_array *tr,
1166 struct ftrace_entry *entry; 1191 struct ftrace_entry *entry;
1167 1192
1168 /* If we are reading the ring buffer, don't trace */ 1193 /* If we are reading the ring buffer, don't trace */
1169 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 1194 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
1170 return; 1195 return;
1171 1196
1172 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), 1197 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1259,6 +1284,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1259 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1284 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1260 return; 1285 return;
1261 1286
1287 /*
1288 * NMIs can not handle page faults, even with fix ups.
1289 * The save user stack can (and often does) fault.
1290 */
1291 if (unlikely(in_nmi()))
1292 return;
1293
1262 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1294 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1263 sizeof(*entry), flags, pc); 1295 sizeof(*entry), flags, pc);
1264 if (!event) 1296 if (!event)
@@ -1703,6 +1735,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1703 1735
1704 ftrace_enable_cpu(); 1736 ftrace_enable_cpu();
1705 1737
1738 iter->leftover = 0;
1706 for (p = iter; p && l < *pos; p = s_next(m, p, &l)) 1739 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1707 ; 1740 ;
1708 1741
@@ -4248,10 +4281,10 @@ static __init int tracer_init_debugfs(void)
4248#ifdef CONFIG_TRACER_MAX_TRACE 4281#ifdef CONFIG_TRACER_MAX_TRACE
4249 trace_create_file("tracing_max_latency", 0644, d_tracer, 4282 trace_create_file("tracing_max_latency", 0644, d_tracer,
4250 &tracing_max_latency, &tracing_max_lat_fops); 4283 &tracing_max_latency, &tracing_max_lat_fops);
4284#endif
4251 4285
4252 trace_create_file("tracing_thresh", 0644, d_tracer, 4286 trace_create_file("tracing_thresh", 0644, d_tracer,
4253 &tracing_thresh, &tracing_max_lat_fops); 4287 &tracing_thresh, &tracing_max_lat_fops);
4254#endif
4255 4288
4256 trace_create_file("README", 0444, d_tracer, 4289 trace_create_file("README", 0444, d_tracer,
4257 NULL, &tracing_readme_fops); 4290 NULL, &tracing_readme_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fd05bcaf91b0..2825ef2c0b15 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -396,9 +396,10 @@ extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
396 396
397extern unsigned long nsecs_to_usecs(unsigned long nsecs); 397extern unsigned long nsecs_to_usecs(unsigned long nsecs);
398 398
399extern unsigned long tracing_thresh;
400
399#ifdef CONFIG_TRACER_MAX_TRACE 401#ifdef CONFIG_TRACER_MAX_TRACE
400extern unsigned long tracing_max_latency; 402extern unsigned long tracing_max_latency;
401extern unsigned long tracing_thresh;
402 403
403void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 404void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
404void update_max_tr_single(struct trace_array *tr, 405void update_max_tr_single(struct trace_array *tr,
@@ -550,7 +551,7 @@ static inline int ftrace_trace_task(struct task_struct *task)
550 * struct trace_parser - servers for reading the user input separated by spaces 551 * struct trace_parser - servers for reading the user input separated by spaces
551 * @cont: set if the input is not complete - no final space char was found 552 * @cont: set if the input is not complete - no final space char was found
552 * @buffer: holds the parsed user input 553 * @buffer: holds the parsed user input
553 * @idx: user input lenght 554 * @idx: user input length
554 * @size: buffer size 555 * @size: buffer size
555 */ 556 */
556struct trace_parser { 557struct trace_parser {
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 84a3a7ba072a..6fbfb8f417b9 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -13,6 +13,7 @@
13 * Tracer plugins will chose a default from these clocks. 13 * Tracer plugins will chose a default from these clocks.
14 */ 14 */
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/irqflags.h>
16#include <linux/hardirq.h> 17#include <linux/hardirq.h>
17#include <linux/module.h> 18#include <linux/module.h>
18#include <linux/percpu.h> 19#include <linux/percpu.h>
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_perf.c
index f0d693005075..81f691eb3a30 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_perf.c
@@ -1,32 +1,36 @@
1/* 1/*
2 * trace event based perf counter profiling 2 * trace event based perf event profiling/tracing
3 * 3 *
4 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> 4 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
5 * 5 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
13EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
14
15EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
12 16
13static char *perf_trace_buf; 17static char *perf_trace_buf;
14static char *perf_trace_buf_nmi; 18static char *perf_trace_buf_nmi;
15 19
16typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; 20typedef typeof(char [PERF_MAX_TRACE_SIZE]) perf_trace_t ;
17 21
18/* Count the events in use (per event id, not per instance) */ 22/* Count the events in use (per event id, not per instance) */
19static int total_profile_count; 23static int total_ref_count;
20 24
21static int ftrace_profile_enable_event(struct ftrace_event_call *event) 25static int perf_trace_event_enable(struct ftrace_event_call *event)
22{ 26{
23 char *buf; 27 char *buf;
24 int ret = -ENOMEM; 28 int ret = -ENOMEM;
25 29
26 if (event->profile_count++ > 0) 30 if (event->perf_refcount++ > 0)
27 return 0; 31 return 0;
28 32
29 if (!total_profile_count) { 33 if (!total_ref_count) {
30 buf = (char *)alloc_percpu(perf_trace_t); 34 buf = (char *)alloc_percpu(perf_trace_t);
31 if (!buf) 35 if (!buf)
32 goto fail_buf; 36 goto fail_buf;
@@ -40,35 +44,35 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
40 rcu_assign_pointer(perf_trace_buf_nmi, buf); 44 rcu_assign_pointer(perf_trace_buf_nmi, buf);
41 } 45 }
42 46
43 ret = event->profile_enable(event); 47 ret = event->perf_event_enable(event);
44 if (!ret) { 48 if (!ret) {
45 total_profile_count++; 49 total_ref_count++;
46 return 0; 50 return 0;
47 } 51 }
48 52
49fail_buf_nmi: 53fail_buf_nmi:
50 if (!total_profile_count) { 54 if (!total_ref_count) {
51 free_percpu(perf_trace_buf_nmi); 55 free_percpu(perf_trace_buf_nmi);
52 free_percpu(perf_trace_buf); 56 free_percpu(perf_trace_buf);
53 perf_trace_buf_nmi = NULL; 57 perf_trace_buf_nmi = NULL;
54 perf_trace_buf = NULL; 58 perf_trace_buf = NULL;
55 } 59 }
56fail_buf: 60fail_buf:
57 event->profile_count--; 61 event->perf_refcount--;
58 62
59 return ret; 63 return ret;
60} 64}
61 65
62int ftrace_profile_enable(int event_id) 66int perf_trace_enable(int event_id)
63{ 67{
64 struct ftrace_event_call *event; 68 struct ftrace_event_call *event;
65 int ret = -EINVAL; 69 int ret = -EINVAL;
66 70
67 mutex_lock(&event_mutex); 71 mutex_lock(&event_mutex);
68 list_for_each_entry(event, &ftrace_events, list) { 72 list_for_each_entry(event, &ftrace_events, list) {
69 if (event->id == event_id && event->profile_enable && 73 if (event->id == event_id && event->perf_event_enable &&
70 try_module_get(event->mod)) { 74 try_module_get(event->mod)) {
71 ret = ftrace_profile_enable_event(event); 75 ret = perf_trace_event_enable(event);
72 break; 76 break;
73 } 77 }
74 } 78 }
@@ -77,16 +81,16 @@ int ftrace_profile_enable(int event_id)
77 return ret; 81 return ret;
78} 82}
79 83
80static void ftrace_profile_disable_event(struct ftrace_event_call *event) 84static void perf_trace_event_disable(struct ftrace_event_call *event)
81{ 85{
82 char *buf, *nmi_buf; 86 char *buf, *nmi_buf;
83 87
84 if (--event->profile_count > 0) 88 if (--event->perf_refcount > 0)
85 return; 89 return;
86 90
87 event->profile_disable(event); 91 event->perf_event_disable(event);
88 92
89 if (!--total_profile_count) { 93 if (!--total_ref_count) {
90 buf = perf_trace_buf; 94 buf = perf_trace_buf;
91 rcu_assign_pointer(perf_trace_buf, NULL); 95 rcu_assign_pointer(perf_trace_buf, NULL);
92 96
@@ -104,14 +108,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
104 } 108 }
105} 109}
106 110
107void ftrace_profile_disable(int event_id) 111void perf_trace_disable(int event_id)
108{ 112{
109 struct ftrace_event_call *event; 113 struct ftrace_event_call *event;
110 114
111 mutex_lock(&event_mutex); 115 mutex_lock(&event_mutex);
112 list_for_each_entry(event, &ftrace_events, list) { 116 list_for_each_entry(event, &ftrace_events, list) {
113 if (event->id == event_id) { 117 if (event->id == event_id) {
114 ftrace_profile_disable_event(event); 118 perf_trace_event_disable(event);
115 module_put(event->mod); 119 module_put(event->mod);
116 break; 120 break;
117 } 121 }
@@ -119,8 +123,8 @@ void ftrace_profile_disable(int event_id)
119 mutex_unlock(&event_mutex); 123 mutex_unlock(&event_mutex);
120} 124}
121 125
122__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, 126__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
123 int *rctxp, unsigned long *irq_flags) 127 int *rctxp, unsigned long *irq_flags)
124{ 128{
125 struct trace_entry *entry; 129 struct trace_entry *entry;
126 char *trace_buf, *raw_data; 130 char *trace_buf, *raw_data;
@@ -138,9 +142,9 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
138 cpu = smp_processor_id(); 142 cpu = smp_processor_id();
139 143
140 if (in_nmi()) 144 if (in_nmi())
141 trace_buf = rcu_dereference(perf_trace_buf_nmi); 145 trace_buf = rcu_dereference_sched(perf_trace_buf_nmi);
142 else 146 else
143 trace_buf = rcu_dereference(perf_trace_buf); 147 trace_buf = rcu_dereference_sched(perf_trace_buf);
144 148
145 if (!trace_buf) 149 if (!trace_buf)
146 goto err; 150 goto err;
@@ -161,4 +165,4 @@ err_recursion:
161 local_irq_restore(*irq_flags); 165 local_irq_restore(*irq_flags);
162 return NULL; 166 return NULL;
163} 167}
164EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare); 168EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3f972ad98d04..beab8bf2f310 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -938,7 +938,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
938 trace_create_file("enable", 0644, call->dir, call, 938 trace_create_file("enable", 0644, call->dir, call,
939 enable); 939 enable);
940 940
941 if (call->id && call->profile_enable) 941 if (call->id && call->perf_event_enable)
942 trace_create_file("id", 0444, call->dir, call, 942 trace_create_file("id", 0444, call->dir, call,
943 id); 943 id);
944 944
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index e998a824e9db..e6989d9b44da 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -188,7 +188,7 @@ static int __trace_graph_entry(struct trace_array *tr,
188 struct ring_buffer *buffer = tr->buffer; 188 struct ring_buffer *buffer = tr->buffer;
189 struct ftrace_graph_ent_entry *entry; 189 struct ftrace_graph_ent_entry *entry;
190 190
191 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 191 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
192 return 0; 192 return 0;
193 193
194 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, 194 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -237,6 +237,14 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
237 return ret; 237 return ret;
238} 238}
239 239
240int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
241{
242 if (tracing_thresh)
243 return 1;
244 else
245 return trace_graph_entry(trace);
246}
247
240static void __trace_graph_return(struct trace_array *tr, 248static void __trace_graph_return(struct trace_array *tr,
241 struct ftrace_graph_ret *trace, 249 struct ftrace_graph_ret *trace,
242 unsigned long flags, 250 unsigned long flags,
@@ -247,7 +255,7 @@ static void __trace_graph_return(struct trace_array *tr,
247 struct ring_buffer *buffer = tr->buffer; 255 struct ring_buffer *buffer = tr->buffer;
248 struct ftrace_graph_ret_entry *entry; 256 struct ftrace_graph_ret_entry *entry;
249 257
250 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 258 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
251 return; 259 return;
252 260
253 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, 261 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -290,13 +298,26 @@ void set_graph_array(struct trace_array *tr)
290 smp_mb(); 298 smp_mb();
291} 299}
292 300
301void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
302{
303 if (tracing_thresh &&
304 (trace->rettime - trace->calltime < tracing_thresh))
305 return;
306 else
307 trace_graph_return(trace);
308}
309
293static int graph_trace_init(struct trace_array *tr) 310static int graph_trace_init(struct trace_array *tr)
294{ 311{
295 int ret; 312 int ret;
296 313
297 set_graph_array(tr); 314 set_graph_array(tr);
298 ret = register_ftrace_graph(&trace_graph_return, 315 if (tracing_thresh)
299 &trace_graph_entry); 316 ret = register_ftrace_graph(&trace_graph_thresh_return,
317 &trace_graph_thresh_entry);
318 else
319 ret = register_ftrace_graph(&trace_graph_return,
320 &trace_graph_entry);
300 if (ret) 321 if (ret)
301 return ret; 322 return ret;
302 tracing_start_cmdline_record(); 323 tracing_start_cmdline_record();
@@ -920,7 +941,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
920 if (!ret) 941 if (!ret)
921 return TRACE_TYPE_PARTIAL_LINE; 942 return TRACE_TYPE_PARTIAL_LINE;
922 } else { 943 } else {
923 ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func); 944 ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
924 if (!ret) 945 if (!ret)
925 return TRACE_TYPE_PARTIAL_LINE; 946 return TRACE_TYPE_PARTIAL_LINE;
926 } 947 }
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 505c92273b1a..1251e367bae9 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1214,7 +1214,7 @@ static int set_print_fmt(struct trace_probe *tp)
1214#ifdef CONFIG_PERF_EVENTS 1214#ifdef CONFIG_PERF_EVENTS
1215 1215
1216/* Kprobe profile handler */ 1216/* Kprobe profile handler */
1217static __kprobes void kprobe_profile_func(struct kprobe *kp, 1217static __kprobes void kprobe_perf_func(struct kprobe *kp,
1218 struct pt_regs *regs) 1218 struct pt_regs *regs)
1219{ 1219{
1220 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1220 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
@@ -1227,11 +1227,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp,
1227 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1227 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1228 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1228 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1229 size -= sizeof(u32); 1229 size -= sizeof(u32);
1230 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1230 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1231 "profile buffer not large enough")) 1231 "profile buffer not large enough"))
1232 return; 1232 return;
1233 1233
1234 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); 1234 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
1235 if (!entry) 1235 if (!entry)
1236 return; 1236 return;
1237 1237
@@ -1240,11 +1240,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp,
1240 for (i = 0; i < tp->nr_args; i++) 1240 for (i = 0; i < tp->nr_args; i++)
1241 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1241 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1242 1242
1243 ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags); 1243 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
1244} 1244}
1245 1245
1246/* Kretprobe profile handler */ 1246/* Kretprobe profile handler */
1247static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, 1247static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1248 struct pt_regs *regs) 1248 struct pt_regs *regs)
1249{ 1249{
1250 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1250 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1257,11 +1257,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
1257 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1257 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1258 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1258 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1259 size -= sizeof(u32); 1259 size -= sizeof(u32);
1260 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1260 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1261 "profile buffer not large enough")) 1261 "profile buffer not large enough"))
1262 return; 1262 return;
1263 1263
1264 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); 1264 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
1265 if (!entry) 1265 if (!entry)
1266 return; 1266 return;
1267 1267
@@ -1271,10 +1271,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
1271 for (i = 0; i < tp->nr_args; i++) 1271 for (i = 0; i < tp->nr_args; i++)
1272 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1272 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1273 1273
1274 ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags); 1274 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
1275 irq_flags, regs);
1275} 1276}
1276 1277
1277static int probe_profile_enable(struct ftrace_event_call *call) 1278static int probe_perf_enable(struct ftrace_event_call *call)
1278{ 1279{
1279 struct trace_probe *tp = (struct trace_probe *)call->data; 1280 struct trace_probe *tp = (struct trace_probe *)call->data;
1280 1281
@@ -1286,7 +1287,7 @@ static int probe_profile_enable(struct ftrace_event_call *call)
1286 return enable_kprobe(&tp->rp.kp); 1287 return enable_kprobe(&tp->rp.kp);
1287} 1288}
1288 1289
1289static void probe_profile_disable(struct ftrace_event_call *call) 1290static void probe_perf_disable(struct ftrace_event_call *call)
1290{ 1291{
1291 struct trace_probe *tp = (struct trace_probe *)call->data; 1292 struct trace_probe *tp = (struct trace_probe *)call->data;
1292 1293
@@ -1311,7 +1312,7 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1311 kprobe_trace_func(kp, regs); 1312 kprobe_trace_func(kp, regs);
1312#ifdef CONFIG_PERF_EVENTS 1313#ifdef CONFIG_PERF_EVENTS
1313 if (tp->flags & TP_FLAG_PROFILE) 1314 if (tp->flags & TP_FLAG_PROFILE)
1314 kprobe_profile_func(kp, regs); 1315 kprobe_perf_func(kp, regs);
1315#endif 1316#endif
1316 return 0; /* We don't tweek kernel, so just return 0 */ 1317 return 0; /* We don't tweek kernel, so just return 0 */
1317} 1318}
@@ -1325,7 +1326,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1325 kretprobe_trace_func(ri, regs); 1326 kretprobe_trace_func(ri, regs);
1326#ifdef CONFIG_PERF_EVENTS 1327#ifdef CONFIG_PERF_EVENTS
1327 if (tp->flags & TP_FLAG_PROFILE) 1328 if (tp->flags & TP_FLAG_PROFILE)
1328 kretprobe_profile_func(ri, regs); 1329 kretprobe_perf_func(ri, regs);
1329#endif 1330#endif
1330 return 0; /* We don't tweek kernel, so just return 0 */ 1331 return 0; /* We don't tweek kernel, so just return 0 */
1331} 1332}
@@ -1358,8 +1359,8 @@ static int register_probe_event(struct trace_probe *tp)
1358 call->unregfunc = probe_event_disable; 1359 call->unregfunc = probe_event_disable;
1359 1360
1360#ifdef CONFIG_PERF_EVENTS 1361#ifdef CONFIG_PERF_EVENTS
1361 call->profile_enable = probe_profile_enable; 1362 call->perf_event_enable = probe_perf_enable;
1362 call->profile_disable = probe_profile_disable; 1363 call->perf_event_disable = probe_perf_disable;
1363#endif 1364#endif
1364 call->data = tp; 1365 call->data = tp;
1365 ret = trace_add_event_call(call); 1366 ret = trace_add_event_call(call);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index cba47d7935cc..33c2a5b769dc 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -428,12 +428,12 @@ core_initcall(init_ftrace_syscalls);
428 428
429#ifdef CONFIG_PERF_EVENTS 429#ifdef CONFIG_PERF_EVENTS
430 430
431static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); 431static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
432static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); 432static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
433static int sys_prof_refcount_enter; 433static int sys_perf_refcount_enter;
434static int sys_prof_refcount_exit; 434static int sys_perf_refcount_exit;
435 435
436static void prof_syscall_enter(struct pt_regs *regs, long id) 436static void perf_syscall_enter(struct pt_regs *regs, long id)
437{ 437{
438 struct syscall_metadata *sys_data; 438 struct syscall_metadata *sys_data;
439 struct syscall_trace_enter *rec; 439 struct syscall_trace_enter *rec;
@@ -443,7 +443,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
443 int size; 443 int size;
444 444
445 syscall_nr = syscall_get_nr(current, regs); 445 syscall_nr = syscall_get_nr(current, regs);
446 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 446 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
447 return; 447 return;
448 448
449 sys_data = syscall_nr_to_meta(syscall_nr); 449 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -455,11 +455,11 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
455 size = ALIGN(size + sizeof(u32), sizeof(u64)); 455 size = ALIGN(size + sizeof(u32), sizeof(u64));
456 size -= sizeof(u32); 456 size -= sizeof(u32);
457 457
458 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 458 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
459 "profile buffer not large enough")) 459 "perf buffer not large enough"))
460 return; 460 return;
461 461
462 rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size, 462 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
463 sys_data->enter_event->id, &rctx, &flags); 463 sys_data->enter_event->id, &rctx, &flags);
464 if (!rec) 464 if (!rec)
465 return; 465 return;
@@ -467,10 +467,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
467 rec->nr = syscall_nr; 467 rec->nr = syscall_nr;
468 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 468 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
469 (unsigned long *)&rec->args); 469 (unsigned long *)&rec->args);
470 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); 470 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
471} 471}
472 472
473int prof_sysenter_enable(struct ftrace_event_call *call) 473int perf_sysenter_enable(struct ftrace_event_call *call)
474{ 474{
475 int ret = 0; 475 int ret = 0;
476 int num; 476 int num;
@@ -478,34 +478,34 @@ int prof_sysenter_enable(struct ftrace_event_call *call)
478 num = ((struct syscall_metadata *)call->data)->syscall_nr; 478 num = ((struct syscall_metadata *)call->data)->syscall_nr;
479 479
480 mutex_lock(&syscall_trace_lock); 480 mutex_lock(&syscall_trace_lock);
481 if (!sys_prof_refcount_enter) 481 if (!sys_perf_refcount_enter)
482 ret = register_trace_sys_enter(prof_syscall_enter); 482 ret = register_trace_sys_enter(perf_syscall_enter);
483 if (ret) { 483 if (ret) {
484 pr_info("event trace: Could not activate" 484 pr_info("event trace: Could not activate"
485 "syscall entry trace point"); 485 "syscall entry trace point");
486 } else { 486 } else {
487 set_bit(num, enabled_prof_enter_syscalls); 487 set_bit(num, enabled_perf_enter_syscalls);
488 sys_prof_refcount_enter++; 488 sys_perf_refcount_enter++;
489 } 489 }
490 mutex_unlock(&syscall_trace_lock); 490 mutex_unlock(&syscall_trace_lock);
491 return ret; 491 return ret;
492} 492}
493 493
494void prof_sysenter_disable(struct ftrace_event_call *call) 494void perf_sysenter_disable(struct ftrace_event_call *call)
495{ 495{
496 int num; 496 int num;
497 497
498 num = ((struct syscall_metadata *)call->data)->syscall_nr; 498 num = ((struct syscall_metadata *)call->data)->syscall_nr;
499 499
500 mutex_lock(&syscall_trace_lock); 500 mutex_lock(&syscall_trace_lock);
501 sys_prof_refcount_enter--; 501 sys_perf_refcount_enter--;
502 clear_bit(num, enabled_prof_enter_syscalls); 502 clear_bit(num, enabled_perf_enter_syscalls);
503 if (!sys_prof_refcount_enter) 503 if (!sys_perf_refcount_enter)
504 unregister_trace_sys_enter(prof_syscall_enter); 504 unregister_trace_sys_enter(perf_syscall_enter);
505 mutex_unlock(&syscall_trace_lock); 505 mutex_unlock(&syscall_trace_lock);
506} 506}
507 507
508static void prof_syscall_exit(struct pt_regs *regs, long ret) 508static void perf_syscall_exit(struct pt_regs *regs, long ret)
509{ 509{
510 struct syscall_metadata *sys_data; 510 struct syscall_metadata *sys_data;
511 struct syscall_trace_exit *rec; 511 struct syscall_trace_exit *rec;
@@ -515,7 +515,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
515 int size; 515 int size;
516 516
517 syscall_nr = syscall_get_nr(current, regs); 517 syscall_nr = syscall_get_nr(current, regs);
518 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 518 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
519 return; 519 return;
520 520
521 sys_data = syscall_nr_to_meta(syscall_nr); 521 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -530,11 +530,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
530 * Impossible, but be paranoid with the future 530 * Impossible, but be paranoid with the future
531 * How to put this check outside runtime? 531 * How to put this check outside runtime?
532 */ 532 */
533 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 533 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
534 "exit event has grown above profile buffer size")) 534 "exit event has grown above perf buffer size"))
535 return; 535 return;
536 536
537 rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size, 537 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
538 sys_data->exit_event->id, &rctx, &flags); 538 sys_data->exit_event->id, &rctx, &flags);
539 if (!rec) 539 if (!rec)
540 return; 540 return;
@@ -542,10 +542,10 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
542 rec->nr = syscall_nr; 542 rec->nr = syscall_nr;
543 rec->ret = syscall_get_return_value(current, regs); 543 rec->ret = syscall_get_return_value(current, regs);
544 544
545 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); 545 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
546} 546}
547 547
548int prof_sysexit_enable(struct ftrace_event_call *call) 548int perf_sysexit_enable(struct ftrace_event_call *call)
549{ 549{
550 int ret = 0; 550 int ret = 0;
551 int num; 551 int num;
@@ -553,30 +553,30 @@ int prof_sysexit_enable(struct ftrace_event_call *call)
553 num = ((struct syscall_metadata *)call->data)->syscall_nr; 553 num = ((struct syscall_metadata *)call->data)->syscall_nr;
554 554
555 mutex_lock(&syscall_trace_lock); 555 mutex_lock(&syscall_trace_lock);
556 if (!sys_prof_refcount_exit) 556 if (!sys_perf_refcount_exit)
557 ret = register_trace_sys_exit(prof_syscall_exit); 557 ret = register_trace_sys_exit(perf_syscall_exit);
558 if (ret) { 558 if (ret) {
559 pr_info("event trace: Could not activate" 559 pr_info("event trace: Could not activate"
560 "syscall exit trace point"); 560 "syscall exit trace point");
561 } else { 561 } else {
562 set_bit(num, enabled_prof_exit_syscalls); 562 set_bit(num, enabled_perf_exit_syscalls);
563 sys_prof_refcount_exit++; 563 sys_perf_refcount_exit++;
564 } 564 }
565 mutex_unlock(&syscall_trace_lock); 565 mutex_unlock(&syscall_trace_lock);
566 return ret; 566 return ret;
567} 567}
568 568
569void prof_sysexit_disable(struct ftrace_event_call *call) 569void perf_sysexit_disable(struct ftrace_event_call *call)
570{ 570{
571 int num; 571 int num;
572 572
573 num = ((struct syscall_metadata *)call->data)->syscall_nr; 573 num = ((struct syscall_metadata *)call->data)->syscall_nr;
574 574
575 mutex_lock(&syscall_trace_lock); 575 mutex_lock(&syscall_trace_lock);
576 sys_prof_refcount_exit--; 576 sys_perf_refcount_exit--;
577 clear_bit(num, enabled_prof_exit_syscalls); 577 clear_bit(num, enabled_perf_exit_syscalls);
578 if (!sys_prof_refcount_exit) 578 if (!sys_perf_refcount_exit)
579 unregister_trace_sys_exit(prof_syscall_exit); 579 unregister_trace_sys_exit(perf_syscall_exit);
580 mutex_unlock(&syscall_trace_lock); 580 mutex_unlock(&syscall_trace_lock);
581} 581}
582 582
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 00d59d048edf..0a67e041edf8 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -21,6 +21,7 @@
21#include <linux/tsacct_kern.h> 21#include <linux/tsacct_kern.h>
22#include <linux/acct.h> 22#include <linux/acct.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/mm.h>
24 25
25/* 26/*
26 * fill in basic accounting fields 27 * fill in basic accounting fields